[SCM] ViennaCL packaging branch, master, updated. debian/1.1.2-6-32-g333e537

Michael Wild themiwi at users.sourceforge.net
Fri Mar 23 09:47:59 UTC 2012


The following commit has been merged in the master branch:
commit e13cb126b963315e3f791c7fa7323b91fb81b204
Author: Michael Wild <themiwi at users.sourceforge.net>
Date:   Fri Mar 23 08:58:39 2012 +0100

    New upstream version 1.2.1

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0b1be9f..ffe5f6a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,6 +8,8 @@ endif(COMMAND cmake_policy)
 
 project(ViennaCL)
 
+SET(VIENNACL_SRC_DIST ON)
+
 
 #
 # User customizations if CMake does not find Boost or OpenCL
@@ -29,7 +31,7 @@ ENDIF(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
 
 set(VERSION_MAJOR 1)
 set(VERSION_MINOR 2)
-set(VERSION_PATCH 0)
+set(VERSION_PATCH 1)
 set(VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH})
 
 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
diff --git a/README b/README
index 46e10bc..a7cf5f9 100644
--- a/README
+++ b/README
@@ -26,9 +26,9 @@ ViennaCL requires the following:
 The first step is to extract the file:
 
 Unix-based OS:
-$> gunzip ViennaCL-1.1.2.tar.gz
-$> tar -xf ViennaCL-1.1.2.tar
-$> cd ViennaCL-1.1.2
+$> gunzip ViennaCL-1.2.1.tar.gz
+$> tar -xf ViennaCL-1.2.1.tar
+$> cd ViennaCL-1.2.1
 
 ViennaCL is a header-only library, therefore it is sufficient to copy the subfolder viennacl/ (holding the header files) into you project directory or your system include directory. For instructions on how to set the include paths correctly, please refer to the documentation of your compiler.
 
@@ -54,10 +54,12 @@ Unix-based clients:
 
 ---- ViennaCL-1.X.X
    |
-   |-- auxiliary/ - Auxiliary Files (i.e. the OpenCL source code tree and the converter for the header files)
+   |-- auxiliary/ - Auxiliary files (i.e. the OpenCL source code tree and the converter for the header files)
    | 
    |-- CL/ - The OpenCL headers
    |
+   |-- cmake/ - Additional CMake configuration files
+   |
    |-- doc/ - Documentation (LaTeX and doxygen)
    | 
    |-- examples/ - Tutorial and benchmarking applications
@@ -77,12 +79,12 @@ Unix-based clients:
    |-- viennacl/ - The library source code
 
 
-4. Authors and Contact
+4. Contact
 ------------------------
 
-Florian Rudolf (flo.rudy+viennacl at gmail.com)
-Karl Rupp (rupp at iue.tuwien.ac.at    <--- primary contact)
-Josef Weinbub (weinbub at iue.tuwien.ac.at)
+For any technical questions related to ViennaCL, please use our mailing list: viennacl-support at lists.sourceforge.net
+You may also use the forum provided by sourceforge.net: http://sourceforge.net/projects/viennacl/
+For any other issues, please contact the project head Karl Rupp  at rupp at iue.tuwien.ac.at.
 
 ViennaCL was developed under the aegis of the 'Institute for Microelectronics' at the 'Vienna University of Technology'.
 
diff --git a/auxiliary/CMakeLists.txt b/auxiliary/CMakeLists.txt
index cfd1ce6..d009a3f 100644
--- a/auxiliary/CMakeLists.txt
+++ b/auxiliary/CMakeLists.txt
@@ -100,6 +100,7 @@ set(COORDINATE_MATRIX_SRCS
 
 set(MATRIX_COL_SRCS
    matrix_col/align1/add.cl
+   matrix_col/align1/assign.cl
    matrix_col/align1/clear.cl
    matrix_col/align1/cpu_inplace_mult.cl
    matrix_col/align1/fft_direct.cl
@@ -128,6 +129,7 @@ set(MATRIX_COL_SRCS
 
 set(MATRIX_ROW_SRCS
    matrix_row/align1/add.cl
+   matrix_row/align1/assign.cl
    matrix_row/align1/clear.cl
    matrix_row/align1/cpu_inplace_mult.cl
    matrix_row/align1/fft_direct.cl
diff --git a/auxiliary/converter.cpp b/auxiliary/converter.cpp
index a11de53..d858822 100644
--- a/auxiliary/converter.cpp
+++ b/auxiliary/converter.cpp
@@ -189,7 +189,7 @@ void writeKernelInit(std::ostream & kernel_file, const char * dirname, std::stri
       kernel_file << "      viennacl::ocl::DOUBLE_PRECISION_CHECKER<double>::apply();" << std::endl;
     kernel_file << "      static std::map<cl_context, bool> init_done;" << std::endl;
     kernel_file << "      viennacl::ocl::context & context_ = viennacl::ocl::current_context();" << std::endl;
-    kernel_file << "      if (!init_done[context_.handle()])" << std::endl;
+    kernel_file << "      if (!init_done[context_.handle().get()])" << std::endl;
     kernel_file << "      {" << std::endl;
     kernel_file << "        std::string source;" << std::endl;
     if (!is_float)
@@ -248,7 +248,7 @@ void writeKernelInit(std::ostream & kernel_file, const char * dirname, std::stri
         }
     } //for                
     
-    kernel_file << "        init_done[context_.handle()] = true;" << std::endl;
+    kernel_file << "        init_done[context_.handle().get()] = true;" << std::endl;
     kernel_file << "       } //if" << std::endl;
     kernel_file << "     } //init" << std::endl;
     kernel_file << "    }; // struct" << std::endl << std::endl;
diff --git a/auxiliary/generate-blas3-prod-align1.cpp b/auxiliary/generate-blas3-prod-align1.cpp
index d30effa..3223c24 100755
--- a/auxiliary/generate-blas3-prod-align1.cpp
+++ b/auxiliary/generate-blas3-prod-align1.cpp
@@ -75,7 +75,8 @@ void printMatrixMatrixProduct(bool row_major_A, bool row_major_B, bool row_major
   std::cout << "          __local float * bufA," << std::endl;
   std::cout << "          __local float * bufB) " << std::endl;
   std::cout << "{ " << std::endl;
-  std::cout << "  size_t block_size = get_local_size(0);" << std::endl;
+  //do not forgot to change block_size !!!
+  std::cout << "  size_t block_size = 16;//get_local_size(0);" << std::endl;
   std::cout << "  size_t row_block_id = get_group_id(0);" << std::endl;
   std::cout << "  size_t col_block_id = get_group_id(1);" << std::endl;
   std::cout << "  size_t row_thread_id = get_local_id(0);" << std::endl;
@@ -128,40 +129,28 @@ void printMatrixMatrixProduct(bool row_major_A, bool row_major_B, bool row_major
 
 
   if (transpose_A)
-  {
-    std::cout << "  size_t block_num = A_row_size / block_size;" << std::endl;
-    std::cout << "  if (block_num * block_size != A_row_size)" << std::endl;
-    std::cout << "    ++block_num;" << std::endl;
-  }
+    std::cout << "  size_t block_num = (A_row_size + block_size - 1) / block_size;" << std::endl;
   else
-  {
-    std::cout << "  size_t block_num = A_col_size / block_size;" << std::endl;
-    std::cout << "  if (block_num * block_size != A_col_size)" << std::endl;
-    std::cout << "    ++block_num;" << std::endl;
-  }
+    std::cout << "  size_t block_num = (A_col_size + block_size - 1) / block_size;" << std::endl;
     
   std::cout << "  float Csub = 0;" << std::endl;
   
   //offset of the the memory access by the thread relative to the beginning of the block:
-  if (row_major_A && transpose_A)
+  if (row_major_A)
     std::cout << "  size_t aOffset = row_thread_id + col_thread_id * A_internal_cols;" << std::endl;
-  else if (row_major_A && !transpose_A)
-    std::cout << "  size_t aOffset = row_thread_id * A_internal_cols + col_thread_id;" << std::endl;
-  else if (!row_major_A && transpose_A)
-    std::cout << "  size_t aOffset = row_thread_id * A_internal_rows + col_thread_id;" << std::endl;
-  else if (!row_major_A && !transpose_A)
+  else
     std::cout << "  size_t aOffset = row_thread_id + col_thread_id * A_internal_rows;" << std::endl;
 
-  if (row_major_B && transpose_B)
+  if (row_major_B)
     std::cout << "  size_t bOffset = row_thread_id + col_thread_id * B_internal_cols;" << std::endl;
-  else if (row_major_B && !transpose_B)
-    std::cout << "  size_t bOffset = row_thread_id * B_internal_cols + col_thread_id;" << std::endl;
-  else if (!row_major_B && transpose_B)
-    std::cout << "  size_t bOffset = row_thread_id * B_internal_rows + col_thread_id;" << std::endl;
-  else if (!row_major_B && !transpose_B)
+  else
     std::cout << "  size_t bOffset = row_thread_id + col_thread_id * B_internal_rows;" << std::endl;
+
+  std::cout << std::endl;  
   
-  std::cout << "  size_t row_thread_id_times_block_size = row_thread_id * block_size;" << std::endl;
+  std::cout << "  size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);" << std::endl;
+  std::cout << "  size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);" << std::endl;
+
   std::cout << "  for (size_t block = 0;" << std::endl;
   std::cout << "           block < block_num;" << std::endl;
   std::cout << "           ++block)" << std::endl;
@@ -178,43 +167,39 @@ void printMatrixMatrixProduct(bool row_major_A, bool row_major_B, bool row_major
   std::cout << "      bufA[row_thread_id * block_size + col_thread_id] = 0;" << std::endl;*/
 
   //new code:
-  if (transpose_A)
-    std::cout << "    bufA[row_thread_id_times_block_size + col_thread_id] = (block * block_size + col_thread_id < A_row_size && get_global_id(0) < A_col_size) ? A[aBegin + aOffset] : 0;" << std::endl;
-  else 
-    std::cout << "    bufA[row_thread_id_times_block_size + col_thread_id] = (block * block_size + col_thread_id < A_col_size && get_global_id(0) < A_row_size) ? A[aBegin + aOffset] : 0;" << std::endl;
-  
-  
-  
-  //read block from B and check for access within matrix:
-  /*if (transpose_B)
-    std::cout << "    if ( (block * block_size + row_thread_id < B_cols) && get_global_id(1) < B_rows )" << std::endl;
-  else 
-    std::cout << "    if ( (block * block_size + row_thread_id < B_rows) && get_global_id(1) < B_cols )" << std::endl;
-  
-  std::cout << "      bufB[row_thread_id * block_size + col_thread_id] = B[bBegin + bOffset]; " << std::endl;
-  std::cout << "    else" << std::endl;
-  std::cout << "      bufB[row_thread_id * block_size + col_thread_id] = 0;" << std::endl;*/
-  
-  /*if (transpose_B)
-    std::cout << "    bufB[row_thread_id_times_block_size + col_thread_id] = ( (block * block_size + row_thread_id < B_col_size) && get_global_id(1) < B_row_size ) ? B[bBegin + bOffset] : 0;" << std::endl;
-  else 
-    std::cout << "    bufB[row_thread_id_times_block_size + col_thread_id] = ( (block * block_size + row_thread_id < B_row_size) && get_global_id(1) < B_col_size ) ? B[bBegin + bOffset] : 0;" << std::endl;*/
-  if (transpose_B)
-    std::cout << "    bufB[col_thread_id * block_size + row_thread_id] = ( (block * block_size + row_thread_id < B_col_size) && get_global_id(1) < B_row_size ) ? B[bBegin + bOffset] : 0;" << std::endl;
-  else 
-    std::cout << "    bufB[col_thread_id * block_size + row_thread_id] = ( (block * block_size + row_thread_id < B_row_size) && get_global_id(1) < B_col_size ) ? B[bBegin + bOffset] : 0;" << std::endl;
-  
-  
+  if (transpose_A && row_major_A)
+    std::cout << "    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;" << std::endl;
+  else if (transpose_A && !row_major_A)
+    std::cout << "    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;" << std::endl;
+  else if (!transpose_A && row_major_A)
+    std::cout << "    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;" << std::endl;
+  else if (!transpose_A && !row_major_A)
+    std::cout << "    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;" << std::endl;
+
+
+  if (transpose_B && row_major_B)
+    std::cout << "    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;" << std::endl;
+  else if (transpose_B && !row_major_B)
+    std::cout << "    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;" << std::endl;
+  else if (!transpose_B && row_major_B)
+    std::cout << "    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;" << std::endl;
+  else if (!transpose_B && !row_major_B)
+    std::cout << "    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;" << std::endl;
+
   //computation of block-matrix-matrix product is the same for all cases:
   std::cout << "    barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
   //std::cout << "    for (size_t k = 0; k < block_size; ++k)" << std::endl;
   //std::cout << "      Csub += bufA[row_thread_id_times_block_size + k] * bufB[k * block_size + col_thread_id];" << std::endl;
   //loop unrolling:
-  std::cout << "__local float * bufAptr = bufA + row_thread_id_times_block_size;" << std::endl;
-  std::cout << "__local float * bufBptr = bufB + col_thread_id * block_size;" << std::endl;
+  std::cout << "    __local float * bufAptr = bufA + row_thread_id_times_block_size;" << std::endl;
+  std::cout << "    __local float * bufBptr = bufB + col_thread_id_times_block_size;" << std::endl;
   //std::cout << "      Csub += bufA[row_thread_id_times_block_size] * bufB[col_thread_id * block_size];" << std::endl;
-  for (size_t i=0; i<15; ++i)
+  // code in following line depends on block size and must be changed in case of block_size changes
+  std::cout << "      for(int i = 0; i < 4; i++) {" << std::endl;
+  for (size_t unroll = 0; unroll < 4; ++unroll) {
     std::cout << "      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;" << std::endl;
+  }
+  std::cout << "     }" << std::endl;
     //std::cout << "      Csub += bufAptr[" << i << "] * bufB[" << i << "  + col_thread_id * block_size];" << std::endl;
     //std::cout << "      Csub += bufAptr[" << i << "] * bufB[" << i << " * block_size + col_thread_id];" << std::endl;
     //std::cout << "      Csub += bufAptr[" << i << "] * bufB[" << i << "];" << std::endl;
diff --git a/auxiliary/matrix_col/align1/add.cl b/auxiliary/matrix_col/align1/add.cl
index 49bdd83..2a09d6f 100644
--- a/auxiliary/matrix_col/align1/add.cl
+++ b/auxiliary/matrix_col/align1/add.cl
@@ -1,11 +1,29 @@
 
-__kernel void add(
-          __global const float * vec1,
-          __global const float * vec2, 
-          __global float * result,
-          unsigned int size) 
+__kernel void add(  // C = A + B
+          __global const float * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          __global const float * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          __global float * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols) 
 { 
-  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))
-    result[i] = vec1[i] + vec2[i];
+  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
+    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
+      C[i + C_row_start + (j + C_col_start) * C_internal_rows] =  A[i + A_row_start + (j + A_col_start) * A_internal_rows]
+                                                                  + B[i + B_row_start + (j + B_col_start) * B_internal_rows];
 }
-
diff --git a/auxiliary/matrix_row/align1/inplace_add.cl b/auxiliary/matrix_col/align1/assign.cl
similarity index 61%
copy from auxiliary/matrix_row/align1/inplace_add.cl
copy to auxiliary/matrix_col/align1/assign.cl
index 08b7f7f..55a678f 100644
--- a/auxiliary/matrix_row/align1/inplace_add.cl
+++ b/auxiliary/matrix_col/align1/assign.cl
@@ -1,5 +1,5 @@
 
-__kernel void inplace_add(
+__kernel void assign( // A <- B
           __global float * A,
           unsigned int A_row_start,
           unsigned int A_col_start,
@@ -15,12 +15,8 @@ __kernel void inplace_add(
           unsigned int B_internal_rows,
           unsigned int B_internal_cols)
 { 
-  if (   get_global_id(0) < A_row_size
-      && get_global_id(1) < A_col_size
-     )
-    A[  (get_global_id(0) + A_row_start) * A_internal_cols
-      + (get_global_id(1) + A_col_start)] 
-      += B[  (get_global_id(0) + B_row_start) * B_internal_cols
-           + (get_global_id(1) + B_col_start)];
+  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
+    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
+      A[i + A_row_start + (j + A_col_start) * A_internal_rows] = B[i + B_row_start + (j + B_col_start) * B_internal_rows];
 }
 
diff --git a/auxiliary/matrix_col/align1/clear.cl b/auxiliary/matrix_col/align1/clear.cl
index d3a6e48..bb11f6a 100644
--- a/auxiliary/matrix_col/align1/clear.cl
+++ b/auxiliary/matrix_col/align1/clear.cl
@@ -1,9 +1,14 @@
 
-__kernel void clear(
-          __global float * vec,
-          unsigned int size) 
+__kernel void clear( // A <- 0
+          __global float * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols) 
 { 
-  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))
-    vec[i] = 0;
+  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
+    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
+      A[i + A_row_start + (j + A_col_start) * A_internal_rows] = 0;
 }
-
diff --git a/auxiliary/matrix_col/align1/cpu_inplace_mult.cl b/auxiliary/matrix_col/align1/cpu_inplace_mult.cl
index 0b0eeec..833b51b 100644
--- a/auxiliary/matrix_col/align1/cpu_inplace_mult.cl
+++ b/auxiliary/matrix_col/align1/cpu_inplace_mult.cl
@@ -1,10 +1,16 @@
 
-__kernel void cpu_inplace_mult(
-          __global float * vec,
-          float factor, 
-          unsigned int size) 
+__kernel void cpu_inplace_mult( // A *= const
+          __global float * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          float factor) 
 { 
-  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))
-    vec[i] *= factor;
+  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
+    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
+      A[i + A_row_start + (j + A_col_start) * A_internal_rows] *= factor;
 }
 
diff --git a/auxiliary/matrix_col/align1/inplace_add.cl b/auxiliary/matrix_col/align1/inplace_add.cl
index a7c4fca..db947d0 100644
--- a/auxiliary/matrix_col/align1/inplace_add.cl
+++ b/auxiliary/matrix_col/align1/inplace_add.cl
@@ -1,5 +1,5 @@
 
-__kernel void inplace_add(
+__kernel void inplace_add( // A += B
           __global float * A,
           unsigned int A_row_start,
           unsigned int A_col_start,
@@ -15,11 +15,7 @@ __kernel void inplace_add(
           unsigned int B_internal_rows,
           unsigned int B_internal_cols)
 { 
-  if (   get_global_id(0) < A_row_size
-      && get_global_id(1) < A_col_size
-     )
-    A[  (get_global_id(0) + A_row_start)
-      + (get_global_id(1) + A_col_start) * A_internal_rows] 
-      += B[  (get_global_id(0) + B_row_start)
-           + (get_global_id(1) + B_col_start) * B_internal_rows];
+  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
+    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
+      A[i + A_row_start + (j + A_col_start) * A_internal_rows] += B[i + B_row_start + (j + B_col_start) * B_internal_rows];
 }
diff --git a/auxiliary/matrix_col/align1/inplace_divide.cl b/auxiliary/matrix_col/align1/inplace_divide.cl
index 69ee2a8..640a0dc 100644
--- a/auxiliary/matrix_col/align1/inplace_divide.cl
+++ b/auxiliary/matrix_col/align1/inplace_divide.cl
@@ -1,11 +1,17 @@
 
-__kernel void inplace_divide(
-          __global float * vec,
-          __global const float * fac,  //note: CPU variant is mapped to prod_scalar
-          unsigned int size) 
+__kernel void inplace_divide( // A /= const
+          __global float * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          __global const float * fac) //note: CPU variant is mapped to prod_scalar
 { 
   float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))
-    vec[i] /= factor;
+  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
+    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
+      A[i + A_row_start + (j + A_col_start) * A_internal_rows] /= factor;
 }
 
diff --git a/auxiliary/matrix_col/align1/inplace_mult.cl b/auxiliary/matrix_col/align1/inplace_mult.cl
index 6112c6d..9c24038 100644
--- a/auxiliary/matrix_col/align1/inplace_mult.cl
+++ b/auxiliary/matrix_col/align1/inplace_mult.cl
@@ -1,12 +1,18 @@
 
-__kernel void inplace_mult(
-          __global float * vec,
-          __global const float * fac, 
-          unsigned int size) 
+__kernel void inplace_mult( // A *= const
+          __global float * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          __global const float * fac) 
 { 
   float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))
-    vec[i] *= factor;
+  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
+    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
+      A[i + A_row_start + (j + A_col_start) * A_internal_rows] *= factor;
 }
 
 
diff --git a/auxiliary/matrix_col/align1/inplace_sub.cl b/auxiliary/matrix_col/align1/inplace_sub.cl
index 2de7013..dc86b4c 100644
--- a/auxiliary/matrix_col/align1/inplace_sub.cl
+++ b/auxiliary/matrix_col/align1/inplace_sub.cl
@@ -1,10 +1,22 @@
 
-__kernel void inplace_sub(
-          __global float * vec1,
-          __global const float * vec2,
-          unsigned int size) 
+__kernel void inplace_sub( // A -= B
+          __global float * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          __global const float * B,  
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols)
 { 
-  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))
-    vec1[i] -= vec2[i];
+  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
+    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
+      A[i + A_row_start + (j + A_col_start) * A_internal_rows] -= B[i + B_row_start + (j + B_col_start) * B_internal_rows];
 }
 
diff --git a/auxiliary/matrix_col/align1/sub.cl b/auxiliary/matrix_col/align1/sub.cl
index 2156af8..4699866 100644
--- a/auxiliary/matrix_col/align1/sub.cl
+++ b/auxiliary/matrix_col/align1/sub.cl
@@ -1,11 +1,29 @@
 
-__kernel void sub(
-          __global const float * vec1,
-          __global const float * vec2, 
-          __global float * result,
-          unsigned int size)
+__kernel void sub(  // C = A - B
+          __global const float * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          __global const float * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          __global float * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols) 
 { 
-  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))
-    result[i] = vec1[i] - vec2[i];
+  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
+    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
+      C[i + C_row_start + (j + C_col_start) * C_internal_rows] =  A[i + A_row_start + (j + A_col_start) * A_internal_rows]
+                                                                  - B[i + B_row_start + (j + B_col_start) * B_internal_rows];
 }
-
diff --git a/auxiliary/matrix_row/align1/add.cl b/auxiliary/matrix_row/align1/add.cl
index 49bdd83..eae5ba5 100644
--- a/auxiliary/matrix_row/align1/add.cl
+++ b/auxiliary/matrix_row/align1/add.cl
@@ -1,11 +1,30 @@
 
-__kernel void add(
-          __global const float * vec1,
-          __global const float * vec2, 
-          __global float * result,
-          unsigned int size) 
+__kernel void add(  // C = A + B
+          __global const float * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          __global const float * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          __global float * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols) 
 { 
-  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))
-    result[i] = vec1[i] + vec2[i];
+  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
+    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
+      C[(i + C_row_start) * C_internal_cols + j + C_col_start] =  A[(i + A_row_start) * A_internal_cols + j + A_col_start]
+                                                                  + B[(i + B_row_start) * B_internal_cols + j + B_col_start];
 }
 
diff --git a/auxiliary/matrix_row/align1/inplace_add.cl b/auxiliary/matrix_row/align1/assign.cl
similarity index 61%
copy from auxiliary/matrix_row/align1/inplace_add.cl
copy to auxiliary/matrix_row/align1/assign.cl
index 08b7f7f..e4dce74 100644
--- a/auxiliary/matrix_row/align1/inplace_add.cl
+++ b/auxiliary/matrix_row/align1/assign.cl
@@ -1,5 +1,5 @@
 
-__kernel void inplace_add(
+__kernel void assign( // A <- B
           __global float * A,
           unsigned int A_row_start,
           unsigned int A_col_start,
@@ -15,12 +15,8 @@ __kernel void inplace_add(
           unsigned int B_internal_rows,
           unsigned int B_internal_cols)
 { 
-  if (   get_global_id(0) < A_row_size
-      && get_global_id(1) < A_col_size
-     )
-    A[  (get_global_id(0) + A_row_start) * A_internal_cols
-      + (get_global_id(1) + A_col_start)] 
-      += B[  (get_global_id(0) + B_row_start) * B_internal_cols
-           + (get_global_id(1) + B_col_start)];
+  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
+    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
+      A[(i + A_row_start) * A_internal_cols + j + A_col_start] = B[(i + B_row_start) * B_internal_cols + j + B_col_start];
 }
 
diff --git a/auxiliary/matrix_row/align1/clear.cl b/auxiliary/matrix_row/align1/clear.cl
index d3a6e48..89806f2 100644
--- a/auxiliary/matrix_row/align1/clear.cl
+++ b/auxiliary/matrix_row/align1/clear.cl
@@ -1,9 +1,14 @@
 
-__kernel void clear(
-          __global float * vec,
-          unsigned int size) 
+__kernel void clear( // A <- 0
+          __global float * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols) 
 { 
-  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))
-    vec[i] = 0;
+  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
+    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
+      A[(i + A_row_start) * A_internal_cols + j + A_col_start] = 0;
 }
-
diff --git a/auxiliary/matrix_row/align1/cpu_inplace_mult.cl b/auxiliary/matrix_row/align1/cpu_inplace_mult.cl
index 0b0eeec..8785a53 100644
--- a/auxiliary/matrix_row/align1/cpu_inplace_mult.cl
+++ b/auxiliary/matrix_row/align1/cpu_inplace_mult.cl
@@ -1,10 +1,15 @@
 
-__kernel void cpu_inplace_mult(
-          __global float * vec,
-          float factor, 
-          unsigned int size) 
+__kernel void cpu_inplace_mult( // A *= const
+          __global float * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          float factor) 
 { 
-  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))
-    vec[i] *= factor;
+  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
+    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
+      A[(i + A_row_start) * A_internal_cols + j + A_col_start] *= factor;
 }
-
diff --git a/auxiliary/matrix_row/align1/inplace_add.cl b/auxiliary/matrix_row/align1/inplace_add.cl
index 08b7f7f..2f28b34 100644
--- a/auxiliary/matrix_row/align1/inplace_add.cl
+++ b/auxiliary/matrix_row/align1/inplace_add.cl
@@ -1,5 +1,5 @@
 
-__kernel void inplace_add(
+__kernel void inplace_add( // A += B
           __global float * A,
           unsigned int A_row_start,
           unsigned int A_col_start,
@@ -15,12 +15,8 @@ __kernel void inplace_add(
           unsigned int B_internal_rows,
           unsigned int B_internal_cols)
 { 
-  if (   get_global_id(0) < A_row_size
-      && get_global_id(1) < A_col_size
-     )
-    A[  (get_global_id(0) + A_row_start) * A_internal_cols
-      + (get_global_id(1) + A_col_start)] 
-      += B[  (get_global_id(0) + B_row_start) * B_internal_cols
-           + (get_global_id(1) + B_col_start)];
+  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
+    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
+      A[(i + A_row_start) * A_internal_cols + j + A_col_start] += B[(i + B_row_start) * B_internal_cols + j + B_col_start];
 }
 
diff --git a/auxiliary/matrix_row/align1/inplace_divide.cl b/auxiliary/matrix_row/align1/inplace_divide.cl
index 69ee2a8..42630d4 100644
--- a/auxiliary/matrix_row/align1/inplace_divide.cl
+++ b/auxiliary/matrix_row/align1/inplace_divide.cl
@@ -1,11 +1,16 @@
 
-__kernel void inplace_divide(
-          __global float * vec,
-          __global const float * fac,  //note: CPU variant is mapped to prod_scalar
-          unsigned int size) 
+__kernel void inplace_divide( // A /= const
+          __global float * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          __global const float * fac) //note: CPU variant is mapped to prod_scalar
 { 
   float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))
-    vec[i] /= factor;
+  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
+    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
+      A[(i + A_row_start) * A_internal_cols + j + A_col_start] /= factor;
 }
-
diff --git a/auxiliary/matrix_row/align1/inplace_mult.cl b/auxiliary/matrix_row/align1/inplace_mult.cl
index 6112c6d..9246841 100644
--- a/auxiliary/matrix_row/align1/inplace_mult.cl
+++ b/auxiliary/matrix_row/align1/inplace_mult.cl
@@ -1,12 +1,18 @@
 
-__kernel void inplace_mult(
-          __global float * vec,
-          __global const float * fac, 
-          unsigned int size) 
+__kernel void inplace_mult( // A *= const
+          __global float * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          __global const float * fac) 
 { 
   float factor = *fac;
-  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))
-    vec[i] *= factor;
+  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
+    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
+      A[(i + A_row_start) * A_internal_cols + j + A_col_start] *= factor;
 }
 
 
diff --git a/auxiliary/matrix_row/align1/inplace_sub.cl b/auxiliary/matrix_row/align1/inplace_sub.cl
index 2de7013..97760f9 100644
--- a/auxiliary/matrix_row/align1/inplace_sub.cl
+++ b/auxiliary/matrix_row/align1/inplace_sub.cl
@@ -1,10 +1,21 @@
 
-__kernel void inplace_sub(
-          __global float * vec1,
-          __global const float * vec2,
-          unsigned int size) 
+__kernel void inplace_sub( // A -= B
+          __global float * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          __global const float * B,  
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols)
 { 
-  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))
-    vec1[i] -= vec2[i];
+  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
+    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
+      A[(i + A_row_start) * A_internal_cols + j + A_col_start] -= B[(i + B_row_start) * B_internal_cols + j + B_col_start];
 }
-
diff --git a/auxiliary/matrix_row/align1/sub.cl b/auxiliary/matrix_row/align1/sub.cl
index 2156af8..5bd03a1 100644
--- a/auxiliary/matrix_row/align1/sub.cl
+++ b/auxiliary/matrix_row/align1/sub.cl
@@ -1,11 +1,29 @@
 
-__kernel void sub(
-          __global const float * vec1,
-          __global const float * vec2, 
-          __global float * result,
-          unsigned int size)
+__kernel void sub(  // C = A - B
+          __global const float * A,
+          unsigned int A_row_start,
+          unsigned int A_col_start,
+          unsigned int A_row_size,
+          unsigned int A_col_size,
+          unsigned int A_internal_rows,
+          unsigned int A_internal_cols,
+          __global const float * B,
+          unsigned int B_row_start,
+          unsigned int B_col_start,
+          unsigned int B_row_size,
+          unsigned int B_col_size,
+          unsigned int B_internal_rows,
+          unsigned int B_internal_cols,
+          __global float * C,
+          unsigned int C_row_start,
+          unsigned int C_col_start,
+          unsigned int C_row_size,
+          unsigned int C_col_size,
+          unsigned int C_internal_rows,
+          unsigned int C_internal_cols) 
 { 
-  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))
-    result[i] = vec1[i] - vec2[i];
+  for (unsigned int i = get_global_id(0); i < A_row_size; i += get_global_size(0))
+    for (unsigned int j = get_global_id(1); j < A_col_size; j += get_global_size(1))
+      C[(i + C_row_start) * C_internal_cols + j + C_col_start] =  A[(i + A_row_start) * A_internal_cols + j + A_col_start]
+                                                                  - B[(i + B_row_start) * B_internal_cols + j + B_col_start];
 }
-
diff --git a/changelog b/changelog
new file mode 100644
index 0000000..bdfcbbc
--- /dev/null
+++ b/changelog
@@ -0,0 +1,132 @@
+******************************
+**** ViennaCL Change Logs ****
+******************************
+
+
+*** Version 1.2.x ***
+
+-- Version 1.2.1 --
+The current release mostly provides a few bug fixes for experimental features introduced in 1.2.0.
+In addition, performance improvements for matrix-matrix multiplications are applied.
+The main changes (in addition to some internal adjustments) are as follows:
+ - Fixed problems with double precision on AMD GPUs supporting cl_amd_fp64 instead of cl_khr_fp64 (thanks to Sylvain R.)
+ - Considerable improvements in the handling of matrix_range. Added project() function for convenience (cf. Boost.uBLAS)
+ - Further improvements of matrix-matrix multiplication performance (contributed by Volodymyr Kysenko)
+ - Improved performance of QR factorization
+ - Added direct element access to elements of compressed_matrix using operator() (thanks to sourceforge.net user Sulif for the hint)
+ - Fixed incorrect matrix dimensions obtained with the transfer of non-square sparse Eigen and MTL matrices to ViennaCL objects (thanks to sourceforge.net user ggrocca for pointing at this)
+
+-- Version 1.2.0 --
+
+Many new features from the Google Summer of Code and the IuE Summer of Code enter this release.
+Due to their complexity, they are for the moment still in experimental state (see the respective chapters for details) and are expected to reach maturity with the 1.3.0 release.
+Shorter release cycles are planned for the near future.
+ - Added a bunch of algebraic multigrid preconditioner variants (contributed by Markus Wagner)
+ - Added (factored) sparse approximate inverse preconditioner (SPAI, contributed by Nikolay Lukash)
+ - Added fast Fourier transform (FFT) for vector sizes with a power of two, standard Fourier transform for other sizes (contributed by Volodymyr Kysenko)
+ - Additional structured matrix classes for circulant matrices, Hankel matrices, Toeplitz matrices and Vandermonde matrices (contributed by Volodymyr Kysenko)
+ - Added reordering algorithms (Cuthill-McKee and Gibbs-Poole-Stockmeyer, contributed by Philipp Grabenweger)
+ - Refurbished CMake build system (thanks to Michael Wild)
+ - Added matrix and vector proxy objects for submatrix and subvector manipulation
+ - Added (possibly GPU-assisted) QR factorization
+ - Per default, a viennacl::ocl::context now consists of one device only. The rationale is to provide better out-of-the-box support for machines with hybrid graphics (two GPUs), where one GPU may not be capable of double precision support.
+ - Fixed problems with viennacl::compressed_matrix which occurred if the number of rows and columns differed
+ - Improved documentation for the case of multiple custom kernels within a program
+ - Improved matrix-matrix multiplication kernels (may lead to up to 20 percent performance gains)
+ - Fixed problems in GMRES for small matrices (dimensions smaller than the maximum number of Krylov vectors)
+
+
+*** Version 1.1.x ***
+
+-- Version 1.1.2 --
+This final release of the ViennaCL 1.1.x family focuses on refurbishing existing functionality:
+ - Fixed a bug with partial vector copies from CPU to GPU (thanks to sourceforge.net user kaiwen).
+ - Corrected error estimations in CG and BiCGStab iterative solvers (thanks to Riccardo Rossi for the hint).
+ - Improved performance of CG and BiCGStab as well as Jacobi and row-scaling preconditioners considerably (thanks to Farshid Mossaiby and Riccardo Rossi for a lot of input).
+ - Corrected linker statements in CMakeLists.txt for MacOS (thanks to Eric Christiansen).
+ - Improved handling of ViennaCL types (direct construction, output streaming of matrix- and vector-expressions, etc.).
+ - Updated old code in the coordinate_matrix type and improved performance (thanks to Dongdong Li for finding this).
+ - Using size_t instead of unsigned int for the size type on the host.
+ - Updated double precision support detection for AMD hardware.
+ - Fixed a name clash in direct_solve.hpp and ilu.hpp (thanks to sourceforge.net user random).
+ - Prevented unsupported assignments and copies of sparse matrix types (thanks to sourceforge.net user kszyh).
+
+-- Version 1.1.1 --
+This new revision release has a focus on better interaction with other linear algebra libraries. The few known glitches with version 1.1.0 are now removed.
+ - Fixed compilation problems on MacOS X and OpenCL 1.0 header files due to an undefined preprocessor constant (thanks to Vlad-Andrei Lazar and Evan Bollig for reporting this)
+ - Removed the accidental external linkage for three functions (we appreciate the report by Gordon Stevenson).
+ - New out-of-the-box support for Eigen and MTL libraries. Iterative solvers from ViennaCL can now directly be used with both libraries.
+ - Fixed a problem with GMRES when system matrix is smaller than the maximum Krylov space dimension.
+ - Better default parameter for BLAS3 routines leads to higher performance for matrix-matrix-products.
+ - Added benchmark for dense matrix-matrix products (BLAS3 routines).
+ - Added viennacl-info example that displays infos about the OpenCL backend used by ViennaCL.
+ - Cleaned up CMakeLists.txt in order to selectively enable builds that rely on external libraries.
+ - More than one installed OpenCL platform is now allowed (thanks to Aditya Patel).
+
+
+-- Version 1.1.0 --
+A large number of new features and improvements over the 1.0.5 release are now available:
+ - The completely rewritten OpenCL back-end allows for multiple contexts, multiple devices and even to wrap existing OpenCL resources into ViennaCL objects. A tutorial demonstrates the new functionality. Thanks to Josip Basic for pushing us into that direction.
+ - The tutorials are now named according to their purpose.
+ - The dense matrix type now supports both row-major and column-major
+storage.
+ - Dense and sparse matrix types can now be filled using STL-emulated types (std::vector< std::vector<NumericT> > and std::vector< std::map< unsigned int, NumericT> >)
+ - BLAS level 3 functionality is now complete. We are very happy with the general out-of-the-box performance of matrix-matrix-products, even though it cannot beat the extremely tuned implementations tailored to certain matrix sizes on a particular device yet.
+ - An automated performance tuning environment allows an optimization of the kernel parameters for the library user's machine. Best parameters can be obtained from a tuning run and stored in a XML file and read at program startup using pugixml.
+ - Two new preconditioners are now included: A Jacobi preconditioner and a row-scaling preconditioner. In contrast to ILUT, they are applied on the OpenCL device directly.
+ - Clean compilation of all examples under Visual Studio 2005 (we recommend newer compilers though...).
+ - Error handling is now carried out using C++ exceptions.
+ - Matrix Market now uses index base 1 per default (thanks to Evan Bollig for reporting that)
+ - Improved performance of norm_X kernels.
+ - Iterative solver tags now have consistent constructors: First argument is the relative tolerance, second argument is the maximum number of total iterations. Other arguments depend on the respective solver.
+ - A few minor improvements here and there (thanks go to Riccardo Rossi and anonymous sourceforge.net users for reporting the issues)
+
+*** Version 1.0.x ***
+
+-- Version 1.0.5 --
+This is the last 1.0.x release. The main changes are as follows:
+ - Added a reader and writer for MatrixMarket files (thanks to Evan Bollig for suggesting that)
+ - Eliminated a bug that caused the upper triangular direct solver to fail on NVIDIA hardware for large matrices (thanks to Andrew Melfi for finding that)
+ - The number of iterations and the final estimated error can now be obtained from iterative solver tags.
+ - Improvements provided by Klaus Schnass are included in the developer converter script (OpenCL kernels to C++ header)
+ - Disabled the use of reference counting for OpenCL handles on Mac OS X (caused seg faults on program exit)
+
+-- Version 1.0.4 --
+The changes in this release are:
+ - All tutorials now work out-of the box with Visual Studio 2008.
+ - Eliminated all ViennaCL related warnings when compiling with Visual Studio 2008.
+ - Better (experimental) support for double precision on ATI GPUs, but no norm_1, norm_2, norm_inf and index_norm_inf functions using ATI Stream SDK on GPUs in double precision.
+ - Fixed a bug in GMRES that caused segmentation faults under Windows.
+ - Fixed a bug in const_sparse_matrix_adapter (thanks to Abhinav Golas and Nico Galoppo for almost simultaneous emails on that)
+ - Corrected incorrect return values in the sparse matrix regression test suite (thanks to Klaus Schnass for the hint)
+
+
+-- Version 1.0.3 --
+The main improvements in this release are:
+ - Support for multi-core CPUs with ATI Stream SDK (thanks to Riccardo Rossi, UPC. BARCELONA TECH, for suggesting this)
+ - inner_prod is now up to a factor of four faster (thanks to Serban Georgescu, ETH, for pointing the poor performance of the old implementation out)
+ - Fixed a bug with plane_rotation that caused system freezes with ATI GPUs.
+ - Extended the doxygen generated reference documentation 
+
+
+-- Version 1.0.2 --
+A bug-fix release that resolves some problems with the Visual C++ compiler.
+ - Fixed some compilation problems under Visual C++ (version 2005 and 2008).
+ - All tutorials accidentally relied on ublas. Now tut1 and tut5 can be compiled without ublas.
+ - Renamed aux/ folder to auxiliary/ (caused some problems on windows machines)
+
+-- Version 1.0.1 --
+This is a quite large revision of ViennaCL 1.0.0, but mainly improves things under the hood.
+ - Fixed a bug in lu_substitute for dense matrices
+ - Changed iterative solver behavior to stop if a certain relative residual is reached
+ - ILU preconditioning is now fully done on the CPU, because this gives best overall performance
+ - All OpenCL handles of ViennaCL types can now be accessed via member function handle()
+ - Improved GPU performance of GMRES by about a factor of two.
+ - Added generic norm_2 function in header file norm_2.hpp
+ - Wrapper for clFlush() and clFinish() added
+ - Device information can be queried by device.info()
+ - Extended documentation and tutorials
+
+-- Version 1.0.0 --
+First release
+
diff --git a/cmake/FindOpenCL.cmake b/cmake/FindOpenCL.cmake
index e77ee3c..a237116 100644
--- a/cmake/FindOpenCL.cmake
+++ b/cmake/FindOpenCL.cmake
@@ -68,8 +68,10 @@ find_package_handle_standard_args(
   )
 
 if(OPENCL_FOUND)
+  set(OPENCL_INCLUDE_DIRS ${OPENCL_INCLUDE_DIR})
   set(OPENCL_LIBRARIES ${OPENCL_LIBRARY})
 else(OPENCL_FOUND)
+  set(OPENCL_INCLUDE_DIRS)
   set(OPENCL_LIBRARIES)
 endif(OPENCL_FOUND)
 
diff --git a/cmake/ViennaCLCommon.cmake b/cmake/ViennaCLCommon.cmake
index 5ae0a74..3939b4f 100644
--- a/cmake/ViennaCLCommon.cmake
+++ b/cmake/ViennaCLCommon.cmake
@@ -1,4 +1,11 @@
 
+# do not build tests by default, since they require Boost
+if (VIENNACL_SRC_DIST)
+ option(BUILD_TESTING "Build the tests " ON)
+else (VIENNACL_SRC_DIST)
+ option(BUILD_TESTING "Build the tests " OFF)
+endif(VIENNACL_SRC_DIST)
+
 include(CTest)
 include(CMakeDependentOption)
 
@@ -69,7 +76,7 @@ IF (BOOSTPATH)
 ENDIF (BOOSTPATH)
 
 
-if(BUILD_EXAMPLES OR BUILD_TESTING OR VIENNACL_SRC_DIST)
+if(ENABLE_UBLAS OR BUILD_TESTING OR VIENNACL_SRC_DIST)
    set(Boost_USE_MULTITHREADED TRUE)
    find_package(Boost REQUIRED COMPONENTS filesystem system)
 endif()
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
index 5cc3032..a4a1b32 100644
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -31,7 +31,7 @@ PROJECT_NAME           = "ViennaCL - The Vienna Computing Library"
 # This could be handy for archiving the generated documentation or 
 # if some version control system is used.
 
-PROJECT_NUMBER         = 1.2.0
+PROJECT_NUMBER         = 1.2.1
 
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) 
 # base path where the generated documentation will be put. 
diff --git a/doc/manual/algorithms.tex b/doc/manual/algorithms.tex
index 83c3270..b38a64a 100644
--- a/doc/manual/algorithms.tex
+++ b/doc/manual/algorithms.tex
@@ -386,3 +386,34 @@ The algorithms are called for a \lstinline|matrix| of a type compatible with \ls
 \end{lstlisting}
 and return the permutation array. In {\ViennaCLversion}, the user then needs to manually reorder the sparse matrix based on the permutation array. Example code
 can be found in \lstinline|examples/tutorial/bandwidth-reduction.cpp|.
+
+
+
+\section{QR Factorization}
+\NOTE{The QR factorization is experimental in {\ViennaCLversion}. Interface changes as well as considerable performance improvements may
+be included in future releases!}
+
+A matrix $A \in \mathbb{R}^{n\times m}$ can be factored into $A = Q R$, where $Q \in \mathbb{R}^{n\times n}$ is an
+orthogonal matrix and $R \in \mathbb{R}^{n \times m}$ is upper triangular. This so-called QR-factorization is important for eigenvalue computations as well as
+for the solution of least-squares problems \cite{golub:matrix-computations}. {\ViennaCL} provides a generic implementation of the QR-factorization using
+Householder reflections in file \lstinline|viennacl/linalg/qr.hpp|. An example application can be found in \lstinline|examples/tutorial/qr.hpp|.
+
+The Householder reflectors $v_i$ defining the Householder reflection $I - \beta_i v_i v_i^{\mathrm{T}}$ are stored in the
+columns below the diagonal of the input matrix $A$ \cite{golub:matrix-computations}. The normalization coefficients $\beta_i$ are returned by the
+worker function \lstinline|inplace_qr|. The upper triangular matrix $R$ is directly written to the upper triangular part of $A$. 
+\begin{lstlisting}
+  std::vector<ScalarType> betas = viennacl::linalg::inplace_qr(A, 12);
+\end{lstlisting}
+If $A$ is a dense matrix from \ublas, the calculation is carried out on the CPU using a single thread. If $A$ is a 
+\lstinline|viennacl::matrix|, a hybrid implementation is used: The panel factorization is carried out using \ublas, while expensive BLAS level 3 operations
+are computed on the OpenCL device using multiple threads. 
+
+\NOTE{The number of columns of the input matrix must be a multiple of the block size in {\ViennaCLversion}.}
+
+Typically, the orthogonal matrix $Q$ is kept in implicit form because of computational efficiency.
+However, if $Q$ and $R$ have to be computed explicitly, the function \lstinline|recoverQ| can be used:
+\begin{lstlisting}
+  viennacl::linalg::recoverQ(A, betas, Q, R); 
+\end{lstlisting}
+Here, \lstinline|A| is the inplace QR-factored matrix, \lstinline|betas| are the coefficients of the Householder reflectors as returned by
+\lstinline|inplace_qr|, while \lstinline|Q| and \lstinline|R| are the destination matrices.
diff --git a/doc/manual/changelogs.tex b/doc/manual/changelogs.tex
index 39e6c63..2eae9a8 100644
--- a/doc/manual/changelogs.tex
+++ b/doc/manual/changelogs.tex
@@ -3,6 +3,23 @@
 
 \section*{Version 1.2.x}
  
+\subsection*{Version 1.2.1}
+The current release mostly provides a few bug fixes for experimental features introduced in 1.2.0.
+In addition, performance improvements for matrix-matrix multiplications are applied.
+The main changes (in addition to some internal adjustments) are as follows:
+\begin{itemize}
+ \item Fixed problems with double precision on AMD GPUs supporting \lstinline|cl_amd_fp64| instead of \lstinline|cl_khr_fp64| (thanks to Sylvain R.)
+ \item Considerable improvements in the handling of \lstinline|matrix_range|. Added project() function for convenience (cf. Boost.uBLAS)
+ \item Further improvements of matrix-matrix multiplication performance (contributed by Volodymyr Kysenko)
+ \item Improved performance of QR factorization
+ \item Added direct element access to elements of \lstinline|compressed_matrix| using \lstinline|operator()| (thanks to sourceforge.net user Sulif for the hint)
+ \item Fixed incorrect matrix dimensions obtained with the transfer of non-square sparse Eigen and MTL matrices to ViennaCL objects (thanks to sourceforge.net user ggrocca for pointing at this)
+\end{itemize}
+
+
+
+
+
 \subsection*{Version 1.2.0}
 Many new features from the Google Summer of Code and the I$\mu$E Summer of Code enter this release.
 Due to their complexity, they are for the moment still in \textit{experimental} state (see the respective chapters for details) and are expected to reach maturity with the 1.3.0 release.
@@ -10,7 +27,7 @@ Shorter release cycles are planned for the near future.
 \begin{itemize}
  \item Added a bunch of algebraic multigrid preconditioner variants (contributed by Markus Wagner)
  \item Added (factored) sparse approximate inverse preconditioner (SPAI, contributed by Nikolay Lukash)
- \item Added fast Fourier transform (FFT) for vector sizes with a power of two, tandard Fourier transform for other sizes (contributed by Volodymyr Kysenko)
+ \item Added fast Fourier transform (FFT) for vector sizes with a power of two, standard Fourier transform for other sizes (contributed by Volodymyr Kysenko)
  \item Additional structured matrix classes for circulant matrices, Hankel matrices, Toeplitz matrices and Vandermonde matrices (contributed by Volodymyr Kysenko)
  \item Added reordering algorithms (Cuthill-McKee and Gibbs-Poole-Stockmeyer, contributed by Philipp Grabenweger)
  \item Refurbished CMake build system (thanks to Michael Wild)
diff --git a/doc/manual/cover.tex b/doc/manual/cover.tex
index 49fbb40..7482e64 100644
--- a/doc/manual/cover.tex
+++ b/doc/manual/cover.tex
@@ -2,7 +2,7 @@
 \begin{titlepage}
 
 \vspace*{3cm}
-\Huge{ViennaCL 1.2.0} 
+\Huge{ViennaCL 1.2.1} 
 \rule[0.0cm]{9.5cm}{0.05cm}
 \begin{flushright}
 \Large{User Manual}
diff --git a/doc/manual/operations.tex b/doc/manual/operations.tex
index 8236abd..205654a 100644
--- a/doc/manual/operations.tex
+++ b/doc/manual/operations.tex
@@ -43,11 +43,6 @@ The interface for level 2 BLAS functions in {\ViennaCL} is similar to that of
 \TIP{For full details on level 2 functions, refer to the reference documentation
 located in \texttt{doc/doxygen/}}
 
-\TIP{{\ViennaCL} is not only able to solve triangular matrices, as requested by
-BLAS, it provides several iterative solvers for the solution of large systems of
-equations. See Section \ref{sec:iterative-solvers} for more details on iterative
-solvers.}
-
 
 \begin{table}[tb]
 \begin{center}
diff --git a/doc/manual/viennacl.bib b/doc/manual/viennacl.bib
index 18987a0..6206b13 100644
--- a/doc/manual/viennacl.bib
+++ b/doc/manual/viennacl.bib
@@ -142,3 +142,11 @@
  pages = {190--194},
  publisher = {ACM},
 } 
+
+ at book{golub:matrix-computations,
+ author={Golub, G.~H. and Van Loan, C.~F.},
+ title = {Matrix Computations},
+ publisher = {Johns Hopkins University Press},
+ year = {1996}
+}
+
diff --git a/doc/manual/viennacl.tex b/doc/manual/viennacl.tex
index fbc4f36..4833a62 100644
--- a/doc/manual/viennacl.tex
+++ b/doc/manual/viennacl.tex
@@ -56,7 +56,7 @@
 \newcommand{\OpenMP} {\texttt{OpenMP}}
 \newcommand{\OpenCL} {\texttt{OpenCL}}
 \newcommand{\ViennaCL} {\texttt{ViennaCL}}
-\newcommand{\ViennaCLversion} {\texttt{ViennaCL 1.2.0}}
+\newcommand{\ViennaCLversion} {\texttt{ViennaCL 1.2.1}}
 \newcommand{\ViennaCLminorversion} {\texttt{ViennaCL 1.2.x}}
 \newcommand{\Boost} {\texttt{Boost}}
 \newcommand{\ublas} {\texttt{ublas}}
diff --git a/examples/benchmarks/CMakeLists.txt b/examples/benchmarks/CMakeLists.txt
index bfc5636..067d289 100644
--- a/examples/benchmarks/CMakeLists.txt
+++ b/examples/benchmarks/CMakeLists.txt
@@ -5,9 +5,13 @@ endforeach()
 
 if(ENABLE_UBLAS)
    include_directories(${Boost_INCLUDE_DIRS})
-   foreach(bench sparse solver)
+   foreach(bench sparse solver iccs_qr)
       add_executable(${bench}bench ${bench}.cpp)
       target_link_libraries(${bench}bench ${OPENCL_LIBRARIES})
    endforeach()
 endif()
 
+IF(CMAKE_COMPILER_IS_GNUCXX)
+   #ADD_DEFINITIONS(-Wall -pedantic -O0 -g)
+   ADD_DEFINITIONS(-Wall -pedantic -O3)
+ENDIF(CMAKE_COMPILER_IS_GNUCXX)
diff --git a/examples/benchmarks/benchmark-utils.hpp b/examples/benchmarks/benchmark-utils.hpp
index 05a5032..25db8c1 100644
--- a/examples/benchmarks/benchmark-utils.hpp
+++ b/examples/benchmarks/benchmark-utils.hpp
@@ -2,7 +2,7 @@
 #define _BENCHMARK_UTILS_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/benchmarks/blas3.cpp b/examples/benchmarks/blas3.cpp
index f7bc073..583570e 100644
--- a/examples/benchmarks/blas3.cpp
+++ b/examples/benchmarks/blas3.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -103,7 +103,7 @@ int run_benchmark()
     viennacl::ocl::get_queue().finish();
     exec_time = timer.get();
     std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
-    std::cout << " - GFLOPs: " << (vcl_A.size1() / 1000.0) * (vcl_A.size2() / 1000.0) * (vcl_B.size2() / 1000.0) / exec_time << std::endl;
+    std::cout << " - GFLOPs (counting multiply&add as one operation): " << (vcl_A.size1() / 1000.0) * (vcl_A.size2() / 1000.0) * (vcl_B.size2() / 1000.0) / exec_time << std::endl;
     std::cout << std::endl;
   }
   
diff --git a/examples/benchmarks/iccs_qr.cpp b/examples/benchmarks/iccs_qr.cpp
new file mode 100644
index 0000000..c97dfcb
--- /dev/null
+++ b/examples/benchmarks/iccs_qr.cpp
@@ -0,0 +1,139 @@
+/* =========================================================================
+   Copyright (c) 2010-2012, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+               
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#define VIENNACL_HAVE_UBLAS
+#ifndef NDEBUG
+ #define NDEBUG
+#endif
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <cmath>
+#include <algorithm>
+#include <stdio.h>
+#include <sys/time.h>
+#include <time.h>
+#include "benchmark-utils.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/linalg/cg.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/ilu.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/qr.hpp"
+#include "boost/numeric/ublas/vector.hpp"
+#include "boost/numeric/ublas/matrix.hpp"
+#include "boost/numeric/ublas/io.hpp"
+
+           
+//typedef viennacl::compressed_matrix<float> SparseMatrix;
+using namespace boost::numeric::ublas;
+//using namespace viennacl::linalg;
+
+
+void run(size_t rows, size_t cols, std::size_t block_size, double with_cpu = true)
+{
+  typedef float               ScalarType;
+  typedef boost::numeric::ublas::matrix<ScalarType, boost::numeric::ublas::column_major>        MatrixType;
+  typedef boost::numeric::ublas::vector<ScalarType>                   VectorType;
+  
+  typedef viennacl::matrix<ScalarType, viennacl::column_major>   VCLMatrixType;
+  typedef viennacl::vector<ScalarType>   VCLVectorType;
+  
+  Timer timer;
+  double elapsed = 0;
+  
+  MatrixType A(rows, cols);
+
+  for (size_t i=0; i<rows; ++i)
+  {
+    for (size_t j=0; j<cols; ++j)
+    {
+      A(i,j) = 1.0 + (i + 1)*(j+1);
+    }
+  }
+  
+  VCLVectorType dummy(10);
+  VCLMatrixType vcl_A(rows, cols);
+    
+  viennacl::copy(A, vcl_A);
+
+  std::cout << "Benchmark size: " << rows << " x " << cols << std::endl;
+
+  
+  //
+  //  CPU:
+  //
+  
+  if (with_cpu)
+  {
+    timer.start();
+    std::vector<ScalarType> betas_cpu = viennacl::linalg::inplace_qr_ublas(A, block_size);
+    elapsed = timer.get();
+    std::cout << "Time for QR on CPU: " << elapsed << std::endl;
+  }
+  
+  //
+  //  GPU:
+  //
+  viennacl::ocl::get_queue().finish();
+  timer.start();
+  std::vector<ScalarType> betas_gpu = viennacl::linalg::inplace_qr_viennacl(vcl_A, block_size);
+  viennacl::ocl::get_queue().finish();
+  elapsed = timer.get();
+  std::cout << "Time for QR on GPU: " << elapsed << std::endl;
+    
+  //
+  //  Hybrid:
+  //
+  viennacl::ocl::get_queue().finish();
+  timer.start();
+  std::vector<ScalarType> betas_hybrid = viennacl::linalg::inplace_qr_hybrid(vcl_A, block_size);
+  viennacl::ocl::get_queue().finish();
+  elapsed = timer.get();
+  std::cout << "Time for QR on CPU/GPU: " << elapsed << std::endl;
+  
+}
+
+
+
+
+int main (int argc, const char * argv[])
+{
+   run(200, 200, 20);    
+   
+   std::size_t max_size = 3200;
+ 
+   std::cout << "---- block size: 20 -----" << std::endl;
+   for (std::size_t i=100; i<=max_size; i*=2)
+     run(i, i, 20);    
+
+   std::cout << "---- block size: 50 -----" << std::endl;
+   for (std::size_t i=100; i<=max_size; i*=2)
+     run(i, i, 50);    
+    
+   std::cout << "---- block size: 100 -----" << std::endl;
+   for (std::size_t i=100; i<=max_size; i*=2)
+     run(i, i, 100);    
+    
+   
+   return EXIT_SUCCESS;
+}
+
diff --git a/examples/benchmarks/io.hpp b/examples/benchmarks/io.hpp
index 96b1d12..2f83637 100644
--- a/examples/benchmarks/io.hpp
+++ b/examples/benchmarks/io.hpp
@@ -2,7 +2,7 @@
 #define VECTOR_IO_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/benchmarks/opencl.cpp b/examples/benchmarks/opencl.cpp
index 776fdca..ca0b791 100644
--- a/examples/benchmarks/opencl.cpp
+++ b/examples/benchmarks/opencl.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/benchmarks/qr.cpp b/examples/benchmarks/qr.cpp
index 598100e..b0b96ca 100644
--- a/examples/benchmarks/qr.cpp
+++ b/examples/benchmarks/qr.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/benchmarks/solver.cpp b/examples/benchmarks/solver.cpp
index 562b102..f676dfc 100644
--- a/examples/benchmarks/solver.cpp
+++ b/examples/benchmarks/solver.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/benchmarks/sparse.cpp b/examples/benchmarks/sparse.cpp
index 6832715..471a06f 100644
--- a/examples/benchmarks/sparse.cpp
+++ b/examples/benchmarks/sparse.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/benchmarks/vector.cpp b/examples/benchmarks/vector.cpp
index f079b7e..45766e1 100644
--- a/examples/benchmarks/vector.cpp
+++ b/examples/benchmarks/vector.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/parameters/benchmark-utils.hpp b/examples/parameters/benchmark-utils.hpp
index 05a5032..25db8c1 100644
--- a/examples/parameters/benchmark-utils.hpp
+++ b/examples/parameters/benchmark-utils.hpp
@@ -2,7 +2,7 @@
 #define _BENCHMARK_UTILS_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/parameters/common.hpp b/examples/parameters/common.hpp
index a7cb07e..ad55fe3 100644
--- a/examples/parameters/common.hpp
+++ b/examples/parameters/common.hpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/parameters/common_vprof.hpp b/examples/parameters/common_vprof.hpp
index bd9548c..c39aad3 100644
--- a/examples/parameters/common_vprof.hpp
+++ b/examples/parameters/common_vprof.hpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/parameters/matrix.cpp b/examples/parameters/matrix.cpp
index c4591b3..0ea7be0 100644
--- a/examples/parameters/matrix.cpp
+++ b/examples/parameters/matrix.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/parameters/matrix_functors.hpp b/examples/parameters/matrix_functors.hpp
index 726b474..bfde5ae 100644
--- a/examples/parameters/matrix_functors.hpp
+++ b/examples/parameters/matrix_functors.hpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/parameters/parameter_reader.cpp b/examples/parameters/parameter_reader.cpp
index e251b0b..52a92af 100644
--- a/examples/parameters/parameter_reader.cpp
+++ b/examples/parameters/parameter_reader.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/parameters/sparse.cpp b/examples/parameters/sparse.cpp
index 2181387..1ebead4 100644
--- a/examples/parameters/sparse.cpp
+++ b/examples/parameters/sparse.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/parameters/vector.cpp b/examples/parameters/vector.cpp
index 99ef6a7..31953a9 100644
--- a/examples/parameters/vector.cpp
+++ b/examples/parameters/vector.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/parameters/vector_functors.hpp b/examples/parameters/vector_functors.hpp
index 78c2072..d3285e0 100644
--- a/examples/parameters/vector_functors.hpp
+++ b/examples/parameters/vector_functors.hpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/tutorial/CMakeLists.txt b/examples/tutorial/CMakeLists.txt
index e4fd771..1de176b 100644
--- a/examples/tutorial/CMakeLists.txt
+++ b/examples/tutorial/CMakeLists.txt
@@ -5,7 +5,7 @@ endforeach()
 
 if(ENABLE_UBLAS)
    include_directories(${Boost_INCLUDE_DIRS})
-   foreach(tut amg blas2 blas3 iterative iterative-ublas matrix-range qr spai structured-matrices vector-range)
+   foreach(tut amg blas2 blas3 iterative iterative-ublas matrix-range qr spai sparse structured-matrices vector-range)
       add_executable(${tut} ${tut}.cpp)
       target_link_libraries(${tut} ${OPENCL_LIBRARIES})
    endforeach()
diff --git a/examples/tutorial/CMakeLists.txt b/examples/tutorial/CMakeLists.txt~
similarity index 81%
copy from examples/tutorial/CMakeLists.txt
copy to examples/tutorial/CMakeLists.txt~
index e4fd771..8772248 100644
--- a/examples/tutorial/CMakeLists.txt
+++ b/examples/tutorial/CMakeLists.txt~
@@ -5,7 +5,7 @@ endforeach()
 
 if(ENABLE_UBLAS)
    include_directories(${Boost_INCLUDE_DIRS})
-   foreach(tut amg blas2 blas3 iterative iterative-ublas matrix-range qr spai structured-matrices vector-range)
+   foreach(tut amg blas2 blas3 iterative iterative-ublas matrix-range qr spai sparse structured-matrices vector-range)
       add_executable(${tut} ${tut}.cpp)
       target_link_libraries(${tut} ${OPENCL_LIBRARIES})
    endforeach()
@@ -25,3 +25,8 @@ if(ENABLE_MTL4)
       target_link_libraries(${tut} ${OPENCL_LIBRARIES})
    endforeach()
 endif()
+
+IF(CMAKE_COMPILER_IS_GNUCXX)
+   #ADD_DEFINITIONS(-Wall -pedantic -O0 -g)
+   ADD_DEFINITIONS(-Wall -pedantic -O3)
+ENDIF(CMAKE_COMPILER_IS_GNUCXX)
diff --git a/examples/tutorial/Random.hpp b/examples/tutorial/Random.hpp
index 613c937..93d37ca 100644
--- a/examples/tutorial/Random.hpp
+++ b/examples/tutorial/Random.hpp
@@ -1,6 +1,6 @@
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/tutorial/amg.cpp b/examples/tutorial/amg.cpp
index ffd4b37..e7b8212 100755
--- a/examples/tutorial/amg.cpp
+++ b/examples/tutorial/amg.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/tutorial/bandwidth-reduction.cpp b/examples/tutorial/bandwidth-reduction.cpp
index 52c4297..5f0a057 100644
--- a/examples/tutorial/bandwidth-reduction.cpp
+++ b/examples/tutorial/bandwidth-reduction.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -249,7 +249,7 @@ int main(int argc, char *argv[])
   //
   std::cout << "-- Advanced Cuthill-McKee algorithm --" << std::endl;
   double a = 0.0;
-  double gmax = 1;
+  std::size_t gmax = 1;
   r = viennacl::reorder(matrix2, viennacl::advanced_cuthill_mckee_tag(a, gmax));
   std::cout << " * Reordered bandwidth: " << calc_reordered_bw(matrix2, r) << std::endl;
   
diff --git a/examples/tutorial/blas1.cpp b/examples/tutorial/blas1.cpp
index 9c51c37..e9c52d2 100644
--- a/examples/tutorial/blas1.cpp
+++ b/examples/tutorial/blas1.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/tutorial/blas2.cpp b/examples/tutorial/blas2.cpp
index bc7c5f1..9b57ef4 100644
--- a/examples/tutorial/blas2.cpp
+++ b/examples/tutorial/blas2.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/tutorial/blas3.cpp b/examples/tutorial/blas3.cpp
index 94dbbff..db17e25 100644
--- a/examples/tutorial/blas3.cpp
+++ b/examples/tutorial/blas3.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/tutorial/blas3range.cpp b/examples/tutorial/blas3range.cpp
index 1dfcf01..d7e376e 100644
--- a/examples/tutorial/blas3range.cpp
+++ b/examples/tutorial/blas3range.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/tutorial/custom-context.cpp b/examples/tutorial/custom-context.cpp
index 602b567..0625843 100644
--- a/examples/tutorial/custom-context.cpp
+++ b/examples/tutorial/custom-context.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -212,7 +212,7 @@ int main()
   // Proof that ViennaCL really uses the new context:
   //
   std::cout << "Existing context: " << my_context << std::endl;
-  std::cout << "ViennaCL uses context: " << viennacl::ocl::current_context().handle() << std::endl;
+  std::cout << "ViennaCL uses context: " << viennacl::ocl::current_context().handle().get() << std::endl;
 
   //
   // Wrap existing OpenCL objects into ViennaCL:
diff --git a/examples/tutorial/custom-kernels.cpp b/examples/tutorial/custom-kernels.cpp
index 135db1b..920c294 100644
--- a/examples/tutorial/custom-kernels.cpp
+++ b/examples/tutorial/custom-kernels.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/tutorial/eigen-with-viennacl.cpp b/examples/tutorial/eigen-with-viennacl.cpp
index 5c1aa3b..d501b53 100644
--- a/examples/tutorial/eigen-with-viennacl.cpp
+++ b/examples/tutorial/eigen-with-viennacl.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -104,33 +104,33 @@ void run_test()
   //
   // Create and fill dense matrices from the Eigen library:
   //
-  EigenMatrix eigen_densemat(5, 5);
-  EigenMatrix eigen_densemat2(5, 5);
+  EigenMatrix eigen_densemat(6, 5);
+  EigenMatrix eigen_densemat2(6, 5);
   eigen_densemat(0,0) = 2.0;   eigen_densemat(0,1) = -1.0;
   eigen_densemat(1,0) = -1.0;  eigen_densemat(1,1) =  2.0;  eigen_densemat(1,2) = -1.0;
   eigen_densemat(2,1) = -1.0;  eigen_densemat(2,2) = -1.0;  eigen_densemat(2,3) = -1.0;
   eigen_densemat(3,2) = -1.0;  eigen_densemat(3,3) =  2.0;  eigen_densemat(3,4) = -1.0;
-                               eigen_densemat(4,4) = -1.0;  eigen_densemat(4,4) = -1.0;
+                               eigen_densemat(5,4) = -1.0;  eigen_densemat(4,4) = -1.0;
 
   //
   // Create and fill sparse matrices from the Eigen library:
   //
-  Eigen::SparseMatrix<ScalarType, Eigen::RowMajor> eigen_sparsemat(5, 5);
-  Eigen::SparseMatrix<ScalarType, Eigen::RowMajor> eigen_sparsemat2(5, 5);
+  Eigen::SparseMatrix<ScalarType, Eigen::RowMajor> eigen_sparsemat(6, 5);
+  Eigen::SparseMatrix<ScalarType, Eigen::RowMajor> eigen_sparsemat2(6, 5);
   eigen_sparsemat.startFill(5*2);
   eigen_sparsemat.fill(0,0) = 2.0;   eigen_sparsemat.fill(0,1) = -1.0;
   eigen_sparsemat.fill(1,1) = 2.0;   eigen_sparsemat.fill(1,2) = -1.0;
   eigen_sparsemat.fill(2,2) = -1.0;  eigen_sparsemat.fill(2,3) = -1.0;
   eigen_sparsemat.fill(3,3) = 2.0;   eigen_sparsemat.fill(3,4) = -1.0;
-  eigen_sparsemat.fill(4,4) = -1.0;
+  eigen_sparsemat.fill(5,4) = -1.0;
   eigen_sparsemat.endFill();
   
   //
   // Create and fill a few vectors from the Eigen library:
   //
   EigenVector eigen_rhs(5);
-  EigenVector eigen_result(5);
-  EigenVector eigen_temp(5);
+  EigenVector eigen_result(6);
+  EigenVector eigen_temp(6);
 
   eigen_rhs(0) = 10.0;
   eigen_rhs(1) = 11.0;
@@ -143,9 +143,9 @@ void run_test()
   // Let us create the ViennaCL analogues:
   //
   viennacl::vector<ScalarType> vcl_rhs(5);
-  viennacl::vector<ScalarType> vcl_result(5);
-  viennacl::matrix<ScalarType> vcl_densemat(5, 5);
-  viennacl::compressed_matrix<ScalarType> vcl_sparsemat(5, 5);
+  viennacl::vector<ScalarType> vcl_result(6);
+  viennacl::matrix<ScalarType> vcl_densemat(6, 5);
+  viennacl::compressed_matrix<ScalarType> vcl_sparsemat(6, 5);
   
   
   //
@@ -156,6 +156,7 @@ void run_test()
   
   viennacl::copy(eigen_densemat, vcl_densemat);
   viennacl::copy(eigen_sparsemat, vcl_sparsemat);
+  std::cout << "VCL sparsematrix dimensions: " << vcl_sparsemat.size1() << ", " << vcl_sparsemat.size2() << std::endl;
   
   // For completeness: Copy matrices from ViennaCL back to Eigen:
   viennacl::copy(vcl_densemat, eigen_densemat2);
diff --git a/examples/tutorial/fft.cpp b/examples/tutorial/fft.cpp
index f0a6002..6150625 100644
--- a/examples/tutorial/fft.cpp
+++ b/examples/tutorial/fft.cpp
@@ -1,6 +1,6 @@
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/tutorial/iterative-eigen.cpp b/examples/tutorial/iterative-eigen.cpp
index 85ccf66..f9f7255 100644
--- a/examples/tutorial/iterative-eigen.cpp
+++ b/examples/tutorial/iterative-eigen.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/tutorial/iterative-mtl4.cpp b/examples/tutorial/iterative-mtl4.cpp
index 0b13760..5a05d4b 100644
--- a/examples/tutorial/iterative-mtl4.cpp
+++ b/examples/tutorial/iterative-mtl4.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/tutorial/iterative-ublas.cpp b/examples/tutorial/iterative-ublas.cpp
index c4f3157..24e52b8 100644
--- a/examples/tutorial/iterative-ublas.cpp
+++ b/examples/tutorial/iterative-ublas.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/tutorial/iterative.cpp b/examples/tutorial/iterative.cpp
index 4a7ee58..6b2ed06 100644
--- a/examples/tutorial/iterative.cpp
+++ b/examples/tutorial/iterative.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/tutorial/matrix-range.cpp b/examples/tutorial/matrix-range.cpp
index bdc91f4..51493dc 100644
--- a/examples/tutorial/matrix-range.cpp
+++ b/examples/tutorial/matrix-range.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/tutorial/mtl4-with-viennacl.cpp b/examples/tutorial/mtl4-with-viennacl.cpp
index e32c3dd..12f4ad1 100644
--- a/examples/tutorial/mtl4-with-viennacl.cpp
+++ b/examples/tutorial/mtl4-with-viennacl.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/tutorial/qr.cpp b/examples/tutorial/qr.cpp
index dd9d118..a355809 100644
--- a/examples/tutorial/qr.cpp
+++ b/examples/tutorial/qr.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -34,8 +34,55 @@
 #include <boost/numeric/ublas/matrix.hpp>
 #include <boost/numeric/ublas/io.hpp>
 
+
+//
+// Testing
+//
+#include "viennacl/range.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/matrix_proxy.hpp"
+
+
+//
+// A helper function checking the result
+//
+template <typename MatrixType>
+double check(MatrixType const & qr, MatrixType const & ref)
+{
+  bool do_break = false;
+  double max_error = 0;
+  for (std::size_t i=0; i<ref.size1(); ++i)
+  {
+    for (std::size_t j=0; j<ref.size2(); ++j)
+    {
+      if (qr(i,j) != 0.0 && ref(i,j) != 0.0)
+      {
+        double rel_err = fabs(qr(i,j) - ref(i,j)) / fabs(ref(i,j) );
+        
+        if (rel_err > max_error)
+          max_error = rel_err;
+      }
+      
+      
+      if (qr(i,j) != qr(i,j))
+      {
+        std::cout << "!!!" << std::endl;
+        std::cout << "!!! NaN detected at i=" << i << " and j=" << j << std::endl;
+        std::cout << "!!!" << std::endl;
+        do_break = true;
+        break;
+      }
+    }
+    if (do_break)
+      break;
+  }
+  return max_error;
+}
+
+
 /*
-*   Tutorial: QR factorization of host-based (ublas-)matrices
+*   Tutorial: QR factorization of matrices from ViennaCL or Boost.uBLAS
 */
 
 int main (int argc, const char * argv[])
@@ -43,52 +90,99 @@ int main (int argc, const char * argv[])
   typedef float               ScalarType;     //feel free to change this to 'double' if supported by your hardware
   typedef boost::numeric::ublas::matrix<ScalarType>        MatrixType;
   typedef boost::numeric::ublas::vector<ScalarType>        VectorType;
+  typedef viennacl::matrix<ScalarType, viennacl::column_major>        VCLMatrixType;
+  typedef viennacl::vector<ScalarType>        VCLVectorType;
 
-  size_t rows = 7;
-  size_t cols = 6;
+  std::size_t rows = 36;   //number of rows in the matrix
+  std::size_t cols = 48;   //number of columns
   
   //
   // Create matrices with some data
   //
-  MatrixType A(rows, cols);
+  MatrixType ublas_A(rows, cols);
   MatrixType Q(rows, rows);
   MatrixType R(rows, cols);
   
-  for (size_t i=0; i<rows; ++i)
+  // Some random data with a bit of extra weight on the diagonal
+  for (std::size_t i=0; i<rows; ++i)
   {
-    for (size_t j=0; j<cols; ++j)
+    for (std::size_t j=0; j<cols; ++j)
     {
-      A(i,j) = 1.0 + (i + 1)*(j+1);
+      ublas_A(i,j) = -1.0 + (i + 1)*(j+1)
+                     + ( (rand() % 1000) - 500.0) / 1000.0;
+
+      if (i == j)
+        ublas_A(i,j) += 10.0;
+                     
       R(i,j) = 0.0;
     }
     
-    for (size_t j=0; j<rows; ++j)
+    for (std::size_t j=0; j<rows; ++j)
       Q(i,j) = 0.0;
   }
   
-  // Print matrix before factorization
-  std::cout << "A: " << A << std::endl << std::endl;
+  // keep initial input matrix for comparison
+  MatrixType ublas_A_backup(ublas_A);
+  
+  
+  //
+  // Setup the matrix in ViennaCL:
+  //
+  VCLVectorType dummy(10);
+  VCLMatrixType vcl_A(ublas_A.size1(), ublas_A.size2());
+  
+  viennacl::copy(ublas_A, vcl_A);
   
   //
   // Compute QR factorization of A. A is overwritten with Householder vectors. Coefficients are returned and a block size of 3 is used.
   // Note that at the moment the number of columns of A must be divisible by the block size
   //
-  std::vector<ScalarType> betas = viennacl::linalg::inplace_qr(A, 3);
+
+  std::cout << "--- Boost.uBLAS ---" << std::endl;
+  std::vector<ScalarType> ublas_betas = viennacl::linalg::inplace_qr(ublas_A, 12);  //computes the QR factorization
+  
+  //
+  // A check for the correct result:
+  //
+  viennacl::linalg::recoverQ(ublas_A, ublas_betas, Q, R); 
+  MatrixType ublas_QR = prod(Q, R);
+  double ublas_error = check(ublas_QR, ublas_A_backup);
+  std::cout << "Max rel error (ublas): " << ublas_error << std::endl;
+  
+  //
+  // QR factorization in ViennaCL using OpenCL only
+  //
+  std::cout << "--- ViennaCL only ---" << std::endl;
+  std::vector<ScalarType> viennacl_betas = viennacl::linalg::inplace_qr_viennacl(vcl_A, 12); //this is a OpenCL-only implementation
+  viennacl::copy(vcl_A, ublas_A);
   
-  std::cout << "Inplace QR-factored A: " << A << std::endl << std::endl;
+  //
+  // A check for the correct result:
+  //
+  Q.clear(); R.clear();
+  viennacl::linalg::recoverQ(ublas_A, viennacl_betas, Q, R); 
+  double vcl_error = check(MatrixType(prod(Q, R)), ublas_A_backup);  //use the Q*R just recovered from the ViennaCL result, not the earlier uBLAS product
+  std::cout << "Max rel error (ViennaCL): " << vcl_error << std::endl;
+
   
   //
-  // Recover full matrix orthogonal matrix Q and upper right matrix R
+  // QR factorization in ViennaCL using Boost.uBLAS for the panel factorization
   //
-  viennacl::linalg::recoverQ(A, betas, Q, R); 
+  std::cout << "--- Hybrid (default) ---" << std::endl;
+  viennacl::copy(ublas_A_backup, vcl_A);
+  std::vector<ScalarType> hybrid_betas = viennacl::linalg::inplace_qr(vcl_A, 12);
+  
+
+  //
+  // A check for the correct result:
+  //
+  viennacl::copy(vcl_A, ublas_A);
+  Q.clear(); R.clear();
+  viennacl::linalg::recoverQ(ublas_A, hybrid_betas, Q, R); 
+  double hybrid_error = check(MatrixType(prod(Q, R)), ublas_A_backup);  //use the Q*R just recovered from the hybrid result, not the earlier uBLAS product
+  std::cout << "Max rel error (hybrid): " << hybrid_error << std::endl;
 
-  // Print for verification purposes:
-  std::cout << "R after recovery: " << R << std::endl << std::endl;
-  std::cout << "Q after recovery: " << Q << std::endl << std::endl;
-  std::cout << "Q*Q^T: " << prod(Q, trans(Q)) << std::endl << std::endl;  //This should be identity up to round-off errors
   
-  std::cout << "Q * R: " << prod(Q, R) << std::endl << std::endl; // this should be the initial matrix A up to round-off errors
-    
   //
   //  That's it.
   //
diff --git a/examples/tutorial/spai.cpp b/examples/tutorial/spai.cpp
index b2b339d..3bdd395 100644
--- a/examples/tutorial/spai.cpp
+++ b/examples/tutorial/spai.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/tutorial/sparse.cpp b/examples/tutorial/sparse.cpp
new file mode 100644
index 0000000..5994e19
--- /dev/null
+++ b/examples/tutorial/sparse.cpp
@@ -0,0 +1,114 @@
+/* =========================================================================
+   Copyright (c) 2010-2012, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp at iue.tuwien.ac.at
+               
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+//
+// include necessary system headers
+//
+#include <iostream>
+
+
+//
+// ublas includes
+//
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/triangular.hpp>
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/operation.hpp>
+#include <boost/numeric/ublas/operation_sparse.hpp>
+#include <boost/numeric/ublas/io.hpp>
+#include <boost/numeric/ublas/lu.hpp>
+
+// Must be set if you want to use ViennaCL algorithms on ublas objects
+#define VIENNACL_HAVE_UBLAS 1
+
+
+//
+// ViennaCL includes
+//
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/coordinate_matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/ilu.hpp"
+#include "viennacl/linalg/jacobi_precond.hpp"
+#include "viennacl/linalg/cg.hpp"
+#include "viennacl/linalg/bicgstab.hpp"
+#include "viennacl/linalg/gmres.hpp"
+#include "viennacl/io/matrix_market.hpp"
+
+
+// Some helper functions for this tutorial:
+#include "Random.hpp"
+#include "vector-io.hpp"
+
+
+/*
+*
+*   Tutorial:  Modification of sparse matrices
+*   
+*/
+using namespace boost::numeric;
+
+
+int main()
+{
+  typedef float       ScalarType;
+  
+  std::size_t size = 5;
+  
+  //
+  // Set up some ublas objects
+  //
+  ublas::vector<ScalarType> rhs(size, size);
+  ublas::compressed_matrix<ScalarType> ublas_matrix(size, size);
+  
+  ublas_matrix(0,0) =  2.0; ublas_matrix(0,1) = -1.0;
+  ublas_matrix(1,0) = -1.0; ublas_matrix(1,1) =  2.0; ublas_matrix(1,2) = -1.0;
+  ublas_matrix(2,1) = -1.0; ublas_matrix(2,2) =  2.0; ublas_matrix(2,3) = -1.0;
+  ublas_matrix(3,2) = -1.0; ublas_matrix(3,3) =  2.0; ublas_matrix(3,4) = -1.0;
+  ublas_matrix(4,3) = -1.0; ublas_matrix(4,4) =  2.0;
+
+  //
+  // Set up some ViennaCL objects
+  //
+  viennacl::vector<ScalarType> vcl_rhs(size); 
+  viennacl::compressed_matrix<ScalarType> vcl_compressed_matrix(size, size);
+
+  viennacl::copy(ublas_matrix, vcl_compressed_matrix);
+  
+  std::cout << "ublas: " << ublas_matrix << std::endl;
+
+  std::cout << "Modifying vcl_compressed_matrix a bit: " << std::endl;
+  vcl_compressed_matrix(0, 0) = 3.0;
+  vcl_compressed_matrix(2, 3) = -3.0;
+  vcl_compressed_matrix(4, 2) = -3.0;  //this is a new nonzero entry
+  vcl_compressed_matrix(4, 3) = -3.0;
+  
+  ublas::compressed_matrix<ScalarType> temp(size, size);
+  viennacl::copy(vcl_compressed_matrix, temp);
+  std::cout << "ViennaCL: " << temp << std::endl;
+  
+  //
+  //  That's it.
+  //
+  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
+  
+  return 0;
+}
+
diff --git a/examples/tutorial/structured-matrices.cpp b/examples/tutorial/structured-matrices.cpp
index 112a7c4..a31d982 100644
--- a/examples/tutorial/structured-matrices.cpp
+++ b/examples/tutorial/structured-matrices.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/tutorial/vector-io.hpp b/examples/tutorial/vector-io.hpp
index 0b74393..e7b9940 100644
--- a/examples/tutorial/vector-io.hpp
+++ b/examples/tutorial/vector-io.hpp
@@ -2,7 +2,7 @@
 #define VECTOR_IO_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/tutorial/vector-range.cpp b/examples/tutorial/vector-range.cpp
index 8db6579..2214960 100644
--- a/examples/tutorial/vector-range.cpp
+++ b/examples/tutorial/vector-range.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/examples/tutorial/viennacl-info.cpp b/examples/tutorial/viennacl-info.cpp
index 7278ab7..dc01c32 100644
--- a/examples/tutorial/viennacl-info.cpp
+++ b/examples/tutorial/viennacl-info.cpp
@@ -1,5 +1,5 @@
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/tests/src/matrix.cpp b/tests/src/matrix.cpp
index b51c36c..6962959 100644
--- a/tests/src/matrix.cpp
+++ b/tests/src/matrix.cpp
@@ -112,7 +112,7 @@ int test(Epsilon const& epsilon)
    for (unsigned int i = 0; i < rhs.size(); ++i)
      rhs(i) = random<NumericT>();
    ublas::vector<NumericT> rhs2 = rhs;
-   ublas::vector<NumericT> result = ublas::scalar_vector<NumericT>(num_cols, 3.1415);
+   ublas::vector<NumericT> result = ublas::scalar_vector<NumericT>(num_cols, NumericT(3.1415));
    ublas::vector<NumericT> result2 = result;
    ublas::vector<NumericT> rhs_trans = rhs;
    rhs_trans.resize(result.size(), true);
@@ -461,7 +461,7 @@ int main()
    std::cout << std::endl;
    {
       typedef float NumericT;
-      NumericT epsilon = 1.0E-3;
+      NumericT epsilon = NumericT(1.0E-3);
       std::cout << "# Testing setup:" << std::endl;
       std::cout << "  eps:     " << epsilon << std::endl;
       std::cout << "  numeric: float" << std::endl;
@@ -477,7 +477,7 @@ int main()
    std::cout << std::endl;
    {
       typedef float NumericT;
-      NumericT epsilon = 1.0E-3;
+      NumericT epsilon = NumericT(1.0E-3);
       std::cout << "# Testing setup:" << std::endl;
       std::cout << "  eps:     " << epsilon << std::endl;
       std::cout << "  numeric: float" << std::endl;
diff --git a/tests/src/matrix_range.cpp b/tests/src/matrix_range.cpp
index 4c1e816..c0ff983 100644
--- a/tests/src/matrix_range.cpp
+++ b/tests/src/matrix_range.cpp
@@ -59,6 +59,9 @@ bool check_for_equality(MatrixType const & ublas_A, VCLMatrixType const & vcl_A)
       if (ublas_A(i,j) != vcl_A_cpu(i,j))
       {
         std::cout << "Error at index (" << i << ", " << j << "): " << ublas_A(i,j) << " vs " << vcl_A_cpu(i,j) << std::endl;
+        //std::cout << ublas_A << std::endl;
+        //std::cout << vcl_A_cpu << std::endl;
+        return false;
       }
     }
   }
@@ -73,31 +76,35 @@ int run_test()
     //typedef float               ScalarType;
     typedef boost::numeric::ublas::matrix<ScalarType>       MatrixType;
     
-    typedef viennacl::matrix<ScalarType, viennacl::row_major>    VCLMatrixType;
+    typedef viennacl::matrix<ScalarType, T>    VCLMatrixType;
+    
+    viennacl::scalar<ScalarType> gpu_pi = ScalarType(3.1415);
     
     std::size_t dim_large = 51;
     std::size_t dim_small = 37;
+    //std::size_t dim_large = 5;
+    //std::size_t dim_small = 3;
     
     //setup ublas objects:
     MatrixType ublas_A(dim_large, dim_large);
     for (std::size_t i=0; i<ublas_A.size1(); ++i)
       for (std::size_t j=0; j<ublas_A.size2(); ++j)
-        ublas_A(i,j) = (i+1) + (j+1)*(i+1);
+        ublas_A(i,j) = ScalarType((i+1) + (j+1)*(i+1));
 
     MatrixType ublas_B(dim_small, dim_small);
     for (std::size_t i=0; i<ublas_B.size1(); ++i)
       for (std::size_t j=0; j<ublas_B.size2(); ++j)
-        ublas_B(i,j) = (i+1) + (j+1)*(i+1);
+        ublas_B(i,j) = ScalarType((i+1) + (j+1)*(i+1));
 
     MatrixType ublas_C(dim_large, dim_small);
     for (std::size_t i=0; i<ublas_C.size1(); ++i)
       for (std::size_t j=0; j<ublas_C.size2(); ++j)
-        ublas_C(i,j) = (j+2) + (j+1)*(i+1);
+        ublas_C(i,j) = ScalarType((j+2) + (j+1)*(i+1));
 
     MatrixType ublas_D(dim_small, dim_large);
     for (std::size_t i=0; i<ublas_D.size1(); ++i)
       for (std::size_t j=0; j<ublas_D.size2(); ++j)
-        ublas_D(i,j) = (j+2) + (j+1)*(i+1);
+        ublas_D(i,j) = ScalarType((j+2) + (j+1)*(i+1));
       
     boost::numeric::ublas::range ublas_r1(0, dim_small);
     boost::numeric::ublas::range ublas_r2(dim_large - dim_small, dim_large);
@@ -218,14 +225,15 @@ int run_test()
     
     
     std::cout << "//" << std::endl;
-    std::cout << "////////// Test 2: Inplace add //////////" << std::endl;
+    std::cout << "////////// Test 3: Addition //////////" << std::endl;
     std::cout << "//" << std::endl;
     viennacl::copy(ublas_A_sub2, vcl_A_sub2);
     
+    std::cout << "Inplace add to submatrix: ";
     ublas_A_sub2 += ublas_A_sub2;
     vcl_A_sub2 += vcl_A_sub2;
 
-    if (check_for_equality(ublas_A_sub2, vcl_A_sub2))
+    if (check_for_equality(ublas_A, vcl_A))
       std::cout << "PASSED!" << std::endl;
     else
     {
@@ -237,6 +245,71 @@ int run_test()
     ublas_B += ublas_A_sub2;
     vcl_B += vcl_A_sub2;
 
+    if (check_for_equality(ublas_B, vcl_B))
+      std::cout << "PASSED!" << std::endl;
+    else
+    {
+      std::cout << std::endl << "TEST failed!";
+      return EXIT_FAILURE;
+    }
+    
+    std::cout << "Add to submatrix: ";
+    ublas_A_sub2 = ublas_A_sub2 + ublas_A_sub2;
+    vcl_A_sub2 = vcl_A_sub2 + vcl_A_sub2;
+
+    if (check_for_equality(ublas_A, vcl_A))
+      std::cout << "PASSED!" << std::endl;
+    else
+    {
+      std::cout << std::endl << "TEST failed!";
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Add to matrix: ";
+    ublas_B = ublas_A_sub2 + ublas_A_sub2;
+    vcl_B = vcl_A_sub2 + vcl_A_sub2;
+
+    if (check_for_equality(ublas_B, vcl_B))
+      std::cout << "PASSED!" << std::endl;
+    else
+    {
+      std::cout << std::endl << "TEST failed!";
+      return EXIT_FAILURE;
+    }
+    
+    std::cout << "//" << std::endl;
+    std::cout << "////////// Test 4: Subtraction //////////" << std::endl;
+    std::cout << "//" << std::endl;
+    viennacl::copy(ublas_A_sub2, vcl_A_sub2);
+    
+    std::cout << "Inplace subtract from submatrix: ";
+    ublas_A_sub2 -= ublas_A_sub2;
+    vcl_A_sub2 -= vcl_A_sub2;
+
+    if (check_for_equality(ublas_A, vcl_A))
+      std::cout << "PASSED!" << std::endl;
+    else
+    {
+      std::cout << std::endl << "TEST failed!";
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Inplace subtract from matrix: ";
+    ublas_B -= ublas_A_sub2;
+    vcl_B -= vcl_A_sub2;
+
+    if (check_for_equality(ublas_B, vcl_B))
+      std::cout << "PASSED!" << std::endl;
+    else
+    {
+      std::cout << std::endl << "TEST failed!";
+      return EXIT_FAILURE;
+    }
+    
+    std::cout << "Subtract from submatrix: ";
+    ublas_A_sub2 = ublas_A_sub2 - ublas_A_sub2;
+    vcl_A_sub2 = vcl_A_sub2 - vcl_A_sub2;
+
     if (check_for_equality(ublas_A, vcl_A))
       std::cout << "PASSED!" << std::endl;
     else
@@ -245,6 +318,78 @@ int run_test()
       return EXIT_FAILURE;
     }
 
+    std::cout << "Subtract from matrix: ";
+    ublas_B = ublas_A_sub2 - ublas_A_sub2;
+    vcl_B = vcl_A_sub2 - vcl_A_sub2;
+
+    if (check_for_equality(ublas_B, vcl_B))
+      std::cout << "PASSED!" << std::endl;
+    else
+    {
+      std::cout << std::endl << "TEST failed!";
+      return EXIT_FAILURE;
+    }
+    
+    std::cout << "//" << std::endl;
+    std::cout << "////////// Test 5: Scaling //////////" << std::endl;
+    std::cout << "//" << std::endl;
+    viennacl::copy(ublas_A, vcl_A);
+    
+    std::cout << "Multiplication with CPU scalar: ";
+    ublas_A_sub2 *= ScalarType(3.1415);
+    vcl_A_sub2 *= ScalarType(3.1415);
+
+    if (check_for_equality(ublas_A, vcl_A))
+      std::cout << "PASSED!" << std::endl;
+    else
+    {
+      std::cout << std::endl << "TEST failed!";
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Multiplication with GPU scalar: ";
+    ublas_A_sub2 *= gpu_pi;
+    vcl_A_sub2 *= gpu_pi;
+
+    if (check_for_equality(ublas_A, vcl_A))
+      std::cout << "PASSED!" << std::endl;
+    else
+    {
+      std::cout << std::endl << "TEST failed!";
+      return EXIT_FAILURE;
+    }
+    
+    
+    std::cout << "Division with CPU scalar: ";
+    ublas_A_sub2 /= ScalarType(3.1415);
+    vcl_A_sub2 /= ScalarType(3.1415);
+
+    if (check_for_equality(ublas_A, vcl_A))
+      std::cout << "PASSED!" << std::endl;
+    else
+    {
+      std::cout << std::endl << "TEST failed!";
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Division with GPU scalar: ";
+    ublas_A_sub2 /= gpu_pi;
+    vcl_A_sub2 /= gpu_pi;
+
+    if (check_for_equality(ublas_A, vcl_A))
+      std::cout << "PASSED!" << std::endl;
+    else
+    {
+      std::cout << std::endl << "TEST failed!";
+      return EXIT_FAILURE;
+    }
+    
+    
+
+    std::cout << "//" << std::endl;
+    std::cout << "////////// Test 6: Products //////////" << std::endl;
+    std::cout << "//" << std::endl;
+
     std::cout << "Inplace add of prod(): ";
     ublas_A_sub1 += prod(ublas_C_sub, ublas_D_sub);
     vcl_A_sub1 += viennacl::linalg::prod(vcl_C_sub, vcl_D_sub);
@@ -257,6 +402,53 @@ int run_test()
       return EXIT_FAILURE;
     }
 
+    std::cout << "Assigned C = A * B: ";
+    ublas_A_sub1 = prod(ublas_C_sub, ublas_D_sub);
+    vcl_A_sub1 = viennacl::linalg::prod(vcl_C_sub, vcl_D_sub);
+
+    if (check_for_equality(ublas_A, vcl_A))
+      std::cout << "PASSED!" << std::endl;
+    else
+    {
+      std::cout << std::endl << "TEST failed!";
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Assigned C = A^T * B: ";
+    ublas_A_sub1 = prod(trans(ublas_C_sub), ublas_D_sub);
+    vcl_A_sub1 = viennacl::linalg::prod(trans(vcl_C_sub), vcl_D_sub);
+
+    if (check_for_equality(ublas_A, vcl_A))
+      std::cout << "PASSED!" << std::endl;
+    else
+    {
+      std::cout << std::endl << "TEST failed!";
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Assigned C = A * B^T: ";
+    ublas_A_sub1 = prod(ublas_C_sub, trans(ublas_D_sub));
+    vcl_A_sub1 = viennacl::linalg::prod(vcl_C_sub, trans(vcl_D_sub));
+
+    if (check_for_equality(ublas_A, vcl_A))
+      std::cout << "PASSED!" << std::endl;
+    else
+    {
+      std::cout << std::endl << "TEST failed!";
+      return EXIT_FAILURE;
+    }
+
+    std::cout << "Assigned C = A^T * B^T: ";
+    ublas_A_sub1 = prod(trans(ublas_C_sub), trans(ublas_D_sub));
+    vcl_A_sub1 = viennacl::linalg::prod(trans(vcl_C_sub), trans(vcl_D_sub));
+
+    if (check_for_equality(ublas_A, vcl_A))
+      std::cout << "PASSED!" << std::endl;
+    else
+    {
+      std::cout << std::endl << "TEST failed!";
+      return EXIT_FAILURE;
+    }
 
     return EXIT_SUCCESS;
 }    
diff --git a/tests/src/scalar.cpp b/tests/src/scalar.cpp
index 5475d37..09f385b 100644
--- a/tests/src/scalar.cpp
+++ b/tests/src/scalar.cpp
@@ -44,8 +44,8 @@ int test(Epsilon const& epsilon)
 {
    int retval = EXIT_SUCCESS;
 
-   NumericT s1 = 3.1415926;
-   NumericT s2 = 2.71763;
+   NumericT s1 = NumericT(3.1415926);
+   NumericT s2 = NumericT(2.71763);
    int s3 = 42;
 
    viennacl::scalar<NumericT> vcl_s1;
@@ -185,7 +185,7 @@ int main()
    std::cout << std::endl;
    {
       typedef float NumericT;
-      NumericT epsilon = 1.0E-5;
+      NumericT epsilon = NumericT(1.0E-5);
       std::cout << "# Testing setup:" << std::endl;
       std::cout << "  eps:     " << epsilon << std::endl;
       std::cout << "  numeric: float" << std::endl;
@@ -200,7 +200,7 @@ int main()
    std::cout << std::endl;
    {
       typedef float NumericT;
-      NumericT epsilon = 1.0E-6;
+      NumericT epsilon = NumericT(1.0E-6);
       std::cout << "# Testing setup:" << std::endl;
       std::cout << "  eps:     " << epsilon << std::endl;
       std::cout << "  numeric: float" << std::endl;
@@ -215,7 +215,7 @@ int main()
    std::cout << std::endl;
    {
       typedef float NumericT;
-      NumericT epsilon = 1.0E-7;
+      NumericT epsilon = NumericT(1.0E-7);
       std::cout << "# Testing setup:" << std::endl;
       std::cout << "  eps:     " << epsilon << std::endl;
       std::cout << "  numeric: float" << std::endl;
diff --git a/tests/src/sparse.cpp b/tests/src/sparse.cpp
index 85298c5..ab60b43 100644
--- a/tests/src/sparse.cpp
+++ b/tests/src/sparse.cpp
@@ -288,8 +288,8 @@ int test(Epsilon const& epsilon)
    
    // --------------------------------------------------------------------------            
    // --------------------------------------------------------------------------            
-   NumericT alpha(2.786);
-   NumericT beta(1.432);
+   NumericT alpha = static_cast<NumericT>(2.786);
+   NumericT beta = static_cast<NumericT>(1.432);
    copy(rhs.begin(), rhs.end(), vcl_rhs.begin());
    copy(result.begin(), result.end(), vcl_result.begin());
    copy(result.begin(), result.end(), vcl_result2.begin());
@@ -339,7 +339,7 @@ int main()
    std::cout << std::endl;
    {
       typedef float NumericT;
-      NumericT epsilon = 5.0E-2;
+      NumericT epsilon = static_cast<NumericT>(5.0E-2);
       std::cout << "# Testing setup:" << std::endl;
       std::cout << "  eps:     " << epsilon << std::endl;
       std::cout << "  numeric: float" << std::endl;
diff --git a/tests/src/structured-matrices.cpp b/tests/src/structured-matrices.cpp
index 2b240b8..9227903 100644
--- a/tests/src/structured-matrices.cpp
+++ b/tests/src/structured-matrices.cpp
@@ -93,7 +93,7 @@ ScalarType diff(dense_matrix<ScalarType> const & m1, dense_matrix<ScalarType> co
     if ( (d1 == 0) && (d2 == 0) )
       return 0;
     
-    return sqrt(df / fmax(d1, d2));
+    return sqrt(df / std::max<ScalarType>(d1, d2));
 }
 
 
@@ -121,8 +121,8 @@ ScalarType diff_max(std::vector<ScalarType>& vec, std::vector<ScalarType>& ref)
   
   for (std::size_t i = 0; i < vec.size(); i++) 
   {
-    df = fmax(fabs(vec[i] - ref[i]), df);
-    mx = fmax(fabs(vec[i]), mx);
+    df = std::max<ScalarType>(fabs(vec[i] - ref[i]), df);
+    mx = std::max<ScalarType>(fabs(vec[i]), mx);
     
     if (mx > 0)
     {
@@ -425,7 +425,7 @@ int vandermonde_test(ScalarType epsilon)
     //
     // Per-Element access:
     //
-    vcl_vandermonde1(4) = 1.0001;
+    vcl_vandermonde1(4) = static_cast<ScalarType>(1.0001);
     
     for(std::size_t j = 0; j < m1.size2(); j++) 
     {
diff --git a/tests/src/vector.cpp b/tests/src/vector.cpp
index 091f361..5411028 100644
--- a/tests/src/vector.cpp
+++ b/tests/src/vector.cpp
@@ -238,7 +238,7 @@ int test(Epsilon const& epsilon, std::string rhsfile, std::string resultfile)
    viennacl::copy(rhs, vcl_rhs);
    
    std::cout << "Testing cpu_assignments..." << std::endl;
-   NumericT val = 1e-3;
+   NumericT val = static_cast<NumericT>(1e-3);
    for (size_t i=0; i < rhs.size(); ++i)
      rhs(i) = val;
 
@@ -640,7 +640,7 @@ int main()
    std::cout << std::endl;
    {
       typedef float NumericT;
-      NumericT epsilon = 1.0E-4;
+      NumericT epsilon = static_cast<NumericT>(1.0E-4);
       std::cout << "# Testing setup:" << std::endl;
       std::cout << "  eps:     " << epsilon << std::endl;
       std::cout << "  numeric: float" << std::endl;
diff --git a/tests/src/vector_range.cpp b/tests/src/vector_range.cpp
index 80c4eea..3a6ab05 100644
--- a/tests/src/vector_range.cpp
+++ b/tests/src/vector_range.cpp
@@ -25,7 +25,6 @@
 #include <cmath>
 #include <algorithm>
 #include <stdio.h>
-#include <sys/time.h>
 #include <time.h>
 //#include "../benchmarks/benchmark-utils.hpp"
 #include "viennacl/scalar.hpp"
@@ -78,11 +77,11 @@ int run_test()
     //setup ublas objects:
     VectorType ublas_v1(dim_large);
     for (std::size_t i=0; i<ublas_v1.size(); ++i)
-      ublas_v1(i) = i+1;
+      ublas_v1(i) = static_cast<ScalarType>(i+1);
 
     VectorType ublas_v2(dim_small);
     for (std::size_t i=0; i<ublas_v2.size(); ++i)
-      ublas_v2(i) = dim_large + i;
+      ublas_v2(i) = static_cast<ScalarType>(dim_large + i);
       
     boost::numeric::ublas::range ublas_r1(0, dim_small);
     boost::numeric::ublas::range ublas_r2(dim_small - 1, 2*dim_small - 1);
diff --git a/viennacl/circulant_matrix.hpp b/viennacl/circulant_matrix.hpp
index 9033170..c029ede 100644
--- a/viennacl/circulant_matrix.hpp
+++ b/viennacl/circulant_matrix.hpp
@@ -2,7 +2,7 @@
 #define _VIENNACL_CIRCULANT_MATRIX_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/compressed_matrix.hpp b/viennacl/compressed_matrix.hpp
index d74b0a3..d2b0cf8 100644
--- a/viennacl/compressed_matrix.hpp
+++ b/viennacl/compressed_matrix.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_COMPRESSED_MATRIX_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -31,6 +31,7 @@
 #include "viennacl/linalg/compressed_matrix_operations.hpp"
 
 #include "viennacl/tools/tools.hpp"
+#include "viennacl/tools/entry_proxy.hpp"
 
 namespace viennacl
 {
@@ -120,14 +121,14 @@ namespace viennacl
     //adapted for std::vector< std::map < > > argument:
     /** @brief Copies a sparse matrix in the std::vector< std::map < > > format to an OpenCL device.
     *
-    * @param cpu_matrix   A sparse matrix on the host.
+    * @param cpu_matrix   A sparse square matrix on the host using STL types
     * @param gpu_matrix   A compressed_matrix from ViennaCL
     */
     template <typename SCALARTYPE, unsigned int ALIGNMENT>
     void copy(const std::vector< std::map<unsigned int, SCALARTYPE> > & cpu_matrix,
                      compressed_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix )
     {
-      copy(tools::const_sparse_matrix_adapter<SCALARTYPE>(cpu_matrix), gpu_matrix);
+      copy(tools::const_sparse_matrix_adapter<SCALARTYPE>(cpu_matrix, cpu_matrix.size(), cpu_matrix.size()), gpu_matrix);
     }
     
     #ifdef VIENNACL_HAVE_EIGEN
@@ -141,7 +142,7 @@ namespace viennacl
         for (typename Eigen::SparseMatrix<SCALARTYPE, flags>::InnerIterator it(eigen_matrix, k); it; ++it)
           stl_matrix[it.row()][it.col()] = it.value();
         
-      copy(tools::const_sparse_matrix_adapter<SCALARTYPE>(stl_matrix), gpu_matrix);
+      copy(tools::const_sparse_matrix_adapter<SCALARTYPE>(stl_matrix, eigen_matrix.rows(), eigen_matrix.cols()), gpu_matrix);
     }
     #endif
     
@@ -178,7 +179,7 @@ namespace viennacl
         for (ic_type icursor(mtl::begin<mtl::tag::nz>(cursor)), icend(mtl::end<mtl::tag::nz>(cursor)); icursor != icend; ++icursor)
           stl_matrix[row(*icursor)][col(*icursor)] = value(*icursor);
       
-      copy(tools::const_sparse_matrix_adapter<SCALARTYPE>(stl_matrix), gpu_matrix);
+      copy(tools::const_sparse_matrix_adapter<SCALARTYPE>(stl_matrix, cpu_matrix.num_rows(), cpu_matrix.num_cols()), gpu_matrix);
     }
     #endif
     
@@ -216,11 +217,11 @@ namespace viennacl
         //std::cout << "GPU->CPU, nonzeros: " << gpu_matrix.nnz() << std::endl;
         
         cl_int err;
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle1(), CL_TRUE, 0, sizeof(cl_uint)*(gpu_matrix.size1() + 1), &(row_buffer[0]), 0, NULL, NULL);
+        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle1().get(), CL_TRUE, 0, sizeof(cl_uint)*(gpu_matrix.size1() + 1), &(row_buffer[0]), 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle2(), CL_TRUE, 0, sizeof(cl_uint)*gpu_matrix.nnz(), &(col_buffer[0]), 0, NULL, NULL);
+        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle2().get(), CL_TRUE, 0, sizeof(cl_uint)*gpu_matrix.nnz(), &(col_buffer[0]), 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle(), CL_TRUE, 0, sizeof(SCALARTYPE)*gpu_matrix.nnz(), &(elements[0]), 0, NULL, NULL);
+        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle().get(), CL_TRUE, 0, sizeof(SCALARTYPE)*gpu_matrix.nnz(), &(elements[0]), 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
         viennacl::ocl::get_queue().finish();
         
@@ -254,7 +255,7 @@ namespace viennacl
     void copy(const compressed_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix,
               std::vector< std::map<unsigned int, SCALARTYPE> > & cpu_matrix)
     {
-      tools::sparse_matrix_adapter<SCALARTYPE> temp(cpu_matrix);
+      tools::sparse_matrix_adapter<SCALARTYPE> temp(cpu_matrix, cpu_matrix.size(), cpu_matrix.size());
       copy(gpu_matrix, temp);
     }
     
@@ -276,13 +277,13 @@ namespace viennacl
         std::vector<SCALARTYPE> elements(gpu_matrix.nnz());
         
         cl_int err;
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle1(),
+        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle1().get(),
                                   CL_TRUE, 0, sizeof(cl_uint)*(gpu_matrix.size1() + 1), &(row_buffer[0]), 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle2(),
+        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle2().get(),
                                   CL_TRUE, 0, sizeof(cl_uint)*gpu_matrix.nnz(), &(col_buffer[0]), 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle(),
+        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle().get(),
                                   CL_TRUE, 0, sizeof(SCALARTYPE)*gpu_matrix.nnz(), &(elements[0]), 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
         viennacl::ocl::get_queue().finish();
@@ -324,13 +325,13 @@ namespace viennacl
         std::vector<SCALARTYPE> elements(gpu_matrix.nnz());
         
         cl_int err;
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle1(),
+        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle1().get(),
                                   CL_TRUE, 0, sizeof(cl_uint)*(gpu_matrix.size1() + 1), &(row_buffer[0]), 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle2(),
+        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle2().get(),
                                   CL_TRUE, 0, sizeof(cl_uint)*gpu_matrix.nnz(), &(col_buffer[0]), 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle(),
+        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle().get(),
                                   CL_TRUE, 0, sizeof(SCALARTYPE)*gpu_matrix.nnz(), &(elements[0]), 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
         viennacl::ocl::get_queue().finish();
@@ -444,9 +445,9 @@ namespace viennacl
           _elements = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE) * new_nonzeros);
           
           cl_int err;
-          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle(), _col_buffer_old, _col_buffer, 0, 0, sizeof(cl_uint)*_nonzeros, 0, NULL, NULL);
+          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), _col_buffer_old.get(), _col_buffer.get(), 0, 0, sizeof(cl_uint)*_nonzeros, 0, NULL, NULL);
           VIENNACL_ERR_CHECK(err);
-          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle(), _elements_old, _elements, 0, 0, sizeof(SCALARTYPE)*_nonzeros, 0, NULL, NULL);
+          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), _elements_old.get(), _elements.get(), 0, 0, sizeof(SCALARTYPE)*_nonzeros, 0, NULL, NULL);
           VIENNACL_ERR_CHECK(err);
 
           _nonzeros = new_nonzeros;
@@ -500,6 +501,84 @@ namespace viennacl
           _cols = new_size2;
         }
       }
+      
+      /** @brief Returns a reference to the (i,j)-th entry of the sparse matrix. If (i,j) does not exist (zero), it is inserted (slow!) */
+      entry_proxy<SCALARTYPE> operator()(std::size_t i, std::size_t j)
+      {
+        assert( (i < _rows) && (j < _cols) && "compressed_matrix access out of bounds!");
+        
+        std::size_t index = element_index(i, j);
+        
+        // check for element in sparsity pattern
+        if (index < _nonzeros)
+          return entry_proxy<SCALARTYPE>(index, _elements);
+
+        // Element not found. Copying required. Very slow, but direct entry manipulation is painful anyway...
+        std::vector< std::map<unsigned int, SCALARTYPE> > cpu_backup(_rows);
+        viennacl::copy(*this, cpu_backup);
+        cpu_backup[i][j] = 0.0;
+        viennacl::copy(cpu_backup, *this);
+        
+        index = element_index(i, j);
+        
+        assert(index < _nonzeros);
+        
+        return entry_proxy<SCALARTYPE>(index, _elements);        
+      }
+      /*void operator()(std::size_t i, std::size_t j, SCALARTYPE new_entry)
+      {
+        //read row indices
+        std::vector<cl_uint> row_indices(2);
+        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
+                                          _row_buffer.get(), //row handle
+                                          CL_TRUE, //blocking
+                                          sizeof(cl_uint)*i, //offset
+                                          sizeof(cl_uint)*2, //size
+                                          &(row_indices[0]), //destination
+                                          0, NULL, NULL);
+        VIENNACL_ERR_CHECK(err);
+
+        //get column indices for row i:
+        std::vector<cl_uint> col_indices(row_indices[1] - row_indices[0]);
+        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
+                                  _col_buffer.get(), //col handle
+                                  CL_TRUE, //blocking
+                                  sizeof(cl_uint)*row_indices[0], //offset
+                                  sizeof(cl_uint)*col_indices.size(), //size
+                                  &(col_indices[0]), //destination
+                                  0, NULL, NULL);
+        VIENNACL_ERR_CHECK(err);
+
+        //get entries for row i:
+        std::vector<SCALARTYPE> row_entries(row_indices[1] - row_indices[0]);
+        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
+                                  _elements.get(), //entry handle
+                                  CL_TRUE, //blocking
+                                  sizeof(SCALARTYPE)*row_indices[0], //offset
+                                  sizeof(SCALARTYPE)*row_entries.size(), //size
+                                  &(row_entries[0]), //destination
+                                  0, NULL, NULL);
+        VIENNACL_ERR_CHECK(err);
+        
+        
+        // update entries:
+        for (std::size_t k=0; k<col_indices.size(); ++k)
+        {
+          if (col_indices[k] == j)
+            row_entries[k] = new_entry;
+        }
+        
+        // write back:
+        err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
+                                   _elements.get(),
+                                   CL_TRUE,
+                                   sizeof(SCALARTYPE)*row_indices[0], //offset
+                                   sizeof(SCALARTYPE)*row_entries.size(), //size
+                                   &(row_entries[0]), //data ptr
+                                   0, NULL, NULL);
+        VIENNACL_ERR_CHECK(err);
+      }*/
+      
 
       /** @brief  Returns the number of rows */
       const std::size_t & size1() const { return _rows; }
@@ -516,6 +595,52 @@ namespace viennacl
       const viennacl::ocl::handle<cl_mem> & handle() const { return _elements; }
       
     private:
+      
+      std::size_t element_index(std::size_t i, std::size_t j)
+      {
+        //read row indices
+        std::vector<cl_uint> row_indices(2);
+        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
+                                          _row_buffer.get(), //row handle
+                                          CL_TRUE, //blocking
+                                          sizeof(cl_uint)*i, //offset
+                                          sizeof(cl_uint)*2, //size
+                                          &(row_indices[0]), //destination
+                                          0, NULL, NULL);
+        VIENNACL_ERR_CHECK(err);
+
+        //get column indices for row i:
+        std::vector<cl_uint> col_indices(row_indices[1] - row_indices[0]);
+        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
+                                  _col_buffer.get(), //col handle
+                                  CL_TRUE, //blocking
+                                  sizeof(cl_uint)*row_indices[0], //offset
+                                  sizeof(cl_uint)*col_indices.size(), //size
+                                  &(col_indices[0]), //destination
+                                  0, NULL, NULL);
+        VIENNACL_ERR_CHECK(err);
+
+        //get entries for row i:
+        std::vector<SCALARTYPE> row_entries(row_indices[1] - row_indices[0]);
+        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
+                                  _elements.get(), //entry handle
+                                  CL_TRUE, //blocking
+                                  sizeof(SCALARTYPE)*row_indices[0], //offset
+                                  sizeof(SCALARTYPE)*row_entries.size(), //size
+                                  &(row_entries[0]), //destination
+                                  0, NULL, NULL);
+        VIENNACL_ERR_CHECK(err);
+
+        for (std::size_t k=0; k<col_indices.size(); ++k)
+        {
+          if (col_indices[k] == j)
+            return row_indices[0] + k;
+        }
+        
+        // if not found, return index past the end of the matrix (cf. matrix.end() in the spirit of the STL)
+        return _nonzeros;
+      }
+      
       // /** @brief Copy constructor is by now not available. */
       //compressed_matrix(compressed_matrix const &);
       
diff --git a/viennacl/coordinate_matrix.hpp b/viennacl/coordinate_matrix.hpp
index 14f0b96..7878e9b 100644
--- a/viennacl/coordinate_matrix.hpp
+++ b/viennacl/coordinate_matrix.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_COORDINATE_MATRIX_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -112,14 +112,14 @@ namespace viennacl
 
     /** @brief Copies a sparse matrix in the std::vector< std::map < > > format to an OpenCL device.
     *
-    * @param cpu_matrix   A sparse matrix on the host.
+    * @param cpu_matrix   A sparse square matrix on the host.
     * @param gpu_matrix   A coordinate_matrix from ViennaCL
     */
     template <typename SCALARTYPE, unsigned int ALIGNMENT>
     void copy(const std::vector< std::map<unsigned int, SCALARTYPE> > & cpu_matrix,
                      coordinate_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix )
     {
-      copy(tools::const_sparse_matrix_adapter<SCALARTYPE>(cpu_matrix), gpu_matrix);
+      copy(tools::const_sparse_matrix_adapter<SCALARTYPE>(cpu_matrix, cpu_matrix.size(), cpu_matrix.size()), gpu_matrix);
     }
     
     //gpu to cpu:
@@ -147,9 +147,9 @@ namespace viennacl
         //std::cout << "GPU nonzeros: " << gpu_matrix.nnz() << std::endl;
         
         cl_int err;
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle12(), CL_TRUE, 0, sizeof(cl_uint)* 2 *gpu_matrix.nnz(), &(coord_buffer[0]), 0, NULL, NULL);
+        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle12().get(), CL_TRUE, 0, sizeof(cl_uint)* 2 *gpu_matrix.nnz(), &(coord_buffer[0]), 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle(), CL_TRUE, 0, sizeof(SCALARTYPE)*gpu_matrix.nnz(), &(elements[0]), 0, NULL, NULL);
+        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle().get(), CL_TRUE, 0, sizeof(SCALARTYPE)*gpu_matrix.nnz(), &(elements[0]), 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
         viennacl::ocl::get_queue().finish();
         
@@ -170,7 +170,7 @@ namespace viennacl
     void copy(const coordinate_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix,
               std::vector< std::map<unsigned int, SCALARTYPE> > & cpu_matrix)
     {
-      tools::sparse_matrix_adapter<SCALARTYPE> temp(cpu_matrix);
+      tools::sparse_matrix_adapter<SCALARTYPE> temp(cpu_matrix, gpu_matrix.size1(), gpu_matrix.size2());
       copy(gpu_matrix, temp);
     }
 
@@ -221,16 +221,16 @@ namespace viennacl
           elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE) * internal_nnz());
           
           cl_int err;
-          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle(), coord_buffer_old, coord_buffer_, 0, 0, sizeof(cl_uint) * 2 * nonzeros_, 0, NULL, NULL);
+          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), coord_buffer_old.get(), coord_buffer_.get(), 0, 0, sizeof(cl_uint) * 2 * nonzeros_, 0, NULL, NULL);
           VIENNACL_ERR_CHECK(err);
-          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle(), elements_old, elements_, 0, 0, sizeof(SCALARTYPE)*nonzeros_, 0, NULL, NULL);
+          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), elements_old.get(), elements_.get(), 0, 0, sizeof(SCALARTYPE)*nonzeros_, 0, NULL, NULL);
           VIENNACL_ERR_CHECK(err);
 
           //new memory must be padded with zeros:
           std::vector<long> temp(internal_nnz() - nonzeros_);
-          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle(), coord_buffer_old, coord_buffer_, 0, nonzeros_, sizeof(cl_uint) * 2 * temp.size(), 0, NULL, NULL);
+          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), coord_buffer_old.get(), coord_buffer_.get(), 0, nonzeros_, sizeof(cl_uint) * 2 * temp.size(), 0, NULL, NULL);
           VIENNACL_ERR_CHECK(err);
-          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle(), elements_old, elements_, 0, nonzeros_, sizeof(SCALARTYPE)*temp.size(), 0, NULL, NULL);
+          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), elements_old.get(), elements_.get(), 0, nonzeros_, sizeof(SCALARTYPE)*temp.size(), 0, NULL, NULL);
           VIENNACL_ERR_CHECK(err);
         }
       }
diff --git a/viennacl/fft.hpp b/viennacl/fft.hpp
index 9706431..d5efbff 100644
--- a/viennacl/fft.hpp
+++ b/viennacl/fft.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_FFT_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/forwards.h b/viennacl/forwards.h
index ee41201..ea57b1a 100644
--- a/viennacl/forwards.h
+++ b/viennacl/forwards.h
@@ -2,7 +2,7 @@
 #define VIENNACL_FORWARDS_H
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -23,7 +23,7 @@
 */
 
 /**
- @mainpage Source Code Documentation for ViennaCL 1.2.0
+ @mainpage Source Code Documentation for ViennaCL 1.2.1
 
  This is the source code documentation of ViennaCL. Detailed information about the functions in ViennaCL can be found here.
  
diff --git a/viennacl/hankel_matrix.hpp b/viennacl/hankel_matrix.hpp
index 8968010..9c2c8e3 100644
--- a/viennacl/hankel_matrix.hpp
+++ b/viennacl/hankel_matrix.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_HANKEL_MATRIX_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/io/kernel_parameters.hpp b/viennacl/io/kernel_parameters.hpp
index 6a81f25..21f4152 100644
--- a/viennacl/io/kernel_parameters.hpp
+++ b/viennacl/io/kernel_parameters.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_IO_KERNEL_PARAMETERS_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/io/matrix_market.hpp b/viennacl/io/matrix_market.hpp
index 8c4b6f4..823f553 100644
--- a/viennacl/io/matrix_market.hpp
+++ b/viennacl/io/matrix_market.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_IO_MATRIX_MARKET_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/linalg/amg.hpp b/viennacl/linalg/amg.hpp
index f055bed..6d82530 100755
--- a/viennacl/linalg/amg.hpp
+++ b/viennacl/linalg/amg.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_AMG_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/linalg/bicgstab.hpp b/viennacl/linalg/bicgstab.hpp
index 5817d34..82ca73a 100644
--- a/viennacl/linalg/bicgstab.hpp
+++ b/viennacl/linalg/bicgstab.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_BICGSTAB_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/linalg/cg.hpp b/viennacl/linalg/cg.hpp
index 11be052..c7b7b57 100644
--- a/viennacl/linalg/cg.hpp
+++ b/viennacl/linalg/cg.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_CG_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/linalg/circulant_matrix_operations.hpp b/viennacl/linalg/circulant_matrix_operations.hpp
index d16d040..49d42d1 100644
--- a/viennacl/linalg/circulant_matrix_operations.hpp
+++ b/viennacl/linalg/circulant_matrix_operations.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_CIRCULANT_MATRIX_OPERATIONS_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -124,7 +124,7 @@ namespace viennacl
                                                                                           viennacl::op_prod> & proxy) 
     {
       // check for the special case x = A * x
-      if (proxy.rhs().handle() == this->handle())
+      if (proxy.rhs().handle().get() == this->handle().get())
       {
         viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
         viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
diff --git a/viennacl/linalg/compressed_matrix_operations.hpp b/viennacl/linalg/compressed_matrix_operations.hpp
index ada8472..f31af4b 100644
--- a/viennacl/linalg/compressed_matrix_operations.hpp
+++ b/viennacl/linalg/compressed_matrix_operations.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_COMPRESSED_MATRIX_OPERATIONS_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -127,7 +127,7 @@ namespace viennacl
       unsigned int threads = k.local_work_size();
       
       k.global_work_size(k.local_work_size());
-      viennacl::ocl::enqueue(k(U.handle1(), U.handle2(), U.handle(),
+      viennacl::ocl::enqueue(k(U.handle1().get(), U.handle2().get(), U.handle().get(),
                                                               viennacl::ocl::local_mem(sizeof(int) * (threads+2)),
                                                               viennacl::ocl::local_mem(sizeof(SCALARTYPE) * (threads+2)),
                                                               vec, U.size1()));        
@@ -171,7 +171,7 @@ namespace viennacl
                                                                                           viennacl::op_prod> & proxy) 
     {
       // check for the special case x = A * x
-      if (proxy.rhs().handle() == this->handle())
+      if (proxy.rhs().handle().get() == this->handle().get())
       {
         viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
         viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
diff --git a/viennacl/linalg/coordinate_matrix_operations.hpp b/viennacl/linalg/coordinate_matrix_operations.hpp
index ff5f8ac..8b46b7f 100644
--- a/viennacl/linalg/coordinate_matrix_operations.hpp
+++ b/viennacl/linalg/coordinate_matrix_operations.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_COORDINATE_MATRIX_OPERATIONS_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -128,7 +128,7 @@ namespace viennacl
                                                                                           viennacl::op_prod> & proxy) 
     {
       // check for the special case x = A * x
-      if (proxy.rhs().handle() == this->handle())
+      if (proxy.rhs().handle().get() == this->handle().get())
       {
         viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
         viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
diff --git a/viennacl/linalg/detail/amg/amg_base.hpp b/viennacl/linalg/detail/amg/amg_base.hpp
index 9e58137..25cd084 100644
--- a/viennacl/linalg/detail/amg/amg_base.hpp
+++ b/viennacl/linalg/detail/amg/amg_base.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_DETAIL_AMG_AMG_BASE_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -419,9 +419,9 @@ namespace viennacl
               */
             amg_sparsematrix (unsigned int i, unsigned int j)
             {
-              AdapterType a (internal_mat);
+              AdapterType a (internal_mat, i, j);
               a.resize (i,j,false);
-              AdapterType a_trans (internal_mat_trans);
+              AdapterType a_trans (internal_mat_trans, j, i);
               a_trans.resize (j,i,false);
               s1 = i;
               s2 = j;
@@ -437,8 +437,8 @@ namespace viennacl
             */
             amg_sparsematrix (std::vector<std::map<unsigned int, ScalarType> > const & mat)
             {  
-              AdapterType a (internal_mat);
-              AdapterType a_trans (internal_mat_trans);
+              AdapterType a (internal_mat, mat.size(), mat.size());
+              AdapterType a_trans (internal_mat_trans, mat.size(), mat.size());
               a.resize(mat.size(), mat.size());
               a_trans.resize(mat.size(), mat.size());
               
@@ -456,8 +456,8 @@ namespace viennacl
             template <typename MatrixType>
             amg_sparsematrix (MatrixType const & mat)
             {  
-              AdapterType a (internal_mat);
-              AdapterType a_trans (internal_mat_trans);
+              AdapterType a (internal_mat, mat.size1(), mat.size2());
+              AdapterType a_trans (internal_mat_trans, mat.size2(), mat.size1());
               a.resize(mat.size1(), mat.size2());
               a_trans.resize (mat.size2(), mat.size1());
               s1 = mat.size1();
@@ -641,9 +641,9 @@ namespace viennacl
             
             void clear() 
             {
-              AdapterType a (internal_mat);
+              AdapterType a (internal_mat, s1, s2);
               a.clear();
-              AdapterType a_trans (internal_mat_trans);
+              AdapterType a_trans (internal_mat_trans, s2, s1);
               a_trans.clear();
               transposed = true;
             }
@@ -684,13 +684,13 @@ namespace viennacl
             {
               if (!trans && !transposed_mode)
               {
-                AdapterType a (internal_mat);
+                AdapterType a (internal_mat, s1, s2);
                 return a.begin1();
               }
               else
               {
                 do_trans();
-                AdapterType a_trans (internal_mat_trans);
+                AdapterType a_trans (internal_mat_trans, s2, s1);
                 return a_trans.begin1();
               }
             }
@@ -699,13 +699,13 @@ namespace viennacl
             {
               if (!trans && !transposed_mode)
               {
-                AdapterType a (internal_mat);
+                AdapterType a (internal_mat, s1, s2);
                 return a.end1();
               }
               else
               {
                 //do_trans();
-                AdapterType a_trans (internal_mat_trans);
+                AdapterType a_trans (internal_mat_trans, s2, s1);
                 return a_trans.end1();
               }
             }
@@ -714,13 +714,13 @@ namespace viennacl
             {
               if (!trans && !transposed_mode)
               {
-                AdapterType a (internal_mat);
+                AdapterType a (internal_mat, s1, s2);
                 return a.begin2();
               }
               else
               {
                 do_trans();
-                AdapterType a_trans (internal_mat_trans);
+                AdapterType a_trans (internal_mat_trans, s2, s1);
                 return a_trans.begin2();
               }
             }
@@ -729,13 +729,13 @@ namespace viennacl
             {
               if (!trans && !transposed_mode)
               {
-                AdapterType a (internal_mat);
+                AdapterType a (internal_mat, s1, s2);
                 return a.end2();
               }
               else
               {
                 //do_trans();
-                AdapterType a_trans (internal_mat_trans);
+                AdapterType a_trans (internal_mat_trans, s2, s1);
                 return a_trans.end2();
               }
             }
@@ -744,28 +744,28 @@ namespace viennacl
             {
               // Const_iterator of transposed can only be used if transposed matrix is already built and up to date.
               assert((!transposed_mode || (transposed_mode && transposed)) && "Error: Cannot build const_iterator when transposed has not been built yet!");
-                    ConstAdapterType a_const (internal_mat);
+                    ConstAdapterType a_const (internal_mat, s1, s2);
               return a_const.begin1();
             }
             
             const_iterator1 end1(bool trans = false) const
             {
               assert((!transposed_mode || (transposed_mode && transposed)) && "Error: Cannot build const_iterator when transposed has not been built yet!");
-              ConstAdapterType a_const (internal_mat);
+              ConstAdapterType a_const (internal_mat, trans ? s2 : s1, trans ? s1 : s2);
               return a_const.end1();
             }
             
             const_iterator2 begin2(bool trans = false) const
             {
               assert((!transposed_mode || (transposed_mode && transposed)) && "Error: Cannot build const_iterator when transposed has not been built yet!");
-              ConstAdapterType a_const (internal_mat);
+              ConstAdapterType a_const (internal_mat, trans ? s2 : s1, trans ? s1 : s2);
               return a_const.begin2();
             }
             
             const_iterator2 end2(bool trans = false) const
             {
               assert((!transposed_mode || (transposed_mode && transposed)) && "Error: Cannot build const_iterator when transposed has not been built yet!");
-              ConstAdapterType a_const (internal_mat);
+              ConstAdapterType a_const (internal_mat, trans ? s2 : s1, trans ? s1 : s2);
               return a_const.end2();
             }
             
diff --git a/viennacl/linalg/detail/amg/amg_coarse.hpp b/viennacl/linalg/detail/amg/amg_coarse.hpp
index 8b56dce..3a73534 100644
--- a/viennacl/linalg/detail/amg/amg_coarse.hpp
+++ b/viennacl/linalg/detail/amg/amg_coarse.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_DETAIL_AMG_AMG_COARSE_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/linalg/detail/amg/amg_debug.hpp b/viennacl/linalg/detail/amg/amg_debug.hpp
index fe22729..9fc3e26 100644
--- a/viennacl/linalg/detail/amg/amg_debug.hpp
+++ b/viennacl/linalg/detail/amg/amg_debug.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_DETAIL_AMG_AMG_DEBUG_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/linalg/detail/amg/amg_interpol.hpp b/viennacl/linalg/detail/amg/amg_interpol.hpp
index d397c15..bcd574b 100644
--- a/viennacl/linalg/detail/amg/amg_interpol.hpp
+++ b/viennacl/linalg/detail/amg/amg_interpol.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_DETAIL_AMG_AMG_INTERPOL_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/linalg/detail/spai/block_matrix.hpp b/viennacl/linalg/detail/spai/block_matrix.hpp
index 2e35ff8..ce56486 100644
--- a/viennacl/linalg/detail/spai/block_matrix.hpp
+++ b/viennacl/linalg/detail/spai/block_matrix.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_DETAIL_SPAI_BLOCK_MATRIX_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/linalg/detail/spai/block_vector.hpp b/viennacl/linalg/detail/spai/block_vector.hpp
index 6d78cea..2e85b05 100644
--- a/viennacl/linalg/detail/spai/block_vector.hpp
+++ b/viennacl/linalg/detail/spai/block_vector.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_DETAIL_SPAI_BLOCK_VECTOR_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/linalg/detail/spai/fspai.hpp b/viennacl/linalg/detail/spai/fspai.hpp
index e33b31b..85c3d29 100644
--- a/viennacl/linalg/detail/spai/fspai.hpp
+++ b/viennacl/linalg/detail/spai/fspai.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_DETAIL_SPAI_FSPAI_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/linalg/detail/spai/qr.hpp b/viennacl/linalg/detail/spai/qr.hpp
index bc80f71..bedd135 100644
--- a/viennacl/linalg/detail/spai/qr.hpp
+++ b/viennacl/linalg/detail/spai/qr.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_DETAIL_SPAI_QR_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/linalg/detail/spai/small_matrix.hpp b/viennacl/linalg/detail/spai/small_matrix.hpp
index fbb0ce4..20ff736 100644
--- a/viennacl/linalg/detail/spai/small_matrix.hpp
+++ b/viennacl/linalg/detail/spai/small_matrix.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_DETAIL_SPAI_SMALL_MATRIX_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/linalg/detail/spai/spai-dynamic.hpp b/viennacl/linalg/detail/spai/spai-dynamic.hpp
index 734ddc3..a5e8063 100644
--- a/viennacl/linalg/detail/spai/spai-dynamic.hpp
+++ b/viennacl/linalg/detail/spai/spai-dynamic.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/linalg/detail/spai/spai-static.hpp b/viennacl/linalg/detail/spai/spai-static.hpp
index f4a14ff..8680098 100644
--- a/viennacl/linalg/detail/spai/spai-static.hpp
+++ b/viennacl/linalg/detail/spai/spai-static.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_DETAIL_SPAI_SPAI_STATIC_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/linalg/detail/spai/spai.hpp b/viennacl/linalg/detail/spai/spai.hpp
index a40a9bd..7466aba 100644
--- a/viennacl/linalg/detail/spai/spai.hpp
+++ b/viennacl/linalg/detail/spai/spai.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -314,8 +314,8 @@ namespace viennacl
                                              //viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(ScalarType)*(local_r_n*local_c_n))),
                                              static_cast<unsigned int>(M_v.size())));
             //copy vector m_v back from GPU to CPU
-            cl_int vcl_err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(),
-                                                 m_v_vcl.handle(), CL_TRUE, 0, 
+            cl_int vcl_err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
+                                                 m_v_vcl.handle().get(), CL_TRUE, 0, 
                                                  sizeof(ScalarType)*(m_v.size()),
                                                  &(m_v[0]), 0, NULL, NULL);
             VIENNACL_ERR_CHECK(vcl_err);
diff --git a/viennacl/linalg/detail/spai/spai_tag.hpp b/viennacl/linalg/detail/spai/spai_tag.hpp
index 046aaac..be73102 100644
--- a/viennacl/linalg/detail/spai/spai_tag.hpp
+++ b/viennacl/linalg/detail/spai/spai_tag.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_DETAIL_SPAI_SPAI_TAG_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/linalg/detail/spai/sparse_vector.hpp b/viennacl/linalg/detail/spai/sparse_vector.hpp
index b422ae6..a832a58 100644
--- a/viennacl/linalg/detail/spai/sparse_vector.hpp
+++ b/viennacl/linalg/detail/spai/sparse_vector.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_DETAIL_SPAI_SPARSE_VECTOR_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/linalg/direct_solve.hpp b/viennacl/linalg/direct_solve.hpp
index bcbad1c..2edb387 100644
--- a/viennacl/linalg/direct_solve.hpp
+++ b/viennacl/linalg/direct_solve.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_DIRECT_SOLVE_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/linalg/gmres.hpp b/viennacl/linalg/gmres.hpp
index 670d5ae..ca3d704 100644
--- a/viennacl/linalg/gmres.hpp
+++ b/viennacl/linalg/gmres.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_GMRES_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/linalg/hankel_matrix_operations.hpp b/viennacl/linalg/hankel_matrix_operations.hpp
index d0112cd..1f9ca40 100644
--- a/viennacl/linalg/hankel_matrix_operations.hpp
+++ b/viennacl/linalg/hankel_matrix_operations.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_HANKEL_MATRIX_OPERATIONS_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -114,7 +114,7 @@ namespace viennacl
                                                                                           viennacl::op_prod> & proxy) 
     {
       // check for the special case x = A * x
-      if (proxy.rhs().handle() == this->handle())
+      if (proxy.rhs().handle().get() == this->handle().get())
       {
         viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
         viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
diff --git a/viennacl/linalg/ilu.hpp b/viennacl/linalg/ilu.hpp
index addcc0c..13681b7 100644
--- a/viennacl/linalg/ilu.hpp
+++ b/viennacl/linalg/ilu.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_ILU_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -315,14 +315,14 @@ namespace viennacl
         template <typename VectorType>
         void apply(VectorType & vec) const
         {
-          viennacl::tools::const_sparse_matrix_adapter<ScalarType> LU_const_adapter(LU);
+          viennacl::tools::const_sparse_matrix_adapter<ScalarType> LU_const_adapter(LU, LU.size(), LU.size());
           viennacl::linalg::ilu_lu_substitute(LU_const_adapter, vec);
         }
         
       private:
         void init(MatrixType const & mat)
         {
-          viennacl::tools::sparse_matrix_adapter<ScalarType>       LU_adapter(LU);
+          viennacl::tools::sparse_matrix_adapter<ScalarType>       LU_adapter(LU, LU.size(), LU.size());
           viennacl::linalg::precondition(mat, LU_adapter, _tag);
         }
         
@@ -353,7 +353,7 @@ namespace viennacl
         {
           copy(vec, temp_vec);
           //lu_substitute(LU, vec);
-          viennacl::tools::const_sparse_matrix_adapter<ScalarType> LU_const_adapter(LU);
+          viennacl::tools::const_sparse_matrix_adapter<ScalarType> LU_const_adapter(LU, LU.size(), LU.size());
           viennacl::linalg::ilu_lu_substitute(LU_const_adapter, temp_vec);
           
           copy(temp_vec, vec);
@@ -368,8 +368,8 @@ namespace viennacl
           //copy to cpu:
           copy(mat, temp);
           
-          viennacl::tools::const_sparse_matrix_adapter<ScalarType>       temp_adapter(temp);
-          viennacl::tools::sparse_matrix_adapter<ScalarType>       LU_adapter(LU);
+          viennacl::tools::const_sparse_matrix_adapter<ScalarType>       temp_adapter(temp, temp.size(), temp.size());
+          viennacl::tools::sparse_matrix_adapter<ScalarType>       LU_adapter(LU, LU.size(), LU.size());
           viennacl::linalg::precondition(temp_adapter, LU_adapter, _tag);
           
           temp_vec.resize(mat.size1());
diff --git a/viennacl/linalg/inner_prod.hpp b/viennacl/linalg/inner_prod.hpp
index 36018c2..69b374d 100644
--- a/viennacl/linalg/inner_prod.hpp
+++ b/viennacl/linalg/inner_prod.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_INNER_PROD_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/linalg/jacobi_precond.hpp b/viennacl/linalg/jacobi_precond.hpp
index b50408b..4ceac0a 100644
--- a/viennacl/linalg/jacobi_precond.hpp
+++ b/viennacl/linalg/jacobi_precond.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_JACOBI_PRECOND_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/linalg/matrix_operations.hpp b/viennacl/linalg/matrix_operations.hpp
index a15e7ab..0b2f7bc 100644
--- a/viennacl/linalg/matrix_operations.hpp
+++ b/viennacl/linalg/matrix_operations.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_MATRIX_OPERATIONS_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -28,7 +28,9 @@
 #include "viennacl/scalar.hpp"
 #include "viennacl/vector.hpp"
 #include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
 #include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/result_of.hpp"
 #include "viennacl/traits/size.hpp"
 #include "viennacl/traits/start.hpp"
 #include "viennacl/traits/handle.hpp"
@@ -52,8 +54,88 @@ namespace viennacl
 {
   namespace linalg
   {
+    //
+    ///////////////////////////////////// addition and subtraction///////////////////////////////////////////////
+    //
     
-    /** @brief Adds two dense matrices and writes the result to a third matrix
+    namespace detail
+    {
+      template<class T1, class T2, class T3>
+      typename viennacl::enable_if<   viennacl::is_matrix<T1>::value 
+                                   && viennacl::is_matrix<T2>::value 
+                                   && viennacl::is_matrix<T3>::value >::type
+      add_sub_impl(const T1 & mat1, 
+                   const T2 & mat2,
+                         T3 & result,
+                   std::string kernel_name
+                  )
+      {
+        assert(result.size1() == mat1.size1());
+        assert(result.size2() == mat1.size2());
+        assert(result.size1() == mat2.size1());
+        assert(result.size2() == mat2.size2());
+
+        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< T1 >::ResultType    KernelClass;
+        
+        std::size_t block_size = 16;
+        
+        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), kernel_name);
+        k.global_work_size(0, block_size*block_size);
+        k.global_work_size(1, block_size*block_size);
+        k.local_work_size(0, block_size);
+        k.local_work_size(1, block_size);
+        viennacl::ocl::enqueue(k(viennacl::traits::handle(mat1), 
+                                        cl_uint(viennacl::traits::start1(mat1)),           cl_uint(viennacl::traits::start2(mat1)), 
+                                        cl_uint(viennacl::traits::size1(mat1)),            cl_uint(viennacl::traits::size2(mat1)),
+                                        cl_uint(viennacl::traits::internal_size1(mat1)),   cl_uint(viennacl::traits::internal_size2(mat1)),
+                                viennacl::traits::handle(mat2), 
+                                        cl_uint(viennacl::traits::start1(mat2)),           cl_uint(viennacl::traits::start2(mat2)), 
+                                        cl_uint(viennacl::traits::size1(mat2)),            cl_uint(viennacl::traits::size2(mat2)),
+                                        cl_uint(viennacl::traits::internal_size1(mat2)),   cl_uint(viennacl::traits::internal_size2(mat2)),
+                                viennacl::traits::handle(result), 
+                                        cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)), 
+                                        cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
+                                        cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result))
+                                )
+                              );        
+      }
+      
+
+
+      template <typename T1, typename T2>
+      typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
+                                    && viennacl::is_matrix<T2>::value
+                                  >::type
+      inplace_add_sub_impl(T1 & result, T2 const & mat2, std::string kernel_name)
+      {
+        assert(viennacl::traits::size1(result) == viennacl::traits::size1(mat2));
+        assert(viennacl::traits::size2(result) == viennacl::traits::size2(mat2));
+
+        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< T1 >::ResultType    KernelClass;
+        
+        std::size_t block_size = 16;
+        
+        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), kernel_name);
+        k.global_work_size(0, block_size*block_size);
+        k.global_work_size(1, block_size*block_size);
+        k.local_work_size(0, block_size);
+        k.local_work_size(1, block_size);
+        
+        viennacl::ocl::enqueue(k(viennacl::traits::handle(result),
+                                        cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)), 
+                                        cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
+                                        cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result)),
+                                viennacl::traits::handle(mat2), 
+                                        cl_uint(viennacl::traits::start1(mat2)),            cl_uint(viennacl::traits::start2(mat2)), 
+                                        cl_uint(viennacl::traits::size1(mat2)),             cl_uint(viennacl::traits::size2(mat2)),
+                                        cl_uint(viennacl::traits::internal_size1(mat2)),    cl_uint(viennacl::traits::internal_size2(mat2))
+                                )
+                              );
+      }
+      
+    }
+    
+    /** @brief Adds two dense matrices or submatrices and writes the result to a third matrix or submatrix
     *
     * This is the implementation of the convenience expression result = mat1 + mat2;
     *
@@ -61,123 +143,36 @@ namespace viennacl
     * @param mat2   The right hand side operand
     * @param result The resulting matrix
     */
-    template<class TYPE, typename F, unsigned int ALIGNMENT>
-    void add(const viennacl::matrix<TYPE, F, ALIGNMENT> & mat1, 
-             const viennacl::matrix<TYPE, F, ALIGNMENT> & mat2,
-             viennacl::matrix<TYPE, F, ALIGNMENT> & result)
+    template<class T1, class T2, class T3>
+    typename viennacl::enable_if<   viennacl::is_matrix<T1>::value 
+                                 && viennacl::is_matrix<T2>::value 
+                                 && viennacl::is_matrix<T3>::value >::type
+    add(const T1 & mat1, 
+        const T2 & mat2,
+              T3 & result)
     {
-      assert(result.size1() == mat1.size1());
-      assert(result.size2() == mat1.size2());
-      assert(result.size1() == mat2.size1());
-      assert(result.size2() == mat2.size2());
-
-      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<TYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "add");
-      assert( (mat1.internal_size() == mat2.internal_size())
-             && "Operands must have same dimension and memory layout in this version of ViennaCL!");
-      cl_uint size = std::min(mat1.internal_size(), mat2.internal_size());
-
-      viennacl::ocl::enqueue(k(mat1, mat2, result, size));        
+      detail::add_sub_impl(mat1, mat2, result, "add");
     }
 
-    /** @brief Adds a dense matrix to another
+    /** @brief Adds a dense matrix or submatrix to another
     *
     * This is the implementation of the convenience expression result += mat1;
     *
     * @param mat2   The addend (either a matrix or a matrix_range)
     * @param result The resulting matrix  (either a matrix or a matrix_range)
     */
-    template <typename M1, typename M2>
-    typename viennacl::enable_if< viennacl::is_matrix<M1>::value
-                                  && viennacl::is_matrix<M2>::value
+    template <typename T1, typename T2>
+    typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
+                                  && viennacl::is_matrix<T2>::value
                                 >::type
-    inplace_add(M1 & result, M2 const & mat2)
+    inplace_add(T1 & result, T2 const & mat2)
     {
-      assert(viennacl::traits::size1(result) == viennacl::traits::size1(mat2));
-      assert(viennacl::traits::size2(result) == viennacl::traits::size2(mat2));
-
-      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< M1 >::ResultType    KernelClass;
-      
-      size_t block_size = 15;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "inplace_add");
-      k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(viennacl::traits::size1(result), block_size));
-      k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(viennacl::traits::size2(result), block_size));
-      k.local_work_size(0, block_size);
-      k.local_work_size(1, block_size);
-      
-      viennacl::ocl::enqueue(k(viennacl::traits::handle(result),
-                                       cl_uint(viennacl::traits::start1(result)), cl_uint(viennacl::traits::start2(result)), 
-                                       cl_uint(viennacl::traits::size1(result)), cl_uint(viennacl::traits::size2(result)),
-                                       cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result)),
-                                viennacl::traits::handle(mat2), 
-                                      cl_uint(viennacl::traits::start1(mat2)), cl_uint(viennacl::traits::start2(mat2)), 
-                                      cl_uint(viennacl::traits::size1(mat2)), cl_uint(viennacl::traits::size2(mat2)),
-                                      cl_uint(viennacl::traits::internal_size1(mat2)), cl_uint(viennacl::traits::internal_size2(mat2))
-                              )
-                            );
+      detail::inplace_add_sub_impl(result, mat2, "inplace_add");
     }
 
-    /** @brief Adds a dense matrix to another
-    *
-    * This is the implementation of the convenience expression result += mat1;
-    *
-    * @param mat1   The left hand side operand
-    * @param mat2   The right hand side operand
-    * @param result The resulting matrix
-    */
-    /*
-    template <typename MatrixType>
-    void inplace_add(viennacl::matrix_range<MatrixType> & result, 
-                     const viennacl::matrix_range<MatrixType> & mat2)
-    {
-      assert(result.size1() == mat2.size1());
-      assert(result.size2() == mat2.size2());
-
-      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< MatrixType >::ResultType    KernelClass;
-      
-      size_t block_size = 15;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "inplace_add");
-      k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(result.size1(), block_size));
-      k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(result.size2(), block_size));
-      k.local_work_size(0, block_size);
-      k.local_work_size(1, block_size);
-
-      viennacl::ocl::enqueue(k(result.get(), cl_uint(result.start1()), cl_uint(result.start2()), 
-                                             cl_uint(result.size1()), cl_uint(result.size2()),
-                                             cl_uint(result.get().internal_size1()), cl_uint(result.get().internal_size2()),
-                                mat2.get(), cl_uint(mat2.start1()), cl_uint(mat2.start2()),
-                                            cl_uint(mat2.size1()), cl_uint(mat2.size2()),
-                                            cl_uint(mat2.get().internal_size1()), cl_uint(mat2.get().internal_size2())
-                              )
-                            );
-    } */
-
-    /** @brief Adds a dense matrix to another
-    *
-    * This is the implementation of the convenience expression result += mat1;
-    *
-    * @param mat1   The left hand side operand
-    * @param mat2   The right hand side operand
-    * @param result The resulting matrix
-    */
-    /*
-    template<class TYPE, typename F, unsigned int ALIGNMENT>
-    void inplace_add(viennacl::matrix<TYPE, F, ALIGNMENT> & result, 
-                     const viennacl::matrix_range<viennacl::matrix<TYPE, F, ALIGNMENT> > & mat2)
-    {
-      viennacl::range r1(0, result.size1());
-      viennacl::range r2(0, result.size2());
-      viennacl::matrix_range<viennacl::matrix<TYPE, F, ALIGNMENT> > result_wrap(result, r1, r2);
-      inplace_add(result_wrap, mat2);
-    } */
 
 
-
-
-    /** @brief Subtracts two dense matrices and writes the result to a third matrix
+    /** @brief Subtracts two dense matrices or submatrices and writes the result to a third matrix or submatrix
     *
     * This is the implementation of the convenience expression result = mat1 - mat2;
     *
@@ -185,106 +180,125 @@ namespace viennacl
     * @param mat2   The right hand side operand
     * @param result The resulting matrix
     */
-    template<class TYPE, typename F, unsigned int ALIGNMENT>
-    void sub(const viennacl::matrix<TYPE, F, ALIGNMENT> & mat1, 
-             const viennacl::matrix<TYPE, F, ALIGNMENT> & mat2,
-             viennacl::matrix<TYPE, F, ALIGNMENT> & result)
+    template<class T1, class T2, class T3>
+    typename viennacl::enable_if<   viennacl::is_matrix<T1>::value 
+                                 && viennacl::is_matrix<T2>::value 
+                                 && viennacl::is_matrix<T3>::value >::type
+    sub(const T1 & mat1, 
+        const T2 & mat2,
+              T3 & result)
     {
-      assert(result.size1() == mat1.size1());
-      assert(result.size2() == mat1.size2());
-      assert(result.size1() == mat2.size1());
-      assert(result.size2() == mat2.size2());
-
-      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<TYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "sub");
-      assert( (mat1.internal_size() == mat2.internal_size())
-             && "Operands must have same dimension and memory layout in this version of ViennaCL!");
-      cl_uint size = std::min(mat1.internal_size(), mat2.internal_size());
-
-      viennacl::ocl::enqueue(k(mat1, mat2, result, size));        
+      detail::add_sub_impl(mat1, mat2, result, "sub");
     }
 
-    /** @brief Subtracts a dense matrix from another
+    /** @brief Subtracts a dense matrix or submatrix from another
     *
-    * This is the implementation of the convenience expression mat1 -= mat2;
+    * This is the implementation of the convenience expression result -= mat2;
     *
-    * @param mat2   The matrix to be subtracted
-    * @param result The resulting matrix
+    * @param mat2   The subtrahend (either a matrix or a matrix_range)
+    * @param result The resulting matrix  (either a matrix or a matrix_range)
     */
-    template<class TYPE, typename F, unsigned int ALIGNMENT>
-    void inplace_sub(viennacl::matrix<TYPE, F, ALIGNMENT> & result, 
-                     const viennacl::matrix<TYPE, F, ALIGNMENT> & mat2)
+    template <typename T1, typename T2>
+    typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
+                                  && viennacl::is_matrix<T2>::value
+                                >::type
+    inplace_sub(T1 & result, T2 const & mat2)
     {
-      assert(result.size1() == mat2.size1());
-      assert(result.size2() == mat2.size2());
+      detail::inplace_add_sub_impl(result, mat2, "inplace_sub");
+    }
 
-      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<TYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "inplace_sub");
-      assert( (result.internal_size() == mat2.internal_size())
-             && "Operands must have same dimension and memory layout in this version of ViennaCL!");
-      cl_uint size = std::min(result.internal_size(), mat2.internal_size());
 
-      viennacl::ocl::enqueue(k(result, mat2, size));        
+
+
+    //
+    /////////////////////////   inplace multiplication and division /////////////////////////////////
+    //
+
+    namespace detail
+    {
+      template <typename  T1, typename ScalarType>
+      typename viennacl::enable_if< viennacl::is_matrix<T1>::value >::type
+      inplace_mult_div_impl(T1 & result, 
+                            ScalarType val,
+                            std::string kernel_name)
+      {
+        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< T1 >::ResultType    KernelClass;
+        
+        std::size_t block_size = 16;
+          
+        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), kernel_name);
+        
+        k.global_work_size(0, block_size*block_size);
+        k.global_work_size(1, block_size*block_size);
+        k.local_work_size(0, block_size);
+        k.local_work_size(1, block_size);
+        
+        viennacl::ocl::enqueue(k(viennacl::traits::handle(result),
+                                        cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)), 
+                                        cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
+                                        cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result)),
+                                val)
+                              );
+      }
     }
 
-    /** @brief Multiplies a dense matrix by a scalar
+
+    /** @brief Multiplies a dense matrix or submatrix by a scalar
     *
     * This is the implementation of the convenience expression matrix *= val;
     *
     * @param result The matrix to be manipulated
     * @param val    The CPU scalar by which all entries of the matrix are multiplied
     */
-    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    void inplace_mult(viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & result, 
-                      SCALARTYPE val)
+    template <typename  T1>
+    typename viennacl::enable_if< viennacl::is_matrix<T1>::value >::type
+    inplace_mult(T1 & result, 
+                 typename viennacl::result_of::cpu_value_type< typename T1::value_type >::type val)
     {
-      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "cpu_inplace_mult");
-      viennacl::ocl::enqueue(k(result, val, cl_uint(result.internal_size())));
+      detail::inplace_mult_div_impl(result, val, "cpu_inplace_mult");
     }
 
 
-    /** @brief Multiplies a dense matrix by a scalar
+    /** @brief Multiplies a dense matrix or submatrix by a scalar
     *
     * This is the implementation of the convenience expression matrix *= val;
     *
     * @param result The matrix to be manipulated
     * @param val    The scalar by which all entries of the matrix are multiplied
     */
-    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    void inplace_mult(viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & result, 
-                      viennacl::scalar<SCALARTYPE> const & val)
+    template <typename  T1>
+    typename viennacl::enable_if< viennacl::is_matrix<T1>::value >::type
+    inplace_mult(T1 & result, 
+                 typename T1::value_type val)
     {
-      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "inplace_mult");
-      viennacl::ocl::enqueue(k(result, val, cl_uint(result.internal_size())));
+      detail::inplace_mult_div_impl(result, val, "inplace_mult");
     }
 
 
 
-    /** @brief Multiplies a dense matrix by a scalar
+    /** @brief Divides a dense matrix or submatrix by a scalar
     *
     * This is the implementation of the convenience expression matrix /= val;
     *
     * @param result The matrix to be manipulated
     * @param val    The scalar by which all entries of the matrix are divided
     */
-    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT>
-    void inplace_divide(viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & result, 
-                        viennacl::scalar<SCALARTYPE> const & val)
+    template <typename  T1>
+    typename viennacl::enable_if< viennacl::is_matrix<T1>::value >::type
+    inplace_divide(T1 & result, 
+                   typename T1::value_type val)
     {
-      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "inplace_divide");
-      unsigned int size = result.internal_size();
-
-      viennacl::ocl::enqueue(k(result, val, size));
+      detail::inplace_mult_div_impl(result, val, "inplace_divide");
     }
 
+
+
+    //
+    /////////////////////////   matrix-vector products /////////////////////////////////
+    //
+
+
+
     // A * x
     /** @brief Returns a proxy class that represents matrix-vector multiplication
     *
@@ -319,7 +333,7 @@ namespace viennacl
     {
       assert(mat.size2() == vec.size());
       // Inplace matrix-vector products like x = prod(A, x) are currently illegal: Introduce a temporary like y = prod(A, x); x = y; instead
-      assert(vec.handle() != result.handle() && "No direct inplace matrix-vector product possible. Introduce a temporary!");
+      assert(vec.handle().get() != result.handle().get() && "No direct inplace matrix-vector product possible. Introduce a temporary!");
       result.resize(mat.size1());
 
       typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<TYPE, F, ALIGNMENT> >::ResultType    KernelClass;
@@ -384,7 +398,7 @@ namespace viennacl
     {
       assert(mat.size1() == vec.size());  //remember: mat is transposed!
       // Inplace matrix-vector products like x = prod(A, x) are currently illegal: Introduce a temporary like y = prod(A, x); x = y; instead
-      assert(vec.handle() != result.handle() && "No direct inplace matrix-vector product possible. Introduce a temporary!");
+      assert(vec.handle().get() != result.handle().get() && "No direct inplace matrix-vector product possible. Introduce a temporary!");
       result.resize(mat.size2());
 
       typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
@@ -397,209 +411,111 @@ namespace viennacl
 
 
 
-    /** @brief Carries out matrix-matrix multiplication
-    *
-    * Implementation of C = prod(A, B);
-    *
-    */
-    template<class TYPE, typename F1, typename F2, typename F3, unsigned int ALIGNMENT>
-    void prod_impl(const viennacl::matrix<TYPE, F1, ALIGNMENT> & A, 
-                    const viennacl::matrix<TYPE, F2, ALIGNMENT> & B, 
-                          viennacl::matrix<TYPE, F3, ALIGNMENT> & C, 
-                          int block_size = 15) // [JW] added ability to set block size from outside ..
-    {
-      assert(A.size1() == C.size1());
-      assert(A.size2() == B.size1());
-      assert(B.size2() == C.size2());
-      // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
-      assert(C.handle() != A.handle() 
-             && C.handle() != B.handle()
-             && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
-      
-      typedef typename viennacl::tools::MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<TYPE, F1, ALIGNMENT>,
-                                                                          viennacl::matrix<TYPE, F2, ALIGNMENT>,
-                                                                          viennacl::matrix<TYPE, F3, ALIGNMENT> >::ResultType    KernelClass;
-      KernelClass::init();
-      
-      //std::cout << "KernelClass::program_name() : " << KernelClass::program_name() << std::endl;
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "prod_AA");
-      
-      /*k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size1() / 2, block_size / 2));
-      k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size2() / 2, block_size / 2));
-      k.local_work_size(0, block_size / 2);
-      k.local_work_size(1, block_size / 2);*/
-      
-      k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size1(), block_size));
-      k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size2(), block_size));
-      k.local_work_size(0, block_size);
-      k.local_work_size(1, block_size);
-      
-      viennacl::ocl::enqueue(
-                             k(A, cl_uint(0), cl_uint(0), 
-                                  cl_uint(A.size1()), cl_uint(A.size2()),
-                                  cl_uint(A.internal_size1()), cl_uint(A.internal_size2()),
-                               B, cl_uint(0), cl_uint(0),
-                                  cl_uint(B.size1()), cl_uint(B.size2()),
-                                  cl_uint(B.internal_size1()), cl_uint(B.internal_size2()),
-                               C, cl_uint(0), cl_uint(0), 
-                                  cl_uint(C.size1()), cl_uint(C.size2()),
-                                  cl_uint(C.internal_size1()), cl_uint(C.internal_size2()),
-                               viennacl::ocl::local_mem(sizeof(TYPE) * block_size * block_size),
-                               viennacl::ocl::local_mem(sizeof(TYPE) * block_size * block_size) ));        
-    }
 
 
-    /** @brief Carries out matrix-matrix multiplication for submatrices
-    *
-    * Implementation of C = prod(A, B); for submatrices
-    *
-    */
-    template<typename T1, typename T2, typename T3>
-    void prod_impl(const viennacl::matrix_range<T1> & A, 
-                    const viennacl::matrix_range<T2> & B, 
-                          viennacl::matrix_range<T3> & C, 
-                          int block_size = 15) // [JW] added ability to set block size from outside ..
-    {
-      typedef typename T1::value_type::value_type   value_type;
-      
-      assert(A.size1() == C.size1());
-      assert(A.size2() == B.size1());
-      assert(B.size2() == C.size2());
-      // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
-      assert(C.get().handle() != A.get().handle() 
-             && C.get().handle() != B.get().handle()
-             && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
-      
-      typedef typename viennacl::tools::MATRIX_PROD_KERNEL_CLASS_DEDUCER< T1, T2, T3 >::ResultType    KernelClass;
-      KernelClass::init();
-      
-      //std::cout << "KernelClass::program_name() : " << KernelClass::program_name() << std::endl;
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "prod_AA");
-      
-      /*k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size1() / 2, block_size / 2));
-      k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size2() / 2, block_size / 2));
-      k.local_work_size(0, block_size / 2);
-      k.local_work_size(1, block_size / 2);*/
-      
-      k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size1(), block_size));
-      k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size2(), block_size));
-      k.local_work_size(0, block_size);
-      k.local_work_size(1, block_size);
-      
-      viennacl::ocl::enqueue(
-          k(A.get(), cl_uint(A.start1()), cl_uint(A.start2()),
-                     cl_uint(A.size1()), cl_uint(A.size2()),
-                     cl_uint(A.get().internal_size1()), cl_uint(A.get().internal_size2()),
-            B.get(), cl_uint(B.start1()), cl_uint(B.start2()),
-                     cl_uint(B.size1()), cl_uint(B.size2()),
-                     cl_uint(B.get().internal_size1()), cl_uint(B.get().internal_size2()),
-            C.get(), cl_uint(C.start1()), cl_uint(C.start2()),
-                     cl_uint(C.size1()), cl_uint(C.size2()),
-                     cl_uint(C.get().internal_size1()), cl_uint(C.get().internal_size2()),
-            viennacl::ocl::local_mem(sizeof(value_type) * block_size * block_size),
-            viennacl::ocl::local_mem(sizeof(value_type) * block_size * block_size) ));        
+    //
+    /////////////////////////   matrix-matrix products /////////////////////////////////
+    //
+    
+    namespace detail
+    {
+      // C = A * B and possibly transposed variants
+      template <typename T1, typename T2, typename T3 >
+      void prod(const T1 & A, 
+                const T2 & B, 
+                T3 & C,
+                std::string kernel_name,
+                int block_size = 16) // [JW] added ability to set block size from outside ..
+      {
+        typedef typename viennacl::result_of::cpu_value_type< typename T1::value_type >::type   cpu_value_type;
+        
+        typedef typename viennacl::tools::MATRIX_PROD_KERNEL_CLASS_DEDUCER< T1, T2, T3 >::ResultType    KernelClass;
+        KernelClass::init();
+        
+        //std::cout << "KernelClass::program_name() : " << KernelClass::program_name() << std::endl;
+        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), kernel_name);
+        
+        k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(viennacl::traits::size1(C), block_size));
+        k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(viennacl::traits::size2(C), block_size));
+        k.local_work_size(0, block_size);
+        k.local_work_size(1, block_size);
+        
+        viennacl::ocl::enqueue(k(viennacl::traits::handle(A), 
+                                        cl_uint(viennacl::traits::start1(A)),           cl_uint(viennacl::traits::start2(A)), 
+                                        cl_uint(viennacl::traits::size1(A)),            cl_uint(viennacl::traits::size2(A)),
+                                        cl_uint(viennacl::traits::internal_size1(A)),   cl_uint(viennacl::traits::internal_size2(A)),
+                                 viennacl::traits::handle(B), 
+                                        cl_uint(viennacl::traits::start1(B)),           cl_uint(viennacl::traits::start2(B)), 
+                                        cl_uint(viennacl::traits::size1(B)),            cl_uint(viennacl::traits::size2(B)),
+                                        cl_uint(viennacl::traits::internal_size1(B)),   cl_uint(viennacl::traits::internal_size2(B)),
+                                 viennacl::traits::handle(C), 
+                                        cl_uint(viennacl::traits::start1(C)),         cl_uint(viennacl::traits::start2(C)), 
+                                        cl_uint(viennacl::traits::size1(C)),          cl_uint(viennacl::traits::size2(C)),
+                                        cl_uint(viennacl::traits::internal_size1(C)), cl_uint(viennacl::traits::internal_size2(C)),
+                                 viennacl::ocl::local_mem(sizeof(cpu_value_type) * (block_size+1) * block_size),
+                                 viennacl::ocl::local_mem(sizeof(cpu_value_type) * (block_size+1) * block_size)
+                                )
+                              );        
+      }
     }
 
 
-
     /** @brief Carries out matrix-matrix multiplication
     *
-    * Implementation of C = prod(trans(A), B);
+    * Implementation of C = prod(A, B);
     *
     */
-    template<class TYPE, typename F1, typename F2, typename F3, unsigned int ALIGNMENT>
-    void prod_impl(const viennacl::matrix_expression< const matrix<TYPE, F1, ALIGNMENT>,
-                                                      const matrix<TYPE, F1, ALIGNMENT>,
-                                                      op_trans> & A, 
-                    const viennacl::matrix<TYPE, F2, ALIGNMENT> & B, 
-                          viennacl::matrix<TYPE, F3, ALIGNMENT> & C)
-    {
-      assert(A.size2() == C.size1());
-      assert(A.size1() == B.size1());
-      assert(B.size2() == C.size2());
+    template <typename T1, typename T2, typename T3 >
+    typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
+                                  && viennacl::is_matrix<T2>::value
+                                  && viennacl::is_matrix<T3>::value
+                                >::type
+    prod_impl(const T1 & A, 
+              const T2 & B, 
+                    T3 & C, 
+              int block_size = 16) // [JW] added ability to set block size from outside ..
+    {
+      assert(viennacl::traits::size1(A) == viennacl::traits::size1(C));
+      assert(viennacl::traits::size2(A) == viennacl::traits::size1(B));
+      assert(viennacl::traits::size2(B) == viennacl::traits::size2(C));
       // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
-      assert(C.handle() != A.lhs().handle() 
-             && C.handle() != B.handle()
-             && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
-      
-      int block_size = 15;
-
-      typedef typename viennacl::tools::MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<TYPE, F1, ALIGNMENT>,
-                                                                          viennacl::matrix<TYPE, F2, ALIGNMENT>,
-                                                                          viennacl::matrix<TYPE, F3, ALIGNMENT> >::ResultType    KernelClass;
-      KernelClass::init();
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "prod_TA");
-      
-      k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size1(), block_size));
-      k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size2(), block_size));
-      
-      k.local_work_size(0, block_size);
-      k.local_work_size(1, block_size);
-      viennacl::ocl::enqueue(
-              k(A.lhs(), cl_uint(0), cl_uint(0), 
-                         cl_uint(A.lhs().size1()), cl_uint(A.lhs().size2()),
-                         cl_uint(A.lhs().internal_size1()), cl_uint(A.lhs().internal_size2()),
-                B, cl_uint(0), cl_uint(0),
-                   cl_uint(B.size1()), cl_uint(B.size2()),
-                   cl_uint(B.internal_size1()), cl_uint(B.internal_size2()),
-                C, cl_uint(0), cl_uint(0),
-                   cl_uint(C.size1()), cl_uint(C.size2()),
-                   cl_uint(C.internal_size1()), cl_uint(C.internal_size2()),
-                viennacl::ocl::local_mem(sizeof(TYPE) * block_size * block_size),
-                viennacl::ocl::local_mem(sizeof(TYPE) * block_size * block_size) )
-                            );        
+      assert(viennacl::traits::handle(C).get() != viennacl::traits::handle(A).get() 
+            && viennacl::traits::handle(C).get() != viennacl::traits::handle(B).get()
+            && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
+        
+      detail::prod(A, B, C, "prod_AA", block_size);
     }
 
 
-    /** @brief Carries out matrix-matrix multiplication for submatrices
+
+    /** @brief Carries out matrix-matrix multiplication
     *
-    * Implementation of C = prod(trans(A), B); for submatrices
+    * Implementation of C = prod(trans(A), B);
     *
     */
-    template <typename M1, typename M2, typename M3>
-    void prod_impl(const viennacl::matrix_expression< const matrix_range<M1>,
-                                                      const matrix_range<M1>,
-                                                      op_trans> & A_trans, 
-                    const viennacl::matrix_range<M2> & B, 
-                          viennacl::matrix_range<M3> & C)
-    {
-      typedef typename M1::value_type::value_type    value_type;
-      assert(A_trans.size2() == C.size1());
-      assert(A_trans.size1() == B.size1());
-      assert(B.size2() == C.size2());
+    template <typename T1, typename T2, typename T3 >
+    typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
+                                  && viennacl::is_matrix<T2>::value
+                                  && viennacl::is_matrix<T3>::value
+                                >::type
+    prod_impl(const viennacl::matrix_expression< const T1,
+                                                 const T1,
+                                                 op_trans> & A, 
+              const T2 & B, 
+                    T3 & C, 
+              int block_size = 16)
+    {
+      //std::cout << "size2(A): " << viennacl::traits::size2(A.lhs()) << std::endl;
+      //std::cout << "size1(C): " << viennacl::traits::size1(C) << std::endl;
+      assert(viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C));
+      assert(viennacl::traits::size1(A.lhs()) == viennacl::traits::size1(B));
+      assert(viennacl::traits::size2(B) == viennacl::traits::size2(C));
       // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
-      assert(C.get().handle() != A_trans.lhs().get().handle() 
-             && C.get().handle() != B.get().handle()
-             && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
-      
-      int block_size = 15;
-
-      typedef typename viennacl::tools::MATRIX_PROD_KERNEL_CLASS_DEDUCER< M1, M2, M3 >::ResultType    KernelClass;
-      KernelClass::init();
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "prod_TA");
-      
-      k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size1(), block_size));
-      k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size2(), block_size));
-      
-      k.local_work_size(0, block_size);
-      k.local_work_size(1, block_size);
+      assert(viennacl::traits::handle(C).get() != viennacl::traits::handle(A.lhs()).get() 
+            && viennacl::traits::handle(C).get() != viennacl::traits::handle(B).get()
+            && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
       
-      const matrix_range<M1> & A = A_trans.lhs();
-      viennacl::ocl::enqueue(
-              k(A.get(), cl_uint(A.start1()), cl_uint(A.start2()),
-                         cl_uint(A.size1()), cl_uint(A.size2()),
-                         cl_uint(A.get().internal_size1()), cl_uint(A.get().internal_size2()),
-                B.get(), cl_uint(B.start1()), cl_uint(B.start2()), 
-                         cl_uint(B.size1()), cl_uint(B.size2()),
-                         cl_uint(B.get().internal_size1()), cl_uint(B.get().internal_size2()),
-                C.get(), cl_uint(C.start1()), cl_uint(C.start2()),
-                         cl_uint(C.size1()), cl_uint(C.size2()),
-                         cl_uint(C.get().internal_size1()), cl_uint(C.get().internal_size2()),
-                viennacl::ocl::local_mem(sizeof(value_type) * block_size * block_size),
-                viennacl::ocl::local_mem(sizeof(value_type) * block_size * block_size) )
-                            );        
+      detail::prod(A.lhs(), B, C, "prod_TA", block_size);
     }
 
 
@@ -610,214 +526,67 @@ namespace viennacl
     * Implementation of C = prod(A, trans(B));
     *
     */
-    template<class TYPE, typename F1, typename F2, typename F3, unsigned int ALIGNMENT>
-    void prod_impl(const viennacl::matrix<TYPE, F1, ALIGNMENT> & A, 
-                   const viennacl::matrix_expression< const matrix<TYPE, F2, ALIGNMENT>,
-                                                      const matrix<TYPE, F2, ALIGNMENT>,
-                                                      op_trans> & B,
-                   viennacl::matrix<TYPE, F3, ALIGNMENT> & C)
-    {
-      assert(A.size1() == C.size1());
-      assert(A.size2() == B.size2());
-      assert(B.size1() == C.size2());
-      // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
-      assert(C.handle() != A.handle() 
-             && C.handle() != B.lhs().handle()
-             && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
-      
-      int block_size = 15;
-
-      typedef typename viennacl::tools::MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<TYPE, F1, ALIGNMENT>,
-                                                                          viennacl::matrix<TYPE, F2, ALIGNMENT>,
-                                                                          viennacl::matrix<TYPE, F3, ALIGNMENT> >::ResultType    KernelClass;
-      KernelClass::init();
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "prod_AT");
-      
-      k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size1(), block_size));
-      k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size2(), block_size));
-      
-      k.local_work_size(0, block_size);
-      k.local_work_size(1, block_size);
-      viennacl::ocl::enqueue(
-              k(A, cl_uint(0), cl_uint(0),
-                   cl_uint(A.size1()), cl_uint(A.size2()),
-                   cl_uint(A.internal_size1()), cl_uint(A.internal_size2()),
-                B.lhs(), cl_uint(0), cl_uint(0),
-                         cl_uint(B.lhs().size1()), cl_uint(B.lhs().size2()),
-                         cl_uint(B.lhs().internal_size1()), cl_uint(B.lhs().internal_size2()),
-                C, cl_uint(0), cl_uint(0),
-                   cl_uint(C.size1()), cl_uint(C.size2()),
-                   cl_uint(C.internal_size1()), cl_uint(C.internal_size2()),
-                viennacl::ocl::local_mem(sizeof(TYPE) * block_size * block_size),
-                viennacl::ocl::local_mem(sizeof(TYPE) * block_size * block_size) )
-                            );        
-    }
-
-
-    /** @brief Carries out matrix-matrix multiplication for submatrices
-    *
-    * Implementation of C = prod(A, trans(B)); for submatrices
-    *
-    */
-    template <typename M1, typename M2, typename M3>
-    void prod_impl(const viennacl::matrix_range<M1> & A, 
-                   const viennacl::matrix_expression< const matrix_range<M2>,
-                                                      const matrix_range<M2>,
-                                                      op_trans> & B_trans,
-                   viennacl::matrix_range<M3> & C)
-    {
-      typedef typename M1::value_type::value_type    value_type;
-      assert(A.size1() == C.size1());
-      assert(A.size2() == B_trans.size2());
-      assert(B_trans.size1() == C.size2());
+    template <typename T1, typename T2, typename T3 >
+    typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
+                                  && viennacl::is_matrix<T2>::value
+                                  && viennacl::is_matrix<T3>::value
+                                >::type
+    prod_impl(const T1 & A, 
+              const viennacl::matrix_expression< const T2,
+                                                 const T2,
+                                                 op_trans> & B,
+              T3 & C, 
+              int block_size = 16)
+    {
+      assert(viennacl::traits::size1(A) == viennacl::traits::size1(C));
+      assert(viennacl::traits::size2(A) == viennacl::traits::size2(B.lhs()));
+      assert(viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C));
       // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
-      assert(C.get().handle() != A.get().handle() 
-             && C.get().handle() != B_trans.lhs().get().handle()
-             && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
-      
-      int block_size = 15;
-
-      typedef typename viennacl::tools::MATRIX_PROD_KERNEL_CLASS_DEDUCER< M1, M2, M3 >::ResultType    KernelClass;
-      KernelClass::init();
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "prod_AT");
+      assert(viennacl::traits::handle(C).get() != viennacl::traits::handle(A).get() 
+            && viennacl::traits::handle(C).get() != viennacl::traits::handle(B.lhs()).get()
+            && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
       
-      k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size1(), block_size));
-      k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size2(), block_size));
-      
-      k.local_work_size(0, block_size);
-      k.local_work_size(1, block_size);
-      const matrix_range<M2> & B = B_trans.lhs();
-      viennacl::ocl::enqueue(
-              k(A.get(), cl_uint(A.start1()), cl_uint(A.start2()),
-                         cl_uint(A.size1()), cl_uint(A.size2()),
-                         cl_uint(A.get().internal_size1()), cl_uint(A.get().internal_size2()),
-                B.get(), cl_uint(B.start1()), cl_uint(B.start2()),
-                         cl_uint(B.size1()), cl_uint(B.size2()),
-                         cl_uint(B.get().internal_size1()), cl_uint(B.get().internal_size2()),
-                C.get(), cl_uint(C.start1()), cl_uint(C.start2()),
-                         cl_uint(C.size1()), cl_uint(C.size2()),
-                         cl_uint(C.get().internal_size1()), cl_uint(C.get().internal_size2()),
-                viennacl::ocl::local_mem(sizeof(value_type) * block_size * block_size),
-                viennacl::ocl::local_mem(sizeof(value_type) * block_size * block_size) )
-                            );        
+      detail::prod(A, B.lhs(), C, "prod_AT", block_size);
     }
 
 
 
-
-
-
-
-
-
     /** @brief Carries out matrix-matrix multiplication
     *
     * Implementation of C = prod(trans(A), trans(B));
     *
     */
-    template<class TYPE, typename F1, typename F2, typename F3, unsigned int ALIGNMENT>
-    void prod_impl(const viennacl::matrix_expression< const matrix<TYPE, F1, ALIGNMENT>,
-                                                      const matrix<TYPE, F1, ALIGNMENT>,
-                                                      op_trans> & A,
-                   const viennacl::matrix_expression< const matrix<TYPE, F2, ALIGNMENT>,
-                                                      const matrix<TYPE, F2, ALIGNMENT>,
-                                                      op_trans> & B,
-                   viennacl::matrix<TYPE, F3, ALIGNMENT> & C)
-    {
-      assert(A.size2() == C.size1());
-      assert(A.size1() == B.size2());
-      assert(B.size1() == C.size2());
+    template <typename T1, typename T2, typename T3 >
+    typename viennacl::enable_if<    viennacl::is_matrix<T1>::value
+                                  && viennacl::is_matrix<T2>::value
+                                  && viennacl::is_matrix<T3>::value
+                                >::type
+    prod_impl(const viennacl::matrix_expression< const T1,
+                                                 const T1,
+                                                 op_trans> & A,
+              const viennacl::matrix_expression< const T2,
+                                                 const T2,
+                                                 op_trans> & B,
+              T3 & C, 
+              int block_size = 16)
+    {
+      assert(viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C));
+      assert(viennacl::traits::size1(A.lhs()) == viennacl::traits::size2(B.lhs()));
+      assert(viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C));
       // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
-      assert(C.handle() != A.lhs().handle() 
-             && C.handle() != B.lhs().handle()
-             && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
-      
-      int block_size = 15;
-
-      typedef typename viennacl::tools::MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<TYPE, F1, ALIGNMENT>,
-                                                                          viennacl::matrix<TYPE, F2, ALIGNMENT>,
-                                                                          viennacl::matrix<TYPE, F3, ALIGNMENT> >::ResultType    KernelClass;
-      KernelClass::init();
+      assert(viennacl::traits::handle(C).get() != viennacl::traits::handle(A.lhs()).get() 
+            && viennacl::traits::handle(C).get() != viennacl::traits::handle(B.lhs()).get()
+            && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
       
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "prod_TT");
-      
-      k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size1(), block_size));
-      k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size2(), block_size));
-      
-      k.local_work_size(0, block_size);
-      k.local_work_size(1, block_size);
-      viennacl::ocl::enqueue(
-            k(A.lhs(), cl_uint(0), cl_uint(0), 
-                       cl_uint(A.lhs().size1()), cl_uint(A.lhs().size2()),
-                       cl_uint(A.lhs().internal_size1()), cl_uint(A.lhs().internal_size2()),
-              B.lhs(), cl_uint(0), cl_uint(0), 
-                       cl_uint(B.lhs().size1()), cl_uint(B.lhs().size2()),
-                       cl_uint(B.lhs().internal_size1()), cl_uint(B.lhs().internal_size2()),
-              C, cl_uint(0), cl_uint(0), 
-                 cl_uint(C.size1()), cl_uint(C.size2()),
-                 cl_uint(C.internal_size1()), cl_uint(C.internal_size2()),
-              viennacl::ocl::local_mem(sizeof(TYPE) * block_size * block_size),
-              viennacl::ocl::local_mem(sizeof(TYPE) * block_size * block_size) )
-                            );        
+      detail::prod(A.lhs(), B.lhs(), C, "prod_TT", block_size);
     }
 
 
-    /** @brief Carries out matrix-matrix multiplication for submatrices
-    *
-    * Implementation of C = prod(trans(A), trans(B)); for submatrices
-    *
-    */
-    template <typename M1, typename M2, typename M3>
-    void prod_impl(const viennacl::matrix_expression< const matrix_range<M1>,
-                                                      const matrix_range<M1>,
-                                                      op_trans> & A_trans,
-                   const viennacl::matrix_expression< const matrix_range<M2>,
-                                                      const matrix_range<M2>,
-                                                      op_trans> & B_trans,
-                   viennacl::matrix_range<M3> & C)
-    {
-      typedef typename M1::value_type::value_type    value_type;
-      assert(A_trans.size2() == C.size1());
-      assert(A_trans.size1() == B_trans.size2());
-      assert(B_trans.size1() == C.size2());
-      // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
-      assert(C.get().handle() != A_trans.lhs().get().handle() 
-             && C.get().handle() != B_trans.lhs().get().handle()
-             && "No direct inplace matrix-matrix product possible. Introduce a temporary!");
-      
-      int block_size = 15;
-
-      typedef typename viennacl::tools::MATRIX_PROD_KERNEL_CLASS_DEDUCER< M1, M2, M3 >::ResultType    KernelClass;
-      KernelClass::init();
-      
-      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "prod_TT");
-      
-      k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size1(), block_size));
-      k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size2(), block_size));
-      
-      k.local_work_size(0, block_size);
-      k.local_work_size(1, block_size);
-      const matrix_range<M1> & A = A_trans.lhs();
-      const matrix_range<M2> & B = B_trans.lhs();
-      viennacl::ocl::enqueue(
-            k(A.get(), cl_uint(A.start1()), cl_uint(A.start2()),
-                       cl_uint(A.size1()), cl_uint(A.size2()),
-                       cl_uint(A.get().internal_size1()), cl_uint(A.get().internal_size2()),
-              B.get(), cl_uint(B.start1()), cl_uint(B.start2()),
-                       cl_uint(B.size1()), cl_uint(B.size2()),
-                       cl_uint(B.get().internal_size1()), cl_uint(B.get().internal_size2()),
-              C.get(), cl_uint(C.start1()), cl_uint(C.start2()),
-                       cl_uint(C.size1()), cl_uint(C.size2()),
-                       cl_uint(C.get().internal_size1()), cl_uint(C.get().internal_size2()),
-              viennacl::ocl::local_mem(sizeof(value_type) * block_size * block_size),
-              viennacl::ocl::local_mem(sizeof(value_type) * block_size * block_size) )
-                            );        
-    }
-
 
 
+    //
+    /////////////////////////   miscellaneous operations /////////////////////////////////
+    //
 
 
 
@@ -897,221 +666,231 @@ namespace viennacl
   } //namespace linalg
 
 
-    //v = A * x
-    /** @brief Implementation of the operation v1 = A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <typename F, unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const viennacl::vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                          const viennacl::vector<SCALARTYPE, ALIGNMENT>,
-                                                                                          viennacl::op_prod> & proxy) 
-    {
-      // check for the special case x = A * x
-      if (proxy.rhs().handle() == this->handle())
-      {
-        viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-        *this = result;
-        return *this;
-      }
-      else
-      {
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-      return *this;
-    }
 
-    //v += A * x
-    /** @brief Implementation of the operation v1 += A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <typename F, unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+=(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                 const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                 op_prod> & proxy) 
+
+  //
+  /////////////////////////  Operator overloads /////////////////////////////////
+  //
+
+
+
+
+
+  //v = A * x
+  /** @brief Implementation of the operation v1 = A * v2, where A is a matrix
+  *
+  * @param proxy  An expression template proxy class.
+  */
+  template <typename SCALARTYPE, unsigned int ALIGNMENT>
+  template <typename F, unsigned int MAT_ALIGNMENT>
+  viennacl::vector<SCALARTYPE, ALIGNMENT> & 
+  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const viennacl::vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
+                                                                                        const viennacl::vector<SCALARTYPE, ALIGNMENT>,
+                                                                                        viennacl::op_prod> & proxy) 
+  {
+    // check for the special case x = A * x
+    if (proxy.rhs().handle().get() == this->handle().get())
     {
-      vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
+      viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
       viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      *this += result;
+      *this = result;
       return *this;
     }
-
-    /** @brief Implementation of the operation v1 -= A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <typename F, unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-=(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                 const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                 op_prod> & proxy) 
+    else
     {
-      vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      *this -= result;
+      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
       return *this;
     }
-    
-    
-    //free functions:
-    /** @brief Implementation of the operation 'result = v1 + A * v2', where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <typename F, unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
+    return *this;
+  }
+
+  //v += A * x
+  /** @brief Implementation of the operation v1 += A * v2, where A is a matrix
+  *
+  * @param proxy  An expression template proxy class.
+  */
+  template <typename SCALARTYPE, unsigned int ALIGNMENT>
+  template <typename F, unsigned int MAT_ALIGNMENT>
+  viennacl::vector<SCALARTYPE, ALIGNMENT> & 
+  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+=(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
                                                                                 const vector<SCALARTYPE, ALIGNMENT>,
                                                                                 op_prod> & proxy) 
-    {
-      assert(proxy.lhs().size1() == size());
-      vector<SCALARTYPE, ALIGNMENT> result(size());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      result += *this;
-      return result;
-    }
-
-    /** @brief Implementation of the operation 'result = v1 - A * v2', where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <typename F, unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
+  {
+    vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
+    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+    *this += result;
+    return *this;
+  }
+
+  /** @brief Implementation of the operation v1 -= A * v2, where A is a matrix
+  *
+  * @param proxy  An expression template proxy class.
+  */
+  template <typename SCALARTYPE, unsigned int ALIGNMENT>
+  template <typename F, unsigned int MAT_ALIGNMENT>
+  viennacl::vector<SCALARTYPE, ALIGNMENT> & 
+  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-=(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
                                                                                 const vector<SCALARTYPE, ALIGNMENT>,
                                                                                 op_prod> & proxy) 
+  {
+    vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
+    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+    *this -= result;
+    return *this;
+  }
+  
+  
+  //free functions:
+  /** @brief Implementation of the operation 'result = v1 + A * v2', where A is a matrix
+  *
+  * @param proxy  An expression template proxy class.
+  */
+  template <typename SCALARTYPE, unsigned int ALIGNMENT>
+  template <typename F, unsigned int MAT_ALIGNMENT>
+  viennacl::vector<SCALARTYPE, ALIGNMENT> 
+  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
+                                                                              const vector<SCALARTYPE, ALIGNMENT>,
+                                                                              op_prod> & proxy) 
+  {
+    assert(proxy.lhs().size1() == size());
+    vector<SCALARTYPE, ALIGNMENT> result(size());
+    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+    result += *this;
+    return result;
+  }
+
+  /** @brief Implementation of the operation 'result = v1 - A * v2', where A is a matrix
+  *
+  * @param proxy  An expression template proxy class.
+  */
+  template <typename SCALARTYPE, unsigned int ALIGNMENT>
+  template <typename F, unsigned int MAT_ALIGNMENT>
+  viennacl::vector<SCALARTYPE, ALIGNMENT> 
+  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
+                                                                              const vector<SCALARTYPE, ALIGNMENT>,
+                                                                              op_prod> & proxy) 
+  {
+    assert(proxy.lhs().size1() == size());
+    vector<SCALARTYPE, ALIGNMENT> result(size());
+    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+    result = *this - result;
+    return result;
+  }
+
+
+  ////////// transposed_matrix_proxy
+
+
+  //v = trans(A) * x
+  /** @brief Implementation of the operation v1 = A * v2, where A is a matrix
+  *
+  * @param proxy  An expression template proxy class.
+  */
+  template <typename SCALARTYPE, unsigned int ALIGNMENT>
+  template <typename F, unsigned int MAT_ALIGNMENT>
+  viennacl::vector<SCALARTYPE, ALIGNMENT> & 
+  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const viennacl::vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
+                                                                                                                  const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
+                                                                                                                  op_trans>,
+                                                                                        const viennacl::vector<SCALARTYPE, ALIGNMENT>,
+                                                                                        viennacl::op_prod> & proxy) 
+  {
+    // check for the special case x = trans(A) * x
+    if (proxy.rhs().handle().get() == this->handle().get())
     {
-      assert(proxy.lhs().size1() == size());
-      vector<SCALARTYPE, ALIGNMENT> result(size());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      result = *this - result;
-      return result;
-    }
-
-
-    ////////// transposed_matrix_proxy
-
-
-    //v = trans(A) * x
-    /** @brief Implementation of the operation v1 = A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <typename F, unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const viennacl::vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                                   const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                                   op_trans>,
-                                                                                          const viennacl::vector<SCALARTYPE, ALIGNMENT>,
-                                                                                          viennacl::op_prod> & proxy) 
-    {
-      // check for the special case x = trans(A) * x
-      if (proxy.rhs().handle() == this->handle())
-      {
-        viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-        *this = result;
-        return *this;
-      }
-      else
-      {
-        viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
-        return *this;
-      }
-      return *this;
-    }
-
-    //v += A * x
-    /** @brief Implementation of the operation v1 += A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <typename F, unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+=(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                          const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                          op_trans>,
-                                                                                 const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                 op_prod> & proxy) 
-    {
-      vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
+      viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
       viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      *this += result;
+      *this = result;
       return *this;
     }
-
-    /** @brief Implementation of the operation v1 -= A * v2, where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <typename F, unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> & 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-=(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                          const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                          op_trans>,
-                                                                                 const vector<SCALARTYPE, ALIGNMENT>,
-                                                                                 op_prod> & proxy) 
+    else
     {
-      vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      *this -= result;
+      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
       return *this;
     }
-    
-    
-    //free functions:
-    /** @brief Implementation of the operation 'result = v1 + A * v2', where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <typename F, unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                         const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                         op_trans>,
+    return *this;
+  }
+
+  //v += A * x
+  /** @brief Implementation of the operation v1 += A * v2, where A is a matrix
+  *
+  * @param proxy  An expression template proxy class.
+  */
+  template <typename SCALARTYPE, unsigned int ALIGNMENT>
+  template <typename F, unsigned int MAT_ALIGNMENT>
+  viennacl::vector<SCALARTYPE, ALIGNMENT> & 
+  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+=(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
+                                                                                                        const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
+                                                                                                        op_trans>,
                                                                                 const vector<SCALARTYPE, ALIGNMENT>,
                                                                                 op_prod> & proxy) 
-    {
-      assert(proxy.lhs().size1() == size());
-      vector<SCALARTYPE, ALIGNMENT> result(size());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      result += *this;
-      return result;
-    }
-
-    /** @brief Implementation of the operation 'result = v1 - A * v2', where A is a matrix
-    *
-    * @param proxy  An expression template proxy class.
-    */
-    template <typename SCALARTYPE, unsigned int ALIGNMENT>
-    template <typename F, unsigned int MAT_ALIGNMENT>
-    viennacl::vector<SCALARTYPE, ALIGNMENT> 
-    viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                         const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
-                                                                                                         op_trans>,
+  {
+    vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
+    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+    *this += result;
+    return *this;
+  }
+
+  /** @brief Implementation of the operation v1 -= A * v2, where A is a matrix
+  *
+  * @param proxy  An expression template proxy class.
+  */
+  template <typename SCALARTYPE, unsigned int ALIGNMENT>
+  template <typename F, unsigned int MAT_ALIGNMENT>
+  viennacl::vector<SCALARTYPE, ALIGNMENT> & 
+  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-=(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
+                                                                                                        const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
+                                                                                                        op_trans>,
                                                                                 const vector<SCALARTYPE, ALIGNMENT>,
                                                                                 op_prod> & proxy) 
-    {
-      assert(proxy.lhs().size1() == size());
-      vector<SCALARTYPE, ALIGNMENT> result(size());
-      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
-      result = *this - result;
-      return result;
-    }
+  {
+    vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
+    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+    *this -= result;
+    return *this;
+  }
+  
+  
+  //free functions:
+  /** @brief Implementation of the operation 'result = v1 + A * v2', where A is a matrix
+  *
+  * @param proxy  An expression template proxy class.
+  */
+  template <typename SCALARTYPE, unsigned int ALIGNMENT>
+  template <typename F, unsigned int MAT_ALIGNMENT>
+  viennacl::vector<SCALARTYPE, ALIGNMENT> 
+  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
+                                                                                                        const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
+                                                                                                        op_trans>,
+                                                                              const vector<SCALARTYPE, ALIGNMENT>,
+                                                                              op_prod> & proxy) 
+  {
+    assert(proxy.lhs().size1() == size());
+    vector<SCALARTYPE, ALIGNMENT> result(size());
+    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+    result += *this;
+    return result;
+  }
+
+  /** @brief Implementation of the operation 'result = v1 - A * v2', where A is a matrix
+  *
+  * @param proxy  An expression template proxy class.
+  */
+  template <typename SCALARTYPE, unsigned int ALIGNMENT>
+  template <typename F, unsigned int MAT_ALIGNMENT>
+  viennacl::vector<SCALARTYPE, ALIGNMENT> 
+  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
+                                                                                                        const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
+                                                                                                        op_trans>,
+                                                                              const vector<SCALARTYPE, ALIGNMENT>,
+                                                                              op_prod> & proxy) 
+  {
+    assert(proxy.lhs().size1() == size());
+    vector<SCALARTYPE, ALIGNMENT> result(size());
+    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+    result = *this - result;
+    return result;
+  }
 
 
 } //namespace viennacl
diff --git a/viennacl/linalg/norm_1.hpp b/viennacl/linalg/norm_1.hpp
index 5805be6..2428a99 100644
--- a/viennacl/linalg/norm_1.hpp
+++ b/viennacl/linalg/norm_1.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_NORM_1_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -42,10 +42,10 @@ namespace viennacl
     // UBLAS
     //
     template< typename VectorT >
-    typename VectorT::value_type
-    norm_1(VectorT const& vector, 
-         typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT >::type >::value
-                                     >::type* dummy = 0)
+    typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT >::type >::value,
+                                  typename VectorT::value_type      
+                                >::type    
+    norm_1(VectorT const& vector)
     {
       // std::cout << "ublas .. " << std::endl;
       return boost::numeric::ublas::norm_1(vector);
diff --git a/viennacl/linalg/norm_2.hpp b/viennacl/linalg/norm_2.hpp
index d046a18..3c38c28 100644
--- a/viennacl/linalg/norm_2.hpp
+++ b/viennacl/linalg/norm_2.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_NORM_2_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/linalg/norm_inf.hpp b/viennacl/linalg/norm_inf.hpp
index a5ac852..8ddcd20 100644
--- a/viennacl/linalg/norm_inf.hpp
+++ b/viennacl/linalg/norm_inf.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_NORM_INF_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -42,10 +42,10 @@ namespace viennacl
     // UBLAS
     //
     template< typename VectorT >
-    typename VectorT::value_type
-    norm_inf(VectorT const& v1, 
-         typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT >::type >::value
-                                            >::type* dummy = 0)
+    typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT >::type >::value,
+                                  typename VectorT::value_type      
+                                >::type    
+    norm_inf(VectorT const& v1)
     {
       // std::cout << "ublas .. " << std::endl;
       return boost::numeric::ublas::norm_inf(v1);
diff --git a/viennacl/linalg/prod.hpp b/viennacl/linalg/prod.hpp
index d5965fc..3e147f7 100644
--- a/viennacl/linalg/prod.hpp
+++ b/viennacl/linalg/prod.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_PROD_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -142,9 +142,7 @@ namespace viennacl
                                  const viennacl::matrix_range<MatrixT2>,
                                  viennacl::op_prod >
     prod(MatrixT1 const& A,
-         viennacl::matrix_range<MatrixT2> const& B, 
-         typename viennacl::enable_if< viennacl::is_viennacl< typename viennacl::traits::tag_of< MatrixT1 >::type >::value
-                                     >::type* dummy = 0)
+         viennacl::matrix_range<MatrixT2> const& B)
     {
       // std::cout << "viennacl .. " << std::endl;
       return viennacl::matrix_expression< const MatrixT1, 
@@ -162,9 +160,7 @@ namespace viennacl
     prod(MatrixT1 const & A,
          viennacl::matrix_expression<const viennacl::matrix_range<MatrixT2>,
                                      const viennacl::matrix_range<MatrixT2>,
-                                     op_trans> const & B, 
-         typename viennacl::enable_if< viennacl::is_viennacl< typename viennacl::traits::tag_of< MatrixT2 >::type >::value
-                                     >::type* dummy = 0)
+                                     op_trans> const & B)
     {
       // std::cout << "viennacl .. " << std::endl;
       return viennacl::matrix_expression< const MatrixT1, 
@@ -185,9 +181,7 @@ namespace viennacl
                                  const viennacl::vector<NumericT, ALIGNMENT>,
                                  viennacl::op_prod >
     prod(MatrixT const& matrix,
-         viennacl::vector<NumericT, ALIGNMENT> const& vector, 
-         typename viennacl::enable_if< viennacl::is_viennacl< typename viennacl::traits::tag_of< MatrixT >::type >::value
-                                     >::type* dummy = 0)
+         viennacl::vector<NumericT, ALIGNMENT> const& vector)
     {
       // std::cout << "viennacl .. " << std::endl;
       return viennacl::linalg::prod_impl(matrix, vector);
@@ -198,9 +192,7 @@ namespace viennacl
                                  const viennacl::matrix<NumericT, F, ALIGNMENT>,
                                  viennacl::op_prod >
     prod(MatrixT const& matrix_A,
-         viennacl::matrix<NumericT, F, ALIGNMENT> const& matrix_B, 
-         typename viennacl::enable_if< viennacl::is_viennacl< typename viennacl::traits::tag_of< MatrixT >::type >::value
-                                     >::type* dummy = 0)
+         viennacl::matrix<NumericT, F, ALIGNMENT> const& matrix_B)
     {
       // std::cout << "viennacl .. " << std::endl;
       return viennacl::matrix_expression< const MatrixT, 
@@ -217,9 +209,7 @@ namespace viennacl
     prod(MatrixT const& matrix_A,
          const viennacl::matrix_expression< const viennacl::matrix<NumericT, F, ALIGNMENT>, 
                                             const viennacl::matrix<NumericT, F, ALIGNMENT>,
-                                            viennacl::op_trans > & matrix_B,
-         typename viennacl::enable_if< viennacl::is_viennacl< typename viennacl::traits::tag_of< MatrixT >::type >::value
-                                     >::type* dummy = 0)
+                                            viennacl::op_trans > & matrix_B)
     {
       // std::cout << "viennacl .. " << std::endl;
       return viennacl::matrix_expression< const MatrixT, 
diff --git a/viennacl/linalg/qr.hpp b/viennacl/linalg/qr.hpp
index fd1f6b7..e0f28fd 100644
--- a/viennacl/linalg/qr.hpp
+++ b/viennacl/linalg/qr.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_QR_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -32,554 +32,829 @@
 #include "boost/numeric/ublas/vector.hpp"
 #include "boost/numeric/ublas/matrix.hpp"
 #include "boost/numeric/ublas/matrix_proxy.hpp"
+#include "boost/numeric/ublas/vector_proxy.hpp"
 #include "boost/numeric/ublas/io.hpp"
 #include "boost/numeric/ublas/matrix_expression.hpp"
 
 #include "viennacl/matrix.hpp"
+#include "viennacl/matrix_proxy.hpp"
 #include "viennacl/linalg/prod.hpp"
+#include "viennacl/range.hpp"
 
 namespace viennacl
 {
-    namespace linalg
+  namespace linalg
+  {
+    namespace detail
     {
       
-        // orthogonalises j-th column of A
-        template <typename MatrixType, typename VectorType>
-        typename MatrixType::value_type setup_householder_vector(MatrixType const & A, VectorType & v, size_t j)
+      // orthogonalises j-th column of A
+      template <typename MatrixType, typename VectorType>
+      typename MatrixType::value_type setup_householder_vector(MatrixType const & A, VectorType & v, std::size_t j)
+      {
+        typedef typename MatrixType::value_type   ScalarType;
+        
+        //compute norm of column below diagonal:
+        ScalarType sigma = 0;
+        ScalarType beta = 0;
+        for (std::size_t k = j+1; k<A.size1(); ++k)
+          sigma += A(k, j) * A(k, j);
+
+        //get v from A:
+        v[j] = 1;
+        //ScalarType scaling = sqrt(sigma + A(j,j)*A(j,j));
+        //ScalarType scaling = sqrt(sigma);
+        ScalarType scaling = 1.0;
+        for (std::size_t k = j+1; k<A.size1(); ++k)
+          v[k] = A(k, j) / scaling;
+        sigma = sigma / (scaling * scaling);
+        ScalarType A_jj = A(j,j) / scaling;
+        
+        std::cout << "sigma: " << sigma << std::endl;
+        assert( sigma >= 0.0  && "sigma must be non-negative!");
+
+        
+        if (sigma == 0)
+          return 0;
+        else
         {
-          typedef typename MatrixType::value_type   ScalarType;
-          
-          //compute norm of column below diagonal:
-          ScalarType sigma = 0;
-          ScalarType beta = 0;
-          for (size_t k = j+1; k<A.size1(); ++k)
-            sigma += A(k, j) * A(k, j);
-
-          //get v from A:
-          for (size_t k = j+1; k<A.size1(); ++k)
-            v[k] = A(k, j);
+          ScalarType mu = sqrt(sigma + A_jj*A_jj);
+          std::cout << "mu: " << mu << std::endl;
+          std::cout << "sigma: " << sigma << std::endl;
           
-          if (sigma == 0)
-            return 0;
+          ScalarType v1;
+          if (A_jj <= 0)
+            v1 = A_jj - mu;
           else
-          {
-            ScalarType mu = sqrt(sigma + A(j,j)*A(j,j));
-            //std::cout << "mu: " << mu << std::endl;
-            //std::cout << "sigma: " << sigma << std::endl;
-            
-            ScalarType v1;
-            if (A(j,j) <= 0)
-              v1 = A(j,j) - mu;
-            else
-              v1 = -sigma / (A(j,j) + mu);
-            
-            beta = 2.0 * v1 * v1 / (sigma + v1 * v1);
-            
-            //divide v by its diagonal element v[j]
-            v[j] = 1;
-            for (size_t k = j+1; k<A.size1(); ++k)
-              v[k] /= v1;
-          }
-            
-          return beta;
+            v1 = -sigma / (A_jj + mu);
+          
+          beta = 2.0 * v1 * v1 / (sigma + v1 * v1);
+          
+          //divide v by its diagonal element v[j]
+          v[j] = 1;
+          std::cout << "v1: " << v1 << std::endl;
+          for (std::size_t k = j+1; k<A.size1(); ++k)
+            v[k] /= v1;
         }
+          
+        return beta;
+      }
+      
 
-        // Apply (I - beta v v^T) to the k-th column of A, where v is the reflector starting at j-th row/column
-        template <typename MatrixType, typename VectorType, typename ScalarType>
-        void householder_reflect(MatrixType & A, VectorType & v, ScalarType beta, size_t j, size_t k)
-        {
-          ScalarType v_in_col = A(j,k);
-          for (size_t i=j+1; i<A.size1(); ++i)
-            v_in_col += v[i] * A(i,k);
+      template <typename MatrixType, typename VectorType>
+      typename MatrixType::value_type setup_householder_vector_ublas(MatrixType const & A, VectorType & v, MatrixType & matrix_1x1, std::size_t j)
+      {
+        using boost::numeric::ublas::range;
+        using boost::numeric::ublas::project;
+        
+        typedef typename MatrixType::value_type   ScalarType;
+        
+        //compute norm of column below diagonal:
+        //ScalarType sigma = 0;
+        //for (std::size_t k = j+1; k<A.size1(); ++k)
+        //  sigma += A(k, j) * A(k, j);
+        matrix_1x1 = prod( trans(project(A, range(j+1, A.size1()), range(j, j+1))),
+                                 project(A, range(j+1, A.size1()), range(j, j+1))
+                         );
+        ScalarType sigma = matrix_1x1(0,0);
+        ScalarType beta = 0;
+        ScalarType A_jj = A(j,j);
+        
+        assert( sigma >= 0.0  && "sigma must be non-negative!");
 
-          for (size_t i=j; i<A.size1(); ++i)
-            A(i,k) -= beta * v_in_col * v[i];
+        //get v from A:
+        //for (std::size_t k = j+1; k<A.size1(); ++k)
+        //  v[k] = A(k, j);
+        v(j,0) = 1.0;
+        project(v, range(j+1, A.size1()), range(0,1)) = project(A, range(j+1, A.size1()), range(j,j+1));
+        
+        if (sigma == 0)
+          return 0;
+        else
+        {
+          ScalarType mu = sqrt(sigma + A_jj*A_jj);
+          //std::cout << "mu: " << mu << std::endl;
+          //std::cout << "sigma: " << sigma << std::endl;
+          
+          ScalarType v1;
+          if (A_jj <= 0)
+            v1 = A_jj - mu;
+          else
+            v1 = -sigma / (A_jj + mu);
+          
+          beta = 2.0 * v1 * v1 / (sigma + v1 * v1);
+          
+          //divide v by its diagonal element v[j]
+          //v[j] = 1;
+          //for (std::size_t k = j+1; k<A.size1(); ++k)
+          //  v[k] /= v1;
+          project(v, range(j+1, A.size1()), range(0,1)) /= v1;
         }
-
-        // Apply (I - beta v v^T) to A, where v is the reflector starting at j-th row/column
-        template <typename MatrixType, typename VectorType, typename ScalarType>
-        void householder_reflect(MatrixType & A, VectorType & v, ScalarType beta, size_t j)
+          
+        return beta;
+      }
+
+
+      template <typename MatrixType, typename VectorType>
+      typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type 
+      setup_householder_vector_viennacl(MatrixType const & A, VectorType & v, MatrixType & matrix_1x1, std::size_t j)
+      {
+        //using boost::numeric::ublas::range;
+        //using boost::numeric::ublas::project;
+        using viennacl::range;
+        using viennacl::project;
+        
+        typedef typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type   ScalarType;
+        
+        //compute norm of column below diagonal:
+        //ScalarType sigma = 0;
+        //for (std::size_t k = j+1; k<A.size1(); ++k)
+        //  sigma += A(k, j) * A(k, j);
+        matrix_1x1 = viennacl::linalg::prod( trans(project(A, range(j+1, A.size1()), range(j, j+1))),
+                                                   project(A, range(j+1, A.size1()), range(j, j+1))
+                                           );
+        ScalarType sigma = matrix_1x1(0,0);
+        ScalarType beta = 0;
+        ScalarType A_jj = A(j,j);
+
+        //std::cout << "sigma: " << sigma << std::endl;
+        assert( sigma >= 0.0  && "sigma must be non-negative!");
+
+
+        //get v from A:
+        //for (std::size_t k = j+1; k<A.size1(); ++k)
+        //  v[k] = A(k, j);
+        v(j,0) = 1.0;
+        project(v, range(j+1, A.size1()), range(0,1)) = project(A, range(j+1, A.size1()), range(j,j+1));
+        
+        if (sigma == 0)
+          return 0;
+        else
         {
-          size_t column_end = A.size2();
+          ScalarType mu = sqrt(sigma + A_jj*A_jj);
+          //std::cout << "mu: " << mu << std::endl;
+          //std::cout << "sigma: " << sigma << std::endl;
           
-          for (size_t k=j; k<column_end; ++k) //over columns
-            householder_reflect(A, v, beta, j, k);
+          ScalarType v1;
+          if (A_jj <= 0)
+            v1 = A_jj - mu;
+          else
+            v1 = -sigma / (A_jj + mu);
+          
+          beta = 2.0 * v1 * v1 / (sigma + v1 * v1);
+          
+          //divide v by its diagonal element v[j]
+          //v[j] = 1;
+          //for (std::size_t k = j+1; k<A.size1(); ++k)
+          //  v[k] /= v1;
+          //v(j,0) = 1.0;
+          project(v, range(j+1, A.size1()), range(0,1)) /= v1;
         }
+          
+        return beta;
+      }
+
+
+      // Apply (I - beta v v^T) to the k-th column of A, where v is the reflector starting at j-th row/column
+      template <typename MatrixType, typename VectorType, typename ScalarType>
+      void householder_reflect(MatrixType & A, VectorType & v, ScalarType beta, std::size_t j, std::size_t k)
+      {
+        ScalarType v_in_col = A(j,k);
+        for (std::size_t i=j+1; i<A.size1(); ++i)
+          v_in_col += v[i] * A(i,k);
+
+        assert(v[j] == 1.0);
+        //std::cout << "v[]: " << v[0] << ", " << v[1] << ", " << v[2] << std::endl;
+        //std::cout << "v_in_col: " << v_in_col << std::endl;
+        
+        for (std::size_t i=j; i<A.size1(); ++i)
+          A(i,k) -= beta * v_in_col * v[i];
+      }
+
+      template <typename MatrixType, typename VectorType, typename ScalarType>
+      void householder_reflect_ublas(MatrixType & A, VectorType & v, MatrixType & matrix_1x1, ScalarType beta, std::size_t j, std::size_t k)
+      {
+        using boost::numeric::ublas::range;
+        using boost::numeric::ublas::project;
+        
+        ScalarType v_in_col = A(j,k);
+        //for (std::size_t i=j+1; i<A.size1(); ++i)
+        //  v_in_col += v[i] * A(i,k);
+
+        matrix_1x1 = prod(trans(project(v, range(j+1, A.size1()), range(0, 1))),
+                         project(A, range(j+1, A.size1()), range(k,k+1)));
+        v_in_col += matrix_1x1(0,0);
+                         
+        //for (std::size_t i=j; i<A.size1(); ++i)
+        //  A(i,k) -= beta * v_in_col * v[i];
+        
+        project(A, range(j, A.size1()), range(k, k+1)) -= (beta * v_in_col) * project(v, range(j, A.size1()), range(0, 1));
+      }
+
+      template <typename MatrixType, typename VectorType, typename ScalarType>
+      void householder_reflect_viennacl(MatrixType & A, VectorType & v, MatrixType & matrix_1x1, ScalarType beta, std::size_t j, std::size_t k)
+      {
+        //using boost::numeric::ublas::range;
+        //using boost::numeric::ublas::project;
+        using viennacl::range;
+        using viennacl::project;
         
+        ScalarType v_in_col = A(j,k);
+        //for (std::size_t i=j+1; i<A.size1(); ++i)
+        //  v_in_col += v[i] * A(i,k);
+
+        matrix_1x1 = viennacl::linalg::prod(trans(project(v, range(j+1, A.size1()), range(0, 1))),
+                                                  project(A, range(j+1, A.size1()), range(k,k+1)));
+        v_in_col += matrix_1x1(0,0);
+                         
+        //for (std::size_t i=j; i<A.size1(); ++i)
+        //  A(i,k) -= beta * v_in_col * v[i];
         
-        template <typename MatrixType, typename VectorType>
-        void write_householder_to_A(MatrixType & A, VectorType const & v, size_t j)
+        if ( beta * v_in_col != 0.0)
         {
-          for (size_t i=j+1; i<A.size1(); ++i)
-            A(i,j) = v[i];
+          VectorType temp = project(v, range(j, A.size1()), range(0, 1));
+          project(v, range(j, A.size1()), range(0, 1)) *= (beta * v_in_col);
+          project(A, range(j, A.size1()), range(k, k+1)) -= project(v, range(j, A.size1()), range(0, 1));
+          project(v, range(j, A.size1()), range(0, 1)) = temp;
         }
+      }
+
+
+      // Apply (I - beta v v^T) to A, where v is the reflector starting at j-th row/column
+      template <typename MatrixType, typename VectorType, typename ScalarType>
+      void householder_reflect(MatrixType & A, VectorType & v, ScalarType beta, std::size_t j)
+      {
+        std::size_t column_end = A.size2();
         
+        for (std::size_t k=j; k<column_end; ++k) //over columns
+          householder_reflect(A, v, beta, j, k);
+      }
+      
+      
+      template <typename MatrixType, typename VectorType>
+      void write_householder_to_A(MatrixType & A, VectorType const & v, std::size_t j)
+      {
+        for (std::size_t i=j+1; i<A.size1(); ++i)
+          A(i,j) = v[i];
+      }
+      
+      template <typename MatrixType, typename VectorType>
+      void write_householder_to_A_ublas(MatrixType & A, VectorType const & v, std::size_t j)
+      {
+        //for (std::size_t i=j+1; i<A.size1(); ++i)
+        //  A(i,j) = v[i];
+        using boost::numeric::ublas::range;
+        using boost::numeric::ublas::project;
         
-        //takes an inplace QR matrix A and generates Q and R explicitly
-        template <typename MatrixType, typename VectorType>
-        void recoverQ(MatrixType const & A, VectorType const & betas, MatrixType & Q, MatrixType & R)
+        //VectorType temp = project(v, range(j+1, A.size1()));
+        project( A, range(j+1, A.size1()), range(j, j+1) ) = project(v, range(j+1, A.size1()), range(0, 1) );;
+      }
+
+      template <typename MatrixType, typename VectorType>
+      void write_householder_to_A_viennacl(MatrixType & A, VectorType const & v, std::size_t j)
+      {
+        //for (std::size_t i=j+1; i<A.size1(); ++i)
+        //  A(i,j) = v[i];
+        //using boost::numeric::ublas::range;
+        //using boost::numeric::ublas::project;
+        using viennacl::range;
+        using viennacl::project;
+        
+        //VectorType temp = project(v, range(j+1, A.size1()));
+        project( A, range(j+1, A.size1()), range(j, j+1) ) = project(v, range(j+1, A.size1()), range(0, 1) );;
+      }
+
+      
+      /*template<typename MatrixType>
+      std::vector<typename MatrixType::value_type> qr(MatrixType & A)
+      {
+        typedef typename MatrixType::value_type   ScalarType;
+        
+        std::vector<ScalarType> betas(A.size2());
+        std::vector<ScalarType> v(A.size1());
+
+        //copy A to Q:
+        for (size_t j=0; j<A.size2(); ++j)
         {
-          typedef typename MatrixType::value_type   ScalarType;
+            betas[j] = setup_householder_vector(A, v, j);
+            householder_reflect(A, v, betas[j], j);
+            write_householder_to_A(A, v, j);
+        }
+        
+        return betas;
+      }*/
+      
+      
+      
+      
+      class range
+      {
+        public:
+          range(std::size_t start, std::size_t end) : start_(start), end_(end) {}
           
-          std::vector<ScalarType> v(A.size1());
-
-          Q.clear();
-          R.clear();
-
-          //
-          // Recover R from upper-triangular part of A:
-          //
-          size_t i_max = std::min(R.size1(), R.size2());
-          for (size_t i=0; i<i_max; ++i)
-            for (size_t j=i; j<R.size2(); ++j)
-              R(i,j) = A(i,j);
-         
-          //
-          // Recover Q by applying all the Householder reflectors to the identity matrix:
-          //
-          for (size_t i=0; i<Q.size1(); ++i)
-            Q(i,i) = 1.0;
-
-          size_t j_max = std::min(A.size1(), A.size2());
-          for (size_t j=0; j<j_max; ++j)
+          std::size_t lower() const { return start_; }
+          std::size_t upper() const { return end_; }
+          
+        private:
+          std::size_t start_;
+          std::size_t end_;
+      };
+
+      template <typename MatrixType>
+      class sub_matrix
+      {
+        public:
+          typedef typename MatrixType::value_type value_type;
+          
+          sub_matrix(MatrixType & mat,
+                      range row_range,
+                      range col_range) : mat_(mat), row_range_(row_range), col_range_(col_range) {}
+                      
+          value_type operator()(size_t row, size_t col) const
+          {
+            assert(row < size1());
+            assert(col < size2());
+            return mat_(row + row_range_.lower(), col + col_range_.lower()); 
+          }
+                      
+          std::size_t size1() const { return row_range_.upper() - row_range_.lower(); }
+          std::size_t size2() const { return col_range_.upper() - col_range_.lower(); }
+          
+        private:
+          MatrixType & mat_;
+          range row_range_;
+          range col_range_;
+      };
+
+
+      //computes C = prod(A, B)
+      template <typename MatrixTypeA, typename MatrixTypeB, typename MatrixTypeC>
+      void prod_AA(MatrixTypeA const & A, MatrixTypeB const & B, MatrixTypeC & C)
+      {
+        assert(C.size1() == A.size1());
+        assert(A.size2() == B.size1());
+        assert(B.size2() == C.size2());
+        
+        typedef typename MatrixTypeC::value_type   ScalarType;
+        
+        for (std::size_t i=0; i<C.size1(); ++i)
+        {
+          for (std::size_t j=0; j<C.size2(); ++j)
           {
-            size_t col_index = j_max - j - 1;
-            v[col_index] = 1.0;
-            for (size_t i=col_index+1; i<A.size1(); ++i)
-              v[i] = A(i, col_index);
-            
-            /*std::cout << "Recovery with beta = " << betas[col_index] << ", j=" << col_index << std::endl;
-            std::cout << "v: ";
-            for (size_t i=0; i<v.size(); ++i)
-              std::cout << v[i] << ", ";
-            std::cout << std::endl;*/
-
-            if (betas[col_index] != 0)
-              householder_reflect(Q, v, betas[col_index], col_index);
+            ScalarType val = 0;
+            for (std::size_t k=0; k<A.size2(); ++k)
+              val += A(i, k) * B(k, j);
+            C(i, j) = val;
           }
         }
-       
-        /*template<typename MatrixType>
-        std::vector<typename MatrixType::value_type> qr(MatrixType & A)
+      }
+      
+      //computes C = prod(A^T, B)
+      template <typename MatrixTypeA, typename MatrixTypeB, typename MatrixTypeC>
+      void prod_TA(MatrixTypeA const & A, MatrixTypeB const & B, MatrixTypeC & C)
+      {
+        assert(C.size1() == A.size2());
+        assert(A.size1() == B.size1());
+        assert(B.size2() == C.size2());
+        
+        typedef typename MatrixTypeC::value_type   ScalarType;
+        
+        for (std::size_t i=0; i<C.size1(); ++i)
         {
-          typedef typename MatrixType::value_type   ScalarType;
-          
-          std::vector<ScalarType> betas(A.size2());
-          std::vector<ScalarType> v(A.size1());
-
-          //copy A to Q:
-          for (size_t j=0; j<A.size2(); ++j)
+          for (std::size_t j=0; j<C.size2(); ++j)
           {
-             betas[j] = setup_householder_vector(A, v, j);
-             householder_reflect(A, v, betas[j], j);
-             write_householder_to_A(A, v, j);
+            ScalarType val = 0;
+            for (std::size_t k=0; k<A.size1(); ++k)
+              val += A(k, i) * B(k, j);
+            C(i, j) = val;
           }
-          
-          return betas;
-        }*/
+        }
+      }
+      
+
+    } //namespace detail
         
+
+
+    //takes an inplace QR matrix A and generates Q and R explicitly
+    template <typename MatrixType, typename VectorType>
+    void recoverQ(MatrixType const & A, VectorType const & betas, MatrixType & Q, MatrixType & R)
+    {
+      typedef typename MatrixType::value_type   ScalarType;
+      
+      std::vector<ScalarType> v(A.size1());
+
+      Q.clear();
+      R.clear();
+
+      //
+      // Recover R from upper-triangular part of A:
+      //
+      std::size_t i_max = std::min(R.size1(), R.size2());
+      for (std::size_t i=0; i<i_max; ++i)
+        for (std::size_t j=i; j<R.size2(); ++j)
+          R(i,j) = A(i,j);
+      
+      //
+      // Recover Q by applying all the Householder reflectors to the identity matrix:
+      //
+      for (std::size_t i=0; i<Q.size1(); ++i)
+        Q(i,i) = 1.0;
+
+      std::size_t j_max = std::min(A.size1(), A.size2());
+      for (std::size_t j=0; j<j_max; ++j)
+      {
+        std::size_t col_index = j_max - j - 1;
+        v[col_index] = 1.0;
+        for (std::size_t i=col_index+1; i<A.size1(); ++i)
+          v[i] = A(i, col_index);
         
+        /*std::cout << "Recovery with beta = " << betas[col_index] << ", j=" << col_index << std::endl;
+        std::cout << "v: ";
+        for (size_t i=0; i<v.size(); ++i)
+          std::cout << v[i] << ", ";
+        std::cout << std::endl;*/
+
+        if (betas[col_index] != 0)
+          detail::householder_reflect(Q, v, betas[col_index], col_index);
+      }
+    }
+
+
+    /** @brief Implementation of inplace-QR factorization for a general Boost.uBLAS compatible matrix A 
+     * 
+     * @param A            A dense compatible to Boost.uBLAS
+     * @param block_size   The block size to be used. The number of columns of A must be a multiple of block_size
+     */
+    template<typename MatrixType>
+    std::vector<typename MatrixType::value_type> inplace_qr_ublas(MatrixType & A, std::size_t block_size = 32)
+    {
+      typedef typename MatrixType::value_type   ScalarType;
+      typedef boost::numeric::ublas::matrix_range<MatrixType>  MatrixRange;
+      
+      using boost::numeric::ublas::range;
+      using boost::numeric::ublas::project;
+      
+      std::vector<ScalarType> betas(A.size2());
+      //boost::numeric::ublas::vector<ScalarType> v(A.size1());
+      MatrixType v(A.size1(), 1);
+      MatrixType matrix_1x1(1,1);
+
+      MatrixType Y(A.size1(), block_size); Y.clear(); Y.resize(A.size1(), block_size);
+      MatrixType W(A.size1(), block_size); W.clear(); W.resize(A.size1(), block_size);
         
-        class range
+      //run over A in a block-wise manner:
+      for (std::size_t j = 0; j < std::min(A.size1(), A.size2()); j += block_size)
+      {
+        //determine Householder vectors:
+        for (std::size_t k = 0; k < block_size; ++k)
         {
-          public:
-            range(size_t start, size_t end) : start_(start), end_(end) {}
-            
-            size_t lower() const { return start_; }
-            size_t upper() const { return end_; }
-            
-          private:
-            size_t start_;
-            size_t end_;
-        };
-
-        template <typename MatrixType>
-        class sub_matrix
+          betas[j+k] = detail::setup_householder_vector_ublas(A, v, matrix_1x1, j+k);
+          
+          for (std::size_t l = k; l < block_size; ++l)
+            detail::householder_reflect_ublas(A, v, matrix_1x1, betas[j+k], j+k, j+l);
+
+          detail::write_householder_to_A_ublas(A, v, j+k);
+        }
+
+        //
+        // Setup Y:
+        //
+        Y.clear();  Y.resize(A.size1(), block_size);
+        for (std::size_t k = 0; k < block_size; ++k)
         {
-          public:
-            typedef typename MatrixType::value_type value_type;
-            
-            sub_matrix(MatrixType & mat,
-                       range row_range,
-                       range col_range) : mat_(mat), row_range_(row_range), col_range_(col_range) {}
-                       
-            value_type operator()(size_t row, size_t col) const
-            {
-              assert(row < size1());
-              assert(col < size2());
-              return mat_(row + row_range_.lower(), col + col_range_.lower()); 
-            }
-                       
-            size_t size1() const { return row_range_.upper() - row_range_.lower(); }
-            size_t size2() const { return col_range_.upper() - col_range_.lower(); }
-            
-          private:
-            MatrixType & mat_;
-            range row_range_;
-            range col_range_;
-        };
-
-
-        //computes C = prod(A, B)
-        template <typename MatrixTypeA, typename MatrixTypeB, typename MatrixTypeC>
-        void prod_AA(MatrixTypeA const & A, MatrixTypeB const & B, MatrixTypeC & C)
+          //write Householder to Y:
+          Y(j+k,k) = 1.0;
+          project(Y, range(j+k+1, A.size1()), range(k, k+1)) = project(A, range(j+k+1, A.size1()), range(j+k, j+k+1));
+        }
+        
+        //
+        // Setup W:
+        //
+        
+        //first vector:
+        W.clear();  W.resize(A.size1(), block_size);
+        W(j, 0) = -betas[j];
+        project(W, range(j+1, A.size1()), range(0, 1)) = -betas[j] * project(A, range(j+1, A.size1()), range(j, j+1));
+        
+        
+        //k-th column of W is given by -beta * (Id + W*Y^T) v_k, where W and Y have k-1 columns
+        for (std::size_t k = 1; k < block_size; ++k)
         {
-          assert(C.size1() == A.size1());
-          assert(A.size2() == B.size1());
-          assert(B.size2() == C.size2());
-          
-          typedef typename MatrixTypeC::value_type   ScalarType;
+          MatrixRange Y_old = project(Y, range(j, A.size1()), range(0, k));
+          MatrixRange v_k   = project(Y, range(j, A.size1()), range(k, k+1));
+          MatrixRange W_old = project(W, range(j, A.size1()), range(0, k));
+          MatrixRange z     = project(W, range(j, A.size1()), range(k, k+1));
           
-          for (size_t i=0; i<C.size1(); ++i)
-          {
-            for (size_t j=0; j<C.size2(); ++j)
-            {
-              ScalarType val = 0;
-              for (size_t k=0; k<A.size2(); ++k)
-                val += A(i, k) * B(k, j);
-              C(i, j) = val;
-            }
-          }
+          MatrixType YT_prod_v = boost::numeric::ublas::prod(boost::numeric::ublas::trans(Y_old), v_k);
+          z = - betas[j+k] * (v_k + prod(W_old, YT_prod_v));
         }
+
+        //
+        //apply (I+WY^T)^T = I + Y W^T to the remaining columns of A:
+        //
         
-        //computes C = prod(A^T, B)
-        template <typename MatrixTypeA, typename MatrixTypeB, typename MatrixTypeC>
-        void prod_TA(MatrixTypeA const & A, MatrixTypeB const & B, MatrixTypeC & C)
+        if (A.size2() - j - block_size > 0)
         {
-          assert(C.size1() == A.size2());
-          assert(A.size1() == B.size1());
-          assert(B.size2() == C.size2());
           
-          typedef typename MatrixTypeC::value_type   ScalarType;
+          MatrixRange A_part(A, range(j, A.size1()), range(j+block_size, A.size2()));
+          MatrixRange W_part(W, range(j, A.size1()), range(0, block_size));
+          MatrixType temp = boost::numeric::ublas::prod(trans(W_part), A_part);
           
-          for (size_t i=0; i<C.size1(); ++i)
-          {
-            for (size_t j=0; j<C.size2(); ++j)
-            {
-              ScalarType val = 0;
-              for (size_t k=0; k<A.size1(); ++k)
-                val += A(k, i) * B(k, j);
-              C(i, j) = val;
-            }
-          }
+          A_part += prod(project(Y, range(j, A.size1()), range(0, Y.size2())),
+                         temp);
+        }
+      }
+
+      return betas;
+    }
+
+
+    /** @brief Implementation of a OpenCL-only QR factorization for GPUs (or multi-core CPU) 
+     * 
+     * Performance is rather poor at small matrix sizes.
+     * Prefer the use of the hybrid version, which is automatically chosen using the interface function inplace_qr()
+     * 
+     * @param A            A dense ViennaCL matrix to be factored
+     * @param block_size   The block size to be used. The number of columns of A must be a multiple of block_size
+     */
+    template<typename MatrixType>
+    std::vector< typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type > 
+    inplace_qr_viennacl(MatrixType & A, std::size_t block_size = 16)
+    {
+      typedef typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type   ScalarType;
+      typedef viennacl::matrix_range<MatrixType>  MatrixRange;
+      
+      //using boost::numeric::ublas::range;
+      //using boost::numeric::ublas::project;
+      using viennacl::range;
+      using viennacl::project;
+      
+      std::vector<ScalarType> betas(A.size2());
+      //boost::numeric::ublas::vector<ScalarType> v(A.size1());
+      MatrixType v(A.size1(), 1);
+      MatrixType matrix_1x1(1,1);
+
+      MatrixType Y(A.size1(), block_size); Y.clear();
+      MatrixType W(A.size1(), block_size); W.clear();
+
+      MatrixType YT_prod_v(block_size, 1);
+      MatrixType z(A.size1(), 1);      
+      
+      //run over A in a block-wise manner:
+      for (std::size_t j = 0; j < std::min(A.size1(), A.size2()); j += block_size)
+      {
+        
+        //determine Householder vectors:
+        for (std::size_t k = 0; k < block_size; ++k)
+        {
+          betas[j+k] = detail::setup_householder_vector_viennacl(A, v, matrix_1x1, j+k);
+          for (std::size_t l = k; l < block_size; ++l)
+            detail::householder_reflect_viennacl(A, v, matrix_1x1, betas[j+k], j+k, j+l);
+
+          detail::write_householder_to_A_viennacl(A, v, j+k);
+        }
+
+        //
+        // Setup Y:
+        //
+        Y.clear();
+        for (std::size_t k = 0; k < block_size; ++k)
+        {
+          //write Householder to Y:
+          Y(j+k,k) = 1.0;
+          project(Y, range(j+k+1, A.size1()), range(k, k+1)) = project(A, range(j+k+1, A.size1()), range(j+k, j+k+1));
         }
         
+        //
+        // Setup W:
+        //
+        
+        //first vector:
+        W.clear();
+        W(j, 0) = -betas[j];
+        //project(W, range(j+1, A.size1()), range(0, 1)) = -betas[j] * project(A, range(j+1, A.size1()), range(j, j+1));
+        project(W, range(j+1, A.size1()), range(0, 1)) = project(A, range(j+1, A.size1()), range(j, j+1));
+        project(W, range(j+1, A.size1()), range(0, 1)) *= -betas[j];
+        
         
+        //k-th column of W is given by -beta * (Id + W*Y^T) v_k, where W and Y have k-1 columns
+        for (std::size_t k = 1; k < block_size; ++k)
+        {
+          MatrixRange Y_old = project(Y, range(j, A.size1()), range(0, k));
+          MatrixRange v_k   = project(Y, range(j, A.size1()), range(k, k+1));
+          MatrixRange W_old = project(W, range(j, A.size1()), range(0, k));
+          //MatrixRange z     = project(W, range(0, A.size1()), range(k, k+1));
+         
+          //std::cout << "should: " << k << std::endl;
+          project(YT_prod_v, range(0, k), range(0,1)) = prod(trans(Y_old), v_k);
+          project(z, range(j, A.size1()), range(0,1)) = prod(W_old, project(YT_prod_v, range(0, k), range(0,1)));
+          //project(W, range(0, A.size1()), range(k, k+1)) = - betas[j+k] * (v_k + prod(W_old, YT_prod_v));
+          project(W, range(j, A.size1()), range(k, k+1)) = project(z, range(j, A.size1()), range(0,1));
+          project(W, range(j, A.size1()), range(k, k+1)) += v_k;
+          project(W, range(j, A.size1()), range(k, k+1)) *= - betas[j+k];
+        }
 
-        template<typename MatrixType>
-        std::vector<typename MatrixType::value_type> inplace_qr(MatrixType & A, std::size_t block_size = 32)
+        //
+        //apply (I+WY^T)^T = I + Y W^T to the remaining columns of A:
+        //
+        
+        if (A.size2() - j - block_size > 0)
         {
-          typedef typename MatrixType::value_type   ScalarType;
           
-          if ( A.size2() % block_size != 0 )
-            std::cout << "ViennaCL: Warning in inplace_qr(): Matrix columns are not divisible by block_size!" << std::endl;
-            
-          std::vector<ScalarType> betas(A.size2());
-          std::vector<ScalarType> v(A.size1());
-
-          //size_t block_size = 90;
-          MatrixType Y(A.size1(), block_size); Y.clear();
-          MatrixType W(A.size1(), block_size); W.clear();
-            
-          //run over A in a block-wise manner:
-          for (size_t j = 0; j < std::min(A.size1(), A.size2()); j += block_size)
-          {
-            //determine Householder vectors:
-            for (size_t k = 0; k < block_size; ++k)
-            {
-              betas[j+k] = setup_householder_vector(A, v, j+k);
-              for (size_t l = k; l < block_size; ++l)
-                householder_reflect(A, v, betas[j+k], j+k, j+l);
-
-              write_householder_to_A(A, v, j+k);
-            }
-
-            //
-            // Setup Y:
-            //
-            for (size_t k = 0; k < block_size; ++k)
-            {
-              //write Householder to Y:
-              Y(k,k) = 1.0;
-              for (size_t l=k+1; l<A.size1(); ++l)
-                Y(l,k) = A(l, j+k);
-            }
-            
-            //
-            // Setup W:
-            //
-            
-            //first vector:
-            W(j, 0) = -betas[j];
-            for (size_t l=j+1; l<A.size1(); ++l)
-              W(l,0) = -betas[j] * A(l, j);
-            
-            //k-th column of W is given by -beta * (Id + W*Y^T) v_k, where W and Y have k-1 columns
-            for (size_t k = 1; k < block_size; ++k)
-            {
-              //compute Y^T v_k:
-              std::vector<ScalarType> temp(k);  //actually of size (k \times 1)
-              for (size_t l=0; l<k; ++l)
-                for (size_t n=j; n<A.size1(); ++n)
-                  temp[l] += Y(n, l) * Y(n, k);
-                
-              //compute W * temp and add to z, which is directly written to W:
-              for (size_t n=0; n<A.size1(); ++n)
-              {
-                ScalarType val = 0;
-                for (size_t l=0; l<k; ++l)
-                  val += temp[l] * W(n, l);
-                W(n, k) = -1.0 * betas[j+k] * (Y(n, k) + val);
-              }
-            }
-
-            //
-            //apply (I+WY^T)^T = I + Y W^T to the remaining columns of A:
-            //
-            
-            if (A.size2() - j - block_size > 0)
-            {
-              //temp = prod(W^T, A)
-              
-              MatrixType temp(block_size, A.size2() - j - block_size);
-              
-              boost::numeric::ublas::range A_rows(j, A.size1());
-              boost::numeric::ublas::range A_cols(j+block_size, A.size2());
-              boost::numeric::ublas::matrix_range<MatrixType> A_part(A, A_rows, A_cols);
-
-              viennacl::matrix<ScalarType, viennacl::column_major> gpu_A_part(A_part.size1(), A_part.size2());
-              viennacl::copy(A_part, gpu_A_part);
-
-              //transfer W
-              boost::numeric::ublas::range W_cols(0, block_size);
-              boost::numeric::ublas::matrix_range<MatrixType> W_part(W, A_rows, W_cols);
-              viennacl::matrix<ScalarType, viennacl::column_major> gpu_W(W_part.size1(), W_part.size2());
-              viennacl::copy(W_part, gpu_W);
-              
-              viennacl::matrix<ScalarType, viennacl::column_major> gpu_temp(gpu_W.size2(), gpu_A_part.size2());
-              gpu_temp = viennacl::linalg::prod(trans(gpu_W), gpu_A_part);
-              
-              
-              
-              //A += Y * temp:
-              boost::numeric::ublas::range Y_cols(0, Y.size2());
-              boost::numeric::ublas::matrix_range<MatrixType> Y_part(Y, A_rows, Y_cols);
-              
-              viennacl::matrix<ScalarType, viennacl::column_major> gpu_Y(Y_part.size1(), Y_part.size2());
-              viennacl::copy(Y_part, gpu_Y);
-
-              //A_part += prod(Y_part, temp);
-              gpu_A_part += prod(gpu_Y, gpu_temp);
-              
-              MatrixType A_part_back(A_part.size1(), A_part.size2());
-              viennacl::copy(gpu_A_part, A_part_back);
-                
-              A_part = A_part_back;
-              //A_part += prod(Y_part, temp);
-            }
-          }
+          MatrixRange A_part(A, range(j, A.size1()), range(j+block_size, A.size2()));
+          MatrixRange W_part(W, range(j, A.size1()), range(0, block_size));
+          MatrixType temp = prod(trans(W_part), A_part);
           
-          return betas;
+          A_part += prod(project(Y, range(j, A.size1()), range(0, Y.size2())),
+                         temp);
         }
+      }
+
+      return betas;
+    }
+
+
+
+
+
 
+    //MatrixType is ViennaCL-matrix
+    /** @brief Implementation of a hybrid QR factorization using uBLAS on the CPU and ViennaCL for GPUs (or multi-core CPU) 
+     * 
+     * Prefer the use of the convenience interface inplace_qr()
+     * 
+     * @param A            A dense ViennaCL matrix to be factored
+     * @param block_size   The block size to be used. The number of columns of A must be a multiple of block_size
+     */
+    template<typename MatrixType>
+    std::vector< typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type > 
+    inplace_qr_hybrid(MatrixType & A, std::size_t block_size = 16)
+    {
+      typedef typename viennacl::result_of::cpu_value_type< typename MatrixType::value_type >::type   ScalarType;
 
-        template<typename MatrixType>
-        std::vector<typename MatrixType::value_type> inplace_qr_ublas(MatrixType & A)
+      typedef viennacl::matrix_range<MatrixType>                    VCLMatrixRange;
+      typedef boost::numeric::ublas::matrix<ScalarType>             UblasMatrixType;
+      typedef boost::numeric::ublas::matrix_range<UblasMatrixType>  UblasMatrixRange;
+      
+      //using boost::numeric::ublas::range;
+      //using boost::numeric::ublas::project;
+      
+      std::vector<ScalarType> betas(A.size2());
+      UblasMatrixType v(A.size1(), 1);
+      UblasMatrixType matrix_1x1(1,1);
+
+      UblasMatrixType ublasW(A.size1(), block_size); ublasW.clear(); ublasW.resize(A.size1(), block_size);
+      UblasMatrixType ublasY(A.size1(), block_size); ublasY.clear(); ublasY.resize(A.size1(), block_size);
+      
+      UblasMatrixType ublasA(A.size1(), A.size1());
+      
+      MatrixType vclW(ublasW.size1(), ublasW.size2());
+      MatrixType vclY(ublasY.size1(), ublasY.size2());
+      
+        
+      //run over A in a block-wise manner:
+      for (std::size_t j = 0; j < std::min(A.size1(), A.size2()); j += block_size)
+      {
+        UblasMatrixRange ublasA_part = boost::numeric::ublas::project(ublasA,
+                                                                      boost::numeric::ublas::range(0, A.size1()),
+                                                                      boost::numeric::ublas::range(j, j+block_size));
+        viennacl::copy(viennacl::project(A,
+                                         viennacl::range(0, A.size1()),
+                                         viennacl::range(j, j+block_size)),
+                       ublasA_part
+                      );
+        
+        //determine Householder vectors:
+        for (std::size_t k = 0; k < block_size; ++k)
         {
-          typedef typename MatrixType::value_type   ScalarType;
-          
-          std::vector<ScalarType> betas(A.size2());
-          std::vector<ScalarType> v(A.size1());
-
-          size_t block_size = 3;
-          MatrixType Y(A.size1(), block_size); Y.clear();
-          MatrixType W(A.size1(), block_size); W.clear();
-            
-          //run over A in a block-wise manner:
-          for (size_t j = 0; j < std::min(A.size1(), A.size2()); j += block_size)
-          {
-            //determine Householder vectors:
-            for (size_t k = 0; k < block_size; ++k)
-            {
-              betas[j+k] = setup_householder_vector(A, v, j+k);
-              for (size_t l = k; l < block_size; ++l)
-                householder_reflect(A, v, betas[j+k], j+k, j+l);
-
-              write_householder_to_A(A, v, j+k);
-            }
-
-            //
-            // Setup Y:
-            //
-            for (size_t k = 0; k < block_size; ++k)
-            {
-              //write Householder to Y:
-              Y(k,k) = 1.0;
-              for (size_t l=k+1; l<A.size1(); ++l)
-                Y(l,k) = A(l, j+k);
-            }
-            
-            //
-            // Setup W:
-            //
-            
-            //first vector:
-            W(j, 0) = -betas[j];
-            for (size_t l=j+1; l<A.size1(); ++l)
-              W(l,0) = -betas[j] * A(l, j);
-            
-            //k-th column of W is given by -beta * (Id + W*Y^T) v_k, where W and Y have k-1 columns
-            for (size_t k = 1; k < block_size; ++k)
-            {
-              //compute Y^T v_k:
-              std::vector<ScalarType> temp(k);  //actually of size (k \times 1)
-              for (size_t l=0; l<k; ++l)
-                for (size_t n=j; n<A.size1(); ++n)
-                  temp[l] += Y(n, l) * Y(n, k);
-                
-              //compute W * temp and add to z, which is directly written to W:
-              for (size_t n=0; n<A.size1(); ++n)
-              {
-                ScalarType val = 0;
-                for (size_t l=0; l<k; ++l)
-                  val += temp[l] * W(n, l);
-                W(n, k) = -1.0 * betas[j+k] * (Y(n, k) + val);
-              }
-            }
-
-            //
-            //apply (I+WY^T)^T = I + Y W^T to the remaining columns of A:
-            //
-            
-            if (A.size2() - j - block_size > 0)
-            {
-              //temp = prod(W^T, A)
-              MatrixType temp(block_size, A.size2() - j - block_size);
-              
-              boost::numeric::ublas::range A_rows(j, A.size1());
-              boost::numeric::ublas::range A_cols(j+block_size, A.size2());
-              boost::numeric::ublas::matrix_range<MatrixType> A_part(A, A_rows, A_cols);
-
-              boost::numeric::ublas::range W_cols(0, block_size);
-              boost::numeric::ublas::matrix_range<MatrixType> W_part(W, A_rows, W_cols);
-              
-              temp = boost::numeric::ublas::prod(trans(W_part), A_part);
-              
-              
-              //A += Y * temp:
-              boost::numeric::ublas::range Y_cols(0, Y.size2());
-              boost::numeric::ublas::matrix_range<MatrixType> Y_part(Y, A_rows, Y_cols);
-              
-              A_part += prod(Y_part, temp);
-            }
-          }
+          betas[j+k] = detail::setup_householder_vector_ublas(ublasA, v, matrix_1x1, j+k);
           
-          return betas;
+          for (std::size_t l = k; l < block_size; ++l)
+            detail::householder_reflect_ublas(ublasA, v, matrix_1x1, betas[j+k], j+k, j+l);
+
+          detail::write_householder_to_A_ublas(ublasA, v, j+k);
         }
 
+        //
+        // Setup Y:
+        //
+        ublasY.clear();  ublasY.resize(A.size1(), block_size);
+        for (std::size_t k = 0; k < block_size; ++k)
+        {
+          //write Householder to Y:
+          ublasY(j+k,k) = 1.0;
+          boost::numeric::ublas::project(ublasY, 
+                                         boost::numeric::ublas::range(j+k+1, A.size1()), 
+                                         boost::numeric::ublas::range(k, k+1)) 
+            = boost::numeric::ublas::project(ublasA, 
+                                             boost::numeric::ublas::range(j+k+1, A.size1()),
+                                             boost::numeric::ublas::range(j+k, j+k+1));
+        }
+        
+        //
+        // Setup W:
+        //
+        
+        //first vector:
+        ublasW.clear();  ublasW.resize(A.size1(), block_size);
+        ublasW(j, 0) = -betas[j];
+        boost::numeric::ublas::project(ublasW, 
+                                       boost::numeric::ublas::range(j+1, A.size1()), 
+                                       boost::numeric::ublas::range(0, 1)) 
+           = -betas[j] * boost::numeric::ublas::project(ublasA, 
+                                                        boost::numeric::ublas::range(j+1, A.size1()), 
+                                                        boost::numeric::ublas::range(j, j+1));
+        
+        
+        //k-th column of W is given by -beta * (Id + W*Y^T) v_k, where W and Y have k-1 columns
+        for (std::size_t k = 1; k < block_size; ++k)
+        {
+          UblasMatrixRange Y_old = boost::numeric::ublas::project(ublasY,
+                                                                  boost::numeric::ublas::range(j, A.size1()),
+                                                                  boost::numeric::ublas::range(0, k));
+          UblasMatrixRange v_k   = boost::numeric::ublas::project(ublasY,
+                                                                  boost::numeric::ublas::range(j, A.size1()),
+                                                                  boost::numeric::ublas::range(k, k+1));
+          UblasMatrixRange W_old = boost::numeric::ublas::project(ublasW, 
+                                                                  boost::numeric::ublas::range(j, A.size1()), 
+                                                                  boost::numeric::ublas::range(0, k));
+          UblasMatrixRange z     = boost::numeric::ublas::project(ublasW, 
+                                                                  boost::numeric::ublas::range(j, A.size1()), 
+                                                                  boost::numeric::ublas::range(k, k+1));
+          
+          UblasMatrixType YT_prod_v = boost::numeric::ublas::prod(boost::numeric::ublas::trans(Y_old), v_k);
+          z = - betas[j+k] * (v_k + prod(W_old, YT_prod_v));
+        }
+        
+        
 
-        template<typename MatrixType>
-        std::vector<typename MatrixType::value_type> inplace_qr_pure(MatrixType & A)
+        //
+        //apply (I+WY^T)^T = I + Y W^T to the remaining columns of A:
+        //
+        
+        VCLMatrixRange A_part = viennacl::project(A,
+                                                  viennacl::range(0, A.size1()),
+                                                  viennacl::range(j, j+block_size));
+        
+        viennacl::copy(boost::numeric::ublas::project(ublasA,
+                                                      boost::numeric::ublas::range(0, A.size1()),
+                                                      boost::numeric::ublas::range(j, j+block_size)),
+                       A_part);
+        
+        viennacl::copy(ublasW, vclW);
+        viennacl::copy(ublasY, vclY);
+        
+        if (A.size2() - j - block_size > 0)
         {
-          typedef typename MatrixType::value_type   ScalarType;
           
-          std::vector<ScalarType> betas(A.size2());
-          std::vector<ScalarType> v(A.size1());
-
-          size_t block_size = 5;
-          MatrixType Y(A.size1(), block_size); Y.clear();
-          MatrixType W(A.size1(), block_size); W.clear();
-            
-          //run over A in a block-wise manner:
-          for (size_t j = 0; j < std::min(A.size1(), A.size2()); j += block_size)
-          {
-            //determine Householder vectors:
-            for (size_t k = 0; k < block_size; ++k)
-            {
-              betas[j+k] = setup_householder_vector(A, v, j+k);
-              for (size_t l = k; l < block_size; ++l)
-                householder_reflect(A, v, betas[j+k], j+k, j+l);
-
-              write_householder_to_A(A, v, j+k);
-            }
-
-            //
-            // Setup Y:
-            //
-            for (size_t k = 0; k < block_size; ++k)
-            {
-              //write Householder to Y:
-              Y(k,k) = 1.0;
-              for (size_t l=k+1; l<A.size1(); ++l)
-                Y(l,k) = A(l, j+k);
-            }
-            
-            //
-            // Setup W:
-            //
-            
-            //first vector:
-            W(j, 0) = -betas[j];
-            for (size_t l=j+1; l<A.size1(); ++l)
-              W(l,0) = -betas[j] * A(l, j);
-            
-            //k-th column of W is given by -beta * (Id + W*Y^T) v_k, where W and Y have k-1 columns
-            for (size_t k = 1; k < block_size; ++k)
-            {
-              //compute Y^T v_k:
-              std::vector<ScalarType> temp(k);  //actually of size (k \times 1)
-              for (size_t l=0; l<k; ++l)
-                for (size_t n=j; n<A.size1(); ++n)
-                  temp[l] += Y(n, l) * Y(n, k);
-                
-              //compute W * temp and add to z, which is directly written to W:
-              for (size_t n=0; n<A.size1(); ++n)
-              {
-                ScalarType val = 0;
-                for (size_t l=0; l<k; ++l)
-                  val += temp[l] * W(n, l);
-                W(n, k) = -1.0 * betas[j+k] * (Y(n, k) + val);
-              }
-            }
-
-            //
-            //apply (I+WY^T)^T = I + Y W^T to the remaining columns of A:
-            //
-            
-            if (A.size2() - j - block_size > 0)
-            {
-              //temp = prod(W^T, A)
-              MatrixType temp(block_size, A.size2() - j - block_size);
-              ScalarType entry = 0;
-              for (size_t l = 0; l < temp.size2(); ++l)
-              {
-                for (size_t k = 0; k < temp.size1(); ++k)
-                {
-                  entry = 0;
-                  for (size_t n = j; n < A.size1(); ++n)
-                    entry += W(n, k) * A(n, j + block_size + l);
-                  temp(k,l) = entry;
-                }
-              }
-              
-              //A += Y * temp:
-              for (size_t l = j+block_size; l < A.size2(); ++l)
-              {
-                for (size_t k = j; k<A.size1(); ++k)
-                {
-                  ScalarType val = 0;
-                  for (size_t n=0; n<block_size; ++n)
-                    val += Y(k, n) * temp(n, l-j-block_size);
-                  A(k, l) += val;
-                }
-              }
-            }
-          }
+          VCLMatrixRange A_part(A, range(j, A.size1()), range(j+block_size, A.size2()));
+          VCLMatrixRange W_part(vclW, range(j, A.size1()), range(0, block_size));
+          MatrixType temp = viennacl::linalg::prod(trans(W_part), A_part);
           
-          return betas;
+          A_part += prod(viennacl::project(vclY, 
+                                           viennacl::range(j, A.size1()), 
+                                           viennacl::range(0, vclY.size2())),
+                         temp);
         }
+      }
+
+      return betas;
+    }
+
+
+
+    /** @brief Overload of inplace-QR factorization of a ViennaCL matrix A 
+     * 
+     * @param A            A dense ViennaCL matrix to be factored
+     * @param block_size   The block size to be used. The number of columns of A must be a multiple of block_size
+     */
+    template<typename T, typename F, unsigned int ALIGNMENT>
+    std::vector<T> inplace_qr(viennacl::matrix<T, F, ALIGNMENT> & A, std::size_t block_size = 16)
+    {
+      if (A.size2() % block_size != 0)
+        std::cerr << "ViennaCL: Warning in inplace_qr(): Number of columns is not a multiple of the block size" << std::endl;
+      
+      return inplace_qr_hybrid(A, block_size);
+    }
+
+    /** @brief Overload of inplace-QR factorization for a general Boost.uBLAS compatible matrix A 
+     * 
+     * @param A            A dense matrix compatible with Boost.uBLAS
+     * @param block_size   The block size to be used. The number of columns of A must be a multiple of block_size
+     */
+    template<typename MatrixType>
+    std::vector<typename MatrixType::value_type> inplace_qr(MatrixType & A, std::size_t block_size = 16)
+    {
+      if (A.size2() % block_size != 0)
+        std::cerr << "ViennaCL: Warning in inplace_qr(): Number of columns is not a multiple of the block size" << std::endl;
+      
+      return inplace_qr_ublas(A, block_size);
+    }
+
+
         
-    } //linalg
+  } //linalg
 } //viennacl
 
 
diff --git a/viennacl/linalg/row_scaling.hpp b/viennacl/linalg/row_scaling.hpp
index 7e199ad..45d045d 100644
--- a/viennacl/linalg/row_scaling.hpp
+++ b/viennacl/linalg/row_scaling.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_ROW_SCALING_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/linalg/spai.hpp b/viennacl/linalg/spai.hpp
index 6b07964..e10fcd9 100644
--- a/viennacl/linalg/spai.hpp
+++ b/viennacl/linalg/spai.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_SPAI_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/linalg/toeplitz_matrix_operations.hpp b/viennacl/linalg/toeplitz_matrix_operations.hpp
index 5e1fbf9..098ebbc 100644
--- a/viennacl/linalg/toeplitz_matrix_operations.hpp
+++ b/viennacl/linalg/toeplitz_matrix_operations.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_TOEPLITZ_MATRIX_OPERATIONS_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -122,7 +122,7 @@ namespace viennacl
                                                                                           viennacl::op_prod> & proxy) 
     {
       // check for the special case x = A * x
-      if (proxy.rhs().handle() == this->handle())
+      if (proxy.rhs().handle().get() == this->handle().get())
       {
         viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
         viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
diff --git a/viennacl/linalg/vandermonde_matrix_operations.hpp b/viennacl/linalg/vandermonde_matrix_operations.hpp
index 01dcb68..80002d5 100644
--- a/viennacl/linalg/vandermonde_matrix_operations.hpp
+++ b/viennacl/linalg/vandermonde_matrix_operations.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_LINALG_VANDERMONDE_MATRIX_OPERATIONS_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -118,7 +118,7 @@ namespace viennacl
                                                                                           viennacl::op_prod> & proxy) 
     {
       // check for the special case x = A * x
-      if (proxy.rhs().handle() == this->handle())
+      if (proxy.rhs().handle().get() == this->handle().get())
       {
         viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
         viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
diff --git a/viennacl/linalg/vector_operations.hpp b/viennacl/linalg/vector_operations.hpp
index bc09f19..2cceee3 100644
--- a/viennacl/linalg/vector_operations.hpp
+++ b/viennacl/linalg/vector_operations.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_VECTOR_OPERATIONS_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -953,7 +953,7 @@ namespace viennacl
       //read value:
       cl_uint result;
       cl_int err;
-      err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), h, CL_TRUE, 0, sizeof(cl_uint), &result, 0, NULL, NULL);
+      err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), h.get(), CL_TRUE, 0, sizeof(cl_uint), &result, 0, NULL, NULL);
       VIENNACL_ERR_CHECK(err);
       return result;
     }
diff --git a/viennacl/matrix.hpp b/viennacl/matrix.hpp
index 10194eb..f7f9a4c 100644
--- a/viennacl/matrix.hpp
+++ b/viennacl/matrix.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_MATRIX_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -30,6 +30,7 @@
 #include "viennacl/tools/matrix_size_deducer.hpp"
 #include "viennacl/tools/matrix_kernel_class_deducer.hpp"
 #include "viennacl/meta/result_of.hpp"
+#include "viennacl/meta/enable_if.hpp"
 
 namespace viennacl
 {
@@ -87,8 +88,6 @@ namespace viennacl
     template <typename LHS, typename RHS, typename OP>
     class matrix_expression
     {
-        
-      
       public:
         ///** @brief Extracts the vector type from the two operands.
         //*/
@@ -160,7 +159,7 @@ namespace viennacl
     template <class SCALARTYPE, typename F, unsigned int ALIGNMENT>
     class matrix
     {
-      
+      typedef matrix<SCALARTYPE, F, ALIGNMENT>          self_type;
     public:
       
       typedef matrix_iterator<row_iteration, matrix<SCALARTYPE, F, ALIGNMENT> >   iterator1;
@@ -207,6 +206,25 @@ namespace viennacl
         *this = proxy;
       }
 
+      matrix(matrix_range<self_type> const & proxy) : rows_(proxy.size1()), columns_(proxy.size2())
+      {
+        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
+        KernelClass::init();
+        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
+        
+        *this = proxy;
+      }
+
+      matrix(matrix_range<const self_type> const & proxy) : rows_(proxy.size1()), columns_(proxy.size2())
+      {
+        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
+        KernelClass::init();
+        elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
+        
+        *this = proxy;
+      }
+
+
 
       //copy constructor:
       matrix(const matrix<SCALARTYPE, F, ALIGNMENT> & mat) :
@@ -214,7 +232,7 @@ namespace viennacl
         elements_(viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size()))
       {
         cl_int err;
-        err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle(), mat.handle(), handle(), 0, 0, sizeof(SCALARTYPE)*internal_size(), 0, NULL, NULL);
+        err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), mat.handle().get(), elements_.get(), 0, 0, sizeof(SCALARTYPE)*internal_size(), 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
       }
 
@@ -222,7 +240,7 @@ namespace viennacl
       {
         resize(mat.size1(), mat.size2(), false);
         cl_int err;
-        err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle(), mat.handle(), handle(), 0, 0, sizeof(SCALARTYPE)*internal_size(), 0, NULL, NULL);
+        err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), mat.handle().get(), elements_.get(), 0, 0, sizeof(SCALARTYPE)*internal_size(), 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
         return *this;
       }
@@ -231,7 +249,7 @@ namespace viennacl
                                                                             const matrix<SCALARTYPE, F, ALIGNMENT>,
                                                                             op_trans> & proxy)
       {
-        assert(handle() != proxy.lhs().handle() && "Self-assignment of matrix transpose not implemented");
+        assert(elements_.get() != proxy.lhs().handle().get() && "Self-assignment of matrix transpose not implemented");
         assert(proxy.lhs().size1() == size2() && "Matrix dimensions do not match!");
         assert(proxy.lhs().size2() == size1() && "Matrix dimensions do not match!");
 
@@ -239,8 +257,8 @@ namespace viennacl
         
         std::vector<SCALARTYPE> temp(proxy.lhs().internal_size());
         
-        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(),
-                                         proxy.lhs().handle(), CL_TRUE, 0,
+        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
+                                         proxy.lhs().handle().get(), CL_TRUE, 0,
                                          sizeof(SCALARTYPE)*proxy.lhs().internal_size(),
                                          &(temp[0]), 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
@@ -275,6 +293,34 @@ namespace viennacl
       }
 
 
+      matrix<SCALARTYPE, F, ALIGNMENT> & operator=(const matrix_range<self_type> & mat)
+      {
+        resize(mat.size1(), mat.size2(), false);
+        
+        // clear matrix:
+        clear();
+        
+        // use inplace_add:
+        viennacl::linalg::inplace_add(*this, mat);
+        
+        return *this;
+      }
+
+      matrix<SCALARTYPE, F, ALIGNMENT> & operator=(const matrix_range<const self_type> & mat)
+      {
+        resize(mat.size1(), mat.size2(), false);
+        
+        // clear matrix:
+        clear();
+        
+        // use inplace_add:
+        viennacl::linalg::inplace_add(*this, mat);
+        
+        return *this;
+      }
+
+
+
 
       /** @brief Resizes the matrix.
       *   Existing entries can be preserved, but 
@@ -290,8 +336,8 @@ namespace viennacl
         {
           //get old entries:
           std::vector< SCALARTYPE > old_entries(internal_size());
-          cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), //src
-                                           handle(), //dest
+          cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), //src
+                                           elements_.get(), //dest
                                            CL_TRUE, //blocking
                                            0, //offset
                                            sizeof(SCALARTYPE)*internal_size(), //size
@@ -345,9 +391,9 @@ namespace viennacl
       {
         scalar<SCALARTYPE> tmp;
         cl_int err;
-        err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle(),
-                                  elements_,
-                                  tmp.handle(),
+        err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(),
+                                  elements_.get(),
+                                  tmp.handle().get(),
                                   sizeof(SCALARTYPE) * F::mem_index(row_index, col_index, internal_size1(), internal_size2()),
                                   0,
                                   sizeof(SCALARTYPE),
@@ -476,13 +522,29 @@ namespace viennacl
       }
 
       //this = A + B
-      matrix<SCALARTYPE, F, ALIGNMENT> & operator = (const matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                               const matrix<SCALARTYPE, F, ALIGNMENT>,
-                                                                               op_add > & proxy) 
+      template <typename T1, typename T2>
+      matrix<SCALARTYPE, F, ALIGNMENT> &
+      operator = (const matrix_expression< const T1,
+                                           const T2,
+                                           op_add > & proxy) 
+      {
+        viennacl::linalg::add(proxy.lhs(), proxy.rhs(), *this);
+        return *this;
+      }
+      
+      //this = A - B
+      template <typename T1, typename T2>
+      matrix<SCALARTYPE, F, ALIGNMENT> &
+      operator = (const matrix_expression< const T1,
+                                           const T2,
+                                           op_sub > & proxy) 
       {
         viennacl::linalg::add(proxy.lhs(), proxy.rhs(), *this);
         return *this;
       }
+      
+      
+      
 
       //this = A - B
       matrix<SCALARTYPE, F, ALIGNMENT> & operator = (const matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
@@ -502,12 +564,15 @@ namespace viennacl
       /** @brief Resets all entries to zero */
       void clear()
       {
-        std::size_t internal_size = internal_size1() * internal_size2();
-        
         typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;
         
         viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "clear");
-        viennacl::ocl::enqueue(k(elements_, static_cast<cl_uint>(internal_size)));
+        viennacl::ocl::enqueue(k(elements_,
+                                 cl_uint(0), cl_uint(0),
+                                 cl_uint(size1()), cl_uint(size2()),
+                                 cl_uint(internal_size1()), cl_uint(internal_size2())
+                                )
+                              );
       }
       
       
@@ -598,7 +663,7 @@ namespace viennacl
       
       std::vector<SCALARTYPE> tmp(gpu_matrix.internal_size());
       cl_int err;
-      err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle(), CL_TRUE, 0, sizeof(SCALARTYPE) * gpu_matrix.internal_size(), &tmp[0], 0, NULL, NULL);
+      err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle().get(), CL_TRUE, 0, sizeof(SCALARTYPE) * gpu_matrix.internal_size(), &tmp[0], 0, NULL, NULL);
       VIENNACL_ERR_CHECK(err);
       viennacl::ocl::get_queue().finish();
       
@@ -874,7 +939,7 @@ namespace viennacl
       if ( (gpu_matrix.size1() > 0) && (gpu_matrix.size2() > 0) )
       {
         std::vector<SCALARTYPE> temp_buffer(gpu_matrix.internal_size());
-        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle(), CL_TRUE, 0, sizeof(SCALARTYPE)*gpu_matrix.internal_size(), &(temp_buffer[0]), 0, NULL, NULL);
+        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle().get(), CL_TRUE, 0, sizeof(SCALARTYPE)*gpu_matrix.internal_size(), &(temp_buffer[0]), 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
         
         //now copy entries to cpu_matrix:
@@ -900,7 +965,7 @@ namespace viennacl
          && (cpu_matrix.size() >= gpu_matrix.size1()) && (cpu_matrix[0].size() >= gpu_matrix.size2()))
       {
         std::vector<SCALARTYPE> temp_buffer(gpu_matrix.internal_size());
-        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), gpu_matrix.handle(), CL_TRUE, 0, sizeof(SCALARTYPE)*gpu_matrix.internal_size(), &(temp_buffer[0]), 0, NULL, NULL);
+        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), gpu_matrix.handle().get(), CL_TRUE, 0, sizeof(SCALARTYPE)*gpu_matrix.internal_size(), &(temp_buffer[0]), 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
         
         //now copy entries to cpu_matrix:
@@ -920,8 +985,8 @@ namespace viennacl
     void fast_copy(const matrix<SCALARTYPE, F, ALIGNMENT> & gpu_matrix,
                    SCALARTYPE * cpu_matrix_begin)
     {
-      cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(),
-                                       gpu_matrix.handle(), 
+      cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
+                                       gpu_matrix.handle().get(), 
                                        CL_TRUE, 0,
                                        sizeof(SCALARTYPE)*gpu_matrix.internal_size(),
                                        cpu_matrix_begin, 0, NULL, NULL);
diff --git a/viennacl/matrix_proxy.hpp b/viennacl/matrix_proxy.hpp
index 32354b8..d1ebc72 100644
--- a/viennacl/matrix_proxy.hpp
+++ b/viennacl/matrix_proxy.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_MATRIX_PROXY_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -34,6 +34,7 @@ namespace viennacl
   {
     public:
       typedef typename MatrixType::value_type     value_type;
+      typedef typename viennacl::result_of::cpu_value_type<value_type>::type    cpu_value_type;
       typedef range::size_type                    size_type;
       typedef range::difference_type              difference_type;
       typedef value_type                          reference;
@@ -41,7 +42,7 @@ namespace viennacl
       
       matrix_range(MatrixType & A, 
                    range const & row_range,
-                   range const & col_range) : A_(A), row_range_(row_range), col_range_(col_range) {}
+                   range const & col_range) : A_(&A), row_range_(row_range), col_range_(col_range) {}
                    
       size_type start1() const { return row_range_.start(); }
       size_type size1() const { return row_range_.size(); }
@@ -49,6 +50,74 @@ namespace viennacl
       size_type start2() const { return col_range_.start(); }
       size_type size2() const { return col_range_.size(); }
       
+      ////////// operator= //////////////////////////
+      
+      /** @brief Copy-constructor: Writes the entries from the matrix_range to the wrapped matrix.
+       * 
+       * Note: A generic overload of operator=() is insufficient, because then the compiler generates the copy-CTOR!
+       * 
+       * @param other    The submatrix to be assigned
+       */
+      matrix_range<MatrixType> & operator = (const matrix_range<MatrixType> & other) 
+      {
+        assert(size1() == other.size1());
+        assert(size2() == other.size2());
+
+        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< MatrixType >::ResultType    KernelClass;
+        
+        std::size_t block_size = 16;
+        
+        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "assign");
+        k.global_work_size(0, block_size*block_size);
+        k.global_work_size(1, block_size*block_size);
+        k.local_work_size(0, block_size);
+        k.local_work_size(1, block_size);
+        
+        viennacl::ocl::enqueue(k(viennacl::traits::handle(*A_),
+                                        cl_uint(start1()),             cl_uint(start2()), 
+                                        cl_uint(size1()),              cl_uint(size2()),
+                                        cl_uint(A_->internal_size1()), cl_uint(A_->internal_size2()),
+                                viennacl::traits::handle(other), 
+                                        cl_uint(viennacl::traits::start1(other)),            cl_uint(viennacl::traits::start2(other)), 
+                                        cl_uint(viennacl::traits::size1(other)),             cl_uint(viennacl::traits::size2(other)),
+                                        cl_uint(viennacl::traits::internal_size1(other)),    cl_uint(viennacl::traits::internal_size2(other))
+                                )
+                              );
+
+        return *this;
+      }
+
+      template <typename MatrixType2>
+      matrix_range<MatrixType> & operator = (const MatrixType2 & other) 
+      {
+        assert(size1() == other.size1());
+        assert(size2() == other.size2());
+
+        typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< MatrixType >::ResultType    KernelClass;
+        
+        std::size_t block_size = 16;
+        
+        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "assign");
+        k.global_work_size(0, block_size*block_size);
+        k.global_work_size(1, block_size*block_size);
+        k.local_work_size(0, block_size);
+        k.local_work_size(1, block_size);
+        
+        viennacl::ocl::enqueue(k(viennacl::traits::handle(*A_),
+                                        cl_uint(start1()),             cl_uint(start2()), 
+                                        cl_uint(size1()),              cl_uint(size2()),
+                                        cl_uint(A_->internal_size1()), cl_uint(A_->internal_size2()),
+                                viennacl::traits::handle(other), 
+                                        cl_uint(viennacl::traits::start1(other)),            cl_uint(viennacl::traits::start2(other)), 
+                                        cl_uint(viennacl::traits::size1(other)),             cl_uint(viennacl::traits::size2(other)),
+                                        cl_uint(viennacl::traits::internal_size1(other)),    cl_uint(viennacl::traits::internal_size2(other))
+                                )
+                              );
+
+        return *this;
+      }
+
+      
       template <typename MatrixType1, typename MatrixType2>
       matrix_range<MatrixType> & operator = (const matrix_expression< MatrixType1,
                                                                       MatrixType2,
@@ -58,7 +127,29 @@ namespace viennacl
         return *this;
       }
       
-      
+      template <typename MatrixType1, typename MatrixType2>
+      matrix_range<MatrixType> & 
+      operator = (const matrix_expression< MatrixType1,
+                                           MatrixType2,
+                                           op_add > & proxy) 
+      {
+        viennacl::linalg::add(proxy.lhs(), proxy.rhs(), *this);
+        return *this;
+      }
+
+      template <typename MatrixType1, typename MatrixType2>
+      matrix_range<MatrixType> & 
+      operator = (const matrix_expression< MatrixType1,
+                                           MatrixType2,
+                                           op_sub > & proxy) 
+      {
+        viennacl::linalg::sub(proxy.lhs(), proxy.rhs(), *this);
+        return *this;
+      }
+
+
+      ////////// operator+= //////////////////////////
+
       matrix_range<MatrixType> & operator += (matrix_range<MatrixType> const & other)
       {
         viennacl::linalg::inplace_add(*this, other);
@@ -70,36 +161,94 @@ namespace viennacl
                                                                        MatrixType2,
                                                                        op_prod > & proxy)
       {
-        MatrixType1 temp = proxy;
-        viennacl::range r1(0, temp.size1());
-        viennacl::range r2(0, temp.size2());
-        viennacl::matrix_range<MatrixType> temp2(temp, r1, r2);
-        viennacl::linalg::inplace_add(*this, temp2);
+        MatrixType temp = proxy;
+        viennacl::linalg::inplace_add(*this, temp);
+        return *this;
+      }
+      
+      
+      ////////// operator-= //////////////////////////
+      matrix_range<MatrixType> & operator -= (matrix_range<MatrixType> const & other)
+      {
+        viennacl::linalg::inplace_sub(*this, other);
         return *this;
       }
       
       template <typename MatrixType1, typename MatrixType2>
-      matrix_range<MatrixType> & operator += (const matrix_expression< const matrix_range<MatrixType1>,
-                                                                       const matrix_range<MatrixType2>,
+      matrix_range<MatrixType> & operator -= (const matrix_expression< MatrixType1,
+                                                                       MatrixType2,
                                                                        op_prod > & proxy)
       {
-        MatrixType1 temp(proxy.size1(), proxy.size2());
-        viennacl::range r1(0, temp.size1());
-        viennacl::range r2(0, temp.size2());
-        viennacl::matrix_range<MatrixType> temp2(temp, r1, r2);
-        temp2 = proxy;
-        viennacl::linalg::inplace_add(*this, temp2);
+        MatrixType temp = proxy;
+        viennacl::linalg::inplace_sub(*this, temp);
         return *this;
       }
 
+
+      ////////// operator*= //////////////////////////
+
+      template <typename T>
+      matrix_range<MatrixType> & operator *= (T const & val)
+      {
+        viennacl::linalg::inplace_mult(*this, val);
+        return *this;
+      }
+      
+      ////////// operator/= //////////////////////////
+
+      template <typename T>
+      matrix_range<MatrixType> & operator /= (T const & val)
+      {
+        viennacl::linalg::inplace_divide(*this, val);
+        return *this;
+      }
+
+      matrix_range<MatrixType> & operator /= (cpu_value_type val)
+      {
+        viennacl::linalg::inplace_mult(*this, cpu_value_type(1.0) / val);
+        return *this;
+      }
+
+
+      ////////// operator+ //////////////////////////
+      
+      template <typename MatrixType2>
+      typename viennacl::enable_if< viennacl::is_matrix<MatrixType2>::value,
+                                    matrix_expression< const matrix_range<MatrixType>,
+                                                       const MatrixType2,
+                                                       op_add > >::type
+      operator + (const MatrixType2 & other) 
+      {
+        return matrix_expression< const matrix_range<MatrixType>,
+                                  const MatrixType2,
+                                  op_add > (*this, other);
+      }
+      
+      ////////// operator- //////////////////////////
+      
+      template <typename MatrixType2>
+      typename viennacl::enable_if< viennacl::is_matrix<MatrixType2>::value,
+                                    matrix_expression< const matrix_range<MatrixType>,
+                                                       const MatrixType2,
+                                                       op_sub > >::type
+      operator - (const MatrixType2 & other) 
+      {
+        return matrix_expression< const matrix_range<MatrixType>,
+                                  const MatrixType2,
+                                  op_sub > (*this, other);
+      }
+      
+      
+      
+
       //const_reference operator()(size_type i, size_type j) const { return A_(start1() + i, start2() + i); }
       //reference operator()(size_type i, size_type j) { return A_(start1() + i, start2() + i); }
 
-      MatrixType & get() { return A_; }
-      const MatrixType & get() const { return A_; }
+      MatrixType & get() { return *A_; }
+      const MatrixType & get() const { return *A_; }
 
     private:
-      MatrixType & A_;
+      MatrixType * A_;
       range row_range_;
       range col_range_;
   };
@@ -143,8 +292,8 @@ namespace viennacl
          
          size_t start_offset = (gpu_matrix_range.start1() + i) * gpu_matrix_range.get().internal_size2() + gpu_matrix_range.start2();
          size_t num_entries = gpu_matrix_range.size2();
-         cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle(),
-                                          gpu_matrix_range.get().handle(), CL_TRUE, 
+         cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
+                                          gpu_matrix_range.get().handle().get(), CL_TRUE, 
                                           sizeof(SCALARTYPE)*start_offset,
                                           sizeof(SCALARTYPE)*num_entries,
                                           &(entries[0]), 0, NULL, NULL);
@@ -165,8 +314,8 @@ namespace viennacl
        size_t start_offset = gpu_matrix_range.start1() * gpu_matrix_range.get().internal_size2();
        size_t num_entries = gpu_matrix_range.size1() * gpu_matrix_range.size2();
        //std::cout << "start_offset: " << start_offset << std::endl;
-       cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle(),
-                                         gpu_matrix_range.get().handle(), CL_TRUE, 
+       cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
+                                         gpu_matrix_range.get().handle().get(), CL_TRUE, 
                                          sizeof(SCALARTYPE)*start_offset,
                                          sizeof(SCALARTYPE)*num_entries,
                                          &(entries[0]), 0, NULL, NULL);
@@ -195,8 +344,8 @@ namespace viennacl
          
          size_t start_offset = (gpu_matrix_range.start2() + j) * gpu_matrix_range.get().internal_size1() + gpu_matrix_range.start1();
          size_t num_entries = gpu_matrix_range.size1();
-         cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle(),
-                                          gpu_matrix_range.get().handle(), CL_TRUE, 
+         cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
+                                          gpu_matrix_range.get().handle().get(), CL_TRUE, 
                                           sizeof(SCALARTYPE)*start_offset,
                                           sizeof(SCALARTYPE)*num_entries,
                                           &(entries[0]), 0, NULL, NULL);
@@ -217,8 +366,8 @@ namespace viennacl
        size_t start_offset = gpu_matrix_range.start2() * gpu_matrix_range.get().internal_size1();
        size_t num_entries = gpu_matrix_range.size1() * gpu_matrix_range.size2();
        //std::cout << "start_offset: " << start_offset << std::endl;
-       cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle(),
-                                         gpu_matrix_range.get().handle(), CL_TRUE, 
+       cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
+                                         gpu_matrix_range.get().handle().get(), CL_TRUE, 
                                          sizeof(SCALARTYPE)*start_offset,
                                          sizeof(SCALARTYPE)*num_entries,
                                          &(entries[0]), 0, NULL, NULL);
@@ -251,8 +400,8 @@ namespace viennacl
        {
          size_t start_offset = (gpu_matrix_range.start1() + i) * gpu_matrix_range.get().internal_size2() + gpu_matrix_range.start2();
          size_t num_entries = gpu_matrix_range.size2();
-         cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(),
-                                          gpu_matrix_range.get().handle(), CL_TRUE, 
+         cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
+                                          gpu_matrix_range.get().handle().get(), CL_TRUE, 
                                           sizeof(SCALARTYPE)*start_offset,
                                           sizeof(SCALARTYPE)*num_entries,
                                           &(entries[0]), 0, NULL, NULL);
@@ -272,8 +421,8 @@ namespace viennacl
        size_t start_offset = gpu_matrix_range.start1() * gpu_matrix_range.get().internal_size2();
        size_t num_entries = gpu_matrix_range.size1() * gpu_matrix_range.size2();
        //std::cout << "start_offset: " << start_offset << std::endl;
-       cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(),
-                                         gpu_matrix_range.get().handle(), CL_TRUE, 
+       cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
+                                         gpu_matrix_range.get().handle().get(), CL_TRUE, 
                                          sizeof(SCALARTYPE)*start_offset,
                                          sizeof(SCALARTYPE)*num_entries,
                                          &(entries[0]), 0, NULL, NULL);
@@ -305,8 +454,8 @@ namespace viennacl
        {
          size_t start_offset = (gpu_matrix_range.start2() + j) * gpu_matrix_range.get().internal_size1() + gpu_matrix_range.start1();
          size_t num_entries = gpu_matrix_range.size1();
-         cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(),
-                                          gpu_matrix_range.get().handle(), CL_TRUE, 
+         cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
+                                          gpu_matrix_range.get().handle().get(), CL_TRUE, 
                                           sizeof(SCALARTYPE)*start_offset,
                                           sizeof(SCALARTYPE)*num_entries,
                                           &(entries[0]), 0, NULL, NULL);
@@ -326,8 +475,8 @@ namespace viennacl
        size_t start_offset = gpu_matrix_range.start2() * gpu_matrix_range.get().internal_size1();
        size_t num_entries = gpu_matrix_range.size1() * gpu_matrix_range.size2();
        //std::cout << "start_offset: " << start_offset << std::endl;
-       cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(),
-                                         gpu_matrix_range.get().handle(), CL_TRUE, 
+       cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
+                                         gpu_matrix_range.get().handle().get(), CL_TRUE, 
                                          sizeof(SCALARTYPE)*start_offset,
                                          sizeof(SCALARTYPE)*num_entries,
                                          &(entries[0]), 0, NULL, NULL);
@@ -342,17 +491,43 @@ namespace viennacl
   }
 
 
-/*
   template<typename MatrixType>
   std::ostream & operator<<(std::ostream & s, matrix_range<MatrixType> const & proxy)
   {
-    MatrixType temp(proxy.size1(), proxy.size2());
-    viennacl::range r1(0, proxy.size1());
-    viennacl::range r2(0, proxy.size2());
-    matrix_range<MatrixType> temp2(temp, r1, r2);
-    viennacl::copy(proxy, temp2);
+    MatrixType temp = proxy;
+    s << temp;
+    return s;
+  }
+
+  template<typename MatrixType>
+  std::ostream & operator<<(std::ostream & s, matrix_range<const MatrixType> const & proxy)
+  {
+    MatrixType temp = proxy;
     s << temp;
     return s;
+  }
+
+
+  //
+  // Convenience function
+  //
+  template <typename MatrixType>
+  matrix_range<MatrixType> project(MatrixType & A, viennacl::range const & r1, viennacl::range const & r2)
+  {
+    return matrix_range<MatrixType>(A, r1, r2);
+  }
+
+  /*template <typename MatrixType>
+  matrix_range<MatrixType> project(MatrixType const & A, viennacl::range const & r1, viennacl::range const & r2)
+  {
+    return matrix_range<MatrixType>(A, r1, r2);
+  }*/
+
+  //TODO: Think about const-matrix...
+  /*template <typename MatrixType>
+  matrix_range<const MatrixType> project(MatrixType const & A, viennacl::range const & r1, viennacl::range const & r2)
+  {
+    return matrix_range<MatrixType>(A, r1, r2);
   }*/
 
 
diff --git a/viennacl/meta/enable_if.hpp b/viennacl/meta/enable_if.hpp
index 90d3457..b301f9d 100644
--- a/viennacl/meta/enable_if.hpp
+++ b/viennacl/meta/enable_if.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_META_ENABLE_IF_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/meta/predicate.hpp b/viennacl/meta/predicate.hpp
index 77b947b..88029b0 100644
--- a/viennacl/meta/predicate.hpp
+++ b/viennacl/meta/predicate.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_META_PREDICATE_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/meta/result_of.hpp b/viennacl/meta/result_of.hpp
index 1c3d658..e73a5ab 100644
--- a/viennacl/meta/result_of.hpp
+++ b/viennacl/meta/result_of.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_META_RESULT_OF_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/meta/tag_of.hpp b/viennacl/meta/tag_of.hpp
index 6f6c51d..85e833c 100644
--- a/viennacl/meta/tag_of.hpp
+++ b/viennacl/meta/tag_of.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_META_TAGOF_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/misc/bandwidth_reduction.hpp b/viennacl/misc/bandwidth_reduction.hpp
index fa9691d..be237b8 100644
--- a/viennacl/misc/bandwidth_reduction.hpp
+++ b/viennacl/misc/bandwidth_reduction.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_MISC_BANDWIDTH_REDUCTION_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/misc/cuthill_mckee.hpp b/viennacl/misc/cuthill_mckee.hpp
index 01a88bf..ef9555d 100644
--- a/viennacl/misc/cuthill_mckee.hpp
+++ b/viennacl/misc/cuthill_mckee.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_MISC_CUTHILL_MCKEE_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/misc/gibbs_poole_stockmeyer.hpp b/viennacl/misc/gibbs_poole_stockmeyer.hpp
index 6af4eb3..763f445 100644
--- a/viennacl/misc/gibbs_poole_stockmeyer.hpp
+++ b/viennacl/misc/gibbs_poole_stockmeyer.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_MISC_GIBBS_POOLE_STOCKMEYER_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/ocl/backend.hpp b/viennacl/ocl/backend.hpp
index 199ab8c..36bab81 100644
--- a/viennacl/ocl/backend.hpp
+++ b/viennacl/ocl/backend.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_OCL_BACKEND_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/ocl/command_queue.hpp b/viennacl/ocl/command_queue.hpp
index c176cb7..75519a2 100644
--- a/viennacl/ocl/command_queue.hpp
+++ b/viennacl/ocl/command_queue.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_OCL_COMMAND_QUEUE_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -64,13 +64,13 @@ namespace viennacl
         /** @brief Waits until all kernels in the queue have finished their execution */
         void finish() const
         {
-          clFinish(handle_);
+          clFinish(handle_.get());
         }
         
         /** @brief Waits until all kernels in the queue have started their execution */
         void flush() const
         {
-          clFlush(handle_);
+          clFlush(handle_.get());
         }
 
         viennacl::ocl::handle<cl_command_queue> const & handle() const { return handle_; }
diff --git a/viennacl/ocl/context.hpp b/viennacl/ocl/context.hpp
index e2355fc..28fe598 100644
--- a/viennacl/ocl/context.hpp
+++ b/viennacl/ocl/context.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_OCL_CONTEXT_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -123,7 +123,7 @@ namespace viennacl
           #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT)
           std::cout << "ViennaCL: Adding new device to context " << h_ << std::endl;
           #endif
-          if (std::find(devices_.begin(), devices_.end(), d) != devices_.end())
+          if (std::find(devices_.begin(), devices_.end(), d) == devices_.end())
             devices_.push_back(d);
         }
 
@@ -173,7 +173,7 @@ namespace viennacl
           if (ptr)
             flags |= CL_MEM_COPY_HOST_PTR;
           cl_int err;
-          viennacl::ocl::handle<cl_mem> mem = clCreateBuffer(handle(), flags, size, ptr, &err);
+          viennacl::ocl::handle<cl_mem> mem = clCreateBuffer(h_.get(), flags, size, ptr, &err);
           VIENNACL_ERR_CHECK(err);
           return mem;
         }
@@ -207,7 +207,7 @@ namespace viennacl
           std::cout << "ViennaCL: Adding new queue for device " << dev << " to context " << h_ << std::endl;
           #endif
           cl_int err;
-          viennacl::ocl::handle<cl_command_queue> temp = clCreateCommandQueue(handle(), dev, 0, &err);
+          viennacl::ocl::handle<cl_command_queue> temp = clCreateCommandQueue(h_.get(), dev, 0, &err);
           VIENNACL_ERR_CHECK(err);
           
           queues_[dev].push_back(viennacl::ocl::command_queue(temp, dev));
@@ -263,10 +263,10 @@ namespace viennacl
           std::cout << "ViennaCL: Adding program '" << prog_name << "' to context " << h_ << std::endl;
           #endif
           
-          viennacl::ocl::handle<cl_program> temp = clCreateProgramWithSource(h_, 1, (const char **)&source_text, &source_size, &err);
+          viennacl::ocl::handle<cl_program> temp = clCreateProgramWithSource(h_.get(), 1, (const char **)&source_text, &source_size, &err);
           VIENNACL_ERR_CHECK(err);
           
-          err = clBuildProgram(temp, 0, NULL, NULL, NULL, NULL);
+          err = clBuildProgram(temp.get(), 0, NULL, NULL, NULL, NULL);
           #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_BUILD)
             char buffer[1024];
             cl_build_status status;
@@ -320,7 +320,7 @@ namespace viennacl
         /** @brief Less-than comparable for compatibility with std:map  */
         bool operator<(context const & other) const
         {
-          return h_ < other.h_;
+          return h_.get() < other.h_.get();
         }
         
       private:
@@ -356,7 +356,7 @@ namespace viennacl
               switch (device_type_)
               {
                 case CL_DEVICE_TYPE_CPU:          std::cout << "CPU"; break;
-                case CL_DEVICE_TYPE_GPU:          std::cout << "CPU"; break;
+                case CL_DEVICE_TYPE_GPU:          std::cout << "GPU"; break;
                 case CL_DEVICE_TYPE_ACCELERATOR:  std::cout << "ACCELERATOR"; break;
                 case CL_DEVICE_TYPE_DEFAULT:      std::cout << "DEFAULT"; break;
                 default:
@@ -405,7 +405,7 @@ namespace viennacl
             //Note: The obvious
             //  err = clGetContextInfo(h_, CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL);
             //does not work with NVIDIA OpenCL stack!
-            err = clGetContextInfo(h_, CL_CONTEXT_DEVICES, VIENNACL_OCL_MAX_DEVICE_NUM * sizeof(cl_device_id), NULL, &temp);
+            err = clGetContextInfo(h_.get(), CL_CONTEXT_DEVICES, VIENNACL_OCL_MAX_DEVICE_NUM * sizeof(cl_device_id), NULL, &temp);
             VIENNACL_ERR_CHECK(err);
             assert(temp > 0 && "ViennaCL: FATAL error: Provided context does not contain any devices!");
             num_devices = temp / sizeof(cl_device_id);
@@ -415,7 +415,7 @@ namespace viennacl
             #endif
             
             std::vector<cl_device_id> device_ids(num_devices);
-            err = clGetContextInfo(h_, CL_CONTEXT_DEVICES, num_devices * sizeof(cl_device_id), &(device_ids[0]), NULL);
+            err = clGetContextInfo(h_.get(), CL_CONTEXT_DEVICES, num_devices * sizeof(cl_device_id), &(device_ids[0]), NULL);
             VIENNACL_ERR_CHECK(err);
             
             for (size_t i=0; i<num_devices; ++i)
diff --git a/viennacl/ocl/device.hpp b/viennacl/ocl/device.hpp
index 73d6263..8614b99 100644
--- a/viennacl/ocl/device.hpp
+++ b/viennacl/ocl/device.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_OCL_DEVICE_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/ocl/enqueue.hpp b/viennacl/ocl/enqueue.hpp
index 3e75c61..1b001b2 100644
--- a/viennacl/ocl/enqueue.hpp
+++ b/viennacl/ocl/enqueue.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_OCL_ENQUEUE_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -53,9 +53,9 @@ namespace viennacl
         
         cl_int err;
         if (tmp_global == 1 && tmp_local == 1)
-          err = clEnqueueTask(queue.handle(), k.handle(), 0, NULL, NULL);
+          err = clEnqueueTask(queue.handle().get(), k.handle().get(), 0, NULL, NULL);
         else
-          err = clEnqueueNDRangeKernel(queue.handle(), k.handle(), 1, NULL, &tmp_global, &tmp_local, 0, NULL, NULL);
+          err = clEnqueueNDRangeKernel(queue.handle().get(), k.handle().get(), 1, NULL, &tmp_global, &tmp_local, 0, NULL, NULL);
 
         if (err != CL_SUCCESS)  //if not successful, try to start with smaller work size
         {
@@ -75,7 +75,7 @@ namespace viennacl
             #endif
             
             queue.finish();
-            err = clEnqueueNDRangeKernel(queue.handle(), k.handle(), 1, NULL, &tmp_global, &tmp_local, 0, NULL, NULL);
+            err = clEnqueueNDRangeKernel(queue.handle().get(), k.handle().get(), 1, NULL, &tmp_global, &tmp_local, 0, NULL, NULL);
           }
           
           if (err != CL_SUCCESS)
@@ -112,7 +112,7 @@ namespace viennacl
         tmp_local[0] = k.local_work_size(0);
         tmp_local[1] = k.local_work_size(1);
         
-        cl_int err = clEnqueueNDRangeKernel(queue.handle(), k.handle(), 2, NULL, tmp_global, tmp_local, 0, NULL, NULL);
+        cl_int err = clEnqueueNDRangeKernel(queue.handle().get(), k.handle().get(), 2, NULL, tmp_global, tmp_local, 0, NULL, NULL);
 
         if (err != CL_SUCCESS)
         {
diff --git a/viennacl/ocl/error.hpp b/viennacl/ocl/error.hpp
index 5a767c9..988e083 100644
--- a/viennacl/ocl/error.hpp
+++ b/viennacl/ocl/error.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_OCL_ERROR_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/ocl/forwards.h b/viennacl/ocl/forwards.h
index b57c49d..b74c3be 100644
--- a/viennacl/ocl/forwards.h
+++ b/viennacl/ocl/forwards.h
@@ -2,7 +2,7 @@
 #define VIENNACL_OCL_FORWARDS_H_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/ocl/handle.hpp b/viennacl/ocl/handle.hpp
index 30ccaed..07e1038 100644
--- a/viennacl/ocl/handle.hpp
+++ b/viennacl/ocl/handle.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_OCL_HANDLE_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -146,41 +146,47 @@ namespace viennacl
     class handle
     {
     public:
-      handle() : something(0) {}
-      handle(const OCL_TYPE & _something) : something(_something) {}
-      handle(const handle & h) : something(h.something) { if (something != 0) inc(); }
-      ~handle() { if (something != 0) dec(); }
-      handle & operator=(const handle & h)
+      handle() : h_(0) {}
+      handle(const OCL_TYPE & _something) : h_(_something) {}
+      handle(const handle & other) : h_(other.h_) { if (h_ != 0) inc(); }
+      ~handle() { if (h_ != 0) dec(); }
+      handle & operator=(const handle & other)
       {
-        if (something != 0) dec();
-        something = h.something;
+        if (h_ != 0) 
+          dec();
+        h_ = other.h_;
         inc();
         return *this;
       }
       handle & operator=(const OCL_TYPE & _something)
       {
-        if (something != 0) dec();
-        something = _something;
+        if (h_ != 0) dec();
+        h_ = _something;
         return *this;
       }
-      operator OCL_TYPE() const { return something; }
-      //const OCL_TYPE & get() const { return something; }
+      
+      /** @brief Implicit conversion to the plain OpenCL handle. DEPRECATED and will be removed some time in the future. */
+      operator OCL_TYPE() const { return h_; }
+      
+      const OCL_TYPE & get() const { return h_; }
+      
+      
       
       /** @brief Swaps the OpenCL handle of two handle objects */
       handle & swap(handle & other)
       {
-        OCL_TYPE tmp = other.something;
-        other.something = this->something;
-        this->something = tmp;
+        OCL_TYPE tmp = other.h_;
+        other.h_ = this->h_;
+        this->h_ = tmp;
         return *this;
       }
       
       /** @brief Manually increment the OpenCL reference count. Typically called automatically, but is necessary if user-supplied memory objects are wrapped. */
-      void inc() { handle_inc_dec_helper<OCL_TYPE>::inc(something); };
+      void inc() { handle_inc_dec_helper<OCL_TYPE>::inc(h_); };
       /** @brief Manually decrement the OpenCL reference count. Typically called automatically, but might be useful with user-supplied memory objects.  */
-      void dec() { handle_inc_dec_helper<OCL_TYPE>::dec(something); };
+      void dec() { handle_inc_dec_helper<OCL_TYPE>::dec(h_); };
     private:
-      OCL_TYPE something;
+      OCL_TYPE h_;
     };
 
     
diff --git a/viennacl/ocl/kernel.hpp b/viennacl/ocl/kernel.hpp
index 97811cd..2cd386b 100644
--- a/viennacl/ocl/kernel.hpp
+++ b/viennacl/ocl/kernel.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_OCL_KERNEL_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -101,7 +101,7 @@ namespace viennacl
         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
         std::cout << "ViennaCL: Setting unsigned long kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
         #endif
-        cl_int err = clSetKernelArg(handle_, pos, sizeof(cl_uint), (void*)&val);
+        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_uint), (void*)&val);
         VIENNACL_ERR_CHECK(err);
       }
 
@@ -112,7 +112,7 @@ namespace viennacl
         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
         std::cout << "ViennaCL: Setting floating point kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
         #endif
-        cl_int err = clSetKernelArg(handle_, pos, sizeof(float), (void*)&val);
+        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(float), (void*)&val);
         VIENNACL_ERR_CHECK(err);
       }
 
@@ -123,7 +123,7 @@ namespace viennacl
         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
         std::cout << "ViennaCL: Setting double precision kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
         #endif
-        cl_int err = clSetKernelArg(handle_, pos, sizeof(double), (void*)&val);
+        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(double), (void*)&val);
         VIENNACL_ERR_CHECK(err);
       }
 
@@ -136,8 +136,8 @@ namespace viennacl
         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
         std::cout << "ViennaCL: Setting generic kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
         #endif
-        cl_mem temp = val.handle();
-        cl_int err = clSetKernelArg(handle_, pos, sizeof(cl_mem), (void*)&temp);
+        cl_mem temp = val.handle().get();
+        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_mem), (void*)&temp);
         VIENNACL_ERR_CHECK(err);
       }
       
@@ -151,8 +151,8 @@ namespace viennacl
         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
         std::cout << "ViennaCL: Setting handle kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
         #endif
-        CL_TYPE temp = h;
-        cl_int err = clSetKernelArg(handle_, pos, sizeof(CL_TYPE), (void*)&temp);
+        CL_TYPE temp = h.get();
+        cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(CL_TYPE), (void*)&temp);
         VIENNACL_ERR_CHECK(err);
       }
       
@@ -166,7 +166,7 @@ namespace viennacl
         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
         std::cout << "ViennaCL: Setting local memory kernel argument at pos " << pos << " for kernel " << name_ << std::endl;
         #endif
-        cl_int err = clSetKernelArg(handle_, pos, size, 0);
+        cl_int err = clSetKernelArg(handle_.get(), pos, size, 0);
         VIENNACL_ERR_CHECK(err);
       }
       
@@ -504,7 +504,7 @@ namespace viennacl
         #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL)
         std::cout << "ViennaCL: Building kernel " << name_ << std::endl;
         #endif
-        handle_ = clCreateKernel(program_, name_.c_str(), &err);
+        handle_ = clCreateKernel(program_.get(), name_.c_str(), &err);
         
         if (err != CL_SUCCESS)
         {
diff --git a/viennacl/ocl/local_mem.hpp b/viennacl/ocl/local_mem.hpp
index 24b58f0..11ba2df 100644
--- a/viennacl/ocl/local_mem.hpp
+++ b/viennacl/ocl/local_mem.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_OCL_LOCAL_MEM_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/ocl/platform.hpp b/viennacl/ocl/platform.hpp
index e954ee9..663c2d8 100644
--- a/viennacl/ocl/platform.hpp
+++ b/viennacl/ocl/platform.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_OCL_PLATFORM_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/ocl/program.hpp b/viennacl/ocl/program.hpp
index e978c20..a30e5de 100644
--- a/viennacl/ocl/program.hpp
+++ b/viennacl/ocl/program.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_OCL_PROGRAM_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/ocl/utils.hpp b/viennacl/ocl/utils.hpp
index 2769a55..92e572b 100644
--- a/viennacl/ocl/utils.hpp
+++ b/viennacl/ocl/utils.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_OCL_UTILS_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/range.hpp b/viennacl/range.hpp
index 9197c83..88b13d8 100644
--- a/viennacl/range.hpp
+++ b/viennacl/range.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_RANGE_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/scalar.hpp b/viennacl/scalar.hpp
index 303fdb5..195db29 100644
--- a/viennacl/scalar.hpp
+++ b/viennacl/scalar.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_SCALAR_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -111,7 +111,7 @@ namespace viennacl
       scalar(const scalar & other) : val_(viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(TYPE)))
       {
         //copy value:
-        cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle(), other.handle(), handle(), 0, 0, sizeof(TYPE), 0, NULL, NULL);
+        cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), other.handle().get(), val_.get(), 0, 0, sizeof(TYPE), 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
       }
 
@@ -120,7 +120,7 @@ namespace viennacl
       {
         TYPE tmp;
         cl_int err;
-        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), val_, CL_TRUE, 0, sizeof(TYPE), &tmp, 0, NULL, NULL);
+        err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), val_.get(), CL_TRUE, 0, sizeof(TYPE), &tmp, 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
         return tmp;
       } 
@@ -129,7 +129,7 @@ namespace viennacl
       scalar<TYPE> & operator= (entry_proxy<TYPE> const & other)
       {
         //copy value:
-        cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle(), other.handle(), handle(), other.index() * sizeof(TYPE), 0, sizeof(TYPE), 0, NULL, NULL);
+        cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), other.handle().get(), val_.get(), other.index() * sizeof(TYPE), 0, sizeof(TYPE), 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
         return *this;
       }
@@ -138,7 +138,7 @@ namespace viennacl
       scalar<TYPE> & operator= (scalar<TYPE> const & other)
       {
         //copy value:
-        cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle(), other.handle(), handle(), 0, 0, sizeof(TYPE), 0, NULL, NULL);
+        cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), other.handle().get(), val_.get(), 0, 0, sizeof(TYPE), 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
         
         return *this;
@@ -147,8 +147,8 @@ namespace viennacl
       scalar<TYPE> & operator= (float cpu_other)
       {
         //copy value:
-        TYPE other = cpu_other;
-        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle(), handle(), CL_TRUE, 0, sizeof(TYPE), &other, 0, NULL, NULL);
+        TYPE other = static_cast<TYPE>(cpu_other);
+        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(), val_.get(), CL_TRUE, 0, sizeof(TYPE), &other, 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
         
         return *this;
@@ -157,8 +157,8 @@ namespace viennacl
       scalar<TYPE> & operator= (double cpu_other)
       {
         //copy value:
-        TYPE other = cpu_other;
-        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle(), handle(), CL_TRUE, 0, sizeof(TYPE), &other, 0, NULL, NULL);
+        TYPE other = static_cast<TYPE>(cpu_other);
+        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(), val_.get(), CL_TRUE, 0, sizeof(TYPE), &other, 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
         
         return *this;
@@ -167,8 +167,8 @@ namespace viennacl
       scalar<TYPE> & operator= (long cpu_other)
       {
         //copy value:
-        TYPE other = cpu_other;
-        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle(), handle(), CL_TRUE, 0, sizeof(TYPE), &other, 0, NULL, NULL);
+        TYPE other = static_cast<TYPE>(cpu_other);
+        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(), val_.get(), CL_TRUE, 0, sizeof(TYPE), &other, 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
         
         return *this;
@@ -177,8 +177,8 @@ namespace viennacl
       scalar<TYPE> & operator= (unsigned long cpu_other)
       {
         //copy value:
-        TYPE other = cpu_other;
-        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle(), handle(), CL_TRUE, 0, sizeof(TYPE), &other, 0, NULL, NULL);
+        TYPE other = static_cast<TYPE>(cpu_other);
+        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(), val_.get(), CL_TRUE, 0, sizeof(TYPE), &other, 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
         
         return *this;
@@ -187,8 +187,8 @@ namespace viennacl
       scalar<TYPE> & operator= (int cpu_other)
       {
         //copy value:
-        TYPE other = cpu_other;
-        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle(), handle(), CL_TRUE, 0, sizeof(TYPE), &other, 0, NULL, NULL);
+        TYPE other = static_cast<TYPE>(cpu_other);
+        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(), val_.get(), CL_TRUE, 0, sizeof(TYPE), &other, 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
         
         return *this;
@@ -197,8 +197,8 @@ namespace viennacl
       scalar<TYPE> & operator= (unsigned int cpu_other)
       {
         //copy value:
-        TYPE other = cpu_other;
-        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle(), handle(), CL_TRUE, 0, sizeof(TYPE), &other, 0, NULL, NULL);
+        TYPE other = static_cast<TYPE>(cpu_other);
+        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(), val_.get(), CL_TRUE, 0, sizeof(TYPE), &other, 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
         
         return *this;
diff --git a/viennacl/toeplitz_matrix.hpp b/viennacl/toeplitz_matrix.hpp
index 1f67fe3..e50b1b9 100644
--- a/viennacl/toeplitz_matrix.hpp
+++ b/viennacl/toeplitz_matrix.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_TOEPLITZ_MATRIX_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/tools/adapter.hpp b/viennacl/tools/adapter.hpp
index c544a24..99467c7 100644
--- a/viennacl/tools/adapter.hpp
+++ b/viennacl/tools/adapter.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_TOOLS_ADAPTER_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -54,7 +54,7 @@ namespace viennacl
         typedef std::size_t   size_type;
         
         const_sparse_matrix_adapted_iterator(std::vector<std::map<unsigned int, SCALARTYPE> > const & mat, int i, int j)
-         : _mat(mat), _i(i), _j(j)
+         : mat_(mat), i_(i), j_(j)
         {
           if (i < 0) //reverse iterator end
           {
@@ -68,18 +68,18 @@ namespace viennacl
             }
             else //_j is valid
             {
-              if (_i < _mat.size() && _mat[i].size() > 0 )
+              if (i_ < mat_.size() && mat_[i].size() > 0 )
               {
                 //TODO: Start at entry j, not at the beginning
-                if (static_cast<int>(_mat[i].rbegin()->first) < j)
-                  iter2 = _mat[i].end();
+                if (static_cast<int>(mat_[i].rbegin()->first) < j)
+                  iter2 = mat_[i].end();
                 else
-                  iter2 = _mat[i].begin();
+                  iter2 = mat_[i].begin();
               }
-              else if (_i < _mat.size() && _mat[i].size() == 0)
-                iter2 = _mat[i].end();
+              else if (i_ < mat_.size() && mat_[i].size() == 0)
+                iter2 = mat_[i].end();
               else //i is out of range -> end iterator requested
-                iter2 = _mat.back().end(); //forward iterator end
+                iter2 = mat_.back().end(); //forward iterator end
             }
           }
         }
@@ -90,9 +90,9 @@ namespace viennacl
           {
             typedef typename std::map<unsigned int, SCALARTYPE>::const_iterator  col_iterator;
             
-            col_iterator colit = _mat[_i].find(_j);
+            col_iterator colit = mat_[i_].find(j_);
 
-            if (colit != _mat[_i].end())
+            if (colit != mat_[i_].end())
               return colit->second;
             return 0.0;
           }
@@ -105,9 +105,9 @@ namespace viennacl
           if (is_iterator1)
           {
             if (is_forward)
-              ++_i;
+              ++i_;
             else
-              --_i;
+              --i_;
           }
           else
             ++iter2;
@@ -120,9 +120,9 @@ namespace viennacl
           if (is_iterator1)
           {
             if (is_forward)
-              _i += offset;
+              i_ += offset;
             else
-              _i -= offset;
+              i_ -= offset;
           }
           else
           {
@@ -135,13 +135,13 @@ namespace viennacl
         bool operator==(self_type const & other) const
         {
           if (is_iterator1)
-            return (_i == other._i);
+            return (i_ == other.i_);
           return (iter2 == other.iter2);
         }
         
         bool operator!=(self_type const & other) const { return !(*this == other); }
         
-        int index1() const { return _i; }
+        int index1() const { return i_; }
         int index2() const
         { 
           if (is_iterator1)
@@ -152,21 +152,21 @@ namespace viennacl
         
         const_sparse_matrix_adapted_iterator<SCALARTYPE, !is_iterator1, true> begin() const
         {
-          return const_sparse_matrix_adapted_iterator<SCALARTYPE, !is_iterator1, true>(_mat, _i, 0);
+          return const_sparse_matrix_adapted_iterator<SCALARTYPE, !is_iterator1, true>(mat_, i_, 0);
         }
         const_sparse_matrix_adapted_iterator<SCALARTYPE, !is_iterator1, true> end() const
         {
-          int end_ = static_cast<int>(_mat[_i].size());
+          int end_ = static_cast<int>(mat_[i_].size());
           if (end_ > 0)
-            end_ = _mat[_i].rbegin()->first;
-          return const_sparse_matrix_adapted_iterator<SCALARTYPE, !is_iterator1, true>(_mat, _i, end_ + 1);
+            end_ = mat_[i_].rbegin()->first;
+          return const_sparse_matrix_adapted_iterator<SCALARTYPE, !is_iterator1, true>(mat_, i_, end_ + 1);
         }
         
       private:
-        std::vector<std::map<unsigned int, SCALARTYPE> > const & _mat;
+        std::vector<std::map<unsigned int, SCALARTYPE> > const & mat_;
         typename std::map<unsigned int, SCALARTYPE>::const_iterator iter2;
-        size_type _i;
-        size_type _j;
+        size_type i_;
+        size_type j_;
     };
     
     /** @brief Adapts a constant sparse matrix type made up from std::vector<std::map<unsigned int, SCALARTYPE> > to basic ublas-compatibility.
@@ -185,34 +185,39 @@ namespace viennacl
         typedef std::size_t   size_type;
         
         const_sparse_matrix_adapter(std::vector<std::map<unsigned int, SCALARTYPE> > const & mat) 
-         : _mat(mat) {};
-        
-        size_type size1() const { return _mat.size(); }
-        size_type size2() const { return _mat.size(); }
+         : mat_(mat), size1_(mat_.size()), size2_(mat_.size()) {};
+
+        const_sparse_matrix_adapter(std::vector<std::map<unsigned int, SCALARTYPE> > const & mat, size_type num_rows, size_type num_cols) 
+         : mat_(mat), size1_(num_rows), size2_(num_cols) {};
+         
+        size_type size1() const { return size1_; }
+        size_type size2() const { return size2_; }
         //size_type size2() const { return (_mat.size() > 0) ? _mat.back().size() : 0; }
 
-        const_iterator1 begin1() const { return const_iterator1(_mat, 0, 0); }
-        const_iterator1 end1() const   { return const_iterator1(_mat, size1(), size2()); }
+        const_iterator1 begin1() const { return const_iterator1(mat_, 0, 0); }
+        const_iterator1 end1() const   { return const_iterator1(mat_, size1(), size2()); }
 
-        const_reverse_iterator1 rbegin1() const { return const_reverse_iterator1(_mat, size1() - 1, 0); }
-        const_reverse_iterator1 rend1() const   { return const_reverse_iterator1(_mat, -1, size2()); }
+        const_reverse_iterator1 rbegin1() const { return const_reverse_iterator1(mat_, size1() - 1, 0); }
+        const_reverse_iterator1 rend1() const   { return const_reverse_iterator1(mat_, -1, size2()); }
 
-        const_iterator2 begin2() const { return const_iterator2(_mat, 0, 0); }
-        const_iterator2 end2() const   { return const_iterator2(_mat, size1(), size2()); }
+        const_iterator2 begin2() const { return const_iterator2(mat_, 0, 0); }
+        const_iterator2 end2() const   { return const_iterator2(mat_, size1(), size2()); }
 
         SCALARTYPE operator()(unsigned int i, unsigned int j) const
         {
           typedef typename std::map<unsigned int, SCALARTYPE>::const_iterator  col_iterator;
           
-          col_iterator colit = _mat[i].find(j);
+          col_iterator colit = mat_[i].find(j);
 
-          if (colit != _mat[i].end())
+          if (colit != mat_[i].end())
             return colit->second;
           return 0.0;
         }
 
       private:
-        std::vector<std::map<unsigned int, SCALARTYPE> > const & _mat;
+        std::vector<std::map<unsigned int, SCALARTYPE> > const & mat_;
+        size_type size1_;
+        size_type size2_;
     };
     
     
@@ -231,9 +236,10 @@ namespace viennacl
       public:
         typedef self_type     iterator1;
         typedef self_type     iterator2;
+        typedef std::size_t   size_type;
         
         sparse_matrix_adapted_iterator(std::vector<std::map<unsigned int, SCALARTYPE> > & mat, int i, int j)
-         : _mat(mat), _i(i), _j(j)
+         : mat_(mat), i_(i), j_(j)
         {
           if (i < 0) //reverse iterator end
           {
@@ -247,18 +253,18 @@ namespace viennacl
             }
             else //_j is valid
             {
-              if (_i < _mat.size() && _mat[i].size() > 0 )
+              if (i_ < mat_.size() && mat_[i].size() > 0 )
               {
                 //TODO: Start at entry j, not at the beginning
-                if (static_cast<int>(_mat[i].rbegin()->first) < j)
-                  iter2 = _mat[i].end();
+                if (static_cast<int>(mat_[i].rbegin()->first) < j)
+                  iter2 = mat_[i].end();
                 else
-                  iter2 = _mat[i].begin();
+                  iter2 = mat_[i].begin();
               }
-              else if (_i < _mat.size() && _mat[i].size() == 0)
-                iter2 = _mat[i].end();
+              else if (i_ < mat_.size() && mat_[i].size() == 0)
+                iter2 = mat_[i].end();
               else //i is out of range -> end iterator requested
-                iter2 = _mat.back().end(); //forward iterator end
+                iter2 = mat_.back().end(); //forward iterator end
             }
           }
         }
@@ -267,7 +273,7 @@ namespace viennacl
         {
           if (is_iterator1)
           {
-            return _mat[_i][_j];
+            return mat_[i_][j_];
           }
           else
             return iter2->second;
@@ -276,20 +282,20 @@ namespace viennacl
         self_type & operator++(void)
         {
           if (is_iterator1)
-            ++_i;
+            ++i_;
           else
             ++iter2;
           return *this;
         }
         self_type & operator++(int) { self_type tmp = *this; ++(*this); return tmp; }
         
-        self_type operator+=(unsigned int offset)
+        self_type operator+=(size_type offset)
         {
           if (is_iterator1)
-            _i += offset;
+            i_ += offset;
           else
           {
-            for (unsigned int k=0; k<offset; ++k)
+            for (size_type k=0; k<offset; ++k)
               ++iter2;  //Note: User must ensure that this is always valid...
           }
           return *this;
@@ -298,12 +304,12 @@ namespace viennacl
         bool operator==(self_type const & other) const
         {
           if (is_iterator1)
-            return (_i == other._i);
+            return (i_ == other.i_);
           return (iter2 == other.iter2);
         }
         bool operator!=(self_type const & other) const { return !(*this == other); }
         
-        unsigned int index1() const { return _i; }
+        unsigned int index1() const { return i_; }
         unsigned int index2() const
         { 
           if (is_iterator1)
@@ -314,21 +320,21 @@ namespace viennacl
         
         sparse_matrix_adapted_iterator<SCALARTYPE, !is_iterator1> begin() const
         {
-          return sparse_matrix_adapted_iterator<SCALARTYPE, !is_iterator1>(_mat, _i, 0);
+          return sparse_matrix_adapted_iterator<SCALARTYPE, !is_iterator1>(mat_, i_, 0);
         }
         sparse_matrix_adapted_iterator<SCALARTYPE, !is_iterator1> end() const
         {
-          int end_ = static_cast<int>(_mat[_i].size());
+          int end_ = static_cast<int>(mat_[i_].size());
           if (end_ > 0)
-            end_ = _mat[_i].rbegin()->first;
-          return sparse_matrix_adapted_iterator<SCALARTYPE, !is_iterator1>(_mat, _i, end_ + 1);
+            end_ = mat_[i_].rbegin()->first;
+          return sparse_matrix_adapted_iterator<SCALARTYPE, !is_iterator1>(mat_, i_, end_ + 1);
         }
         
       private:
-        std::vector<std::map<unsigned int, SCALARTYPE> > & _mat;
+        std::vector<std::map<unsigned int, SCALARTYPE> > & mat_;
         typename std::map<unsigned int, SCALARTYPE>::iterator iter2;
-        unsigned int _i;
-        unsigned int _j;
+        size_type i_;
+        size_type j_;
     };
     
     
@@ -344,41 +350,52 @@ namespace viennacl
       public:
         typedef sparse_matrix_adapted_iterator<SCALARTYPE, true>      iterator1;
         typedef sparse_matrix_adapted_iterator<SCALARTYPE, false>     iterator2;
+        typedef std::size_t                                           size_type;
         
         sparse_matrix_adapter(std::vector<std::map<unsigned int, SCALARTYPE> > & mat) 
-         : BaseType(mat), _mat(mat) { };
-        
-        iterator1 begin1() { return iterator1(_mat, 0, 0); }
-        iterator1 end1() { return iterator1(_mat, _mat.size(), _mat.back().size()); }
+         : BaseType(mat), mat_(mat), size1_(mat_.size()), size2_(mat_.size()) { };
 
-        iterator2 begin2() { return iterator2(_mat, 0, 0); }
-        iterator2 end2() { return iterator2(_mat, _mat.size(), _mat.back().size()); }
+        sparse_matrix_adapter(std::vector<std::map<unsigned int, SCALARTYPE> > & mat,
+                              std::size_t num_rows,
+                              std::size_t num_cols) 
+         : BaseType(mat, num_rows, num_cols), mat_(mat), size1_(num_rows), size2_(num_cols) { };
+         
+        iterator1 begin1() { return iterator1(mat_, 0, 0); }
+        iterator1 end1() { return iterator1(mat_, mat_.size(), mat_.back().size()); }
+
+        iterator2 begin2() { return iterator2(mat_, 0, 0); }
+        iterator2 end2() { return iterator2(mat_, mat_.size(), mat_.back().size()); }
         
-        SCALARTYPE & operator()(unsigned int i, unsigned int j) { return _mat[i][j]; }
+        SCALARTYPE & operator()(size_type i, size_type j) { return mat_[i][j]; }
         
-        void resize(unsigned int i, unsigned int j, bool preserve = true)
+        void resize(size_type i, size_type j, bool preserve = true)
         {
           if (i>0)
-            _mat.resize(i);
+            mat_.resize(i);
           if (!preserve)
             clear();
+          
+          size1_ = i;
+          size2_ = j;
         }
         
         void clear()
         {
-          for (unsigned int i=0; i<_mat.size(); ++i)
-            _mat[i].clear();
+          for (size_type i=0; i<mat_.size(); ++i)
+            mat_[i].clear();
         }
         
-        size_t size1() { return _mat.size(); }
-        size_t size1() const { return _mat.size(); } //Note: Due to name hiding it is not sufficient to have it in the base class
+        size_type size1() { return size1_; }
+        size_type size1() const { return size1_; } //Note: Due to name hiding it is not sufficient to have it in the base class
         
         //assume a square matrix
-        size_t size2() { return (_mat.size() > 0) ? (_mat.back().size() > 0 ? _mat.back().size() : _mat.size()) : 0; }
-        size_t size2() const { return (_mat.size() > 0) ? (_mat.back().size() > 0 ? _mat.back().size() : _mat.size()) : 0; } //Note: Due to name hiding it is not sufficient to have it in the base class
+        size_type size2() { return size2_; }
+        size_type size2() const { return size2_; } //Note: Due to name hiding it is not sufficient to have it in the base class
         
       private:
-        std::vector<std::map<unsigned int, SCALARTYPE> > & _mat;
+        std::vector<std::map<unsigned int, SCALARTYPE> > & mat_;
+        size_type size1_;
+        size_type size2_;
     };
     
 
diff --git a/viennacl/tools/entry_proxy.hpp b/viennacl/tools/entry_proxy.hpp
index af5d7fa..f7d4543 100644
--- a/viennacl/tools/entry_proxy.hpp
+++ b/viennacl/tools/entry_proxy.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_TOOLS_ENTRY_PROXY_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -103,7 +103,7 @@ namespace viennacl
         */
         entry_proxy & operator=(scalar<SCALARTYPE> const & value)
         {
-          cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle(), value.handle(), _mem_handle, 0, sizeof(SCALARTYPE)*_index, sizeof(SCALARTYPE), 0, NULL, NULL);
+          cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), value.handle().get(), _mem_handle.get(), 0, sizeof(SCALARTYPE)*_index, sizeof(SCALARTYPE), 0, NULL, NULL);
           //assert(err == CL_SUCCESS);
           VIENNACL_ERR_CHECK(err);
           return *this;
@@ -113,9 +113,9 @@ namespace viennacl
         */
         entry_proxy &  operator=(entry_proxy const & other)
         {
-          cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle(),
-                                           other._mem_handle, //src
-                                           _mem_handle,       //dest
+          cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(),
+                                           other._mem_handle.get(), //src
+                                           _mem_handle.get(),       //dest
                                            sizeof(SCALARTYPE) * other._index, //offset src
                                            sizeof(SCALARTYPE) * _index,       //offset dest
                                            sizeof(SCALARTYPE), 0, NULL, NULL);
@@ -153,7 +153,7 @@ namespace viennacl
         {
           SCALARTYPE temp;
           cl_int err;
-          err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), _mem_handle, CL_TRUE, sizeof(SCALARTYPE)*_index, sizeof(SCALARTYPE), &temp, 0, NULL, NULL);
+          err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(), _mem_handle.get(), CL_TRUE, sizeof(SCALARTYPE)*_index, sizeof(SCALARTYPE), &temp, 0, NULL, NULL);
           //assert(err == CL_SUCCESS);
           VIENNACL_ERR_CHECK(err);
           viennacl::ocl::get_queue().finish();
@@ -165,7 +165,7 @@ namespace viennacl
         void write(SCALARTYPE value)
         {
           cl_int err;
-          err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle(), _mem_handle, CL_TRUE, sizeof(SCALARTYPE)*_index, sizeof(SCALARTYPE), &value, 0, NULL, NULL);
+          err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(), _mem_handle.get(), CL_TRUE, sizeof(SCALARTYPE)*_index, sizeof(SCALARTYPE), &value, 0, NULL, NULL);
           //assert(err == CL_SUCCESS);
           VIENNACL_ERR_CHECK(err);
         }
diff --git a/viennacl/tools/matrix_kernel_class_deducer.hpp b/viennacl/tools/matrix_kernel_class_deducer.hpp
index 0cd256d..898a779 100644
--- a/viennacl/tools/matrix_kernel_class_deducer.hpp
+++ b/viennacl/tools/matrix_kernel_class_deducer.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_TOOLS_MATRIX_KERNEL_CLASS_DEDUCER_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/tools/matrix_prod_kernel_class_deducer.hpp b/viennacl/tools/matrix_prod_kernel_class_deducer.hpp
index 9438be2..6905537 100644
--- a/viennacl/tools/matrix_prod_kernel_class_deducer.hpp
+++ b/viennacl/tools/matrix_prod_kernel_class_deducer.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_TOOLS_MATRIX_PROD_KERNEL_CLASS_DEDUCER_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -41,10 +41,52 @@ namespace viennacl
 {
   namespace tools
   {
+    namespace detail
+    {
+      template <typename MatrixType>
+      struct extract_matrix
+      {
+        typedef typename MatrixType::ERROR_UNKNOWN_MATRIX_TYPE_PROVIDED   error_type;
+      };
+      
+      template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
+      struct extract_matrix < viennacl::matrix<SCALARTYPE, F, ALIGNMENT> >
+      {
+        typedef viennacl::matrix<SCALARTYPE, F, ALIGNMENT>   type;
+      };
+
+      template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
+      struct extract_matrix < const viennacl::matrix<SCALARTYPE, F, ALIGNMENT> >
+      {
+        typedef viennacl::matrix<SCALARTYPE, F, ALIGNMENT>   type;
+      };
+
+      
+      template <typename MatrixType>
+      struct extract_matrix < viennacl::matrix_range<MatrixType> >
+      {
+        typedef typename extract_matrix<MatrixType>::type   type;
+      };
+
+      template <typename MatrixType>
+      struct extract_matrix < const viennacl::matrix_range<MatrixType> >
+      {
+        typedef typename extract_matrix<MatrixType>::type   type;
+      };
+      
+      
+    }
+    
+    
+    
     /** @brief deduces kernel type for C=A*B, where A, B, C are MatrixType1, MatrixType2 and MatrixType3 respectively */
     template <typename MatrixType1, typename MatrixType2, typename MatrixType3>
     struct MATRIX_PROD_KERNEL_CLASS_DEDUCER
-    {};
+    {
+      typedef typename MATRIX_PROD_KERNEL_CLASS_DEDUCER< typename detail::extract_matrix<MatrixType1>::type,
+                                                         typename detail::extract_matrix<MatrixType2>::type,
+                                                         typename detail::extract_matrix<MatrixType3>::type>::ResultType   ResultType;
+    };
     
     template <typename SCALARTYPE, unsigned int ALIGNMENT>
     struct MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<SCALARTYPE, viennacl::row_major, ALIGNMENT>,
diff --git a/viennacl/tools/matrix_size_deducer.hpp b/viennacl/tools/matrix_size_deducer.hpp
index b572a5a..0f6e564 100644
--- a/viennacl/tools/matrix_size_deducer.hpp
+++ b/viennacl/tools/matrix_size_deducer.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_TOOLS_MATRIX_SIZE_DEDUCER_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -69,36 +69,72 @@ namespace viennacl
       static unsigned int size(MatrixType & lhs, const viennacl::vector<ScalarType, A> & rhs) { return lhs.size1(); }
     };*/
 
-    template <typename ScalarType, typename F1, unsigned int A1, typename F2, unsigned int A2>
-    struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_expression<const viennacl::matrix<ScalarType, F1, A1>,
-                                                                 const viennacl::matrix<ScalarType, F1, A1>, op_trans>,
+    // A^T * B
+    template <typename ScalarType, typename T1, typename F2, unsigned int A2>
+    struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_expression<T1,
+                                                                 T1, op_trans>,
                                const viennacl::matrix<ScalarType, F2, A2>,
                                viennacl::op_prod>
     {
-      static size_t size1(viennacl::matrix_expression<const viennacl::matrix<ScalarType, F1, A1>,
-                                                      const viennacl::matrix<ScalarType, F1, A1>,
-                                                      op_trans> const & lhs,
-                          viennacl::matrix<ScalarType, F2, A2> const & rhs) { return lhs.lhs().size2(); }
-      static size_t size2(viennacl::matrix_expression<const viennacl::matrix<ScalarType, F1, A1>,
-                                                      const viennacl::matrix<ScalarType, F1, A1>,
-                                                      op_trans> const & lhs,
-                          viennacl::matrix<ScalarType, F2, A2> const & rhs) { return rhs.size2(); }
+      static std::size_t size1(viennacl::matrix_expression<T1,
+                                                           T1,
+                                                           op_trans> const & lhs,
+                               viennacl::matrix<ScalarType, F2, A2> const & rhs) { return lhs.lhs().size2(); }
+      static std::size_t size2(viennacl::matrix_expression<T1,
+                                                           T1,
+                                                           op_trans> const & lhs,
+                               viennacl::matrix<ScalarType, F2, A2> const & rhs) { return rhs.size2(); }
     };
+
+    template <typename T1, typename MatrixType2>
+    struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_expression<T1,
+                                                                 T1, op_trans>,
+                               const viennacl::matrix_range<MatrixType2>,
+                               viennacl::op_prod>
+    {
+      static std::size_t size1(viennacl::matrix_expression<T1,
+                                                           T1,
+                                                           op_trans> const & lhs,
+                               viennacl::matrix_range<MatrixType2> const & rhs) { return lhs.lhs().size2(); }
+      static std::size_t size2(viennacl::matrix_expression<T1,
+                                                           T1,
+                                                           op_trans> const & lhs,
+                               viennacl::matrix_range<MatrixType2> const & rhs) { return rhs.size2(); }
+    };
+    
     
-    template <typename ScalarType, typename F1, unsigned int A1, typename F2, unsigned int A2>
+    // A * B^T 
+    
+    template <typename ScalarType, typename F1, unsigned int A1, typename T2>
     struct MATRIX_SIZE_DEDUCER<const viennacl::matrix<ScalarType, F1, A1>,
-                               const viennacl::matrix_expression<const viennacl::matrix<ScalarType, F2, A2>,
-                                                                 const viennacl::matrix<ScalarType, F2, A2>, op_trans>,
+                               const viennacl::matrix_expression<T2,
+                                                                 T2, op_trans>,
+                               viennacl::op_prod>
+    {
+      static std::size_t size1(viennacl::matrix<ScalarType, F1, A1> const & lhs,
+                               viennacl::matrix_expression<T2,
+                                                           T2,
+                                                           op_trans> const & rhs) { return lhs.size1(); }
+      static std::size_t size2(viennacl::matrix<ScalarType, F1, A1> const & lhs,
+                               viennacl::matrix_expression<T2,
+                                                           T2,
+                                                           op_trans> const & rhs) { return rhs.lhs().size1(); }
+    };
+
+    template <typename MatrixType1, typename T2>
+    struct MATRIX_SIZE_DEDUCER<const viennacl::matrix_range<MatrixType1>,
+                               const viennacl::matrix_expression<T2,
+                                                                 T2, op_trans>,
                                viennacl::op_prod>
     {
-      static size_t size1(viennacl::matrix<ScalarType, F1, A1> const & lhs,
-                          viennacl::matrix_expression<const viennacl::matrix<ScalarType, F2, A2>,
-                                                      const viennacl::matrix<ScalarType, F2, A2>,
-                                                      op_trans> const & rhs) { return lhs.size1(); }
-      static size_t size2(viennacl::matrix<ScalarType, F1, A1> const & lhs,
-                          viennacl::matrix_expression<const viennacl::matrix<ScalarType, F2, A2>,
-                                                      const viennacl::matrix<ScalarType, F2, A2>,
-                                                      op_trans> const & rhs) { return rhs.lhs().size1(); }
+      static std::size_t size1(viennacl::matrix_range<MatrixType1> const & lhs,
+                               viennacl::matrix_expression<T2,
+                                                           T2,
+                                                           op_trans> const & rhs) { return lhs.size1(); }
+      static std::size_t size2(viennacl::matrix_range<MatrixType1> const & lhs,
+                               viennacl::matrix_expression<T2,
+                                                           T2,
+                                                           op_trans> const & rhs) { return rhs.lhs().size1(); }
     };
     
   }
diff --git a/viennacl/tools/matrix_solve_kernel_class_deducer.hpp b/viennacl/tools/matrix_solve_kernel_class_deducer.hpp
index 31dcc9b..d9694c2 100644
--- a/viennacl/tools/matrix_solve_kernel_class_deducer.hpp
+++ b/viennacl/tools/matrix_solve_kernel_class_deducer.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_TOOLS_MATRIX_SOLVE_KERNEL_CLASS_DEDUCER_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/tools/tools.hpp b/viennacl/tools/tools.hpp
index 43699ba..243e90c 100644
--- a/viennacl/tools/tools.hpp
+++ b/viennacl/tools/tools.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_TOOLS_TOOLS_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -147,14 +147,10 @@ namespace viennacl
     * @param platform_info   An info string that contains the OpenCL platform vendor
     * @return   The double precision kernel
     */
-    inline std::string make_double_kernel(std::string const & source, std::string platform_info)
-    //inline std::string make_double_kernel(std::string const & source)
+    inline std::string make_double_kernel(std::string const & source, std::string const & fp_extension)
     {
       std::stringstream ss;
-      if (platform_info.compare(0, 8, "Advanced") == 0)  //double precision in Stream SDK is enabled by a non-standard pragma
-        ss << "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n\n";
-      else
-        ss << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n\n";
+      ss << "#pragma OPENCL EXTENSION " << fp_extension << " : enable\n\n";
       
       std::string result = ss.str();
       result.append(strReplace(source, "float", "double"));
diff --git a/viennacl/traits/clear.hpp b/viennacl/traits/clear.hpp
index d6c3479..14f547b 100644
--- a/viennacl/traits/clear.hpp
+++ b/viennacl/traits/clear.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_TRAITS_CLEAR_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/traits/fill.hpp b/viennacl/traits/fill.hpp
index 21e6636..32afed0 100644
--- a/viennacl/traits/fill.hpp
+++ b/viennacl/traits/fill.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_TRAITS_FILL_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/traits/handle.hpp b/viennacl/traits/handle.hpp
index 1d08741..a5ea9b3 100644
--- a/viennacl/traits/handle.hpp
+++ b/viennacl/traits/handle.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_TRAITS_HANDLE_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/traits/size.hpp b/viennacl/traits/size.hpp
index b9a7da5..59b3f1f 100644
--- a/viennacl/traits/size.hpp
+++ b/viennacl/traits/size.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_TRAITS_SIZE_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/traits/start.hpp b/viennacl/traits/start.hpp
index eac447b..f2364fb 100644
--- a/viennacl/traits/start.hpp
+++ b/viennacl/traits/start.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_TRAITS_START_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/vandermonde_matrix.hpp b/viennacl/vandermonde_matrix.hpp
index 6da7c9c..d97929b 100644
--- a/viennacl/vandermonde_matrix.hpp
+++ b/viennacl/vandermonde_matrix.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_VANDERMONDE_MATRIX_HPP
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
diff --git a/viennacl/vector.hpp b/viennacl/vector.hpp
index fd5971a..8ae4981 100644
--- a/viennacl/vector.hpp
+++ b/viennacl/vector.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_VECTOR_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -229,7 +229,7 @@ namespace viennacl
         if (size_ < internal_size())
         {
           std::vector<SCALARTYPE> temp(internal_size() - size_);
-          cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle(), elements_, CL_TRUE, sizeof(SCALARTYPE)*size_, sizeof(SCALARTYPE)*(internal_size() - size_), &(temp[0]), 0, NULL, NULL);
+          cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(), elements_.get(), CL_TRUE, sizeof(SCALARTYPE)*size_, sizeof(SCALARTYPE)*(internal_size() - size_), &(temp[0]), 0, NULL, NULL);
           //assert(err == CL_SUCCESS);
           VIENNACL_ERR_CHECK(err);
         }
@@ -268,7 +268,7 @@ namespace viennacl
         {
           elements_ = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(SCALARTYPE)*internal_size());
           cl_int err;
-          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle(), vec.handle(), elements_, 0, 0, sizeof(SCALARTYPE)*internal_size(), 0, NULL, NULL);
+          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), vec.handle().get(), elements_.get(), 0, 0, sizeof(SCALARTYPE)*internal_size(), 0, NULL, NULL);
           //assert(err == CL_SUCCESS);
           VIENNACL_ERR_CHECK(err);
         }
@@ -282,7 +282,7 @@ namespace viennacl
         if (size() != 0)
         {
           cl_int err;
-          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle(), vec.handle(), elements_, 0, 0, sizeof(SCALARTYPE)*internal_size(), 0, NULL, NULL);
+          err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(), vec.handle().get(), elements_.get(), 0, 0, sizeof(SCALARTYPE)*internal_size(), 0, NULL, NULL);
           VIENNACL_ERR_CHECK(err);
         }
         return *this;
@@ -1257,8 +1257,8 @@ namespace viennacl
       if (gpu_end - gpu_begin != 0)
       {
         std::vector<SCALARTYPE> temp_buffer(gpu_end - gpu_begin);
-        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(),
-                                         gpu_begin.handle(), CL_TRUE, 0, 
+        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
+                                         gpu_begin.handle().get(), CL_TRUE, 0, 
                                          sizeof(SCALARTYPE)*(gpu_end - gpu_begin),
                                          &(temp_buffer[0]), 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
@@ -1317,8 +1317,8 @@ namespace viennacl
     {
       if (gpu_begin != gpu_end)
       {
-        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(),
-                                         gpu_begin.handle(), CL_TRUE, 0,
+        cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
+                                         gpu_begin.handle().get(), CL_TRUE, 0,
                                          sizeof(SCALARTYPE)*(gpu_end - gpu_begin),
                                          &(*cpu_begin), 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
@@ -1379,8 +1379,8 @@ namespace viennacl
         //we require that the size of the gpu_vector is larger or equal to the cpu-size
         std::vector<SCALARTYPE> temp_buffer(cpu_end - cpu_begin);
         std::copy(cpu_begin, cpu_end, temp_buffer.begin());
-        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle(),
-                                          gpu_begin.handle(), CL_TRUE, sizeof(SCALARTYPE)*gpu_begin.index(),
+        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
+                                          gpu_begin.handle().get(), CL_TRUE, sizeof(SCALARTYPE)*gpu_begin.index(),
                                           sizeof(SCALARTYPE)*(cpu_end - cpu_begin),
                                           &(temp_buffer[0]), 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
@@ -1426,8 +1426,8 @@ namespace viennacl
       if (cpu_begin != cpu_end)
       {
         //we require that the size of the gpu_vector is larger or equal to the cpu-size
-        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle(), 
-                                          gpu_begin.handle(), CL_TRUE, 0, 
+        cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(), 
+                                          gpu_begin.handle().get(), CL_TRUE, 0, 
                                           sizeof(SCALARTYPE)*(cpu_end - cpu_begin), &(*cpu_begin), 0, NULL, NULL);
         VIENNACL_ERR_CHECK(err);
       }
@@ -1486,9 +1486,9 @@ namespace viennacl
       assert(gpu_src_end - gpu_src_begin >= 0);
       if (gpu_src_begin != gpu_src_end)
       {
-        cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle(),
-                                          gpu_src_begin.handle(),  //src handle
-                                          gpu_dest_begin.handle(), //dest handle
+        cl_int err = clEnqueueCopyBuffer(viennacl::ocl::get_queue().handle().get(),
+                                          gpu_src_begin.handle().get(),  //src handle
+                                          gpu_dest_begin.handle().get(), //dest handle
                                           sizeof(SCALARTYPE) * gpu_src_begin.index(), //src offset
                                           sizeof(SCALARTYPE) * gpu_dest_begin.index(), //dest offset
                                           sizeof(SCALARTYPE) * (gpu_src_end.index() - gpu_src_begin.index()), //data length
diff --git a/viennacl/vector_proxy.hpp b/viennacl/vector_proxy.hpp
index b8ca885..d138472 100644
--- a/viennacl/vector_proxy.hpp
+++ b/viennacl/vector_proxy.hpp
@@ -2,7 +2,7 @@
 #define VIENNACL_VECTOR_PROXY_HPP_
 
 /* =========================================================================
-   Copyright (c) 2010-2011, Institute for Microelectronics,
+   Copyright (c) 2010-2012, Institute for Microelectronics,
                             Institute for Analysis and Scientific Computing,
                             TU Wien.
 
@@ -109,8 +109,8 @@ namespace viennacl
       //we require that the size of the gpu_vector is larger or equal to the cpu-size
       std::vector<SCALARTYPE> temp_buffer(cpu_vector.end() - cpu_vector.begin());
       std::copy(cpu_vector.begin(), cpu_vector.end(), temp_buffer.begin());
-      cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle(),
-                                        gpu_vector_range.get().handle(), CL_TRUE, sizeof(SCALARTYPE)*gpu_vector_range.start(),
+      cl_int err = clEnqueueWriteBuffer(viennacl::ocl::get_queue().handle().get(),
+                                        gpu_vector_range.get().handle().get(), CL_TRUE, sizeof(SCALARTYPE)*gpu_vector_range.start(),
                                         sizeof(SCALARTYPE)*temp_buffer.size(),
                                         &(temp_buffer[0]), 0, NULL, NULL);
       VIENNACL_ERR_CHECK(err);
@@ -132,8 +132,8 @@ namespace viennacl
     if (cpu_vector.end() > cpu_vector.begin())
     {
       std::vector<SCALARTYPE> temp_buffer(cpu_vector.end() - cpu_vector.begin());
-      cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(),
-                                        gpu_vector_range.get().handle(), CL_TRUE, sizeof(SCALARTYPE)*gpu_vector_range.start(), 
+      cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle().get(),
+                                        gpu_vector_range.get().handle().get(), CL_TRUE, sizeof(SCALARTYPE)*gpu_vector_range.start(), 
                                         sizeof(SCALARTYPE)*temp_buffer.size(),
                                         &(temp_buffer[0]), 0, NULL, NULL);
       VIENNACL_ERR_CHECK(err);

-- 
ViennaCL packaging



More information about the debian-science-commits mailing list