diff --git a/src/cuda/AlignedAllocation.h b/src/cuda/AlignedAllocation.h
new file mode 100644
index 0000000000000000000000000000000000000000..20e44be920e177fb6690abf31d19f9f74169a12d
--- /dev/null
+++ b/src/cuda/AlignedAllocation.h
@@ -0,0 +1,90 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file AlignedAllocation.h
+//! \ingroup cuda
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+
+#include "cuda/ErrorChecking.h"
+#include "core/debug/CheckFunctions.h"
+#include "core/debug/Debug.h"
+#include "core/logging/Logging.h"
+
+namespace walberla {
+namespace cuda {
+
+static std::map<void *, void*> freePointers_;
+
+
+inline void *allocate_aligned_with_offset( uint_t size, uint_t alignment, uint_t offset )
+{
+   // With 0 alignment this function makes no sense
+   // use normal malloc instead
+   WALBERLA_ASSERT_GREATER( alignment, 0 );
+   // Tests if alignment is power of two (assuming alignment>0)
+   WALBERLA_ASSERT( !(alignment & (alignment - 1)) );
+
+   WALBERLA_ASSERT_LESS( offset, alignment );
+
+   if( offset == 0 )
+   {
+      void * result = nullptr;
+      WALBERLA_CUDA_CHECK( cudaMalloc( &result, size ) );
+      freePointers_[result] = result;
+      return result;
+   }
+
+   void *pa;  // pointer to allocated memory
+   void *ptr; // pointer to usable aligned memory
+
+   WALBERLA_CUDA_CHECK( cudaMalloc( &pa, size + alignment ));
+   WALBERLA_CHECK_EQUAL(size_t(pa) % alignment, 0 , "CUDA malloc did not return memory with requested alignment");
+   ptr = (void *) ((char *) (pa) + alignment - offset);
+   freePointers_[ptr] = pa;
+
+   WALBERLA_ASSERT_EQUAL(((size_t) ptr + offset) % alignment, 0 );
+   return ptr;
+}
+
+
+inline void free_aligned_with_offset( void *ptr )
+{
+   // assume that pointer to real allocated chunk is stored just before
+   // chunk that was given to user
+   WALBERLA_CUDA_CHECK( cudaFree( freePointers_[ptr] ));
+   freePointers_.erase(ptr);
+}
+
+
+
+inline void *allocate_pitched_with_offset( size_t &pitchOut, size_t width, size_t height,
+                                           size_t alignment, size_t alignmentOffset )
+{
+   if( width % alignment == 0)
+      pitchOut = width;
+   else
+      pitchOut = ((width + alignment) / alignment ) * alignment;
+
+   WALBERLA_ASSERT_GREATER_EQUAL( pitchOut, width );
+   WALBERLA_ASSERT_EQUAL( pitchOut  % alignment, 0 );
+
+   return allocate_aligned_with_offset( pitchOut * height, alignment, alignmentOffset );
+}
+
+} // namespace cuda
+} // namespace walberla
+
diff --git a/src/cuda/GPUField.impl.h b/src/cuda/GPUField.impl.h
index e5b49c7d7bf537ce9723071601886fb39537f141..702af9b667c439dcd2cff4e074348222844aec19 100644
--- a/src/cuda/GPUField.impl.h
+++ b/src/cuda/GPUField.impl.h
@@ -21,7 +21,7 @@
 
 #include "GPUField.h"
 #include "ErrorChecking.h"
-
+#include "AlignedAllocation.h"
 #include "core/logging/Logging.h"
 
 namespace walberla {
@@ -51,7 +51,12 @@ GPUField<T>::GPUField( uint_t _xSize, uint_t _ySize, uint_t _zSize, uint_t _fSiz
 
    if ( usePitchedMem_ )
    {
-      WALBERLA_CUDA_CHECK ( cudaMalloc3D ( &pitchedPtr_, extent ) );
+      size_t pitch;
+      const size_t alignment = 256;
+      void * mem = allocate_pitched_with_offset( pitch, extent.width, extent.height * extent.depth, alignment,
+                                                 sizeof(T) * nrOfGhostLayers_ );
+      WALBERLA_ASSERT_EQUAL( size_t((char*)(mem) + sizeof(T) * nrOfGhostLayers_ ) % alignment, 0 );
+      pitchedPtr_ = make_cudaPitchedPtr( mem, pitch, extent.width, extent.height );
    }
    else
    {
@@ -79,9 +84,15 @@ GPUField<T>::GPUField( uint_t _xSize, uint_t _ySize, uint_t _zSize, uint_t _fSiz
 template<typename T>
 GPUField<T>::~GPUField()
 {
-   cudaFree( pitchedPtr_.ptr );
+   if( usePitchedMem_ )
+      free_aligned_with_offset(pitchedPtr_.ptr );
+   else
+   {
+      WALBERLA_CUDA_CHECK( cudaFree( pitchedPtr_.ptr ) );
+   }
 }
 
+
 template<typename T>
 T * GPUField<T>::dataAt(cell_idx_t x, cell_idx_t y, cell_idx_t z, cell_idx_t f)
 {
diff --git a/src/field/allocation/AlignedMalloc.cpp b/src/field/allocation/AlignedMalloc.cpp
index 578bcfd9f456c41cf6e01066ca5a780ac2e985dd..d7e4409d5cff40fe3bf7c09826129cf8cd488f56 100644
--- a/src/field/allocation/AlignedMalloc.cpp
+++ b/src/field/allocation/AlignedMalloc.cpp
@@ -41,7 +41,7 @@ namespace field {
       void *pa;  // pointer to allocated memory
       void *ptr; // pointer to usable aligned memory
 
-      pa=std::malloc((size+alignment-1)+sizeof(void *));
+      pa = std::malloc((size + alignment - 1) + sizeof( void * ));
       if(!pa)
          return nullptr;
 
diff --git a/tests/cuda/AlignmentTest.cpp b/tests/cuda/AlignmentTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3de12c7628c98da8797d442f6236d7893829788e
--- /dev/null
+++ b/tests/cuda/AlignmentTest.cpp
@@ -0,0 +1,50 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file AlignmentTest.h
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+
+#include "cuda/AlignedAllocation.h"
+#include "core/mpi/Environment.h"
+#include "core/debug/TestSubsystem.h"
+#include "core/logging/Logging.h"
+
+
+using namespace walberla;
+using namespace cuda;
+
+
+int main( int argc, char ** argv )
+{
+   debug::enterTestMode();
+   mpi::Environment env( argc, argv );
+
+   size_t pitch = 0;
+   size_t width = 7;
+   size_t height = 20;
+   size_t alignment = 512;
+   size_t offset = 16;
+   void *ptr = allocate_pitched_with_offset( pitch, width, height, alignment, offset );
+   WALBERLA_LOG_INFO("Pitch " << pitch);
+
+   char * cptr = reinterpret_cast<char*>( ptr );
+   WALBERLA_CHECK_EQUAL( size_t(cptr + offset) % alignment, 0 );
+
+   free_aligned_with_offset( ptr );
+
+   return 0;
+}
diff --git a/tests/cuda/CMakeLists.txt b/tests/cuda/CMakeLists.txt
index 39222ebdeca1f35fd58aabd94ca13c2795b9e3d7..dac2f4a1c3a34e54fbeaa6eedfd894d65be0acce 100644
--- a/tests/cuda/CMakeLists.txt
+++ b/tests/cuda/CMakeLists.txt
@@ -32,6 +32,10 @@ waLBerla_compile_test( FILES communication/CommTest )
 waLBerla_compile_test( FILES CudaMPI DEPENDS blockforest timeloop gui )
 #waLBerla_execute_test( NAME  CudaMPI )
 
+waLBerla_compile_test( FILES AlignmentTest.cpp DEPENDS blockforest timeloop )
+
+waLBerla_compile_test( FILES codegen/MicroBenchmarkGpuLbm.cpp codegen/MicroBenchmarkGpuLbm.py)
+
 waLBerla_add_executable ( NAME CpuGpuGeneratedEquivalenceTest
                           FILES codegen/EquivalenceTest.cpp codegen/EquivalenceTest.gen.py
                           DEPENDS blockforest boundary core cuda field stencil timeloop vtk gui )
diff --git a/tests/cuda/FieldTransferTest.cpp b/tests/cuda/FieldTransferTest.cpp
index 4cdd11b9f5ede14ec9c07a7ad5a3465b8a93df98..7a41330a23ab24e7aaeb2055efe9155dc3aa4ca2 100644
--- a/tests/cuda/FieldTransferTest.cpp
+++ b/tests/cuda/FieldTransferTest.cpp
@@ -26,23 +26,27 @@
 
 #include "cuda/GPUField.h"
 #include "cuda/FieldCopy.h"
+#include "core/math/Random.h"
 
 
 using namespace walberla;
 
 void simpleTransfer()
 {
-   Field<double,4>  h_f1 ( 16, 20, 30, 42.0, field::fzyx );
-   Field<double,4>  h_f2 ( 16, 20, 30,  0.0, field::fzyx );
+   Field<double, 4> h_f1( 16, 20, 30, 42.0, field::fzyx );
+   Field<double, 4> h_f2( 16, 20, 30, 0.0, field::fzyx );
 
+   WALBERLA_FOR_ALL_CELLS_XYZ(&h_f1,
+      h_f1(x, y, z, 0) = math::realRandom<double>();
+   )
 
-   cuda::GPUField<double> d_f ( 16,20,30,4,0, field::fzyx );
+   cuda::GPUField<double> d_f( 16, 20, 30, 4, 0, field::fzyx );
 
-   WALBERLA_CHECK_EQUAL(  h_f1.xSize() ,d_f.xSize() );
-   WALBERLA_CHECK_EQUAL(  h_f1.ySize() ,d_f.ySize() );
-   WALBERLA_CHECK_EQUAL(  h_f1.zSize() ,d_f.zSize() );
-   WALBERLA_CHECK_EQUAL(  h_f1.fSize() ,d_f.fSize() );
-   WALBERLA_CHECK_EQUAL(  h_f1.layout(),d_f.layout()     );
+   WALBERLA_CHECK_EQUAL( h_f1.xSize(), d_f.xSize());
+   WALBERLA_CHECK_EQUAL( h_f1.ySize(), d_f.ySize());
+   WALBERLA_CHECK_EQUAL( h_f1.zSize(), d_f.zSize());
+   WALBERLA_CHECK_EQUAL( h_f1.fSize(), d_f.fSize());
+   WALBERLA_CHECK_EQUAL( h_f1.layout(), d_f.layout());
 
 
    cuda::fieldCpy( d_f, h_f1 );
@@ -52,9 +56,7 @@ void simpleTransfer()
 }
 
 
-
-
-int main( int argc, char ** argv )
+int main( int argc, char **argv )
 {
    debug::enterTestMode();
    walberla::Environment walberlaEnv( argc, argv );
diff --git a/tests/cuda/codegen/MicroBenchmarkGpuLbm.cpp b/tests/cuda/codegen/MicroBenchmarkGpuLbm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8516c09332d071270719b503c0cdc962acb427ca
--- /dev/null
+++ b/tests/cuda/codegen/MicroBenchmarkGpuLbm.cpp
@@ -0,0 +1,66 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file MicroBenchmarkPdfCopy.h
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+
+
+#include "core/debug/TestSubsystem.h"
+#include "core/mpi/Environment.h"
+#include "blockforest/Initialization.h"
+
+#include "field/Field.h"
+
+#include "cuda/GPUField.h"
+#include "cuda/FieldCopy.h"
+#include "cuda/AddGPUFieldToStorage.h"
+
+#include "MicroBenchmarkCopyKernel.h"
+#include "MicroBenchmarkStreamKernel.h"
+
+
+using namespace walberla;
+
+
+int main( int argc, char **argv )
+{
+   debug::enterTestMode();
+   mpi::Environment env( argc, argv );
+
+   shared_ptr<StructuredBlockForest> blocks = blockforest::createUniformBlockGrid(1u, 1u, 1u,
+           128u, 128u, 128u, 1.0, false, false, false, false);
+
+   BlockDataID srcID = cuda::addGPUFieldToStorage<cuda::GPUField<double> >(blocks, "src", 19, field::fzyx, 1);
+   BlockDataID dstID = cuda::addGPUFieldToStorage<cuda::GPUField<double> >(blocks, "dst", 19, field::fzyx, 1);
+
+   int iterations = 3;
+
+   pystencils::MicroBenchmarkCopyKernel copy(dstID, srcID);
+   for( int i=0 ; i < iterations; ++i )
+      for( auto &block: *blocks )
+         copy( &block );
+
+
+   pystencils::MicroBenchmarkStreamKernel stream(dstID, srcID);
+   for( int i=0 ; i < iterations; ++i )
+      for( auto &block: *blocks )
+         stream( &block );
+
+   WALBERLA_CUDA_CHECK(cudaDeviceSynchronize());
+
+   return 0;
+}
diff --git a/tests/cuda/codegen/MicroBenchmarkGpuLbm.py b/tests/cuda/codegen/MicroBenchmarkGpuLbm.py
new file mode 100644
index 0000000000000000000000000000000000000000..b722f0510b95e9b8e01d6bddd5947ef97f2a74c9
--- /dev/null
+++ b/tests/cuda/codegen/MicroBenchmarkGpuLbm.py
@@ -0,0 +1,27 @@
+import pystencils as ps
+from pystencils_walberla.sweep import Sweep
+from lbmpy.updatekernels import create_stream_pull_only_kernel
+from lbmpy.stencils import get_stencil
+
+dtype = 'float64'
+f_size = 19
+
+
+def copy_only():
+    src, dst = ps.fields("src({f_size}), dst({f_size}) : {dtype}[3D]".format(dtype=dtype, f_size=f_size),
+                         layout='fzyx')
+    return [ps.Assignment(dst(i), src(i)) for i in range(f_size)]
+
+
+def stream_only():
+    stencil = get_stencil("D3Q19")
+    return create_stream_pull_only_kernel(stencil, src_field_name='src',
+                                          dst_field_name='dst',
+                                          generic_field_type=dtype,
+                                          generic_layout='fzyx')
+
+
+opt = {'gpu_indexing_params': {'block_size': (128, 1, 1)}, 'data_type': dtype}
+
+Sweep.generate_from_equations('MicroBenchmarkCopyKernel', copy_only, target='gpu', optimization=opt)
+Sweep.generate_from_equations('MicroBenchmarkStreamKernel', stream_only, target='gpu', optimization=opt)