diff --git a/CHANGELOG.md b/CHANGELOG.md index 94c89858523f668f989bdbd0c2dd8764fdcbda6f..c8e6947b5ab0b95d5016e45e749a91a607992e6f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,11 @@ - Add support for more shapes, e.g., convex polyhedron - MESA_PD: - Add extensive application for dense particle packing generation +- AMD - HIP support + - Support of the ROCm Toolchain and thus AMD HIP as second GPU language + - All CUDA related files, namespaces, folders etc are renamed to gpu. + - Include "GPUWrapper.h" to use general GPU functions cudaMalloc -> gpuMalloc + - WALBERLA_BUILD_WITH_HIP and WALBERLA_BUILD_WITH_GPU_SUPPORT as new CMake variables introduced ### Changed - Update and extend phase-field LBM showcases diff --git a/CMakeLists.txt b/CMakeLists.txt index 28857123a654e2f23c8794936f0bad931fb8b4f0..429e36ea2c67d660fb5428c5bfa8960a4fce70a1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -82,6 +82,7 @@ option ( WALBERLA_BUILD_WITH_CODEGEN "Enable pystencils code generation" option ( WALBERLA_BUILD_WITH_LIKWID_MARKERS "Compile in markers for likwid-perfctr" ) option ( WALBERLA_BUILD_WITH_CUDA "Enable CUDA support" ) +option ( WALBERLA_BUILD_WITH_HIP "Enable ROCm HIP support" ) option ( WALBERLA_BUILD_WITH_FASTMATH "Fast math" ) @@ -219,8 +220,7 @@ else() endif() mark_as_advanced ( WALBERLA_CXX_COMPILER_IS_NEC ) -# Check for Clang compiler -if( CMAKE_CXX_COMPILER MATCHES "clang" OR CMAKE_CXX_COMPILER_ARG1 MATCHES "clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" ) +if( CMAKE_CXX_COMPILER MATCHES "clang" OR CMAKE_CXX_COMPILER_ARG1 MATCHES "clang" OR CMAKE_CXX_COMPILER MATCHES "hipcc" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" ) option ( WALBERLA_CXX_COMPILER_IS_CLANG "Use clang compiler" ON ) else() option ( WALBERLA_CXX_COMPILER_IS_CLANG "Use clang compiler" OFF ) @@ -1081,6 +1081,10 @@ endif() ## 
############################################################################################################################ if ( WALBERLA_BUILD_WITH_CUDA ) + if (WALBERLA_BUILD_WITH_HIP) + message(FATAL_ERROR "For GPU support either use CUDA or HIP. Both simultaneously is not supported.") + endif() + include(CheckLanguage) check_language(CUDA) if( CMAKE_CUDA_COMPILER ) @@ -1102,6 +1106,7 @@ if ( WALBERLA_BUILD_WITH_CUDA ) #CUDA_FOUND is need for our cmake mechanism set ( CUDA_FOUND TRUE ) + set (WALBERLA_BUILD_WITH_GPU_SUPPORT TRUE) else() message( WARNING "CUDA could not be enabled. The host compiler might not be compatible. Check CMakeFiles/CMakeError.log for more information" ) set ( WALBERLA_BUILD_WITH_CUDA FALSE ) @@ -1129,6 +1134,38 @@ endif() +############################################################################################################################ +## +## ROCm HIP +## +############################################################################################################################ +if ( WALBERLA_BUILD_WITH_HIP ) + if (WALBERLA_BUILD_WITH_CUDA) + message(FATAL_ERROR "For GPU support either use CUDA or HIP. Both simultaneously is not supported.") + endif() + if (${CMAKE_VERSION} VERSION_LESS "3.21.0") + message(FATAL_ERROR "For HIP support CMake > 3.21.0 is needed. Please install a newer version") + endif() + + include(CheckLanguage) + check_language(HIP) + + if( CMAKE_HIP_COMPILER ) + enable_language(HIP) + # since waLBerla also supports CUDA we only use HIP on an AMD platform + add_compile_definitions(__HIP_PLATFORM_AMD__) + # include_directories(${HSA_HEADER}) + set (WALBERLA_BUILD_WITH_GPU_SUPPORT TRUE) + else() + message("HIP compiler not found. 
HIP support is not possible") + set ( WALBERLA_BUILD_WITH_HIP FALSE ) + endif ( ) +endif ( ) + +############################################################################################################################ + + + ############################################################################################################################ ## ## Testing Coverage diff --git a/apps/benchmarks/CMakeLists.txt b/apps/benchmarks/CMakeLists.txt index 3f5e6a95a0f0c6a8199de013da440e37e7f78afb..4b95602d6daca9adec9a4932e4f12707f1fb0878 100644 --- a/apps/benchmarks/CMakeLists.txt +++ b/apps/benchmarks/CMakeLists.txt @@ -27,7 +27,7 @@ if ( WALBERLA_BUILD_WITH_PYTHON ) add_subdirectory( PhaseFieldAllenCahn ) endif() - if ( WALBERLA_BUILD_WITH_CODEGEN AND WALBERLA_BUILD_WITH_CUDA ) + if ( WALBERLA_BUILD_WITH_CODEGEN AND WALBERLA_BUILD_WITH_GPU_SUPPORT ) add_subdirectory( UniformGridGPU ) endif() diff --git a/apps/benchmarks/CouetteFlow/CouetteFlow.cpp b/apps/benchmarks/CouetteFlow/CouetteFlow.cpp index b313738d067f0f0ad6501f4e1776d3e0ed9f2bf9..1f3ea1b7dfade52e614a916a40261415e6874984 100644 --- a/apps/benchmarks/CouetteFlow/CouetteFlow.cpp +++ b/apps/benchmarks/CouetteFlow/CouetteFlow.cpp @@ -773,7 +773,7 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod // remaining time logger - const double remainingTimeLoggerFrequency = configBlock.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 ); + const real_t remainingTimeLoggerFrequency = configBlock.getParameter< real_t >( "remainingTimeLoggerFrequency", real_c(3.0) ); timeloop.addFuncAfterTimeStep( timing::RemainingTimeLogger( timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency ), "Remaining time logger" ); // logging right before the simulation starts diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt b/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt index faaaf44dc20de92da16b0333a08ab26886ab9fd7..4010341a3d5a4ba93558eae60e95f2fcd292bcbc 100644 --- 
a/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt +++ b/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt @@ -12,9 +12,9 @@ waLBerla_generate_target_from_python(NAME FlowAroundSphereGenerated FlowAroundSphereCodeGen_InfoHeader.h) if (WALBERLA_BUILD_WITH_CUDA) - waLBerla_add_executable( NAME FlowAroundSphereCodeGen FILE FlowAroundSphereCodeGen.cpp - DEPENDS blockforest boundary core cuda domain_decomposition field geometry python_coupling timeloop vtk FlowAroundSphereGenerated) + waLBerla_add_executable( NAME FlowAroundSphereCodeGen FILE FlowAroundSphereCodeGen.cpp + DEPENDS blockforest boundary core gpu domain_decomposition field geometry python_coupling timeloop vtk FlowAroundSphereGenerated) else () - waLBerla_add_executable( NAME FlowAroundSphereCodeGen FILE FlowAroundSphereCodeGen.cpp - DEPENDS blockforest boundary core domain_decomposition field geometry python_coupling timeloop vtk FlowAroundSphereGenerated) -endif (WALBERLA_BUILD_WITH_CUDA) \ No newline at end of file + waLBerla_add_executable( NAME FlowAroundSphereCodeGen FILE FlowAroundSphereCodeGen.cpp + DEPENDS blockforest boundary core domain_decomposition field geometry python_coupling timeloop vtk FlowAroundSphereGenerated) +endif (WALBERLA_BUILD_WITH_CUDA) diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp index bdd1ccbe13995ea6632c6013fd9e54dae9b1f6bc..08e5de4d928d9a23e76104823721dfbb5d811f69 100644 --- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp +++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp @@ -37,13 +37,13 @@ #include "timeloop/all.h" #if defined(WALBERLA_BUILD_WITH_CUDA) -# include "cuda/AddGPUFieldToStorage.h" -# include "cuda/DeviceSelectMPI.h" -# include "cuda/HostFieldAllocator.h" -# include "cuda/NVTX.h" -# include "cuda/ParallelStreams.h" -# include "cuda/communication/GPUPackInfo.h" -# include 
"cuda/communication/UniformGPUScheme.h" +# include "gpu/AddGPUFieldToStorage.h" +# include "gpu/DeviceSelectMPI.h" +# include "gpu/HostFieldAllocator.h" +# include "gpu/NVTX.h" +# include "gpu/ParallelStreams.h" +# include "gpu/communication/GPUPackInfo.h" +# include "gpu/communication/UniformGPUScheme.h" #endif // CodeGen includes @@ -58,7 +58,7 @@ typedef walberla::uint8_t flag_t; typedef FlagField< flag_t > FlagField_T; #if defined(WALBERLA_BUILD_WITH_CUDA) -typedef cuda::GPUField< real_t > GPUField; +typedef gpu::GPUField< real_t > GPUField; #endif using namespace std::placeholders; @@ -134,7 +134,7 @@ int main(int argc, char** argv) { walberla::Environment walberlaEnv(argc, argv); #if defined(WALBERLA_BUILD_WITH_CUDA) - cuda::selectDeviceBasedOnMpiRank(); + gpu::selectDeviceBasedOnMpiRank(); #endif for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) @@ -157,8 +157,8 @@ int main(int argc, char** argv) const uint_t diameter_sphere = parameters.getParameter< uint_t >("diameter_sphere", uint_t(5)); const bool constant_inflow = parameters.getParameter< bool >("constant_inflow", true); - const double remainingTimeLoggerFrequency = - parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds + const real_t remainingTimeLoggerFrequency = + parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds // create fields BlockDataID pdfFieldID = blocks->addStructuredBlockData< PdfField_T >(pdfFieldAdder, "PDFs"); @@ -166,11 +166,11 @@ int main(int argc, char** argv) BlockDataID densityFieldID = field::addToStorage< ScalarField_T >(blocks, "density", real_c(0.0), field::fzyx); #if defined(WALBERLA_BUILD_WITH_CUDA) - BlockDataID pdfFieldIDGPU = cuda::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldID, "PDFs on GPU", true); + BlockDataID pdfFieldIDGPU = gpu::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldID, "PDFs on GPU", true); BlockDataID velFieldIDGPU = - 
cuda::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldID, "velocity on GPU", true); + gpu::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldID, "velocity on GPU", true); BlockDataID densityFieldIDGPU = - cuda::addGPUFieldToStorage< ScalarField_T >(blocks, densityFieldID, "density on GPU", true); + gpu::addGPUFieldToStorage< ScalarField_T >(blocks, densityFieldID, "density on GPU", true); #endif BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); @@ -180,7 +180,7 @@ int main(int argc, char** argv) pystencils::FlowAroundSphereCodeGen_MacroSetter setterSweep(pdfFieldIDGPU, velFieldIDGPU); for (auto& block : *blocks) setterSweep(&block); - cuda::fieldCpy< PdfField_T, GPUField >(blocks, pdfFieldID, pdfFieldIDGPU); + gpu::fieldCpy< PdfField_T, GPUField >(blocks, pdfFieldID, pdfFieldIDGPU); #else pystencils::FlowAroundSphereCodeGen_MacroSetter setterSweep(pdfFieldID, velFieldID); for (auto& block : *blocks) @@ -192,11 +192,11 @@ int main(int argc, char** argv) // This way of using alternating pack infos is temporary and will soon be replaced // by something more straight-forward - cuda::communication::UniformGPUScheme< Stencil_T > comEven(blocks, false); + gpu::communication::UniformGPUScheme< Stencil_T > comEven(blocks, false); comEven.addPackInfo(make_shared< PackInfoEven_T >(pdfFieldIDGPU)); auto evenComm = std::function< void() >([&]() { comEven.communicate(nullptr); }); - cuda::communication::UniformGPUScheme< Stencil_T > comODD(blocks, false); + gpu::communication::UniformGPUScheme< Stencil_T > comODD(blocks, false); comODD.addPackInfo(make_shared< PackInfoOdd_T >(pdfFieldIDGPU)); auto oddComm = std::function< void() >([&]() { comODD.communicate(nullptr); }); #else @@ -270,8 +270,8 @@ int main(int argc, char** argv) #if defined(WALBERLA_BUILD_WITH_CUDA) vtkOutput->addBeforeFunction([&]() { - cuda::fieldCpy< VelocityField_T, GPUField >(blocks, velFieldID, velFieldIDGPU); - cuda::fieldCpy< ScalarField_T, GPUField 
>(blocks, densityFieldID, densityFieldIDGPU); + gpu::fieldCpy< VelocityField_T, GPUField >(blocks, velFieldID, velFieldIDGPU); + gpu::fieldCpy< ScalarField_T, GPUField >(blocks, densityFieldID, densityFieldIDGPU); }); #endif auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >(velFieldID, "velocity"); diff --git a/apps/benchmarks/PhaseFieldAllenCahn/CMakeLists.txt b/apps/benchmarks/PhaseFieldAllenCahn/CMakeLists.txt index e998b35efa9e5e01dc20e7dc91f4791321ae58b0..52d29a0fb422a544212d4ff8d6e1a2fb763e6604 100644 --- a/apps/benchmarks/PhaseFieldAllenCahn/CMakeLists.txt +++ b/apps/benchmarks/PhaseFieldAllenCahn/CMakeLists.txt @@ -15,7 +15,7 @@ waLBerla_generate_target_from_python(NAME BenchmarkPhaseFieldCodeGen if (WALBERLA_BUILD_WITH_CUDA) waLBerla_add_executable(NAME benchmark_multiphase FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py - DEPENDS blockforest core cuda field postprocessing python_coupling lbm geometry timeloop gui BenchmarkPhaseFieldCodeGen) + DEPENDS blockforest core gpu field postprocessing python_coupling lbm geometry timeloop gui BenchmarkPhaseFieldCodeGen) else () waLBerla_add_executable(NAME benchmark_multiphase FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py diff --git a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp b/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp index b757f3a5b306e9a7dc75e38da5551ad3098c483a..c8992a65afb93fa7dae572959a654651f14aabde 100644 --- a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp +++ b/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp @@ -44,11 +44,11 @@ //////////////////////////// #if defined(WALBERLA_BUILD_WITH_CUDA) -# include "cuda/AddGPUFieldToStorage.h" -# include "cuda/DeviceSelectMPI.h" -# include "cuda/ParallelStreams.h" -# include "cuda/communication/MemcpyPackInfo.h" -# include "cuda/communication/UniformGPUScheme.h" +# include "gpu/AddGPUFieldToStorage.h" +# include 
"gpu/DeviceSelectMPI.h" +# include "gpu/ParallelStreams.h" +# include "gpu/communication/MemcpyPackInfo.h" +# include "gpu/communication/UniformGPUScheme.h" #else # include <blockforest/communication/UniformBufferedScheme.h> #endif @@ -61,14 +61,14 @@ using flag_t = walberla::uint8_t; using FlagField_T = FlagField< flag_t >; #if defined(WALBERLA_BUILD_WITH_CUDA) -typedef cuda::GPUField< real_t > GPUField; +typedef gpu::GPUField< real_t > GPUField; #endif int main(int argc, char** argv) { mpi::Environment env(argc, argv); #if defined(WALBERLA_BUILD_WITH_CUDA) - cuda::selectDeviceBasedOnMpiRank(); + gpu::selectDeviceBasedOnMpiRank(); #endif for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) @@ -95,14 +95,14 @@ int main(int argc, char** argv) BlockDataID vel_field = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx); BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx); // GPU fields - BlockDataID lb_phase_field_gpu = cuda::addGPUFieldToStorage< cuda::GPUField< real_t > >( + BlockDataID lb_phase_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( blocks, "lb phase field on GPU", Stencil_phase_T::Size, field::fzyx, 1); - BlockDataID lb_velocity_field_gpu = cuda::addGPUFieldToStorage< cuda::GPUField< real_t > >( + BlockDataID lb_velocity_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( blocks, "lb velocity field on GPU", Stencil_hydro_T::Size, field::fzyx, 1); BlockDataID vel_field_gpu = - cuda::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true); + gpu::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true); BlockDataID phase_field_gpu = - cuda::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true); + gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true); #else 
BlockDataID lb_phase_field = field::addToStorage< PdfField_phase_T >(blocks, "lb phase field", real_c(0.0), field::fzyx); @@ -128,7 +128,7 @@ int main(int argc, char** argv) initPhaseField_RTI(blocks, phase_field); } #if defined(WALBERLA_BUILD_WITH_CUDA) - cuda::fieldCpy< GPUField, PhaseField_T >(blocks, phase_field_gpu, phase_field); + gpu::fieldCpy< GPUField, PhaseField_T >(blocks, phase_field_gpu, phase_field); #endif WALBERLA_LOG_INFO_ON_ROOT("initialization of the phase field done") } @@ -154,7 +154,7 @@ int main(int argc, char** argv) #if defined(WALBERLA_BUILD_WITH_CUDA) const bool cudaEnabledMpi = parameters.getParameter< bool >("cudaEnabledMpi", false); auto Comm_velocity_based_distributions = - make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); + make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); auto generatedPackInfo_velocity_based_distributions = make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu); Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_velocity_based_distributions); @@ -162,7 +162,7 @@ int main(int argc, char** argv) Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_phase_field); auto Comm_phase_field_distributions = - make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); + make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); auto generatedPackInfo_phase_field_distributions = make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field_gpu); Comm_phase_field_distributions->addPackInfo(generatedPackInfo_phase_field_distributions); @@ -183,7 +183,7 @@ int main(int argc, char** argv) Comm_phase_field_distributions.addPackInfo(generatedPackInfo_phase_field_distributions); #endif - BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); + BlockDataID const flagFieldID 
= field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); // Boundaries const FlagUID fluidFlagUID("Fluid"); auto boundariesConfig = config->getBlock("Boundaries_GPU"); @@ -206,10 +206,10 @@ int main(int argc, char** argv) } #if defined(WALBERLA_BUILD_WITH_CUDA) - int streamLowPriority = 0; - int streamHighPriority = 0; - auto defaultStream = cuda::StreamRAII::newPriorityStream(streamLowPriority); - auto innerOuterStreams = cuda::ParallelStreams(streamHighPriority); + int const streamLowPriority = 0; + int const streamHighPriority = 0; + auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority); + auto innerOuterStreams = gpu::ParallelStreams(streamHighPriority); #endif auto timeLoop = make_shared< SweepTimeloop >(blocks->getBlockStorage(), timesteps); @@ -296,14 +296,14 @@ int main(int argc, char** argv) timing::RemainingTimeLogger(timeLoop->getNrOfTimeSteps(), remainingTimeLoggerFrequency), "remaining time logger"); - uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + uint_t const vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); if (vtkWriteFrequency > 1) { auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", "simulation_step", false, true, true, false, 0); #if defined(WALBERLA_BUILD_WITH_CUDA) vtkOutput->addBeforeFunction( - [&]() { cuda::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu); }); + [&]() { gpu::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu); }); #endif auto phaseWriter = make_shared< field::VTKWriter< PhaseField_T > >(phase_field, "phase"); vtkOutput->addCellDataWriter(phaseWriter); diff --git a/apps/benchmarks/PoiseuilleChannel/PoiseuilleChannel.cpp b/apps/benchmarks/PoiseuilleChannel/PoiseuilleChannel.cpp index 2acb0a54620bef5d8ab2fce03f23255cabc08c91..6f21d16ad34beffce6a711ae27d4adb42cc4b049 100644 --- 
a/apps/benchmarks/PoiseuilleChannel/PoiseuilleChannel.cpp +++ b/apps/benchmarks/PoiseuilleChannel/PoiseuilleChannel.cpp @@ -890,7 +890,7 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod // remaining time logger - const double remainingTimeLoggerFrequency = configBlock.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 ); + const real_t remainingTimeLoggerFrequency = configBlock.getParameter< real_t >( "remainingTimeLoggerFrequency", real_c(3.0) ); timeloop.addFuncAfterTimeStep( timing::RemainingTimeLogger( timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency ), "Remaining time logger" ); // logging right before the simulation starts diff --git a/apps/benchmarks/SchaeferTurek/SchaeferTurek.cpp b/apps/benchmarks/SchaeferTurek/SchaeferTurek.cpp index e3bc4299cf7e3d95bf08fa8b1ee7687044b575d1..3c75d4ab686d7f55021fe12bf844c54b74901da3 100644 --- a/apps/benchmarks/SchaeferTurek/SchaeferTurek.cpp +++ b/apps/benchmarks/SchaeferTurek/SchaeferTurek.cpp @@ -2623,7 +2623,7 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod // remaining time logger - const double remainingTimeLoggerFrequency = configBlock.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 ); + const real_t remainingTimeLoggerFrequency = configBlock.getParameter< real_t >( "remainingTimeLoggerFrequency", real_c(3.0) ); timeloop.addFuncAfterTimeStep( timing::RemainingTimeLogger( timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency ), "Remaining time logger" ); // logging right before the simulation starts diff --git a/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp index 9916a71e18fa09d490a2b4bdb16029136a6106a3..3b4a77a570ad86d2adc95789f0a58cda3a3dd4e9 100644 --- a/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp +++ b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp @@ -64,7 +64,7 @@ auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage* const stora 
int main(int argc, char** argv) { - mpi::Environment env(argc, argv); + mpi::Environment const env(argc, argv); for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) { @@ -85,7 +85,7 @@ int main(int argc, char** argv) // Creating fields BlockDataID pdfFieldId = blocks->addStructuredBlockData< PdfField_T >(pdfFieldAdder, "pdfs"); BlockDataID velFieldId = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx); - BlockDataID densityFieldId = field::addToStorage< ScalarField_T >(blocks, "density", real_c(1.0), field::fzyx); + BlockDataID const densityFieldId = field::addToStorage< ScalarField_T >(blocks, "density", real_c(1.0), field::fzyx); // Initialize velocity on cpu if (initShearFlow) @@ -111,14 +111,14 @@ int main(int argc, char** argv) WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock") } } - Cell innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]); + Cell const innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]); LbSweep lbSweep(pdfFieldId, omega, innerOuterSplitCell); pystencils::UniformGridCPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldId); // Boundaries const FlagUID fluidFlagUID("Fluid"); - BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field"); + BlockDataID const flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field"); auto boundariesConfig = config->getBlock("Boundaries"); bool boundaries = false; if (boundariesConfig) @@ -244,7 +244,7 @@ int main(int argc, char** argv) timeLoop.add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step"); - uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + uint_t const vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); if (vtkWriteFrequency > 0) { auto vtkOutput = 
vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", @@ -264,12 +264,12 @@ int main(int argc, char** argv) /// BENCHMARK /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - int warmupSteps = parameters.getParameter< int >("warmupSteps", 2); - int outerIterations = parameters.getParameter< int >("outerIterations", 1); + int const warmupSteps = parameters.getParameter< int >("warmupSteps", 2); + int const outerIterations = parameters.getParameter< int >("outerIterations", 1); for (int i = 0; i < warmupSteps; ++i) timeLoop.singleStep(); - real_t remainingTimeLoggerFrequency = + real_t const remainingTimeLoggerFrequency = parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(-1.0)); // in seconds if (remainingTimeLoggerFrequency > 0) { diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt index 890d77f124479016436f5b0de7ba3f9eb49384fa..b1f74c57130935614f1e86c71d31a003afc27b7a 100644 --- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt +++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt @@ -1,4 +1,3 @@ - waLBerla_link_files_to_builddir( "*.prm" ) waLBerla_link_files_to_builddir( "*.py" ) waLBerla_link_files_to_builddir( "simulation_setup" ) @@ -15,19 +14,19 @@ foreach(streaming_pattern pull push aa esotwist) waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config} FILE UniformGridGPU.py CODEGEN_CFG ${config} - OUT_FILES UniformGridGPU_LbKernel.cu UniformGridGPU_LbKernel.h - UniformGridGPU_PackInfoEven.cu UniformGridGPU_PackInfoEven.h - UniformGridGPU_PackInfoOdd.cu UniformGridGPU_PackInfoOdd.h - UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h - UniformGridGPU_UBB.cu UniformGridGPU_UBB.h - UniformGridGPU_MacroSetter.cu UniformGridGPU_MacroSetter.h - UniformGridGPU_StreamOnlyKernel.cu UniformGridGPU_StreamOnlyKernel.h + OUT_FILES UniformGridGPU_LbKernel.${CODEGEN_FILE_SUFFIX} 
UniformGridGPU_LbKernel.h + UniformGridGPU_PackInfoEven.${CODEGEN_FILE_SUFFIX} UniformGridGPU_PackInfoEven.h + UniformGridGPU_PackInfoOdd.${CODEGEN_FILE_SUFFIX} UniformGridGPU_PackInfoOdd.h + UniformGridGPU_NoSlip.${CODEGEN_FILE_SUFFIX} UniformGridGPU_NoSlip.h + UniformGridGPU_UBB.${CODEGEN_FILE_SUFFIX} UniformGridGPU_UBB.h + UniformGridGPU_MacroSetter.${CODEGEN_FILE_SUFFIX} UniformGridGPU_MacroSetter.h + UniformGridGPU_StreamOnlyKernel.${CODEGEN_FILE_SUFFIX} UniformGridGPU_StreamOnlyKernel.h UniformGridGPU_InfoHeader.h ) waLBerla_add_executable(NAME UniformGridGPU_${config} FILES UniformGridGPU.cpp - DEPENDS blockforest boundary core cuda domain_decomposition field geometry python_coupling timeloop vtk UniformGridGPUGenerated_${config}) + DEPENDS blockforest boundary core gpu domain_decomposition field geometry python_coupling timeloop vtk UniformGridGPUGenerated_${config}) # all configs are excluded from all except for pull d3q27. if (${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27") diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp index 4d2ee1afaf27a1bf73514a11c3ab19bd092bd40b..7a3885d3b686d0967f7e7825ea109b8051393309 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp @@ -27,13 +27,6 @@ #include "core/timing/RemainingTimeLogger.h" #include "core/timing/TimingPool.h" -#include "cuda/AddGPUFieldToStorage.h" -#include "cuda/DeviceSelectMPI.h" -#include "cuda/FieldCopy.h" -#include "cuda/ParallelStreams.h" -#include "cuda/communication/UniformGPUScheme.h" -#include "cuda/lbm/CombinedInPlaceGpuPackInfo.h" - #include "field/AddToStorage.h" #include "field/FlagField.h" #include "field/communication/PackInfo.h" @@ -53,20 +46,27 @@ #include "InitShearVelocity.h" #include "UniformGridGPU_InfoHeader.h" +#include "gpu/AddGPUFieldToStorage.h" +#include "gpu/DeviceSelectMPI.h" +#include "gpu/FieldCopy.h" +#include 
"gpu/GPUWrapper.h" +#include "gpu/ParallelStreams.h" +#include "gpu/communication/UniformGPUScheme.h" +#include "gpu/lbm/CombinedInPlaceGpuPackInfo.h" using namespace walberla; using FlagField_T = FlagField< uint8_t >; int main(int argc, char** argv) { - mpi::Environment env(argc, argv); - cuda::selectDeviceBasedOnMpiRank(); + mpi::Environment const env(argc, argv); + gpu::selectDeviceBasedOnMpiRank(); for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) { WALBERLA_MPI_WORLD_BARRIER() - WALBERLA_CUDA_CHECK(cudaPeekAtLastError()) + WALBERLA_GPU_CHECK(gpuPeekAtLastError()) ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// SETUP AND CONFIGURATION /// @@ -85,7 +85,7 @@ int main(int argc, char** argv) const bool initShearFlow = parameters.getParameter< bool >("initShearFlow", true); // Creating fields - BlockDataID pdfFieldCpuID = + BlockDataID const pdfFieldCpuID = field::addToStorage< PdfField_T >(blocks, "pdfs cpu", real_c(std::nan("")), field::fzyx); BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx); @@ -96,10 +96,10 @@ int main(int argc, char** argv) initShearVelocity(blocks, velFieldCpuID); } - BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldCpuID, "pdfs on GPU", true); + BlockDataID const pdfFieldGpuID = gpu::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldCpuID, "pdfs on GPU", true); // Velocity field is copied to the GPU BlockDataID velFieldGpuID = - cuda::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldCpuID, "velocity on GPU", true); + gpu::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldCpuID, "velocity on GPU", true); pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldGpuID, velFieldGpuID); @@ -116,14 +116,14 @@ int main(int argc, char** argv) { WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or 
increase cellsPerBlock") } } - Cell innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]); - bool cudaEnabledMPI = parameters.getParameter< bool >("cudaEnabledMPI", false); + Cell const innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]); + bool const cudaEnabledMPI = parameters.getParameter< bool >("cudaEnabledMPI", false); Vector3< int32_t > gpuBlockSize = parameters.getParameter< Vector3< int32_t > >("gpuBlockSize", Vector3< int32_t >(256, 1, 1)); int streamHighPriority = 0; int streamLowPriority = 0; - WALBERLA_CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority)) + WALBERLA_GPU_CHECK(gpuDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority)) ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// LB SWEEPS AND BOUNDARY HANDLING /// @@ -132,7 +132,7 @@ int main(int argc, char** argv) using LbSweep = lbm::UniformGridGPU_LbKernel; using PackInfoEven = lbm::UniformGridGPU_PackInfoEven; using PackInfoOdd = lbm::UniformGridGPU_PackInfoOdd; - using cuda::communication::UniformGPUScheme; + using gpu::communication::UniformGPUScheme; LbSweep lbSweep(pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], innerOuterSplitCell); lbSweep.setOuterPriority(streamHighPriority); @@ -142,7 +142,7 @@ int main(int argc, char** argv) // Boundaries const FlagUID fluidFlagUID("Fluid"); - BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field"); + BlockDataID const flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field"); auto boundariesConfig = config->getBlock("Boundaries"); bool boundaries = false; if (boundariesConfig) @@ -174,19 +174,19 @@ int main(int argc, char** argv) /// TIME STEP DEFINITIONS /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - auto 
defaultStream = cuda::StreamRAII::newPriorityStream(streamLowPriority); + auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority); - auto boundarySweep = [&](IBlock* block, uint8_t t, cudaStream_t stream) { + auto boundarySweep = [&](IBlock* block, uint8_t t, gpuStream_t stream) { noSlip.run(block, t, stream); ubb.run(block, t, stream); }; - auto boundaryInner = [&](IBlock* block, uint8_t t, cudaStream_t stream) { + auto boundaryInner = [&](IBlock* block, uint8_t t, gpuStream_t stream) { noSlip.inner(block, t, stream); ubb.inner(block, t, stream); }; - auto boundaryOuter = [&](IBlock* block, uint8_t t, cudaStream_t stream) { + auto boundaryOuter = [&](IBlock* block, uint8_t t, gpuStream_t stream) { noSlip.outer(block, t, stream); ubb.outer(block, t, stream); }; @@ -270,7 +270,7 @@ int main(int argc, char** argv) timeLoop.add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step"); // VTK - uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + uint_t const vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); if (vtkWriteFrequency > 0) { auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", @@ -278,8 +278,7 @@ int main(int argc, char** argv) auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >(velFieldCpuID, "vel"); vtkOutput->addCellDataWriter(velWriter); - vtkOutput->addBeforeFunction([&]() { - cuda::fieldCpy< VelocityField_T, cuda::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID); + vtkOutput->addBeforeFunction([&]() { gpu::fieldCpy< VelocityField_T, gpu::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID); }); timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); } @@ -288,12 +287,12 @@ int main(int argc, char** argv) /// BENCHMARK /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - int warmupSteps = 
parameters.getParameter< int >("warmupSteps", 2); - int outerIterations = parameters.getParameter< int >("outerIterations", 1); + int const warmupSteps = parameters.getParameter< int >("warmupSteps", 2); + int const outerIterations = parameters.getParameter< int >("outerIterations", 1); for (int i = 0; i < warmupSteps; ++i) timeLoop.singleStep(); - double remainingTimeLoggerFrequency = + real_t const remainingTimeLoggerFrequency = parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(-1.0)); // in seconds if (remainingTimeLoggerFrequency > 0) { @@ -304,16 +303,16 @@ int main(int argc, char** argv) for (int outerIteration = 0; outerIteration < outerIterations; ++outerIteration) { - WALBERLA_CUDA_CHECK(cudaPeekAtLastError()) + WALBERLA_GPU_CHECK(gpuPeekAtLastError()) timeLoop.setCurrentTimeStepToZero(); WcTimer simTimer; - cudaDeviceSynchronize(); - WALBERLA_CUDA_CHECK(cudaPeekAtLastError()) + WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) + WALBERLA_GPU_CHECK(gpuPeekAtLastError()) WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps") simTimer.start(); timeLoop.run(); - cudaDeviceSynchronize(); + WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) simTimer.end(); WALBERLA_LOG_INFO_ON_ROOT("Simulation finished") auto time = real_c(simTimer.last()); @@ -340,5 +339,5 @@ int main(int argc, char** argv) } } - return 0; + return EXIT_SUCCESS; } diff --git a/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.cpp index 48fca7135ba30b8f546b421c1329e847e0b6d129..f6199b1ec6a1dcade811d79e67b3555f26332c9d 100644 --- a/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.cpp +++ b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.cpp @@ -14,15 +14,15 @@ #include "timeloop/all.h" #include "core/math/Random.h" #include "geometry/all.h" -#include "cuda/HostFieldAllocator.h" -#include "cuda/communication/GPUPackInfo.h" -#include "cuda/ParallelStreams.h" -#include 
"cuda/NVTX.h" +#include "gpu/HostFieldAllocator.h" +#include "gpu/communication/GPUPackInfo.h" +#include "gpu/ParallelStreams.h" +#include "gpu/NVTX.h" #include "core/timing/TimingPool.h" #include "core/timing/RemainingTimeLogger.h" -#include "cuda/AddGPUFieldToStorage.h" -#include "cuda/communication/UniformGPUScheme.h" -#include "cuda/DeviceSelectMPI.h" +#include "gpu/AddGPUFieldToStorage.h" +#include "gpu/communication/UniformGPUScheme.h" +#include "gpu/DeviceSelectMPI.h" #include "domain_decomposition/SharedSweep.h" #include "UniformGridGPU_LatticeModel.h" @@ -48,7 +48,7 @@ const auto Q = LatticeModel_T::Stencil::Q; using Stencil_T = LatticeModel_T::Stencil; using CommunicationStencil_T = LatticeModel_T::CommunicationStencil; using PdfField_T = GhostLayerField<real_t, Q>; -using CommScheme_T = cuda::communication::UniformGPUScheme<CommunicationStencil_T>; +using CommScheme_T = gpu::communication::UniformGPUScheme<CommunicationStencil_T>; using VelocityField_T = GhostLayerField<real_t, 3>; using flag_t = walberla::uint8_t; using FlagField_T = FlagField<flag_t>; @@ -56,7 +56,7 @@ using FlagField_T = FlagField<flag_t>; int main( int argc, char **argv ) { mpi::Environment env( argc, argv ); - cuda::selectDeviceBasedOnMpiRank(); + gpu::selectDeviceBasedOnMpiRank(); for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg ) { @@ -96,7 +96,7 @@ int main( int argc, char **argv ) initialComm(); } - BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true ); + BlockDataID pdfFieldGpuID = gpu::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true ); BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" ); @@ -155,13 +155,13 @@ int main( int argc, char **argv ) gpuBlockSize[0], gpuBlockSize[1], Cell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]) ); lbKernel.setOuterPriority( streamHighPriority ); - 
UniformGridGPU_Communication< CommunicationStencil_T, cuda::GPUField< double > > + UniformGridGPU_Communication< CommunicationStencil_T, gpu::GPUField< double > > gpuComm( blocks, pdfFieldGpuID, (CommunicationSchemeType) communicationScheme, cudaEnabledMPI ); - auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority ); - auto innerOuterStreams = cuda::ParallelStreams( streamHighPriority ); - auto boundaryOuterStreams = cuda::ParallelStreams( streamHighPriority ); - auto boundaryInnerStreams = cuda::ParallelStreams( streamHighPriority ); + auto defaultStream = gpu::StreamRAII::newPriorityStream( streamLowPriority ); + auto innerOuterStreams = gpu::ParallelStreams( streamHighPriority ); + auto boundaryOuterStreams = gpu::ParallelStreams( streamHighPriority ); + auto boundaryInnerStreams = gpu::ParallelStreams( streamHighPriority ); uint_t currentTimeStep = 0; @@ -177,12 +177,12 @@ int main( int argc, char **argv ) auto overlapTimeStep = [&]() { - cuda::NvtxRange namedRange("timestep"); + gpu::NvtxRange namedRange("timestep"); auto innerOuterSection = innerOuterStreams.parallelSection( defaultStream ); innerOuterSection.run([&]( auto innerStream ) { - cuda::nameStream(innerStream, "inner stream"); + gpu::nameStream(innerStream, "inner stream"); for( auto &block: *blocks ) { if(!disableBoundaries) @@ -197,7 +197,7 @@ int main( int argc, char **argv ) innerOuterSection.run([&]( auto outerStream ) { - cuda::nameStream(outerStream, "outer stream"); + gpu::nameStream(outerStream, "outer stream"); gpuComm( outerStream ); for( auto &block: *blocks ) @@ -215,7 +215,7 @@ int main( int argc, char **argv ) }; - auto boundaryStreams = cuda::ParallelStreams( streamHighPriority ); + auto boundaryStreams = gpu::ParallelStreams( streamHighPriority ); auto normalTimeStep = [&]() { gpuComm(); @@ -268,7 +268,7 @@ int main( int argc, char **argv ) auto velWriter = make_shared< field::VTKWriter<VelocityField_T> >(velFieldCpuID, "vel"); 
vtkOutput->addCellDataWriter(velWriter); vtkOutput->addBeforeFunction( [&]() { - cuda::fieldCpy<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID ); + gpu::fieldCpy<PdfField_T, gpu::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID ); for( auto & block : *blocks ) getterSweep( &block ); }); diff --git a/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU_Communication.h b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU_Communication.h index aadf51331d507ef9bb12f3fc010606f18b599491..20c301bd3b5cfecc37f332442f3dc2396a7771da 100644 --- a/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU_Communication.h +++ b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU_Communication.h @@ -7,9 +7,9 @@ #include "blockforest/communication/UniformDirectScheme.h" #include "field/communication/StencilRestrictedMPIDatatypeInfo.h" #include "field/communication/UniformMPIDatatypeInfo.h" -#include "cuda/communication/GPUPackInfo.h" -#include "cuda/communication/UniformGPUScheme.h" -#include "cuda/communication/MemcpyPackInfo.h" +#include "gpu/communication/GPUPackInfo.h" +#include "gpu/communication/UniformGPUScheme.h" +#include "gpu/communication/MemcpyPackInfo.h" #include "UniformGridGPU_PackInfo.h" @@ -36,28 +36,28 @@ public: _gpuCommunicationScheme(nullptr), _directScheme(nullptr) { auto generatedPackInfo = make_shared<pystencils::UniformGridGPU_PackInfo>( bdId ); - auto memcpyPackInfo = make_shared< cuda::communication::MemcpyPackInfo< GPUFieldType > >( bdId ); + auto memcpyPackInfo = make_shared< gpu::communication::MemcpyPackInfo< GPUFieldType > >( bdId ); auto dataTypeInfo = make_shared< field::communication::StencilRestrictedMPIDatatypeInfo< GPUFieldType, StencilType > >( bdId ); auto dataTypeInfoFull = make_shared< field::communication::UniformMPIDatatypeInfo<GPUFieldType> >( bdId ); switch(_commSchemeType) { case GPUPackInfo_Baseline: - _gpuPackInfo = make_shared< cuda::communication::GPUPackInfo< GPUFieldType > >( bdId ); 
+ _gpuPackInfo = make_shared< gpu::communication::GPUPackInfo< GPUFieldType > >( bdId ); _cpuCommunicationScheme = make_shared< blockforest::communication::UniformBufferedScheme< StencilType > >( bf ); _cpuCommunicationScheme->addPackInfo( _gpuPackInfo ); break; case GPUPackInfo_Streams: - _gpuPackInfo = make_shared< cuda::communication::GPUPackInfo< GPUFieldType > >( bdId ); + _gpuPackInfo = make_shared< gpu::communication::GPUPackInfo< GPUFieldType > >( bdId ); _cpuCommunicationScheme = make_shared< blockforest::communication::UniformBufferedScheme< StencilType > >( bf ); _cpuCommunicationScheme->addPackInfo( _gpuPackInfo ); break; case UniformGPUScheme_Baseline: - _gpuCommunicationScheme = make_shared< cuda::communication::UniformGPUScheme< StencilType > >( bf, cudaEnabledMPI ); + _gpuCommunicationScheme = make_shared< gpu::communication::UniformGPUScheme< StencilType > >( bf, cudaEnabledMPI ); _gpuCommunicationScheme->addPackInfo( generatedPackInfo ); break; case UniformGPUScheme_Memcpy: - _gpuCommunicationScheme = make_shared< cuda::communication::UniformGPUScheme< StencilType > >( bf, cudaEnabledMPI ); + _gpuCommunicationScheme = make_shared< gpu::communication::UniformGPUScheme< StencilType > >( bf, cudaEnabledMPI ); _gpuCommunicationScheme->addPackInfo( memcpyPackInfo ); break; case MPIDatatypes: @@ -151,7 +151,7 @@ public: private: CommunicationSchemeType _commSchemeType; shared_ptr< blockforest::communication::UniformBufferedScheme< StencilType > > _cpuCommunicationScheme; - shared_ptr< cuda::communication::GPUPackInfo< GPUFieldType > > _gpuPackInfo; - shared_ptr< cuda::communication::UniformGPUScheme< StencilType > > _gpuCommunicationScheme; + shared_ptr< gpu::communication::GPUPackInfo< GPUFieldType > > _gpuPackInfo; + shared_ptr< gpu::communication::UniformGPUScheme< StencilType > > _gpuCommunicationScheme; shared_ptr< blockforest::communication::UniformDirectScheme<StencilType> > _directScheme; }; diff --git 
a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py index 50d9bfd756b4bc5d463ac9848e084945e32da1bd..8de01dacf51ed5e94ac651a5ca61f50988bd3416 100755 --- a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py +++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py @@ -129,7 +129,7 @@ class Scenario: num_tries = 4 # check multiple times e.g. may fail when multiple benchmark processes are running table_name = f"runs_{data['stencil']}_{data['streamingPattern']}_{data['collisionSetup']}_{prod(self.blocks)}" - table_name = table_name.replace("-", "_") + table_name = table_name.replace("-", "_") # - not allowed for table name would lead to syntax error for num_try in range(num_tries): try: checkAndUpdateSchema(result, table_name, DB_FILE) diff --git a/apps/pythonmodule/CMakeLists.txt b/apps/pythonmodule/CMakeLists.txt index 5ea0decea67007ad01fb8e155a8667fe290dd64f..d2c6251c1e8a9efb7d7918cf620d1f81f01a1769 100644 --- a/apps/pythonmodule/CMakeLists.txt +++ b/apps/pythonmodule/CMakeLists.txt @@ -3,8 +3,8 @@ if( NOT TARGET python_coupling ) message( WARNING "python module ist not build since the python_coupling target is non-existent" ) else() if ( WALBERLA_BUILD_WITH_PYTHON ) - if ( WALBERLA_BUILD_WITH_CUDA ) - set(PYTHON_MODULE_DEPENDENCIES blockforest boundary domain_decomposition core field python_coupling timeloop vtk cuda) + if ( WALBERLA_BUILD_WITH_GPU_SUPPORT ) + set(PYTHON_MODULE_DEPENDENCIES blockforest boundary domain_decomposition core field python_coupling timeloop vtk gpu) else() set(PYTHON_MODULE_DEPENDENCIES blockforest boundary domain_decomposition core field python_coupling timeloop vtk) endif() diff --git a/apps/pythonmodule/PythonModule.cpp b/apps/pythonmodule/PythonModule.cpp index 3059e3f059e2fa110dda1681859693cb6616fc44..11fada5098e505528b4698f33f572a2f29ab586d 100644 --- a/apps/pythonmodule/PythonModule.cpp +++ 
b/apps/pythonmodule/PythonModule.cpp @@ -28,8 +28,8 @@ #include "stencil/all.h" -#ifdef WALBERLA_BUILD_WITH_CUDA - #include "python_coupling/export/CUDAExport.h" +#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT + #include "python_coupling/export/GPUExport.h" #endif @@ -75,11 +75,11 @@ struct InitObject pythonManager->addExporterFunction(blockforest::exportModuleToPython<stencil::D2Q5, stencil::D2Q9, stencil::D3Q7, stencil::D3Q19, stencil::D3Q27>); // VTK pythonManager->addExporterFunction( vtk::exportModuleToPython ); - #ifdef WALBERLA_BUILD_WITH_CUDA - using walberla::cuda::GPUField; + #ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT + using walberla::gpu::GPUField; - pythonManager->addExporterFunction( cuda::exportModuleToPython<GPU_FIELD_TYPES> ); - pythonManager->addExporterFunction( cuda::exportCopyFunctionsToPython<FIELD_TYPES> ); + pythonManager->addExporterFunction(gpu::exportModuleToPython<GPU_FIELD_TYPES> ); + pythonManager->addExporterFunction(gpu::exportCopyFunctionsToPython<FIELD_TYPES> ); pythonManager->addBlockDataConversion<GPU_FIELD_TYPES>(); #endif // diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/CMakeLists.txt b/apps/showcases/PhaseFieldAllenCahn/GPU/CMakeLists.txt index 9116f0b19b73f55ecca6994c4668d0bf85fe15e3..95b852203ce277cf293011694e9f39e8063417c9 100644 --- a/apps/showcases/PhaseFieldAllenCahn/GPU/CMakeLists.txt +++ b/apps/showcases/PhaseFieldAllenCahn/GPU/CMakeLists.txt @@ -18,4 +18,4 @@ waLBerla_generate_target_from_python(NAME PhaseFieldCodeGenGPU waLBerla_add_executable(NAME multiphaseGPU FILES multiphase.cpp PythonExports.cpp InitializerFunctions.cpp util.cpp multiphase_codegen.py - DEPENDS blockforest core cuda field postprocessing python_coupling lbm geometry timeloop PhaseFieldCodeGenGPU) + DEPENDS blockforest core gpu field postprocessing python_coupling lbm geometry timeloop PhaseFieldCodeGenGPU) diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp index 
8d5f3c49869c289c0e93130d5dfc07b6e9158f0f..2800b98cb65008ef5d66aa98853fb5589087c8d5 100644 --- a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp +++ b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp @@ -25,11 +25,11 @@ #include "core/math/Constants.h" #include "core/timing/RemainingTimeLogger.h" -#include "cuda/AddGPUFieldToStorage.h" -#include "cuda/DeviceSelectMPI.h" -#include "cuda/NVTX.h" -#include "cuda/ParallelStreams.h" -#include "cuda/communication/UniformGPUScheme.h" +#include "gpu/AddGPUFieldToStorage.h" +#include "gpu/DeviceSelectMPI.h" +#include "gpu/NVTX.h" +#include "gpu/ParallelStreams.h" +#include "gpu/communication/UniformGPUScheme.h" #include "field/AddToStorage.h" #include "field/FlagField.h" @@ -67,13 +67,13 @@ using namespace walberla; using FlagField_T = FlagField< uint8_t >; -typedef cuda::GPUField< real_t > GPUField; -typedef cuda::GPUField< uint8_t > GPUField_int; +typedef gpu::GPUField< real_t > GPUField; +typedef gpu::GPUField< uint8_t > GPUField_int; int main(int argc, char** argv) { mpi::Environment Env(argc, argv); - cuda::selectDeviceBasedOnMpiRank(); + gpu::selectDeviceBasedOnMpiRank(); exportDataStructuresToPython(); for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) @@ -114,17 +114,17 @@ int main(int argc, char** argv) BlockDataID vel_field = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx); BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx); // GPU fields - BlockDataID lb_phase_field_gpu = cuda::addGPUFieldToStorage< cuda::GPUField< real_t > >( + BlockDataID lb_phase_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( blocks, "lb phase field on GPU", Stencil_phase_T::Size, field::fzyx, 1); - BlockDataID lb_velocity_field_gpu = cuda::addGPUFieldToStorage< cuda::GPUField< real_t > >( + BlockDataID lb_velocity_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( 
blocks, "lb velocity field on GPU", Stencil_hydro_T::Size, field::fzyx, 1); BlockDataID vel_field_gpu = - cuda::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true); + gpu::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true); BlockDataID phase_field_gpu = - cuda::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true); + gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true); // Flag field BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); - BlockDataID flagFieldID_gpu = cuda::addGPUFieldToStorage< FlagField_T >(blocks, flagFieldID, "flag on GPU", true); + BlockDataID flagFieldID_gpu = gpu::addGPUFieldToStorage< FlagField_T >(blocks, flagFieldID, "flag on GPU", true); auto physical_parameters = config->getOneBlock("PhysicalParameters"); const real_t density_liquid = physical_parameters.getParameter< real_t >("density_liquid", real_c(1.0)); @@ -195,11 +195,11 @@ int main(int argc, char** argv) ////////////////////// int streamLowPriority = 0; int streamHighPriority = 0; - auto defaultStream = cuda::StreamRAII::newPriorityStream(streamLowPriority); - auto innerOuterStreams = cuda::ParallelStreams(streamHighPriority); + auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority); + auto innerOuterStreams = gpu::ParallelStreams(streamHighPriority); auto UniformGPUSchemeVelocityDistributions = - make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); + make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); auto generatedPackInfo_velocity_based_distributions = make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu); UniformGPUSchemeVelocityDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions); @@ -211,7 +211,7 @@ int main(int argc, char** argv) 
std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->wait(defaultStream); }); auto UniformGPUSchemePhaseField = - make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); + make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field_gpu); UniformGPUSchemePhaseField->addPackInfo(generatedPackInfo_phase_field); auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(defaultStream); }); @@ -220,7 +220,7 @@ int main(int argc, char** argv) auto Comm_phase_field_wait = std::function< void() >([&]() { UniformGPUSchemePhaseField->wait(defaultStream); }); auto UniformGPUSchemePhaseFieldDistributions = - make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); + make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); auto generatedPackInfo_phase_field_distributions = make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field_gpu); UniformGPUSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions); @@ -255,7 +255,7 @@ int main(int argc, char** argv) } geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID); } - cuda::fieldCpy< GPUField_int, FlagField_T >(blocks, flagFieldID_gpu, flagFieldID); + gpu::fieldCpy< GPUField_int, FlagField_T >(blocks, flagFieldID_gpu, flagFieldID); lbm::phase_field_LB_NoSlip phase_field_LB_NoSlip(blocks, lb_phase_field_gpu); lbm::hydro_LB_NoSlip hydro_LB_NoSlip(blocks, lb_velocity_field_gpu); @@ -293,8 +293,8 @@ int main(int argc, char** argv) smear_interface(); } } - cuda::fieldCpy< GPUField, PhaseField_T >(blocks, phase_field_gpu, phase_field); - WALBERLA_CUDA_CHECK(cudaPeekAtLastError()) + gpu::fieldCpy< GPUField, PhaseField_T >(blocks, phase_field_gpu, phase_field); 
+ WALBERLA_GPU_CHECK(gpuPeekAtLastError()) WALBERLA_LOG_INFO_ON_ROOT("Initialisation of the PDFs") for (auto& block : *blocks) @@ -314,9 +314,9 @@ int main(int argc, char** argv) [&]() { if (timeloop.getCurrentTimeStep() % dbWriteFrequency == 0) { - cuda::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu); - cuda::fieldCpy< VelocityField_T, GPUField >(blocks, vel_field, vel_field_gpu); - WALBERLA_CUDA_CHECK(cudaPeekAtLastError()) + gpu::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu); + gpu::fieldCpy< VelocityField_T, GPUField >(blocks, vel_field, vel_field_gpu); + WALBERLA_GPU_CHECK(gpuPeekAtLastError()) if (scenario == 4) { @@ -411,17 +411,17 @@ int main(int argc, char** argv) timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency), "remaining time logger"); - uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + uint_t const vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); if (vtkWriteFrequency > 0) { const std::string path = "vtk_out"; auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, path, "simulation_step", false, true, true, false, 0); vtkOutput->addBeforeFunction([&]() { - cuda::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu); - cuda::fieldCpy< VelocityField_T, GPUField >(blocks, vel_field, vel_field_gpu); + gpu::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu); + gpu::fieldCpy< VelocityField_T, GPUField >(blocks, vel_field, vel_field_gpu); }); - WALBERLA_CUDA_CHECK(cudaPeekAtLastError()) + WALBERLA_GPU_CHECK(gpuPeekAtLastError()) auto phaseWriter = make_shared< field::VTKWriter< PhaseField_T, float > >(phase_field, "PhaseField"); vtkOutput->addCellDataWriter(phaseWriter); @@ -435,20 +435,20 @@ int main(int argc, char** argv) timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); } - 
lbm::PerformanceEvaluation< FlagField_T > performance(blocks, flagFieldID, fluidFlagUID); + lbm::PerformanceEvaluation< FlagField_T > const performance(blocks, flagFieldID, fluidFlagUID); WcTimingPool timeloopTiming; WcTimer simTimer; WALBERLA_MPI_WORLD_BARRIER() cudaDeviceSynchronize(); - WALBERLA_CUDA_CHECK(cudaPeekAtLastError()) + WALBERLA_GPU_CHECK(gpuPeekAtLastError()) WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps") simTimer.start(); timeloop.run(timeloopTiming); cudaDeviceSynchronize(); - WALBERLA_CUDA_CHECK(cudaPeekAtLastError()) + WALBERLA_GPU_CHECK(gpuPeekAtLastError()) simTimer.end(); auto time = real_c(simTimer.max()); diff --git a/apps/tutorials/CMakeLists.txt b/apps/tutorials/CMakeLists.txt index 4eb4eb533bf24491a8ea216ac08a5f05e17ec436..fbb863629caddc3622acbef3da29d5a21162947e 100644 --- a/apps/tutorials/CMakeLists.txt +++ b/apps/tutorials/CMakeLists.txt @@ -3,8 +3,8 @@ add_subdirectory(lbm) add_subdirectory(mesa_pd) add_subdirectory(pde) add_subdirectory(pe) -if( WALBERLA_BUILD_WITH_CUDA ) - add_subdirectory(cuda) +if( WALBERLA_BUILD_WITH_GPU_SUPPORT ) + add_subdirectory(gpu) endif() if( WALBERLA_BUILD_WITH_CODEGEN ) add_subdirectory(codegen) endif() diff --git a/apps/tutorials/codegen/01_CodegenHeatEquation.dox b/apps/tutorials/codegen/01_CodegenHeatEquation.dox index ad521e0668e40d438fb7d21723be84bdf26a733e..653ec548c58f3c5ad4aa719bab1f6c1d62430746 100644 --- a/apps/tutorials/codegen/01_CodegenHeatEquation.dox +++ b/apps/tutorials/codegen/01_CodegenHeatEquation.dox @@ -90,7 +90,7 @@ with CodeGeneration() as ctx: generate_sweep(ctx, 'HeatEquationKernel', ac) \endcode -The `CodeGeneration` context and the function `generate_sweep` are provided by waLBerla. `generate_sweep` takes the desired class name and the update rule. It then generates the kernel and builds a C++ class around it. We choose `HeatEquationKernel` as the class name. 
Through the `CodeGeneration` context, the waLBerla build system gives us access to a list of CMake variables. With `ctx.cuda` for example, we can ask if waLBerla was built with support for using NVIDIA GPUs and thus we can directly generate CUDA code with pystencils. In the scope of this first tutorial, we will not make use of this. +The `CodeGeneration` context and the function `generate_sweep` are provided by waLBerla. `generate_sweep` takes the desired class name and the update rule. It then generates the kernel and builds a C++ class around it. We choose `HeatEquationKernel` as the class name. Through the `CodeGeneration` context, the waLBerla build system gives us access to a list of CMake variables. With `ctx.gpu` for example, we can ask if waLBerla was built with support for using GPUs (either by using CUDA for NVIDIA GPUs or HIP for AMD GPUs) and thus we can directly generate device code with pystencils. In the scope of this first tutorial, we will not make use of this. The code generation script will later be called by the build system while compiling the application. 
The complete script looks like this: diff --git a/apps/tutorials/codegen/02_LBMLatticeModelGeneration.cpp b/apps/tutorials/codegen/02_LBMLatticeModelGeneration.cpp index 2a83abba2c9c05726d79ed96caa8b6bfb41b8d9c..c7b9c902488989f8dfca6a9b5078619b6851dc34 100644 --- a/apps/tutorials/codegen/02_LBMLatticeModelGeneration.cpp +++ b/apps/tutorials/codegen/02_LBMLatticeModelGeneration.cpp @@ -114,16 +114,16 @@ int main(int argc, char** argv) const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(10)); const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.8)); - const double remainingTimeLoggerFrequency = - parameters.getParameter< double >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds + const real_t remainingTimeLoggerFrequency = + parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds /////////////////// /// Field Setup /// /////////////////// - LatticeModel_T latticeModel = LatticeModel_T(omega); - BlockDataID pdfFieldId = lbm::addPdfFieldToStorage(blocks, "pdf field", latticeModel, field::fzyx); - BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); + LatticeModel_T const latticeModel = LatticeModel_T(omega); + BlockDataID const pdfFieldId = lbm::addPdfFieldToStorage(blocks, "pdf field", latticeModel, field::fzyx); + BlockDataID const flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); //////////////////////// /// Shear Flow Setup /// @@ -131,7 +131,7 @@ int main(int argc, char** argv) auto shearFlowSetup = walberlaEnv.config()->getOneBlock("ShearFlowSetup"); ShearFlowInit shearFlowInitFunc(blocks, shearFlowSetup); - lbm::initializer::PdfFieldInitializer< LatticeModel_T > fieldInit(pdfFieldId, blocks); + lbm::initializer::PdfFieldInitializer< LatticeModel_T > const fieldInit(pdfFieldId, blocks); fieldInit.initDensityAndVelocity(shearFlowInitFunc); ///////////////////////// diff --git 
a/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp b/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp index 5b7790c7d31b6730f410a1b99460c3aa51c1cff5..1856106c5b61880752c7216ee10eabda140485b1 100644 --- a/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp +++ b/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp @@ -22,13 +22,10 @@ #include "core/all.h" -#if defined(WALBERLA_BUILD_WITH_CUDA) -# include "cuda/AddGPUFieldToStorage.h" -# include "cuda/DeviceSelectMPI.h" -# include "cuda/HostFieldAllocator.h" -# include "cuda/ParallelStreams.h" -# include "cuda/communication/GPUPackInfo.h" -# include "cuda/communication/UniformGPUScheme.h" +#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT) +# include "gpu/AddGPUFieldToStorage.h" +# include "gpu/ParallelStreams.h" +# include "gpu/communication/UniformGPUScheme.h" #endif #include "domain_decomposition/all.h" @@ -71,8 +68,8 @@ typedef walberla::uint8_t flag_t; typedef FlagField< flag_t > FlagField_T; typedef lbm::CumulantMRTNoSlip NoSlip_T; -#if defined(WALBERLA_BUILD_WITH_CUDA) -typedef cuda::GPUField< real_t > GPUField; +#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT) +typedef gpu::GPUField< real_t > GPUField; #endif ////////////////////////////////////////// @@ -84,8 +81,8 @@ void initShearFlowVelocityField(const shared_ptr< StructuredBlockForest >& block { math::RealRandom< real_t > rng(config.getParameter< std::mt19937::result_type >("noiseSeed", 42)); - real_t velocityMagnitude = config.getParameter< real_t >("velocityMagnitude", real_c(0.08)); - real_t noiseMagnitude = config.getParameter< real_t >("noiseMagnitude", real_c(0.1) * velocityMagnitude); + real_t const velocityMagnitude = config.getParameter< real_t >("velocityMagnitude", real_c(0.08)); + real_t const noiseMagnitude = config.getParameter< real_t >("noiseMagnitude", real_c(0.1) * velocityMagnitude); auto n_y = real_c(blocks->getNumberOfYCells()); @@ -128,8 +125,8 @@ int main(int argc, char** argv) const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", 
uint_c(10)); const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.8)); - const double remainingTimeLoggerFrequency = - parameters.getParameter< double >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds + const real_t remainingTimeLoggerFrequency = + parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds const uint_t VTKwriteFrequency = parameters.getParameter< uint_t >("VTKwriteFrequency", 1000); //////////////////////////////////// @@ -138,16 +135,16 @@ int main(int argc, char** argv) // Common Fields BlockDataID velocityFieldId = field::addToStorage< VectorField_T >(blocks, "velocity", real_c(0.0), field::fzyx); - BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); + BlockDataID const flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); -#if defined(WALBERLA_BUILD_WITH_CUDA) +#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT) // GPU Field for PDFs - BlockDataID pdfFieldId = cuda::addGPUFieldToStorage< cuda::GPUField< real_t > >( + BlockDataID const pdfFieldId = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( blocks, "pdf field on GPU", Stencil_T::Size, field::fzyx, uint_t(1)); // GPU Velocity Field BlockDataID velocityFieldIdGPU = - cuda::addGPUFieldToStorage< VectorField_T >(blocks, velocityFieldId, "velocity on GPU", true); + gpu::addGPUFieldToStorage< VectorField_T >(blocks, velocityFieldId, "velocity on GPU", true); #else // CPU Field for PDFs BlockDataID pdfFieldId = field::addToStorage< PdfField_T >(blocks, "pdf field", real_c(0.0), field::fzyx); @@ -157,11 +154,11 @@ int main(int argc, char** argv) auto shearFlowSetup = walberlaEnv.config()->getOneBlock("ShearFlowSetup"); initShearFlowVelocityField(blocks, velocityFieldId, shearFlowSetup); - real_t rho = shearFlowSetup.getParameter("rho", real_c(1.0)); + real_t const rho = shearFlowSetup.getParameter("rho", real_c(1.0)); // pdfs setup -#if 
defined(WALBERLA_BUILD_WITH_CUDA) - cuda::fieldCpy< GPUField, VectorField_T >(blocks, velocityFieldIdGPU, velocityFieldId); +#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT) + gpu::fieldCpy< GPUField, VectorField_T >(blocks, velocityFieldIdGPU, velocityFieldId); pystencils::InitialPDFsSetter pdfSetter(pdfFieldId, velocityFieldIdGPU, rho); #else pystencils::InitialPDFsSetter pdfSetter(pdfFieldId, velocityFieldId, rho); @@ -176,10 +173,10 @@ int main(int argc, char** argv) /// Sweep /// ///////////// -#if defined(WALBERLA_BUILD_WITH_CUDA) - pystencils::CumulantMRTSweep CumulantMRTSweep(pdfFieldId, velocityFieldIdGPU, omega); +#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT) + pystencils::CumulantMRTSweep const CumulantMRTSweep(pdfFieldId, velocityFieldIdGPU, omega); #else - pystencils::CumulantMRTSweep CumulantMRTSweep(pdfFieldId, velocityFieldId, omega); + pystencils::CumulantMRTSweep const CumulantMRTSweep(pdfFieldId, velocityFieldId, omega); #endif ///////////////////////// @@ -204,8 +201,9 @@ int main(int argc, char** argv) SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps); // Communication -#if defined(WALBERLA_BUILD_WITH_CUDA) - cuda::communication::UniformGPUScheme< Stencil_T > com(blocks, 0); +#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT) + const bool sendDirectlyFromGPU = false; + gpu::communication::UniformGPUScheme< Stencil_T > com(blocks, sendDirectlyFromGPU); com.addPackInfo(make_shared< PackInfo_T >(pdfFieldId)); auto communication = std::function< void() >([&]() { com.communicate(nullptr); }); #else @@ -227,10 +225,10 @@ int main(int argc, char** argv) auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "cumulant_mrt_velocity_field", VTKwriteFrequency, 0, false, path, "simulation_step", false, true, true, false, 0); -#if defined(WALBERLA_BUILD_WITH_CUDA) +#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT) // Copy velocity data to CPU before output vtkOutput->addBeforeFunction( - [&]() { cuda::fieldCpy< VectorField_T, GPUField >(blocks, 
velocityFieldId, velocityFieldIdGPU); }); + [&]() { gpu::fieldCpy< VectorField_T, GPUField >(blocks, velocityFieldId, velocityFieldIdGPU); }); #endif auto velWriter = make_shared< field::VTKWriter< VectorField_T > >(velocityFieldId, "Velocity"); diff --git a/apps/tutorials/codegen/03_AdvancedLBMCodegen.dox b/apps/tutorials/codegen/03_AdvancedLBMCodegen.dox index f3882b26c5f1cebe8a426661d5ddbd61b9bc1a15..0e3ace4036f4d7003738532adf3f225b0e8f39c4 100644 --- a/apps/tutorials/codegen/03_AdvancedLBMCodegen.dox +++ b/apps/tutorials/codegen/03_AdvancedLBMCodegen.dox @@ -7,7 +7,7 @@ namespace walberla{ This tutorial demonstrates how to use [pystencils](https://pycodegen.pages.i10git.cs.fau.de/pystencils) and [lbmpy](https://pycodegen.pages.i10git.cs.fau.de/lbmpy) to generate highly optimised and hardware-specific Lattice Boltzmann simulation code within the waLBerla framework. Other than in \ref tutorial_codegen02, we will be generating a full LBM sweep instead of a lattice model class. Furthermore, we will generate a communication pack info class and a sweep to initialise the PDF field. A hardware-specific implementation of a NoSlip boundary handler will also be generated. Those components will then be combined in a waLBerla application for simulating the same shear flow scenario as in the previous tutorial. -For large-scale LB simulations, the highly parallel design of a general-purpose graphics processing unit (GPGPU) can yield significant improvements in performance. The waLBerla framework relies on CUDA to run simulations on NVIDIA GPUs. In this tutorial, we will also show how code generation can be used to generate native CUDA code for different kinds of kernels. +For large-scale LB simulations, the highly parallel design of a general-purpose graphics processing unit (GPGPU) can yield significant improvements in performance. The waLBerla framework relies on CUDA/HIP to run simulations on NVIDIA or AMD GPUs. 
In this tutorial, we will also show how code generation can be used to generate native CUDA/HIP code for different kinds of kernels. In this tutorial, we will be using the more advanced cumulant-based multiple-relaxation-time (MRT) collision operator. Instead of relaxing the entire distribution functions toward their equilibrium values, their [cumulants](https://en.wikipedia.org/wiki/Cumulant) are relaxed with individual relaxation rates. We will also use the D2Q9 velocity set. For this velocity set, the zeroth- and first-order cumulants correspond to density and momentum which are conserved during collisions, so their relaxation rates can be set to zero. We will specify one common relaxation rate \f$ \omega \f$ for the three second-order cumulants to ensure the correct viscosity of the fluid; the higher-order cumulants will be set to their equilibrium values which correspond to a relaxation rate of 1. @@ -64,7 +64,7 @@ pdfs_setter = macroscopic_values_setter(lbm_method, pdfs.center_vector) \endcode -Everything is now prepared to generate the actual C++ code. We create the code generation context and evaluate the `ctx.cuda` flag to find out if waLBerla is configured to build GPU code. If CUDA is enabled, we set the `target` to `gpu`; otherwise to `cpu`. The target is then passed to all code generation functions. If GPU code is to be generated, the generated classes will be implemented in `*.cu` files, and their sweeps will run on the GPU. +Everything is now prepared to generate the actual C++ code. We create the code generation context and evaluate the `ctx.gpu` flag to find out if waLBerla is configured to build GPU code. If CUDA/HIP is enabled, we set the `target` to `gpu`; otherwise to `cpu`. The target is then passed to all code generation functions. If GPU code is to be generated, the generated classes will be implemented in `*.cu` files for CUDA device code or `*.cpp` for HIP device code, and their sweeps will run on the GPU. 
Several functions from `pystencils_walberla` and `lbmpy_walberla` are called to generate the classes: @@ -75,10 +75,7 @@ Several functions from `pystencils_walberla` and `lbmpy_walberla` are called to \code{.py} with CodeGeneration() as ctx: - if ctx.cuda: - target = ps.Target.GPU - else: - target = ps.Target.CPU + target = ps.Target.GPU if ctx.gpu else ps.Target.CPU # LBM Sweep generate_sweep(ctx, "CumulantMRTSweep", lbm_update_rule, field_swaps=[(pdfs, pdfs_tmp)], target=target) @@ -93,7 +90,7 @@ with CodeGeneration() as ctx: generate_boundary(ctx, "CumulantMRTNoSlip", NoSlip(), lbm_method, target=target) \endcode -As in \ref tutorial_codegen02, the classes generated by the above code need to be registered with CMake using the `walberla_generate_target_from_python` macro. Since the source file extension is different if CUDA code is generated (`*.cu` instead of `*.cpp`), the code generation target needs to be added twice. During the build process, the correct target is selected through the surrounding `if(WALBERLA_BUILD_WITH_CUDA)` block. Furthermore, the application depends on `cuda`, which is used from the waLBerla backend. +As in \ref tutorial_codegen02, the classes generated by the above code need to be registered with CMake using the `walberla_generate_target_from_python` macro. Since the source file extension for device code can be different, we use the macro `CODEGEN_FILE_SUFFIX`. This macro essentially switches to `*.cu` only if `CUDA` is used. During the build process, the correct target is selected through the surrounding `if(WALBERLA_BUILD_WITH_GPU_SUPPORT)` block, which makes the application depend on `gpu`. This refers to the `gpu` module of waLBerla. \section advancedlbmcodegen_application The waLBerla application @@ -226,9 +223,9 @@ After the velocity field has been initialised, the generated `InitialPDFsSetter` The simulation is now ready to run. 
-\subsection advancedlbmpy_cuda Differences in the GPU application +\subsection advancedlbmpy_gpu Differences in the GPU application -If CUDA is enabled, some implementation details need to be different from a CPU-only version. This mainly concerns the creation and management of fields, MPI communication and VTK output. Since the initialisation, LBM and NoSlip sweeps run entirely on the GPU, the PDF field has to be set up only in graphics memory. In contrast to that is the velocity field required by CPU and GPU. The shear flow velocity profile is constructed by CPU code before the initialisation kernel maps it onto the PDF field on the GPU. Also, the VTK output routines which run on the CPU need to read the velocity field. It thus needs to be created twice: Once in the main memory, and once in GPU memory. It is then copied on-demand from the GPU to the CPU. Furthermore, we create a flag field, which is only needed on the CPU. After the initialisation, we use it to create the index-vectors for the boundary-handling. The index vectors are then transferred to the GPU and not the entire flag field. +If `GPU_SUPPORT` is enabled, some implementation details need to be different from a CPU-only version. This mainly concerns the creation and management of fields, MPI communication and VTK output. Since the initialisation, LBM and NoSlip sweeps run entirely on the GPU, the PDF field has to be set up only in graphics memory. In contrast to that is the velocity field required by CPU and GPU. The shear flow velocity profile is constructed by CPU code before the initialisation kernel maps it onto the PDF field on the GPU. Also, the VTK output routines which run on the CPU need to read the velocity field. It thus needs to be created twice: Once in the main memory, and once in GPU memory. It is then copied on-demand from the GPU to the CPU. Furthermore, we create a flag field, which is only needed on the CPU. 
After the initialisation, we use it to create the index-vectors for the boundary-handling. The index vectors are then transferred to the GPU and not the entire flag field. For the largest part, though, the C++ code is identical. The code snippets presented above represent only the CPU variant of the code. The GPU implementation can be found in the source file 03_AdvancedLBMCodegen.cpp. There, code blocks which are different from the CPU to the GPU implementation are toggled via preprocessor conditionals. diff --git a/apps/tutorials/codegen/03_AdvancedLBMCodegen.py b/apps/tutorials/codegen/03_AdvancedLBMCodegen.py index b139c99998526e0717f6ed3a7fb249e7d504a30d..a1f5f9874e6e3589193ff719c87f3292fbed02b4 100644 --- a/apps/tutorials/codegen/03_AdvancedLBMCodegen.py +++ b/apps/tutorials/codegen/03_AdvancedLBMCodegen.py @@ -60,10 +60,7 @@ with CodeGeneration() as ctx: velocity.center_vector, pdfs.center_vector) - if ctx.cuda: - target = ps.Target.GPU - else: - target = ps.Target.CPU + target = ps.Target.GPU if ctx.gpu else ps.Target.CPU # LBM Sweep generate_sweep(ctx, "CumulantMRTSweep", lbm_update_rule, field_swaps=[(pdfs, pdfs_tmp)], target=target) diff --git a/apps/tutorials/codegen/CMakeLists.txt b/apps/tutorials/codegen/CMakeLists.txt index 339f648197b4d716c9357f19b5f03ad4fdb43ffd..2a56a5b671693abcaa241698d43d5ada5ddba916 100644 --- a/apps/tutorials/codegen/CMakeLists.txt +++ b/apps/tutorials/codegen/CMakeLists.txt @@ -24,25 +24,19 @@ if( WALBERLA_BUILD_WITH_CODEGEN ) DEPENDS blockforest core domain_decomposition field geometry timeloop lbm stencil vtk 02_LBMLatticeModelGenerationPython ) # Tutorial 3: Advanced lbmpy Code Generation - if(WALBERLA_BUILD_WITH_CUDA) - walberla_generate_target_from_python( NAME 03_AdvancedLBMCodegenPython - FILE 03_AdvancedLBMCodegen.py - OUT_FILES CumulantMRTSweep.cu CumulantMRTSweep.h - CumulantMRTPackInfo.cu CumulantMRTPackInfo.h - InitialPDFsSetter.cu InitialPDFsSetter.h - CumulantMRTNoSlip.cu CumulantMRTNoSlip.h) + 
walberla_generate_target_from_python( NAME 03_AdvancedLBMCodegenPython + FILE 03_AdvancedLBMCodegen.py + OUT_FILES CumulantMRTSweep.${CODEGEN_FILE_SUFFIX} CumulantMRTSweep.h + CumulantMRTPackInfo.${CODEGEN_FILE_SUFFIX} CumulantMRTPackInfo.h + InitialPDFsSetter.${CODEGEN_FILE_SUFFIX} InitialPDFsSetter.h + CumulantMRTNoSlip.${CODEGEN_FILE_SUFFIX} CumulantMRTNoSlip.h) + + if(WALBERLA_BUILD_WITH_GPU_SUPPORT) walberla_add_executable ( NAME 03_AdvancedLBMCodegenApp FILES 03_AdvancedLBMCodegen.cpp - DEPENDS blockforest cuda core domain_decomposition field geometry timeloop lbm stencil vtk 03_AdvancedLBMCodegenPython ) + DEPENDS blockforest gpu core domain_decomposition field geometry timeloop lbm stencil vtk 03_AdvancedLBMCodegenPython ) else() - walberla_generate_target_from_python( NAME 03_AdvancedLBMCodegenPython - FILE 03_AdvancedLBMCodegen.py - OUT_FILES CumulantMRTSweep.cpp CumulantMRTSweep.h - CumulantMRTPackInfo.cpp CumulantMRTPackInfo.h - InitialPDFsSetter.cpp InitialPDFsSetter.h - CumulantMRTNoSlip.cpp CumulantMRTNoSlip.h) - walberla_add_executable ( NAME 03_AdvancedLBMCodegenApp FILES 03_AdvancedLBMCodegen.cpp DEPENDS blockforest core domain_decomposition field geometry timeloop lbm stencil vtk 03_AdvancedLBMCodegenPython ) diff --git a/apps/tutorials/cuda/01_GameOfLife_cuda.cpp b/apps/tutorials/cuda/01_GameOfLife_cuda.cpp deleted file mode 100644 index 518acb7e16f39b0136414fd52e9b480f4ee7b11d..0000000000000000000000000000000000000000 --- a/apps/tutorials/cuda/01_GameOfLife_cuda.cpp +++ /dev/null @@ -1,148 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. 
-// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file 03_GameOfLife.cpp -//! \author Martin Bauer <martin.bauer@fau.de> -// -//====================================================================================================================== - -#include "01_GameOfLife_kernels.h" -#include "cuda/HostFieldAllocator.h" -#include "blockforest/Initialization.h" -#include "blockforest/communication/UniformDirectScheme.h" -#include "blockforest/communication/UniformBufferedScheme.h" - -#include "core/Environment.h" - -#include "cuda/HostFieldAllocator.h" -#include "cuda/FieldCopy.h" -#include "cuda/GPUField.h" -#include "cuda/Kernel.h" -#include "cuda/AddGPUFieldToStorage.h" -#include "cuda/communication/GPUPackInfo.h" -#include "cuda/FieldIndexing.h" - -#include "field/AddToStorage.h" -#include "field/communication/UniformMPIDatatypeInfo.h" -#include "field/vtk/VTKWriter.h" - -#include "geometry/initializer/ScalarFieldFromGrayScaleImage.h" -#include "geometry/structured/GrayScaleImage.h" - -#include "gui/Gui.h" - -#include "stencil/D2Q9.h" - -#include "timeloop/SweepTimeloop.h" - - -using namespace walberla; - -typedef GhostLayerField<double,1> ScalarField; -typedef cuda::GPUField<double> GPUField; - - -ScalarField * createField( IBlock* const block, StructuredBlockStorage* const storage ) -{ - return new ScalarField ( - storage->getNumberOfXCells( *block ), // number of cells in x direction per block - storage->getNumberOfYCells( *block ), // number of cells in y direction per block - storage->getNumberOfZCells( *block ), // number of cells in z direction per block - 1, // one ghost layer - double(0), 
// initial value - field::fzyx, // layout - make_shared<cuda::HostFieldAllocator<double> >() // allocator for host pinned memory - ); -} - -class GameOfLifeSweepCUDA -{ - public: - GameOfLifeSweepCUDA( BlockDataID gpuFieldSrcID, BlockDataID gpuFieldDstID ) - : gpuFieldSrcID_( gpuFieldSrcID ), gpuFieldDstID_( gpuFieldDstID ) - { - } - void operator() ( IBlock * block ) - { - auto srcCudaField = block->getData< cuda::GPUField<double> > ( gpuFieldSrcID_ ); - auto dstCudaField = block->getData< cuda::GPUField<double> > ( gpuFieldDstID_ ); - - auto myKernel = cuda::make_kernel( &gameOfLifeKernel ); - myKernel.addFieldIndexingParam( cuda::FieldIndexing<double>::xyz( *srcCudaField ) ); - myKernel.addFieldIndexingParam( cuda::FieldIndexing<double>::xyz( *dstCudaField ) ); - myKernel(); - - srcCudaField->swapDataPointers( dstCudaField ); - } - private: - BlockDataID gpuFieldSrcID_; - BlockDataID gpuFieldDstID_; -}; - - -int main( int argc, char ** argv ) -{ - walberla::Environment env( argc, argv ); - - geometry::GrayScaleImage image ("GosperGliderGun.png"); - - // Create blocks - shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid ( - uint_t(1) , uint_t(2), uint_t(1), // number of blocks in x,y,z direction - image.size( uint_t(0) ), image.size( uint_t(1) ) / uint_t(2), uint_t(1), // how many cells per block (x,y,z) - real_t(1), // dx: length of one cell in physical coordinates - false, // one block per process - "false" means all blocks to one process - false, false, false ); // no periodicity - - - BlockDataID cpuFieldID = blocks->addStructuredBlockData<ScalarField>( &createField, "CPU Field" ); - - // Initializing the field from an image - using geometry::initializer::ScalarFieldFromGrayScaleImage; - ScalarFieldFromGrayScaleImage fieldInitializer ( *blocks, cpuFieldID ) ; - fieldInitializer.init( image, uint_t(2), false ); - - BlockDataID gpuFieldSrcID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" ); - 
BlockDataID gpuFieldDstID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Dst" ); - - - typedef blockforest::communication::UniformBufferedScheme<stencil::D2Q9 > CommScheme; - typedef cuda::communication::GPUPackInfo<GPUField> Packing; - // Alternative, if CUDA enabled MPI is available - //blockforest::communication::UniformDirectScheme<stencil::D2Q9 > - //typedef field::communication::UniformMPIDatatypeInfo<GPUField> Packing - - CommScheme commScheme(blocks); - commScheme.addDataToCommunicate( make_shared<Packing>(gpuFieldSrcID) ); - - // Create Timeloop - const uint_t numberOfTimesteps = uint_t(100); // number of timesteps for non-gui runs - SweepTimeloop timeloop ( blocks, numberOfTimesteps ); - - // Registering the sweep - timeloop.add() << BeforeFunction( commScheme, "Communication" ) - << Sweep( GameOfLifeSweepCUDA(gpuFieldSrcID, gpuFieldDstID ), "GameOfLifeSweep" ); - - timeloop.add() << Sweep( cuda::fieldCpyFunctor<ScalarField, GPUField >(cpuFieldID, gpuFieldDstID) ); - - // Register VTK output - timeloop.addFuncAfterTimeStep( field::createVTKOutput<ScalarField>( cpuFieldID, *blocks, "game_of_life" ) ); - - // GUI output - GUI gui ( timeloop, blocks, argc, argv ); - gui.run(); - - return 0; -} diff --git a/apps/tutorials/cuda/01_GameOfLife_cuda.dox b/apps/tutorials/cuda/01_GameOfLife_cuda.dox deleted file mode 100644 index 7bbb50fe412080853588d0b1e0be0c8ed1a5e9a5..0000000000000000000000000000000000000000 --- a/apps/tutorials/cuda/01_GameOfLife_cuda.dox +++ /dev/null @@ -1,139 +0,0 @@ -namespace walberla{ - -/** -\page tutorial_cuda01 Tutorial - CUDA 1: Game of Life on GPU - - -\image html tutorial_cuda01_nvidia_titan.png - -> _Note:_ This tutorial required a CUDA aware MPI library. -> If you get a SEGFAULT when executing this tutorial, make sure that your MPI library was built with -> CUDA support! For instructions how to build OpenMPI with CUDA see this [page](https://www.open-mpi.org/faq/?category=building#build-cuda). 
- -\section cuda01_fields Creating Fields - -To run a simulation on a NVIDIA graphics card, we have to allocate data on the GPU and -write a CUDA kernel that operates on this data. In this tutorial we first allocate a field on the GPU -and learn about functionality to transfer data between CPU and GPU fields. - -Since initialization and output routines are usually not time critical, they are implemented -for CPU fields only. In waLBerla we set up the complete simulation using -CPU fields, copy the initialized fields over to the GPU, do the complete computation there, and, in the -end, copy everything back to do the output from the CPU field. -So only the time critical kernels have to be written in CUDA. - -Thus the setup code of the GPU GameOfLife program is very similar to its CPU version, which was implemented -in a previous tutorial ( \ref tutorial_basics_03 ). -One difference is, that fields which are often transfered from/to the GPU should be allocated with -a different field allocator: cuda::HostFieldAllocator . This allocator uses cudaHostAlloc() instead of "new" , -such that the memory is marked "pinned", which means that it is always held in RAM and cannot be swapped out to disk. -Data transfer from pinned memory is faster than from normal memory. The usage of this allocator is not -mandatory, the data transfer functions work (slightly slower) also with normally allocated fields. 
- -\code -ScalarField * createField( IBlock* const block, StructuredBlockStorage* const storage ) -{ - return new ScalarField ( - storage->getNumberOfXCells( *block ), // number of cells in x direction per block - storage->getNumberOfYCells( *block ), // number of cells in y direction per block - storage->getNumberOfZCells( *block ), // number of cells in z direction per block - 1, // one ghost layer - real_t(0), // initial value - field::fzyx, // layout - make_shared<cuda::HostFieldAllocator<double> >() // allocator for host pinned memory - ); -} -\endcode - -Now we initialize the CPU field just like in the previous tutorial \ref tutorial_basics03 . -Then two GPU fields are created: "source" and "destination" field. The helper function -cuda::addGPUFieldToStorage() creates a cuda::GPUField field of the same size and layout of the given -CPU field: -\code -BlockDataID gpuFieldSrcID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" ); -BlockDataID gpuFieldDstID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Dst" ); -\endcode -The contents of the new GPU fields are initialized with the contents of the given CPU field. - - - -\section cuda01_kernels Writing and calling CUDA kernels - -For a basic understanding of the CUDA support in waLBerla please read \ref cudaPage first. - -After reading this page you should know what a FieldAccessor is and how to call CUDA kernels from -cpp files. So we can now start with writing -a CUDA kernel for the Game of Life algorithm. We place this in a separate file with ".cu" extension. -The build system then automatically detects that this file should be compiled with the CUDA C++ compiler. - -The kernel gets two field accessors as arguments, one for the source and one for the destination field. -Both accessors have to be configured using the CUDA variables blockIdx and threadIdx, such that afterwards -the get() and getNeighbor() functions of the accessor class can work correctly. 
-\code -__global__ void gameOfLifeKernel( cuda::FieldAccessor<double> src, cuda::FieldAccessor<double> dst ) -{ - src.set( blockIdx, threadIdx ); - dst.set( blockIdx, threadIdx ); - int liveNeighbors = 0; - if ( src.getNeighbor( 1, 0,0 ) > 0.5 ) ++liveNeighbors; - if ( src.getNeighbor( -1, 0,0 ) > 0.5 ) ++liveNeighbors; - // normal Game of Life algorithm .... - // ... -} -\endcode - -To call this kernel we write a thin wrapper sweep which only has to get the GPU fields out of the blockstorage -and passes them to the CUDA kernel. We use the cuda::Kernel class from waLBerla here, so that we can write this -sweep in a normal cpp file. -Here are the contents of this sweep: -\code -auto srcCudaField = block->getData< cuda::GPUField<real_t> > ( gpuFieldSrcID_ ); -auto dstCudaField = block->getData< cuda::GPUField<real_t> > ( gpuFieldDstID_ ); - -auto myKernel = cuda::make_kernel( &gameOfLifeKernel ); -myKernel.addFieldIndexingParam( cuda::FieldIndexing<double>::xyz( *srcCudaField ) ); -myKernel.addFieldIndexingParam( cuda::FieldIndexing<double>::xyz( *dstCudaField ) ); -myKernel(); - -srcCudaField->swapDataPointers( dstCudaField ); -\endcode - -All the computations are done on the GPU. The CPU field is not updated automatically! It was just used for -setup reasons. - -To see if our kernel works, we copy the contents back to the CPU field after every timestep: -\code -timeloop.add() << Sweep( cuda::fieldCpyFunctor<ScalarField, GPUField >(cpuFieldID, gpuFieldDstID) ); -\endcode -Of course this makes no sense for real simulations, since the transfer time is much higher than the -time that was saved by doing the computation on the GPU. For production runs, one would usually transfer the -field back every n'th timestep and write e.g. a VTK frame. - - -\section cuda01_comm Communication - -In waLBerla there are two types of communication: _buffered_ and _direct_ communication. 
-While buffered communication first collects all data in a buffer and sends only one message per communciation step and neighbor -the direct communciation strategy, which is based on MPI datatypes, uses no intermediate buffers and therefore has to send -more messages than buffered communication. For details see \ref walberla_communication . - -In the tutorials up to now, only the buffered approach was used. In this tutorial, we switch to the direct communciation strategy -because then we can use the CUDA support of the MPI library to directly communciate from/to GPU memory. - -The usage of the two different communication schemes is very similar. Instead of creating a blockforest::communication::UniformBufferedScheme -we create a blockforest::communication::UniformDirectScheme. -Then we register a field::communication::UniformMPIDatatypeInfo instead of the field::communication::PackInfo. - -\code -typedef blockforest::communication::UniformDirectScheme<stencil::D2Q9 > CommScheme; -CommScheme communication( blocks ); -communication.addDataToCommunicate( make_shared<field::communication::UniformMPIDatatypeInfo<GPUField> > (gpuFieldSrcID) ); -\endcode - -This scheme also supports heterogeneous simulations, i.e. using a CPU field on -some processes and a GPU field on other processes. 
- -*/ - - -} diff --git a/apps/tutorials/cuda/01_GameOfLife_kernels.h b/apps/tutorials/cuda/01_GameOfLife_kernels.h deleted file mode 100644 index 11f5eeba25800f18b9029092b510a9b36dfe1de0..0000000000000000000000000000000000000000 --- a/apps/tutorials/cuda/01_GameOfLife_kernels.h +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -#include <iostream> - -#include "cuda/FieldAccessor.h" - - -namespace walberla { - - -__global__ void gameOfLifeKernel( cuda::FieldAccessor<double> src, cuda::FieldAccessor<double> dst ); - - -} // namespace walberla diff --git a/apps/tutorials/gpu/01_GameOfLife_cuda.cpp b/apps/tutorials/gpu/01_GameOfLife_cuda.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e73ec19962dff4cb89f523e83f6466fb685b1c69 --- /dev/null +++ b/apps/tutorials/gpu/01_GameOfLife_cuda.cpp @@ -0,0 +1,130 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file 03_GameOfLife.cpp +//! 
\author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#include "01_GameOfLife_kernels.h" +#include "blockforest/Initialization.h" + +#include "core/Environment.h" + +#include "gpu/HostFieldAllocator.h" +#include "gpu/FieldCopy.h" +#include "gpu/GPUField.h" +#include "gpu/AddGPUFieldToStorage.h" +#include "gpu/communication/MemcpyPackInfo.h" +#include "gpu/communication/UniformGPUScheme.h" + +#include "field/vtk/VTKWriter.h" + +#include "geometry/initializer/ScalarFieldFromGrayScaleImage.h" +#include "geometry/structured/GrayScaleImage.h" + +#include "stencil/D2Q9.h" + +#include "timeloop/SweepTimeloop.h" + + +using namespace walberla; + +using ScalarField = GhostLayerField<real_t, 1>; +using GPUField = gpu::GPUField<real_t>; +using CommScheme = gpu::communication::UniformGPUScheme<stencil::D2Q9 > ; +using Packing = gpu::communication::MemcpyPackInfo<GPUField> ; + + +ScalarField * createField( IBlock* const block, StructuredBlockStorage* const storage ) +{ + auto xSize = storage->getNumberOfXCells( *block ); + auto ySize = storage->getNumberOfYCells( *block ); + auto zSize = storage->getNumberOfZCells( *block ); + auto numberOfGhostLayers = uint_c(1); + auto initialValue = real_c(0); + auto fieldLayout = field::fzyx; + return new ScalarField (xSize, ySize, zSize, + numberOfGhostLayers, initialValue, fieldLayout, + make_shared< gpu::HostFieldAllocator<real_t> >() // allocator for host pinned memory + ); +} + + +int main( int argc, char ** argv ) +{ + walberla::Environment const env( argc, argv ); + + geometry::GrayScaleImage const image ("GosperGliderGun.png"); + + // Create blocks + shared_ptr< StructuredBlockForest > const blocks = blockforest::createUniformBlockGrid ( + uint_t(1) , uint_t(2), uint_t(1), // number of blocks in x,y,z direction + image.size( uint_t(0) ), image.size( uint_t(1) ) / uint_t(2), uint_t(1), // how many cells per block 
(x,y,z) + real_t(1), // dx: length of one cell in physical coordinates + false, // one block per process - "false" means all blocks to one process + false, false, false ); // no periodicity + + + BlockDataID const cpuFieldID = blocks->addStructuredBlockData<ScalarField>( &createField, "CPU Field" ); + + // Initializing the field from an image + using geometry::initializer::ScalarFieldFromGrayScaleImage; + ScalarFieldFromGrayScaleImage fieldInitializer ( *blocks, cpuFieldID ) ; + fieldInitializer.init( image, uint_t(2), false ); + + BlockDataID const gpuFieldSrcID = gpu::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" ); + BlockDataID const gpuFieldDstID = gpu::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Dst" ); + + const bool sendDirectlyFromGPU = false; + CommScheme commScheme(blocks, sendDirectlyFromGPU); + commScheme.addPackInfo( make_shared<Packing>(gpuFieldSrcID) ); + + // Create Timeloop + const uint_t numberOfTimesteps = uint_t(101); // number of timesteps for non-gui runs + SweepTimeloop timeloop ( blocks, numberOfTimesteps ); + + // Registering the sweep + timeloop.add() << BeforeFunction( commScheme.getCommunicateFunctor(), "Communication" ) + << Sweep( GameOfLifeSweepCUDA(gpuFieldSrcID, gpuFieldDstID ), "GameOfLifeSweep" ); + + // VTK Writer every vtkWriteFrequency timesteps + const uint_t vtkWriteFrequency = 2; + if (vtkWriteFrequency > 0) + { + // Create a vtkOutput object with standard arguments + auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency); + + // Before the VTK output we need to sync the GPU data to the CPU memory + vtkOutput->addBeforeFunction(gpu::fieldCpyFunctor<ScalarField, GPUField >(blocks, cpuFieldID, gpuFieldDstID)); + + // Then create a dataWriter and write the output + auto dataWriter = make_shared< field::VTKWriter< ScalarField > >(cpuFieldID, "output"); + vtkOutput->addCellDataWriter(dataWriter); + 
timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); + } + + WcTimer simTimer; + WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) + simTimer.start(); + timeloop.run(); + WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) + simTimer.end(); + WALBERLA_LOG_INFO_ON_ROOT("Simulation finished") + auto time = real_c(simTimer.last()); + WALBERLA_LOG_RESULT_ON_ROOT("Game of life tutorial finished. Elapsed time " << time) + + return EXIT_SUCCESS; +} diff --git a/apps/tutorials/gpu/01_GameOfLife_cuda.dox b/apps/tutorials/gpu/01_GameOfLife_cuda.dox new file mode 100644 index 0000000000000000000000000000000000000000..77c83e5f66df3f2cf6ab612099dd62c1bdbfcb69 --- /dev/null +++ b/apps/tutorials/gpu/01_GameOfLife_cuda.dox @@ -0,0 +1,170 @@ +namespace walberla{ + +/** +\page tutorial_gpu01 Tutorial - GPU 1: Game of Life on GPU + +\section gpu01_overview Overview + +In this tutorial, we will implement <a target="_blank" href="http://en.wikipedia.org/wiki/Conway%27s_Game_of_Life">Conway's Game of Life</a>, +the algorithm which made cellular automata popular on graphics processing units (GPUs). This tutorial runs on NVIDIA GPUs with CUDA +but can also run on AMD GPUs using HIP. waLBerla fully supports both libraries. +For a basic understanding of the GPU support in waLBerla please read \ref gpuPage first. + +This tutorial is an extension of \ref tutorial_basics_03 to GPUs. + +\section gpu01_fields Creating Fields + +To run a simulation on a graphics processing unit (GPU), we have to allocate data on the GPU and +write a kernel that operates on this data. In this tutorial we first allocate a field on the GPU +and learn about functionality to transfer data between CPU and GPU fields. + +Since initialization and output routines are usually not time critical, they are implemented +for CPU fields only. 
In waLBerla we set up the complete simulation using +CPU fields, copy the initialized fields over to the GPU, do the complete computation there, and, in the +end, copy everything back to do the output from the CPU field. +So only the time critical kernels have to be written for the GPU. + +Thus the setup code of the GPU GameOfLife program is very similar to its CPU version, which was implemented +in a previous tutorial ( \ref tutorial_basics_03 ). +One difference is that fields which are often transferred from/to the GPU should be allocated with +a different field allocator: gpu::HostFieldAllocator . This allocator uses gpuHostAlloc() instead of "new" , +such that the memory is marked "pinned", which means that it is always held in RAM and cannot be swapped out to disk. +Data transfer from pinned memory is faster than from normal memory. The usage of this allocator is not +mandatory, the data transfer functions work (slightly slower) also with normally allocated fields. + +\code +ScalarField * createField( IBlock* const block, StructuredBlockStorage* const storage ) +{ + return new ScalarField ( + storage->getNumberOfXCells( *block ), // number of cells in x direction per block + storage->getNumberOfYCells( *block ), // number of cells in y direction per block + storage->getNumberOfZCells( *block ), // number of cells in z direction per block + 1, // one ghost layer + real_t(0), // initial value + field::fzyx, // layout + make_shared<gpu::HostFieldAllocator<real_t> >() // allocator for host pinned memory + ); +} +\endcode + +Now we initialize the CPU field just like in the previous tutorial \ref tutorial_basics_03 . +Then two GPU fields are created: "source" and "destination" field. 
The helper function +gpu::addGPUFieldToStorage() creates a gpu::GPUField field of the same size and layout as the given +CPU field: +\code +BlockDataID gpuFieldSrcID = gpu::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" ); +BlockDataID gpuFieldDstID = gpu::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Dst" ); +\endcode +The contents of the new GPU fields are initialized with the contents of the given CPU field. + + + +\section gpu01_kernels Writing and calling GPU kernels + +After reading this page you should know what a FieldAccessor is and how to call GPU kernels. So we can now start with writing +a kernel for the Game of Life algorithm. We place this in a separate file with ".cu" extension (This is basically +the only part that is different between CUDA and HIP). +The build system then automatically detects that this file should be compiled with the CUDA C++ compiler. + +The kernel gets two field accessors as arguments, one for the source and one for the destination field. +Both accessors have to be configured using the variables blockIdx and threadIdx from the CUDA or HIP library, such that afterwards +the get() and getNeighbor() functions of the accessor class can work correctly. +\code +__global__ void gameOfLifeKernel( gpu::FieldAccessor<double> src, gpu::FieldAccessor<double> dst ) +{ + src.set( blockIdx, threadIdx ); + dst.set( blockIdx, threadIdx ); + int liveNeighbors = 0; + if ( src.getNeighbor( 1, 0,0 ) > 0.5 ) ++liveNeighbors; + if ( src.getNeighbor( -1, 0,0 ) > 0.5 ) ++liveNeighbors; + // normal Game of Life algorithm .... + // ... +} +\endcode + +To call this kernel we create a gpu::FieldIndexing object that receives a pointer to GPU fields. With this +the blockDim and gridDim can be obtained as well as gpuAccess objects that contain the neighbouring information needed inside the GPU kernel. +The kernel can be called normally with the three angle brackets. 
+ +\code + auto srcCudaField = block->getData< gpu::GPUField<real_t> > ( gpuFieldSrcID_ ); + auto dstCudaField = block->getData< gpu::GPUField<real_t> > ( gpuFieldDstID_ ); + + auto srcIndexing = gpu::FieldIndexing<real_t>::xyz( *srcCudaField ); + auto dstIndexing = gpu::FieldIndexing<real_t>::xyz( *dstCudaField ); + + auto srcAccess = srcIndexing.gpuAccess(); + auto dstAccess = dstIndexing.gpuAccess(); + + const dim3 gridDim = srcIndexing.gridDim(); + const dim3 blockDim = srcIndexing.blockDim(); + + gameOfLifeKernel<<<gridDim, blockDim, 0, nullptr >>>(srcAccess, dstAccess ); + + srcCudaField->swapDataPointers( dstCudaField ); +\endcode + +All the computations are done on the GPU. The CPU field is not updated automatically! It was just used for +setup reasons. + +\section gpu01_vtk VTK Output + +To see if our kernel works, we create a VTK writer. The VTK writer works on the CPU field. Thus it works exactly as in other +examples. However, since our data is on GPU we need a `addBeforeFunction` that copies our data from host to device. This is done using the gpu::fieldCpyFunctor. +Note that copying data is costly and thus we don't want to do this in every timestep usually. In this example it is only done every second timestep. + +\code + const uint_t vtkWriteFrequency = 2; + if (vtkWriteFrequency > 0) + { + auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency); + vtkOutput->addBeforeFunction(gpu::fieldCpyFunctor<ScalarField, GPUField >(blocks, cpuFieldID, gpuFieldDstID)); + + auto dataWriter = make_shared< field::VTKWriter< ScalarField > >(cpuFieldID, "output"); + vtkOutput->addCellDataWriter(dataWriter); + timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); + } +\endcode + +\section gpu01_comm Communication + +For this tutorial we use the gpu::communication::UniformGPUScheme that first collects all data in a buffer and +sends only one message per communication step and neighbor. 
For the PackInfo we use the MemcpyPackInfo. It receives +a buffer located on the GPU and fills it using memcpy operations. +If the GPU library is built with MPI support, this buffer can be sent to other GPUs without a copy to the CPU. +Otherwise the copying will be done in the background by the communication class. + +\code + using CommScheme = gpu::communication::UniformGPUScheme<stencil::D2Q9 > ; + using Packing = gpu::communication::MemcpyPackInfo<GPUField> ; + const bool sendDirectlyFromGPU = false; + CommScheme commScheme(blocks, sendDirectlyFromGPU); + commScheme.addPackInfo( make_shared<Packing>(gpuFieldSrcID) ); +\endcode + +\section gpu01_running Running the simulation + +To run the simulation we would like to point out a few common pitfalls to avoid. Basically it works very similarly to the +CPU equivalent. Since all Sweeps and Function calls are registered by the timeloop we can run the simulation using +`timeloop.run();`. However, it is important to point out that kernel calls are asynchronous. Thus for time measurement purposes +we need to make sure that all kernels are executed before stopping the timer. This can be done using `gpuDeviceSynchronize`. +For good measure we also run this function right before starting the timer. + +\code + WcTimer simTimer; + WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) + simTimer.start(); + timeloop.run(); + WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) + simTimer.end(); + WALBERLA_LOG_INFO_ON_ROOT("Simulation finished") + auto time = real_c(simTimer.last()); + WALBERLA_LOG_RESULT_ON_ROOT("Game of life tutorial finished. 
Elapsed time " << time) +\endcode + +\image html GameOfLifeGPU.png + +*/ + + +} diff --git a/apps/tutorials/cuda/01_GameOfLife_kernels.cu b/apps/tutorials/gpu/01_GameOfLife_kernels.cu similarity index 52% rename from apps/tutorials/cuda/01_GameOfLife_kernels.cu rename to apps/tutorials/gpu/01_GameOfLife_kernels.cu index 399f705c82e29d3d62b6f2c2a6db7ffaee6639ff..47a54ea7c8edbccd796661141315209604f1dba0 100644 --- a/apps/tutorials/cuda/01_GameOfLife_kernels.cu +++ b/apps/tutorials/gpu/01_GameOfLife_kernels.cu @@ -1,13 +1,10 @@ -#include "../cuda/01_GameOfLife_kernels.h" - -#include <iostream> - +#include "../gpu/01_GameOfLife_kernels.h" namespace walberla { -__global__ void gameOfLifeKernel( cuda::FieldAccessor<double> src, cuda::FieldAccessor<double> dst ) +__global__ void gameOfLifeKernel( gpu::FieldAccessor<real_t> src, gpu::FieldAccessor<real_t> dst ) { src.set( blockIdx, threadIdx ); dst.set( blockIdx, threadIdx ); @@ -34,6 +31,25 @@ __global__ void gameOfLifeKernel( cuda::FieldAccessor<double> src, cuda::FieldAc dst.get() = src.get(); } +void GameOfLifeSweepCUDA::operator()(IBlock * block) +{ + auto srcCudaField = block->getData< gpu::GPUField<real_t> > ( gpuFieldSrcID_ ); + auto dstCudaField = block->getData< gpu::GPUField<real_t> > ( gpuFieldDstID_ ); + + auto srcIndexing = gpu::FieldIndexing<real_t>::xyz( *srcCudaField ); + auto dstIndexing = gpu::FieldIndexing<real_t>::xyz( *dstCudaField ); + + auto srcAccess = srcIndexing.gpuAccess(); + auto dstAccess = dstIndexing.gpuAccess(); + + const dim3 gridDim = srcIndexing.gridDim(); + const dim3 blockDim = srcIndexing.blockDim(); + + gameOfLifeKernel<<<gridDim, blockDim, 0, nullptr >>>(srcAccess, dstAccess ); + + srcCudaField->swapDataPointers( dstCudaField ); +} + diff --git a/apps/tutorials/gpu/01_GameOfLife_kernels.h b/apps/tutorials/gpu/01_GameOfLife_kernels.h new file mode 100644 index 0000000000000000000000000000000000000000..663e3ac3f14d6e4ae3730d4fa3dda5600b0f5526 --- /dev/null +++ 
b/apps/tutorials/gpu/01_GameOfLife_kernels.h @@ -0,0 +1,31 @@ +#pragma once +#include "core/DataTypes.h" +#include "core/logging/Logging.h" + +#include "gpu/FieldIndexing.h" + +#include "field/SwapableCompare.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "domain_decomposition/StructuredBlockStorage.h" + +namespace walberla { + +class GameOfLifeSweepCUDA +{ + public: + GameOfLifeSweepCUDA( BlockDataID gpuFieldSrcID, BlockDataID gpuFieldDstID ) + : gpuFieldSrcID_( gpuFieldSrcID ), gpuFieldDstID_( gpuFieldDstID ){} + + void operator() ( IBlock * block ); + + private: + BlockDataID gpuFieldSrcID_; + BlockDataID gpuFieldDstID_; +}; + + +__global__ void gameOfLifeKernel(gpu::FieldAccessor<real_t> src, gpu::FieldAccessor<real_t> dst ); + + +} // namespace walberla diff --git a/apps/tutorials/cuda/CMakeLists.txt b/apps/tutorials/gpu/CMakeLists.txt similarity index 67% rename from apps/tutorials/cuda/CMakeLists.txt rename to apps/tutorials/gpu/CMakeLists.txt index efa4d2a554d84d69d4606594c4c2a50d4f65cf8e..14590ec05cc7b0dc8d8d7033fd9e1c3d3ffcce30 100644 --- a/apps/tutorials/cuda/CMakeLists.txt +++ b/apps/tutorials/gpu/CMakeLists.txt @@ -3,5 +3,5 @@ waLBerla_link_files_to_builddir( *.png ) waLBerla_add_executable ( NAME 01_GameOfLife_cuda FILES 01_GameOfLife_cuda.cpp 01_GameOfLife_kernels.cu - DEPENDS blockforest core cuda field lbm geometry timeloop gui ) + DEPENDS blockforest core gpu field lbm geometry timeloop ) \ No newline at end of file diff --git a/apps/tutorials/cuda/GosperGliderGun.png b/apps/tutorials/gpu/GosperGliderGun.png similarity index 100% rename from apps/tutorials/cuda/GosperGliderGun.png rename to apps/tutorials/gpu/GosperGliderGun.png diff --git a/apps/tutorials/lbm/01_BasicLBM.cpp b/apps/tutorials/lbm/01_BasicLBM.cpp index 56845bc77e4c7a9dc4cf8da3fdf23b758ea7948f..6c1d920e9c046c0c9f0d93fd87422dc5c0406165 100644 --- a/apps/tutorials/lbm/01_BasicLBM.cpp +++ b/apps/tutorials/lbm/01_BasicLBM.cpp @@ 
-57,7 +57,7 @@ int main( int argc, char ** argv ) const Vector3<real_t> initialVelocity = parameters.getParameter< Vector3<real_t> >( "initialVelocity", Vector3<real_t>() ); const uint_t timesteps = parameters.getParameter< uint_t > ( "timesteps", uint_c( 10 ) ); - const double remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 ); // in seconds + const real_t remainingTimeLoggerFrequency = parameters.getParameter< real_t >( "remainingTimeLoggerFrequency", real_c(3.0) ); // in seconds // create fields LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::SRT( omega ) ); diff --git a/apps/tutorials/lbm/02_BasicLBM_ExemplaryExtensions.cpp b/apps/tutorials/lbm/02_BasicLBM_ExemplaryExtensions.cpp index b135758cd57b8214c40749b1b63a8c7dd0dc578b..3476c73d0e2b9613e09de513d7b33891aa832908 100644 --- a/apps/tutorials/lbm/02_BasicLBM_ExemplaryExtensions.cpp +++ b/apps/tutorials/lbm/02_BasicLBM_ExemplaryExtensions.cpp @@ -563,7 +563,7 @@ int main( int argc, char ** argv ) const Vector3<real_t> initialVelocity = parameters.getParameter< Vector3<real_t> >( "initialVelocity", Vector3<real_t>() ); const uint_t timesteps = parameters.getParameter< uint_t > ( "timesteps", uint_c( 10 ) ); - const double remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 ); // in seconds + const real_t remainingTimeLoggerFrequency = parameters.getParameter< real_t >( "remainingTimeLoggerFrequency", real_c(3.0) ); // in seconds // create lattice model diff --git a/apps/tutorials/lbm/04_LBComplexGeometry.cpp b/apps/tutorials/lbm/04_LBComplexGeometry.cpp index dedc750f84b9354e9255365ba2da229aa1c3cdcb..6148efcc4ecde851cf742863b1299bd9a7247f84 100644 --- a/apps/tutorials/lbm/04_LBComplexGeometry.cpp +++ b/apps/tutorials/lbm/04_LBComplexGeometry.cpp @@ -116,8 +116,8 @@ int main(int argc, char** argv) parameters.getParameter< Vector3< real_t > >("initialVelocity", Vector3< real_t >()); const 
uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(10)); - const double remainingTimeLoggerFrequency = - parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds + const real_t remainingTimeLoggerFrequency = + parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds //! [parseDomainParameters] // read domain parameters diff --git a/apps/tutorials/lbm/06_LBBoundaryCondition.cpp b/apps/tutorials/lbm/06_LBBoundaryCondition.cpp index ea41c7251223453cf4b52e8702b0ef10f33a233b..ae6f612cbb070298a16d0329d258d6256a217756 100644 --- a/apps/tutorials/lbm/06_LBBoundaryCondition.cpp +++ b/apps/tutorials/lbm/06_LBBoundaryCondition.cpp @@ -383,13 +383,13 @@ int main(int argc, char** argv) parameters.getParameter< Vector3< real_t > >("initialVelocity", Vector3< real_t >()); const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(10)); - const double remainingTimeLoggerFrequency = - parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds + const real_t remainingTimeLoggerFrequency = + parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds // create fields - LatticeModel_T latticeModel = LatticeModel_T(lbm::collision_model::SRT(omega)); - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage(blocks, "pdf field", latticeModel, initialVelocity, real_t(1)); - BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field", FieldGhostLayers); + LatticeModel_T const latticeModel = LatticeModel_T(lbm::collision_model::SRT(omega)); + BlockDataID const pdfFieldID = lbm::addPdfFieldToStorage(blocks, "pdf field", latticeModel, initialVelocity, real_t(1)); + BlockDataID const flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field", FieldGhostLayers); // create and initialize boundary handling @@ -409,11 +409,11 @@ int main(int argc, char** argv) setup.omega = omega; //! 
[timeTracker] - std::shared_ptr< lbm::TimeTracker > timeTracker = std::make_shared< lbm::TimeTracker >(); + std::shared_ptr< lbm::TimeTracker > const timeTracker = std::make_shared< lbm::TimeTracker >(); //! [timeTracker] //! [boundaryHandlingID] - BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >( + BlockDataID const boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >( MyBoundaryHandling(flagFieldID, pdfFieldID, setup, timeTracker), "boundary handling"); //! [boundaryHandlingID] @@ -453,7 +453,7 @@ int main(int argc, char** argv) auto vtkConfig = walberlaEnv.config()->getBlock("VTK"); - uint_t writeFrequency = vtkConfig.getBlock("fluid_field").getParameter< uint_t >("writeFrequency", uint_t(100)); + uint_t const writeFrequency = vtkConfig.getBlock("fluid_field").getParameter< uint_t >("writeFrequency", uint_t(100)); auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "fluid_field", writeFrequency, FieldGhostLayers, false, "vtk_out", "simulation_step", false, true, true, false, 0); diff --git a/cmake/waLBerlaHelperFunctions.cmake b/cmake/waLBerlaHelperFunctions.cmake index c601d2d395d4335e4f66b3550d6e5174d74cffc4..efd2d0576eaa072dbfc8b3c278bb0ad15002eb70 100644 --- a/cmake/waLBerlaHelperFunctions.cmake +++ b/cmake/waLBerlaHelperFunctions.cmake @@ -67,6 +67,7 @@ function( waLBerla_generate_target_from_python ) "\"CODEGEN_CFG\": \"${codegenCfg}\"," "\"WALBERLA_BUILD_WITH_MPI\": \"${WALBERLA_BUILD_WITH_MPI}\"," "\"WALBERLA_BUILD_WITH_CUDA\": \"${WALBERLA_BUILD_WITH_CUDA}\"," + "\"WALBERLA_BUILD_WITH_HIP\": \"${WALBERLA_BUILD_WITH_HIP}\"," "\"WALBERLA_BUILD_WITH_OPENMP\": \"${WALBERLA_BUILD_WITH_OPENMP}\" \\\}" ) string(REPLACE "\"" "\\\"" cmakeVars ${cmakeVars}) # even one more quoting level required diff --git a/doc/Mainpage.dox b/doc/Mainpage.dox index 8ba191634cf8a090b6d053db0a449cf50b41189f..f6a7ed06a90bd02b87e66b69137478c0a67f4430 100644 --- a/doc/Mainpage.dox +++ b/doc/Mainpage.dox @@ -47,6 +47,14 @@ 
all the basic data strcutures and concepts of the framework. - \ref tutorial_lbm06 \n This tutorial deals with the usage of different LBM boundary conditions. +\subsection advanced_topics Advanced Topics + +\subsection gpu GPU + +- \ref tutorial_gpu01 \n + A simple tutorial for Game of Life on GPU + + \subsection codegen Code Generation - \ref tutorial_codegen01 \n diff --git a/doc/pics/GameOfLifeGPU.png b/doc/pics/GameOfLifeGPU.png new file mode 100644 index 0000000000000000000000000000000000000000..769b250806abbab16fd74691e8d2a0e3154826e4 Binary files /dev/null and b/doc/pics/GameOfLifeGPU.png differ diff --git a/doc/setup.dox b/doc/setup.dox index ab9ba0588aa13b53cb631ed84d226f2c3a03d520..4afde5ea5edee04d21c6429ee606959eae0f886b 100644 --- a/doc/setup.dox +++ b/doc/setup.dox @@ -68,6 +68,8 @@ WALBERLA_BUILD_WITH_OPENMP | OFF | Enables/Disables OpenMP support WALBERLA_BUILD_TESTS | OFF | If enabled, all tests are built when running make in the root build folder. But you can always go to a specific directory in your test folder and manually run make. WALBERLA_BUILD_BENCHMARKS | ON | Enables/Disables the automatic build of all benchmarks located in "apps/benchmarks". WALBERLA_BUILD_WITH_PYTHON | OFF | Enables Python Support inside waLBerla (embedded Python). Then you can use Python scripts as configuration files and start an embedded python interpreter that can access waLBerla data structures. This builds a shared library (and python module) walberla_cpp.so in "apps/pythonmodule" so that you can use walberla from python. +WALBERLA_BUILD_WITH_CUDA | OFF | Enables/Disables support to run waLBerla on NVIDIA GPUs. +WALBERLA_BUILD_WITH_HIP | OFF | Enables/Disables support to run waLBerla on AMD GPUs. For a list of all switches, see CMakeLists.txt in the root source folder. 
diff --git a/python/pystencils_walberla/boundary.py b/python/pystencils_walberla/boundary.py index ee035a636a6ed08372999594352a971836777ec0..4fc9cf6e517d9b513511530eb05b5dab9eb10edd 100644 --- a/python/pystencils_walberla/boundary.py +++ b/python/pystencils_walberla/boundary.py @@ -107,7 +107,7 @@ def generate_boundary(generation_context, header = env.get_template('Boundary.tmpl.h').render(**context) source = env.get_template('Boundary.tmpl.cpp').render(**context) - source_extension = "cpp" if target == Target.CPU else "cu" + source_extension = "cu" if target == Target.GPU and generation_context.cuda else "cpp" generation_context.write_file(f"{class_name}.h", header) generation_context.write_file(f"{class_name}.{source_extension}", source) diff --git a/python/pystencils_walberla/cmake_integration.py b/python/pystencils_walberla/cmake_integration.py index 2656cac32f9939a65ee61f7fb74541355d279bb2..932e5ce69dbc8309c8000b53e2fd9a34b21e2f4a 100644 --- a/python/pystencils_walberla/cmake_integration.py +++ b/python/pystencils_walberla/cmake_integration.py @@ -20,6 +20,7 @@ DEFAULT_CMAKE_VARS = {'WALBERLA_BUILD_WITH_OPENMP': False, 'WALBERLA_DOUBLE_ACCURACY': True, 'WALBERLA_BUILD_WITH_MPI': True, 'WALBERLA_BUILD_WITH_CUDA': False, + 'WALBERLA_BUILD_WITH_HIP': False, "CODEGEN_CFG": ""} PARSE_HELPER = {"on": True, "1": True, "yes": True, "true": True, @@ -73,6 +74,8 @@ class CodeGenerationContext: self.mpi = cmake_vars['WALBERLA_BUILD_WITH_MPI'] self.double_accuracy = cmake_vars['WALBERLA_DOUBLE_ACCURACY'] self.cuda = cmake_vars['WALBERLA_BUILD_WITH_CUDA'] + self.hip = cmake_vars['WALBERLA_BUILD_WITH_HIP'] + self.gpu = self.cuda or self.hip self.config = cmake_vars['CODEGEN_CFG'].strip() def write_file(self, name, content): @@ -87,13 +90,16 @@ class ManualCodeGenerationContext: to constructor instead of getting them from CMake """ - def __init__(self, openmp=False, optimize_for_localhost=False, mpi=True, double_accuracy=True, cuda=False): + def __init__(self, openmp=False, 
optimize_for_localhost=False, mpi=True, double_accuracy=True, + cuda=False, hip=False): self.openmp = openmp self.optimize_for_localhost = optimize_for_localhost self.mpi = mpi self.double_accuracy = double_accuracy self.files = dict() self.cuda = cuda + self.hip = hip + self.gpu = self.cuda or self.hip self.config = "" def write_file(self, name, content): diff --git a/python/pystencils_walberla/codegen.py b/python/pystencils_walberla/codegen.py index c5cefc06ea23d0892804b251d21cedd6cd6d67e8..9e6ada3b86c6d57757d5ca814a481b41a571d0c5 100644 --- a/python/pystencils_walberla/codegen.py +++ b/python/pystencils_walberla/codegen.py @@ -124,7 +124,7 @@ def generate_selective_sweep(generation_context, class_name, selection_tree, int elif target != kernel_family.get_ast_attr('target'): raise ValueError('Mismatch between target parameter and AST targets.') - if not generation_context.cuda and target == Target.GPU: + if not (generation_context.cuda or generation_context.hip) and target == Target.GPU: return representative_field = {p.field_name for p in kernel_family.parameters if p.is_field_parameter} @@ -152,7 +152,7 @@ def generate_selective_sweep(generation_context, class_name, selection_tree, int header = env.get_template("Sweep.tmpl.h").render(**jinja_context) source = env.get_template("Sweep.tmpl.cpp").render(**jinja_context) - source_extension = "cpp" if target == Target.CPU else "cu" + source_extension = "cu" if target == Target.GPU and generation_context.cuda else "cpp" generation_context.write_file(f"{class_name}.h", header) generation_context.write_file(f"{class_name}.{source_extension}", source) @@ -344,7 +344,7 @@ def generate_pack_info(generation_context, class_name: str, header = env.get_template(template_name + ".h").render(**jinja_context) source = env.get_template(template_name + ".cpp").render(**jinja_context) - source_extension = "cpp" if config.target == Target.CPU else "cu" + source_extension = "cu" if target == Target.GPU and generation_context.cuda 
else "cpp" generation_context.write_file(f"{class_name}.h", header) generation_context.write_file(f"{class_name}.{source_extension}", source) @@ -446,14 +446,16 @@ class KernelInfo: indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols) sp_printer_c = CudaSympyPrinter() + + block = tuple(sp_printer_c.doprint(e) for e in indexing_dict['block']) + grid = tuple(sp_printer_c.doprint(e) for e in indexing_dict['grid']) + + kernel_launch = f"internal_{ast.function_name}::{ast.function_name}<<<_grid, _block, 0, {stream}>>>({call_parameters});" + kernel_call_lines = [ - "dim3 _block(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) - for e in indexing_dict['block']), - "dim3 _grid(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) - for e in indexing_dict['grid']), - "internal_%s::%s<<<_grid, _block, 0, %s>>>(%s);" % (ast.function_name, ast.function_name, - stream, call_parameters), - ] + f"dim3 _block(uint32_t({block[0]}), uint32_t({block[1]}), uint32_t({block[2]}));", + f"dim3 _grid(uint32_t({grid[0]}), uint32_t({grid[1]}), uint32_t({grid[2]}));", + kernel_launch] return "\n".join(kernel_call_lines) else: @@ -477,9 +479,9 @@ def get_vectorize_instruction_set(generation_context): def config_from_context(generation_context, target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None, **kwargs): - if target == Target.GPU and not generation_context.cuda: - raise ValueError("can not generate cuda code if waLBerla is not build with CUDA. Please use " - "-DWALBERLA_BUILD_WITH_CUDA=1 for configuring cmake") + if target == Target.GPU and not generation_context.gpu: + raise ValueError("can not generate device code if waLBerla is not build with CUDA or HIP. 
Please use " + "-DWALBERLA_BUILD_WITH_CUDA=1 or -DWALBERLA_BUILD_WITH_HIP=1 for configuring cmake") default_dtype = "float64" if generation_context.double_accuracy else "float32" if data_type is None: diff --git a/python/pystencils_walberla/jinja_filters.py b/python/pystencils_walberla/jinja_filters.py index 0ee3f3cd128e2b9e9f7c90cbb685457022f4529f..61ca6f12ac1eabfc6c1be09aad836b424c34aaee 100644 --- a/python/pystencils_walberla/jinja_filters.py +++ b/python/pystencils_walberla/jinja_filters.py @@ -1,5 +1,3 @@ -import jinja2 - # For backward compatibility with version < 3.0.0 try: from jinja2 import pass_context as jinja2_context_decorator @@ -58,7 +56,7 @@ def translate_target(target): def make_field_type(dtype, f_size, is_gpu): if is_gpu: - return f"cuda::GPUField<{dtype}>" + return f"gpu::GPUField<{dtype}>" else: return f"field::GhostLayerField<{dtype}, {f_size}>" @@ -236,7 +234,7 @@ def generate_call(ctx, kernel, ghost_layers_to_include=0, cell_interval=None, st cell_interval: Defines the name (string) of a walberla CellInterval object in scope, that defines the inner region for the kernel to loop over. Parameter has to be left to default if ghost_layers_to_include is specified. - stream: optional name of cuda stream variable + stream: optional name of gpu stream variable spatial_shape_symbols: relevant only for gpu kernels - to determine CUDA block and grid sizes the iteration region (i.e. field shape) has to be known. 
This can normally be inferred by the kernel parameters - however in special cases like boundary conditions a manual specification @@ -305,21 +303,21 @@ def generate_call(ctx, kernel, ghost_layers_to_include=0, cell_interval=None, st coord_set = set(coordinates) coord_set = sorted(coord_set, key=lambda e: str(e)) for c in coord_set: - kernel_call_lines.append(f"WALBERLA_ASSERT_GREATER_EQUAL({c}, -{actual_gls});") + kernel_call_lines.append(f"WALBERLA_ASSERT_GREATER_EQUAL({c}, -{actual_gls})") while len(coordinates) < 4: coordinates.append(0) coordinates = tuple(coordinates) kernel_call_lines.append(f"{param.symbol.dtype} {param.symbol.name} = {param.field_name}->dataAt" f"({coordinates[0]}, {coordinates[1]}, {coordinates[2]}, {coordinates[3]});") if assume_inner_stride_one and field.index_dimensions > 0: - kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL({param.field_name}->layout(), field::fzyx);") + kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL({param.field_name}->layout(), field::fzyx)") if instruction_set and assume_aligned: if nontemporal and cpu_openmp and 'cachelineZero' in instruction_set: kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL((uintptr_t) {field.name}->dataAt(0, 0, 0, 0) %" - f"{instruction_set['cachelineSize']}, 0);") + f"{instruction_set['cachelineSize']}, 0)") else: kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL((uintptr_t) {field.name}->dataAt(0, 0, 0, 0) %" - f"{instruction_set['bytes']}, 0);") + f"{instruction_set['bytes']}, 0)") elif param.is_field_stride: casted_stride = get_field_stride(param) type_str = param.symbol.dtype.c_name @@ -331,17 +329,17 @@ def generate_call(ctx, kernel, ghost_layers_to_include=0, cell_interval=None, st shape = f"{type_str}({get_end_coordinates(field)[coord]})" assert coord < 3 max_value = f"{field.name}->{('x', 'y', 'z')[coord]}SizeWithGhostLayer()" - kernel_call_lines.append(f"WALBERLA_ASSERT_GREATER_EQUAL({max_value}, {shape});") + kernel_call_lines.append(f"WALBERLA_ASSERT_GREATER_EQUAL({max_value}, 
{shape})") kernel_call_lines.append(f"const {type_str} {param.symbol.name} = {shape};") if assume_inner_stride_one and field.index_dimensions > 0: - kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL({field.name}->layout(), field::fzyx);") + kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL({field.name}->layout(), field::fzyx)") if instruction_set and assume_aligned: if nontemporal and cpu_openmp and 'cachelineZero' in instruction_set: kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL((uintptr_t) {field.name}->dataAt(0, 0, 0, 0) %" - f"{instruction_set['cachelineSize']}, 0);") + f"{instruction_set['cachelineSize']}, 0)") else: kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL((uintptr_t) {field.name}->dataAt(0, 0, 0, 0) %" - f"{instruction_set['bytes']}, 0);") + f"{instruction_set['bytes']}, 0)") kernel_call_lines.append(kernel.generate_kernel_invocation_code(stream=stream, spatial_shape_symbols=spatial_shape_symbols)) @@ -357,11 +355,15 @@ def generate_swaps(kernel_info): return swaps +# TODO: basically 3 times the same code :( def generate_constructor_initializer_list(kernel_info, parameters_to_ignore=None): if parameters_to_ignore is None: parameters_to_ignore = [] - parameters_to_ignore += kernel_info.temporary_fields + varying_parameter_names = [] + if hasattr(kernel_info, 'varying_parameters'): + varying_parameter_names = tuple(e[1] for e in kernel_info.varying_parameters) + parameters_to_ignore += kernel_info.temporary_fields + varying_parameter_names parameter_initializer_list = [] # First field pointer diff --git a/python/pystencils_walberla/kernel_selection.py b/python/pystencils_walberla/kernel_selection.py index b2e831cc8e6a8fb111d9620c154d98ef1c1c7717..c62f441775edc763f4ad41b2e9c218a5b86930d8 100644 --- a/python/pystencils_walberla/kernel_selection.py +++ b/python/pystencils_walberla/kernel_selection.py @@ -153,14 +153,16 @@ class KernelCallNode(AbstractKernelSelectionNode): indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols) sp_printer_c = 
CudaSympyPrinter() + + block = tuple(sp_printer_c.doprint(e) for e in indexing_dict['block']) + grid = tuple(sp_printer_c.doprint(e) for e in indexing_dict['grid']) + + kernel_launch = f"internal_{ast.function_name}::{ast.function_name}<<<_grid, _block, 0, {stream}>>>({call_parameters});" + kernel_call_lines = [ - "dim3 _block(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) - for e in indexing_dict['block']), - "dim3 _grid(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) - for e in indexing_dict['grid']), - "internal_%s::%s<<<_grid, _block, 0, %s>>>(%s);" % (ast.function_name, ast.function_name, - stream, call_parameters), - ] + f"dim3 _block(uint32_t({block[0]}), uint32_t({block[1]}), uint32_t({block[2]}));", + f"dim3 _grid(uint32_t({grid[0]}), uint32_t({grid[1]}), uint32_t({grid[2]}));", + kernel_launch] return "\n".join(kernel_call_lines) else: diff --git a/python/pystencils_walberla/templates/Boundary.tmpl.cpp b/python/pystencils_walberla/templates/Boundary.tmpl.cpp index a7b1c064ef752aff2a5b1edc435aedddc7d6d966..b4fe6df4794d61f06c2a1a900879871e7ac9f14d 100644 --- a/python/pystencils_walberla/templates/Boundary.tmpl.cpp +++ b/python/pystencils_walberla/templates/Boundary.tmpl.cpp @@ -23,7 +23,7 @@ #include "core/Macros.h" #include "{{class_name}}.h" {% if target == 'gpu' -%} -#include "cuda/ErrorChecking.h" +#include "gpu/ErrorChecking.h" {%- endif %} @@ -67,7 +67,7 @@ namespace {{namespace}} { void {{class_name}}::run_impl( {{- ["IBlock * block", "IndexVectors::Type type", - kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] + kernel.kernel_selection_parameters, ["gpuStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) { @@ -90,21 +90,21 @@ void {{class_name}}::run_impl( } void {{class_name}}::run( - {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} + {{- ["IBlock * block", 
kernel.kernel_selection_parameters, ["gpuStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) { run_impl( {{- ["block", "IndexVectors::ALL", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] | identifier_list -}} ); } void {{class_name}}::inner( - {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} + {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) { run_impl( {{- ["block", "IndexVectors::INNER", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] | identifier_list -}} ); } void {{class_name}}::outer( - {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} + {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) { run_impl( {{- ["block", "IndexVectors::OUTER", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] | identifier_list -}} ); diff --git a/python/pystencils_walberla/templates/Boundary.tmpl.h b/python/pystencils_walberla/templates/Boundary.tmpl.h index f454a975da5bbb9cd7649378c4644dfb8c9d53a9..96a9202c19345f0e36c5e048be1ee65969f5c966 100644 --- a/python/pystencils_walberla/templates/Boundary.tmpl.h +++ b/python/pystencils_walberla/templates/Boundary.tmpl.h @@ -23,8 +23,9 @@ {% if target is equalto 'cpu' -%} #include "field/GhostLayerField.h" {%- elif target is equalto 'gpu' -%} -#include "cuda/GPUField.h" -#include "cuda/FieldCopy.h" +#include "gpu/FieldCopy.h" +#include "gpu/GPUField.h" +#include "gpu/GPUWrapper.h" {%- endif %} #include "domain_decomposition/BlockDataID.h" #include "domain_decomposition/IBlock.h" @@ -75,7 +76,7 @@ public: {% if target == 'gpu' -%} ~IndexVectors() { for( auto & gpuVec: gpuVectors_) - cudaFree( 
gpuVec ); + WALBERLA_GPU_CHECK(gpuFree( gpuVec )); } {% endif -%} @@ -90,7 +91,7 @@ public: { {% if target == 'gpu' -%} for( auto & gpuVec: gpuVectors_) - cudaFree( gpuVec ); + WALBERLA_GPU_CHECK(gpuFree( gpuVec )); gpuVectors_.resize( cpuVectors_.size() ); WALBERLA_ASSERT_EQUAL(cpuVectors_.size(), NUM_TYPES); @@ -98,8 +99,8 @@ public: { auto & gpuVec = gpuVectors_[i]; auto & cpuVec = cpuVectors_[i]; - cudaMalloc( &gpuVec, sizeof({{StructName}}) * cpuVec.size() ); - cudaMemcpy( gpuVec, &cpuVec[0], sizeof({{StructName}}) * cpuVec.size(), cudaMemcpyHostToDevice ); + WALBERLA_GPU_CHECK(gpuMalloc( &gpuVec, sizeof({{StructName}}) * cpuVec.size() )); + WALBERLA_GPU_CHECK(gpuMemcpy( gpuVec, &cpuVec[0], sizeof({{StructName}}) * cpuVec.size(), gpuMemcpyHostToDevice )); } {%- endif %} } @@ -122,12 +123,12 @@ public: }; void run ( - {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} + {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); {% if generate_functor -%} void operator() ( - {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} + {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) { run( {{- ["block", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] | identifier_list -}} ); @@ -135,28 +136,28 @@ public: {%- endif %} void inner ( - {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} + {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); void outer ( - {{- ["IBlock * block", 
kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} + {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); - std::function<void (IBlock *)> getSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) + std::function<void (IBlock *)> getSweep( {{- [interface_spec.high_level_args, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) { return [ {{- ["this", interface_spec.high_level_args, ["stream"] if target == 'gpu' else []] | identifier_list -}} ] (IBlock * b) { this->run( {{- [ ['b'], interface_spec.mapping_codes, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ); }; } - std::function<void (IBlock *)> getInnerSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) + std::function<void (IBlock *)> getInnerSweep( {{- [interface_spec.high_level_args, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) { return [ {{- [ ['this'], interface_spec.high_level_args, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ] (IBlock * b) { this->inner( {{- [ ['b'], interface_spec.mapping_codes, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ); }; } - std::function<void (IBlock *)> getOuterSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) + std::function<void (IBlock *)> getOuterSweep( {{- [interface_spec.high_level_args, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) { return [ {{- [ ['this'], interface_spec.high_level_args, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ] (IBlock * b) @@ -299,7 +300,7 @@ 
public: private: void run_impl( {{- ["IBlock * block", "IndexVectors::Type type", - kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] + kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); diff --git a/python/pystencils_walberla/templates/GpuPackInfo.tmpl.cpp b/python/pystencils_walberla/templates/GpuPackInfo.tmpl.cpp index db79ae375bf8c815da1623abe6961aa727bc1ede..054d589ecbc43addfbd20a6009c65d873f56e802 100644 --- a/python/pystencils_walberla/templates/GpuPackInfo.tmpl.cpp +++ b/python/pystencils_walberla/templates/GpuPackInfo.tmpl.cpp @@ -1,9 +1,11 @@ -#include "stencil/Directions.h" -#include "core/cell/CellInterval.h" -#include "cuda/GPUField.h" #include "core/DataTypes.h" -#include "{{class_name}}.h" +#include "core/cell/CellInterval.h" +#include "stencil/Directions.h" + +#include "gpu/GPUField.h" +#include "gpu/GPUWrapper.h" +#include "{{class_name}}.h" {% if target is equalto 'cpu' -%} #define FUNC_PREFIX @@ -29,7 +31,7 @@ using walberla::stencil::Direction; -void {{class_name}}::pack(Direction dir, unsigned char * byte_buffer, IBlock * block, cudaStream_t stream) +void {{class_name}}::pack(Direction dir, unsigned char * byte_buffer, IBlock * block, gpuStream_t stream) { {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(byte_buffer); @@ -59,7 +61,7 @@ void {{class_name}}::pack(Direction dir, unsigned char * byte_buffer, IBlock * b } -void {{class_name}}::unpack(Direction dir, unsigned char * byte_buffer, IBlock * block, cudaStream_t stream) +void {{class_name}}::unpack(Direction dir, unsigned char * byte_buffer, IBlock * block, gpuStream_t stream) { {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(byte_buffer); diff --git a/python/pystencils_walberla/templates/GpuPackInfo.tmpl.h b/python/pystencils_walberla/templates/GpuPackInfo.tmpl.h index 8b70e1cb8dce1898f8a5c955c59f810bc3353aa8..2b182905cd8584794ba53108f072f1da5abb37bc 100644 --- 
a/python/pystencils_walberla/templates/GpuPackInfo.tmpl.h +++ b/python/pystencils_walberla/templates/GpuPackInfo.tmpl.h @@ -1,11 +1,14 @@ #pragma once -#include "stencil/Directions.h" -#include "core/cell/CellInterval.h" -#include "cuda/GPUField.h" #include "core/DataTypes.h" +#include "core/cell/CellInterval.h" + #include "domain_decomposition/IBlock.h" -#include "cuda/communication/GeneratedGPUPackInfo.h" +#include "stencil/Directions.h" + +#include "gpu/GPUField.h" +#include "gpu/GPUWrapper.h" +#include "gpu/communication/GeneratedGPUPackInfo.h" {% if target is equalto 'cpu' -%} #define FUNC_PREFIX @@ -25,7 +28,7 @@ namespace walberla { namespace {{namespace}} { -class {{class_name}} : public ::walberla::cuda::GeneratedGPUPackInfo +class {{class_name}} : public ::walberla::gpu::GeneratedGPUPackInfo { public: {{class_name}}( {{fused_kernel|generate_constructor_parameters(parameters_to_ignore=['buffer'])}} ) @@ -33,8 +36,8 @@ public: {}; virtual ~{{class_name}}() {} - virtual void pack (stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream); - virtual void unpack(stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream); + virtual void pack (stencil::Direction dir, unsigned char * buffer, IBlock * block, gpuStream_t stream); + virtual void unpack(stencil::Direction dir, unsigned char * buffer, IBlock * block, gpuStream_t stream); virtual uint_t size (stencil::Direction dir, IBlock * block); private: diff --git a/python/pystencils_walberla/templates/Sweep.tmpl.cpp b/python/pystencils_walberla/templates/Sweep.tmpl.cpp index 96e589e1e549bd3ad28075e663ff399131dc4a33..10e180fb56a916f79352660fd0bbdd0c3b136c01 100644 --- a/python/pystencils_walberla/templates/Sweep.tmpl.cpp +++ b/python/pystencils_walberla/templates/Sweep.tmpl.cpp @@ -56,7 +56,7 @@ namespace {{namespace}} { {{kernel|generate_definitions(target, max_threads)}} -void {{class_name}}::run( {{- ["IBlock * block", kernel.kernel_selection_parameters, 
["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) +void {{class_name}}::run( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) { {{kernel|generate_block_data_to_field_extraction|indent(4)}} {{kernel|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True)|indent(4) }} @@ -67,7 +67,7 @@ void {{class_name}}::run( {{- ["IBlock * block", kernel.kernel_selection_paramet void {{class_name}}::runOnCellInterval( {{- ["const shared_ptr<StructuredBlockStorage> & blocks", "const CellInterval & globalCellInterval", "cell_idx_t ghostLayers", "IBlock * block", - kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] + kernel.kernel_selection_parameters, ["gpuStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) { @@ -86,7 +86,7 @@ void {{class_name}}::runOnCellInterval( } {%if inner_outer_split%} -void {{class_name}}::inner( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) +void {{class_name}}::inner( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) { {{kernel|generate_block_data_to_field_extraction|indent(4)}} @@ -98,7 +98,7 @@ void {{class_name}}::inner( {{- ["IBlock * block", kernel.kernel_selection_param } -void {{class_name}}::outer( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) +void {{class_name}}::outer( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) { {{kernel|generate_block_data_to_field_extraction|indent(4)}} diff --git a/python/pystencils_walberla/templates/Sweep.tmpl.h b/python/pystencils_walberla/templates/Sweep.tmpl.h 
index 093e2332ac4e5925305701a4f0d54887f18dffe1..599ade337a0b9ebf08d67d247bc4e0474c2b7ead 100644 --- a/python/pystencils_walberla/templates/Sweep.tmpl.h +++ b/python/pystencils_walberla/templates/Sweep.tmpl.h @@ -23,9 +23,10 @@ {% if target is equalto 'cpu' -%} #include "field/GhostLayerField.h" {%- elif target is equalto 'gpu' -%} -#include "cuda/GPUField.h" +#include "gpu/GPUField.h" +#include "gpu/GPUWrapper.h" {% if inner_outer_split -%} -#include "cuda/ParallelStreams.h" +#include "gpu/ParallelStreams.h" {%- endif %} {%- endif %} #include "field/SwapableCompare.h" @@ -65,17 +66,17 @@ public: {{ kernel| generate_destructor(class_name) |indent(4) }} - void run( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); + void run( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); void runOnCellInterval( {{- ["const shared_ptr<StructuredBlockStorage> & blocks", "const CellInterval & globalCellInterval", "cell_idx_t ghostLayers", "IBlock * block", - kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] + kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); {% if generate_functor %} void operator() ( - {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} + {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) { run( {{- ["block", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] | identifier_list -}} ); @@ -91,14 +92,14 @@ public: { kernel->run( {{- [ ['b'], kernel.kernel_selection_parameters] | identifier_list -}} ); }; } - static std::function<void 
(IBlock* {%- if target is equalto 'gpu' %}, cudaStream_t {% endif -%} )> getSweepOnCellInterval( + static std::function<void (IBlock* {%- if target is equalto 'gpu' %}, gpuStream_t {% endif -%} )> getSweepOnCellInterval( {{- ["const shared_ptr<" + class_name + "> & kernel", "const shared_ptr<StructuredBlockStorage> & blocks", "const CellInterval & globalCellInterval", kernel.kernel_selection_parameters, 'cell_idx_t ghostLayers=1'] | type_identifier_list -}} ) { return [ {{- ["kernel", "blocks", "globalCellInterval", "ghostLayers", kernel.kernel_selection_parameters] | identifier_list -}} ] - (IBlock * b{%- if target is equalto 'gpu'%}, cudaStream_t stream = nullptr{% endif -%}) + (IBlock * b{%- if target is equalto 'gpu'%}, gpuStream_t stream = nullptr{% endif -%}) { kernel->runOnCellInterval( {{- ["blocks", "globalCellInterval", "ghostLayers", "b", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] | identifier_list @@ -106,7 +107,7 @@ public: ); }; } - std::function<void (IBlock *)> getSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) + std::function<void (IBlock *)> getSweep( {{- [interface_spec.high_level_args, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) { return [ {{- ["this", interface_spec.high_level_args, ["stream"] if target == 'gpu' else []] | identifier_list -}} ] (IBlock * b) @@ -115,7 +116,7 @@ public: std::function<void (IBlock *)> getSweepOnCellInterval( {{- ["const shared_ptr<StructuredBlockStorage> & blocks", "const CellInterval & globalCellInterval", - interface_spec.high_level_args, 'cell_idx_t ghostLayers=1', ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] + interface_spec.high_level_args, 'cell_idx_t ghostLayers=1', ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) { @@ -125,18 +126,18 @@ public: } {% if inner_outer_split %} - void inner( 
{{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); + void inner( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); - void outer( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); + void outer( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); - std::function<void (IBlock *)> getInnerSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) + std::function<void (IBlock *)> getInnerSweep( {{- [interface_spec.high_level_args, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) { return [ {{- [ ['this'], interface_spec.high_level_args, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ] (IBlock * b) { this->inner( {{- [ ['b'], interface_spec.mapping_codes, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ); }; } - std::function<void (IBlock *)> getOuterSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) + std::function<void (IBlock *)> getOuterSweep( {{- [interface_spec.high_level_args, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) { return [ {{- [ ['this'], interface_spec.high_level_args, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ] (IBlock * b) @@ -152,7 +153,7 @@ public: {{kernel|generate_members|indent(4)}} private: - {%if target is equalto 'gpu' -%} cuda::ParallelStreams parallelStreams_; {%- endif %} + {%if target is equalto 'gpu' -%} gpu::ParallelStreams parallelStreams_; {%- endif %} 
Cell outerWidth_; std::vector<CellInterval> layers_; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 93a2393761b4e5a1fb72254fbfd5714637aad2bd..891a92988bbd994f3d69c4deacac7ff08ce46362 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -24,8 +24,8 @@ add_subdirectory( blockforest ) add_subdirectory( boundary ) add_subdirectory( communication ) add_subdirectory( core ) -if ( CMAKE_CUDA_COMPILER ) - add_subdirectory( cuda ) +if ( WALBERLA_BUILD_WITH_GPU_SUPPORT ) + add_subdirectory(gpu) endif() add_subdirectory( domain_decomposition ) add_subdirectory( executiontree ) diff --git a/src/core/timing/RemainingTimeLogger.h b/src/core/timing/RemainingTimeLogger.h index c20b715cdf21ddd612e7b29a43bb12fc4a3b96bf..cc518d0739d2c45f606f414b1498add4e1776038 100644 --- a/src/core/timing/RemainingTimeLogger.h +++ b/src/core/timing/RemainingTimeLogger.h @@ -1,15 +1,15 @@ //====================================================================================================================== // -// This file is part of waLBerla. waLBerla is free software: you can +// This file is part of waLBerla. waLBerla is free software: you can // redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of +// License as published by the Free Software Foundation, either version 3 of // the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License // for more details. 
-// +// // You should have received a copy of the GNU General Public License along // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // @@ -26,9 +26,10 @@ #include <iomanip> - -namespace walberla { -namespace timing { +namespace walberla +{ +namespace timing +{ /*********************************************************************************************************************** * \brief Functor that can be added to a time loop in order to print an estimated remaining runtime. @@ -43,18 +44,18 @@ namespace timing { **********************************************************************************************************************/ class RemainingTimeLogger { -public: - - RemainingTimeLogger( const uint_t nrTimesteps, const double logIntervalInSec = 10, const int minOutputWidth = 8, const uint_t startTimestep = 0 ) - : timeSinceLastLog_( 0.0 ), logIntervalInSec_( logIntervalInSec ), - timestep_( startTimestep ), nrTimesteps_( nrTimesteps ), minOutputWidth_( minOutputWidth ), firstRun_( true ) + public: + RemainingTimeLogger(const uint_t nrTimesteps, const real_t logIntervalInSec = 10, const int minOutputWidth = 8, + const uint_t startTimestep = 0) + : logIntervalInSec_(logIntervalInSec), timestep_(startTimestep), nrTimesteps_(nrTimesteps), + minOutputWidth_(minOutputWidth) {} - void operator() () + void operator()() { WALBERLA_ROOT_SECTION() { - if( firstRun_ ) + if (firstRun_) { timer_.start(); firstRun_ = false; @@ -65,37 +66,37 @@ public: timer_.end(); ++timestep_; - timeSinceLastLog_ += timer_.last(); + timeSinceLastLog_ += real_c(timer_.last()); - if( timeSinceLastLog_ > logIntervalInSec_) + if (timeSinceLastLog_ > logIntervalInSec_) { timeSinceLastLog_ = 0.0; - uint_t timeStepsRemaining = nrTimesteps_ - timestep_; + uint_t const timeStepsRemaining = nrTimesteps_ - timestep_; - double remainingTime = timer_.average() * double_c( timeStepsRemaining ); - WALBERLA_LOG_INFO( "Estimated Remaining Time: " << std::setw( minOutputWidth_ 
) << std::right - << timing::timeToString( real_c(remainingTime) ) ); + real_t const remainingTime = real_c(timer_.average()) * real_c(timeStepsRemaining); + WALBERLA_LOG_INFO("Estimated Remaining Time: " << std::setw(minOutputWidth_) << std::right + << timing::timeToString(remainingTime)) } timer_.start(); } } -private: - + private: WcTimer timer_; - double timeSinceLastLog_; - double logIntervalInSec_; - uint_t timestep_; - uint_t nrTimesteps_; - int minOutputWidth_; - bool firstRun_; + real_t timeSinceLastLog_{ 0.0 }; + real_t logIntervalInSec_; + uint_t timestep_; + uint_t nrTimesteps_; + int minOutputWidth_; + bool firstRun_{ true }; }; } // namespace timing } // namespace walberla -namespace walberla { - using timing::RemainingTimeLogger; +namespace walberla +{ +using timing::RemainingTimeLogger; } diff --git a/src/cuda/CudaRAII.h b/src/cuda/CudaRAII.h deleted file mode 100644 index 5e1d7a3e717b3d390ea19bf9c3861f5d31c6f7f6..0000000000000000000000000000000000000000 --- a/src/cuda/CudaRAII.h +++ /dev/null @@ -1,115 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file CudaRAII.h -//! \ingroup cuda -//! 
\author Martin Bauer <martin.bauer@fau.de> -// -//====================================================================================================================== -#pragma once - -#include "ErrorChecking.h" - -namespace walberla { -namespace cuda { - - - class StreamRAII - { - public: - ~StreamRAII() - { - if( stream_ != 0 ) { - WALBERLA_CUDA_CHECK( cudaStreamDestroy( stream_ )); - } - } - - StreamRAII( StreamRAII &&other ) - { - stream_ = other.stream_; - other.stream_ = 0; - } - - StreamRAII( const StreamRAII & ) = delete; - - void operator=( const StreamRAII & ) = delete; - - operator cudaStream_t() const { return stream_; } - - - static StreamRAII defaultStream() - { - StreamRAII result; - result.stream_ = 0; - return result; - } - - static StreamRAII newPriorityStream( int priority ) - { - StreamRAII result; - WALBERLA_CUDA_CHECK( cudaStreamCreateWithPriority( &result.stream_, cudaStreamDefault, priority )); - return result; - } - - static StreamRAII newStream() - { - StreamRAII result; - WALBERLA_CUDA_CHECK( cudaStreamCreate( &result.stream_ )); - return result; - } - - private: - StreamRAII() {} - - cudaStream_t stream_; - }; - - - class EventRAII - { - public: - explicit EventRAII() - { - event = cudaEvent_t(); - WALBERLA_CUDA_CHECK( cudaEventCreate( &event )); - } - - ~EventRAII() - { - if( event != cudaEvent_t() ) - { - WALBERLA_CUDA_CHECK( cudaEventDestroy( event )); - } - } - - EventRAII( const EventRAII & ) = delete; - - void operator=( const EventRAII & ) = delete; - - EventRAII( EventRAII &&other ) - { - event = other.event; - other.event = cudaEvent_t(); - } - - operator cudaEvent_t() const { return event; } - - private: - cudaEvent_t event; - }; - - -} // namespace cuda -} // namespace walberla \ No newline at end of file diff --git a/src/cuda/ExecutionTreeGPU.h b/src/cuda/ExecutionTreeGPU.h deleted file mode 100644 index 9c865378cda0ce1125883ad1970dbda11d286f61..0000000000000000000000000000000000000000 --- a/src/cuda/ExecutionTreeGPU.h +++ 
/dev/null @@ -1,203 +0,0 @@ -//============================================================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file ExecutionTreeGPU.h -//! \ingroup cuda -//! \author Martin Bauer <martin.bauer@fau.de> -// -//============================================================================================================================================================== - -#pragma once - -#include "executiontree/ExecutionTree.h" -#include "ParallelStreams.h" - -#include <cuda_runtime.h> - -#ifdef CUDART_VERSION -#if CUDART_VERSION <= 9020 -cudaError_t cudaLaunchHostFunc( cudaStream_t, void(CUDART_CB* )( void* userData ), void* ) { - static bool printedWarning = false; - if( ! 
printedWarning ) { - WALBERLA_LOG_WARNING_ON_ROOT("Timing of CUDA functions only implemented for CUDA versions >= 10.0" ); - printedWarning = true; - } - return cudaSuccess; -} -#endif -#endif - -namespace walberla { -namespace executiontree { - -// -------------------------------------- Forward Declarations ------------------------------------------------------------------------------------------------ - -using executiontree::IFunctionNode; -using executiontree::IFunctionNodePtr; -using executiontree::TimingTreePtr; - -class SequenceCUDA; -class IFunctionNodeCUDA; -template<typename FunctorClass> class FunctorCUDA; -using IFunctionNodeCUDAPtr = shared_ptr<IFunctionNodeCUDA>; - - -// -------------------------------------- Public Interface ------------------------------------------------------------------------------------------------ - -template<typename FunctorType> -IFunctionNodeCUDAPtr functorCUDA( const FunctorType & t, const std::string &name = "", const TimingTreePtr &timingTree = nullptr ); - - -shared_ptr< SequenceCUDA > sequenceCUDA( std::initializer_list< IFunctionNodeCUDAPtr > initializerList, - const std::string &name, cudaStream_t defaultStream = 0, bool parallel = false, int priority = 0, - const TimingTreePtr &timingTree = nullptr ); - - -// -------------------------------------- Node Classes -------------------------------------------------------------------------------------------------------- - - -class IFunctionNodeCUDA : public IFunctionNode -{ -public: - virtual void operator()( cudaStream_t ) = 0; -}; - -template<typename FunctorClass> -void CUDART_CB functorCUDAStartTimer(void *data) -{ - auto functor = reinterpret_cast<FunctorClass *>( data ); - functor->timingTree_->start( functor->getName() ); -} - -template<typename FunctorClass> -void CUDART_CB functorCUDAStopTimer(void *data) -{ - auto functor = reinterpret_cast<FunctorClass *>( data ); - functor->timingTree_->stop( functor->getName() ); -} - -template<typename FunctorType> -class 
FunctorCUDA : public IFunctionNodeCUDA -{ -public: - FunctorCUDA( const FunctorType &functor, - const std::string &name, - const TimingTreePtr &timingTree ) - : functor_( functor ), name_( name ), timingTree_( timingTree ) {} - - void operator() (cudaStream_t stream) override - { - if ( timingTree_ ) - { - WALBERLA_CUDA_CHECK( cudaLaunchHostFunc( stream, functorCUDAStartTimer<FunctorCUDA<FunctorType> >, this ) ); - executiontree::internal::Caller<FunctorType>::call( functor_, stream ); - WALBERLA_CUDA_CHECK( cudaLaunchHostFunc( stream, functorCUDAStopTimer<FunctorCUDA<FunctorType> >, this ) ); - } - else - executiontree::internal::Caller<FunctorType>::call( functor_, stream ); - } - - const std::string getName() const override { return name_ != "" ? name_ : "FunctorCUDA"; }; - void operator() () override { (*this)( 0 ); } - -private: - friend void CUDART_CB functorCUDAStartTimer<FunctorCUDA<FunctorType> >(void *data); - friend void CUDART_CB functorCUDAStopTimer<FunctorCUDA<FunctorType> >(void *data); - - FunctorType functor_; - std::string name_; - shared_ptr< WcTimingTree > timingTree_; -}; - - -class SequenceCUDA : public IFunctionNodeCUDA -{ -public: - SequenceCUDA( std::initializer_list< IFunctionNodeCUDAPtr > initializerList, const std::string &name, cudaStream_t defaultStream, - bool parallel = false, int priority=0, - const TimingTreePtr &timingTree = nullptr) - : name_( name ), defaultStream_( defaultStream), timingTree_( timingTree ), parallelStreams_( priority ), parallel_( parallel ), priority_(priority) - { - for ( auto &e : initializerList ) - children_.push_back( e ); - } - - - void operator() (cudaStream_t stream) override - { - if ( timingTree_ ) { - WALBERLA_CUDA_CHECK( cudaLaunchHostFunc( stream, functorCUDAStartTimer< SequenceCUDA >, this )); - } - - if( parallel_ ) - { - auto parallelSection = parallelStreams_.parallelSection( stream ); - for ( auto &el : children_ ) - { - ( *el )( parallelSection.stream()); - parallelSection.next(); - } - } - 
else - for ( auto &el : children_ ) - (*el)( stream ); - - if ( timingTree_ ) { - WALBERLA_CUDA_CHECK( cudaLaunchHostFunc( stream, functorCUDAStopTimer< SequenceCUDA >, this )); - } - } - - void operator() () override { (*this)( defaultStream_ ); } - void push_back( const IFunctionNodeCUDAPtr &fct ) { children_.push_back( fct ); } - void push_front( const IFunctionNodeCUDAPtr &fct ) { children_.push_front( fct ); } - const std::string getName() const override { return name_ != "" ? name_ : "ParallelSequenceCUDA"; }; - const std::deque< IFunctionNodePtr > getChildren() const override { - std::deque< IFunctionNodePtr > result; - for( auto & c : children_ ) - result.push_back( c ); - return result; - }; - -private: - friend void CUDART_CB functorCUDAStartTimer< SequenceCUDA >( void *data ); - friend void CUDART_CB functorCUDAStopTimer< SequenceCUDA >( void *data ); - - std::string name_; - cudaStream_t defaultStream_; - std::deque< IFunctionNodeCUDAPtr > children_; - shared_ptr< WcTimingTree > timingTree_; - cuda::ParallelStreams parallelStreams_; - bool parallel_; - int priority_; -}; - - -template<typename FunctorType> -IFunctionNodeCUDAPtr functorCUDA( const FunctorType & t, const std::string &name, const shared_ptr< WcTimingTree > &timingTree ) -{ - return make_shared<FunctorCUDA<FunctorType> >( t, name, timingTree ); -} - - -shared_ptr< SequenceCUDA > sequenceCUDA( std::initializer_list< IFunctionNodeCUDAPtr > initializerList, - const std::string &name, cudaStream_t defaultStream, bool parallel, int priority, - const TimingTreePtr &timingTree ) -{ - return make_shared< SequenceCUDA >( initializerList, name, defaultStream, parallel, priority, timingTree ); -} - - -} // namespace executiontree -} // namespace walberla diff --git a/src/cuda/ExecutionTreeSweepGPU.h b/src/cuda/ExecutionTreeSweepGPU.h deleted file mode 100644 index 6f97277c4b75a5fe2dcdf0ec383ed8217699f1b2..0000000000000000000000000000000000000000 --- a/src/cuda/ExecutionTreeSweepGPU.h +++ /dev/null @@ 
-1,105 +0,0 @@ -//============================================================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file ExecutionTreeSweepGPU.h -//! \ingroup cuda -//! \author Martin Bauer <martin.bauer@fau.de> -// -//============================================================================================================================================================== - -#pragma once - -#include "domain_decomposition/IBlock.h" -#include "executiontree/ExecutionTree.h" -#include "ExecutionTreeGPU.h" - -namespace walberla { -namespace executiontree { - - -template<typename FunctorType> -IFunctionNodeCUDAPtr sweepCUDA( BlockStorage &bs, const FunctorType & t, const std::string &name = "", const TimingTreePtr &timingTree = nullptr ); - -template<typename FunctorType> -IFunctionNodeCUDAPtr sweepCUDA( const shared_ptr< StructuredBlockStorage > &bs, const FunctorType & t, const std::string &name = "", - const TimingTreePtr &tt = nullptr ); - - -template<typename FunctorType> -class SweepCUDA : public IFunctionNodeCUDA -{ -public: - SweepCUDA( BlockStorage &bs, - const FunctorType &functor, - const std::string &name, - const TimingTreePtr &timingTree ) - : blockStorage_( bs ), - functor_( functor ), - name_( name 
), - timingTree_( timingTree ) {} - - SweepCUDA( const shared_ptr <StructuredBlockStorage> &bs, - const FunctorType &functor, - const std::string &name, - const TimingTreePtr &timingTree ) - : blockStorage_( bs->getBlockStorage()), - functor_( functor ), - name_( name ), - timingTree_( timingTree ) {} - - void operator() () override { (*this)( 0 ); } - - void operator()( cudaStream_t stream ) override - { - if ( timingTree_ ) - { - for ( auto &block: blockStorage_ ) - { - timingTree_->start( name_ ); - executiontree::internal::Caller<FunctorType>::call( functor_, &block, stream ); - timingTree_->stop( name_ ); - } - } - else - for ( auto &block: blockStorage_ ) - executiontree::internal::Caller<FunctorType>::call( functor_, &block, stream ); - } - - const std::string getName() const override { return name_ != "" ? name_ : "Sweep"; }; - -private: - BlockStorage &blockStorage_; - - FunctorType functor_; - std::string name_; - TimingTreePtr timingTree_; -}; - -template<typename FunctorType> -IFunctionNodeCUDAPtr sweepCUDA( BlockStorage &bs, FunctorType t, const std::string &name, const shared_ptr< WcTimingTree > &timingTree ) -{ - return make_shared<SweepCUDA<FunctorType> >( bs, t, name, timingTree ); -} - -template<typename FunctorType> -IFunctionNodeCUDAPtr sweepCUDA( const shared_ptr< StructuredBlockStorage > &bs, const FunctorType & t, const std::string &name, - const TimingTreePtr &timingTree ) -{ - return make_shared<SweepCUDA<FunctorType> >( bs, t, name, timingTree ); -} - - -} // namespace executiontree -} // namespace walberla diff --git a/src/cuda/FieldIndexing.h b/src/cuda/FieldIndexing.h deleted file mode 100644 index 229d3b36b752d86826699430bf6734f8de8a8c4c..0000000000000000000000000000000000000000 --- a/src/cuda/FieldIndexing.h +++ /dev/null @@ -1,95 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. 
waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file FieldIndexing.h -//! \ingroup cuda -//! \author Martin Bauer <martin.bauer@fau.de> -//! \brief Indexing Scheme that executes all elements of inner coordinate within on thread block -// -//====================================================================================================================== - -#pragma once - -#include "FieldAccessor.h" - -#include "stencil/Directions.h" -#include <cuda_runtime.h> - -namespace walberla { namespace cell { class CellInterval; } } - -namespace walberla { -namespace cuda { - - // Forward Declarations - template< typename T> class GPUField; - - template<typename T> - class FieldIndexing - { - public: - - //** Kernel call ****************************************************************************************** - /*! \name Kernel call */ - //@{ - dim3 blockDim() const { return blockDim_; } - dim3 gridDim () const { return gridDim_; } - - const FieldAccessor<T> & gpuAccess() const { return gpuAccess_; } - //@} - //**************************************************************************************************************** - - - - - //** Creation ********************************************************************************************* - /*! 
\name Creation */ - //@{ - static FieldIndexing<T> interval ( const GPUField<T> & f, - const cell::CellInterval & ci, - int fBegin=0, int fEnd=1 ); - - - static FieldIndexing<T> xyz ( const GPUField<T> & f ); - static FieldIndexing<T> withGhostLayerXYZ ( const GPUField<T> & f, uint_t numGhostLayers ); - static FieldIndexing<T> ghostLayerOnlyXYZ ( const GPUField<T> & f, uint_t thickness, - stencil::Direction dir, bool fullSlice = false ); - static FieldIndexing<T> sliceBeforeGhostLayerXYZ( const GPUField<T> & f, uint_t thickness, - stencil::Direction dir, bool fullSlice = false ); - static FieldIndexing<T> sliceXYZ ( const GPUField<T> & f, cell_idx_t distance, uint_t thickness, - stencil::Direction dir, bool fullSlice = false ); - - static FieldIndexing<T> allInner ( const GPUField<T> & f ); - static FieldIndexing<T> allWithGhostLayer ( const GPUField<T> & f ); - static FieldIndexing<T> all ( const GPUField<T> & f, const cell::CellInterval & ci ); - //@} - //**************************************************************************************************************** - - protected: - FieldIndexing ( const GPUField<T> & field, - dim3 _blockDim, dim3 _gridDim, - const FieldAccessor<T> _gpuAccess ); - - const GPUField<T> & field_; - dim3 blockDim_; - dim3 gridDim_; - FieldAccessor<T> gpuAccess_; - }; - - -} // namespace cuda -} // namespace walberla - -#include "FieldIndexing.impl.h" - diff --git a/src/cuda/GPUCopy.cpp b/src/cuda/GPUCopy.cpp deleted file mode 100644 index 834150fdcb921064392fb99d6690752b6eaaee3c..0000000000000000000000000000000000000000 --- a/src/cuda/GPUCopy.cpp +++ /dev/null @@ -1,396 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. 
waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file GPUCopy.cpp -//! \ingroup cuda -//! \author Paulo Carvalho <prcjunior@inf.ufpr.br> -//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br> -//! \brief Copy routines of 4D intervals involving GPU buffers. -// -//====================================================================================================================== - -#include "core/debug/Debug.h" - -#include "GPUCopy.h" -#include "ErrorChecking.h" - -#include <cstring> - - -namespace walberla { -namespace cuda { - -void copyDevToDevFZYX( const cudaPitchedPtr& dst, const cudaPitchedPtr& src, - std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, - std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, - uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, - std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ) -{ - const uint_t & Nx = std::get<0>(intervalSize), - & Ny = std::get<1>(intervalSize), - & Nz = std::get<2>(intervalSize), - & Nf = std::get<3>(intervalSize); - - const uint_t & srcX = std::get<0>(srcOffset), - & srcY = std::get<1>(srcOffset), - & srcZ = std::get<2>(srcOffset), - & srcF = std::get<3>(srcOffset); - - const uint_t & dstX = std::get<0>(dstOffset), - & dstY = std::get<1>(dstOffset), - & dstZ = std::get<2>(dstOffset), - & dstF = std::get<3>(dstOffset); - - auto 
copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) { - WALBERLA_ASSERT( fIntervalSize == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) ); - - cudaMemcpy3DParms p; - std::memset( &p, 0, sizeof(p) ); - - p.srcPos = make_cudaPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ ); - p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); - - p.dstPos = make_cudaPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ ); - p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); - - p.extent = make_cudaExtent( Nx * typeSize, Ny, Nz * fIntervalSize ); - p.kind = cudaMemcpyDeviceToDevice; - - if ( copyStream == 0 ) - { - WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); - } - else - { - WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); - } - }; - - if( Nf == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) ) - { - copyFunctor( dstF, srcF, Nf ); - } - else - { - for( uint_t f = 0; f < Nf; ++f ) - { - copyFunctor( dstF + f, srcF + f, uint_c(1) ); - } - } -} - - -void copyDevToDevZYXF( const cudaPitchedPtr& dst, const cudaPitchedPtr& src, - std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, - std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, - uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, - std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ) -{ - const uint_t & Nx = std::get<0>(intervalSize), - & Ny = std::get<1>(intervalSize), - & Nz = std::get<2>(intervalSize), - & Nf = std::get<3>(intervalSize); - - const uint_t & srcX = std::get<0>(srcOffset), - & srcY = std::get<1>(srcOffset), - & srcZ = std::get<2>(srcOffset), - & srcF = std::get<3>(srcOffset); - - const uint_t & dstX = std::get<0>(dstOffset), - & dstY = std::get<1>(dstOffset), - & dstZ = std::get<2>(dstOffset), - & dstF = std::get<3>(dstOffset); - - auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) { - cudaMemcpy3DParms p; - std::memset( 
&p, 0, sizeof(p) ); - - p.srcPos = make_cudaPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY ); - p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); - - p.dstPos = make_cudaPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY ); - p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); - - p.extent = make_cudaExtent( Nf * typeSize, Nx, Ny * zIntervalSize ); - p.kind = cudaMemcpyDeviceToDevice; - - if ( copyStream == 0 ) - { - WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); - } - else - { - WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); - } - }; - - if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) ) - { - copyFunctor( dstZ, srcZ, Nz ); - } - else - { - for( uint_t z = 0; z < Nz; ++z ) - { - copyFunctor( dstZ + z, srcZ + z, 1 ); - } - } -} - - -void copyHostToDevFZYX( const cudaPitchedPtr& dst, unsigned char* src, - std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, - std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, - uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, - std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ) -{ - const uint_t & Nx = std::get<0>(intervalSize), - & Ny = std::get<1>(intervalSize), - & Nz = std::get<2>(intervalSize), - & Nf = std::get<3>(intervalSize); - - const uint_t & srcX = std::get<0>(srcOffset), - & srcY = std::get<1>(srcOffset), - & srcZ = std::get<2>(srcOffset), - & srcF = std::get<3>(srcOffset); - - const uint_t & dstX = std::get<0>(dstOffset), - & dstY = std::get<1>(dstOffset), - & dstZ = std::get<2>(dstOffset), - & dstF = std::get<3>(dstOffset); - - auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) { - cudaMemcpy3DParms p; - std::memset( &p, 0, sizeof(p) ); - - p.srcPos = make_cudaPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ ); - p.srcPtr = make_cudaPitchedPtr( src, Nx * typeSize, Nx * typeSize, Ny ); - - p.dstPos = make_cudaPos( dstX * 
typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ ); - p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); - - p.extent = make_cudaExtent( Nx * typeSize, Ny, Nz * fIntervalSize ); - p.kind = cudaMemcpyHostToDevice; - - if (copyStream == 0) - { - WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); - } - else - { - // Using cudaMemcpy3DAsync requires page-locked memory on the host! - WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); - } - }; - - if ( Nf == 1 || ( Nz == dstAllocSizeZ ) ) - { - copyFunctor( dstF, srcF, Nf ); - } - else - { - for( uint_t f = 0; f < Nf; ++f ) - { - copyFunctor( dstF + f, srcF + f, uint_c(1) ); - } - } -} - -void copyHostToDevZYXF( const cudaPitchedPtr& dst, unsigned char* src, - std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, - std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, - uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, - std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ) -{ - const uint_t & Nx = std::get<0>(intervalSize), - & Ny = std::get<1>(intervalSize), - & Nz = std::get<2>(intervalSize), - & Nf = std::get<3>(intervalSize); - - const uint_t & srcX = std::get<0>(srcOffset), - & srcY = std::get<1>(srcOffset), - & srcZ = std::get<2>(srcOffset), - & srcF = std::get<3>(srcOffset); - - const uint_t & dstX = std::get<0>(dstOffset), - & dstY = std::get<1>(dstOffset), - & dstZ = std::get<2>(dstOffset), - & dstF = std::get<3>(dstOffset); - - auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) { - cudaMemcpy3DParms p; - std::memset( &p, 0, sizeof(p) ); - - p.srcPos = make_cudaPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY ); - p.srcPtr = make_cudaPitchedPtr( src, Nf * typeSize, Nf * typeSize, Nx ); - - p.dstPos = make_cudaPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY ); - p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); - - p.extent = make_cudaExtent( Nf * typeSize, 
Nx, Ny * zIntervalSize ); - p.kind = cudaMemcpyHostToDevice; - - if ( copyStream == 0 ) - { - WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); - } - else - { - // Using cudaMemcpy3DAsync requires page-locked memory on the host! - WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); - } - }; - - if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) ) - { - copyFunctor( dstZ, srcZ, Nz ); - } - else - { - for( uint_t z = 0; z < Nz; ++z ) - { - copyFunctor( dstZ + z, srcZ + z, 1 ); - } - } -} - - -void copyDevToHostFZYX( unsigned char* dst, const cudaPitchedPtr& src, - std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, - std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, - uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, - std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ) -{ - const uint_t & Nx = std::get<0>(intervalSize), - & Ny = std::get<1>(intervalSize), - & Nz = std::get<2>(intervalSize), - & Nf = std::get<3>(intervalSize); - - const uint_t & srcX = std::get<0>(srcOffset), - & srcY = std::get<1>(srcOffset), - & srcZ = std::get<2>(srcOffset), - & srcF = std::get<3>(srcOffset); - - const uint_t & dstX = std::get<0>(dstOffset), - & dstY = std::get<1>(dstOffset), - & dstZ = std::get<2>(dstOffset), - & dstF = std::get<3>(dstOffset); - - auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) { - cudaMemcpy3DParms p; - std::memset( &p, 0, sizeof(p) ); - - p.srcPos = make_cudaPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ ); - p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); - - p.dstPos = make_cudaPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ ); - p.dstPtr = make_cudaPitchedPtr( dst, Nx * typeSize, Nx * typeSize, Ny ); - - p.extent = make_cudaExtent( Nx * typeSize, Ny, Nz * fIntervalSize ); - p.kind = cudaMemcpyDeviceToHost; - - if ( copyStream == 0 ) - { - WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); - } - else - { - // 
Using cudaMemcpy3DAsync requires page-locked memory on the host! - WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); - } - }; - - if( Nf == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) ) - { - copyFunctor( dstF, srcF, Nf ); - } - else - { - for( uint_t f = 0; f < Nf; ++f ) - { - copyFunctor( dstF + f, srcF + f, 1 ); - } - } -} - - -void copyDevToHostZYXF( unsigned char* dst, const cudaPitchedPtr& src, - std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, - std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, - uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, - std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ) -{ - const uint_t & Nx = std::get<0>(intervalSize), - & Ny = std::get<1>(intervalSize), - & Nz = std::get<2>(intervalSize), - & Nf = std::get<3>(intervalSize); - - const uint_t & srcX = std::get<0>(srcOffset), - & srcY = std::get<1>(srcOffset), - & srcZ = std::get<2>(srcOffset), - & srcF = std::get<3>(srcOffset); - - const uint_t & dstX = std::get<0>(dstOffset), - & dstY = std::get<1>(dstOffset), - & dstZ = std::get<2>(dstOffset), - & dstF = std::get<3>(dstOffset); - - auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) { - cudaMemcpy3DParms p; - std::memset( &p, 0, sizeof(p) ); - - p.srcPos = make_cudaPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY ); - p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); - - p.dstPos = make_cudaPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY ); - p.dstPtr = make_cudaPitchedPtr( dst, Nf * typeSize, Nf * typeSize, Nx ); - - p.extent = make_cudaExtent( Nf * typeSize, Nx, Ny * zIntervalSize ); - - p.kind = cudaMemcpyDeviceToHost; - - if ( copyStream == 0 ) - { - WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); - } - else - { - // Using cudaMemcpy3DAsync requires page-locked memory on the host! 
- WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); - } - }; - - - if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) ) - { - copyFunctor( dstZ, srcZ, Nz ); - } - else - { - for( uint_t z = 0; z < Nz; ++z ) - { - copyFunctor( dstZ + z, srcZ + z, 1 ); - } - } -} - -} // namespace cuda -} // namespace walberla diff --git a/src/cuda/doc/cuda.dox b/src/cuda/doc/cuda.dox deleted file mode 100644 index 96652834d67ca2a6caa342855d77981f52d7f214..0000000000000000000000000000000000000000 --- a/src/cuda/doc/cuda.dox +++ /dev/null @@ -1,80 +0,0 @@ - -namespace walberla{ -/*! - -\page cudaPage Overview of waLBerla CUDA support - -\brief waLBerla CUDA concepts - - -\section cudaField Fields on GPU - - -\subsection cudaFieldOverview Creating GPU fields and copy them between host and device - - \code - // create a CPU field and a GPU field of same size and with same layout - GhostLayerField<double,4> h_f ( 16,20,30, 1, 42.0, field::fzyx ); - cuda::GPUField<double> d_f ( 16,20,30, 4, 1, field::fzyx ); - - cuda::fieldCpy( d_f, h_f ); // copy from host to device - some_kernel_wrapper( d_f ); // run some kernel - cuda::fieldCpy( h_f, d_f ); // copy field data back to host - \endcode - - Similarities and Differences of CPU and GPU field - - cuda::GPUField corresponds to field::GhostLayerField - - fSize is a template parameter for CPU fields and a normal parameter for GPUFields - - CPU field iterators correspond to FieldAccessors (see next section) - -\subsection cudaFieldAccess Writing CUDA kernels operating on GPUFields - - \image html cuda/doc/fieldAccess.png "Accessing fields in CUDA kernels" - - When writing a kernel that operates on a field, the first task is to distribute the data to CUDA threads and blocks. - We need a function $(blockIdx, threadIdx) \\rightarrow (x,y,z)$ or $(blockIdx, threadIdx) \\rightarrow (x,y,z,f)$. 
- The optimal mapping depends on many parameters: for example which layout the field has, the extends of each coordinate, - hardware parameters like warp-size, etc. - Thus this indexing function is abstracted. A few indexing strategies are already implemented which can be - substituted by custom strategies. - A indexing strategy consists of two classes: and somewhat complex Indexing class, which manages the - indexing on the host-side and a lightweight Accessor class, which is passed to the CUDA kernel. - - An indexing scheme is very similar to the iterator concept, it defines the bounds of the iteration, which is not necessarily the - complete field but could also be a certain sub-block, for example the ghost layer in a certain direction. - - - Lets start to write a simple kernel that doubles all values stored in a field: - \code - #include "cuda/FieldAccessor.h" - - __global__ void kernel_double( cuda::FieldAccessor<double> f ) - { - f.set( blockIdx, threadIdx ); - f.get() *= 2.0; - } - \endcode - We do not have to care about indexing, the cuda::FieldAccessor takes care of that. So this is a generic kernel that operates - on double fields. Using the cuda::FieldAccessor the current and neighboring values can be accessed and manipulated. - - This kernel can be called like this: - \code - cuda::FieldIndexing<double> indexing = cuda::FieldIndexing<double>::sliceBeforeGhostLayerXYZ( field, 1, stencil::E, true ); - kernel_double<<< iter.gridDim(), iter.blockDim() >>> ( iter.gpuAccess() ); - \endcode - In the example above we only iterate over a slice of the field. Of course we can also iterate over the complete field, there are - various static member functions in a Indexing class to create certain iteration patterns. - The Indexing class encapsulates the information of how to launch the kernel (blockDim and gridDim) and holds the Accessor class that - is passed to the kernel. 
- - Two indexing strategies are currently provided: - - cuda::FieldIndexing and cuda::FieldAccessor (general, but slow ) - - cuda::FieldIndexingXYZ and cuda::FieldAccessorXYZ ( optimized for cell based iterating over bigger chunks, for fields where xSize bigger than warpSize ) - - \section cudaKernelWrapper Calling CUDA kernels from CPP files - \copydoc cuda::Kernel - - - -*/ -} diff --git a/src/cuda/AddGPUFieldToStorage.h b/src/gpu/AddGPUFieldToStorage.h similarity index 91% rename from src/cuda/AddGPUFieldToStorage.h rename to src/gpu/AddGPUFieldToStorage.h index 3803ff8c814bb4df9e21312cbd00bee562100492..09736afad0e34e8c1cbd36db9590925e2bff3443 100644 --- a/src/cuda/AddGPUFieldToStorage.h +++ b/src/gpu/AddGPUFieldToStorage.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file AddGPUFieldToStorage.h -//! \ingroup cuda +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== @@ -22,17 +22,17 @@ #pragma once #include "GPUField.h" - #include "domain_decomposition/StructuredBlockStorage.h" namespace walberla { -namespace cuda { +namespace gpu +{ //******************************************************************************************************************* - /*! Adds a cuda::GPUField to a StructuredBlockStorage + /*! Adds a gpu::GPUField to a StructuredBlockStorage * * - Similar to walberla::field::addToStorage() functions * - created field is uninitialized @@ -49,12 +49,12 @@ namespace cuda { //******************************************************************************************************************* - /*! Adds a cuda::GPUField to a StructuredBlockStorage using data from a CPU field + /*! 
Adds a gpu::GPUField to a StructuredBlockStorage using data from a CPU field * * - adds a GPU field to a StructuredBlockStorage using a CPU field * - sizes, number of ghostlayers and layout are the same as the CPU field * - GPU field is initialized with the data currently stored in the CPU field - * @tparam Field_T type of the CPU field, the created GPUField will be of type cuda::GPUField<Field_T::value_type> + * @tparam Field_T type of the CPU field, the created GPUField will be of type gpu::GPUField<Field_T::value_type> */ //******************************************************************************************************************* template< typename Field_T> @@ -65,7 +65,7 @@ namespace cuda { -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/AddGPUFieldToStorage.impl.h b/src/gpu/AddGPUFieldToStorage.impl.h similarity index 96% rename from src/cuda/AddGPUFieldToStorage.impl.h rename to src/gpu/AddGPUFieldToStorage.impl.h index 1befc3e81bc4a04fd89c1ec9561e1e417023cc05..e016f93fb47c34f0073d87813c96506bf11f58f8 100644 --- a/src/cuda/AddGPUFieldToStorage.impl.h +++ b/src/gpu/AddGPUFieldToStorage.impl.h @@ -14,17 +14,18 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file AddGPUFieldToStorage.impl.h -//! \ingroup cuda +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== #pragma once -#include "cuda/FieldCopy.h" +#include "gpu/FieldCopy.h" namespace walberla { -namespace cuda { +namespace gpu +{ namespace internal @@ -57,7 +58,7 @@ namespace cuda { auto gpuField = new GPUField_T( f->xSize(), f->ySize(), f->zSize(), f->fSize(), f->nrOfGhostLayers(), f->layout(), usePitchedMem ); - cuda::fieldCpy( *gpuField, *f ); + gpu::fieldCpy( *gpuField, *f ); return gpuField; } @@ -90,7 +91,7 @@ namespace cuda { -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/AlignedAllocation.cpp b/src/gpu/AlignedAllocation.cpp similarity index 82% rename from src/cuda/AlignedAllocation.cpp rename to src/gpu/AlignedAllocation.cpp index db6ba38509779827c942d1242c569e87cf73440b..2a2bee41c7fdf96b88894a29d8dcd44100cff593 100644 --- a/src/cuda/AlignedAllocation.cpp +++ b/src/gpu/AlignedAllocation.cpp @@ -14,21 +14,21 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file AlignedAllocation.cpp -//! \ingroup cuda +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== #include "AlignedAllocation.h" -#include "cuda/ErrorChecking.h" +#include "gpu/ErrorChecking.h" #include "core/debug/CheckFunctions.h" #include "core/debug/Debug.h" -#include "core/logging/Logging.h" #include <map> namespace walberla { -namespace cuda { +namespace gpu +{ static std::map<void *, void*> freePointers_; @@ -36,16 +36,16 @@ namespace cuda { { // With 0 alignment this function makes no sense // use normal malloc instead - WALBERLA_ASSERT_GREATER( alignment, 0 ); + WALBERLA_ASSERT_GREATER( alignment, 0 ) // Tests if alignment is power of two (assuming alignment>0) - WALBERLA_ASSERT( !(alignment & (alignment - 1)) ); + WALBERLA_ASSERT( !(alignment & (alignment - 1)) ) - WALBERLA_ASSERT_LESS( offset, alignment ); + WALBERLA_ASSERT_LESS( offset, alignment ) if( offset == 0 ) { void * result = nullptr; - WALBERLA_CUDA_CHECK( cudaMalloc( &result, size ) ); + WALBERLA_GPU_CHECK( gpuMalloc( &result, size ) ) freePointers_[result] = result; return result; } @@ -53,8 +53,8 @@ namespace cuda { void *pa; // pointer to allocated memory void *ptr; // pointer to usable aligned memory - WALBERLA_CUDA_CHECK( cudaMalloc( &pa, size + alignment )); - WALBERLA_CHECK_EQUAL(size_t(pa) % alignment, 0 , "CUDA malloc did not return memory with requested alignment"); + WALBERLA_GPU_CHECK( gpuMalloc( &pa, size + alignment )); + WALBERLA_CHECK_EQUAL(size_t(pa) % alignment, 0 , "GPU malloc did not return memory with requested alignment"); ptr = (void *) ((char *) (pa) + alignment - offset); freePointers_[ptr] = pa; @@ -67,7 +67,7 @@ namespace cuda { { // assume that pointer to real allocated chunk is stored just before // chunk that was given to user - WALBERLA_CUDA_CHECK( cudaFree( freePointers_[ptr] )); + WALBERLA_GPU_CHECK( gpuFree( freePointers_[ptr] )); freePointers_.erase(ptr); } @@ -87,6 +87,6 @@ namespace cuda { 
} -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/AlignedAllocation.h b/src/gpu/AlignedAllocation.h similarity index 96% rename from src/cuda/AlignedAllocation.h rename to src/gpu/AlignedAllocation.h index 6dfb45624cea7b2fd15de2e0dfed3c04a5993112..1e935b174c5514203c7618bafb27bd2b4df208e1 100644 --- a/src/cuda/AlignedAllocation.h +++ b/src/gpu/AlignedAllocation.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file AlignedAllocation.h -//! \ingroup cuda +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== @@ -24,7 +24,8 @@ namespace walberla { -namespace cuda { +namespace gpu +{ void *allocate_aligned_with_offset( uint_t size, uint_t alignment, uint_t offset ); @@ -35,5 +36,5 @@ namespace cuda { void *allocate_pitched_with_offset( size_t &pitchOut, size_t width, size_t height, size_t alignment, size_t alignmentOffset ); -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/CMakeLists.txt b/src/gpu/CMakeLists.txt similarity index 73% rename from src/cuda/CMakeLists.txt rename to src/gpu/CMakeLists.txt index bfefb9dcc680ac5506f2113108e7283efd8c579d..1790af12b470b84a9559de60931b88523956a694 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/gpu/CMakeLists.txt @@ -1,24 +1,25 @@ ################################################################################################### # -# Module cuda +# Module gpu # ################################################################################################### -add_library( cuda ) -target_link_libraries( cuda PUBLIC blockforest core communication domain_decomposition executiontree field stencil lbm ) -target_sources( cuda +add_library( gpu ) +target_link_libraries( gpu PUBLIC blockforest core communication domain_decomposition executiontree field stencil lbm ) + +# 
sources for HIP and CUDA +target_sources( gpu PRIVATE AlignedAllocation.h AddGPUFieldToStorage.h ErrorChecking.h - ExecutionTreeGPU.h FieldCopy.h GPUCopy.cpp - NVTX.h FieldIndexingXYZ.h FieldIndexing3D.h AddGPUFieldToStorage.impl.h GPUField.h + GPUWrapper.h FieldAccessor3D.h DeviceSelectMPI.h HostFieldAllocator.h @@ -27,7 +28,6 @@ target_sources( cuda GPUCopy.h FieldAccessorXYZ.h FieldIndexingXYZ.impl.h - ExecutionTreeSweepGPU.h FieldIndexing.h AlignedAllocation.cpp GPUField.impl.h @@ -39,6 +39,14 @@ target_sources( cuda DeviceSelectMPI.cpp ) +# sources only for CUDA +if (WALBERLA_BUILD_WITH_CUDA) +target_sources( gpu + PRIVATE + NVTX.h + ) +endif (WALBERLA_BUILD_WITH_CUDA) + add_subdirectory( sweeps ) add_subdirectory( communication ) add_subdirectory( lbm ) diff --git a/src/gpu/CudaRAII.h b/src/gpu/CudaRAII.h new file mode 100644 index 0000000000000000000000000000000000000000..815b3829114506a8c601669aa4195461bd60151a --- /dev/null +++ b/src/gpu/CudaRAII.h @@ -0,0 +1,108 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file CudaRAII.h +//! \ingroup gpu +//! 
\author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== +#pragma once + +#include "ErrorChecking.h" + +namespace walberla +{ +namespace gpu +{ + +class StreamRAII +{ + public: + ~StreamRAII() + { + if (stream_ != nullptr) { WALBERLA_GPU_CHECK(gpuStreamDestroy(stream_)) } + } + + StreamRAII(StreamRAII&& other) noexcept + { + stream_ = other.stream_; + other.stream_ = nullptr; + } + + StreamRAII(const StreamRAII&) = delete; + + void operator=(const StreamRAII&) = delete; + + operator gpuStream_t() const { return stream_; } + + static StreamRAII defaultStream() + { + StreamRAII result; + result.stream_ = nullptr; + return result; + } + + static StreamRAII newPriorityStream(int priority) + { + StreamRAII result; + WALBERLA_GPU_CHECK(gpuStreamCreateWithPriority(&result.stream_, gpuStreamDefault, priority)) + return result; + } + + static StreamRAII newStream() + { + StreamRAII result; + WALBERLA_GPU_CHECK(gpuStreamCreate(&result.stream_)) + return result; + } + + private: + StreamRAII() = default; + + gpuStream_t stream_; +}; + +class EventRAII +{ + public: + explicit EventRAII() + { + event = gpuEvent_t(); + WALBERLA_GPU_CHECK(gpuEventCreate(&event)) + } + + ~EventRAII() + { + if (event != gpuEvent_t()) { WALBERLA_GPU_CHECK(gpuEventDestroy(event)) } + } + + EventRAII(const EventRAII&) = delete; + + void operator=(const EventRAII&) = delete; + + EventRAII(EventRAII&& other) noexcept + { + event = other.event; + other.event = gpuEvent_t(); + } + + operator gpuEvent_t() const { return event; } + + private: + gpuEvent_t event; +}; + +} // namespace gpu +} // namespace walberla \ No newline at end of file diff --git a/src/cuda/DeviceSelectMPI.cpp b/src/gpu/DeviceSelectMPI.cpp similarity index 82% rename from src/cuda/DeviceSelectMPI.cpp rename to src/gpu/DeviceSelectMPI.cpp index 3ba255d9f6fd926721477158243022c0012611ed..52454653b06a0895dc00ea65155c08f603aca303 
100644 --- a/src/cuda/DeviceSelectMPI.cpp +++ b/src/gpu/DeviceSelectMPI.cpp @@ -14,18 +14,19 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file DeviceSelectMPI.cpp -//! \ingroup cuda +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== #include "DeviceSelectMPI.h" #include "core/mpi/MPIWrapper.h" -#include "cuda/ErrorChecking.h" +#include "gpu/ErrorChecking.h" #include "core/logging/Logging.h" namespace walberla { -namespace cuda { +namespace gpu +{ #if MPI_VERSION == 2 || MPI_VERSION == 1 @@ -39,8 +40,8 @@ void selectDeviceBasedOnMpiRank() { #ifdef WALBERLA_BUILD_WITH_MPI int deviceCount; - WALBERLA_CUDA_CHECK( cudaGetDeviceCount( &deviceCount )); - WALBERLA_LOG_INFO_ON_ROOT( "Selecting CUDA device depending on MPI Rank" ); + WALBERLA_GPU_CHECK( gpuGetDeviceCount( &deviceCount )) + WALBERLA_LOG_INFO_ON_ROOT( "Selecting device depending on MPI Rank" ) MPI_Info info; MPI_Info_create( &info ); @@ -54,19 +55,19 @@ void selectDeviceBasedOnMpiRank() if ( deviceCount == processesOnNode ) { - WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode )); + WALBERLA_GPU_CHECK( gpuSetDevice( rankOnNode )) } else if ( deviceCount > processesOnNode ) { WALBERLA_LOG_WARNING( "Not using all available GPUs on node. Processes on node: " - << processesOnNode << ", available GPUs on node: " << deviceCount ); - WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode )); + << processesOnNode << ", available GPUs on node: " << deviceCount ) + WALBERLA_GPU_CHECK( gpuSetDevice( rankOnNode )) } else { WALBERLA_LOG_WARNING( "Too many processes started per node - should be one per GPU. 
Number of processes per node " - << processesOnNode << ", available GPUs on node " << deviceCount ); - WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode % deviceCount )); + << processesOnNode << ", available GPUs on node " << deviceCount ) + WALBERLA_GPU_CHECK( gpuSetDevice( rankOnNode % deviceCount )) } #endif } @@ -74,5 +75,5 @@ void selectDeviceBasedOnMpiRank() #endif -} // namespace cuda +} // namespace gpu } // namespace walberla \ No newline at end of file diff --git a/src/cuda/DeviceSelectMPI.h b/src/gpu/DeviceSelectMPI.h similarity index 93% rename from src/cuda/DeviceSelectMPI.h rename to src/gpu/DeviceSelectMPI.h index 06d4296726115a896e763e90feddbd0254aece30..34d763f93808cdeef1f9e5b2097de5047fb4a5b6 100644 --- a/src/cuda/DeviceSelectMPI.h +++ b/src/gpu/DeviceSelectMPI.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file DeviceSelectMPI.h -//! \ingroup cuda +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== @@ -23,11 +23,12 @@ namespace walberla { -namespace cuda { +namespace gpu +{ /** - * Selects active CUDA device based on MPI rank + * Selects active GPU device based on MPI rank * * assumes that on each node there are as many MPI processes started as there are GPUs * - if there are more GPUs than processes on a node, a warning is printed and not all GPUs are utilized @@ -36,5 +37,5 @@ namespace cuda { */ void selectDeviceBasedOnMpiRank(); -} // namespace cuda +} // namespace gpu } // namespace walberla \ No newline at end of file diff --git a/src/cuda/ErrorChecking.h b/src/gpu/ErrorChecking.h similarity index 75% rename from src/cuda/ErrorChecking.h rename to src/gpu/ErrorChecking.h index 82dc0b4a913936eb2c4b8f9d01ccc0c1e0df2528..0d1316eccf17130e717824559a2f0e3051c834b2 100644 --- a/src/cuda/ErrorChecking.h +++ b/src/gpu/ErrorChecking.h @@ -14,7 +14,7 @@ // 
with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file ErrorChecking.h -//! \ingroup cuda +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== @@ -24,30 +24,31 @@ #include "core/Abort.h" #include <sstream> -#include <cuda_runtime.h> +#include "gpu/GPUWrapper.h" namespace walberla { -namespace cuda { +namespace gpu +{ -#define WALBERLA_CUDA_CHECK(ans) { ::walberla::cuda::checkForError((ans), __FILE__, __LINE__); } +#define WALBERLA_GPU_CHECK(ans) { ::walberla::gpu::checkForError((ans), __FILE__, __LINE__); } -inline void checkForError( cudaError_t code, const std::string & callerPath, const int line ) +inline void checkForError( gpuError_t code, const std::string & callerPath, const int line ) { - if(code != cudaSuccess) + if(code != gpuSuccess) { std::stringstream ss; - ss << "CUDA Error: " << code << " " << cudaGetErrorName(code) << ": " << cudaGetErrorString( code ); + ss << "GPU Error: " << code << " " << gpuGetErrorName(code) << ": " << gpuGetErrorString( code ); Abort::instance()->abort( ss.str(), callerPath, line ); } } -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/FieldAccessor.h b/src/gpu/FieldAccessor.h similarity index 98% rename from src/cuda/FieldAccessor.h rename to src/gpu/FieldAccessor.h index c2c676d1d00a709cda8428d903914c5718999832..798440d1216c2e01b322c8f7c8a88033166e6eca 100644 --- a/src/cuda/FieldAccessor.h +++ b/src/gpu/FieldAccessor.h @@ -14,19 +14,20 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file FieldAccessor.h -//! \ingroup cuda +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== #pragma once -#include <cuda_runtime.h> #include "core/DataTypes.h" +#include "gpu/GPUWrapper.h" namespace walberla { -namespace cuda { +namespace gpu +{ @@ -108,7 +109,7 @@ namespace cuda { -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/FieldAccessor3D.h b/src/gpu/FieldAccessor3D.h similarity index 94% rename from src/cuda/FieldAccessor3D.h rename to src/gpu/FieldAccessor3D.h index 411b64813e0b73531716cd09759852fabc5952e1..66e95f7242ce0c183ab261f7dcd1600a57df10bb 100644 --- a/src/cuda/FieldAccessor3D.h +++ b/src/gpu/FieldAccessor3D.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file FieldAccessor3D.h -//! \ingroup cuda +//! \ingroup gpu //! \author Paulo Carvalho <prcjunior@inf.ufpr.br> // //====================================================================================================================== @@ -23,11 +23,9 @@ #include "core/DataTypes.h" -#include <cuda_runtime.h> - - namespace walberla { -namespace cuda { +namespace gpu +{ template<typename T> @@ -43,7 +41,7 @@ namespace cuda { const dim3 & idxDim, const dim3 & blockDim ) : ptr_( ptr ), xOffset_( xOffset ), yOffset_( yOffset ), zOffset_( zOffset ), fOffset_( fOffset ), - idxDim_( idxDim ), blockDim_( blockDim ), isValidPosition_( false ) + idxDim_( idxDim ), blockDim_( blockDim ) {} __device__ __forceinline__ void set( const uint3& blockIdx, const uint3& threadIdx ) @@ -89,12 +87,12 @@ namespace cuda { uint_t fOffset_; dim3 idxDim_; dim3 blockDim_; - bool isValidPosition_; + bool isValidPosition_{ false }; }; -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/FieldAccessorXYZ.h b/src/gpu/FieldAccessorXYZ.h similarity index 96% rename from src/cuda/FieldAccessorXYZ.h rename to 
src/gpu/FieldAccessorXYZ.h index 4e43dd1996600d1f24d3249104ef289eb9d8cfde..d9046a783d3b5c073b03527332cff553edf1e99b 100644 --- a/src/cuda/FieldAccessorXYZ.h +++ b/src/gpu/FieldAccessorXYZ.h @@ -14,20 +14,19 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file FieldAccessorXYZ.h -//! \ingroup cuda +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== #pragma once - -#include <cuda_runtime.h> - #include "core/DataTypes.h" +#include "gpu/GPUWrapper.h" namespace walberla { -namespace cuda { +namespace gpu +{ template<typename T> @@ -73,7 +72,7 @@ namespace cuda { }; -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/FieldCopy.h b/src/gpu/FieldCopy.h similarity index 86% rename from src/cuda/FieldCopy.h rename to src/gpu/FieldCopy.h index 4f13fa999ffec19249183fecd2a4fe0939e2674e..13c079074899c23e338d97745967f39ee837fb3d 100644 --- a/src/cuda/FieldCopy.h +++ b/src/gpu/FieldCopy.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file FieldCopy.h -//! \ingroup cuda +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== @@ -31,10 +31,10 @@ #include "core/Abort.h" #include "core/logging/Logging.h" -#include <cuda_runtime.h> namespace walberla { -namespace cuda { +namespace gpu +{ template<typename DstType, typename SrcType> @@ -76,12 +76,12 @@ namespace cuda { template<typename T, uint_t fs> - void fieldCpy( cuda::GPUField<T> & dst, const field::Field<T,fs> & src ); + void fieldCpy(gpu::GPUField<T> & dst, const field::Field<T,fs> & src ); template<typename T, uint_t fs> - void fieldCpy( field::Field<T,fs> & dst, const cuda::GPUField<T> & src ); + void fieldCpy( field::Field<T,fs> & dst, const gpu::GPUField<T> & src ); @@ -96,14 +96,14 @@ namespace cuda { template<typename T, uint_t fs> - void fieldCpy( cuda::GPUField<T> & dst, const field::Field<T,fs> & src ) + void fieldCpy(gpu::GPUField<T> & dst, const field::Field<T,fs> & src ) { - cudaMemcpy3DParms p; + gpuMemcpy3DParms p; memset( &p, 0, sizeof(p) ); if ( dst.layout() != src.layout() ) { - WALBERLA_ABORT( "Cannot copy fields with different layout" ); + WALBERLA_ABORT( "Cannot copy fields with different layout" ) } bool canCopy = ( src.layout() == fzyx && @@ -119,12 +119,12 @@ namespace cuda { dst.fSize() == src.fSize() ); if ( !canCopy ) { - WALBERLA_ABORT("Field have to have the same size "); + WALBERLA_ABORT("Field have to have the same size ") } if ( dst.layout() == fzyx ) { - p.srcPtr = make_cudaPitchedPtr( (void*)(src.data()), // pointer + p.srcPtr = make_gpuPitchedPtr( (void*)(src.data()), // pointer sizeof(T) * src.xAllocSize(), // pitch src.xAllocSize(), // inner dimension size src.yAllocSize() ); // next outer dimension size @@ -135,7 +135,7 @@ namespace cuda { } else { - p.srcPtr = make_cudaPitchedPtr( (void*)(src.data()), // pointer + p.srcPtr = make_gpuPitchedPtr( (void*)(src.data()), // pointer sizeof(T) * src.fAllocSize(), // pitch src.fAllocSize(), 
// inner dimension size src.xAllocSize() ); // next outer dimension size @@ -146,20 +146,20 @@ namespace cuda { } p.dstPtr = dst.pitchedPtr(); - p.kind = cudaMemcpyHostToDevice; - WALBERLA_CUDA_CHECK( cudaMemcpy3D( &p ) ); + p.kind = gpuMemcpyHostToDevice; + WALBERLA_GPU_CHECK( gpuMemcpy3D( &p ) ) } template<typename T, uint_t fs> - void fieldCpy( field::Field<T,fs> & dst, const cuda::GPUField<T> & src ) + void fieldCpy( field::Field<T,fs> & dst, const gpu::GPUField<T> & src ) { - cudaMemcpy3DParms p; + gpuMemcpy3DParms p; memset( &p, 0, sizeof(p) ); if ( dst.layout() != src.layout() ) { - WALBERLA_ABORT( "Cannot copy fields with different layout" ); + WALBERLA_ABORT( "Cannot copy fields with different layout" ) } bool canCopy = ( src.layout() == fzyx && @@ -175,12 +175,12 @@ namespace cuda { dst.fSize() == src.fSize() ); if ( !canCopy ) { - WALBERLA_ABORT("Field have to have the same size "); + WALBERLA_ABORT("Field have to have the same size ") } if ( dst.layout() == fzyx ) { - p.dstPtr = make_cudaPitchedPtr( (void*)(dst.data()), // pointer + p.dstPtr = make_gpuPitchedPtr( (void*)(dst.data()), // pointer sizeof(T) * dst.xAllocSize(), // pitch dst.xAllocSize(), // inner dimension size dst.yAllocSize() ); // next outer dimension size @@ -191,7 +191,7 @@ namespace cuda { } else { - p.dstPtr = make_cudaPitchedPtr( (void*)(dst.data()), // pointer + p.dstPtr = make_gpuPitchedPtr( (void*)(dst.data()), // pointer sizeof(T) * dst.fAllocSize(), // pitch dst.fAllocSize(), // inner dimension size dst.xAllocSize() ); // next outer dimension size @@ -202,12 +202,12 @@ namespace cuda { } p.srcPtr = src.pitchedPtr(); - p.kind = cudaMemcpyDeviceToHost; - WALBERLA_CUDA_CHECK( cudaMemcpy3D( &p ) ); + p.kind = gpuMemcpyDeviceToHost; + WALBERLA_GPU_CHECK( gpuMemcpy3D( &p ) ) } -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/gpu/FieldIndexing.h b/src/gpu/FieldIndexing.h new file mode 100644 index 
0000000000000000000000000000000000000000..c11953e4f589834073c346c930a39ce227a53a78 --- /dev/null +++ b/src/gpu/FieldIndexing.h @@ -0,0 +1,93 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file FieldIndexing.h +//! \ingroup gpu +//! \author Martin Bauer <martin.bauer@fau.de> +//! \brief Indexing Scheme that executes all elements of inner coordinate within on thread block +// +//====================================================================================================================== + +#pragma once + +#include "stencil/Directions.h" + +#include "FieldAccessor.h" + +namespace walberla +{ +namespace cell +{ +class CellInterval; +} +} // namespace walberla + +namespace walberla +{ +namespace gpu +{ + +// Forward Declarations +template< typename T > +class GPUField; + +template< typename T > +class FieldIndexing +{ + public: + //** Kernel call ****************************************************************************************** + /*! 
\name Kernel call */ + //@{ + dim3 blockDim() const { return blockDim_; } + dim3 gridDim() const { return gridDim_; } + + const FieldAccessor< T >& gpuAccess() const { return gpuAccess_; } + //@} + //**************************************************************************************************************** + + //** Creation ********************************************************************************************* + /*! \name Creation */ + //@{ + static FieldIndexing< T > interval(const GPUField< T >& f, const cell::CellInterval& ci, int fBegin = 0, + int fEnd = 1); + + static FieldIndexing< T > xyz(const GPUField< T >& f); + static FieldIndexing< T > withGhostLayerXYZ(const GPUField< T >& f, uint_t numGhostLayers); + static FieldIndexing< T > ghostLayerOnlyXYZ(const GPUField< T >& f, uint_t thickness, stencil::Direction dir, + bool fullSlice = false); + static FieldIndexing< T > sliceBeforeGhostLayerXYZ(const GPUField< T >& f, uint_t thickness, stencil::Direction dir, + bool fullSlice = false); + static FieldIndexing< T > sliceXYZ(const GPUField< T >& f, cell_idx_t distance, uint_t thickness, + stencil::Direction dir, bool fullSlice = false); + + static FieldIndexing< T > allInner(const GPUField< T >& f); + static FieldIndexing< T > allWithGhostLayer(const GPUField< T >& f); + static FieldIndexing< T > all(const GPUField< T >& f, const cell::CellInterval& ci); + //@} + //**************************************************************************************************************** + + protected: + FieldIndexing(const GPUField< T >& field, dim3 _blockDim, dim3 _gridDim, const FieldAccessor< T > _gpuAccess); + + const GPUField< T >& field_; + dim3 blockDim_; + dim3 gridDim_; + FieldAccessor< T > gpuAccess_; +}; + +} // namespace gpu +} // namespace walberla + +#include "FieldIndexing.impl.h" diff --git a/src/cuda/FieldIndexing.impl.h b/src/gpu/FieldIndexing.impl.h similarity index 95% rename from src/cuda/FieldIndexing.impl.h rename to 
src/gpu/FieldIndexing.impl.h index 7acafcdbbb9e21bf81d9bd13511d2f348f56fc4d..922a48b9b8cb6347c025db79a496c972d0e62377 100644 --- a/src/cuda/FieldIndexing.impl.h +++ b/src/gpu/FieldIndexing.impl.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file FieldIndexing.impl.h -//! \ingroup cuda +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== @@ -26,13 +26,12 @@ #include "core/debug/Debug.h" #include "field/Layout.h" -#include <cuda_runtime.h> - #include <limits> #include <cmath> namespace walberla { -namespace cuda { +namespace gpu +{ template< typename T> FieldIndexing<T>::FieldIndexing ( const GPUField<T> & field, @@ -45,17 +44,17 @@ FieldIndexing<T>::FieldIndexing ( const GPUField<T> & field, { WALBERLA_DEBUG_SECTION() { - cudaDeviceProp prop; + gpuDeviceProp prop; int count; - cudaGetDeviceCount(&count); + gpuGetDeviceCount(&count); int threadsPerBlock = std::numeric_limits<int>::max(); for (int i = 0; i < count; i++) { - cudaGetDeviceProperties(&prop, i); + gpuGetDeviceProperties(&prop, i); threadsPerBlock = std::min( prop.maxThreadsPerBlock, threadsPerBlock ); } WALBERLA_ASSERT_LESS( int_c( blockDim_.x ), threadsPerBlock, "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " << - "than the maximal thread count per CUDA block." ); + "than the maximal thread count per GPU block." 
) } } @@ -91,7 +90,10 @@ void shiftCoordinatesWhileFastestCoordHasSizeOne( typename FieldAccessor<T>::Ind template< typename T> FieldIndexing<T> FieldIndexing<T>::interval ( const GPUField<T> & f, const CellInterval & ci, int fBegin, int fEnd ) { - uint_t xOffset, yOffset, zOffset, fOffset; + uint_t xOffset; + uint_t yOffset; + uint_t zOffset; + uint_t fOffset; if ( f.layout() == field::zyxf ) { @@ -222,7 +224,7 @@ FieldIndexing<T> FieldIndexing<T>::all ( const GPUField<T> & f, const cell::Cell -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/FieldIndexing3D.h b/src/gpu/FieldIndexing3D.h similarity index 98% rename from src/cuda/FieldIndexing3D.h rename to src/gpu/FieldIndexing3D.h index ba93f83c6148de01e22c8d8284a441228c98e8be..dc6776cbbd3f7ea2824e8200aaf20fc8fe165eb6 100644 --- a/src/cuda/FieldIndexing3D.h +++ b/src/gpu/FieldIndexing3D.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file FieldIndexing3D.h -//! \ingroup cuda +//! \ingroup gpu //! \author Paulo Carvalho <prcjunior@inf.ufpr.br> //! 
\brief Indexing Scheme that executes all elements of inner coordinate within on thread block // @@ -23,14 +23,13 @@ #pragma once #include "FieldAccessor3D.h" - #include "stencil/Directions.h" -#include <cuda_runtime.h> namespace walberla { namespace cell { class CellInterval; } } namespace walberla { -namespace cuda { +namespace gpu +{ // Forward Declarations template< typename T> class GPUField; @@ -99,7 +98,7 @@ namespace cuda { }; -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/FieldIndexing3D.impl.h b/src/gpu/FieldIndexing3D.impl.h similarity index 90% rename from src/cuda/FieldIndexing3D.impl.h rename to src/gpu/FieldIndexing3D.impl.h index 8a3a0f230388f41fcef091af4b4eef9013de2f39..a8cc922cfc6d4ed1975181e7f7a89302b39dbd6d 100644 --- a/src/cuda/FieldIndexing3D.impl.h +++ b/src/gpu/FieldIndexing3D.impl.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file FieldIndexing3D.impl.h -//! \ingroup cuda +//! \ingroup gpu //! \author Paulo Carvalho <prcjunior@inf.ufpr.br> // //====================================================================================================================== @@ -27,13 +27,12 @@ #include "core/logging/Logging.h" #include "field/Layout.h" -#include <cuda_runtime.h> - #include <limits> #include <cmath> namespace walberla { -namespace cuda { +namespace gpu +{ // Returns ( a % b != 0 ) ? 
( a / b + 1 ) : ( a / b ) inline unsigned int iDivUp( unsigned int a, unsigned int b ) { return ( a + b - 1 ) / b; } @@ -53,26 +52,29 @@ FieldIndexing3D<T>::FieldIndexing3D( const GPUField<T> & field, { WALBERLA_DEBUG_SECTION() { - cudaDeviceProp prop; + gpuDeviceProp prop; int count; - cudaGetDeviceCount(&count); + gpuGetDeviceCount(&count); int threadsPerBlock = std::numeric_limits<int>::max(); for (int i = 0; i < count; i++) { - cudaGetDeviceProperties(&prop, i); + gpuGetDeviceProperties(&prop, i); threadsPerBlock = std::min( prop.maxThreadsPerBlock, threadsPerBlock ); } WALBERLA_ASSERT_LESS( int_c( blockDim_.x ), threadsPerBlock, "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " << - "than the maximal thread count per CUDA block." ); + "than the maximal thread count per GPU block." ) } } template< typename T> FieldIndexing3D<T> FieldIndexing3D<T>::interval( const GPUField<T> & f, const CellInterval & ci ) { - uint_t xOffset, yOffset, zOffset, fOffset; + uint_t xOffset; + uint_t yOffset; + uint_t zOffset; + uint_t fOffset; - WALBERLA_ASSERT( f.layout() == field::fzyx ); + WALBERLA_ASSERT( f.layout() == field::fzyx ) xOffset = sizeof(T); yOffset = f.pitchedPtr().pitch; @@ -89,9 +91,9 @@ FieldIndexing3D<T> FieldIndexing3D<T>::interval( const GPUField<T> & f, const Ce dim3 idxDim( (unsigned int)ci.xSize(), (unsigned int)ci.ySize(), (unsigned int)ci.zSize() ); - unsigned int bx = std::min( preferredBlockDim_.x, idxDim.x ); - unsigned int by = std::min( preferredBlockDim_.y, idxDim.y ); - unsigned int bz = std::min( preferredBlockDim_.z, idxDim.z ); + unsigned int const bx = std::min( preferredBlockDim_.x, idxDim.x ); + unsigned int const by = std::min( preferredBlockDim_.y, idxDim.y ); + unsigned int const bz = std::min( preferredBlockDim_.z, idxDim.z ); dim3 gridDim( iDivUp( idxDim.x, bx ), iDivUp( idxDim.y, by ), iDivUp( idxDim.z, bz ) ); @@ -160,7 +162,7 @@ FieldIndexing3D<T> FieldIndexing3D<T>::intervalXYZ( const GPUField<T> 
& f, const -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/FieldIndexingXYZ.h b/src/gpu/FieldIndexingXYZ.h similarity index 97% rename from src/cuda/FieldIndexingXYZ.h rename to src/gpu/FieldIndexingXYZ.h index 18a6e2645b15ad296c984674ee565b3ad7e2eb89..b6da50d5d160bd37ca266d3f849e096f2383130a 100644 --- a/src/cuda/FieldIndexingXYZ.h +++ b/src/gpu/FieldIndexingXYZ.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file FieldIndexingXYZ.h -//! \ingroup cuda +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== @@ -22,14 +22,13 @@ #pragma once #include "FieldAccessorXYZ.h" -#include <cuda_runtime.h> - namespace walberla { namespace cell { class CellInterval; } } namespace walberla { -namespace cuda { +namespace gpu +{ // Forward Declarations template< typename T> class GPUField; @@ -73,7 +72,7 @@ template< typename T> class GPUField; }; -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/FieldIndexingXYZ.impl.h b/src/gpu/FieldIndexingXYZ.impl.h similarity index 92% rename from src/cuda/FieldIndexingXYZ.impl.h rename to src/gpu/FieldIndexingXYZ.impl.h index 6053e4ee8577e0687b4443f60a9bcfd1c4c1bf9c..9ec8b6c0852d4198c8f76b7380d6848dd361a49e 100644 --- a/src/cuda/FieldIndexingXYZ.impl.h +++ b/src/gpu/FieldIndexingXYZ.impl.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file FieldIndexingXYZ.impl.h -//! \ingroup cuda +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== @@ -26,7 +26,8 @@ namespace walberla { -namespace cuda { +namespace gpu +{ template< typename T> @@ -40,24 +41,27 @@ FieldIndexingXYZ<T>::FieldIndexingXYZ ( const GPUField<T> & field, { WALBERLA_DEBUG_SECTION() { - cudaDeviceProp prop; + gpuDeviceProp prop; int count; - cudaGetDeviceCount(&count); + gpuGetDeviceCount(&count); int threadsPerBlock = std::numeric_limits<int>::max(); for (int i = 0; i < count; i++) { - cudaGetDeviceProperties(&prop, i); + gpuGetDeviceProperties(&prop, i); threadsPerBlock = std::min( prop.maxThreadsPerBlock, threadsPerBlock ); } WALBERLA_ASSERT_LESS( int_c( blockDim_.x ), threadsPerBlock, "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " << - "than the maximal thread count per CUDA block." ); + "than the maximal thread count per GPU block." ) } } template< typename T> FieldIndexingXYZ<T> FieldIndexingXYZ<T>::interval ( const GPUField<T> & f, const CellInterval & ci ) { - size_t xOffset, yOffset, zOffset, fOffset; + size_t xOffset; + size_t yOffset; + size_t zOffset; + size_t fOffset; if ( f.layout() == field::zyxf ) { @@ -114,7 +118,7 @@ FieldIndexingXYZ<T> FieldIndexingXYZ<T>::withGhostLayerXYZ( const GPUField<T> & -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/gpu/GPUCopy.cpp b/src/gpu/GPUCopy.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7406048d916885cddd88abad3d6feff4ef1818f1 --- /dev/null +++ b/src/gpu/GPUCopy.cpp @@ -0,0 +1,397 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file GPUCopy.cpp +//! \ingroup gpu +//! \author Paulo Carvalho <prcjunior@inf.ufpr.br> +//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br> +//! \brief Copy routines of 4D intervals involving GPU buffers. +// +//====================================================================================================================== + +#include "core/debug/Debug.h" + +#include "GPUCopy.h" + +#include <cstring> + + +namespace walberla { +namespace gpu +{ + +void copyDevToDevFZYX( const gpuPitchedPtr& dst, const gpuPitchedPtr& src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + gpuStream_t copyStream ) +{ + const uint_t & Nx = std::get<0>(intervalSize); + const uint_t & Ny = std::get<1>(intervalSize); + const uint_t & Nz = std::get<2>(intervalSize); + const uint_t & Nf = std::get<3>(intervalSize); + + const uint_t & srcX = std::get<0>(srcOffset); + const uint_t & srcY = std::get<1>(srcOffset); + const uint_t & srcZ = std::get<2>(srcOffset); + const uint_t & srcF = std::get<3>(srcOffset); + + const uint_t & dstX = std::get<0>(dstOffset); + const uint_t & dstY = std::get<1>(dstOffset); + const uint_t & dstZ = 
std::get<2>(dstOffset); + const uint_t & dstF = std::get<3>(dstOffset); + + auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) { + WALBERLA_ASSERT( fIntervalSize == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) ); + + gpuMemcpy3DParms p; + std::memset( &p, 0, sizeof(p) ); + + p.srcPos = make_gpuPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ ); + p.srcPtr = make_gpuPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); + + p.dstPos = make_gpuPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ ); + p.dstPtr = make_gpuPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); + + p.extent = make_gpuExtent( Nx * typeSize, Ny, Nz * fIntervalSize ); + p.kind = gpuMemcpyDeviceToDevice; + + if ( copyStream == nullptr ) + { + WALBERLA_GPU_CHECK( gpuMemcpy3D(&p) ) + } + else + { + // Using hipMemcpy3DAsync requires page-locked memory on the host! + WALBERLA_GPU_CHECK( gpuMemcpy3DAsync(&p, copyStream) ) + } + }; + + if( Nf == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) ) + { + copyFunctor( dstF, srcF, Nf ); + } + else + { + for( uint_t f = 0; f < Nf; ++f ) + { + copyFunctor( dstF + f, srcF + f, uint_c(1) ); + } + } +} + + +void copyDevToDevZYXF( const gpuPitchedPtr& dst, const gpuPitchedPtr& src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + gpuStream_t copyStream ) +{ + const uint_t & Nx = std::get<0>(intervalSize); + const uint_t & Ny = std::get<1>(intervalSize); + const uint_t & Nz = std::get<2>(intervalSize); + const uint_t & Nf = std::get<3>(intervalSize); + + const uint_t & srcX = std::get<0>(srcOffset); + const uint_t & srcY = std::get<1>(srcOffset); + const uint_t & srcZ = std::get<2>(srcOffset); + const uint_t & srcF = std::get<3>(srcOffset); + + const uint_t & dstX = std::get<0>(dstOffset); + const 
uint_t & dstY = std::get<1>(dstOffset); + const uint_t & dstZ = std::get<2>(dstOffset); + const uint_t & dstF = std::get<3>(dstOffset); + + auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) { + gpuMemcpy3DParms p; + std::memset( &p, 0, sizeof(p) ); + + p.srcPos = make_gpuPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY ); + p.srcPtr = make_gpuPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); + + p.dstPos = make_gpuPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY ); + p.dstPtr = make_gpuPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); + + p.extent = make_gpuExtent( Nf * typeSize, Nx, Ny * zIntervalSize ); + p.kind = gpuMemcpyDeviceToDevice; + + if ( copyStream == nullptr ) + { + WALBERLA_GPU_CHECK( gpuMemcpy3D(&p) ) + } + else + { + WALBERLA_GPU_CHECK( gpuMemcpy3DAsync(&p, copyStream) ) + } + }; + + if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) ) + { + copyFunctor( dstZ, srcZ, Nz ); + } + else + { + for( uint_t z = 0; z < Nz; ++z ) + { + copyFunctor( dstZ + z, srcZ + z, 1 ); + } + } +} + + +void copyHostToDevFZYX( const gpuPitchedPtr& dst, unsigned char* src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + gpuStream_t copyStream ) +{ + const uint_t & Nx = std::get<0>(intervalSize); + const uint_t & Ny = std::get<1>(intervalSize); + const uint_t & Nz = std::get<2>(intervalSize); + const uint_t & Nf = std::get<3>(intervalSize); + + const uint_t & srcX = std::get<0>(srcOffset); + const uint_t & srcY = std::get<1>(srcOffset); + const uint_t & srcZ = std::get<2>(srcOffset); + const uint_t & srcF = std::get<3>(srcOffset); + + const uint_t & dstX = std::get<0>(dstOffset); + const uint_t & dstY = std::get<1>(dstOffset); + const uint_t & dstZ = std::get<2>(dstOffset); + const uint_t & dstF = 
std::get<3>(dstOffset); + + auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) { + gpuMemcpy3DParms p; + std::memset( &p, 0, sizeof(p) ); + + p.srcPos = make_gpuPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ ); + p.srcPtr = make_gpuPitchedPtr( src, Nx * typeSize, Nx * typeSize, Ny ); + + p.dstPos = make_gpuPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ ); + p.dstPtr = make_gpuPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); + + p.extent = make_gpuExtent( Nx * typeSize, Ny, Nz * fIntervalSize ); + p.kind = gpuMemcpyHostToDevice; + + if (copyStream == nullptr) + { + WALBERLA_GPU_CHECK( gpuMemcpy3D(&p) ) + } + else + { + // Using gpuMemcpy3DAsync requires page-locked memory on the host! + WALBERLA_GPU_CHECK( gpuMemcpy3DAsync(&p, copyStream) ) + } + }; + + if ( Nf == 1 || ( Nz == dstAllocSizeZ ) ) + { + copyFunctor( dstF, srcF, Nf ); + } + else + { + for( uint_t f = 0; f < Nf; ++f ) + { + copyFunctor( dstF + f, srcF + f, uint_c(1) ); + } + } +} + +void copyHostToDevZYXF( const gpuPitchedPtr& dst, unsigned char* src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + gpuStream_t copyStream ) +{ + const uint_t & Nx = std::get<0>(intervalSize); + const uint_t & Ny = std::get<1>(intervalSize); + const uint_t & Nz = std::get<2>(intervalSize); + const uint_t & Nf = std::get<3>(intervalSize); + + const uint_t & srcX = std::get<0>(srcOffset); + const uint_t & srcY = std::get<1>(srcOffset); + const uint_t & srcZ = std::get<2>(srcOffset); + const uint_t & srcF = std::get<3>(srcOffset); + + const uint_t & dstX = std::get<0>(dstOffset); + const uint_t & dstY = std::get<1>(dstOffset); + const uint_t & dstZ = std::get<2>(dstOffset); + const uint_t & dstF = std::get<3>(dstOffset); + + auto copyFunctor = [&](uint_t dstCoordZ, 
uint_t srcCoordZ, uint_t zIntervalSize) { + gpuMemcpy3DParms p; + std::memset( &p, 0, sizeof(p) ); + + p.srcPos = make_gpuPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY ); + p.srcPtr = make_gpuPitchedPtr( src, Nf * typeSize, Nf * typeSize, Nx ); + + p.dstPos = make_gpuPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY ); + p.dstPtr = make_gpuPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); + + p.extent = make_gpuExtent( Nf * typeSize, Nx, Ny * zIntervalSize ); + p.kind = gpuMemcpyHostToDevice; + + if ( copyStream == nullptr ) + { + WALBERLA_GPU_CHECK( gpuMemcpy3D(&p) ) + } + else + { + // Using gpuMemcpy3DAsync requires page-locked memory on the host! + WALBERLA_GPU_CHECK( gpuMemcpy3DAsync(&p, copyStream) ) + } + }; + + if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) ) + { + copyFunctor( dstZ, srcZ, Nz ); + } + else + { + for( uint_t z = 0; z < Nz; ++z ) + { + copyFunctor( dstZ + z, srcZ + z, 1 ); + } + } +} + + +void copyDevToHostFZYX( unsigned char* dst, const gpuPitchedPtr& src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + gpuStream_t copyStream ) +{ + const uint_t & Nx = std::get<0>(intervalSize); + const uint_t & Ny = std::get<1>(intervalSize); + const uint_t & Nz = std::get<2>(intervalSize); + const uint_t & Nf = std::get<3>(intervalSize); + + const uint_t & srcX = std::get<0>(srcOffset); + const uint_t & srcY = std::get<1>(srcOffset); + const uint_t & srcZ = std::get<2>(srcOffset); + const uint_t & srcF = std::get<3>(srcOffset); + + const uint_t & dstX = std::get<0>(dstOffset); + const uint_t & dstY = std::get<1>(dstOffset); + const uint_t & dstZ = std::get<2>(dstOffset); + const uint_t & dstF = std::get<3>(dstOffset); + + auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) { + 
gpuMemcpy3DParms p; + std::memset( &p, 0, sizeof(p) ); + + p.srcPos = make_gpuPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ ); + p.srcPtr = make_gpuPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); + + p.dstPos = make_gpuPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ ); + p.dstPtr = make_gpuPitchedPtr( dst, Nx * typeSize, Nx * typeSize, Ny ); + + p.extent = make_gpuExtent( Nx * typeSize, Ny, Nz * fIntervalSize ); + p.kind = gpuMemcpyDeviceToHost; + + if ( copyStream == nullptr ) + { + WALBERLA_GPU_CHECK( gpuMemcpy3D(&p) ); + } + else + { + // Using gpuMemcpy3DAsync requires page-locked memory on the host! + WALBERLA_GPU_CHECK( gpuMemcpy3DAsync(&p, copyStream) ) + } + }; + + if( Nf == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) ) + { + copyFunctor( dstF, srcF, Nf ); + } + else + { + for( uint_t f = 0; f < Nf; ++f ) + { + copyFunctor( dstF + f, srcF + f, 1 ); + } + } +} + + +void copyDevToHostZYXF( unsigned char* dst, const gpuPitchedPtr& src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + gpuStream_t copyStream ) +{ + const uint_t & Nx = std::get<0>(intervalSize); + const uint_t & Ny = std::get<1>(intervalSize); + const uint_t & Nz = std::get<2>(intervalSize); + const uint_t & Nf = std::get<3>(intervalSize); + + const uint_t & srcX = std::get<0>(srcOffset); + const uint_t & srcY = std::get<1>(srcOffset); + const uint_t & srcZ = std::get<2>(srcOffset); + const uint_t & srcF = std::get<3>(srcOffset); + + const uint_t & dstX = std::get<0>(dstOffset); + const uint_t & dstY = std::get<1>(dstOffset); + const uint_t & dstZ = std::get<2>(dstOffset); + const uint_t & dstF = std::get<3>(dstOffset); + + auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) { + gpuMemcpy3DParms p; + std::memset( &p, 0, sizeof(p) 
); + + p.srcPos = make_gpuPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY ); + p.srcPtr = make_gpuPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); + + p.dstPos = make_gpuPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY ); + p.dstPtr = make_gpuPitchedPtr( dst, Nf * typeSize, Nf * typeSize, Nx ); + + p.extent = make_gpuExtent( Nf * typeSize, Nx, Ny * zIntervalSize ); + + p.kind = gpuMemcpyDeviceToHost; + + if ( copyStream == nullptr ) + { + WALBERLA_GPU_CHECK( gpuMemcpy3D(&p) ) + } + else + { + // Using gpuMemcpy3DAsync requires page-locked memory on the host! + WALBERLA_GPU_CHECK( gpuMemcpy3DAsync(&p, copyStream) ) + } + }; + + + if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) ) + { + copyFunctor( dstZ, srcZ, Nz ); + } + else + { + for( uint_t z = 0; z < Nz; ++z ) + { + copyFunctor( dstZ + z, srcZ + z, 1 ); + } + } +} + +} // namespace gpu +} // namespace walberla diff --git a/src/cuda/GPUCopy.h b/src/gpu/GPUCopy.h similarity index 86% rename from src/cuda/GPUCopy.h rename to src/gpu/GPUCopy.h index 775d705b384520bbecfae4f0a347f2e541731902..e74d04ddb65105b1a65ce80b013ea6b6ea1df430 100644 --- a/src/cuda/GPUCopy.h +++ b/src/gpu/GPUCopy.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file GPUCopy.h -//! \ingroup cuda +//! \ingroup gpu //! \author Paulo Carvalho <prcjunior@inf.ufpr.br> //! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br> //! \brief Copy routines of 4D intervals involving GPU buffers. 
@@ -25,12 +25,14 @@ #include "core/DataTypes.h" +#include "ErrorChecking.h" + #include <tuple> -#include <cuda_runtime.h> namespace walberla { -namespace cuda { +namespace gpu +{ //**************************************************************************************************************************** @@ -44,14 +46,14 @@ namespace cuda { * \param srcAllocSizeZ allocation size in z direction of the source buffer * \param typeSize size of an f element * \param intervalSize interval size - * \param copyStream CUDA stream, if not NULL copy operations will be performed asynchronously + * \param copyStream CUDA/HIP stream, if not NULL copy operations will be performed asynchronously *****************************************************************************************************************************/ -void copyDevToDevFZYX( const cudaPitchedPtr& dst, const cudaPitchedPtr& src, +void copyDevToDevFZYX( const gpuPitchedPtr& dst, const gpuPitchedPtr& src, std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ); + gpuStream_t copyStream ); //**************************************************************************************************************************** /*! Copy a 4D interval of a device buffer to another device buffer with zyxf memory layout. 
@@ -64,58 +66,58 @@ void copyDevToDevFZYX( const cudaPitchedPtr& dst, const cudaPitchedPtr& src, * \param srcAllocSizeY allocation size in y direction of the source buffer * \param typeSize size of an f element * \param intervalSize interval size - * \param copyStream CUDA stream, if not NULL copy operations will be performed asynchronously + * \param copyStream CUDA/HIP stream, if not NULL copy operations will be performed asynchronously *****************************************************************************************************************************/ -void copyDevToDevZYXF( const cudaPitchedPtr& dst, const cudaPitchedPtr& src, +void copyDevToDevZYXF( const gpuPitchedPtr& dst, const gpuPitchedPtr& src, std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ); + gpuStream_t copyStream ); //******************************************************************************************************************* /*! Copy a 4D interval of a host buffer to a device buffer with fzyx memory layout. See \ref copyDevToDevFZYX() for * parameter information. *******************************************************************************************************************/ -void copyHostToDevFZYX( const cudaPitchedPtr& dst, unsigned char* src, +void copyHostToDevFZYX( const gpuPitchedPtr& dst, unsigned char* src, std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ); + gpuStream_t copyStream ); //******************************************************************************************************************* /*! 
Copy a 4D interval of a host buffer to a device buffer with zyxf memory layout. See \ref copyDevToDevZYXF() for * parameter information. *******************************************************************************************************************/ -void copyHostToDevZYXF( const cudaPitchedPtr& dst, unsigned char* src, +void copyHostToDevZYXF( const gpuPitchedPtr& dst, unsigned char* src, std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ); + gpuStream_t copyStream ); //******************************************************************************************************************* /*! Copy a 4D interval of a device buffer to a host buffer with fzyx memory layout. See \ref copyDevToDevFZYX() for * parameter information. *******************************************************************************************************************/ -void copyDevToHostFZYX( unsigned char* dst, const cudaPitchedPtr& src, +void copyDevToHostFZYX( unsigned char* dst, const gpuPitchedPtr& src, std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ); + gpuStream_t copyStream ); //******************************************************************************************************************* /*! Copy a 4D interval of a device buffer to a host buffer with zyxf memory layout. See \ref copyDevToDevZYXF() for * parameter information. 
*******************************************************************************************************************/ -void copyDevToHostZYXF( unsigned char* dst, const cudaPitchedPtr& src, +void copyDevToHostZYXF( unsigned char* dst, const gpuPitchedPtr& src, std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ); + gpuStream_t copyStream ); -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/GPUField.h b/src/gpu/GPUField.h similarity index 90% rename from src/cuda/GPUField.h rename to src/gpu/GPUField.h index 9b11ef62cd073249af65d244c5e7b7d9d289f73e..a286b8dca2cca9a49d2a93455e865744efe649b5 100755 --- a/src/cuda/GPUField.h +++ b/src/gpu/GPUField.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file GPUField.h -//! \ingroup moduleName +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== @@ -23,15 +23,16 @@ #include "core/DataTypes.h" #include "core/cell/CellInterval.h" -#include "field/Layout.h" -#include "stencil/Directions.h" -#include <cuda_runtime.h> +#include "field/Layout.h" +#include "stencil/Directions.h" +#include "gpu/GPUWrapper.h" namespace walberla { -namespace cuda { +namespace gpu +{ using field::Layout; using field::fzyx; @@ -41,18 +42,18 @@ namespace cuda { //******************************************************************************************************************* /*! 
GhostLayerField stored on a CUDA GPU * - * Basically a wrapper around a CUDA device pointer together with size information about the field + * Basically a wrapper around a CUDA/HIP device pointer together with size information about the field * i.e. sizes in x,y,z,f directions and number of ghost layers. * - * Internally represented by a cudaPitchedPtr which is allocated with cudaMalloc3D to take padding of the + * Internally represented by a gpuPitchedPtr which is allocated with gpuMalloc3D to take padding of the * innermost coordinate into account. * * Supports Array-of-Structures (AoS,zyxf) layout and Structure-of-Arrays (SoA, fzyx) layout, in a similar way * to field::Field * - * To work with the GPUField look at the cuda::fieldCpy functions to transfer a field::Field to a cuda::GPUField + * To work with the GPUField look at the gpu::fieldCpy functions to transfer a field::Field to a gpu::GPUField * and vice versa. - * When writing CUDA kernels for GPUFields have a look at the FieldIndexing and FieldAccessor concepts. + * When writing device kernels for GPUFields have a look at the FieldIndexing and FieldAccessor concepts. * These simplify the "iteration" i.e. indexing of cells in GPUFields. 
*/ //******************************************************************************************************************* @@ -71,7 +72,7 @@ namespace cuda { bool isPitchedMem() const { return usePitchedMem_; } - cudaPitchedPtr pitchedPtr() const { return pitchedPtr_; } + gpuPitchedPtr pitchedPtr() const { return pitchedPtr_; } inline uint_t xSize() const { return xSize_; } @@ -140,7 +141,7 @@ namespace cuda { const T * dataAt(cell_idx_t x, cell_idx_t y, cell_idx_t z, cell_idx_t f) const; protected: - cudaPitchedPtr pitchedPtr_; + gpuPitchedPtr pitchedPtr_; uint_t nrOfGhostLayers_; uint_t xSize_; uint_t ySize_; @@ -154,7 +155,7 @@ namespace cuda { }; -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/GPUField.impl.h b/src/gpu/GPUField.impl.h similarity index 88% rename from src/cuda/GPUField.impl.h rename to src/gpu/GPUField.impl.h index 5dd3e58409c95e4163e0b086832f69ab11cac136..221440f5c953485f7bf45b30730980d36837cf3a 100644 --- a/src/cuda/GPUField.impl.h +++ b/src/gpu/GPUField.impl.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file GPUField.impl.h -//! \ingroup cuda +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== @@ -25,7 +25,8 @@ #include "core/logging/Logging.h" namespace walberla { -namespace cuda { +namespace gpu +{ template<typename T> @@ -35,7 +36,7 @@ GPUField<T>::GPUField( uint_t _xSize, uint_t _ySize, uint_t _zSize, uint_t _fSiz xSize_( _xSize), ySize_( _ySize ), zSize_( _zSize ), fSize_( _fSize ), layout_( _layout ), usePitchedMem_( usePitchedMem ) { - cudaExtent extent; + gpuExtent extent; if ( layout_ == zyxf ) { extent.width = _fSize * sizeof(T); @@ -55,19 +56,18 @@ GPUField<T>::GPUField( uint_t _xSize, uint_t _ySize, uint_t _zSize, uint_t _fSiz const size_t alignment = 256; void * mem = allocate_pitched_with_offset( pitch, extent.width, extent.height * extent.depth, alignment, sizeof(T) * nrOfGhostLayers_ ); - WALBERLA_ASSERT_EQUAL( size_t((char*)(mem) + sizeof(T) * nrOfGhostLayers_ ) % alignment, 0 ); - pitchedPtr_ = make_cudaPitchedPtr( mem, pitch, extent.width, extent.height ); + WALBERLA_ASSERT_EQUAL( size_t((char*)(mem) + sizeof(T) * nrOfGhostLayers_ ) % alignment, 0 ) + pitchedPtr_ = make_gpuPitchedPtr( mem, pitch, extent.width, extent.height ); } else { - pitchedPtr_ = make_cudaPitchedPtr( NULL, extent.width, extent.width, extent.height ); - WALBERLA_CUDA_CHECK ( cudaMalloc( &pitchedPtr_.ptr, extent.width * extent.height * extent.depth ) ); + pitchedPtr_ = make_gpuPitchedPtr( nullptr, extent.width, extent.width, extent.height ); + WALBERLA_GPU_CHECK ( gpuMalloc( &pitchedPtr_.ptr, extent.width * extent.height * extent.depth ) ) } - // allocation size is stored in pitched pointer - // pitched pointer stores the amount of padded region in bytes - // but we keep track of the size in #elements - WALBERLA_ASSERT_EQUAL( pitchedPtr_.pitch % sizeof(T), 0 ); + // allocation size is stored in pitched pointer which stores the amount of padded region in bytes + // we keep track of the size in #elements 
+ WALBERLA_ASSERT_EQUAL( pitchedPtr_.pitch % sizeof(T), 0 ) if ( layout_ == field::fzyx ) { xAllocSize_ = pitchedPtr_.pitch / sizeof(T); @@ -88,7 +88,7 @@ GPUField<T>::~GPUField() free_aligned_with_offset(pitchedPtr_.ptr ); else { - WALBERLA_CUDA_CHECK( cudaFree( pitchedPtr_.ptr ) ); + WALBERLA_GPU_CHECK( gpuFree( pitchedPtr_.ptr ) ) } } @@ -122,8 +122,8 @@ void GPUField<T>::getGhostRegion(stencil::Direction d, CellInterval & ci, cell_idx_c( ySize() ), cell_idx_c( zSize() )}; - WALBERLA_ASSERT_GREATER( thickness, 0 ); - WALBERLA_ASSERT_LESS_EQUAL( uint_c(thickness), nrOfGhostLayers() ); + WALBERLA_ASSERT_GREATER( thickness, 0 ) + WALBERLA_ASSERT_LESS_EQUAL( uint_c(thickness), nrOfGhostLayers() ) const cell_idx_t ghosts = cell_idx_c ( thickness ); cell_idx_t fullSliceInc = fullSlice ? cell_idx_c( nrOfGhostLayers() ) : 0; @@ -162,7 +162,7 @@ template<typename T> void GPUField<T>::getSlice(stencil::Direction d, CellInterval & ci, cell_idx_t distance, cell_idx_t thickness, bool fullSlice ) const { - WALBERLA_ASSERT_GREATER( thickness, 0 ); + WALBERLA_ASSERT_GREATER( thickness, 0 ) const cell_idx_t sizeArr [] = { cell_idx_c( xSize() ), cell_idx_c( ySize() ), @@ -197,7 +197,7 @@ inline uint_t GPUField<T>::size( uint_t coord ) const case 1: return this->ySize(); case 2: return this->zSize(); case 3: return this->fSize(); - default: WALBERLA_ASSERT(false); return 0; + default: WALBERLA_ASSERT(false) return 0; } } @@ -227,7 +227,7 @@ bool GPUField<T>::hasSameAllocSize( const GPUField<T> & other ) const //******************************************************************************************************************* /*! Creates a new GPUField that has equal size, layout and memory type as this field but has uninitialized memory. * - * \return a new FPUField that has to be freed by caller. + * \return a new GPUField that has to be freed by caller. 
*******************************************************************************************************************/ template <typename T> GPUField<T> * GPUField<T>::cloneUninitialized() const @@ -235,10 +235,10 @@ GPUField<T> * GPUField<T>::cloneUninitialized() const GPUField<T> * res = new GPUField<T>( xSize(), ySize(), zSize(), fSize(), nrOfGhostLayers(), layout(), isPitchedMem() ); - WALBERLA_ASSERT( hasSameAllocSize( *res ) ); - WALBERLA_ASSERT( hasSameSize( *res ) ); - WALBERLA_ASSERT( layout() == res->layout() ); - WALBERLA_ASSERT( isPitchedMem() == res->isPitchedMem() ); + WALBERLA_ASSERT( hasSameAllocSize( *res ) ) + WALBERLA_ASSERT( hasSameSize( *res ) ) + WALBERLA_ASSERT( layout() == res->layout() ) + WALBERLA_ASSERT( isPitchedMem() == res->isPitchedMem() ) return res; } @@ -292,16 +292,16 @@ uint_t GPUField<T>::fAllocSize() const template<typename T> void GPUField<T>::swapDataPointers( GPUField<T> & other ) { - WALBERLA_ASSERT( hasSameAllocSize( other ) ); - WALBERLA_ASSERT( hasSameSize( other ) ); - WALBERLA_ASSERT( layout() == other.layout() ); - WALBERLA_ASSERT( isPitchedMem() == other.isPitchedMem() ); + WALBERLA_ASSERT( hasSameAllocSize( other ) ) + WALBERLA_ASSERT( hasSameSize( other ) ) + WALBERLA_ASSERT( layout() == other.layout() ) + WALBERLA_ASSERT( isPitchedMem() == other.isPitchedMem() ) std::swap( pitchedPtr_, other.pitchedPtr_ ); } -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/gpu/GPUWrapper.h b/src/gpu/GPUWrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..1abbc22895d7b9d6284c0a2661bcb14ca1421aeb --- /dev/null +++ b/src/gpu/GPUWrapper.h @@ -0,0 +1,135 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file GPUWrapper.h +//! \ingroup gpu +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +// https://rocmdocs.amd.com/en/latest/Programming_Guides/CUDAAPIHIPTEXTURE.html +#if defined(WALBERLA_BUILD_WITH_CUDA) + #include <cuda_runtime.h> + + + using gpuError_t = cudaError_t; + #define gpuSuccess cudaSuccess + #define gpuGetErrorName cudaGetErrorName + #define gpuGetErrorString cudaGetErrorString + #define gpuPeekAtLastError cudaPeekAtLastError + + #define gpuMalloc cudaMalloc + #define gpuMallocHost cudaMallocHost + #define gpuHostAllocDefault cudaHostAllocDefault + #define gpuHostAlloc cudaHostAlloc + #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice + #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost + #define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice + #define gpuMemcpy cudaMemcpy + #define gpuMemcpyAsync cudaMemcpyAsync + #define gpuMemcpy3D cudaMemcpy3D + #define gpuMemcpy3DParms cudaMemcpy3DParms + #define gpuMemcpy3DAsync cudaMemcpy3DAsync + + #define make_gpuPos make_cudaPos + #define make_gpuPitchedPtr make_cudaPitchedPtr + #define gpuPitchedPtr cudaPitchedPtr + #define make_gpuExtent make_cudaExtent + using gpuExtent = cudaExtent; + + #define gpuFree 
cudaFree + #define gpuFreeHost cudaFreeHost + + using gpuStream_t = cudaStream_t; + #define gpuStreamDestroy cudaStreamDestroy + #define gpuStreamCreateWithPriority cudaStreamCreateWithPriority + #define gpuDeviceGetStreamPriorityRange cudaDeviceGetStreamPriorityRange + #define gpuStreamCreate cudaStreamCreate + #define gpuStreamSynchronize cudaStreamSynchronize + #define gpuDeviceSynchronize cudaDeviceSynchronize + + using gpuEvent_t = cudaEvent_t; + #define gpuEventCreate cudaEventCreate + #define gpuEventRecord cudaEventRecord + #define gpuEventDestroy cudaEventDestroy + #define gpuStreamWaitEvent cudaStreamWaitEvent + #define gpuStreamDefault cudaStreamDefault + + #define gpuGetDeviceCount cudaGetDeviceCount + #define gpuSetDevice cudaSetDevice + #define gpuDeviceProp cudaDeviceProp + #define gpuGetDeviceProperties cudaGetDeviceProperties + + #define gpuLaunchKernel cudaLaunchKernel +#endif + + +#ifdef WALBERLA_BUILD_WITH_HIP + #include <hip/hip_runtime.h> + + + using gpuError_t = hipError_t; + #define gpuSuccess hipSuccess + #define gpuGetErrorName hipGetErrorName + #define gpuGetErrorString hipGetErrorString + #define gpuPeekAtLastError hipPeekAtLastError + + #define gpuMalloc hipMalloc + #define gpuMallocHost hipHostMalloc + #define gpuHostAllocDefault hipHostMallocDefault + // note: 'hipHostAlloc' is deprecated, use hipHostMalloc instead + #define gpuHostAlloc hipHostMalloc + #define gpuMemcpyHostToDevice hipMemcpyHostToDevice + #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost + #define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice + #define gpuMemcpy hipMemcpy + #define gpuMemcpyAsync hipMemcpyAsync + #define gpuMemcpy3D hipMemcpy3D + #define gpuMemcpy3DParms hipMemcpy3DParms + #define gpuMemcpy3DAsync hipMemcpy3DAsync + + #define make_gpuPitchedPtr make_hipPitchedPtr + #define make_gpuPos make_hipPos + using gpuPitchedPtr = hipPitchedPtr; + #define make_gpuExtent make_hipExtent + 
using gpuExtent = hipExtent; + + #define gpuFree hipFree + #define gpuFreeHost hipHostFree + + using gpuStream_t = hipStream_t; + #define gpuStreamDestroy hipStreamDestroy + #define gpuStreamCreateWithPriority hipStreamCreateWithPriority + #define gpuDeviceGetStreamPriorityRange hipDeviceGetStreamPriorityRange + #define gpuStreamCreate hipStreamCreate + #define gpuStreamSynchronize hipStreamSynchronize + #define gpuDeviceSynchronize hipDeviceSynchronize + + using gpuEvent_t = hipEvent_t; + #define gpuEventCreate hipEventCreate + #define gpuEventRecord hipEventRecord + #define gpuEventDestroy hipEventDestroy + #define gpuStreamWaitEvent hipStreamWaitEvent + #define gpuStreamDefault hipStreamDefault + + #define gpuGetDeviceCount hipGetDeviceCount + #define gpuSetDevice hipSetDevice + #define gpuDeviceProp hipDeviceProp + #define gpuGetDeviceProperties hipGetDeviceProperties + + #define gpuLaunchKernel hipLaunchKernel +#endif diff --git a/src/cuda/HostFieldAllocator.h b/src/gpu/HostFieldAllocator.h similarity index 72% rename from src/cuda/HostFieldAllocator.h rename to src/gpu/HostFieldAllocator.h index 7276c495db75933112a1af697c3181274fd7b450..98892aebadc7fd1c4b03ffc38f16bd26631e4123 100644 --- a/src/cuda/HostFieldAllocator.h +++ b/src/gpu/HostFieldAllocator.h @@ -14,9 +14,9 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file HostFieldAllocator.h -//! \ingroup cuda +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> -//! \brief Allocator that allocates a CPU! field using cudaHostAlloc +//! \brief Allocator that allocates a CPU! 
field using gpuHostAlloc // //====================================================================================================================== @@ -25,29 +25,28 @@ #include "ErrorChecking.h" #include "field/allocation/FieldAllocator.h" -#include <cuda_runtime.h> - namespace walberla { -namespace cuda { +namespace gpu +{ //******************************************************************************************************************* /*! - * Allocator that allocates a CPU! field using cudaHostAlloc without padding + * Allocator that allocates a CPU! field using gpuHostAlloc without padding * - * Uses cudaHostAlloc for the allocation - which allocates page-locked memory that is faster to transfer to the GPU + * Uses gpuHostAlloc for the allocation - which allocates page-locked memory that is faster to transfer to the GPU * This allocator should be used for CPU fields that are often transfered to GPU and back * - * \ingroup cuda + * \ingroup gpu * */ //******************************************************************************************************************* - template<typename T, unsigned int cudaHostAllocFlags = cudaHostAllocDefault> + template<typename T, unsigned int HostAllocFlags = gpuHostAllocDefault> class HostFieldAllocator : public field::FieldAllocator<T> { public: - virtual ~HostFieldAllocator() {} + virtual ~HostFieldAllocator() = default; virtual T * allocateMemory ( uint_t size0, uint_t size1, uint_t size2, uint_t size3, uint_t & allocSize1, uint_t & allocSize2, uint_t & allocSize3 ) @@ -56,26 +55,24 @@ namespace cuda { allocSize2=size2; allocSize3=size3; void * result; - WALBERLA_CUDA_CHECK( cudaHostAlloc( &result, size0*size1*size2*size3*sizeof(T), cudaHostAllocFlags ) ); + WALBERLA_GPU_CHECK( gpuHostAlloc( &result, size0*size1*size2*size3*sizeof(T), HostAllocFlags ) ) return (T*)(result); } virtual T * allocateMemory ( uint_t size ) { T* result; - cudaHostAlloc( &result, size*sizeof(T), cudaHostAllocFlags ); + gpuHostAlloc( &result, 
size*sizeof(T), HostAllocFlags ); return result; } - virtual void deallocate(T *& values) { - WALBERLA_CUDA_CHECK( cudaFreeHost( values ) ); - } + virtual void deallocate(T *& values) {WALBERLA_GPU_CHECK( gpuFreeHost( values ) )} }; -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/Kernel.h b/src/gpu/Kernel.h similarity index 84% rename from src/cuda/Kernel.h rename to src/gpu/Kernel.h index cb69aa4fb238200c1bd332b9f2fbc2702f19eb97..6fca210529a650545632f465f80da00928c25f46 100644 --- a/src/cuda/Kernel.h +++ b/src/gpu/Kernel.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file Kernel.h -//! \ingroup cuda +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== @@ -25,20 +25,20 @@ #include "core/debug/Debug.h" #include "core/FunctionTraits.h" +#include "gpu/GPUWrapper.h" #include "ErrorChecking.h" -#include <cuda_runtime.h> #include <type_traits> #include <vector> - namespace walberla { -namespace cuda { +namespace gpu +{ //******************************************************************************************************************* - /*! Wrapper class around a CUDA kernel, to call kernels also from code not compiled with nvcc + /*! Wrapper class around a GPU kernel, to call kernels also from code not compiled with the device compiler * * Example: * \code @@ -55,10 +55,8 @@ namespace cuda { * \endcode * * Why use this strange wrapper class instead of the nice kernel call syntax "<<<griddim, blockdim >>>" ?? 
- * - This syntax is nice but has to be compiled with nvcc, which does not (yet) understand C++11 - * - C++11 features are used all over the place in waLBerla code - * - all *.cu files and headers included in *.cu files have to be "C++11 free" - * - thus there should be as few code as possible in *.cu files + * - This syntax is nice but has to be compiled with the device compiler + * - The wrapper allows to compile the kernel call with the host compiler * * Drawbacks of this class compared to kernel call syntax: * Type checking of parameters can only be done at runtime (is done only in Debug mode!). @@ -75,17 +73,17 @@ namespace cuda { // this code is equivalent to: kernel_func<<< dim3( 3,3,3), dim3( 4,4,4) >> ( argument1, 20 ); * \endcode - * The parameter types of the kernel and the parameters added at the cuda::Kernel class do not match. + * The parameter types of the kernel and the parameters added at the gpu::Kernel class do not match. * This is only detected when the code is run and was compiled in DEBUG mode! 
* * * Advantages of this class compared to kernel call syntax: Integrates nicely with waLBerlas field indexing and * accessor concepts: * \code - void kernel_func( cuda::SimpleFieldAccessor<double> f ); + void kernel_func( gpu::SimpleFieldAccessor<double> f ); - auto myKernel = cuda::make_kernel( &kernel_double ); - myKernel.addFieldIndexingParam( cuda::SimpleFieldIndexing<double>::xyz( gpuField ) ); + auto myKernel = gpu::make_kernel( &kernel_double ); + myKernel.addFieldIndexingParam( gpu::SimpleFieldIndexing<double>::xyz( gpuField ) ); myKernel(); * \endcode * When using at least one FieldIndexingParameter configure() does not have to be called, since the thread and grid @@ -104,7 +102,7 @@ namespace cuda { void configure( dim3 gridDim, dim3 blockDim, std::size_t sharedMemSize = 0 ); - void operator() ( cudaStream_t stream = 0 ) const; + void operator() ( gpuStream_t stream = nullptr ) const; protected: @@ -113,10 +111,10 @@ namespace cuda { //@{ FuncPtr funcPtr_; - bool configured_; + bool configured_{ false }; dim3 gridDim_; dim3 blockDim_; - std::size_t sharedMemSize_; + std::size_t sharedMemSize_{ 0 }; std::vector< std::vector<char> > params_; //@} @@ -175,10 +173,7 @@ namespace cuda { template<typename FP> Kernel<FP>::Kernel( FP funcPtr ) - : funcPtr_ ( funcPtr ), - configured_( false ), - sharedMemSize_( 0 ) - {} + : funcPtr_ ( funcPtr ) {} template<typename FP> template<typename T> @@ -189,7 +184,7 @@ namespace cuda { std::memcpy ( paramInfo.data(), ¶m, sizeof(T) ); WALBERLA_ASSERT( checkParameter<T>( params_.size() ), - "cuda::Kernel type mismatch of parameter " << params_.size() ); + "gpu::Kernel type mismatch of parameter " << params_.size() ) params_.push_back( paramInfo ); } @@ -218,18 +213,18 @@ namespace cuda { if ( gridDim.x != gridDim_.x || gridDim.y != gridDim_.y || gridDim.z != gridDim_.z || blockDim.x != blockDim_.x || blockDim.y != blockDim_.y || blockDim.z != blockDim_.z ) { - WALBERLA_ABORT( "Error when configuring cuda::Kernel: 
Inconsistent setup. " ); + WALBERLA_ABORT( "Error when configuring gpu::Kernel: Inconsistent setup. " ) } } } template<typename FP> - void Kernel<FP>::operator() ( cudaStream_t stream ) const + void Kernel<FP>::operator() ( gpuStream_t stream ) const { // check for correct number of parameter calls if ( params_.size() != FunctionTraits<FuncType>::arity ) { - WALBERLA_ABORT( "Error when calling cuda::Kernel - Wrong number of arguments. " << - "Expected " << FunctionTraits<FuncType>::arity << ", received " << params_.size() ); + WALBERLA_ABORT( "Error when calling gpu::Kernel - Wrong number of arguments. " << + "Expected " << FunctionTraits<FuncType>::arity << ", received " << params_.size() ) } // register all parameters @@ -241,7 +236,7 @@ namespace cuda { // .. and launch the kernel static_assert( sizeof(void *) == sizeof(void (*)(void)), "object pointer and function pointer sizes must be equal" ); - WALBERLA_CUDA_CHECK( cudaLaunchKernel( (void*) funcPtr_, gridDim_, blockDim_, args.data(), sharedMemSize_, stream ) ); + WALBERLA_GPU_CHECK( gpuLaunchKernel( (void*) funcPtr_, gridDim_, blockDim_, args.data(), sharedMemSize_, stream ) ) } @@ -259,7 +254,7 @@ namespace cuda { case 6: return checkParameter6<T>(); case 7: return checkParameter7<T>(); default: - WALBERLA_ABORT("Too many parameters passed to kernel"); + WALBERLA_ABORT("Too many parameters passed to kernel") } return false; } @@ -267,5 +262,5 @@ namespace cuda { -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/NVTX.h b/src/gpu/NVTX.h similarity index 97% rename from src/cuda/NVTX.h rename to src/gpu/NVTX.h index a8c1210b827b89d28f5d1491a84adaa9f020432e..46302f917a0af13aba64b1b7802dc8bf4ec97e9f 100644 --- a/src/cuda/NVTX.h +++ b/src/gpu/NVTX.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file NVTX.h -//! \ingroup cuda +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== @@ -28,7 +28,8 @@ #include <nvToolsExtCudaRt.h> namespace walberla{ -namespace cuda { +namespace gpu +{ inline void nvtxMarker(const std::string& name, const uint32_t color=0xaaaaaa) { @@ -71,5 +72,5 @@ private: }; -} // namespace cuda +} // namespace gpu } // namespace walberla \ No newline at end of file diff --git a/src/cuda/ParallelStreams.cpp b/src/gpu/ParallelStreams.cpp similarity index 67% rename from src/cuda/ParallelStreams.cpp rename to src/gpu/ParallelStreams.cpp index d2fff04161673bbb720d359d89efd63a31d031b1..2dffc7f0aa431d2e06bf7a480ca964a1e733962f 100644 --- a/src/cuda/ParallelStreams.cpp +++ b/src/gpu/ParallelStreams.cpp @@ -14,47 +14,48 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file ParallelStreams.cpp -//! \ingroup cuda +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== -#include "cuda/ParallelStreams.h" +#include "gpu/ParallelStreams.h" namespace walberla { -namespace cuda { +namespace gpu +{ - ParallelSection::ParallelSection(ParallelStreams * parent, cudaStream_t mainStream) + ParallelSection::ParallelSection(ParallelStreams * parent, gpuStream_t mainStream) : parent_( parent ), mainStream_( mainStream ), counter_( 0 ) { - WALBERLA_CUDA_CHECK( cudaEventCreate(&startEvent_) ); - WALBERLA_CUDA_CHECK( cudaEventRecord( startEvent_, mainStream_ ) ); + WALBERLA_GPU_CHECK( gpuEventCreate(&startEvent_) ) + WALBERLA_GPU_CHECK( gpuEventRecord( startEvent_, mainStream_ ) ) } ParallelSection::~ParallelSection() { synchronize(); - WALBERLA_CUDA_CHECK( cudaEventDestroy(startEvent_) ); + WALBERLA_GPU_CHECK( gpuEventDestroy(startEvent_) ) } void ParallelSection::next() { if( counter_ > 0 ) { - 
WALBERLA_CUDA_CHECK( cudaEventRecord( parent_->events_[counter_ - 1], parent_->sideStreams_[counter_ - 1] ) ); + WALBERLA_GPU_CHECK( gpuEventRecord( parent_->events_[counter_ - 1], parent_->sideStreams_[counter_ - 1] ) ) } else { - WALBERLA_CUDA_CHECK( cudaEventRecord( parent_->mainEvent_, mainStream_ ) ); + WALBERLA_GPU_CHECK( gpuEventRecord( parent_->mainEvent_, mainStream_ ) ) } ++counter_; parent_->ensureSize( counter_ ); - WALBERLA_CUDA_CHECK( cudaStreamWaitEvent( stream(), startEvent_, 0 )); + WALBERLA_GPU_CHECK( gpuStreamWaitEvent( stream(), startEvent_, 0 )) } - void ParallelSection::run(const std::function<void( cudaStream_t)> & f) + void ParallelSection::run(const std::function<void( gpuStream_t)> & f) { f( stream() ); next(); @@ -69,14 +70,14 @@ namespace cuda { continue; auto & event = i == 0 ? parent_->mainEvent_ : parent_->events_[i - 1]; - cudaStream_t stream = j == 0 ? mainStream_ : parent_->sideStreams_[j - 1]; - WALBERLA_CUDA_CHECK( cudaStreamWaitEvent( stream, event, 0 )); + gpuStream_t stream = j == 0 ? mainStream_ : parent_->sideStreams_[j - 1]; + WALBERLA_GPU_CHECK( gpuStreamWaitEvent( stream, event, 0 )) } - WALBERLA_CUDA_CHECK( cudaEventRecord( startEvent_, mainStream_ ) ); + WALBERLA_GPU_CHECK( gpuEventRecord( startEvent_, mainStream_ ) ) } - cudaStream_t ParallelSection::stream() + gpuStream_t ParallelSection::stream() { return counter_ == 0 ? 
mainStream_ : parent_->sideStreams_[counter_ - 1]; } @@ -88,7 +89,7 @@ namespace cuda { { } - ParallelSection ParallelStreams::parallelSection( cudaStream_t stream ) { + ParallelSection ParallelStreams::parallelSection( gpuStream_t stream ) { return ParallelSection(this, stream); } @@ -96,7 +97,7 @@ namespace cuda { for( uint_t i = sideStreams_.size(); i < size; ++i ) { sideStreams_.emplace_back( StreamRAII::newPriorityStream(streamPriority_)); - events_.emplace_back( EventRAII() ); + events_.emplace_back( ); } } @@ -109,5 +110,5 @@ namespace cuda { -} // namespace cuda +} // namespace gpu } // namespace walberla \ No newline at end of file diff --git a/src/cuda/ParallelStreams.h b/src/gpu/ParallelStreams.h similarity index 77% rename from src/cuda/ParallelStreams.h rename to src/gpu/ParallelStreams.h index 4116e0ef971ccc4d08209e9f0a20fc2ced3878c9..fd83932766abfe6d0d177a2a09995a4d98f3ff77 100644 --- a/src/cuda/ParallelStreams.h +++ b/src/gpu/ParallelStreams.h @@ -14,18 +14,19 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file ParallelStreams.h -//! \ingroup cuda +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== #pragma once -#include "cuda/ErrorChecking.h" -#include "cuda/CudaRAII.h" +#include "gpu/ErrorChecking.h" +#include "gpu/CudaRAII.h" #include <vector> namespace walberla { -namespace cuda { +namespace gpu +{ class ParallelStreams; @@ -33,32 +34,32 @@ namespace cuda { { public: ~ParallelSection(); - void run( const std::function<void( cudaStream_t )> &f ); + void run( const std::function<void( gpuStream_t )> &f ); - cudaStream_t stream(); + gpuStream_t stream(); void next(); private: friend class ParallelStreams; - ParallelSection( ParallelStreams *parent, cudaStream_t mainStream ); + ParallelSection( ParallelStreams *parent, gpuStream_t mainStream ); void synchronize(); ParallelStreams * parent_; - cudaStream_t mainStream_; - cudaEvent_t startEvent_; + gpuStream_t mainStream_; + gpuEvent_t startEvent_; uint_t counter_; }; //******************************************************************************************************************* /*! - * Helper class to run CUDA operations on parallel streams + * Helper class to run CUDA/HIP operations on parallel streams * * This class introduces "side streams" that overlap with one "main stream". In a parallel section, multiple - * kernels (or other CUDA operations) are scheduled to the streams. The first "run" is scheduled on the main stream + * kernels (or other CUDA/HIP operations) are scheduled to the streams. The first "run" is scheduled on the main stream * all subsequent operations on the side streams. The passed priority affects only the side streams. When - * the parallel section goes out of scope the side streams are synchronized to the main stream via CUDA events. + * the parallel section goes out of scope the side streams are synchronized to the main stream via CUDA/HIP events. 
* * Example: * @@ -66,8 +67,8 @@ namespace cuda { * ParallelStreams streams; * { * // new scope for the parallel section - * ParallelSection sec = streams.parallelSection( mainCudaStream ); - * sec.run([&] ( cudaStream_t sideStream ) { + * ParallelSection sec = streams.parallelSection( mainGPUStream ); + * sec.run([&] ( gpuStream_t sideStream ) { * // run something on the side stream * }); * // after the parallel section goes out of scope the side streams are synchronized to the main stream @@ -81,7 +82,7 @@ namespace cuda { { public: ParallelStreams( int priority = 0 ); - ParallelSection parallelSection( cudaStream_t stream ); + ParallelSection parallelSection( gpuStream_t stream ); void setStreamPriority( int priority ); private: @@ -96,5 +97,5 @@ namespace cuda { }; -} // namespace cuda +} // namespace gpu } // namespace walberla \ No newline at end of file diff --git a/src/cuda/communication/CMakeLists.txt b/src/gpu/communication/CMakeLists.txt similarity index 91% rename from src/cuda/communication/CMakeLists.txt rename to src/gpu/communication/CMakeLists.txt index b1fe9c3492eb1e60040469cb1ada41559c1121dd..98bbff2016d5b9d3dd6c26de2774888e1e0cc257 100644 --- a/src/cuda/communication/CMakeLists.txt +++ b/src/gpu/communication/CMakeLists.txt @@ -1,4 +1,4 @@ -target_sources( cuda +target_sources( gpu PRIVATE MemcpyPackInfo.h UniformGPUScheme.impl.h diff --git a/src/cuda/communication/CustomMemoryBuffer.h b/src/gpu/communication/CustomMemoryBuffer.h similarity index 91% rename from src/cuda/communication/CustomMemoryBuffer.h rename to src/gpu/communication/CustomMemoryBuffer.h index 2caab2a41b13f5fb88c6e3052312e742778db408..26a6743f3ef4aacb436a5e08ceff69ed68241ef0 100644 --- a/src/cuda/communication/CustomMemoryBuffer.h +++ b/src/gpu/communication/CustomMemoryBuffer.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file CustomMemoryBuffer.h -//! \ingroup cuda +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> //! \brief Basic Buffer supporting different memory spaces // @@ -22,14 +22,15 @@ #pragma once -#include "cuda/ErrorChecking.h" - #include <algorithm> #include <cstring> +#include "gpu/ErrorChecking.h" +#include "gpu/GPUWrapper.h" namespace walberla { -namespace cuda { +namespace gpu +{ namespace communication { @@ -40,7 +41,7 @@ namespace communication { /*! * Simple buffer class that supports memory allocators, e.g. for pinned host memory or GPU memory * - * \ingroup cuda + * \ingroup gpu * * In contrast to core::mpi::Buffer this class does not support stream operators "<<" and ">>" because these * operators imply serial (un)packing which is not feasible on the GPU. @@ -100,13 +101,13 @@ namespace communication { static void *allocate( size_t size ) { void *p; - WALBERLA_CUDA_CHECK( cudaMallocHost( &p, size )) + WALBERLA_GPU_CHECK( gpuMallocHost( &p, size )) return p; } static void deallocate( void *ptr ) { - WALBERLA_CUDA_CHECK( cudaFreeHost( ptr )) + WALBERLA_GPU_CHECK( gpuFreeHost( ptr )) } static void memcpy( void *dst, void *src, size_t count ) @@ -120,24 +121,24 @@ namespace communication { static void *allocate( size_t size ) { void *p; - WALBERLA_CUDA_CHECK( cudaMalloc( &p, size )) + WALBERLA_GPU_CHECK( gpuMalloc( &p, size )) return p; } static void deallocate( void *ptr ) { - WALBERLA_CUDA_CHECK( cudaFree( ptr )) + WALBERLA_GPU_CHECK( gpuFree( ptr )) } static void memcpy( void *dst, void *src, size_t count ) { - cudaMemcpy( dst, src, count, cudaMemcpyDeviceToDevice ); + WALBERLA_GPU_CHECK( gpuMemcpy( dst, src, count, gpuMemcpyDeviceToDevice ) ) } }; } // namespace communication -} // namespace cuda +} // namespace gpu } // namespace walberla #include "CustomMemoryBuffer.impl.h" diff --git a/src/cuda/communication/CustomMemoryBuffer.impl.h b/src/gpu/communication/CustomMemoryBuffer.impl.h similarity index 98% rename from src/cuda/communication/CustomMemoryBuffer.impl.h rename to 
src/gpu/communication/CustomMemoryBuffer.impl.h index 21d70e4ccceac05de50c8ceea67e09b780e9fa38..ea354be200fe307b50b9396e889b74c2b17b8819 100644 --- a/src/cuda/communication/CustomMemoryBuffer.impl.h +++ b/src/gpu/communication/CustomMemoryBuffer.impl.h @@ -14,14 +14,15 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file CustomMemoryBuffer.impl.h -//! \ingroup cuda +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== namespace walberla { -namespace cuda { +namespace gpu +{ namespace communication { @@ -118,5 +119,5 @@ namespace communication { } } // namespace communication -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/communication/GPUPackInfo.h b/src/gpu/communication/GPUPackInfo.h similarity index 90% rename from src/cuda/communication/GPUPackInfo.h rename to src/gpu/communication/GPUPackInfo.h index 661029b40dc1f39e55a20c7a708d19bc89da4cfa..3922690445c7f11fb8375596a9f9f740c076727a 100644 --- a/src/cuda/communication/GPUPackInfo.h +++ b/src/gpu/communication/GPUPackInfo.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file GPUPackInfo.h -//! \ingroup cuda +//! \ingroup gpu //! \author Paulo Carvalho <prcjunior@inf.ufpr.br> //! 
\author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br> //====================================================================================================================== @@ -22,32 +22,36 @@ #pragma once #include "blockforest/Block.h" + #include "communication/UniformPackInfo.h" + #include "core/debug/Debug.h" #include "core/math/Vector3.h" #include "core/mpi/BufferSizeTrait.h" + #include "field/GhostRegions.h" #include "field/Layout.h" -#include "stencil/Directions.h" -#include "cuda/ErrorChecking.h" -#include "cuda/GPUCopy.h" -#include "cuda/communication/CustomMemoryBuffer.h" +#include "stencil/Directions.h" -#include <cuda_runtime.h> #include <map> -#include <vector> #include <tuple> +#include <vector> +#include "gpu/ErrorChecking.h" +#include "gpu/GPUCopy.h" +#include "gpu/GPUWrapper.h" +#include "gpu/communication/CustomMemoryBuffer.h" namespace walberla { -namespace cuda { +namespace gpu +{ namespace communication { /** - * Data packing/unpacking for ghost layer based communication of a cuda::GPUField - * \ingroup cuda + * Data packing/unpacking for ghost layer based communication of a gpu::GPUField + * \ingroup gpu * Template Parameters: * - GPUField_T A fully qualified GPUField. 
*/ @@ -78,7 +82,7 @@ public: void communicateLocal(const IBlock * sender, IBlock * receiver, stencil::Direction dir); - void setCommunicationStream( cudaStream_t stream ) + void setCommunicationStream( gpuStream_t stream ) { if ( stream != 0 ) { @@ -96,7 +100,7 @@ protected: bool communicateAllGhostLayers_; uint_t numberOfGhostLayers_; bool copyAsync_; - cudaStream_t communicationStream_; + gpuStream_t communicationStream_; std::map< stencil::Direction, PinnedMemoryBuffer > pinnedRecvBuffers_; mutable std::map< stencil::Direction, PinnedMemoryBuffer > pinnedSendBuffers_; }; @@ -106,7 +110,7 @@ template<typename GPUField_T> void GPUPackInfo<GPUField_T>::unpackData(IBlock * receiver, stencil::Direction dir, mpi::RecvBuffer & buffer) { GPUField_T * fieldPtr = receiver->getData< GPUField_T >( bdId_ ); - WALBERLA_ASSERT_NOT_NULLPTR(fieldPtr); + WALBERLA_ASSERT_NOT_NULLPTR(fieldPtr) cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( fieldPtr ) ); @@ -126,7 +130,7 @@ void GPUPackInfo<GPUField_T>::unpackData(IBlock * receiver, stencil::Direction d std::copy( bufPtr, static_cast< unsigned char * >( bufPtr + nrOfBytesToRead ), copyBufferPtr ); } - cudaStream_t & unpackStream = communicationStream_; + gpuStream_t & unpackStream = communicationStream_; auto dstOffset = std::make_tuple( uint_c(fieldCi.xMin() + nrOfGhostLayers), uint_c(fieldCi.yMin() + nrOfGhostLayers), @@ -156,7 +160,7 @@ void GPUPackInfo<GPUField_T>::unpackData(IBlock * receiver, stencil::Direction d if ( copyAsync_ ) { - WALBERLA_CUDA_CHECK( cudaStreamSynchronize( unpackStream ) ); + WALBERLA_GPU_CHECK( gpuStreamSynchronize( unpackStream ) ); } } @@ -167,13 +171,13 @@ void GPUPackInfo<GPUField_T>::communicateLocal(const IBlock * sender, IBlock * r const GPUField_T * sf = sender ->getData< GPUField_T >( bdId_ ); GPUField_T * rf = receiver->getData< GPUField_T >( bdId_ ); - WALBERLA_ASSERT_NOT_NULLPTR( sf ); - WALBERLA_ASSERT_NOT_NULLPTR( rf ); + WALBERLA_ASSERT_NOT_NULLPTR( sf ) + 
WALBERLA_ASSERT_NOT_NULLPTR( rf ) - WALBERLA_ASSERT_EQUAL(sf->xSize(), rf->xSize()); - WALBERLA_ASSERT_EQUAL(sf->ySize(), rf->ySize()); - WALBERLA_ASSERT_EQUAL(sf->zSize(), rf->zSize()); - WALBERLA_ASSERT_EQUAL(sf->fSize(), rf->fSize()); + WALBERLA_ASSERT_EQUAL(sf->xSize(), rf->xSize()) + WALBERLA_ASSERT_EQUAL(sf->ySize(), rf->ySize()) + WALBERLA_ASSERT_EQUAL(sf->zSize(), rf->zSize()) + WALBERLA_ASSERT_EQUAL(sf->fSize(), rf->fSize()) WALBERLA_CHECK( sf->layout() == rf->layout(), "GPUPackInfo::communicateLocal: fields must have the same layout!" ); @@ -182,7 +186,7 @@ void GPUPackInfo<GPUField_T>::communicateLocal(const IBlock * sender, IBlock * r CellInterval sCi = field::getSliceBeforeGhostLayer( *sf, dir, nrOfGhostLayers, false ); CellInterval rCi = field::getGhostRegion( *rf, stencil::inverseDir[dir], nrOfGhostLayers, false ); - cudaStream_t & commStream = communicationStream_; + gpuStream_t & commStream = communicationStream_; auto dstOffset = std::make_tuple( uint_c(rCi.xMin() + nrOfGhostLayers), uint_c(rCi.yMin() + nrOfGhostLayers), @@ -217,7 +221,7 @@ void GPUPackInfo<GPUField_T>::communicateLocal(const IBlock * sender, IBlock * r if ( copyAsync_ ) { - WALBERLA_CUDA_CHECK( cudaStreamSynchronize( commStream ) ); + WALBERLA_GPU_CHECK( gpuStreamSynchronize( commStream ) ) } } @@ -226,7 +230,7 @@ template<typename GPUField_T> void GPUPackInfo<GPUField_T>::packDataImpl(const IBlock * sender, stencil::Direction dir, mpi::SendBuffer & outBuffer) const { const GPUField_T * fieldPtr = sender->getData< GPUField_T >( bdId_ ); - WALBERLA_ASSERT_NOT_NULLPTR(fieldPtr); + WALBERLA_ASSERT_NOT_NULLPTR(fieldPtr) cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( fieldPtr ) ); @@ -236,7 +240,7 @@ void GPUPackInfo<GPUField_T>::packDataImpl(const IBlock * sender, stencil::Direc unsigned char * outBufferPtr = outBuffer.forward( nrOfBytesToPack ); - const cudaStream_t & packStream = communicationStream_; + const gpuStream_t & packStream = 
communicationStream_; unsigned char * copyBufferPtr = outBufferPtr; if ( copyAsync_ ) @@ -274,7 +278,7 @@ void GPUPackInfo<GPUField_T>::packDataImpl(const IBlock * sender, stencil::Direc if ( copyAsync_ ) { - WALBERLA_CUDA_CHECK( cudaStreamSynchronize( packStream ) ); + WALBERLA_GPU_CHECK( gpuStreamSynchronize( packStream ) ) std::copy( copyBufferPtr, static_cast<unsigned char *>( copyBufferPtr + nrOfBytesToPack ), outBufferPtr ); } @@ -290,7 +294,7 @@ uint_t GPUPackInfo<GPUField_T>::numberOfGhostLayersToCommunicate( const GPUField } else { - WALBERLA_ASSERT_LESS_EQUAL( numberOfGhostLayers_, field->nrOfGhostLayers() ); + WALBERLA_ASSERT_LESS_EQUAL( numberOfGhostLayers_, field->nrOfGhostLayers() ) return numberOfGhostLayers_; } } @@ -298,5 +302,5 @@ uint_t GPUPackInfo<GPUField_T>::numberOfGhostLayersToCommunicate( const GPUField } // namespace communication -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/communication/GeneratedGPUPackInfo.h b/src/gpu/communication/GeneratedGPUPackInfo.h similarity index 88% rename from src/cuda/communication/GeneratedGPUPackInfo.h rename to src/gpu/communication/GeneratedGPUPackInfo.h index 752f2907c734cbb1a18d73b390424d33c94f3aa8..9ca4afb9c575446ea8734d0cbf302819b5160b05 100644 --- a/src/cuda/communication/GeneratedGPUPackInfo.h +++ b/src/gpu/communication/GeneratedGPUPackInfo.h @@ -14,31 +14,33 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file GeneratedGPUPackInfo.h -//! \ingroup core +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== #pragma once -#include "stencil/Directions.h" #include "domain_decomposition/IBlock.h" -#include <cuda_runtime.h> +#include "stencil/Directions.h" + +#include "gpu/GPUWrapper.h" namespace walberla { -namespace cuda { +namespace gpu +{ class GeneratedGPUPackInfo { public: - virtual void pack ( stencil::Direction dir, unsigned char *buffer, IBlock *block, cudaStream_t stream ) = 0; - virtual void unpack( stencil::Direction dir, unsigned char *buffer, IBlock *block, cudaStream_t stream ) = 0; + virtual void pack ( stencil::Direction dir, unsigned char *buffer, IBlock *block, gpuStream_t stream ) = 0; + virtual void unpack( stencil::Direction dir, unsigned char *buffer, IBlock *block, gpuStream_t stream ) = 0; virtual uint_t size( stencil::Direction dir, IBlock *block ) = 0; }; -} //namespace cuda +} //namespace gpu } //namespace walberla \ No newline at end of file diff --git a/src/cuda/communication/MemcpyPackInfo.h b/src/gpu/communication/MemcpyPackInfo.h similarity index 62% rename from src/cuda/communication/MemcpyPackInfo.h rename to src/gpu/communication/MemcpyPackInfo.h index 20637b51a36385b21e49cd257a4a41bdaf1ea0a8..c5e58d2a395d71337409f9e8da4b5e039a79dd6c 100644 --- a/src/cuda/communication/MemcpyPackInfo.h +++ b/src/gpu/communication/MemcpyPackInfo.h @@ -1,40 +1,43 @@ #pragma once -#include "stencil/Directions.h" -#include "core/cell/CellInterval.h" -#include "cuda/GPUField.h" #include "core/DataTypes.h" +#include "core/cell/CellInterval.h" + #include "domain_decomposition/IBlock.h" -#include "cuda/communication/GeneratedGPUPackInfo.h" +#include "stencil/Directions.h" + +#include "gpu/GPUField.h" +#include "gpu/GPUWrapper.h" +#include "gpu/communication/GeneratedGPUPackInfo.h" namespace walberla { -namespace cuda { +namespace gpu +{ namespace communication { template<typename GPUFieldType> 
-class MemcpyPackInfo : public ::walberla::cuda::GeneratedGPUPackInfo +class MemcpyPackInfo : public ::walberla::gpu::GeneratedGPUPackInfo { public: MemcpyPackInfo( BlockDataID pdfsID_ ) - : pdfsID(pdfsID_), numberOfGhostLayers_(0), communicateAllGhostLayers_(true) - {}; + : pdfsID(pdfsID_) {}; virtual ~MemcpyPackInfo() = default; - void pack (stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream) override; - void unpack(stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream) override; + void pack (stencil::Direction dir, unsigned char * buffer, IBlock * block, gpuStream_t stream) override; + void unpack(stencil::Direction dir, unsigned char * buffer, IBlock * block, gpuStream_t stream) override; uint_t size(stencil::Direction dir, IBlock * block) override; private: BlockDataID pdfsID; - uint_t numberOfGhostLayers_; - bool communicateAllGhostLayers_; + uint_t numberOfGhostLayers_{0}; + bool communicateAllGhostLayers_{true}; uint_t numberOfGhostLayersToCommunicate( const GPUFieldType * const field ) const; }; } // namespace communication -} // namespace cuda +} // namespace gpu } // namespace walberla #include "MemcpyPackInfo.impl.h" diff --git a/src/cuda/communication/MemcpyPackInfo.impl.h b/src/gpu/communication/MemcpyPackInfo.impl.h similarity index 93% rename from src/cuda/communication/MemcpyPackInfo.impl.h rename to src/gpu/communication/MemcpyPackInfo.impl.h index b75587c5bcdcef1ce38e06f58db53339095ce7f8..486871d4e0e7563e8b890b91bfc5aa814775d74a 100644 --- a/src/cuda/communication/MemcpyPackInfo.impl.h +++ b/src/gpu/communication/MemcpyPackInfo.impl.h @@ -3,20 +3,20 @@ #include "field/Layout.h" #include "stencil/Directions.h" #include "core/cell/CellInterval.h" -#include "cuda/GPUField.h" -#include "cuda/GPUCopy.h" +#include "gpu/GPUField.h" +#include "gpu/GPUCopy.h" #include "core/DataTypes.h" #include "MemcpyPackInfo.h" -#include <cuda_runtime.h> namespace walberla { -namespace cuda { +namespace gpu 
+{ namespace communication { template<typename GPUFieldType> void MemcpyPackInfo< GPUFieldType >::pack(stencil::Direction dir, unsigned char * byte_buffer, - IBlock * block, cudaStream_t stream) + IBlock * block, gpuStream_t stream) { // Extract field data pointer from the block const GPUFieldType * fieldPtr = block->getData< GPUFieldType >( pdfsID ); @@ -41,7 +41,7 @@ void MemcpyPackInfo< GPUFieldType >::pack(stencil::Direction dir, unsigned char const uint_t dstAllocSizeZ = fieldCi.zSize(); const uint_t srcAllocSizeZ = fieldPtr->zAllocSize(); - cudaPitchedPtr byteBufferPitchedPtr = make_cudaPitchedPtr( byte_buffer, + gpuPitchedPtr byteBufferPitchedPtr = make_gpuPitchedPtr( byte_buffer, fieldCi.xSize() * sizeof(typename GPUFieldType::value_type), fieldCi.xSize() * sizeof(typename GPUFieldType::value_type), fieldCi.ySize() ); @@ -55,7 +55,7 @@ void MemcpyPackInfo< GPUFieldType >::pack(stencil::Direction dir, unsigned char const uint_t dstAllocSizeZ = fieldCi.ySize(); const uint_t srcAllocSizeZ = fieldPtr->yAllocSize(); - cudaPitchedPtr byteBufferPitchedPtr = make_cudaPitchedPtr( byte_buffer, + gpuPitchedPtr byteBufferPitchedPtr = make_gpuPitchedPtr( byte_buffer, fieldPtr->fSize() * sizeof(typename GPUFieldType::value_type), fieldPtr->fSize() * sizeof(typename GPUFieldType::value_type), fieldCi.xSize() ); @@ -67,7 +67,7 @@ void MemcpyPackInfo< GPUFieldType >::pack(stencil::Direction dir, unsigned char template<typename GPUFieldType> void MemcpyPackInfo< GPUFieldType >::unpack(stencil::Direction dir, unsigned char * byte_buffer, - IBlock * block, cudaStream_t stream) + IBlock * block, gpuStream_t stream) { GPUFieldType * fieldPtr = block->getData< GPUFieldType >( pdfsID ); WALBERLA_ASSERT_NOT_NULLPTR(fieldPtr) @@ -89,7 +89,7 @@ void MemcpyPackInfo< GPUFieldType >::unpack(stencil::Direction dir, unsigned cha const uint_t dstAllocSizeZ = fieldPtr->zAllocSize(); const uint_t srcAllocSizeZ = fieldCi.zSize(); - cudaPitchedPtr byteBufferPitchedPtr = make_cudaPitchedPtr( 
byte_buffer, + gpuPitchedPtr byteBufferPitchedPtr = make_gpuPitchedPtr( byte_buffer, fieldCi.xSize() * sizeof(typename GPUFieldType::value_type), fieldCi.xSize() * sizeof(typename GPUFieldType::value_type), fieldCi.ySize() ); @@ -102,7 +102,7 @@ void MemcpyPackInfo< GPUFieldType >::unpack(stencil::Direction dir, unsigned cha { const uint_t dstAllocSizeY = fieldPtr->yAllocSize(); const uint_t srcAllocSizeY = fieldCi.ySize(); - cudaPitchedPtr byteBufferPitchedPtr = make_cudaPitchedPtr( byte_buffer, + gpuPitchedPtr byteBufferPitchedPtr = make_gpuPitchedPtr( byte_buffer, fieldPtr->fSize() * sizeof(typename GPUFieldType::value_type), fieldPtr->fSize() * sizeof(typename GPUFieldType::value_type), fieldCi.xSize() ); @@ -227,5 +227,5 @@ uint_t MemcpyPackInfo< GPUFieldType >::numberOfGhostLayersToCommunicate( const G } } // namespace communication -} // namespace cuda +} // namespace gpu } // namespace walberla \ No newline at end of file diff --git a/src/cuda/communication/UniformGPUScheme.h b/src/gpu/communication/UniformGPUScheme.h similarity index 77% rename from src/cuda/communication/UniformGPUScheme.h rename to src/gpu/communication/UniformGPUScheme.h index 173cfcc4c44166f7ff05e8963fbe7135123aba40..e53e6772b4ccd5a3e04da5a632cbafd8ceb206d3 100644 --- a/src/cuda/communication/UniformGPUScheme.h +++ b/src/gpu/communication/UniformGPUScheme.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file UniformGPUScheme.h -//! \ingroup cuda +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== @@ -22,20 +22,25 @@ #pragma once #include "blockforest/StructuredBlockForest.h" -#include "core/mpi/MPIWrapper.h" + #include "core/mpi/BufferSystem.h" +#include "core/mpi/MPIWrapper.h" + #include "domain_decomposition/IBlock.h" -#include "stencil/Directions.h" -#include "cuda/CudaRAII.h" -#include "cuda/communication/GeneratedGPUPackInfo.h" -#include "cuda/communication/CustomMemoryBuffer.h" -#include "cuda/ParallelStreams.h" +#include "stencil/Directions.h" #include <thread> +#include "gpu/CudaRAII.h" +#include "gpu/GPUWrapper.h" +#include "gpu/ParallelStreams.h" +#include "gpu/communication/CustomMemoryBuffer.h" +#include "gpu/communication/GeneratedGPUPackInfo.h" + namespace walberla { -namespace cuda { +namespace gpu +{ namespace communication { @@ -56,11 +61,16 @@ namespace communication { void addPackInfo( const shared_ptr<GeneratedGPUPackInfo> &pi ); - void startCommunication( cudaStream_t stream = nullptr); - void wait( cudaStream_t stream = nullptr); + void startCommunication( gpuStream_t stream = nullptr); + void wait( gpuStream_t stream = nullptr); + + void operator()( gpuStream_t stream = nullptr ) { communicate( stream ); } + inline void communicate( gpuStream_t stream = nullptr ) { startCommunication(stream); wait(stream); } + + std::function<void()> getCommunicateFunctor( gpuStream_t stream = nullptr ); + std::function<void()> getStartCommunicateFunctor( gpuStream_t stream = nullptr ); + std::function<void()> getWaitFunctor( gpuStream_t stream = nullptr ); - void operator()( cudaStream_t stream = nullptr ) { communicate( stream ); } - inline void communicate( cudaStream_t stream = nullptr ) { startCommunication(stream); wait(stream); } private: void setupCommunication(); @@ -72,8 +82,8 @@ namespace communication { bool communicationInProgress_; bool sendFromGPU_; - using CpuBuffer_T = 
cuda::communication::PinnedMemoryBuffer; - using GpuBuffer_T = cuda::communication::GPUMemoryBuffer; + using CpuBuffer_T = gpu::communication::PinnedMemoryBuffer; + using GpuBuffer_T = gpu::communication::GPUMemoryBuffer; mpi::GenericBufferSystem<CpuBuffer_T, CpuBuffer_T> bufferSystemCPU_; mpi::GenericBufferSystem<GpuBuffer_T, GpuBuffer_T> bufferSystemGPU_; @@ -95,7 +105,7 @@ namespace communication { } // namespace communication -} // namespace cuda +} // namespace gpu } // namespace walberla #include "UniformGPUScheme.impl.h" diff --git a/src/cuda/communication/UniformGPUScheme.impl.h b/src/gpu/communication/UniformGPUScheme.impl.h similarity index 90% rename from src/cuda/communication/UniformGPUScheme.impl.h rename to src/gpu/communication/UniformGPUScheme.impl.h index 089f03e78ec30b5ff7d2ca451ba0f82e41bcc0c8..c8e81cb23e14e15e7fd79b5ec7fb052137b600a9 100644 --- a/src/cuda/communication/UniformGPUScheme.impl.h +++ b/src/gpu/communication/UniformGPUScheme.impl.h @@ -14,15 +14,16 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file UniformGPUScheme.impl.h -//! \ingroup cuda +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== -#include "cuda/ParallelStreams.h" +#include "gpu/ParallelStreams.h" namespace walberla { -namespace cuda { +namespace gpu +{ namespace communication { @@ -60,7 +61,7 @@ namespace communication { template<typename Stencil> - void UniformGPUScheme<Stencil>::startCommunication( cudaStream_t stream ) + void UniformGPUScheme<Stencil>::startCommunication( gpuStream_t stream ) { WALBERLA_ASSERT( !communicationInProgress_ ) auto forest = blockForest_.lock(); @@ -112,7 +113,7 @@ namespace communication { { auto cpuDataPtr = bufferSystemCPU_.sendBuffer( nProcess ).advanceNoResize( size ); WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr ) - WALBERLA_CUDA_CHECK( cudaMemcpyAsync( cpuDataPtr, gpuDataPtr, size, cudaMemcpyDeviceToHost, s )) + WALBERLA_GPU_CHECK( gpuMemcpyAsync( cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, s )) } }); } @@ -121,7 +122,7 @@ namespace communication { } // wait for packing to finish - cudaStreamSynchronize( stream ); + WALBERLA_GPU_CHECK( gpuStreamSynchronize( stream ) ); if( sendFromGPU_ ) bufferSystemGPU_.sendAll(); @@ -133,7 +134,7 @@ namespace communication { template<typename Stencil> - void UniformGPUScheme<Stencil>::wait( cudaStream_t stream ) + void UniformGPUScheme<Stencil>::wait( gpuStream_t stream ) { WALBERLA_ASSERT( communicationInProgress_ ) @@ -182,8 +183,8 @@ namespace communication { WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr ) parallelSection.run([&](auto s) { - WALBERLA_CUDA_CHECK( cudaMemcpyAsync( gpuDataPtr, cpuDataPtr, size, - cudaMemcpyHostToDevice, s )) + WALBERLA_GPU_CHECK( gpuMemcpyAsync( gpuDataPtr, cpuDataPtr, size, + gpuMemcpyHostToDevice, s )) pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s ); }); } @@ -273,7 +274,24 @@ namespace communication { setupBeforeNextCommunication_ = true; } + template< typename Stencil > + std::function<void()> 
UniformGPUScheme<Stencil>::getCommunicateFunctor(gpuStream_t stream) + { + return [this, stream]() { communicate( stream ); }; + } + + template< typename Stencil > + std::function<void()> UniformGPUScheme<Stencil>::getStartCommunicateFunctor(gpuStream_t stream) + { + return [this, stream]() { startCommunication( stream ); }; + } + + template< typename Stencil > + std::function<void()> UniformGPUScheme<Stencil>::getWaitFunctor(gpuStream_t stream) + { + return [this, stream]() { wait( stream ); }; + } } // namespace communication -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/doc/drawing.svg b/src/gpu/doc/drawing.svg similarity index 99% rename from src/cuda/doc/drawing.svg rename to src/gpu/doc/drawing.svg index 4e356d3f301c16035e3c87dbbb7674d6af2459e6..b931580f55ad6368062073e681c708f95fe9bab4 100644 --- a/src/cuda/doc/drawing.svg +++ b/src/gpu/doc/drawing.svg @@ -135,7 +135,7 @@ id="tspan3761-6" x="50" y="222.36218" - style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Monospace;-inkscape-font-specification:Monospace">cuda::GPUField</tspan></text> + style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Monospace;-inkscape-font-specification:Monospace">gpu::GPUField</tspan></text> <rect style="fill:#c7ffea;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.71999997px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" id="rect3757-0" diff --git a/src/cuda/doc/fieldAccess.png b/src/gpu/doc/fieldAccess.png similarity index 100% rename from src/cuda/doc/fieldAccess.png rename to src/gpu/doc/fieldAccess.png diff --git a/src/gpu/doc/gpu.dox b/src/gpu/doc/gpu.dox new file mode 100644 index 0000000000000000000000000000000000000000..83f8e78dc9b5dd448f22027f94c68ab798458f01 --- /dev/null +++ b/src/gpu/doc/gpu.dox @@ -0,0 +1,108 @@ + +namespace walberla{ +/*! 
+ +\page gpuPage Introduction to GPU Programming with waLBerla + +WaLBerla is a high-performance computing framework that supports GPU computing using either CUDA or HIP. +In this tutorial, we will provide an overview of the GPU concepts in WaLBerla and show you how to create GPU fields +and write GPU kernels using the provided indexing strategies. + +\section gpuBasicWrapper Basics + +waLBerla supports GPUs through a simple wrapper around both CUDA and HIP libraries. +This allows users to write GPU-accelerated code that can run on both NVIDIA and AMD GPUs. In the following we will explain +the concept with a simple example on how to allocate memory on GPUs. To create and manage GPU memory in waLBerla, +the gpuMalloc function should always be used; it is defined depending on the build system used to compile waLBerla. +Specifically, if waLBerla was built with CUDA, `gpuMalloc` is defined as `cudaMalloc`, while if it was built with HIP, +`gpuMalloc` is defined as `hipMalloc`. This allows users to write GPU-accelerated code that can run on both NVIDIA and AMD GPUs. +Here's an example of how to create a GPU array of 100 floats and set its values to zero using waLBerla: + +\code +#include "gpu/GPUWrapper.h" + +int main() +{ + float* d_array; + gpuMalloc((void**)&d_array, 100 * sizeof(float)); + gpuMemset(d_array, 0, 100 * sizeof(float)); + // ... + return 0; +} +\endcode + +In conclusion, waLBerla provides a simple wrapper around both CUDA and HIP libraries to allow users to write +GPU-accelerated code that can run on both NVIDIA and AMD GPUs. This wrapper is used through the entire backend of waLBerla +and thus for all higher level functionality. As a user most of the time the higher level functionality will be used +and the wrapper is more important for developers. As a next step, an introduction to some of the higher level functionality follows. 
+ +\section gpuField Creating and Copying GPU Fields + +To create a GPU field in WaLBerla, you can use the gpu::GPUField class, which is similar to the field::GhostLayerField class used for CPU fields. +You can copy data between the host and device using the gpu::fieldCpy function, as shown in the following example: + + +\subsection gpuFieldOverview Creating GPU fields and copying them between host and device + + \code + GhostLayerField<double,4> h_f(16, 20, 30, 1, 42.0, field::fzyx); + gpu::GPUField<double> d_f(16, 20, 30, 4, 1, field::fzyx); + gpu::fieldCpy(d_f, h_f); // copy from host to device + some_kernel_wrapper(d_f); // run some kernel + gpu::fieldCpy(h_f, d_f); // copy field data back to host + + \endcode + +Note that gpu::GPUField has a template parameter for the number of fields (or channels), whereas field::GhostLayerField +has a template parameter for the size of each field. Also, GPU fields can be accessed using gpu::FieldAccessor objects, which we will discuss next. + +\section gpuKernels Writing and Executing GPU Kernels + +\subsection gpuFieldAccess Writing GPU Kernels with Indexing Strategies + + \image html gpu/doc/fieldAccess.png "Accessing fields in CUDA kernels" + + When writing a kernel that operates on a field, the first task is to distribute the data to threads and blocks. + We need a function $(blockIdx, threadIdx) \\rightarrow (x,y,z)$ or $(blockIdx, threadIdx) \\rightarrow (x,y,z,f)$. + The optimal mapping depends on many parameters: for example which layout the field has, the extents of each coordinate, + hardware parameters like warp-size, etc. + Thus this indexing function is abstracted. A few indexing strategies are already implemented which can be + substituted by custom strategies. + An indexing strategy consists of two classes: a somewhat complex Indexing class, which manages the + indexing on the host-side and a lightweight Accessor class, which is passed to the GPU kernel. 
+ + An indexing scheme is very similar to the iterator concept: it defines the bounds of the iteration, which is not necessarily the + complete field but could also be a certain sub-block, for example the ghost layer in a certain direction. + + + Let's start to write a simple kernel that doubles all values stored in a field: + \code + #include "gpu/FieldAccessor.h" + + __global__ void kernel_double( gpu::FieldAccessor<double> f ) + { + f.set( blockIdx, threadIdx ); + f.get() *= 2.0; + } + \endcode + We do not have to care about indexing, the gpu::FieldAccessor takes care of that. So this is a generic kernel that operates + on double fields. Using the gpu::FieldAccessor the current and neighboring values can be accessed and manipulated. + + This kernel can be called like this: + + \code + gpu::FieldIndexing<double> indexing = gpu::FieldIndexing<double>::sliceBeforeGhostLayerXYZ( field, 1, stencil::E, true ); + kernel_double<<< indexing.gridDim(), indexing.blockDim() >>> ( indexing.gpuAccess() ); + \endcode + + In the example above we only iterate over a slice of the field. Of course we can also iterate over the complete field, there are + various static member functions in an Indexing class to create certain iteration patterns. + The Indexing class encapsulates the information of how to launch the kernel (blockDim and gridDim) and holds the Accessor class that + is passed to the kernel. 
+ + Two indexing strategies are currently provided: + - gpu::FieldIndexing and gpu::FieldAccessor (general, but slow ) + - gpu::FieldIndexingXYZ and gpu::FieldAccessorXYZ ( optimized for cell based iterating over bigger chunks, for fields where xSize bigger than warpSize ) + +*/ +} diff --git a/src/cuda/ideasForCommunication.txt b/src/gpu/ideasForCommunication.txt similarity index 100% rename from src/cuda/ideasForCommunication.txt rename to src/gpu/ideasForCommunication.txt diff --git a/src/cuda/lbm/CMakeLists.txt b/src/gpu/lbm/CMakeLists.txt similarity index 72% rename from src/cuda/lbm/CMakeLists.txt rename to src/gpu/lbm/CMakeLists.txt index a2db712aa018fd306019a4c43bf177f536f0c2f9..ae7f60ac849311c4b01a7290b7c0b57d31f06ffe 100644 --- a/src/cuda/lbm/CMakeLists.txt +++ b/src/gpu/lbm/CMakeLists.txt @@ -1,4 +1,4 @@ -target_sources( cuda +target_sources( gpu PRIVATE CombinedInPlaceGpuPackInfo.h ) diff --git a/src/cuda/lbm/CombinedInPlaceGpuPackInfo.h b/src/gpu/lbm/CombinedInPlaceGpuPackInfo.h similarity index 91% rename from src/cuda/lbm/CombinedInPlaceGpuPackInfo.h rename to src/gpu/lbm/CombinedInPlaceGpuPackInfo.h index c47d815c111efb82e2d199f17763d620744397b4..cabae3221ac6d40be432f6ab3b6a9179736b10a6 100644 --- a/src/cuda/lbm/CombinedInPlaceGpuPackInfo.h +++ b/src/gpu/lbm/CombinedInPlaceGpuPackInfo.h @@ -22,15 +22,16 @@ #define IS_EVEN(x) ((x & 1) ^ 1) -#include "cuda/communication/GeneratedGPUPackInfo.h" - #include "lbm/inplace_streaming/TimestepTracker.h" +#include "gpu/GPUWrapper.h" +#include "gpu/communication/GeneratedGPUPackInfo.h" + namespace walberla { namespace lbm { template< typename EvenPackInfo, typename OddPackInfo > -class CombinedInPlaceGpuPackInfo : public cuda::GeneratedGPUPackInfo +class CombinedInPlaceGpuPackInfo : public gpu::GeneratedGPUPackInfo { public: template< typename... 
Args > @@ -40,7 +41,7 @@ class CombinedInPlaceGpuPackInfo : public cuda::GeneratedGPUPackInfo virtual ~CombinedInPlaceGpuPackInfo() = default; - void pack(stencil::Direction dir, unsigned char* buffer, IBlock* block, cudaStream_t stream) override + void pack(stencil::Direction dir, unsigned char* buffer, IBlock* block, gpuStream_t stream) override { if (IS_EVEN(tracker_->getCounter())) { @@ -52,7 +53,7 @@ class CombinedInPlaceGpuPackInfo : public cuda::GeneratedGPUPackInfo } } - void unpack(stencil::Direction dir, unsigned char* buffer, IBlock* block, cudaStream_t stream) override { + void unpack(stencil::Direction dir, unsigned char* buffer, IBlock* block, gpuStream_t stream) override { if (IS_EVEN(tracker_->getCounter())) { evenPackInfo_.unpack(dir, buffer, block, stream); diff --git a/src/cuda/sweeps/CMakeLists.txt b/src/gpu/sweeps/CMakeLists.txt similarity index 66% rename from src/cuda/sweeps/CMakeLists.txt rename to src/gpu/sweeps/CMakeLists.txt index 188a4cbae837e9e649f21c242f7a33e4fdbcc7ff..2126d798ceaa54823dfced6ad447039f0121f4b2 100644 --- a/src/cuda/sweeps/CMakeLists.txt +++ b/src/gpu/sweeps/CMakeLists.txt @@ -1,4 +1,4 @@ -target_sources( cuda +target_sources( gpu PRIVATE GPUSweepBase.h ) diff --git a/src/cuda/sweeps/GPUSweepBase.h b/src/gpu/sweeps/GPUSweepBase.h similarity index 92% rename from src/cuda/sweeps/GPUSweepBase.h rename to src/gpu/sweeps/GPUSweepBase.h index fbd5e2f8e6ff688a95c2e13425f58ff49085b8c9..f8e61dd14fa58246a4b28e65fa3e8edf6a9a8774 100644 --- a/src/cuda/sweeps/GPUSweepBase.h +++ b/src/gpu/sweeps/GPUSweepBase.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file GPUSweepBase.h -//! \ingroup cuda +//! \ingroup gpu //! 
\author Paulo Carvalho <prcjunior@inf.ufpr.br> // //====================================================================================================================== @@ -22,7 +22,7 @@ #pragma once -#include "cuda/GPUField.h" +#include "gpu/GPUField.h" #include "core/debug/Debug.h" @@ -31,16 +31,15 @@ #include <set> namespace walberla { -namespace cuda { +namespace gpu +{ template < typename GPUField_T > class GPUSweepBase { public: - GPUSweepBase() - { - } + GPUSweepBase() = default; virtual ~GPUSweepBase() { for( auto field = dstFields_.begin(); field != dstFields_.end(); ++field ) @@ -58,7 +57,7 @@ public: } GPUField_T * dst = src->cloneUninitialized(); - WALBERLA_ASSERT_NOT_NULLPTR( dst ); + WALBERLA_ASSERT_NOT_NULLPTR( dst ) dstFields_.insert( dst ); @@ -71,6 +70,6 @@ protected: }; -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/python_coupling/CMakeLists.txt b/src/python_coupling/CMakeLists.txt index 08266d3a9a51a452ba7022de56994f935fa7861c..309a7cb9a451fe2b9d64347851334a158f7e6a63 100644 --- a/src/python_coupling/CMakeLists.txt +++ b/src/python_coupling/CMakeLists.txt @@ -1,7 +1,7 @@ add_library( python_coupling ) target_link_libraries( python_coupling PUBLIC pybind11::embed core communication domain_decomposition stencil field blockforest vtk ) - if( WALBERLA_BUILD_WITH_CUDA ) - target_link_libraries( python_coupling PUBLIC cuda ) + if( WALBERLA_BUILD_WITH_GPU_SUPPORT ) + target_link_libraries( python_coupling PUBLIC gpu ) endif() target_sources( python_coupling PRIVATE diff --git a/src/python_coupling/export/CMakeLists.txt b/src/python_coupling/export/CMakeLists.txt index bea431188570ec668ce21d5d50ebf2d372c81591..a7b38929727c181622e0615baf877fb1d38e1abf 100644 --- a/src/python_coupling/export/CMakeLists.txt +++ b/src/python_coupling/export/CMakeLists.txt @@ -2,7 +2,8 @@ target_sources( python_coupling PRIVATE GatherExport.impl.h VTKExport.cpp - CUDAExport.h + GPUExport.h + GPUExport.impl.h 
FieldCommunicationExport.impl.h BasicExport.cpp BlockForestCommunicationExport.h @@ -16,5 +17,4 @@ target_sources( python_coupling BasicExport.h FieldExport.impl.h FieldExports.h - CUDAExport.impl.h ) diff --git a/src/python_coupling/export/CUDAExport.h b/src/python_coupling/export/GPUExport.h similarity index 92% rename from src/python_coupling/export/CUDAExport.h rename to src/python_coupling/export/GPUExport.h index 505aa3368e3008dec45eecf5acd9f3a231bc7b6e..6976a8c6ac51f41ad02ca4b66171cb568ae75e80 100644 --- a/src/python_coupling/export/CUDAExport.h +++ b/src/python_coupling/export/GPUExport.h @@ -13,8 +13,8 @@ // You should have received a copy of the GNU General Public License along // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // -//! \file CUDAExport.h -//! \ingroup cuda +//! \file GPUExport.h +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> //! \author Markus Holzer <markus.holzer@fau.de> // @@ -26,7 +26,8 @@ namespace walberla { -namespace cuda { +namespace gpu +{ template<typename... GpuFields> @@ -36,9 +37,9 @@ namespace cuda { void exportCopyFunctionsToPython(py::module_ &m); -} // namespace cuda +} // namespace gpu } // namespace walberla -#include "CUDAExport.impl.h" +#include "GPUExport.impl.h" #endif //WALBERLA_BUILD_WITH_PYTHON diff --git a/src/python_coupling/export/CUDAExport.impl.h b/src/python_coupling/export/GPUExport.impl.h similarity index 92% rename from src/python_coupling/export/CUDAExport.impl.h rename to src/python_coupling/export/GPUExport.impl.h index 0e724cebe45f7a420d3beb44b2daef50e2a16ca7..cffbc245e985ba208b50569a2bfc3125f61c0e6a 100644 --- a/src/python_coupling/export/CUDAExport.impl.h +++ b/src/python_coupling/export/GPUExport.impl.h @@ -13,8 +13,8 @@ // You should have received a copy of the GNU General Public License along // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // -//! \file CUDAExport.impl.h -//! \ingroup cuda +//! 
\file GPUExport.impl.h +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> //! \author Markus Holzer <markus.holzer@fau.de> // @@ -23,10 +23,10 @@ // Do not reorder includes - the include order is important #include "core/logging/Logging.h" -#include "cuda/AddGPUFieldToStorage.h" -#include "cuda/FieldCopy.h" -#include "cuda/GPUField.h" -#include "cuda/communication/GPUPackInfo.h" +#include "gpu/AddGPUFieldToStorage.h" +#include "gpu/FieldCopy.h" +#include "gpu/GPUField.h" +#include "gpu/communication/GPUPackInfo.h" #include "field/AddToStorage.h" #include "field/communication/UniformMPIDatatypeInfo.h" @@ -35,7 +35,8 @@ #include "python_coupling/helper/MplHelpers.h" namespace walberla { -namespace cuda { +namespace gpu +{ @@ -70,7 +71,7 @@ using namespace pybind11::literals; typedef typename GpuField_T::value_type T; std::string data_type_name = field::internal::PythonFormatString<T>::get(); - std::string class_name = "GpuField_" + data_type_name; + std::string const class_name = "GpuField_" + data_type_name; py::class_<GpuField_T, shared_ptr<GpuField_T>>(m_, class_name.c_str() ) .def_property_readonly("layout", &field::internal::field_layout < GpuField_T > ) .def_property_readonly("size", &field::internal::field_size < GpuField_T > ) @@ -89,11 +90,11 @@ using namespace pybind11::literals; using field::communication::PackInfo; using communication::GPUPackInfo; - std::string GpuFieldPackInfoName = "GpuFieldPackInfo_" + data_type_name; + std::string const GpuFieldPackInfoName = "GpuFieldPackInfo_" + data_type_name; py::class_< GPUPackInfo<GpuField_T>, shared_ptr< GPUPackInfo<GpuField_T> >, walberla::communication::UniformPackInfo>(m_, GpuFieldPackInfoName.c_str() ); using field::communication::UniformMPIDatatypeInfo; - std::string GpuFieldMPIDataTypeInfoName = "GpuFieldMPIDataTypeInfo_" + data_type_name; + std::string const GpuFieldMPIDataTypeInfoName = "GpuFieldMPIDataTypeInfo_" + data_type_name; py::class_< UniformMPIDatatypeInfo<GpuField_T>, 
shared_ptr< UniformMPIDatatypeInfo<GpuField_T> >, walberla::communication::UniformMPIDatatypeInfo>(m_, GpuFieldMPIDataTypeInfoName.c_str() ); } @@ -167,7 +168,7 @@ using namespace pybind11::literals; template< typename GpuField_T> void operator() ( python_coupling::NonCopyableWrap<GpuField_T> ) { - using cuda::communication::GPUPackInfo; + using gpu::communication::GPUPackInfo; IBlock * firstBlock = & ( * blocks_->begin() ); if( firstBlock->isDataClassOrSubclassOf<GpuField_T>(fieldId_) ) @@ -199,7 +200,7 @@ using namespace pybind11::literals; static py::object PackInfoWrapper(const shared_ptr<StructuredBlockForest> & blocks, const std::string & name, uint_t numberOfGhostLayers ) { - using cuda::communication::GPUPackInfo; + using gpu::communication::GPUPackInfo; BlockDataID fieldID = python_coupling::blockDataIDFromString( *blocks, name ); if ( blocks->begin() == blocks->end() ) { @@ -296,15 +297,15 @@ class copyFieldToGpuDispatchExporter template< typename CpuField_T> void operator() ( python_coupling::NonCopyableWrap<CpuField_T> ) { - typedef cuda::GPUField<typename CpuField_T::value_type> GpuField_T; + typedef gpu::GPUField<typename CpuField_T::value_type> GpuField_T; IBlock * firstBlock = & ( * blocks_->begin() ); if(firstBlock->isDataClassOrSubclassOf< CpuField_T > ( cpuFieldId_ ) ) { if(toGPU_) - cuda::fieldCpy<GpuField_T, CpuField_T>(blocks_, gpuFieldId_, cpuFieldId_); + gpu::fieldCpy<GpuField_T, CpuField_T>(blocks_, gpuFieldId_, cpuFieldId_); else - cuda::fieldCpy<CpuField_T, GpuField_T>(blocks_, cpuFieldId_, gpuFieldId_); + gpu::fieldCpy<CpuField_T, GpuField_T>(blocks_, cpuFieldId_, gpuFieldId_); } } private: @@ -321,8 +322,8 @@ void copyFieldToGPU(const shared_ptr< StructuredBlockForest > & blocks, const st namespace py = pybind11; auto result = make_shared<py::object>(); - BlockDataID gpuFieldId = python_coupling::blockDataIDFromString( *blocks, gpuFieldName ); - BlockDataID cpuFieldId = python_coupling::blockDataIDFromString( *blocks, cpuFieldName ); + 
BlockDataID const gpuFieldId = python_coupling::blockDataIDFromString( *blocks, gpuFieldName ); + BlockDataID const cpuFieldId = python_coupling::blockDataIDFromString( *blocks, cpuFieldName ); copyFieldToGpuDispatchExporter exporter( blocks, gpuFieldId, cpuFieldId, toGPU ); python_coupling::for_each_noncopyable_type<CpuFields...>( std::ref(exporter) ); @@ -335,7 +336,7 @@ using namespace pybind11::literals; template<typename... GpuFields> void exportModuleToPython(py::module_ &m) { - py::module_ m2 = m.def_submodule("cuda", "Cuda Extension of the waLBerla python bindings"); + py::module_ m2 = m.def_submodule("gpu", "GPU (CUDA / HIP) Extension of the waLBerla python bindings"); python_coupling::for_each_noncopyable_type<GpuFields...>( internal::GpuFieldExporter(m2) ); @@ -368,7 +369,7 @@ void exportModuleToPython(py::module_ &m) template<typename... CpuFields > void exportCopyFunctionsToPython(py::module_ &m) { - py::module_ m2 = m.def_submodule("cuda", "Cuda Extension of the waLBerla python bindings"); + py::module_ m2 = m.def_submodule("gpu", "GPU (CUDA / HIP) Extension of the waLBerla python bindings"); m2.def( "copyFieldToGpu", @@ -388,7 +389,7 @@ void exportCopyFunctionsToPython(py::module_ &m) -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/waLBerlaDefinitions.in.h b/src/waLBerlaDefinitions.in.h index 4705c5f914296c2298e2e7e61eb2cbb71a69808f..ea9dfee9179c8dbd8ec92e943216f25a77842180 100644 --- a/src/waLBerlaDefinitions.in.h +++ b/src/waLBerlaDefinitions.in.h @@ -34,6 +34,8 @@ #cmakedefine WALBERLA_MESAPD_CONVEX_POLYHEDRON_AVAILABLE #cmakedefine WALBERLA_BUILD_WITH_CUDA +#cmakedefine WALBERLA_BUILD_WITH_HIP +#cmakedefine WALBERLA_BUILD_WITH_GPU_SUPPORT #cmakedefine WALBERLA_BUILD_WITH_CODEGEN diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b7032214d314cae434db0a84da4eace9bdf30c52..94efcd3ae7e6607d6528c0bf2bd50a884cda4810 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -5,7 +5,7 @@ 
include_directories( ${walberla_BINARY_DIR}/src ) # for generated headers add_subdirectory( blockforest ) add_subdirectory( boundary ) add_subdirectory( core ) -add_subdirectory( cuda ) +add_subdirectory( gpu ) add_subdirectory( domain_decomposition ) add_subdirectory( executiontree ) add_subdirectory( fft ) diff --git a/tests/cuda/communication/CommTest.cpp b/tests/cuda/communication/CommTest.cpp deleted file mode 100644 index 8233ac5615a106413516627720ee563e98b4fe0a..0000000000000000000000000000000000000000 --- a/tests/cuda/communication/CommTest.cpp +++ /dev/null @@ -1,244 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file -//! 
\author Paulo Carvalho <prcjunior@inf.ufpr.br> -// -//====================================================================================================================== - - -#include "core/debug/TestSubsystem.h" -#include "core/Environment.h" -#include "core/mpi/Datatype.h" - -#include "field/communication/MPIDatatypes.h" -#include "field/Field.h" - -#include "cuda/GPUField.h" -#include "cuda/FieldCopy.h" - -#define NUM_ITER 100 -#define SIZE_X 16 -#define SIZE_Y 16 -#define SIZE_Z 16 -#define LAYOUT field::fzyx - - -using namespace walberla; - -void hostToHost() -{ - Field<double, 1> hostField1(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); - Field<double, 1> hostField2(SIZE_X, SIZE_Y, SIZE_Z, 0, LAYOUT); - - double startTime = MPI_Wtime(); - for (int i = 0; i < NUM_ITER; ++i) { - hostField2.set(hostField1); - } - double endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; -} - -void hostToDevice() -{ - Field<double, 1> hostField(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); - cuda::GPUField<double> deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); - - double startTime = MPI_Wtime(); - for (int i = 0; i < NUM_ITER; ++i) { - cuda::fieldCpy(deviceField, hostField); - } - double endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; -} - -void deviceToHost() -{ - Field<double, 1> hostField(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); - cuda::GPUField<double> deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); - cuda::fieldCpy(deviceField, hostField); - - double startTime = MPI_Wtime(); - for (int i = 0; i < NUM_ITER; ++i) { - cuda::fieldCpy(hostField, deviceField); - } - double endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; -} - -void mpiHostToHost() -{ - Field<double, 1> hostField1(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); - Field<double, 1> hostField2(SIZE_X, SIZE_Y, SIZE_Z, 0.0, LAYOUT); - - auto hostDatatype1 = mpi::Datatype ( 
field::communication::mpiDatatype( hostField1 ) ); - auto hostDatatype2 = mpi::Datatype ( field::communication::mpiDatatype( hostField2 ) ); - - double startTime = MPI_Wtime(); - for (int i = 0; i < NUM_ITER; ++i) { - MPI_Request request1; - MPI_Isend( hostField1.data(), 1, hostDatatype1, 0, 0, MPI_COMM_WORLD, &request1 ); - - MPI_Request request2; - MPI_Irecv( hostField2.data(), 1, hostDatatype2, 0, 0, MPI_COMM_WORLD, &request2 ); - - MPI_Wait( &request1, MPI_STATUS_IGNORE ); - MPI_Wait( &request2, MPI_STATUS_IGNORE ); - } - double endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; -} - -void mpiHostToDevice() -{ - Field<double, 1> hostField(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); - cuda::GPUField<double> deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); - - auto hostDatatype = mpi::Datatype ( field::communication::mpiDatatype( hostField ) ); - auto deviceDatatype = mpi::Datatype ( field::communication::mpiDatatype( deviceField ) ); - - double startTime = MPI_Wtime(); - for (int i = 0; i < NUM_ITER; ++i) { - MPI_Request request1; - MPI_Isend( hostField.data(), 1, hostDatatype, 0, 0, MPI_COMM_WORLD, &request1 ); - - MPI_Request request2; - MPI_Irecv( deviceField.data(), 1, deviceDatatype, 0, 0, MPI_COMM_WORLD, &request2 ); - - MPI_Wait( &request1, MPI_STATUS_IGNORE ); - MPI_Wait( &request2, MPI_STATUS_IGNORE ); - } - double endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; -} - -void mpiDeviceToHost() -{ - Field<double, 1> hostField(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); - cuda::GPUField<double> deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); - - auto hostDatatype = mpi::Datatype ( field::communication::mpiDatatype( hostField ) ); - auto deviceDatatype = mpi::Datatype ( field::communication::mpiDatatype( deviceField ) ); - - double startTime = MPI_Wtime(); - for (int i = 0; i < NUM_ITER; ++i) { - MPI_Request request1; - MPI_Isend( deviceField.data(), 1, deviceDatatype, 
0, 0, MPI_COMM_WORLD, &request1 ); - - MPI_Request request2; - MPI_Irecv( hostField.data(), 1, hostDatatype, 0, 0, MPI_COMM_WORLD, &request2 ); - - MPI_Wait( &request1, MPI_STATUS_IGNORE ); - MPI_Wait( &request2, MPI_STATUS_IGNORE ); - } - double endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; -} - -void mpiDeviceToDevice() -{ - cuda::GPUField<double> deviceField1(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); - cuda::GPUField<double> deviceField2(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); - - auto deviceDatatype1 = mpi::Datatype ( field::communication::mpiDatatype( deviceField1 ) ); - auto deviceDatatype2 = mpi::Datatype ( field::communication::mpiDatatype( deviceField2 ) ); - - double startTime = MPI_Wtime(); - for (int i = 0; i < NUM_ITER; ++i) { - MPI_Request request1; - MPI_Isend( deviceField1.data(), 1, deviceDatatype1, 0, 0, MPI_COMM_WORLD, &request1 ); - - MPI_Request request2; - MPI_Irecv( deviceField2.data(), 1, deviceDatatype2, 0, 0, MPI_COMM_WORLD, &request2 ); - - MPI_Wait( &request1, MPI_STATUS_IGNORE ); - MPI_Wait( &request2, MPI_STATUS_IGNORE ); - } - double endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; -} - -void mpiCopyHostToDevice() -{ - Field<double, 1> hostField1(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); - Field<double, 1> hostField2(SIZE_X, SIZE_Y, SIZE_Z, 0.0, LAYOUT); - cuda::GPUField<double> deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); - - auto hostDatatype1 = mpi::Datatype ( field::communication::mpiDatatype( hostField1 ) ); - auto hostDatatype2 = mpi::Datatype ( field::communication::mpiDatatype( hostField2 ) ); - - double startTime = MPI_Wtime(); - for (int i = 0; i < NUM_ITER; ++i) { - MPI_Request request1; - MPI_Isend( hostField1.data(), 1, hostDatatype1, 0, 0, MPI_COMM_WORLD, &request1 ); - - MPI_Request request2; - MPI_Irecv( hostField2.data(), 1, hostDatatype2, 0, 0, MPI_COMM_WORLD, &request2 ); - - MPI_Wait( &request1, MPI_STATUS_IGNORE ); - 
MPI_Wait( &request2, MPI_STATUS_IGNORE ); - - cuda::fieldCpy(deviceField, hostField2); - } - double endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; -} - -void mpiCopyDeviceToHost() -{ - Field<double, 1> hostField1(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); - Field<double, 1> hostField2(SIZE_X, SIZE_Y, SIZE_Z, 0.0, LAYOUT); - cuda::GPUField<double> deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); - - auto hostDatatype1 = mpi::Datatype ( field::communication::mpiDatatype( hostField1 ) ); - auto hostDatatype2 = mpi::Datatype ( field::communication::mpiDatatype( hostField2 ) ); - - double startTime = MPI_Wtime(); - for (int i = 0; i < NUM_ITER; ++i) { - - MPI_Request request2; - MPI_Irecv( hostField2.data(), 1, hostDatatype2, 0, 0, MPI_COMM_WORLD, &request2 ); - - cuda::fieldCpy(hostField1, deviceField); - - MPI_Request request1; - MPI_Isend( hostField1.data(), 1, hostDatatype1, 0, 0, MPI_COMM_WORLD, &request1 ); - - MPI_Wait( &request1, MPI_STATUS_IGNORE ); - MPI_Wait( &request2, MPI_STATUS_IGNORE ); - } - double endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; -} - -int main( int argc, char ** argv ) -{ - debug::enterTestMode(); - walberla::Environment walberlaEnv( argc, argv ); - - WALBERLA_CHECK_EQUAL(MPIManager::instance()->numProcesses(), 2); - - hostToHost(); - hostToDevice(); - deviceToHost(); - mpiHostToHost(); - mpiHostToDevice(); - mpiDeviceToHost(); - mpiDeviceToDevice(); - mpiCopyHostToDevice(); - mpiCopyDeviceToHost(); - - return 0; -} diff --git a/tests/cuda/communication/GPUPackInfoCommunicationTest.cpp b/tests/cuda/communication/GPUPackInfoCommunicationTest.cpp deleted file mode 100644 index 8e877874ef81051b3c55d272b50227cc65f3071f..0000000000000000000000000000000000000000 --- a/tests/cuda/communication/GPUPackInfoCommunicationTest.cpp +++ /dev/null @@ -1,177 +0,0 @@ 
-//======================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file GPUFieldPackInfoTest.cpp -//! \ingroup cuda -//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br> -//! \brief Short communication test to verify the equivalence of GPUPackInfo using a default stream and multiple streams. 
-// -//======================================================================================================================== - -#include "core/DataTypes.h" -#include "core/debug/TestSubsystem.h" -#include "core/math/Random.h" -#include "core/mpi/Environment.h" - -#include "stencil/Directions.h" -#include "stencil/Iterator.h" -#include "stencil/D3Q27.h" - -#include "domain_decomposition/BlockDataID.h" - -#include "blockforest/Initialization.h" -#include "blockforest/communication/UniformBufferedScheme.h" - -#include "field/GhostLayerField.h" - -#include "cuda/ErrorChecking.h" -#include "cuda/HostFieldAllocator.h" -#include "cuda/GPUField.h" -#include "cuda/FieldCopy.h" -#include "cuda/communication/GPUPackInfo.h" - -#include <cuda_runtime.h> -#include <vector> - -using namespace walberla; - -using DataType = walberla::uint_t; -using StencilType = stencil::D3Q27; -using FieldType = field::GhostLayerField< DataType, StencilType::Size >; -using GPUFieldType = cuda::GPUField< DataType >; -using CommSchemeType = blockforest::communication::UniformBufferedScheme<StencilType>; -using GPUPackInfoType = cuda::communication::GPUPackInfo< GPUFieldType >; - -static std::vector< cuda::Layout > fieldLayouts = { cuda::fzyx, cuda::fzyx }; -static uint_t fieldLayoutIndex = 0; - - -FieldType * createField( IBlock* const block, StructuredBlockStorage* const storage ) -{ - return new FieldType( - storage->getNumberOfXCells( *block ), // number of cells in x direction per block - storage->getNumberOfYCells( *block ), // number of cells in y direction per block - storage->getNumberOfZCells( *block ), // number of cells in z direction per block - 1, // one ghost layer - DataType(0), // initial value - fieldLayouts[fieldLayoutIndex], // layout - make_shared<cuda::HostFieldAllocator< DataType > >() // allocator for host pinned memory - ); -} - - -GPUFieldType * createGPUField( IBlock* const block, StructuredBlockStorage* const storage ) -{ - return new GPUFieldType( - 
storage->getNumberOfXCells( *block ), // number of cells in x direction - storage->getNumberOfYCells( *block ), // number of cells in y direction - storage->getNumberOfZCells( *block ), // number of cells in z direction - StencilType::Size, // number of cells for pdfs - 1, // one ghost layer - fieldLayouts[fieldLayoutIndex] ); -} - - -void initFields( const shared_ptr< StructuredBlockStorage > & blocks, const BlockDataID & fieldID ) -{ - for( auto block = blocks->begin(); block != blocks->end(); ++block ) - { - auto fieldPtr = block->getData< FieldType >( fieldID ); - - for( auto fieldIt = fieldPtr->begin(); fieldIt != fieldPtr->end(); ++fieldIt ) - *fieldIt = math::intRandom< DataType >(); - } -} - - -int main( int argc, char ** argv ) -{ - debug::enterTestMode(); - mpi::Environment mpiEnv( argc, argv ); - - - const Vector3< uint_t > cells = Vector3< uint_t >( 4, 4, 4 ); - - uint_t nProc = uint_c( MPIManager::instance()->numProcesses() ); - - for(; fieldLayoutIndex < fieldLayouts.size(); ++fieldLayoutIndex ) - { - auto blocks = blockforest::createUniformBlockGrid(nProc, 1, 1, // blocks - cells[0], cells[1], cells[2], // cells - 1, // unit cell spacing - true, // one block per process - true, true, true); // periodic in all directions - - BlockDataID sourceFieldId = blocks->addStructuredBlockData< FieldType >( &createField, - "ScalarField" ); - - BlockDataID syncGPUFieldId = blocks->addStructuredBlockData< GPUFieldType >( &createGPUField, - "syncGPUField" ); - - BlockDataID asyncGPUFieldId = blocks->addStructuredBlockData< GPUFieldType >( &createGPUField, - "asyncGPUField" ); - - math::seedRandomGenerator( numeric_cast<std::mt19937::result_type>( MPIManager::instance()->rank() ) ); - // Initialize CPU field with random values - initFields( blocks, sourceFieldId ); - - // Copy same CPU field to both GPU fields - for( auto block = blocks->begin(); block != blocks->end(); ++block ) - { - auto sourceFieldPtr = block->getData< FieldType >( sourceFieldId ); - - auto 
syncGPUFieldPtr = block->getData< GPUFieldType >( syncGPUFieldId ); - cuda::fieldCpy( *syncGPUFieldPtr, *sourceFieldPtr ); - - auto asyncGPUFieldPtr = block->getData< GPUFieldType >( asyncGPUFieldId ); - cuda::fieldCpy( *asyncGPUFieldPtr, *sourceFieldPtr ); - } - - // Setup communication schemes for synchronous GPUPackInfo - CommSchemeType syncCommScheme(blocks); - syncCommScheme.addPackInfo( make_shared< GPUPackInfoType >( syncGPUFieldId ) ); - - // Setup communication scheme for asynchronous GPUPackInfo, which uses CUDA streams - CommSchemeType asyncCommScheme(blocks); - asyncCommScheme.addPackInfo( make_shared< GPUPackInfoType >( asyncGPUFieldId ) ); - - // Perform one communication step for each scheme - syncCommScheme(); - asyncCommScheme(); - - // Check results - FieldType syncFieldCpu( cells[0], cells[1], cells[2], 1, fieldLayouts[fieldLayoutIndex], - make_shared< cuda::HostFieldAllocator< DataType > >() ); - FieldType asyncFieldCpu( cells[0], cells[1], cells[2], 1, fieldLayouts[fieldLayoutIndex], - make_shared< cuda::HostFieldAllocator< DataType > >() ); - - for( auto block = blocks->begin(); block != blocks->end(); ++block ) - { - auto syncGPUFieldPtr = block->getData< GPUFieldType >( syncGPUFieldId ); - cuda::fieldCpy( syncFieldCpu, *syncGPUFieldPtr ); - - auto asyncGPUFieldPtr = block->getData< GPUFieldType >( asyncGPUFieldId ); - cuda::fieldCpy( asyncFieldCpu, *asyncGPUFieldPtr ); - - for( auto syncIt = syncFieldCpu.beginWithGhostLayerXYZ(), asyncIt = asyncFieldCpu.beginWithGhostLayerXYZ(); - syncIt != syncFieldCpu.end(); - ++syncIt, ++asyncIt ) - WALBERLA_CHECK_EQUAL( *syncIt, *asyncIt ); - } - } - - - return EXIT_SUCCESS; -} diff --git a/tests/cuda/communication/GPUPackInfoTest.cpp b/tests/cuda/communication/GPUPackInfoTest.cpp deleted file mode 100644 index 0cafd76f5178022de70a3cf3d96e0fc2f139e7b5..0000000000000000000000000000000000000000 --- a/tests/cuda/communication/GPUPackInfoTest.cpp +++ /dev/null @@ -1,186 +0,0 @@ 
-//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file GPUFieldPackInfoTest.cpp -//! \ingroup cuda -//! \author Paulo Carvalho <prcjunior@inf.ufpr.br> -//! \brief Tests if a GPUField is correctly packed into buffers -// -//====================================================================================================================== - -#include "field/GhostLayerField.h" - -#include "cuda/GPUField.h" -#include "cuda/FieldCopy.h" -#include "cuda/communication/GPUPackInfo.h" - -#include "blockforest/Initialization.h" - -#include "core/debug/TestSubsystem.h" -#include "core/mpi/MPIManager.h" - -#include "stencil/D3Q27.h" - -#include <cstring> -#include <vector> -#include <cuda_runtime.h> - -#define F_SIZE 19 - -using namespace walberla; - -static std::vector< field::Layout > fieldLayouts = { field::fzyx, field::zyxf }; -static uint_t fieldLayoutIndex = 0; - -cuda::GPUField<int> * createGPUField( IBlock* const block, StructuredBlockStorage* const storage ) -{ - return new cuda::GPUField<int> ( - storage->getNumberOfXCells( *block ), // number of cells in x direction - storage->getNumberOfYCells( *block ), // number of cells in y direction - storage->getNumberOfZCells( *block ), // number of cells in 
z direction - F_SIZE, // fSize - 1, // number of ghost layers - fieldLayouts[fieldLayoutIndex] ); -} - -// Tester base class. The communicate() template method allows testing different communication methods. -class GPUPackInfoTester -{ -public: - - typedef cuda::communication::GPUPackInfo< cuda::GPUField<int> > GPUPackInfoType; - - GPUPackInfoTester( IBlock* block, BlockDataID fieldId ) : - block_( block ), fieldId_( fieldId ) {} - - virtual ~GPUPackInfoTester() {} - - void test( stencil::Direction dir ) - { - cuda::GPUField<int> & gpuField = *(block_->getData<cuda::GPUField<int> >( fieldId_ )); - - field::GhostLayerField<int,F_SIZE> cpuField( - gpuField.xSize(), // number of cells in x direction - gpuField.ySize(), // number of cells in y direction - gpuField.zSize(), // number of cells in z direction - 1, // number of ghost layers - 0, // initial value - fieldLayouts[fieldLayoutIndex]); - cpuField.setWithGhostLayer( 0 ); - - int val = 0; - for ( auto it = cpuField.beginSliceBeforeGhostLayer( dir ); it != cpuField.end(); ++it ) - { - *it = ++val; - } - cuda::fieldCpy( gpuField, cpuField ); - - GPUPackInfoType gpuPackInfo( fieldId_ ); - - communicate( gpuPackInfo, dir ); - cuda::fieldCpy( cpuField, gpuField ); - - val = 0; - for ( auto it = cpuField.beginGhostLayerOnly( stencil::inverseDir[dir] ); it != cpuField.end(); ++it ) - { - WALBERLA_CHECK_EQUAL( *it, ++val ); - } - - } - -protected: - - virtual void communicate( GPUPackInfoType& gpuPackInfo, stencil::Direction dir ) = 0; - - IBlock* block_; - BlockDataID fieldId_; -}; - - -// Tester for buffer communication -class GPUPackInfoBufferTester: public GPUPackInfoTester -{ -public: - GPUPackInfoBufferTester( IBlock* block, BlockDataID fieldId): GPUPackInfoTester( block, fieldId ) {} - -protected: - void communicate( GPUPackInfoType& gpuPackInfo, stencil::Direction dir ) - { - mpi::GenericSendBuffer<> sendBuf; - sendBuf.addDebugMarker( "Be" ); - gpuPackInfo.packData( block_, dir, sendBuf ); - 
sendBuf.addDebugMarker( "Af" ); - - // Manually copy over the send to the receive buffer - mpi::GenericRecvBuffer<> recvBuf; - recvBuf.resize( sendBuf.size() ); - memcpy( recvBuf.ptr(), sendBuf.ptr(), sendBuf.size() * sizeof(mpi::GenericSendBuffer<>::ElementType) ); - - recvBuf.readDebugMarker( "Be" ); - gpuPackInfo.unpackData( block_, stencil::inverseDir[dir], recvBuf ); - recvBuf.readDebugMarker( "Af" ); - } -}; - - -// Tester for local communication -class GPUPackInfoLocalTester: public GPUPackInfoTester -{ -public: - GPUPackInfoLocalTester( IBlock* block, BlockDataID fieldId ): GPUPackInfoTester( block, fieldId ) {} - -protected: - void communicate( GPUPackInfoType& gpuPackInfo, stencil::Direction dir ) - { - gpuPackInfo.communicateLocal( block_, block_, dir ); - } -}; - - -int main(int argc, char **argv) -{ - using blockforest::createUniformBlockGrid; - - debug::enterTestMode(); - MPIManager::instance()->initializeMPI(&argc,&argv); - - for(; fieldLayoutIndex < fieldLayouts.size(); ++fieldLayoutIndex ) - { - // Create BlockForest - uint_t processes = uint_c( MPIManager::instance()->numProcesses() ); - auto blocks = createUniformBlockGrid(processes,1,1, //blocks - 2,2,2, //cells - 1, //dx - false, //one block per process - true,true,true);//periodicity - - BlockDataID scalarGPUFieldId = blocks->addStructuredBlockData<cuda::GPUField<int> >( - &createGPUField, "ScalarGPUField" ); - - for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) - { - GPUPackInfoBufferTester bufferTester( &(*blockIt), scalarGPUFieldId ); - GPUPackInfoLocalTester localTester( &(*blockIt), scalarGPUFieldId ); - - for( auto dir = stencil::D3Q27::beginNoCenter(); dir != stencil::D3Q27::end(); ++dir ) - { - localTester.test( *dir ); - bufferTester.test( *dir ); - } - } - } - - return 0; -} diff --git a/tests/cuda/AlignmentTest.cpp b/tests/gpu/AlignmentTest.cpp similarity index 93% rename from tests/cuda/AlignmentTest.cpp rename to tests/gpu/AlignmentTest.cpp index 
3de12c7628c98da8797d442f6236d7893829788e..12d50e6c085cb505ed1855855b43845cb95e0abc 100644 --- a/tests/cuda/AlignmentTest.cpp +++ b/tests/gpu/AlignmentTest.cpp @@ -18,14 +18,14 @@ // //====================================================================================================================== -#include "cuda/AlignedAllocation.h" +#include "gpu/AlignedAllocation.h" #include "core/mpi/Environment.h" #include "core/debug/TestSubsystem.h" #include "core/logging/Logging.h" using namespace walberla; -using namespace cuda; +using namespace gpu; int main( int argc, char ** argv ) @@ -39,12 +39,12 @@ int main( int argc, char ** argv ) size_t alignment = 512; size_t offset = 16; void *ptr = allocate_pitched_with_offset( pitch, width, height, alignment, offset ); - WALBERLA_LOG_INFO("Pitch " << pitch); + WALBERLA_LOG_INFO("Pitch " << pitch) char * cptr = reinterpret_cast<char*>( ptr ); WALBERLA_CHECK_EQUAL( size_t(cptr + offset) % alignment, 0 ); free_aligned_with_offset( ptr ); - return 0; + return EXIT_SUCCESS; } diff --git a/tests/cuda/CMakeLists.txt b/tests/gpu/CMakeLists.txt similarity index 97% rename from tests/cuda/CMakeLists.txt rename to tests/gpu/CMakeLists.txt index 723f7818710f502a792d0e5732fc0cf70f4bec01..e760cca4db10704cbedfef89a4faca71631ea730 100644 --- a/tests/cuda/CMakeLists.txt +++ b/tests/gpu/CMakeLists.txt @@ -1,6 +1,6 @@ ################################################################################################### # -# Tests for cuda +# Tests for gpu # ################################################################################################### @@ -34,7 +34,7 @@ waLBerla_execute_test( NAME CodegenJacobiGPU ) waLBerla_generate_target_from_python(NAME CodegenPoissonGPUGeneratedKernel FILE codegen/CudaPoisson.py OUT_FILES PoissonGPU.cu PoissonGPU.h ) -waLBerla_compile_test( FILES codegen/CodegenPoissonGPU.cpp DEPENDS gui cuda timeloop CodegenPoissonGPUGeneratedKernel) +waLBerla_compile_test( FILES codegen/CodegenPoissonGPU.cpp DEPENDS 
gui gpu timeloop CodegenPoissonGPUGeneratedKernel) waLBerla_execute_test( NAME CodegenPoissonGPU ) # The following tests work only for CUDA enabled MPI diff --git a/tests/cuda/CudaMPI.cpp b/tests/gpu/CudaMPI.cpp similarity index 97% rename from tests/cuda/CudaMPI.cpp rename to tests/gpu/CudaMPI.cpp index 56d03807f25da2e62e1616d95cf52abd8d92dd9c..20cee24788dd92888563c16a25b8fdf7bed07e90 100644 --- a/tests/cuda/CudaMPI.cpp +++ b/tests/gpu/CudaMPI.cpp @@ -27,7 +27,7 @@ #include "core/logging/Logging.h" #include "core/mpi/Datatype.h" -#include "cuda/GPUField.h" +#include "gpu/GPUField.h" #include "field/communication/MPIDatatypes.h" #include "field/AddToStorage.h" @@ -44,7 +44,7 @@ void fullFieldTransfer() Field<double,4> h_f1 ( 3, 4, 2, 42.0, field::fzyx ); Field<double,4> h_f2 ( 3, 4, 2, 27.0, field::fzyx ); - cuda::GPUField<double> d_f ( 3, 4, 2, 4, 0, field::fzyx ); + gpu::GPUField<double> d_f ( 3, 4, 2, 4, 0, field::fzyx ); // Transfer h_f1 from CPU to GPU d_f @@ -94,7 +94,7 @@ void blockStorageAndGui( int argc, char ** argv ) BlockDataID cpuFieldID1 = field::addToStorage<ScalarField>( blocks, "CPUField 1", real_c(42), field::fzyx, uint_c(1) ); BlockDataID cpuFieldID2 = field::addToStorage<ScalarField>( blocks, "CPUField 2", real_c(0), field::fzyx, uint_c(1) ); - typedef cuda::GPUField<real_t> GPUField; + typedef gpu::GPUField<real_t> GPUField; BlockDataID gpuFieldID = blocks->addStructuredBlockData< GPUField >( [&] ( IBlock * block, StructuredBlockStorage * const s ) { return new GPUField( s->getNumberOfXCells(*block), diff --git a/tests/cuda/FieldIndexing3DTest.cpp b/tests/gpu/FieldIndexing3DTest.cpp similarity index 78% rename from tests/cuda/FieldIndexing3DTest.cpp rename to tests/gpu/FieldIndexing3DTest.cpp index 4ad2622bc3fb112c6ac960840f2af33af79f65c0..82c677070fbb202cdcd3f7fcc77ac08c9e045a46 100644 --- a/tests/cuda/FieldIndexing3DTest.cpp +++ b/tests/gpu/FieldIndexing3DTest.cpp @@ -22,23 +22,22 @@ #include "core/debug/TestSubsystem.h" #include 
"core/Environment.h" -#include "core/mpi/Datatype.h" #include "field/GhostLayerField.h" -#include "cuda/GPUField.h" -#include "cuda/FieldCopy.h" -#include "cuda/Kernel.h" -#include "cuda/FieldIndexing3D.h" +#include "gpu/GPUField.h" +#include "gpu/FieldCopy.h" +#include "gpu/Kernel.h" +#include "gpu/FieldIndexing3D.h" #include "FieldIndexing3DTest.h" using namespace walberla; -typedef cuda::FieldIndexing3D<int> FieldIdx3D_T; -typedef GhostLayerField<int , F_SIZE> HostField_T; -typedef cuda::GPUField<int> GPUField_T; +using FieldIdx3D_T = gpu::FieldIndexing3D<int>; +using HostField_T = GhostLayerField<int , F_SIZE>; +using GPUField_T = gpu::GPUField<int> ; @@ -46,14 +45,14 @@ void xyzTest() { const HostField_T emptyField( X_SIZE, Y_SIZE, Z_SIZE, GL_SIZE, -1, LAYOUT ); GPUField_T deviceField( X_SIZE, Y_SIZE, Z_SIZE, F_SIZE, 1, LAYOUT ); - cuda::fieldCpy( deviceField, emptyField ); + gpu::fieldCpy( deviceField, emptyField ); - auto setValue = cuda::make_kernel( &setValueKernel ); + auto setValue = gpu::make_kernel( &setValueKernel ); setValue.addFieldIndexingParam( FieldIdx3D_T::xyz( deviceField ) ); setValue(); HostField_T resultField( X_SIZE, Y_SIZE, Z_SIZE, GL_SIZE, -1, LAYOUT ); - cuda::fieldCpy( resultField, deviceField ); + gpu::fieldCpy( resultField, deviceField ); HostField_T expectedField( X_SIZE, Y_SIZE, Z_SIZE, GL_SIZE, -1, LAYOUT ); WALBERLA_FOR_ALL_CELLS_XYZ( &expectedField, @@ -63,7 +62,7 @@ void xyzTest() } ) - WALBERLA_ASSERT( resultField == expectedField ); + WALBERLA_ASSERT( resultField == expectedField ) } @@ -71,14 +70,14 @@ void sliceBeforeGhostLayerXYZTest() { const HostField_T emptyField( X_SIZE, Y_SIZE, Z_SIZE, GL_SIZE, -1, LAYOUT ); GPUField_T deviceField( X_SIZE, Y_SIZE, Z_SIZE, F_SIZE, 1, LAYOUT ); - cuda::fieldCpy( deviceField, emptyField ); + gpu::fieldCpy( deviceField, emptyField ); - auto setValue = cuda::make_kernel( &setValueKernel ); + auto setValue = gpu::make_kernel( &setValueKernel ); setValue.addFieldIndexingParam( 
FieldIdx3D_T::sliceBeforeGhostLayerXYZ( deviceField, 1, stencil::B, true ) ); setValue(); HostField_T resultField( X_SIZE, Y_SIZE, Z_SIZE, GL_SIZE, -1, LAYOUT ); - cuda::fieldCpy( resultField, deviceField ); + gpu::fieldCpy( resultField, deviceField ); HostField_T expectedField( X_SIZE, Y_SIZE, Z_SIZE, GL_SIZE, -1, LAYOUT ); CellInterval ci; @@ -89,17 +88,17 @@ void sliceBeforeGhostLayerXYZTest() expectedField.get( x, y, z, f ) = IDX4D( x - ci.xMin(), y - ci.yMin(), z - ci.zMin(), f ); } ) - WALBERLA_ASSERT( resultField == expectedField ); + WALBERLA_ASSERT( resultField == expectedField ) } int main( int argc, char ** argv ) { debug::enterTestMode(); - walberla::Environment walberlaEnv( argc, argv ); + walberla::Environment const walberlaEnv( argc, argv ); xyzTest(); sliceBeforeGhostLayerXYZTest(); - return 0; + return EXIT_SUCCESS; } diff --git a/tests/cuda/FieldIndexing3DTest.cu b/tests/gpu/FieldIndexing3DTest.cu similarity index 87% rename from tests/cuda/FieldIndexing3DTest.cu rename to tests/gpu/FieldIndexing3DTest.cu index 7ade5bfc6a4149ffb65255c1e7a99aa9e159b72d..edbec01be8150c298aa0fe2a3b703d5156bb867b 100644 --- a/tests/cuda/FieldIndexing3DTest.cu +++ b/tests/gpu/FieldIndexing3DTest.cu @@ -25,9 +25,9 @@ namespace walberla { __global__ void setValueKernel( FieldAccessor3D_T fa ) { - unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; - unsigned int z = blockIdx.z * blockDim.z + threadIdx.z; + unsigned int const x = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int const y = blockIdx.y * blockDim.y + threadIdx.y; + unsigned int const z = blockIdx.z * blockDim.z + threadIdx.z; fa.set( blockIdx, threadIdx ); if ( fa.isValidPosition() ) diff --git a/tests/cuda/FieldIndexing3DTest.h b/tests/gpu/FieldIndexing3DTest.h similarity index 94% rename from tests/cuda/FieldIndexing3DTest.h rename to tests/gpu/FieldIndexing3DTest.h index 
80e1b6cfea60e65d490d1e6c65fc03a0f62660f5..54dcc86053a7d4c8dce079704e6a9926064e3c98 100644 --- a/tests/cuda/FieldIndexing3DTest.h +++ b/tests/gpu/FieldIndexing3DTest.h @@ -20,7 +20,7 @@ #pragma once -#include "cuda/FieldAccessor3D.h" +#include "gpu/FieldAccessor3D.h" #define X_SIZE (64-2) #define Y_SIZE (64-2) @@ -37,7 +37,7 @@ namespace walberla { -typedef cuda::FieldAccessor3D<int> FieldAccessor3D_T; +using FieldAccessor3D_T = gpu::FieldAccessor3D<int>; __global__ void setValueKernel( FieldAccessor3D_T fa ); diff --git a/tests/cuda/FieldTransferTest.cpp b/tests/gpu/FieldTransferTest.cpp similarity index 75% rename from tests/cuda/FieldTransferTest.cpp rename to tests/gpu/FieldTransferTest.cpp index 7a41330a23ab24e7aaeb2055efe9155dc3aa4ca2..c8f5126adf99827aa723c9ec8e7ab45ee6a03185 100644 --- a/tests/cuda/FieldTransferTest.cpp +++ b/tests/gpu/FieldTransferTest.cpp @@ -24,8 +24,8 @@ #include "field/Field.h" -#include "cuda/GPUField.h" -#include "cuda/FieldCopy.h" +#include "gpu/GPUField.h" +#include "gpu/FieldCopy.h" #include "core/math/Random.h" @@ -40,19 +40,18 @@ void simpleTransfer() h_f1(x, y, z, 0) = math::realRandom<double>(); ) - cuda::GPUField<double> d_f( 16, 20, 30, 4, 0, field::fzyx ); + gpu::GPUField<double> d_f( 16, 20, 30, 4, 0, field::fzyx ); - WALBERLA_CHECK_EQUAL( h_f1.xSize(), d_f.xSize()); - WALBERLA_CHECK_EQUAL( h_f1.ySize(), d_f.ySize()); - WALBERLA_CHECK_EQUAL( h_f1.zSize(), d_f.zSize()); - WALBERLA_CHECK_EQUAL( h_f1.fSize(), d_f.fSize()); - WALBERLA_CHECK_EQUAL( h_f1.layout(), d_f.layout()); + WALBERLA_CHECK_EQUAL( h_f1.xSize(), d_f.xSize()) + WALBERLA_CHECK_EQUAL( h_f1.ySize(), d_f.ySize()) + WALBERLA_CHECK_EQUAL( h_f1.zSize(), d_f.zSize()) + WALBERLA_CHECK_EQUAL( h_f1.fSize(), d_f.fSize()) + WALBERLA_CHECK_EQUAL( h_f1.layout(), d_f.layout()) + gpu::fieldCpy( d_f, h_f1 ); + gpu::fieldCpy( h_f2, d_f ); - cuda::fieldCpy( d_f, h_f1 ); - cuda::fieldCpy( h_f2, d_f ); - - WALBERLA_CHECK_EQUAL( h_f1, h_f2 ); + WALBERLA_CHECK_EQUAL( h_f1, h_f2 ) } @@ 
-63,5 +62,5 @@ int main( int argc, char **argv ) simpleTransfer(); - return 0; + return EXIT_SUCCESS; } diff --git a/tests/cuda/Kernels.cu b/tests/gpu/Kernels.cu similarity index 57% rename from tests/cuda/Kernels.cu rename to tests/gpu/Kernels.cu index e0d4c2f9762218960b4ffe6e9285f3a4fc9f2984..daefdb58f559db4b94daa7111c2156a0c7ee4a6e 100644 --- a/tests/cuda/Kernels.cu +++ b/tests/gpu/Kernels.cu @@ -1,14 +1,15 @@ -#include "cuda/FieldAccessor.h" +#include "gpu/FieldAccessor.h" namespace walberla { -namespace cuda { +namespace gpu +{ template<typename T> class GPUField; } -__global__ void kernel_double( cuda::FieldAccessor<double> f ) +__global__ void kernel_double(gpu::FieldAccessor<double> f ) { f.set( blockIdx, threadIdx ); f.get() *= 2.0; diff --git a/tests/cuda/SimpleKernelTest.cpp b/tests/gpu/SimpleKernelTest.cpp similarity index 71% rename from tests/cuda/SimpleKernelTest.cpp rename to tests/gpu/SimpleKernelTest.cpp index f2f9a2a8b4ceb2cec7f032d4e732938d281ef63b..a4fd1db371912840bbca12eadda20b942588b5af 100644 --- a/tests/cuda/SimpleKernelTest.cpp +++ b/tests/gpu/SimpleKernelTest.cpp @@ -18,7 +18,6 @@ // //====================================================================================================================== -#include "cuda/FieldIndexing.h" #include "blockforest/Initialization.h" #include "core/debug/TestSubsystem.h" @@ -26,17 +25,16 @@ #include "field/GhostLayerField.h" -#include "cuda/GPUField.h" -#include "cuda/FieldCopy.h" -#include "cuda/Kernel.h" -#include "gui/Gui.h" -#include "timeloop/SweepTimeloop.h" +#include "gpu/GPUField.h" +#include "gpu/FieldIndexing.h" +#include "gpu/FieldCopy.h" +#include "gpu/Kernel.h" using namespace walberla; namespace walberla{ -void kernel_double( cuda::FieldAccessor<double> f ); +void kernel_double(gpu::FieldAccessor<double> f ); } GhostLayerField<double,1> * createCPUField( IBlock* const block, StructuredBlockStorage* const storage ) @@ -50,9 +48,9 @@ GhostLayerField<double,1> * createCPUField( IBlock* 
const block, StructuredBlock field::fzyx); } -cuda::GPUField<double> * createGPUField( IBlock* const block, StructuredBlockStorage* const storage ) +gpu::GPUField<double> * createGPUField( IBlock* const block, StructuredBlockStorage* const storage ) { - return new cuda::GPUField<double> ( + return new gpu::GPUField<double> ( storage->getNumberOfXCells( *block ), // number of cells in x direction storage->getNumberOfYCells( *block ), // number of cells in y direction storage->getNumberOfZCells( *block ), // number of cells in z direction @@ -64,10 +62,10 @@ cuda::GPUField<double> * createGPUField( IBlock* const block, StructuredBlockSto int main( int argc, char ** argv ) { - walberla::Environment env( argc, argv ); + walberla::Environment const env( argc, argv ); debug::enterTestMode(); - shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid ( + shared_ptr< StructuredBlockForest > const blocks = blockforest::createUniformBlockGrid ( uint_t(1), uint_t(1), uint_t(1), // number of blocks in x,y,z direction uint_t(14), uint_t(14), uint_t(14), // how many cells per block (x,y,z) real_c(0.5), // dx: length of one cell in physical coordinates @@ -76,10 +74,10 @@ int main( int argc, char ** argv ) - BlockDataID cpuFieldID = blocks->addStructuredBlockData< GhostLayerField<double,1> > ( &createCPUField, "CPUField" ); + BlockDataID const cpuFieldID = blocks->addStructuredBlockData< GhostLayerField<double,1> > ( &createCPUField, "CPUField" ); - BlockDataID gpuFieldID = blocks->addStructuredBlockData< cuda::GPUField<double> > ( &createGPUField, "GPUField" ); + BlockDataID const gpuFieldID = blocks->addStructuredBlockData< gpu::GPUField<double> > ( &createGPUField, "GPUField" ); for ( auto blockIterator = blocks->begin(); blockIterator != blocks->end(); ++blockIterator ) { @@ -87,26 +85,19 @@ int main( int argc, char ** argv ) // get the field stored on the current block auto cpuField = currentBlock.getData< GhostLayerField<double,1> > ( cpuFieldID ); - 
auto gpuField = currentBlock.getData< cuda::GPUField<double> > ( gpuFieldID ); + auto gpuField = currentBlock.getData< gpu::GPUField<double> > ( gpuFieldID ); - cuda::fieldCpy( *gpuField, *cpuField ); + gpu::fieldCpy( *gpuField, *cpuField ); - auto myKernel = cuda::make_kernel( &kernel_double ); - auto indexing = cuda::FieldIndexing<double>::sliceBeforeGhostLayerXYZ( *gpuField, 1, stencil::W, true ); + auto myKernel = gpu::make_kernel( &kernel_double ); + auto indexing = gpu::FieldIndexing<double>::sliceBeforeGhostLayerXYZ( *gpuField, 1, stencil::W, true ); myKernel.addFieldIndexingParam(indexing); myKernel(); - cuda::fieldCpy( *cpuField, *gpuField ); + gpu::fieldCpy( *cpuField, *gpuField ); WALBERLA_ASSERT_FLOAT_EQUAL( cpuField->get(0,0,0), real_t(2) ) } - - //SweepTimeloop timeloop ( blocks, uint_t(1) ); - //timeloop.run(); - //GUI gui ( timeloop, blocks, argc, argv ); - //gui.run(); - - return EXIT_SUCCESS; } diff --git a/tests/cuda/codegen/CodegenJacobiGPU.cpp b/tests/gpu/codegen/CodegenJacobiGPU.cpp similarity index 71% rename from tests/cuda/codegen/CodegenJacobiGPU.cpp rename to tests/gpu/codegen/CodegenJacobiGPU.cpp index 93814e0a51bbff7398a57fd3e3130ce751549121..67e43894359123c9228819f48bfb972b20cf6e88 100644 --- a/tests/cuda/codegen/CodegenJacobiGPU.cpp +++ b/tests/gpu/codegen/CodegenJacobiGPU.cpp @@ -21,7 +21,7 @@ #include "CudaJacobiKernel2D.h" #include "CudaJacobiKernel3D.h" -#include "cuda/HostFieldAllocator.h" +#include "gpu/HostFieldAllocator.h" #include "blockforest/Initialization.h" #include "blockforest/communication/UniformDirectScheme.h" #include "blockforest/communication/UniformBufferedScheme.h" @@ -29,11 +29,11 @@ #include "core/Environment.h" #include "core/debug/TestSubsystem.h" -#include "cuda/FieldCopy.h" -#include "cuda/GPUField.h" -#include "cuda/AddGPUFieldToStorage.h" -#include "cuda/communication/GPUPackInfo.h" -#include "cuda/FieldIndexing.h" +#include "gpu/FieldCopy.h" +#include "gpu/GPUField.h" +#include 
"gpu/AddGPUFieldToStorage.h" +#include "gpu/communication/GPUPackInfo.h" +#include "gpu/FieldIndexing.h" #include "field/AddToStorage.h" #include "field/communication/UniformMPIDatatypeInfo.h" @@ -49,27 +49,28 @@ using namespace walberla; -typedef GhostLayerField<real_t, 1> ScalarField; -typedef cuda::GPUField<real_t> GPUField; +using ScalarField = GhostLayerField<real_t, 1>; +using GPUField = gpu::GPUField<real_t>; ScalarField * createField( IBlock* const block, StructuredBlockStorage* const storage ) { - return new ScalarField ( - storage->getNumberOfXCells( *block ), // number of cells in x direction per block - storage->getNumberOfYCells( *block ), // number of cells in y direction per block - storage->getNumberOfZCells( *block ), // number of cells in z direction per block - 1, // one ghost layer - double(0), // initial value - field::fzyx, // layout - make_shared<cuda::HostFieldAllocator<real_t> >() // allocator for host pinned memory - ); + auto xSize = storage->getNumberOfXCells( *block ); + auto ySize = storage->getNumberOfYCells( *block ); + auto zSize = storage->getNumberOfZCells( *block ); + auto numberOfGhostLayers = uint_c(1); + auto initialValue = real_c(0); + auto fieldLayout = field::fzyx; + return new ScalarField (xSize, ySize, zSize, + numberOfGhostLayers, initialValue, fieldLayout, + make_shared< gpu::HostFieldAllocator<real_t> >() // allocator for host pinned memory + ); } void testJacobi2D() { - uint_t xSize = 20; - uint_t ySize = 20; + uint_t const xSize = 20; + uint_t const ySize = 20; // Create blocks shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid ( @@ -80,8 +81,8 @@ void testJacobi2D() true, true, true ); // no periodicity - BlockDataID cpuFieldID = blocks->addStructuredBlockData<ScalarField>( &createField, "CPU Field" ); - BlockDataID gpuField = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" ); + BlockDataID const cpuFieldID = blocks->addStructuredBlockData<ScalarField>( 
&createField, "CPU Field" ); + BlockDataID const gpuField = gpu::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" ); // Initialize a quarter of the field with ones, the rest remains 0 // Jacobi averages the domain -> every cell should be at 0.25 at sufficiently many timesteps @@ -93,8 +94,8 @@ void testJacobi2D() f->get( x, y, 0 ) = real_c(1.0); } - typedef blockforest::communication::UniformBufferedScheme<stencil::D2Q9> CommScheme; - typedef cuda::communication::GPUPackInfo<GPUField> Packing; + using CommScheme = blockforest::communication::UniformBufferedScheme<stencil::D2Q9>; + using Packing = gpu::communication::GPUPackInfo<GPUField> ; CommScheme commScheme(blocks); commScheme.addDataToCommunicate( make_shared<Packing>(gpuField) ); @@ -107,10 +108,9 @@ void testJacobi2D() timeloop.add() << BeforeFunction( commScheme, "Communication" ) << Sweep( pystencils::CudaJacobiKernel2D(gpuField), "Jacobi Kernel" ); - - cuda::fieldCpy<GPUField, ScalarField>( blocks, gpuField, cpuFieldID ); + gpu::fieldCpy<GPUField, ScalarField>( blocks, gpuField, cpuFieldID ); timeloop.run(); - cuda::fieldCpy<ScalarField, GPUField>( blocks, cpuFieldID, gpuField ); + gpu::fieldCpy<ScalarField, GPUField>( blocks, cpuFieldID, gpuField ); auto firstBlock = blocks->begin(); auto f = firstBlock->getData<ScalarField>( cpuFieldID ); @@ -120,9 +120,9 @@ void testJacobi2D() void testJacobi3D() { - uint_t xSize = 12; - uint_t ySize = 12; - uint_t zSize = 12; + uint_t const xSize = 12; + uint_t const ySize = 12; + uint_t const zSize = 12; // Create blocks shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid ( @@ -133,8 +133,8 @@ void testJacobi3D() true, true, true ); // no periodicity - BlockDataID cpuFieldID = blocks->addStructuredBlockData<ScalarField>( &createField, "CPU Field" ); - BlockDataID gpuField = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" ); + BlockDataID const cpuFieldID = 
blocks->addStructuredBlockData<ScalarField>( &createField, "CPU Field" ); + BlockDataID const gpuField = gpu::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" ); // Initialize a quarter of the field with ones, the rest remains 0 // Jacobi averages the domain -> every cell should be at 0.25 at sufficiently many timesteps @@ -147,8 +147,8 @@ void testJacobi3D() f->get( x, y, z ) = real_c(1.0); } - typedef blockforest::communication::UniformBufferedScheme<stencil::D3Q7> CommScheme; - typedef cuda::communication::GPUPackInfo<GPUField> Packing; + using CommScheme = blockforest::communication::UniformBufferedScheme<stencil::D3Q7>; + using Packing = gpu::communication::GPUPackInfo<GPUField>; CommScheme commScheme(blocks); commScheme.addDataToCommunicate( make_shared<Packing>(gpuField) ); @@ -161,10 +161,9 @@ void testJacobi3D() timeloop.add() << BeforeFunction( commScheme, "Communication" ) << Sweep( pystencils::CudaJacobiKernel3D(gpuField), "Jacobi Kernel" ); - - cuda::fieldCpy<GPUField, ScalarField>( blocks, gpuField, cpuFieldID ); + gpu::fieldCpy<GPUField, ScalarField>( blocks, gpuField, cpuFieldID ); timeloop.run(); - cuda::fieldCpy<ScalarField, GPUField>( blocks, cpuFieldID, gpuField ); + gpu::fieldCpy<ScalarField, GPUField>( blocks, cpuFieldID, gpuField ); auto firstBlock = blocks->begin(); auto f = firstBlock->getData<ScalarField>( cpuFieldID ); @@ -173,7 +172,7 @@ void testJacobi3D() int main( int argc, char ** argv ) { - mpi::Environment env( argc, argv ); + mpi::Environment const env( argc, argv ); debug::enterTestMode(); testJacobi2D(); diff --git a/tests/cuda/codegen/CodegenPoissonGPU.cpp b/tests/gpu/codegen/CodegenPoissonGPU.cpp similarity index 83% rename from tests/cuda/codegen/CodegenPoissonGPU.cpp rename to tests/gpu/codegen/CodegenPoissonGPU.cpp index ef5ae96c00fef1a20c52dab51a1419539f2fd5a4..ece41a7346ca47c52625b6744790ce79110ecf61 100644 --- a/tests/cuda/codegen/CodegenPoissonGPU.cpp +++ b/tests/gpu/codegen/CodegenPoissonGPU.cpp 
@@ -28,11 +28,11 @@ #include "core/debug/TestSubsystem.h" #include "core/math/Constants.h" -#include "cuda/FieldCopy.h" -#include "cuda/GPUField.h" -#include "cuda/AddGPUFieldToStorage.h" -#include "cuda/communication/GPUPackInfo.h" -#include "cuda/FieldIndexing.h" +#include "gpu/FieldCopy.h" +#include "gpu/GPUField.h" +#include "gpu/AddGPUFieldToStorage.h" +#include "gpu/communication/GPUPackInfo.h" +#include "gpu/FieldIndexing.h" #include "field/AddToStorage.h" #include "field/communication/UniformMPIDatatypeInfo.h" @@ -47,8 +47,11 @@ using namespace walberla; -typedef GhostLayerField<real_t, 1> ScalarField_T; -typedef cuda::GPUField<real_t> GPUField; +using ScalarField_T = GhostLayerField<real_t, 1>; +using GPUField = gpu::GPUField<real_t>; + +using CommScheme = blockforest::communication::UniformBufferedScheme<stencil::D2Q9>; +using Packing = gpu::communication::GPUPackInfo<GPUField>; // U with Dirichlet Boundary @@ -104,17 +107,13 @@ void testPoisson() BlockDataID cpuFieldID = field::addToStorage< ScalarField_T >( blocks, "CPU Field src", real_c(0.0) ); - BlockDataID gpuField = cuda::addGPUFieldToStorage<ScalarField_T>( blocks, cpuFieldID, "GPU Field src" ); + BlockDataID gpuField = gpu::addGPUFieldToStorage<ScalarField_T>( blocks, cpuFieldID, "GPU Field src" ); initU( blocks, cpuFieldID ); BlockDataID cpufId = field::addToStorage< ScalarField_T >( blocks, "CPU Field f", real_c(0.0)); - BlockDataID gpufId = cuda::addGPUFieldToStorage<ScalarField_T>( blocks, cpufId, "GPU Field f" ); + BlockDataID gpufId = gpu::addGPUFieldToStorage<ScalarField_T>( blocks, cpufId, "GPU Field f" ); initF( blocks, cpufId ); - - typedef blockforest::communication::UniformBufferedScheme<stencil::D2Q9> CommScheme; - typedef cuda::communication::GPUPackInfo<GPUField> Packing; - CommScheme commScheme(blocks); commScheme.addDataToCommunicate( make_shared<Packing>(gpuField) ); @@ -126,11 +125,10 @@ void testPoisson() timeloop.add() << BeforeFunction( commScheme, "Communication" ) << 
Sweep( pystencils::PoissonGPU(gpufId, gpuField, dx, dy), "Jacobi Kernel" ); - - cuda::fieldCpy<GPUField, ScalarField_T>( blocks, gpuField, cpuFieldID ); - cuda::fieldCpy<GPUField, ScalarField_T>( blocks, gpufId, cpufId ); + gpu::fieldCpy<GPUField, ScalarField_T>( blocks, gpuField, cpuFieldID ); + gpu::fieldCpy<GPUField, ScalarField_T>( blocks, gpufId, cpufId ); timeloop.run(); - cuda::fieldCpy<ScalarField_T, GPUField>( blocks, cpuFieldID, gpuField ); + gpu::fieldCpy<ScalarField_T, GPUField>( blocks, cpuFieldID, gpuField ); auto firstBlock = blocks->begin(); auto f = firstBlock->getData<ScalarField_T>( cpuFieldID ); @@ -140,7 +138,7 @@ void testPoisson() int main( int argc, char ** argv ) { - mpi::Environment env( argc, argv ); + mpi::Environment const env( argc, argv ); debug::enterTestMode(); testPoisson(); diff --git a/tests/cuda/codegen/CudaJacobiKernel.py b/tests/gpu/codegen/CudaJacobiKernel.py similarity index 100% rename from tests/cuda/codegen/CudaJacobiKernel.py rename to tests/gpu/codegen/CudaJacobiKernel.py diff --git a/tests/cuda/codegen/CudaPoisson.py b/tests/gpu/codegen/CudaPoisson.py similarity index 100% rename from tests/cuda/codegen/CudaPoisson.py rename to tests/gpu/codegen/CudaPoisson.py diff --git a/tests/cuda/codegen/GeneratedFieldPackInfoTestGPU.cpp b/tests/gpu/codegen/GeneratedFieldPackInfoTestGPU.cpp similarity index 86% rename from tests/cuda/codegen/GeneratedFieldPackInfoTestGPU.cpp rename to tests/gpu/codegen/GeneratedFieldPackInfoTestGPU.cpp index 47c7d6741819e8f71cb2f348e1d21c28a90f1cfc..4360b66e97cc65176b79bc215f3b73f099f2160d 100644 --- a/tests/cuda/codegen/GeneratedFieldPackInfoTestGPU.cpp +++ b/tests/gpu/codegen/GeneratedFieldPackInfoTestGPU.cpp @@ -28,8 +28,8 @@ #include "core/debug/TestSubsystem.h" #include "core/Environment.h" -#include "cuda/FieldCopy.h" -#include "cuda/communication/UniformGPUScheme.h" +#include "gpu/FieldCopy.h" +#include "gpu/communication/UniformGPUScheme.h" #include "stencil/D3Q27.h" @@ -42,9 +42,9 @@ 
namespace walberla { using Stencil_T = stencil::D3Q27; -cuda::GPUField<int> * createGPUField( IBlock* const block, StructuredBlockStorage* const storage ) { +gpu::GPUField<int> * createGPUField( IBlock* const block, StructuredBlockStorage* const storage ) { - return new cuda::GPUField<int> ( + return new gpu::GPUField<int> ( storage->getNumberOfXCells( *block ), // number of cells in x direction storage->getNumberOfYCells( *block ), // number of cells in y direction storage->getNumberOfZCells( *block ), // number of cells in z direction @@ -54,19 +54,18 @@ cuda::GPUField<int> * createGPUField( IBlock* const block, StructuredBlockStorag } -cuda::GPUField<int> * createSmallGPUField( IBlock * const , StructuredBlockStorage * const ) { - return new cuda::GPUField<int> (2, 2, 2, 1, 1, field::fzyx ); +gpu::GPUField<int> * createSmallGPUField( IBlock * const , StructuredBlockStorage * const ) { + return new gpu::GPUField<int> (2, 2, 2, 1, 1, field::fzyx ); } void testScalarField( std::shared_ptr<blockforest::StructuredBlockForest> & sbf, BlockDataID gpuFieldId ) { - - cuda::communication::UniformGPUScheme< Stencil_T > us{ sbf }; + gpu::communication::UniformGPUScheme< Stencil_T > us{ sbf }; us.addPackInfo(std::make_shared< pystencils::ScalarFieldCommunicationGPU >(gpuFieldId)); for( auto & block : *sbf ) { - auto & gpuField = *(block.getData< cuda::GPUField< int > >(gpuFieldId)); + auto & gpuField = *(block.getData< gpu::GPUField< int > >(gpuFieldId)); field::GhostLayerField< int, 1 > cpuField(gpuField.xSize(), gpuField.ySize(), gpuField.zSize(), 1, 0, field::fzyx); @@ -82,12 +81,12 @@ void testScalarField( std::shared_ptr<blockforest::StructuredBlockForest> & sbf, cpuField(1, 0, 0) = 3; cpuField(1, 1, 0) = 4; - cuda::fieldCpy(gpuField, cpuField); + gpu::fieldCpy(gpuField, cpuField); // communicate us.communicate(); - cuda::fieldCpy(cpuField, gpuField); + gpu::fieldCpy(cpuField, gpuField); WALBERLA_CHECK_EQUAL(cpuField(0, 0, +2), 1) WALBERLA_CHECK_EQUAL(cpuField(0, 1, 
+2), 2) @@ -98,16 +97,15 @@ void testScalarField( std::shared_ptr<blockforest::StructuredBlockForest> & sbf, } void testScalarFieldPullReduction( std::shared_ptr<blockforest::StructuredBlockForest> & sbf, BlockDataID gpuFieldId ) { - - cuda::communication::UniformGPUScheme< Stencil_T > us1{ sbf }; + gpu::communication::UniformGPUScheme< Stencil_T > us1{ sbf }; us1.addPackInfo(std::make_shared< pystencils::ScalarFieldPullReductionGPU >(gpuFieldId)); - cuda::communication::UniformGPUScheme< Stencil_T > us2{ sbf }; + gpu::communication::UniformGPUScheme< Stencil_T > us2{ sbf }; us2.addPackInfo(std::make_shared< pystencils::ScalarFieldCommunicationGPU >(gpuFieldId)); for( auto & block : *sbf ) { - auto& gpuField = *(block.getData< cuda::GPUField< int > >(gpuFieldId)); + auto& gpuField = *(block.getData< gpu::GPUField< int > >(gpuFieldId)); field::GhostLayerField< int, 1 > cpuField(gpuField.xSize(), gpuField.ySize(), gpuField.zSize(), 1, 0, field::fzyx); @@ -129,12 +127,12 @@ void testScalarFieldPullReduction( std::shared_ptr<blockforest::StructuredBlockF cpuField(1, 0, 1) = 1; cpuField(1, 1, 1) = 1; - cuda::fieldCpy(gpuField, cpuField); + gpu::fieldCpy(gpuField, cpuField); // communicate pull += reduction us1.communicate(); - cuda::fieldCpy(cpuField, gpuField); + gpu::fieldCpy(cpuField, gpuField); // check values in top ghost layer WALBERLA_CHECK_EQUAL(cpuField(0, 0, 2), 0) @@ -151,7 +149,7 @@ void testScalarFieldPullReduction( std::shared_ptr<blockforest::StructuredBlockF // communicate to sync ghost layers us2.communicate(); - cuda::fieldCpy(cpuField, gpuField); + gpu::fieldCpy(cpuField, gpuField); // check values in bottom ghost layer WALBERLA_CHECK_EQUAL(cpuField(0, 0, -1), 2) @@ -184,7 +182,7 @@ int main(int argc, char **argv) { true,true,true); //periodicity // Create a Field with the same number of cells as the block - BlockDataID scalarGPUFieldId = blocks->addStructuredBlockData<cuda::GPUField<int> > ( &createGPUField, "ScalarGPUField" ); + BlockDataID 
scalarGPUFieldId = blocks->addStructuredBlockData< gpu::GPUField<int> > ( &createGPUField, "ScalarGPUField" ); testScalarField( blocks, scalarGPUFieldId ); @@ -196,7 +194,7 @@ int main(int argc, char **argv) { true,true,true);//periodicity // Create a Field with one quarter as many cells per dimension, i.e. a field with the same size as the one above - scalarGPUFieldId = blocks->addStructuredBlockData<cuda::GPUField<int> > ( &createSmallGPUField, "ScalarGPUField" ); + scalarGPUFieldId = blocks->addStructuredBlockData< gpu::GPUField<int> > ( &createSmallGPUField, "ScalarGPUField" ); testScalarField( blocks, scalarGPUFieldId ); diff --git a/tests/cuda/codegen/GeneratedFieldPackInfoTestGPU.py b/tests/gpu/codegen/GeneratedFieldPackInfoTestGPU.py similarity index 100% rename from tests/cuda/codegen/GeneratedFieldPackInfoTestGPU.py rename to tests/gpu/codegen/GeneratedFieldPackInfoTestGPU.py diff --git a/tests/cuda/codegen/MicroBenchmarkGpuLbm.cpp b/tests/gpu/codegen/MicroBenchmarkGpuLbm.cpp similarity index 83% rename from tests/cuda/codegen/MicroBenchmarkGpuLbm.cpp rename to tests/gpu/codegen/MicroBenchmarkGpuLbm.cpp index 8b03e0277f4ff892d1b948555a8c4af08db3bba0..43b5864062dee2be11bbcd3bc4cb7cfea349c56b 100644 --- a/tests/cuda/codegen/MicroBenchmarkGpuLbm.cpp +++ b/tests/gpu/codegen/MicroBenchmarkGpuLbm.cpp @@ -25,9 +25,9 @@ #include "field/Field.h" -#include "cuda/GPUField.h" -#include "cuda/FieldCopy.h" -#include "cuda/AddGPUFieldToStorage.h" +#include "gpu/GPUField.h" +#include "gpu/FieldCopy.h" +#include "gpu/AddGPUFieldToStorage.h" #include "MicroBenchmarkCopyKernel.h" #include "MicroBenchmarkStreamKernel.h" @@ -44,10 +44,10 @@ int main( int argc, char **argv ) shared_ptr<StructuredBlockForest> blocks = blockforest::createUniformBlockGrid(1u, 1u, 1u, 128u, 128u, 128u, 1.0, false, false, false, false); - BlockDataID srcID = cuda::addGPUFieldToStorage<cuda::GPUField<real_t> >(blocks, "src", 19, field::fzyx, 1); - BlockDataID dstID = 
cuda::addGPUFieldToStorage<cuda::GPUField<real_t> >(blocks, "dst", 19, field::fzyx, 1); + BlockDataID srcID = gpu::addGPUFieldToStorage< gpu::GPUField<real_t> >(blocks, "src", 19, field::fzyx, 1); + BlockDataID dstID = gpu::addGPUFieldToStorage< gpu::GPUField<real_t> >(blocks, "dst", 19, field::fzyx, 1); - int iterations = 3; + int const iterations = 3; pystencils::MicroBenchmarkCopyKernel copy(dstID, srcID); for( int i=0 ; i < iterations; ++i ) @@ -60,7 +60,7 @@ int main( int argc, char **argv ) for( auto &block: *blocks ) stream( &block ); - WALBERLA_CUDA_CHECK(cudaDeviceSynchronize()) + WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) return EXIT_SUCCESS; } diff --git a/tests/cuda/codegen/MicroBenchmarkGpuLbm.py b/tests/gpu/codegen/MicroBenchmarkGpuLbm.py similarity index 100% rename from tests/cuda/codegen/MicroBenchmarkGpuLbm.py rename to tests/gpu/codegen/MicroBenchmarkGpuLbm.py diff --git a/tests/gpu/communication/CommTest.cpp b/tests/gpu/communication/CommTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5bc87aa13f9a2c72351b532fb20e3614cdfc8d82 --- /dev/null +++ b/tests/gpu/communication/CommTest.cpp @@ -0,0 +1,250 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! 
\file +//! \author Paulo Carvalho <prcjunior@inf.ufpr.br> +// +//====================================================================================================================== + +#include "core/Environment.h" +#include "core/debug/TestSubsystem.h" +#include "core/mpi/Datatype.h" + +#include "field/Field.h" +#include "field/communication/MPIDatatypes.h" + +#include "gpu/FieldCopy.h" +#include "gpu/GPUField.h" + +#define NUM_ITER 100 +#define SIZE_X 16 +#define SIZE_Y 16 +#define SIZE_Z 16 +#define LAYOUT field::fzyx + +using namespace walberla; + +void hostToHost() +{ + Field< double, 1 > const hostField1(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + Field< double, 1 > hostField2(SIZE_X, SIZE_Y, SIZE_Z, 0, LAYOUT); + + double const startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) + { + hostField2.set(hostField1); + } + double const endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void hostToDevice() +{ + Field< double, 1 > const hostField(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + gpu::GPUField< double > deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); + + double const startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) + { + gpu::fieldCpy(deviceField, hostField); + } + double const endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void deviceToHost() +{ + Field< double, 1 > hostField(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + gpu::GPUField< double > deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); + gpu::fieldCpy(deviceField, hostField); + + double const startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) + { + gpu::fieldCpy(hostField, deviceField); + } + double const endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void mpiHostToHost() +{ + Field< double, 1 > hostField1(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + Field< double, 1 > hostField2(SIZE_X, SIZE_Y, SIZE_Z, 
0.0, LAYOUT); + + auto hostDatatype1 = mpi::Datatype(field::communication::mpiDatatype(hostField1)); + auto hostDatatype2 = mpi::Datatype(field::communication::mpiDatatype(hostField2)); + + double const startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) + { + MPI_Request request1; + MPI_Isend(hostField1.data(), 1, hostDatatype1, 0, 0, MPI_COMM_WORLD, &request1); + + MPI_Request request2; + MPI_Irecv(hostField2.data(), 1, hostDatatype2, 0, 0, MPI_COMM_WORLD, &request2); + + MPI_Wait(&request1, MPI_STATUS_IGNORE); + MPI_Wait(&request2, MPI_STATUS_IGNORE); + } + double const endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void mpiHostToDevice() +{ + Field< double, 1 > hostField(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + gpu::GPUField< double > deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); + + auto hostDatatype = mpi::Datatype(field::communication::mpiDatatype(hostField)); + auto deviceDatatype = mpi::Datatype(field::communication::mpiDatatype(deviceField)); + + double const startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) + { + MPI_Request request1; + MPI_Isend(hostField.data(), 1, hostDatatype, 0, 0, MPI_COMM_WORLD, &request1); + + MPI_Request request2; + MPI_Irecv(deviceField.data(), 1, deviceDatatype, 0, 0, MPI_COMM_WORLD, &request2); + + MPI_Wait(&request1, MPI_STATUS_IGNORE); + MPI_Wait(&request2, MPI_STATUS_IGNORE); + } + double const endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void mpiDeviceToHost() +{ + Field< double, 1 > hostField(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + gpu::GPUField< double > deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); + + auto hostDatatype = mpi::Datatype(field::communication::mpiDatatype(hostField)); + auto deviceDatatype = mpi::Datatype(field::communication::mpiDatatype(deviceField)); + + double const startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) + { + MPI_Request request1; 
+ MPI_Isend(deviceField.data(), 1, deviceDatatype, 0, 0, MPI_COMM_WORLD, &request1); + + MPI_Request request2; + MPI_Irecv(hostField.data(), 1, hostDatatype, 0, 0, MPI_COMM_WORLD, &request2); + + MPI_Wait(&request1, MPI_STATUS_IGNORE); + MPI_Wait(&request2, MPI_STATUS_IGNORE); + } + double const endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void mpiDeviceToDevice() +{ + gpu::GPUField< double > deviceField1(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); + gpu::GPUField< double > deviceField2(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); + + auto deviceDatatype1 = mpi::Datatype(field::communication::mpiDatatype(deviceField1)); + auto deviceDatatype2 = mpi::Datatype(field::communication::mpiDatatype(deviceField2)); + + double const startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) + { + MPI_Request request1; + MPI_Isend(deviceField1.data(), 1, deviceDatatype1, 0, 0, MPI_COMM_WORLD, &request1); + + MPI_Request request2; + MPI_Irecv(deviceField2.data(), 1, deviceDatatype2, 0, 0, MPI_COMM_WORLD, &request2); + + MPI_Wait(&request1, MPI_STATUS_IGNORE); + MPI_Wait(&request2, MPI_STATUS_IGNORE); + } + double const endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void mpiCopyHostToDevice() +{ + Field< double, 1 > hostField1(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + Field< double, 1 > hostField2(SIZE_X, SIZE_Y, SIZE_Z, 0.0, LAYOUT); + gpu::GPUField< double > deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); + + auto hostDatatype1 = mpi::Datatype(field::communication::mpiDatatype(hostField1)); + auto hostDatatype2 = mpi::Datatype(field::communication::mpiDatatype(hostField2)); + + double const startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) + { + MPI_Request request1; + MPI_Isend(hostField1.data(), 1, hostDatatype1, 0, 0, MPI_COMM_WORLD, &request1); + + MPI_Request request2; + MPI_Irecv(hostField2.data(), 1, hostDatatype2, 0, 0, MPI_COMM_WORLD, &request2); + 
+ MPI_Wait(&request1, MPI_STATUS_IGNORE); + MPI_Wait(&request2, MPI_STATUS_IGNORE); + + gpu::fieldCpy(deviceField, hostField2); + } + double const endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void mpiCopyDeviceToHost() +{ + Field< double, 1 > hostField1(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + Field< double, 1 > hostField2(SIZE_X, SIZE_Y, SIZE_Z, 0.0, LAYOUT); + gpu::GPUField< double > const deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); + + auto hostDatatype1 = mpi::Datatype(field::communication::mpiDatatype(hostField1)); + auto hostDatatype2 = mpi::Datatype(field::communication::mpiDatatype(hostField2)); + + double const startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) + { + MPI_Request request2; + MPI_Irecv(hostField2.data(), 1, hostDatatype2, 0, 0, MPI_COMM_WORLD, &request2); + + gpu::fieldCpy(hostField1, deviceField); + + MPI_Request request1; + MPI_Isend(hostField1.data(), 1, hostDatatype1, 0, 0, MPI_COMM_WORLD, &request1); + + MPI_Wait(&request1, MPI_STATUS_IGNORE); + MPI_Wait(&request2, MPI_STATUS_IGNORE); + } + double const endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +int main(int argc, char** argv) +{ + debug::enterTestMode(); + walberla::Environment const walberlaEnv(argc, argv); + + WALBERLA_CHECK_EQUAL(MPIManager::instance()->numProcesses(), 2) + + hostToHost(); + hostToDevice(); + deviceToHost(); + mpiHostToHost(); + mpiHostToDevice(); + mpiDeviceToHost(); + mpiDeviceToDevice(); + mpiCopyHostToDevice(); + mpiCopyDeviceToHost(); + + return EXIT_SUCCESS; +} diff --git a/tests/cuda/communication/GPUBlockSelectorCommunicationTest.cpp b/tests/gpu/communication/GPUBlockSelectorCommunicationTest.cpp similarity index 87% rename from tests/cuda/communication/GPUBlockSelectorCommunicationTest.cpp rename to tests/gpu/communication/GPUBlockSelectorCommunicationTest.cpp index 
d4f5f140551ffaf9a995ab0dfe257fb89a19a188..d70ecf5a35e5b0773ae99f0f7a6a520f8c01b9bd 100644 --- a/tests/cuda/communication/GPUBlockSelectorCommunicationTest.cpp +++ b/tests/gpu/communication/GPUBlockSelectorCommunicationTest.cpp @@ -24,31 +24,31 @@ #include <blockforest/Initialization.h> #include <blockforest/SetupBlockForest.h> #include <core/DataTypes.h> +#include <core/Environment.h> #include <core/debug/TestSubsystem.h> #include <core/math/Random.h> -#include <core/Environment.h> -#include <cuda/AddGPUFieldToStorage.h> -#include <cuda/ErrorChecking.h> -#include <cuda/FieldCopy.h> -#include <cuda/GPUField.h> -#include <cuda/communication/MemcpyPackInfo.h> -#include <cuda/communication/UniformGPUScheme.h> -#include <cuda_runtime.h> #include <domain_decomposition/BlockDataID.h> #include <field/AddToStorage.h> #include <field/GhostLayerField.h> +#include <gpu/AddGPUFieldToStorage.h> +#include <gpu/FieldCopy.h> +#include <gpu/GPUField.h> +#include <gpu/communication/MemcpyPackInfo.h> +#include <gpu/communication/UniformGPUScheme.h> #include <stencil/D3Q27.h> #include <stencil/Directions.h> #include <stencil/Iterator.h> #include <vector> +#include "gpu/GPUWrapper.h" + namespace walberla { using Type_T = int; using Stencil_T = stencil::D3Q27; using ScalarField_T = field::GhostLayerField< Type_T, 1 >; -using GPUScalarField_T = cuda::GPUField< Type_T >; +using GPUScalarField_T = gpu::GPUField< Type_T >; const Set< SUID > requiredBlockSelector("communication"); const Set< SUID > incompatibleBlockSelector("no communication"); @@ -103,7 +103,7 @@ std::shared_ptr< StructuredBlockForest > createSelectorBlockGrid ( sforest.addWorkloadMemorySUIDAssignmentFunction(suidAssignmentFunction); - AABB domainAABB{ real_c(0), real_c(0), real_c(0), + AABB const domainAABB{ real_c(0), real_c(0), real_c(0), dx * real_c( numberOfXBlocks * numberOfXCellsPerBlock ), dx * real_c( numberOfYBlocks * numberOfYCellsPerBlock ), dx * real_c( numberOfZBlocks * numberOfZCellsPerBlock ) }; @@ -113,7 
+113,7 @@ std::shared_ptr< StructuredBlockForest > createSelectorBlockGrid ( const memory_t memoryLimit = numeric_cast< memory_t >(sforest.getNumberOfBlocks()); - blockforest::GlobalLoadBalancing::MetisConfiguration< SetupBlock > metisConfig( + blockforest::GlobalLoadBalancing::MetisConfiguration< SetupBlock > const metisConfig( true, false, std::bind(blockforest::cellWeightedCommunicationCost, std::placeholders::_1, std::placeholders::_2, numberOfXCellsPerBlock, numberOfYCellsPerBlock, numberOfZCellsPerBlock)); @@ -138,7 +138,7 @@ std::shared_ptr< StructuredBlockForest > createSelectorBlockGrid ( int main(int argc, char** argv) { debug::enterTestMode(); - walberla::Environment walberlaEnv(argc, argv); + walberla::Environment const walberlaEnv(argc, argv); const Vector3<uint_t> nBlocks { 3, 1, 1 }; const Vector3<uint_t> cells { 2, 2, 1 }; @@ -150,20 +150,20 @@ int main(int argc, char** argv) auto blocks = createSelectorBlockGrid(nBlocks[0], nBlocks[1], nBlocks[2], cells[0], cells[1], cells[2], 1, false, true, true, true); - BlockDataID fieldID = field::addToStorage< ScalarField_T >(blocks, "scalar", Type_T(0), field::fzyx, uint_t(1)); + BlockDataID const fieldID = field::addToStorage< ScalarField_T >(blocks, "scalar", Type_T(0), field::fzyx, uint_t(1)); initScalarField(blocks, fieldID); - BlockDataID gpuFieldID = cuda::addGPUFieldToStorage< ScalarField_T >(blocks, fieldID, "GPU scalar"); + BlockDataID const gpuFieldID = gpu::addGPUFieldToStorage< ScalarField_T >(blocks, fieldID, "GPU scalar"); // Setup communication schemes for GPUPackInfo - cuda::communication::UniformGPUScheme< Stencil_T > communication(blocks, requiredBlockSelector, incompatibleBlockSelector); - communication.addPackInfo(std::make_shared< cuda::communication::MemcpyPackInfo< GPUScalarField_T > >(gpuFieldID)); + gpu::communication::UniformGPUScheme< Stencil_T > communication(blocks, requiredBlockSelector, incompatibleBlockSelector); + communication.addPackInfo(std::make_shared< 
gpu::communication::MemcpyPackInfo< GPUScalarField_T > >(gpuFieldID)); // Perform one communication step communication(); // Copy to CPU - cuda::fieldCpy< ScalarField_T, GPUScalarField_T >( blocks, fieldID, gpuFieldID ); + gpu::fieldCpy< ScalarField_T, GPUScalarField_T >( blocks, fieldID, gpuFieldID ); // Check for correct data in ghost layers of middle block auto middleBlock = blocks->getBlock( domainSize[0] / real_c(2), domainSize[1] / real_c(2), domainSize[2] / real_c(2) ); diff --git a/tests/gpu/communication/GPUPackInfoCommunicationTest.cpp b/tests/gpu/communication/GPUPackInfoCommunicationTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..66a4d3c74da29b7779783132a3d8f3cce5a08287 --- /dev/null +++ b/tests/gpu/communication/GPUPackInfoCommunicationTest.cpp @@ -0,0 +1,166 @@ +//======================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file GPUFieldPackInfoTest.cpp +//! \ingroup cuda +//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br> +//! \brief Short communication test to verify the equivalence of GPUPackInfo using a default stream and multiple +//! streams. 
+// +//======================================================================================================================== + +#include "blockforest/Initialization.h" +#include "blockforest/communication/UniformBufferedScheme.h" + +#include "core/DataTypes.h" +#include "core/debug/TestSubsystem.h" +#include "core/math/Random.h" +#include "core/mpi/Environment.h" + +#include "domain_decomposition/BlockDataID.h" + +#include "field/GhostLayerField.h" + +#include "stencil/D3Q27.h" +#include "stencil/Directions.h" +#include "stencil/Iterator.h" + +#include <cuda_runtime.h> +#include <vector> + +#include "gpu/ErrorChecking.h" +#include "gpu/FieldCopy.h" +#include "gpu/GPUField.h" +#include "gpu/HostFieldAllocator.h" +#include "gpu/communication/GPUPackInfo.h" + +using namespace walberla; + +using DataType = walberla::uint_t; +using StencilType = stencil::D3Q27; +using FieldType = field::GhostLayerField< DataType, StencilType::Size >; +using GPUFieldType = gpu::GPUField< DataType >; +using CommSchemeType = blockforest::communication::UniformBufferedScheme< StencilType >; +using GPUPackInfoType = gpu::communication::GPUPackInfo< GPUFieldType >; + +static std::vector< gpu::Layout > fieldLayouts = { gpu::fzyx, gpu::zyxf }; +static uint_t fieldLayoutIndex = 0; + +FieldType* createField(IBlock* const block, StructuredBlockStorage* const storage) +{ + return new FieldType(storage->getNumberOfXCells(*block), // number of cells in x direction per block + storage->getNumberOfYCells(*block), // number of cells in y direction per block + storage->getNumberOfZCells(*block), // number of cells in z direction per block + 1, // one ghost layer + DataType(0), // initial value + fieldLayouts[fieldLayoutIndex], // layout + make_shared< gpu::HostFieldAllocator< DataType > >() // allocator for host pinned memory + ); +} + +GPUFieldType* createGPUField(IBlock* const block, StructuredBlockStorage* const storage) +{ + return new GPUFieldType(storage->getNumberOfXCells(*block), // number of 
cells in x direction + storage->getNumberOfYCells(*block), // number of cells in y direction + storage->getNumberOfZCells(*block), // number of cells in z direction + StencilType::Size, // number of cells for pdfs + 1, // one ghost layer + fieldLayouts[fieldLayoutIndex]); +} + +void initFields(const shared_ptr< StructuredBlockStorage >& blocks, const BlockDataID& fieldID) +{ + for (auto block = blocks->begin(); block != blocks->end(); ++block) + { + auto fieldPtr = block->getData< FieldType >(fieldID); + + for (auto fieldIt = fieldPtr->begin(); fieldIt != fieldPtr->end(); ++fieldIt) + *fieldIt = math::intRandom< DataType >(); + } +} + +int main(int argc, char** argv) +{ + debug::enterTestMode(); + mpi::Environment mpiEnv(argc, argv); + + const Vector3< uint_t > cells = Vector3< uint_t >(4, 4, 4); + + uint_t nProc = uint_c(MPIManager::instance()->numProcesses()); + + for (; fieldLayoutIndex < fieldLayouts.size(); ++fieldLayoutIndex) + { + auto blocks = blockforest::createUniformBlockGrid(nProc, 1, 1, // blocks + cells[0], cells[1], cells[2], // cells + 1, // unit cell spacing + true, // one block per process + true, true, true); // periodic in all directions + + BlockDataID sourceFieldId = blocks->addStructuredBlockData< FieldType >(&createField, "ScalarField"); + + BlockDataID syncGPUFieldId = blocks->addStructuredBlockData< GPUFieldType >(&createGPUField, "syncGPUField"); + + BlockDataID asyncGPUFieldId = blocks->addStructuredBlockData< GPUFieldType >(&createGPUField, "asyncGPUField"); + + math::seedRandomGenerator(numeric_cast< std::mt19937::result_type >(MPIManager::instance()->rank())); + // Initialize CPU field with random values + initFields(blocks, sourceFieldId); + + // Copy same CPU field to both GPU fields + for (auto block = blocks->begin(); block != blocks->end(); ++block) + { + auto sourceFieldPtr = block->getData< FieldType >(sourceFieldId); + + auto syncGPUFieldPtr = block->getData< GPUFieldType >(syncGPUFieldId); + gpu::fieldCpy(*syncGPUFieldPtr, 
*sourceFieldPtr); + + auto asyncGPUFieldPtr = block->getData< GPUFieldType >(asyncGPUFieldId); + gpu::fieldCpy(*asyncGPUFieldPtr, *sourceFieldPtr); + } + + // Setup communication schemes for synchronous GPUPackInfo + CommSchemeType syncCommScheme(blocks); + syncCommScheme.addPackInfo(make_shared< GPUPackInfoType >(syncGPUFieldId)); + + // Setup communication scheme for asynchronous GPUPackInfo, which uses CUDA streams + CommSchemeType asyncCommScheme(blocks); + asyncCommScheme.addPackInfo(make_shared< GPUPackInfoType >(asyncGPUFieldId)); + + // Perform one communication step for each scheme + syncCommScheme(); + asyncCommScheme(); + + // Check results + FieldType syncFieldCpu(cells[0], cells[1], cells[2], 1, fieldLayouts[fieldLayoutIndex], + make_shared< gpu::HostFieldAllocator< DataType > >()); + FieldType asyncFieldCpu(cells[0], cells[1], cells[2], 1, fieldLayouts[fieldLayoutIndex], + make_shared< gpu::HostFieldAllocator< DataType > >()); + + for (auto block = blocks->begin(); block != blocks->end(); ++block) + { + auto syncGPUFieldPtr = block->getData< GPUFieldType >(syncGPUFieldId); + gpu::fieldCpy(syncFieldCpu, *syncGPUFieldPtr); + + auto asyncGPUFieldPtr = block->getData< GPUFieldType >(asyncGPUFieldId); + gpu::fieldCpy(asyncFieldCpu, *asyncGPUFieldPtr); + + for (auto syncIt = syncFieldCpu.beginWithGhostLayerXYZ(), asyncIt = asyncFieldCpu.beginWithGhostLayerXYZ(); + syncIt != syncFieldCpu.end(); ++syncIt, ++asyncIt) + WALBERLA_CHECK_EQUAL(*syncIt, *asyncIt) + } + } + + return EXIT_SUCCESS; +} diff --git a/tests/gpu/communication/GPUPackInfoTest.cpp b/tests/gpu/communication/GPUPackInfoTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fec15a605a230c59f96abc3e31e8160992000338 --- /dev/null +++ b/tests/gpu/communication/GPUPackInfoTest.cpp @@ -0,0 +1,177 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file GPUFieldPackInfoTest.cpp +//! \ingroup cuda +//! \author Paulo Carvalho <prcjunior@inf.ufpr.br> +//! \brief Tests if a GPUField is correctly packed into buffers +// +//====================================================================================================================== + +#include "gpu/communication/GPUPackInfo.h" + +#include "blockforest/Initialization.h" + +#include "core/debug/TestSubsystem.h" +#include "core/mpi/MPIManager.h" + +#include "field/GhostLayerField.h" + +#include "stencil/D3Q27.h" + +#include <cstring> +#include <vector> + +#include "gpu/FieldCopy.h" +#include "gpu/GPUField.h" + +#define F_SIZE 19 + +using namespace walberla; + +static std::vector< field::Layout > fieldLayouts = { field::fzyx, field::zyxf }; +static uint_t fieldLayoutIndex = 0; + +gpu::GPUField< int >* createGPUField(IBlock* const block, StructuredBlockStorage* const storage) +{ + return new gpu::GPUField< int >(storage->getNumberOfXCells(*block), // number of cells in x direction + storage->getNumberOfYCells(*block), // number of cells in y direction + storage->getNumberOfZCells(*block), // number of cells in z direction + F_SIZE, // fSize + 1, // number of ghost layers + fieldLayouts[fieldLayoutIndex]); +} + +// Tester base class. 
The communicate() template method allows testing different communication methods. +class GPUPackInfoTester +{ + public: + using GPUPackInfoType = gpu::communication::GPUPackInfo< gpu::GPUField< int > >; + + GPUPackInfoTester(IBlock* block, BlockDataID fieldId) : block_(block), fieldId_(fieldId) {} + + virtual ~GPUPackInfoTester() = default; + + void test(stencil::Direction dir) + { + gpu::GPUField< int >& gpuField = *(block_->getData< gpu::GPUField< int > >(fieldId_)); + + field::GhostLayerField< int, F_SIZE > cpuField(gpuField.xSize(), // number of cells in x direction + gpuField.ySize(), // number of cells in y direction + gpuField.zSize(), // number of cells in z direction + 1, // number of ghost layers + 0, // initial value + fieldLayouts[fieldLayoutIndex]); + cpuField.setWithGhostLayer(0); + + int val = 0; + for (auto it = cpuField.beginSliceBeforeGhostLayer(dir); it != cpuField.end(); ++it) + { + *it = ++val; + } + gpu::fieldCpy(gpuField, cpuField); + + GPUPackInfoType gpuPackInfo(fieldId_); + + communicate(gpuPackInfo, dir); + gpu::fieldCpy(cpuField, gpuField); + + val = 0; + for (auto it = cpuField.beginGhostLayerOnly(stencil::inverseDir[dir]); it != cpuField.end(); ++it) + { + WALBERLA_CHECK_EQUAL(*it, ++val) + } + } + + protected: + virtual void communicate(GPUPackInfoType& gpuPackInfo, stencil::Direction dir) = 0; + + IBlock* block_; + BlockDataID fieldId_; +}; + +// Tester for buffer communication +class GPUPackInfoBufferTester : public GPUPackInfoTester +{ + public: + GPUPackInfoBufferTester(IBlock* block, BlockDataID fieldId) : GPUPackInfoTester(block, fieldId) {} + + protected: + void communicate(GPUPackInfoType& gpuPackInfo, stencil::Direction dir) override + { + mpi::GenericSendBuffer<> sendBuf; + sendBuf.addDebugMarker("Be"); + gpuPackInfo.packData(block_, dir, sendBuf); + sendBuf.addDebugMarker("Af"); + + // Manually copy over the send to the receive buffer + mpi::GenericRecvBuffer<> recvBuf; + recvBuf.resize(sendBuf.size()); + 
memcpy(recvBuf.ptr(), sendBuf.ptr(), sendBuf.size() * sizeof(mpi::GenericSendBuffer<>::ElementType)); + + recvBuf.readDebugMarker("Be"); + gpuPackInfo.unpackData(block_, stencil::inverseDir[dir], recvBuf); + recvBuf.readDebugMarker("Af"); + } +}; + +// Tester for local communication +class GPUPackInfoLocalTester : public GPUPackInfoTester +{ + public: + GPUPackInfoLocalTester(IBlock* block, BlockDataID fieldId) : GPUPackInfoTester(block, fieldId) {} + + protected: + void communicate(GPUPackInfoType& gpuPackInfo, stencil::Direction dir) override + { + gpuPackInfo.communicateLocal(block_, block_, dir); + } +}; + +int main(int argc, char** argv) +{ + using blockforest::createUniformBlockGrid; + + debug::enterTestMode(); + MPIManager::instance()->initializeMPI(&argc, &argv); + + for (; fieldLayoutIndex < fieldLayouts.size(); ++fieldLayoutIndex) + { + // Create BlockForest + uint_t const processes = uint_c(MPIManager::instance()->numProcesses()); + auto blocks = createUniformBlockGrid(processes, 1, 1, // blocks + 2, 2, 2, // cells + 1, // dx + false, // one block per process + true, true, true); // periodicity + + BlockDataID const scalarGPUFieldId = + blocks->addStructuredBlockData< gpu::GPUField< int > >(&createGPUField, "ScalarGPUField"); + + for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt) + { + GPUPackInfoBufferTester bufferTester(&(*blockIt), scalarGPUFieldId); + GPUPackInfoLocalTester localTester(&(*blockIt), scalarGPUFieldId); + + for (auto dir = stencil::D3Q27::beginNoCenter(); dir != stencil::D3Q27::end(); ++dir) + { + localTester.test(*dir); + bufferTester.test(*dir); + } + } + } + + return EXIT_SUCCESS; +} diff --git a/tests/lbm/codegen/GeneratedOutflowBC.cpp b/tests/lbm/codegen/GeneratedOutflowBC.cpp index 453a1e7cc58a23c106f565ec28a7b411a23c2370..3226802ae74d94843d9ed9199c5205acbb0602c2 100644 --- a/tests/lbm/codegen/GeneratedOutflowBC.cpp +++ b/tests/lbm/codegen/GeneratedOutflowBC.cpp @@ -70,12 +70,12 @@ Vector3< real_t > 
ShearProfile::operator()( const Cell& pos, const shared_ptr< S { Cell globalCell; CellInterval domain = SbF->getDomainCellBB(); - real_t h_y = real_c(domain.yMax()) - real_c(domain.yMin()); + real_t const h_y = real_c(domain.yMax()) - real_c(domain.yMin()); SbF->transformBlockLocalToGlobalCell(globalCell, block, pos); - real_t u = inflow_velocity_ * (real_c(globalCell[1]) / h_y); + real_t const u = inflow_velocity_ * (real_c(globalCell[1]) / h_y); - Vector3< real_t > result(u, 0.0, 0.0); + Vector3< real_t > const result(u, 0.0, 0.0); return result; } @@ -96,15 +96,12 @@ int main(int argc, char** argv) const real_t u_max = parameters.getParameter< real_t >("u_max", real_c(0.05)); const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(10)); - const double remainingTimeLoggerFrequency = - parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds - // create fields BlockDataID pdfFieldID = blocks->addStructuredBlockData< PdfField_T >(pdfFieldAdder, "PDFs"); BlockDataID velFieldID = field::addToStorage< VelocityField_T >(blocks, "velocity", real_c(0.0), field::fzyx); - BlockDataID densityFieldID = field::addToStorage< ScalarField_T >(blocks, "density", real_c(0.0), field::fzyx); + BlockDataID const densityFieldID = field::addToStorage< ScalarField_T >(blocks, "density", real_c(0.0), field::fzyx); - BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); + BlockDataID const flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); pystencils::GeneratedOutflowBC_MacroSetter setterSweep(pdfFieldID, velFieldID); for (auto& block : *blocks) @@ -115,7 +112,7 @@ int main(int argc, char** argv) auto boundariesConfig = walberlaEnv.config()->getOneBlock("Boundaries"); - ShearProfile velocityCallback{u_max}; + ShearProfile const velocityCallback{u_max}; std::function< Vector3< real_t >(const Cell&, const shared_ptr< StructuredBlockForest >&, IBlock&) > 
velocity_initialisation = velocityCallback; @@ -148,12 +145,8 @@ int main(int argc, char** argv) timeloop.add() << Sweep(outflow, "outflow boundary"); timeloop.add() << Sweep(UpdateSweep, "LB stream & collide"); - // log remaining time - timeloop.addFuncAfterTimeStep(timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency), - "remaining time logger"); - // VTK Writer - uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + uint_t const vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); if (vtkWriteFrequency > 0) { auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "GeneratedOutflowBC_VTK", vtkWriteFrequency, 0, false, @@ -170,7 +163,7 @@ int main(int argc, char** argv) timeloop.run(); CellInterval domain = blocks->getDomainCellBB(); - real_t h_y = real_c(domain.yMax()) - real_c(domain.yMin()); + real_t const h_y = real_c(domain.yMax()) - real_c(domain.yMin()); for (auto& block : *blocks) { auto velField = block.getData<VelocityField_T>(velFieldID); diff --git a/tests/lbm/codegen/LbCodeGenerationExample.cpp b/tests/lbm/codegen/LbCodeGenerationExample.cpp index 4711fb1b91a024bde929ed19b44fca2dd9e018c0..99087e897c3b9db25ac7524cf95cd6e12a666ec1 100644 --- a/tests/lbm/codegen/LbCodeGenerationExample.cpp +++ b/tests/lbm/codegen/LbCodeGenerationExample.cpp @@ -68,8 +68,8 @@ int main(int argc, char** argv) parameters.getParameter< Vector3< real_t > >("initialVelocity", Vector3< real_t >()); const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(10)); - const double remainingTimeLoggerFrequency = - parameters.getParameter< double >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds + const real_t remainingTimeLoggerFrequency = + parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds // create fields BlockDataID forceFieldId = field::addToStorage< VectorField_T >(blocks, "Force", real_c(0.0), field::fzyx); 
diff --git a/tests/timeloop/MultipleSweepFailTest.cpp b/tests/timeloop/MultipleSweepFailTest.cpp index 61a8eb4c40e689788d3107b50b4e4453b1054374..ee9147a2294be689b0d07c8e07fae43f4400cc62 100644 --- a/tests/timeloop/MultipleSweepFailTest.cpp +++ b/tests/timeloop/MultipleSweepFailTest.cpp @@ -41,7 +41,7 @@ namespace MultipleSweepFailTest int main(int argc, char** argv) { debug::enterTestMode(); - mpi::Environment env(argc, argv); + mpi::Environment const env(argc, argv); const std::shared_ptr< StructuredBlockForest > blockForest = blockforest::createUniformBlockGrid( uint_c(1), uint_c(1), uint_c(1), uint_c(1), uint_c(1), uint_c(1), real_c(1), false, false, false, false); diff --git a/tests/timeloop/MultipleSweepTest.cpp b/tests/timeloop/MultipleSweepTest.cpp index 3ad8248ffd047781787fada4720eddfa42033f81..4f6138efdc2b6397426d81978de564be51fb09a7 100644 --- a/tests/timeloop/MultipleSweepTest.cpp +++ b/tests/timeloop/MultipleSweepTest.cpp @@ -39,7 +39,7 @@ namespace MultipleSweepTest int main(int argc, char** argv) { debug::enterTestMode(); - mpi::Environment env(argc, argv); + mpi::Environment const env(argc, argv); const std::shared_ptr< StructuredBlockForest > blockForest = blockforest::createUniformBlockGrid( uint_c(1), uint_c(1), uint_c(1), uint_c(1), uint_c(1), uint_c(1), real_c(1), false, false, false, false);