From 95f81f6e91eec4defc539f450bdfec4fb91136df Mon Sep 17 00:00:00 2001
From: Michael Kuron <m.kuron@gmx.de>
Date: Tue, 16 Mar 2021 19:56:20 +0100
Subject: [PATCH] Drop QPX vectorization

---
 src/lbm/srt/SplitPureSweep.impl.h           |    9 -
 src/lbm/srt/bluegeneq/SplitPureSweep.impl.h | 1847 ----------------
 src/lbm/trt/SplitPureSweep.impl.h           |    9 -
 src/lbm/trt/bluegeneq/SplitPureSweep.impl.h | 2078 -------------------
 src/simd/QPX.h                              |  203 --
 src/simd/SIMD.h                             |   17 -
 tests/simd/CMakeLists.txt                   |    6 -
 tests/simd/SIMD_Equivalence.cpp             |   21 -
 8 files changed, 4190 deletions(-)
 delete mode 100644 src/lbm/srt/bluegeneq/SplitPureSweep.impl.h
 delete mode 100644 src/lbm/trt/bluegeneq/SplitPureSweep.impl.h
 delete mode 100644 src/simd/QPX.h

diff --git a/src/lbm/srt/SplitPureSweep.impl.h b/src/lbm/srt/SplitPureSweep.impl.h
index e1cfe424b..1bc0d9165 100644
--- a/src/lbm/srt/SplitPureSweep.impl.h
+++ b/src/lbm/srt/SplitPureSweep.impl.h
@@ -21,8 +21,6 @@
 
 
 
-#ifndef __IBMCPP__
-
 #pragma once
 
 #include "lbm/lattice_model/LatticeModelBase.h"
@@ -1449,10 +1447,3 @@ void SplitPureSweep< LatticeModel_T, typename std::enable_if< std::is_same< type
 
 } // namespace lbm
 } // namespace walberla
-
-
-#else // #ifdef __IBMCPP__
-
-#include "lbm/srt/bluegeneq/SplitPureSweep.impl.h"
-
-#endif
diff --git a/src/lbm/srt/bluegeneq/SplitPureSweep.impl.h b/src/lbm/srt/bluegeneq/SplitPureSweep.impl.h
deleted file mode 100644
index 972fb7424..000000000
--- a/src/lbm/srt/bluegeneq/SplitPureSweep.impl.h
+++ /dev/null
@@ -1,1847 +0,0 @@
-//======================================================================================================================
-//
-//  This file is part of waLBerla. waLBerla is free software: you can 
-//  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
-//  the License, or (at your option) any later version.
-//  
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
-//  for more details.
-//  
-//  You should have received a copy of the GNU General Public License along
-//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
-//
-//! \file SplitPureSweep.impl.h
-//! \ingroup lbm
-//! \author Florian Schornbaum <florian.schornbaum@fau.de>
-//
-//======================================================================================================================
-
-
-
-#ifdef __IBMCPP__
-
-#pragma once
-
-#include "lbm/lattice_model/LatticeModelBase.h"
-#include "lbm/lattice_model/D3Q19.h"
-#include "lbm/sweeps/Streaming.h"
-#include "lbm/sweeps/SweepBase.h"
-
-#include <type_traits>
-
-#include <builtins.h>
-
-// OpenMP
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-
-
-namespace walberla {
-namespace lbm {
-
-
-///////////////////////////////////////////////////////
-// Available SRT implementations:                    //
-//                                                   //
-// There are no generic (D*Q*) versions!             //
-//                                                   //
-// Optimized D3Q19 implementation:                   //
-//                     incompressible | compressible //
-//          no forces:       x               x       //
-///////////////////////////////////////////////////////
-
-
-///////////////////////////////
-// Specialization for D3Q19: //
-// - incompressible          //
-// - no additional forces    //
-///////////////////////////////
-
-template< typename LatticeModel_T >
-class SplitPureSweep< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag, collision_model::SRT_tag >::value &&
-                                                               LatticeModel_T::CollisionModel::constant &&
-                                                               std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value &&
-                                                               ! LatticeModel_T::compressible &&
-                                                               std::is_same< typename LatticeModel_T::ForceModel::tag, force_model::None_tag >::value
-                                                               >::type > :
-   public SweepBase< LatticeModel_T >
-{
-public:
-
-   static_assert( (std::is_same< typename LatticeModel_T::CollisionModel::tag, collision_model::SRT_tag >::value), "Only works with SRT!" );
-   static_assert( (std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value),                       "Only works with D3Q19!" );
-   static_assert( LatticeModel_T::compressible == false,                                                             "Only works with incompressible models!" );
-   static_assert( (std::is_same< typename LatticeModel_T::ForceModel::tag, force_model::None_tag >::value),        "Only works without additional forces!" );
-   static_assert( LatticeModel_T::equilibriumAccuracyOrder == 2, "Only works for lattice models that require the equilibrium distribution to be order 2 accurate!" );
-
-   typedef typename SweepBase<LatticeModel_T>::PdfField_T  PdfField_T;
-   typedef typename LatticeModel_T::Stencil                Stencil;
-
-   // block has NO dst pdf field
-   SplitPureSweep( const BlockDataID & pdfField ) :
-      SweepBase<LatticeModel_T>( pdfField ) {}
-
-   // every block has a dedicated dst pdf field
-   SplitPureSweep( const BlockDataID & src, const BlockDataID & dst ) :
-      SweepBase<LatticeModel_T>( src, dst ) {}
-
-   void operator()( IBlock * const block );
-
-   void stream ( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) );
-   void collide( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) );
-};
-
-template< typename LatticeModel_T >
-void SplitPureSweep< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag, collision_model::SRT_tag >::value &&
-                                                              LatticeModel_T::CollisionModel::constant &&
-                                                              std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value &&
-                                                              ! LatticeModel_T::compressible >::value &&
-                                                              std::is_same< typename LatticeModel_T::ForceModel::tag, force_model::None_tag>::value
-                                                              >::type
-   >::operator()( IBlock * const block )
-{
-   PdfField_T * src( NULL );
-   PdfField_T * dst( NULL );
-
-   this->getFields( block, src, dst );
-
-   WALBERLA_ASSERT_NOT_NULLPTR( src );
-   WALBERLA_ASSERT_NOT_NULLPTR( dst );
-
-   WALBERLA_ASSERT_GREATER_EQUAL( src->nrOfGhostLayers(), 1 );
-
-   // constants used during stream/collide
-
-   const real_t omega = src->latticeModel().collisionModel().omega();
-
-   const real_t omega_trm( real_t(1) - omega );
-   const real_t  omega_w0( real_t(3) * ( real_t(1) / real_t( 3) ) * omega );
-   const real_t  omega_w1( real_t(3) * ( real_t(1) / real_t(18) ) * omega );
-   const real_t  omega_w2( real_t(3) * ( real_t(1) / real_t(36) ) * omega );
-   const real_t one_third( real_t(1) / real_t(3) );
-
-   // loop constants
-
-   const cell_idx_t xSize = cell_idx_c( src->xSize() );
-   const cell_idx_t ySize = cell_idx_c( src->ySize() );
-   const cell_idx_t zSize = cell_idx_c( src->zSize() );
-
-#ifdef _OPENMP
-   #pragma omp parallel
-   {
-#endif
-   // temporaries, calculated by the first innermost loop
-
-   real_t* velX = new real_t[ uint_c(xSize) ];
-   real_t* velY = new real_t[ uint_c(xSize) ];
-   real_t* velZ = new real_t[ uint_c(xSize) ];
-
-   real_t* dir_indep_trm = new real_t[ uint_c(xSize) ];
-
-   if( src->layout() == field::fzyx && dst->layout() == field::fzyx )
-   {
-      #ifdef _OPENMP
-      const int izSize = int_c( zSize );
-      #pragma omp for schedule(static)
-      for( int iz = 0; iz < izSize; ++iz ) {
-         cell_idx_t z = cell_idx_c( iz );
-      #else
-      for( cell_idx_t z = 0; z < zSize; ++z ) {
-      #endif
-         for( cell_idx_t y = 0; y != ySize; ++y )
-         {
-            using namespace stencil;
-
-            real_t* pNE = &src->get(-1, y-1, z  , Stencil::idx[NE]);
-            real_t* pN  = &src->get(0 , y-1, z  , Stencil::idx[N]);
-            real_t* pNW = &src->get(+1, y-1, z  , Stencil::idx[NW]);
-            real_t* pW  = &src->get(+1, y  , z  , Stencil::idx[W]);
-            real_t* pSW = &src->get(+1, y+1, z  , Stencil::idx[SW]);
-            real_t* pS  = &src->get(0 , y+1, z  , Stencil::idx[S]);
-            real_t* pSE = &src->get(-1, y+1, z  , Stencil::idx[SE]);
-            real_t* pE  = &src->get(-1, y  , z  , Stencil::idx[E]);
-            real_t* pT  = &src->get(0 , y  , z-1, Stencil::idx[T]);
-            real_t* pTE = &src->get(-1, y  , z-1, Stencil::idx[TE]);
-            real_t* pTN = &src->get(0 , y-1, z-1, Stencil::idx[TN]);
-            real_t* pTW = &src->get(+1, y  , z-1, Stencil::idx[TW]);
-            real_t* pTS = &src->get(0 , y+1, z-1, Stencil::idx[TS]);
-            real_t* pB  = &src->get(0 , y  , z+1, Stencil::idx[B]);
-            real_t* pBE = &src->get(-1, y  , z+1, Stencil::idx[BE]);
-            real_t* pBN = &src->get(0 , y-1, z+1, Stencil::idx[BN]);
-            real_t* pBW = &src->get(+1, y  , z+1, Stencil::idx[BW]);
-            real_t* pBS = &src->get(0 , y+1, z+1, Stencil::idx[BS]);
-            real_t* pC  = &src->get(0 , y  , z  , Stencil::idx[C]);
-
-            real_t* dC = &dst->get(0,y,z,Stencil::idx[C]);
-
-            __alignx( 32, velX );
-            __alignx( 32, velY );
-            __alignx( 32, velZ );
-            __alignx( 32, dir_indep_trm  );
-
-            __alignx( 32, pNE );
-            __alignx( 32, pN  );
-            __alignx( 32, pNW );
-            __alignx( 32, pW  );
-            __alignx( 32, pSW );
-            __alignx( 32, pS  );
-            __alignx( 32, pSE );
-            __alignx( 32, pE  );
-            __alignx( 32, pT  );
-            __alignx( 32, pTE );
-            __alignx( 32, pTN );
-            __alignx( 32, pTW );
-            __alignx( 32, pTS );
-            __alignx( 32, pB  );
-            __alignx( 32, pBE );
-            __alignx( 32, pBN );
-            __alignx( 32, pBW );
-            __alignx( 32, pBS );
-            __alignx( 32, pC  );
-
-            __alignx( 32, dC  );
-
-            #pragma disjoint( *velX, *velY, *velZ, *dir_indep_trm, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              omega_trm, omega_w0, omega_w1, omega_w2, one_third )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velX_trm = pE[x] + pNE[x] + pSE[x] + pTE[x] + pBE[x];
-               const real_t velY_trm = pN[x] + pNW[x] + pTN[x] + pBN[x];
-               const real_t velZ_trm = pT[x] + pTS[x] + pTW[x];
-
-               const real_t rho = pC[x] + pS[x] + pW[x] + pB[x] + pSW[x] + pBS[x] + pBW[x] + velX_trm + velY_trm + velZ_trm;
-
-               velX[x] = velX_trm - pW[x]  - pNW[x] - pSW[x] - pTW[x] - pBW[x];
-               velY[x] = velY_trm + pNE[x] - pS[x]  - pSW[x] - pSE[x] - pTS[x] - pBS[x];
-               velZ[x] = velZ_trm + pTN[x] + pTE[x] - pB[x]  - pBN[x] - pBS[x] - pBW[x] - pBE[x];
-
-               dir_indep_trm[x] = one_third * rho - real_c(0.5) * ( velX[x] * velX[x] + velY[x] * velY[x] + velZ[x] * velZ[x] );
-
-               dC[x] = omega_trm * pC[x] + omega_w0 * dir_indep_trm[x];
-            }
-
-            real_t* dNW = &dst->get(0,y,z,Stencil::idx[NW]);
-            real_t* dSE = &dst->get(0,y,z,Stencil::idx[SE]);
-
-            __alignx( 32, dNW );
-            __alignx( 32, dSE );
-
-            #pragma disjoint( *dNW, *dSE, \
-                              *velX, *velY, *velZ, *dir_indep_trm, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              omega_trm, omega_w0, omega_w1, omega_w2, one_third )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] - velY[x];
-               const real_t vel_trm_NW_SE = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dNW[x] = omega_trm * pNW[x] + omega_w2 * ( vel_trm_NW_SE - vel );
-               dSE[x] = omega_trm * pSE[x] + omega_w2 * ( vel_trm_NW_SE + vel );
-            }
-
-            real_t* dNE = &dst->get(0,y,z,Stencil::idx[NE]);
-            real_t* dSW = &dst->get(0,y,z,Stencil::idx[SW]);
-
-            __alignx( 32, dNE );
-            __alignx( 32, dSW );
-
-            #pragma disjoint( *dNE, *dSW, *dNW, *dSE, \
-                              *velX, *velY, *velZ, *dir_indep_trm, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              omega_trm, omega_w0, omega_w1, omega_w2, one_third )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] + velY[x];
-               const real_t vel_trm_NE_SW = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dNE[x] = omega_trm * pNE[x] + omega_w2 * ( vel_trm_NE_SW + vel );
-               dSW[x] = omega_trm * pSW[x] + omega_w2 * ( vel_trm_NE_SW - vel );
-            }
-
-            real_t* dTW = &dst->get(0,y,z,Stencil::idx[TW]);
-            real_t* dBE = &dst->get(0,y,z,Stencil::idx[BE]);
-
-            __alignx( 32, dTW );
-            __alignx( 32, dBE );
-
-            #pragma disjoint( *dTW, *dBE, *dNE, *dSW, *dNW, *dSE, \
-                              *velX, *velY, *velZ, *dir_indep_trm, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              omega_trm, omega_w0, omega_w1, omega_w2, one_third )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] - velZ[x];
-               const real_t vel_trm_TW_BE = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dTW[x] = omega_trm * pTW[x] + omega_w2 * ( vel_trm_TW_BE - vel );
-               dBE[x] = omega_trm * pBE[x] + omega_w2 * ( vel_trm_TW_BE + vel );
-            }
-
-            real_t* dTE = &dst->get(0,y,z,Stencil::idx[TE]);
-            real_t* dBW = &dst->get(0,y,z,Stencil::idx[BW]);
-
-            __alignx( 32, dTE );
-            __alignx( 32, dBW );
-
-            #pragma disjoint( *dTE, *dBW, *dTW, *dBE, *dNE, *dSW, *dNW, *dSE, \
-                              *velX, *velY, *velZ, *dir_indep_trm, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              omega_trm, omega_w0, omega_w1, omega_w2, one_third )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] + velZ[x];
-               const real_t vel_trm_TE_BW = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dTE[x] = omega_trm * pTE[x] + omega_w2 * ( vel_trm_TE_BW + vel );
-               dBW[x] = omega_trm * pBW[x] + omega_w2 * ( vel_trm_TE_BW - vel );
-            }
-
-            real_t* dTS = &dst->get(0,y,z,Stencil::idx[TS]);
-            real_t* dBN = &dst->get(0,y,z,Stencil::idx[BN]);
-
-            __alignx( 32, dTS );
-            __alignx( 32, dBN );
-
-            #pragma disjoint( *dTS, *dBN, *dTE, *dBW, *dTW, *dBE, *dNE, *dSW, *dNW, *dSE, \
-                              *velX, *velY, *velZ, *dir_indep_trm, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              omega_trm, omega_w0, omega_w1, omega_w2, one_third )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velY[x] - velZ[x];
-               const real_t vel_trm_TS_BN = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dTS[x] = omega_trm * pTS[x] + omega_w2 * ( vel_trm_TS_BN - vel );
-               dBN[x] = omega_trm * pBN[x] + omega_w2 * ( vel_trm_TS_BN + vel );
-            }
-
-            real_t* dTN = &dst->get(0,y,z,Stencil::idx[TN]);
-            real_t* dBS = &dst->get(0,y,z,Stencil::idx[BS]);
-
-            __alignx( 32, dTN );
-            __alignx( 32, dBS );
-
-            #pragma disjoint( *dTN, *dBS, *dTS, *dBN, *dTE, *dBW, *dTW, *dBE, *dNE, *dSW, *dNW, *dSE, \
-                              *velX, *velY, *velZ, *dir_indep_trm, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              omega_trm, omega_w0, omega_w1, omega_w2, one_third )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velY[x] + velZ[x];
-               const real_t vel_trm_TN_BS = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dTN[x] = omega_trm * pTN[x] + omega_w2 * ( vel_trm_TN_BS + vel );
-               dBS[x] = omega_trm * pBS[x] + omega_w2 * ( vel_trm_TN_BS - vel );
-            }
-
-            real_t* dN = &dst->get(0,y,z,Stencil::idx[N]);
-            real_t* dS = &dst->get(0,y,z,Stencil::idx[S]);
-
-            __alignx( 32, dN );
-            __alignx( 32, dS );
-
-            #pragma disjoint( *dN, *dS, *dTN, *dBS, *dTS, *dBN, *dTE, *dBW, *dTW, *dBE, *dNE, *dSW, *dNW, *dSE, \
-                              *velX, *velY, *velZ, *dir_indep_trm, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              omega_trm, omega_w0, omega_w1, omega_w2, one_third )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_N_S = dir_indep_trm[x] + real_c(1.5) * velY[x] * velY[x];
-
-               dN[x] = omega_trm * pN[x] + omega_w1 * ( vel_trm_N_S + velY[x] );
-               dS[x] = omega_trm * pS[x] + omega_w1 * ( vel_trm_N_S - velY[x] );
-            }
-
-            real_t* dE = &dst->get(0,y,z,Stencil::idx[E]);
-            real_t* dW = &dst->get(0,y,z,Stencil::idx[W]);
-
-            __alignx( 32, dE );
-            __alignx( 32, dW );
-
-            #pragma disjoint( *dE, *dW, *dN, *dS, *dTN, *dBS, *dTS, *dBN, *dTE, *dBW, *dTW, *dBE, *dNE, *dSW, *dNW, *dSE, \
-                              *velX, *velY, *velZ, *dir_indep_trm, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              omega_trm, omega_w0, omega_w1, omega_w2, one_third )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_E_W = dir_indep_trm[x] + real_c(1.5) * velX[x] * velX[x];
-
-               dE[x] = omega_trm * pE[x] + omega_w1 * ( vel_trm_E_W + velX[x] );
-               dW[x] = omega_trm * pW[x] + omega_w1 * ( vel_trm_E_W - velX[x] );
-            }
-
-            real_t* dT = &dst->get(0,y,z,Stencil::idx[T]);
-            real_t* dB = &dst->get(0,y,z,Stencil::idx[B]);
-
-            __alignx( 32, dT );
-            __alignx( 32, dB );
-
-            #pragma disjoint( *dT, *dB, *dE, *dW, *dN, *dS, *dTN, *dBS, *dTS, *dBN, *dTE, *dBW, *dTW, *dBE, *dNE, *dSW, *dNW, *dSE, \
-                              *velX, *velY, *velZ, *dir_indep_trm, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              omega_trm, omega_w0, omega_w1, omega_w2, one_third )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_T_B = dir_indep_trm[x] + real_c(1.5) * velZ[x] * velZ[x];
-
-               dT[x] = omega_trm * pT[x] + omega_w1 * ( vel_trm_T_B + velZ[x] );
-               dB[x] = omega_trm * pB[x] + omega_w1 * ( vel_trm_T_B - velZ[x] );
-            }
-         }
-      }
-   }
-   else // ==> src->layout() == field::zyxf || dst->layout() == field::zyxf
-   {
-      #ifdef _OPENMP
-      const int izSize = int_c( zSize );
-      #pragma omp for schedule(static)
-      for( int iz = 0; iz < izSize; ++iz ) {
-         cell_idx_t z = cell_idx_c( iz );
-      #else
-      for( cell_idx_t z = 0; z < zSize; ++z ) {
-      #endif
-         for( cell_idx_t y = 0; y != ySize; ++y )
-         {
-            using namespace stencil;
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_NE = src->get(x-1, y-1, z  , Stencil::idx[NE]);
-               const real_t dd_tmp_N  = src->get(x  , y-1, z  , Stencil::idx[N]);
-               const real_t dd_tmp_NW = src->get(x+1, y-1, z  , Stencil::idx[NW]);
-               const real_t dd_tmp_W  = src->get(x+1, y  , z  , Stencil::idx[W]);
-               const real_t dd_tmp_SW = src->get(x+1, y+1, z  , Stencil::idx[SW]);
-               const real_t dd_tmp_S  = src->get(x  , y+1, z  , Stencil::idx[S]);
-               const real_t dd_tmp_SE = src->get(x-1, y+1, z  , Stencil::idx[SE]);
-               const real_t dd_tmp_E  = src->get(x-1, y  , z  , Stencil::idx[E]);
-               const real_t dd_tmp_T  = src->get(x  , y  , z-1, Stencil::idx[T]);
-               const real_t dd_tmp_TE = src->get(x-1, y  , z-1, Stencil::idx[TE]);
-               const real_t dd_tmp_TN = src->get(x  , y-1, z-1, Stencil::idx[TN]);
-               const real_t dd_tmp_TW = src->get(x+1, y  , z-1, Stencil::idx[TW]);
-               const real_t dd_tmp_TS = src->get(x  , y+1, z-1, Stencil::idx[TS]);
-               const real_t dd_tmp_B  = src->get(x  , y  , z+1, Stencil::idx[B]);
-               const real_t dd_tmp_BE = src->get(x-1, y  , z+1, Stencil::idx[BE]);
-               const real_t dd_tmp_BN = src->get(x  , y-1, z+1, Stencil::idx[BN]);
-               const real_t dd_tmp_BW = src->get(x+1, y  , z+1, Stencil::idx[BW]);
-               const real_t dd_tmp_BS = src->get(x  , y+1, z+1, Stencil::idx[BS]);
-               const real_t dd_tmp_C  = src->get(x  , y  , z  , Stencil::idx[C]);
-
-               const real_t velX_trm = dd_tmp_E + dd_tmp_NE + dd_tmp_SE + dd_tmp_TE + dd_tmp_BE;
-               const real_t velY_trm = dd_tmp_N + dd_tmp_NW + dd_tmp_TN + dd_tmp_BN;
-               const real_t velZ_trm = dd_tmp_T + dd_tmp_TS + dd_tmp_TW;
-
-               const real_t rho = dd_tmp_C + dd_tmp_S + dd_tmp_W + dd_tmp_B + dd_tmp_SW + dd_tmp_BS + dd_tmp_BW + velX_trm + velY_trm + velZ_trm;
-
-               velX[x] = velX_trm - dd_tmp_W  - dd_tmp_NW - dd_tmp_SW - dd_tmp_TW - dd_tmp_BW;
-               velY[x] = velY_trm + dd_tmp_NE - dd_tmp_S  - dd_tmp_SW - dd_tmp_SE - dd_tmp_TS - dd_tmp_BS;
-               velZ[x] = velZ_trm + dd_tmp_TN + dd_tmp_TE - dd_tmp_B  - dd_tmp_BN - dd_tmp_BS - dd_tmp_BW - dd_tmp_BE;
-
-               dir_indep_trm[x] = one_third * rho - real_c(0.5) * ( velX[x] * velX[x] + velY[x] * velY[x] + velZ[x] * velZ[x] );
-
-               dst->get(x,y,z,Stencil::idx[C]) = omega_trm * dd_tmp_C + omega_w0 * dir_indep_trm[x];
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] - velY[x];
-               const real_t vel_trm_NW_SE = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dst->get(x,y,z,Stencil::idx[NW]) = omega_trm * src->get(x+1, y-1, z, Stencil::idx[NW]) + omega_w2 * ( vel_trm_NW_SE - vel );
-               dst->get(x,y,z,Stencil::idx[SE]) = omega_trm * src->get(x-1, y+1, z, Stencil::idx[SE]) + omega_w2 * ( vel_trm_NW_SE + vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] + velY[x];
-               const real_t vel_trm_NE_SW = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dst->get(x,y,z,Stencil::idx[NE]) = omega_trm * src->get(x-1, y-1, z, Stencil::idx[NE]) + omega_w2 * ( vel_trm_NE_SW + vel );
-               dst->get(x,y,z,Stencil::idx[SW]) = omega_trm * src->get(x+1, y+1, z, Stencil::idx[SW]) + omega_w2 * ( vel_trm_NE_SW - vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] - velZ[x];
-               const real_t vel_trm_TW_BE = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dst->get(x,y,z,Stencil::idx[TW]) = omega_trm * src->get(x+1, y, z-1, Stencil::idx[TW]) + omega_w2 * ( vel_trm_TW_BE - vel );
-               dst->get(x,y,z,Stencil::idx[BE]) = omega_trm * src->get(x-1, y, z+1, Stencil::idx[BE]) + omega_w2 * ( vel_trm_TW_BE + vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] + velZ[x];
-               const real_t vel_trm_TE_BW = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dst->get(x,y,z,Stencil::idx[TE]) = omega_trm * src->get(x-1, y, z-1, Stencil::idx[TE]) + omega_w2 * ( vel_trm_TE_BW + vel );
-               dst->get(x,y,z,Stencil::idx[BW]) = omega_trm * src->get(x+1, y, z+1, Stencil::idx[BW]) + omega_w2 * ( vel_trm_TE_BW - vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velY[x] - velZ[x];
-               const real_t vel_trm_TS_BN = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dst->get(x,y,z,Stencil::idx[TS]) = omega_trm * src->get(x, y+1, z-1, Stencil::idx[TS]) + omega_w2 * ( vel_trm_TS_BN - vel );
-               dst->get(x,y,z,Stencil::idx[BN]) = omega_trm * src->get(x, y-1, z+1, Stencil::idx[BN]) + omega_w2 * ( vel_trm_TS_BN + vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velY[x] + velZ[x];
-               const real_t vel_trm_TN_BS = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dst->get(x,y,z,Stencil::idx[TN]) = omega_trm * src->get(x, y-1, z-1, Stencil::idx[TN]) + omega_w2 * ( vel_trm_TN_BS + vel );
-               dst->get(x,y,z,Stencil::idx[BS]) = omega_trm * src->get(x, y+1, z+1, Stencil::idx[BS]) + omega_w2 * ( vel_trm_TN_BS - vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_N_S = dir_indep_trm[x] + real_c(1.5) * velY[x] * velY[x];
-
-               dst->get(x,y,z,Stencil::idx[N]) = omega_trm * src->get(x, y-1, z, Stencil::idx[N]) + omega_w1 * ( vel_trm_N_S + velY[x] );
-               dst->get(x,y,z,Stencil::idx[S]) = omega_trm * src->get(x, y+1, z, Stencil::idx[S]) + omega_w1 * ( vel_trm_N_S - velY[x] );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_E_W = dir_indep_trm[x] + real_c(1.5) * velX[x] * velX[x];
-
-               dst->get(x,y,z,Stencil::idx[E]) = omega_trm * src->get(x-1, y, z, Stencil::idx[E]) + omega_w1 * ( vel_trm_E_W + velX[x] );
-               dst->get(x,y,z,Stencil::idx[W]) = omega_trm * src->get(x+1, y, z, Stencil::idx[W]) + omega_w1 * ( vel_trm_E_W - velX[x] );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_T_B = dir_indep_trm[x] + real_c(1.5) * velZ[x] * velZ[x];
-
-               dst->get(x,y,z,Stencil::idx[T]) = omega_trm * src->get(x, y, z-1, Stencil::idx[T]) + omega_w1 * ( vel_trm_T_B + velZ[x] );
-               dst->get(x,y,z,Stencil::idx[B]) = omega_trm * src->get(x, y, z+1, Stencil::idx[B]) + omega_w1 * ( vel_trm_T_B - velZ[x] );
-            }
-         }
-      }
-   }
-
-   delete[] velX;
-   delete[] velY;
-   delete[] velZ;
-   delete[] dir_indep_trm;
-
-#ifdef _OPENMP
-   }
-#endif
-
-   src->swapDataPointers( dst );
-}
-
-
-
-template< typename LatticeModel_T >
-void SplitPureSweep< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag, collision_model::SRT_tag >::value &&
-                                                              LatticeModel_T::CollisionModel::constant &&
-                                                              std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value &&
-                                                              ! LatticeModel_T::compressible >::value &&
-                                                              std::is_same< typename LatticeModel_T::ForceModel::tag, force_model::None_tag >::value
-                                                              >::type
-   >::stream( IBlock * const block, const uint_t numberOfGhostLayersToInclude )
-{
-   PdfField_T * src( NULL );
-   PdfField_T * dst( NULL );
-
-   this->getFields( block, src, dst );
-
-   StreamEverything< LatticeModel_T >::execute( src, dst, numberOfGhostLayersToInclude );
-}
-
-
-
-template< typename LatticeModel_T >
-void SplitPureSweep< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag, collision_model::SRT_tag >::value &&
-                                                              LatticeModel_T::CollisionModel::constant &&
-                                                              std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value &&
-                                                              ! LatticeModel_T::compressible >::value &&
-                                                              std::is_same< typename LatticeModel_T::ForceModel::tag, force_model::None_tag >::value
-                                                              >::type
-#ifdef NDEBUG
-   >::collide( IBlock * const block, const uint_t /*numberOfGhostLayersToInclude*/ )
-#else
-   >::collide( IBlock * const block, const uint_t numberOfGhostLayersToInclude )
-#endif
-{
-   WALBERLA_ASSERT_EQUAL( numberOfGhostLayersToInclude, uint_t(0) ); // the implementation right now doesn't support inclusion of ghost layers in collide step!
-
-   PdfField_T * src = this->getSrcField( block );
-
-   WALBERLA_ASSERT_NOT_NULLPTR( src );
-
-   WALBERLA_ASSERT_GREATER_EQUAL( src->nrOfGhostLayers(), numberOfGhostLayersToInclude );
-
-   // constants used during stream/collide
-
-   const real_t omega = src->latticeModel().collisionModel().omega();
-
-   const real_t omega_trm( real_t(1) - omega );
-   const real_t  omega_w0( real_t(3) * ( real_t(1) / real_t( 3) ) * omega );
-   const real_t  omega_w1( real_t(3) * ( real_t(1) / real_t(18) ) * omega );
-   const real_t  omega_w2( real_t(3) * ( real_t(1) / real_t(36) ) * omega );
-   const real_t one_third( real_t(1) / real_t(3) );
-
-   // loop constants
-
-   const cell_idx_t xSize = cell_idx_c( src->xSize() );
-   const cell_idx_t ySize = cell_idx_c( src->ySize() );
-   const cell_idx_t zSize = cell_idx_c( src->zSize() );
-
-#ifdef _OPENMP
-   #pragma omp parallel
-   {
-#endif
-   // temporaries, calculated by the first innermost loop
-
-   real_t* velX = new real_t[ uint_c(xSize) ];
-   real_t* velY = new real_t[ uint_c(xSize) ];
-   real_t* velZ = new real_t[ uint_c(xSize) ];
-
-   real_t* dir_indep_trm = new real_t[ uint_c(xSize) ];
-
-   if( src->layout() == field::fzyx )
-   {
-      #ifdef _OPENMP
-      const int izSize = int_c( zSize );
-      #pragma omp for schedule(static)
-      for( int iz = 0; iz < izSize; ++iz ) {
-         cell_idx_t z = cell_idx_c( iz );
-      #else
-      for( cell_idx_t z = 0; z < zSize; ++z ) {
-      #endif
-         for( cell_idx_t y = 0; y != ySize; ++y )
-         {
-            using namespace stencil;
-
-            real_t* pC  = &src->get( 0, y, z, Stencil::idx[C]);
-            real_t* pN  = &src->get( 0, y, z, Stencil::idx[N]);
-            real_t* pS  = &src->get( 0, y, z, Stencil::idx[S]);
-            real_t* pW  = &src->get( 0, y, z, Stencil::idx[W]);
-            real_t* pE  = &src->get( 0, y, z, Stencil::idx[E]);
-            real_t* pT  = &src->get( 0, y, z, Stencil::idx[T]);
-            real_t* pB  = &src->get( 0, y, z, Stencil::idx[B]);
-            real_t* pNW = &src->get( 0, y, z, Stencil::idx[NW]);
-            real_t* pNE = &src->get( 0, y, z, Stencil::idx[NE]);
-            real_t* pSW = &src->get( 0, y, z, Stencil::idx[SW]);
-            real_t* pSE = &src->get( 0, y, z, Stencil::idx[SE]);
-            real_t* pTN = &src->get( 0, y, z, Stencil::idx[TN]);
-            real_t* pTS = &src->get( 0, y, z, Stencil::idx[TS]);
-            real_t* pTW = &src->get( 0, y, z, Stencil::idx[TW]);
-            real_t* pTE = &src->get( 0, y, z, Stencil::idx[TE]);
-            real_t* pBN = &src->get( 0, y, z, Stencil::idx[BN]);
-            real_t* pBS = &src->get( 0, y, z, Stencil::idx[BS]);
-            real_t* pBW = &src->get( 0, y, z, Stencil::idx[BW]);
-            real_t* pBE = &src->get( 0, y, z, Stencil::idx[BE]);
-
-            __alignx( 32, velX );
-            __alignx( 32, velY );
-            __alignx( 32, velZ );
-            __alignx( 32, dir_indep_trm  );
-
-            __alignx( 32, pNE );
-            __alignx( 32, pN  );
-            __alignx( 32, pNW );
-            __alignx( 32, pW  );
-            __alignx( 32, pSW );
-            __alignx( 32, pS  );
-            __alignx( 32, pSE );
-            __alignx( 32, pE  );
-            __alignx( 32, pT  );
-            __alignx( 32, pTE );
-            __alignx( 32, pTN );
-            __alignx( 32, pTW );
-            __alignx( 32, pTS );
-            __alignx( 32, pB  );
-            __alignx( 32, pBE );
-            __alignx( 32, pBN );
-            __alignx( 32, pBW );
-            __alignx( 32, pBS );
-            __alignx( 32, pC  );
-
-            #pragma disjoint( *velX, *velY, *velZ, *dir_indep_trm, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, \
-                              omega_trm, omega_w0, omega_w1, omega_w2, one_third )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velX_trm = pE[x] + pNE[x] + pSE[x] + pTE[x] + pBE[x];
-               const real_t velY_trm = pN[x] + pNW[x] + pTN[x] + pBN[x];
-               const real_t velZ_trm = pT[x] + pTS[x] + pTW[x];
-
-               const real_t rho = pC[x] + pS[x] + pW[x] + pB[x] + pSW[x] + pBS[x] + pBW[x] + velX_trm + velY_trm + velZ_trm;
-
-               velX[x] = velX_trm - pW[x]  - pNW[x] - pSW[x] - pTW[x] - pBW[x];
-               velY[x] = velY_trm + pNE[x] - pS[x]  - pSW[x] - pSE[x] - pTS[x] - pBS[x];
-               velZ[x] = velZ_trm + pTN[x] + pTE[x] - pB[x]  - pBN[x] - pBS[x] - pBW[x] - pBE[x];
-
-               dir_indep_trm[x] = one_third * rho - real_c(0.5) * ( velX[x] * velX[x] + velY[x] * velY[x] + velZ[x] * velZ[x] );
-
-               pC[x] = omega_trm * pC[x] + omega_w0 * dir_indep_trm[x];
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] - velY[x];
-               const real_t vel_trm_NW_SE = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               pNW[x] = omega_trm * pNW[x] + omega_w2 * ( vel_trm_NW_SE - vel );
-               pSE[x] = omega_trm * pSE[x] + omega_w2 * ( vel_trm_NW_SE + vel );
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] + velY[x];
-               const real_t vel_trm_NE_SW = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               pNE[x] = omega_trm * pNE[x] + omega_w2 * ( vel_trm_NE_SW + vel );
-               pSW[x] = omega_trm * pSW[x] + omega_w2 * ( vel_trm_NE_SW - vel );
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] - velZ[x];
-               const real_t vel_trm_TW_BE = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               pTW[x] = omega_trm * pTW[x] + omega_w2 * ( vel_trm_TW_BE - vel );
-               pBE[x] = omega_trm * pBE[x] + omega_w2 * ( vel_trm_TW_BE + vel );
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] + velZ[x];
-               const real_t vel_trm_TE_BW = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               pTE[x] = omega_trm * pTE[x] + omega_w2 * ( vel_trm_TE_BW + vel );
-               pBW[x] = omega_trm * pBW[x] + omega_w2 * ( vel_trm_TE_BW - vel );
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velY[x] - velZ[x];
-               const real_t vel_trm_TS_BN = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               pTS[x] = omega_trm * pTS[x] + omega_w2 * ( vel_trm_TS_BN - vel );
-               pBN[x] = omega_trm * pBN[x] + omega_w2 * ( vel_trm_TS_BN + vel );
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velY[x] + velZ[x];
-               const real_t vel_trm_TN_BS = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               pTN[x] = omega_trm * pTN[x] + omega_w2 * ( vel_trm_TN_BS + vel );
-               pBS[x] = omega_trm * pBS[x] + omega_w2 * ( vel_trm_TN_BS - vel );
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_N_S = dir_indep_trm[x] + real_c(1.5) * velY[x] * velY[x];
-
-               pN[x] = omega_trm * pN[x] + omega_w1 * ( vel_trm_N_S + velY[x] );
-               pS[x] = omega_trm * pS[x] + omega_w1 * ( vel_trm_N_S - velY[x] );
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_E_W = dir_indep_trm[x] + real_c(1.5) * velX[x] * velX[x];
-
-               pE[x] = omega_trm * pE[x] + omega_w1 * ( vel_trm_E_W + velX[x] );
-               pW[x] = omega_trm * pW[x] + omega_w1 * ( vel_trm_E_W - velX[x] );
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_T_B = dir_indep_trm[x] + real_c(1.5) * velZ[x] * velZ[x];
-
-               pT[x] = omega_trm * pT[x] + omega_w1 * ( vel_trm_T_B + velZ[x] );
-               pB[x] = omega_trm * pB[x] + omega_w1 * ( vel_trm_T_B - velZ[x] );
-            }
-         }
-      }
-   }
-   else // ==> src->layout() == field::zyxf
-   {
-      #ifdef _OPENMP
-      const int izSize = int_c( zSize );
-      #pragma omp for schedule(static)
-      for( int iz = 0; iz < izSize; ++iz ) {
-         cell_idx_t z = cell_idx_c( iz );
-      #else
-      for( cell_idx_t z = 0; z < zSize; ++z ) {
-      #endif
-         for( cell_idx_t y = 0; y != ySize; ++y )
-         {
-            using namespace stencil;
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_C  = src->get( x, y, z, Stencil::idx[C]  );
-               const real_t dd_tmp_N  = src->get( x, y, z, Stencil::idx[N]  );
-               const real_t dd_tmp_S  = src->get( x, y, z, Stencil::idx[S]  );
-               const real_t dd_tmp_W  = src->get( x, y, z, Stencil::idx[W]  );
-               const real_t dd_tmp_E  = src->get( x, y, z, Stencil::idx[E]  );
-               const real_t dd_tmp_T  = src->get( x, y, z, Stencil::idx[T]  );
-               const real_t dd_tmp_B  = src->get( x, y, z, Stencil::idx[B]  );
-               const real_t dd_tmp_NW = src->get( x, y, z, Stencil::idx[NW] );
-               const real_t dd_tmp_NE = src->get( x, y, z, Stencil::idx[NE] );
-               const real_t dd_tmp_SW = src->get( x, y, z, Stencil::idx[SW] );
-               const real_t dd_tmp_SE = src->get( x, y, z, Stencil::idx[SE] );
-               const real_t dd_tmp_TN = src->get( x, y, z, Stencil::idx[TN] );
-               const real_t dd_tmp_TS = src->get( x, y, z, Stencil::idx[TS] );
-               const real_t dd_tmp_TW = src->get( x, y, z, Stencil::idx[TW] );
-               const real_t dd_tmp_TE = src->get( x, y, z, Stencil::idx[TE] );
-               const real_t dd_tmp_BN = src->get( x, y, z, Stencil::idx[BN] );
-               const real_t dd_tmp_BS = src->get( x, y, z, Stencil::idx[BS] );
-               const real_t dd_tmp_BW = src->get( x, y, z, Stencil::idx[BW] );
-               const real_t dd_tmp_BE = src->get( x, y, z, Stencil::idx[BE] );
-
-               const real_t velX_trm = dd_tmp_E + dd_tmp_NE + dd_tmp_SE + dd_tmp_TE + dd_tmp_BE;
-               const real_t velY_trm = dd_tmp_N + dd_tmp_NW + dd_tmp_TN + dd_tmp_BN;
-               const real_t velZ_trm = dd_tmp_T + dd_tmp_TS + dd_tmp_TW;
-
-               const real_t rho = dd_tmp_C + dd_tmp_S + dd_tmp_W + dd_tmp_B + dd_tmp_SW + dd_tmp_BS + dd_tmp_BW + velX_trm + velY_trm + velZ_trm;
-
-               velX[x] = velX_trm - dd_tmp_W  - dd_tmp_NW - dd_tmp_SW - dd_tmp_TW - dd_tmp_BW;
-               velY[x] = velY_trm + dd_tmp_NE - dd_tmp_S  - dd_tmp_SW - dd_tmp_SE - dd_tmp_TS - dd_tmp_BS;
-               velZ[x] = velZ_trm + dd_tmp_TN + dd_tmp_TE - dd_tmp_B  - dd_tmp_BN - dd_tmp_BS - dd_tmp_BW - dd_tmp_BE;
-
-               dir_indep_trm[x] = one_third * rho - real_c(0.5) * ( velX[x] * velX[x] + velY[x] * velY[x] + velZ[x] * velZ[x] );
-
-               src->get(x,y,z,Stencil::idx[C]) = omega_trm * dd_tmp_C + omega_w0 * dir_indep_trm[x];
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] - velY[x];
-               const real_t vel_trm_NW_SE = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               src->get(x,y,z,Stencil::idx[NW]) = omega_trm * src->get(x,y,z,Stencil::idx[NW]) + omega_w2 * ( vel_trm_NW_SE - vel );
-               src->get(x,y,z,Stencil::idx[SE]) = omega_trm * src->get(x,y,z,Stencil::idx[SE]) + omega_w2 * ( vel_trm_NW_SE + vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] + velY[x];
-               const real_t vel_trm_NE_SW = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               src->get(x,y,z,Stencil::idx[NE]) = omega_trm * src->get(x,y,z,Stencil::idx[NE]) + omega_w2 * ( vel_trm_NE_SW + vel );
-               src->get(x,y,z,Stencil::idx[SW]) = omega_trm * src->get(x,y,z,Stencil::idx[SW]) + omega_w2 * ( vel_trm_NE_SW - vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] - velZ[x];
-               const real_t vel_trm_TW_BE = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               src->get(x,y,z,Stencil::idx[TW]) = omega_trm * src->get(x,y,z,Stencil::idx[TW]) + omega_w2 * ( vel_trm_TW_BE - vel );
-               src->get(x,y,z,Stencil::idx[BE]) = omega_trm * src->get(x,y,z,Stencil::idx[BE]) + omega_w2 * ( vel_trm_TW_BE + vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] + velZ[x];
-               const real_t vel_trm_TE_BW = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               src->get(x,y,z,Stencil::idx[TE]) = omega_trm * src->get(x,y,z,Stencil::idx[TE]) + omega_w2 * ( vel_trm_TE_BW + vel );
-               src->get(x,y,z,Stencil::idx[BW]) = omega_trm * src->get(x,y,z,Stencil::idx[BW]) + omega_w2 * ( vel_trm_TE_BW - vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velY[x] - velZ[x];
-               const real_t vel_trm_TS_BN = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               src->get(x,y,z,Stencil::idx[TS]) = omega_trm * src->get(x,y,z,Stencil::idx[TS]) + omega_w2 * ( vel_trm_TS_BN - vel );
-               src->get(x,y,z,Stencil::idx[BN]) = omega_trm * src->get(x,y,z,Stencil::idx[BN]) + omega_w2 * ( vel_trm_TS_BN + vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velY[x] + velZ[x];
-               const real_t vel_trm_TN_BS = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               src->get(x,y,z,Stencil::idx[TN]) = omega_trm * src->get(x,y,z,Stencil::idx[TN]) + omega_w2 * ( vel_trm_TN_BS + vel );
-               src->get(x,y,z,Stencil::idx[BS]) = omega_trm * src->get(x,y,z,Stencil::idx[BS]) + omega_w2 * ( vel_trm_TN_BS - vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_N_S = dir_indep_trm[x] + real_c(1.5) * velY[x] * velY[x];
-
-               src->get(x,y,z,Stencil::idx[N]) = omega_trm * src->get(x,y,z,Stencil::idx[N]) + omega_w1 * ( vel_trm_N_S + velY[x] );
-               src->get(x,y,z,Stencil::idx[S]) = omega_trm * src->get(x,y,z,Stencil::idx[S]) + omega_w1 * ( vel_trm_N_S - velY[x] );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_E_W = dir_indep_trm[x] + real_c(1.5) * velX[x] * velX[x];
-
-               src->get(x,y,z,Stencil::idx[E]) = omega_trm * src->get(x,y,z,Stencil::idx[E]) + omega_w1 * ( vel_trm_E_W + velX[x] );
-               src->get(x,y,z,Stencil::idx[W]) = omega_trm * src->get(x,y,z,Stencil::idx[W]) + omega_w1 * ( vel_trm_E_W - velX[x] );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_T_B = dir_indep_trm[x] + real_c(1.5) * velZ[x] * velZ[x];
-
-               src->get(x,y,z,Stencil::idx[T]) = omega_trm * src->get(x,y,z,Stencil::idx[T]) + omega_w1 * ( vel_trm_T_B + velZ[x] );
-               src->get(x,y,z,Stencil::idx[B]) = omega_trm * src->get(x,y,z,Stencil::idx[B]) + omega_w1 * ( vel_trm_T_B - velZ[x] );
-            }
-         }
-      }
-   }
-
-   delete[] velX;
-   delete[] velY;
-   delete[] velZ;
-   delete[] dir_indep_trm;
-
-#ifdef _OPENMP
-   }
-#endif
-}
-
-
-
-///////////////////////////////
-// Specialization for D3Q19: //
-// - compressible          //
-// - no additional forces    //
-///////////////////////////////
-
-template< typename LatticeModel_T >
-class SplitPureSweep< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag, collision_model::SRT_tag >::value &&
-                                                               LatticeModel_T::CollisionModel::constant &&
-                                                               std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value &&
-                                                               LatticeModel_T::compressible,
-                                                               std::is_same< typename LatticeModel_T::ForceModel::tag, force_model::None_tag >::value
-                                                               >::type > :
-   public SweepBase< LatticeModel_T >
-{
-public:
-
-   static_assert( (std::is_same< typename LatticeModel_T::CollisionModel::tag, collision_model::SRT_tag >::value), "Only works with SRT!" );
-   static_assert( (std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value),                       "Only works with D3Q19!" );
-   static_assert( LatticeModel_T::compressible,                                                                      "Only works with compressible models!" );
-   static_assert( (std::is_same< typename LatticeModel_T::ForceModel::tag, force_model::None_tag >::value),        "Only works without additional forces!" );
-   static_assert( LatticeModel_T::equilibriumAccuracyOrder == 2, "Only works for lattice models that require the equilibrium distribution to be order 2 accurate!" );
-
-   typedef typename SweepBase<LatticeModel_T>::PdfField_T  PdfField_T;
-   typedef typename LatticeModel_T::Stencil                Stencil;
-
-   // block has NO dst pdf field
-   SplitPureSweep( const BlockDataID & pdfField ) :
-      SweepBase<LatticeModel_T>( pdfField ) {}
-
-   // every block has a dedicated dst pdf field
-   SplitPureSweep( const BlockDataID & src, const BlockDataID & dst ) :
-      SweepBase<LatticeModel_T>( src, dst ) {}
-
-   void operator()( IBlock * const block );
-
-   void stream ( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) );
-   void collide( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) );
-};
-
-template< typename LatticeModel_T >
-void SplitPureSweep< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag, collision_model::SRT_tag >::value &&
-                                                              LatticeModel_T::CollisionModel::constant &&
-                                                              std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value &&
-                                                              LatticeModel_T::compressible &&
-                                                              std::is_same< typename LatticeModel_T::ForceModel::tag, force_model::None_tag >::value
-                                                              >::type
-   >::operator()( IBlock * const block )
-{
-   PdfField_T * src( NULL );
-   PdfField_T * dst( NULL );
-
-   this->getFields( block, src, dst );
-
-   WALBERLA_ASSERT_NOT_NULLPTR( src );
-   WALBERLA_ASSERT_NOT_NULLPTR( dst );
-
-   WALBERLA_ASSERT_GREATER_EQUAL( src->nrOfGhostLayers(), 1 );
-
-   // constants used during stream/collide
-
-   const real_t omega = src->latticeModel().collisionModel().omega();
-
-   const real_t omega_trm( real_t(1) - omega );
-   const real_t  omega_w0( real_t(3) * ( real_t(1) / real_t( 3) ) * omega );
-   const real_t  omega_w1( real_t(3) * ( real_t(1) / real_t(18) ) * omega );
-   const real_t  omega_w2( real_t(3) * ( real_t(1) / real_t(36) ) * omega );
-   const real_t one_third( real_t(1) / real_t(3) );
-
-   // loop constants
-
-   const cell_idx_t xSize = cell_idx_c( src->xSize() );
-   const cell_idx_t ySize = cell_idx_c( src->ySize() );
-   const cell_idx_t zSize = cell_idx_c( src->zSize() );
-
-#ifdef _OPENMP
-   #pragma omp parallel
-   {
-#endif
-   // temporaries, calculated by the first innermost loop
-
-   real_t* velX = new real_t[ uint_c(xSize) ];
-   real_t* velY = new real_t[ uint_c(xSize) ];
-   real_t* velZ = new real_t[ uint_c(xSize) ];
-
-   real_t* rho = new real_t[ uint_c(xSize) ];
-
-   real_t* dir_indep_trm = new real_t[ uint_c(xSize) ];
-
-   if( src->layout() == field::fzyx && dst->layout() == field::fzyx )
-   {
-      #ifdef _OPENMP
-      const int izSize = int_c( zSize );
-      #pragma omp for schedule(static)
-      for( int iz = 0; iz < izSize; ++iz ) {
-         cell_idx_t z = cell_idx_c( iz );
-      #else
-      for( cell_idx_t z = 0; z < zSize; ++z ) {
-      #endif
-         for( cell_idx_t y = 0; y != ySize; ++y )
-         {
-            using namespace stencil;
-
-            real_t* pNE = &src->get(-1, y-1, z  , Stencil::idx[NE]);
-            real_t* pN  = &src->get(0 , y-1, z  , Stencil::idx[N]);
-            real_t* pNW = &src->get(+1, y-1, z  , Stencil::idx[NW]);
-            real_t* pW  = &src->get(+1, y  , z  , Stencil::idx[W]);
-            real_t* pSW = &src->get(+1, y+1, z  , Stencil::idx[SW]);
-            real_t* pS  = &src->get(0 , y+1, z  , Stencil::idx[S]);
-            real_t* pSE = &src->get(-1, y+1, z  , Stencil::idx[SE]);
-            real_t* pE  = &src->get(-1, y  , z  , Stencil::idx[E]);
-            real_t* pT  = &src->get(0 , y  , z-1, Stencil::idx[T]);
-            real_t* pTE = &src->get(-1, y  , z-1, Stencil::idx[TE]);
-            real_t* pTN = &src->get(0 , y-1, z-1, Stencil::idx[TN]);
-            real_t* pTW = &src->get(+1, y  , z-1, Stencil::idx[TW]);
-            real_t* pTS = &src->get(0 , y+1, z-1, Stencil::idx[TS]);
-            real_t* pB  = &src->get(0 , y  , z+1, Stencil::idx[B]);
-            real_t* pBE = &src->get(-1, y  , z+1, Stencil::idx[BE]);
-            real_t* pBN = &src->get(0 , y-1, z+1, Stencil::idx[BN]);
-            real_t* pBW = &src->get(+1, y  , z+1, Stencil::idx[BW]);
-            real_t* pBS = &src->get(0 , y+1, z+1, Stencil::idx[BS]);
-            real_t* pC  = &src->get(0 , y  , z  , Stencil::idx[C]);
-
-            real_t* dC = &dst->get(0,y,z,Stencil::idx[C]);
-
-            __alignx( 32, velX );
-            __alignx( 32, velY );
-            __alignx( 32, velZ );
-            __alignx( 32, rho );
-            __alignx( 32, dir_indep_trm  );
-
-            __alignx( 32, pNE );
-            __alignx( 32, pN  );
-            __alignx( 32, pNW );
-            __alignx( 32, pW  );
-            __alignx( 32, pSW );
-            __alignx( 32, pS  );
-            __alignx( 32, pSE );
-            __alignx( 32, pE  );
-            __alignx( 32, pT  );
-            __alignx( 32, pTE );
-            __alignx( 32, pTN );
-            __alignx( 32, pTW );
-            __alignx( 32, pTS );
-            __alignx( 32, pB  );
-            __alignx( 32, pBE );
-            __alignx( 32, pBN );
-            __alignx( 32, pBW );
-            __alignx( 32, pBS );
-            __alignx( 32, pC  );
-
-            __alignx( 32, dC  );
-
-            #pragma disjoint( *velX, *velY, *velZ, *rho, *dir_indep_trm, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              omega_trm, omega_w0, omega_w1, omega_w2, one_third )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-                 const real_t velX_trm = pE[x] + pNE[x] + pSE[x] + pTE[x] + pBE[x];
-                 const real_t velY_trm = pN[x] + pNW[x] + pTN[x] + pBN[x];
-                 const real_t velZ_trm = pT[x] + pTS[x] + pTW[x];
-
-                 rho[x] = pC[x] + pS[x] + pW[x] + pB[x] + pSW[x] + pBS[x] + pBW[x] + velX_trm + velY_trm + velZ_trm;
-                 const real_t rho_inv = real_t(1) / rho[x];
-
-                 velX[x] = rho_inv * ( velX_trm - pW[x]  - pNW[x] - pSW[x] - pTW[x] - pBW[x] );
-                 velY[x] = rho_inv * ( velY_trm + pNE[x] - pS[x]  - pSW[x] - pSE[x] - pTS[x] - pBS[x] );
-                 velZ[x] = rho_inv * ( velZ_trm + pTN[x] + pTE[x] - pB[x]  - pBN[x] - pBS[x] - pBW[x] - pBE[x] );
-
-                 dir_indep_trm[x] = one_third - real_c(0.5) * ( velX[x] * velX[x] + velY[x] * velY[x] + velZ[x] * velZ[x] );
-
-                 dC[x] = omega_trm * pC[x] + omega_w0 * rho[x] * dir_indep_trm[x];
-            }
-
-            real_t* dNW = &dst->get(0,y,z,Stencil::idx[NW]);
-            real_t* dSE = &dst->get(0,y,z,Stencil::idx[SE]);
-
-            __alignx( 32, dNW );
-            __alignx( 32, dSE );
-
-            #pragma disjoint( *dNW, *dSE, \
-                              *velX, *velY, *velZ, *rho, *dir_indep_trm, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              omega_trm, omega_w0, omega_w1, omega_w2, one_third )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] - velY[x];
-               const real_t vel_trm_NW_SE = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dNW[x] = omega_trm * pNW[x] + omega_w2 * rho[x] * ( vel_trm_NW_SE - vel );
-               dSE[x] = omega_trm * pSE[x] + omega_w2 * rho[x] * ( vel_trm_NW_SE + vel );
-            }
-
-            real_t* dNE = &dst->get(0,y,z,Stencil::idx[NE]);
-            real_t* dSW = &dst->get(0,y,z,Stencil::idx[SW]);
-
-            __alignx( 32, dNE );
-            __alignx( 32, dSW );
-
-            #pragma disjoint( *dNE, *dSW, *dNW, *dSE, \
-                              *velX, *velY, *velZ, *rho, *dir_indep_trm, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              omega_trm, omega_w0, omega_w1, omega_w2, one_third )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] + velY[x];
-               const real_t vel_trm_NE_SW = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dNE[x] = omega_trm * pNE[x] + omega_w2 * rho[x] * ( vel_trm_NE_SW + vel );
-               dSW[x] = omega_trm * pSW[x] + omega_w2 * rho[x] * ( vel_trm_NE_SW - vel );
-            }
-
-            real_t* dTW = &dst->get(0,y,z,Stencil::idx[TW]);
-            real_t* dBE = &dst->get(0,y,z,Stencil::idx[BE]);
-
-            __alignx( 32, dTW );
-            __alignx( 32, dBE );
-
-            #pragma disjoint( *dTW, *dBE, *dNE, *dSW, *dNW, *dSE, \
-                              *velX, *velY, *velZ, *rho, *dir_indep_trm, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              omega_trm, omega_w0, omega_w1, omega_w2, one_third )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] - velZ[x];
-               const real_t vel_trm_TW_BE = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dTW[x] = omega_trm * pTW[x] + omega_w2 * rho[x] * ( vel_trm_TW_BE - vel );
-               dBE[x] = omega_trm * pBE[x] + omega_w2 * rho[x] * ( vel_trm_TW_BE + vel );
-            }
-
-            real_t* dTE = &dst->get(0,y,z,Stencil::idx[TE]);
-            real_t* dBW = &dst->get(0,y,z,Stencil::idx[BW]);
-
-            __alignx( 32, dTE );
-            __alignx( 32, dBW );
-
-            #pragma disjoint( *dTE, *dBW, *dTW, *dBE, *dNE, *dSW, *dNW, *dSE, \
-                              *velX, *velY, *velZ, *rho, *dir_indep_trm, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              omega_trm, omega_w0, omega_w1, omega_w2, one_third )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] + velZ[x];
-               const real_t vel_trm_TE_BW = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dTE[x] = omega_trm * pTE[x] + omega_w2 * rho[x] * ( vel_trm_TE_BW + vel );
-               dBW[x] = omega_trm * pBW[x] + omega_w2 * rho[x] * ( vel_trm_TE_BW - vel );
-            }
-
-            real_t* dTS = &dst->get(0,y,z,Stencil::idx[TS]);
-            real_t* dBN = &dst->get(0,y,z,Stencil::idx[BN]);
-
-            __alignx( 32, dTS );
-            __alignx( 32, dBN );
-
-            #pragma disjoint( *dTS, *dBN, *dTE, *dBW, *dTW, *dBE, *dNE, *dSW, *dNW, *dSE, \
-                              *velX, *velY, *velZ, *rho, *dir_indep_trm, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              omega_trm, omega_w0, omega_w1, omega_w2, one_third )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velY[x] - velZ[x];
-               const real_t vel_trm_TS_BN = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dTS[x] = omega_trm * pTS[x] + omega_w2 * rho[x] * ( vel_trm_TS_BN - vel );
-               dBN[x] = omega_trm * pBN[x] + omega_w2 * rho[x] * ( vel_trm_TS_BN + vel );
-            }
-
-            real_t* dTN = &dst->get(0,y,z,Stencil::idx[TN]);
-            real_t* dBS = &dst->get(0,y,z,Stencil::idx[BS]);
-
-            __alignx( 32, dTN );
-            __alignx( 32, dBS );
-
-            #pragma disjoint( *dTN, *dBS, *dTS, *dBN, *dTE, *dBW, *dTW, *dBE, *dNE, *dSW, *dNW, *dSE, \
-                              *velX, *velY, *velZ, *rho, *dir_indep_trm, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              omega_trm, omega_w0, omega_w1, omega_w2, one_third )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velY[x] + velZ[x];
-               const real_t vel_trm_TN_BS = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dTN[x] = omega_trm * pTN[x] + omega_w2 * rho[x] * ( vel_trm_TN_BS + vel );
-               dBS[x] = omega_trm * pBS[x] + omega_w2 * rho[x] * ( vel_trm_TN_BS - vel );
-            }
-
-            real_t* dN = &dst->get(0,y,z,Stencil::idx[N]);
-            real_t* dS = &dst->get(0,y,z,Stencil::idx[S]);
-
-            __alignx( 32, dN );
-            __alignx( 32, dS );
-
-            #pragma disjoint( *dN, *dS, *dTN, *dBS, *dTS, *dBN, *dTE, *dBW, *dTW, *dBE, *dNE, *dSW, *dNW, *dSE, \
-                              *velX, *velY, *velZ, *rho, *dir_indep_trm, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              omega_trm, omega_w0, omega_w1, omega_w2, one_third )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_N_S = dir_indep_trm[x] + real_c(1.5) * velY[x] * velY[x];
-
-               dN[x] = omega_trm * pN[x] + omega_w1 * rho[x] * ( vel_trm_N_S + velY[x] );
-               dS[x] = omega_trm * pS[x] + omega_w1 * rho[x] * ( vel_trm_N_S - velY[x] );
-            }
-
-            real_t* dE = &dst->get(0,y,z,Stencil::idx[E]);
-            real_t* dW = &dst->get(0,y,z,Stencil::idx[W]);
-
-            __alignx( 32, dE );
-            __alignx( 32, dW );
-
-            #pragma disjoint( *dE, *dW, *dN, *dS, *dTN, *dBS, *dTS, *dBN, *dTE, *dBW, *dTW, *dBE, *dNE, *dSW, *dNW, *dSE, \
-                              *velX, *velY, *velZ, *rho, *dir_indep_trm, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              omega_trm, omega_w0, omega_w1, omega_w2, one_third )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_E_W = dir_indep_trm[x] + real_c(1.5) * velX[x] * velX[x];
-
-               dE[x] = omega_trm * pE[x] + omega_w1 * rho[x] * ( vel_trm_E_W + velX[x] );
-               dW[x] = omega_trm * pW[x] + omega_w1 * rho[x] * ( vel_trm_E_W - velX[x] );
-            }
-
-            real_t* dT = &dst->get(0,y,z,Stencil::idx[T]);
-            real_t* dB = &dst->get(0,y,z,Stencil::idx[B]);
-
-            __alignx( 32, dT );
-            __alignx( 32, dB );
-
-            #pragma disjoint( *dT, *dB, *dE, *dW, *dN, *dS, *dTN, *dBS, *dTS, *dBN, *dTE, *dBW, *dTW, *dBE, *dNE, *dSW, *dNW, *dSE, \
-                              *velX, *velY, *velZ, *rho, *dir_indep_trm, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              omega_trm, omega_w0, omega_w1, omega_w2, one_third )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_T_B = dir_indep_trm[x] + real_c(1.5) * velZ[x] * velZ[x];
-
-               dT[x] = omega_trm * pT[x] + omega_w1 * rho[x] * ( vel_trm_T_B + velZ[x] );
-               dB[x] = omega_trm * pB[x] + omega_w1 * rho[x] * ( vel_trm_T_B - velZ[x] );
-            }
-         }
-      }
-   }
-   else // ==> src->layout() == field::zyxf || dst->layout() == field::zyxf
-   {
-      #ifdef _OPENMP
-      const int izSize = int_c( zSize );
-      #pragma omp for schedule(static)
-      for( int iz = 0; iz < izSize; ++iz ) {
-         cell_idx_t z = cell_idx_c( iz );
-      #else
-      for( cell_idx_t z = 0; z < zSize; ++z ) {
-      #endif
-         for( cell_idx_t y = 0; y != ySize; ++y )
-         {
-            using namespace stencil;
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_NE = src->get(x-1, y-1, z  , Stencil::idx[NE]);
-               const real_t dd_tmp_N  = src->get(x  , y-1, z  , Stencil::idx[N]);
-               const real_t dd_tmp_NW = src->get(x+1, y-1, z  , Stencil::idx[NW]);
-               const real_t dd_tmp_W  = src->get(x+1, y  , z  , Stencil::idx[W]);
-               const real_t dd_tmp_SW = src->get(x+1, y+1, z  , Stencil::idx[SW]);
-               const real_t dd_tmp_S  = src->get(x  , y+1, z  , Stencil::idx[S]);
-               const real_t dd_tmp_SE = src->get(x-1, y+1, z  , Stencil::idx[SE]);
-               const real_t dd_tmp_E  = src->get(x-1, y  , z  , Stencil::idx[E]);
-               const real_t dd_tmp_T  = src->get(x  , y  , z-1, Stencil::idx[T]);
-               const real_t dd_tmp_TE = src->get(x-1, y  , z-1, Stencil::idx[TE]);
-               const real_t dd_tmp_TN = src->get(x  , y-1, z-1, Stencil::idx[TN]);
-               const real_t dd_tmp_TW = src->get(x+1, y  , z-1, Stencil::idx[TW]);
-               const real_t dd_tmp_TS = src->get(x  , y+1, z-1, Stencil::idx[TS]);
-               const real_t dd_tmp_B  = src->get(x  , y  , z+1, Stencil::idx[B]);
-               const real_t dd_tmp_BE = src->get(x-1, y  , z+1, Stencil::idx[BE]);
-               const real_t dd_tmp_BN = src->get(x  , y-1, z+1, Stencil::idx[BN]);
-               const real_t dd_tmp_BW = src->get(x+1, y  , z+1, Stencil::idx[BW]);
-               const real_t dd_tmp_BS = src->get(x  , y+1, z+1, Stencil::idx[BS]);
-               const real_t dd_tmp_C  = src->get(x  , y  , z  , Stencil::idx[C]);
-
-               const real_t velX_trm = dd_tmp_E + dd_tmp_NE + dd_tmp_SE + dd_tmp_TE + dd_tmp_BE;
-               const real_t velY_trm = dd_tmp_N + dd_tmp_NW + dd_tmp_TN + dd_tmp_BN;
-               const real_t velZ_trm = dd_tmp_T + dd_tmp_TS + dd_tmp_TW;
-
-               rho[x] = dd_tmp_C + dd_tmp_S + dd_tmp_W + dd_tmp_B + dd_tmp_SW + dd_tmp_BS + dd_tmp_BW + velX_trm + velY_trm + velZ_trm;
-               const real_t rho_inv = real_t(1) / rho[x];
-
-               velX[x] = rho_inv * ( velX_trm - dd_tmp_W  - dd_tmp_NW - dd_tmp_SW - dd_tmp_TW - dd_tmp_BW );
-               velY[x] = rho_inv * ( velY_trm + dd_tmp_NE - dd_tmp_S  - dd_tmp_SW - dd_tmp_SE - dd_tmp_TS - dd_tmp_BS );
-               velZ[x] = rho_inv * ( velZ_trm + dd_tmp_TN + dd_tmp_TE - dd_tmp_B  - dd_tmp_BN - dd_tmp_BS - dd_tmp_BW - dd_tmp_BE );
-
-               dir_indep_trm[x] = one_third - real_c(0.5) * ( velX[x] * velX[x] + velY[x] * velY[x] + velZ[x] * velZ[x] );
-
-               dst->get(x,y,z,Stencil::idx[C]) = omega_trm * dd_tmp_C + omega_w0 * rho[x] * dir_indep_trm[x];
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] - velY[x];
-               const real_t vel_trm_NW_SE = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dst->get(x,y,z,Stencil::idx[NW]) = omega_trm * src->get(x+1, y-1, z, Stencil::idx[NW]) + omega_w2 * rho[x] * ( vel_trm_NW_SE - vel );
-               dst->get(x,y,z,Stencil::idx[SE]) = omega_trm * src->get(x-1, y+1, z, Stencil::idx[SE]) + omega_w2 * rho[x] * ( vel_trm_NW_SE + vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] + velY[x];
-               const real_t vel_trm_NE_SW = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dst->get(x,y,z,Stencil::idx[NE]) = omega_trm * src->get(x-1, y-1, z, Stencil::idx[NE]) + omega_w2 * rho[x] * ( vel_trm_NE_SW + vel );
-               dst->get(x,y,z,Stencil::idx[SW]) = omega_trm * src->get(x+1, y+1, z, Stencil::idx[SW]) + omega_w2 * rho[x] * ( vel_trm_NE_SW - vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] - velZ[x];
-               const real_t vel_trm_TW_BE = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dst->get(x,y,z,Stencil::idx[TW]) = omega_trm * src->get(x+1, y, z-1, Stencil::idx[TW]) + omega_w2 * rho[x] * ( vel_trm_TW_BE - vel );
-               dst->get(x,y,z,Stencil::idx[BE]) = omega_trm * src->get(x-1, y, z+1, Stencil::idx[BE]) + omega_w2 * rho[x] * ( vel_trm_TW_BE + vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] + velZ[x];
-               const real_t vel_trm_TE_BW = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dst->get(x,y,z,Stencil::idx[TE]) = omega_trm * src->get(x-1, y, z-1, Stencil::idx[TE]) + omega_w2 * rho[x] * ( vel_trm_TE_BW + vel );
-               dst->get(x,y,z,Stencil::idx[BW]) = omega_trm * src->get(x+1, y, z+1, Stencil::idx[BW]) + omega_w2 * rho[x] * ( vel_trm_TE_BW - vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velY[x] - velZ[x];
-               const real_t vel_trm_TS_BN = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dst->get(x,y,z,Stencil::idx[TS]) = omega_trm * src->get(x, y+1, z-1, Stencil::idx[TS]) + omega_w2 * rho[x] * ( vel_trm_TS_BN - vel );
-               dst->get(x,y,z,Stencil::idx[BN]) = omega_trm * src->get(x, y-1, z+1, Stencil::idx[BN]) + omega_w2 * rho[x] * ( vel_trm_TS_BN + vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velY[x] + velZ[x];
-               const real_t vel_trm_TN_BS = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               dst->get(x,y,z,Stencil::idx[TN]) = omega_trm * src->get(x, y-1, z-1, Stencil::idx[TN]) + omega_w2 * rho[x] * ( vel_trm_TN_BS + vel );
-               dst->get(x,y,z,Stencil::idx[BS]) = omega_trm * src->get(x, y+1, z+1, Stencil::idx[BS]) + omega_w2 * rho[x] * ( vel_trm_TN_BS - vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_N_S = dir_indep_trm[x] + real_c(1.5) * velY[x] * velY[x];
-
-               dst->get(x,y,z,Stencil::idx[N]) = omega_trm * src->get(x, y-1, z, Stencil::idx[N]) + omega_w1 * rho[x] * ( vel_trm_N_S + velY[x] );
-               dst->get(x,y,z,Stencil::idx[S]) = omega_trm * src->get(x, y+1, z, Stencil::idx[S]) + omega_w1 * rho[x] * ( vel_trm_N_S - velY[x] );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_E_W = dir_indep_trm[x] + real_c(1.5) * velX[x] * velX[x];
-
-               dst->get(x,y,z,Stencil::idx[E]) = omega_trm * src->get(x-1, y, z, Stencil::idx[E]) + omega_w1 * rho[x] * ( vel_trm_E_W + velX[x] );
-               dst->get(x,y,z,Stencil::idx[W]) = omega_trm * src->get(x+1, y, z, Stencil::idx[W]) + omega_w1 * rho[x] * ( vel_trm_E_W - velX[x] );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_T_B = dir_indep_trm[x] + real_c(1.5) * velZ[x] * velZ[x];
-
-               dst->get(x,y,z,Stencil::idx[T]) = omega_trm * src->get(x, y, z-1, Stencil::idx[T]) + omega_w1 * rho[x] * ( vel_trm_T_B + velZ[x] );
-               dst->get(x,y,z,Stencil::idx[B]) = omega_trm * src->get(x, y, z+1, Stencil::idx[B]) + omega_w1 * rho[x] * ( vel_trm_T_B - velZ[x] );
-            }
-         }
-      }
-   }
-
-   delete[] velX;
-   delete[] velY;
-   delete[] velZ;
-   delete[] rho;
-   delete[] dir_indep_trm;
-
-#ifdef _OPENMP
-   }
-#endif
-
-   src->swapDataPointers( dst );
-}
-
-template< typename LatticeModel_T >
-void SplitPureSweep< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag, collision_model::SRT_tag >::value &&
-                                                                LatticeModel_T::CollisionModel::constant &&
-                                                                std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value &&
-                                                                LatticeModel_T::compressible &&
-                                                                std::is_same< typename LatticeModel_T::ForceModel::tag, force_model::None_tag >::value
-                                                                >::type
-   >::stream( IBlock * const block, const uint_t numberOfGhostLayersToInclude )
-{
-   PdfField_T * src( NULL );
-   PdfField_T * dst( NULL );
-
-   this->getFields( block, src, dst );
-
-   StreamEverything< LatticeModel_T >::execute( src, dst, numberOfGhostLayersToInclude );
-}
-
-template< typename LatticeModel_T >
-void SplitPureSweep< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag, collision_model::SRT_tag >::value &&
-                                                              LatticeModel_T::CollisionModel::constant &&
-                                                              std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value &&
-                                                              LatticeModel_T::compressible &&
-                                                              std::is_same< typename LatticeModel_T::ForceModel::tag, force_model::None_tag >::value
-                                                              >::type
-#ifdef NDEBUG
-   >::collide( IBlock * const block, const uint_t /*numberOfGhostLayersToInclude*/ )
-#else
-   >::collide( IBlock * const block, const uint_t numberOfGhostLayersToInclude )
-#endif
-{
-   WALBERLA_ASSERT_EQUAL( numberOfGhostLayersToInclude, uint_t(0) ); // the implementation right now doesn't support inclusion of ghost layers in collide step!
-
-   PdfField_T * src = this->getSrcField( block );
-
-   WALBERLA_ASSERT_NOT_NULLPTR( src );
-
-   WALBERLA_ASSERT_GREATER_EQUAL( src->nrOfGhostLayers(), numberOfGhostLayersToInclude );
-
-   // constants used during stream/collide
-
-   const real_t omega = src->latticeModel().collisionModel().omega();
-
-   const real_t omega_trm( real_t(1) - omega );
-   const real_t  omega_w0( real_t(3) * ( real_t(1) / real_t( 3) ) * omega );
-   const real_t  omega_w1( real_t(3) * ( real_t(1) / real_t(18) ) * omega );
-   const real_t  omega_w2( real_t(3) * ( real_t(1) / real_t(36) ) * omega );
-   const real_t one_third( real_t(1) / real_t(3) );
-
-   // loop constants
-
-   const cell_idx_t xSize = cell_idx_c( src->xSize() );
-   const cell_idx_t ySize = cell_idx_c( src->ySize() );
-   const cell_idx_t zSize = cell_idx_c( src->zSize() );
-
-#ifdef _OPENMP
-   #pragma omp parallel
-   {
-#endif
-   // temporaries, calculated by the first innermost loop
-
-   real_t* velX = new real_t[ uint_c(xSize) ];
-   real_t* velY = new real_t[ uint_c(xSize) ];
-   real_t* velZ = new real_t[ uint_c(xSize) ];
-
-   real_t* rho = new real_t[ uint_c(xSize) ];
-
-   real_t* dir_indep_trm = new real_t[ uint_c(xSize) ];
-
-   if( src->layout() == field::fzyx )
-   {
-      #ifdef _OPENMP
-      const int izSize = int_c( zSize );
-      #pragma omp for schedule(static)
-      for( int iz = 0; iz < izSize; ++iz ) {
-         cell_idx_t z = cell_idx_c( iz );
-      #else
-      for( cell_idx_t z = 0; z < zSize; ++z ) {
-      #endif
-         for( cell_idx_t y = 0; y != ySize; ++y )
-         {
-            using namespace stencil;
-
-            real_t* pC  = &src->get( 0, y, z, Stencil::idx[C]);
-            real_t* pN  = &src->get( 0, y, z, Stencil::idx[N]);
-            real_t* pS  = &src->get( 0, y, z, Stencil::idx[S]);
-            real_t* pW  = &src->get( 0, y, z, Stencil::idx[W]);
-            real_t* pE  = &src->get( 0, y, z, Stencil::idx[E]);
-            real_t* pT  = &src->get( 0, y, z, Stencil::idx[T]);
-            real_t* pB  = &src->get( 0, y, z, Stencil::idx[B]);
-            real_t* pNW = &src->get( 0, y, z, Stencil::idx[NW]);
-            real_t* pNE = &src->get( 0, y, z, Stencil::idx[NE]);
-            real_t* pSW = &src->get( 0, y, z, Stencil::idx[SW]);
-            real_t* pSE = &src->get( 0, y, z, Stencil::idx[SE]);
-            real_t* pTN = &src->get( 0, y, z, Stencil::idx[TN]);
-            real_t* pTS = &src->get( 0, y, z, Stencil::idx[TS]);
-            real_t* pTW = &src->get( 0, y, z, Stencil::idx[TW]);
-            real_t* pTE = &src->get( 0, y, z, Stencil::idx[TE]);
-            real_t* pBN = &src->get( 0, y, z, Stencil::idx[BN]);
-            real_t* pBS = &src->get( 0, y, z, Stencil::idx[BS]);
-            real_t* pBW = &src->get( 0, y, z, Stencil::idx[BW]);
-            real_t* pBE = &src->get( 0, y, z, Stencil::idx[BE]);
-
-            __alignx( 32, velX );
-            __alignx( 32, velY );
-            __alignx( 32, velZ );
-            __alignx( 32, rho );
-            __alignx( 32, dir_indep_trm  );
-
-            __alignx( 32, pNE );
-            __alignx( 32, pN  );
-            __alignx( 32, pNW );
-            __alignx( 32, pW  );
-            __alignx( 32, pSW );
-            __alignx( 32, pS  );
-            __alignx( 32, pSE );
-            __alignx( 32, pE  );
-            __alignx( 32, pT  );
-            __alignx( 32, pTE );
-            __alignx( 32, pTN );
-            __alignx( 32, pTW );
-            __alignx( 32, pTS );
-            __alignx( 32, pB  );
-            __alignx( 32, pBE );
-            __alignx( 32, pBN );
-            __alignx( 32, pBW );
-            __alignx( 32, pBS );
-            __alignx( 32, pC  );
-
-            #pragma disjoint( *velX, *velY, *velZ, *rho, *dir_indep_trm, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, \
-                              omega_trm, omega_w0, omega_w1, omega_w2, one_third )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velX_trm = pE[x] + pNE[x] + pSE[x] + pTE[x] + pBE[x];
-               const real_t velY_trm = pN[x] + pNW[x] + pTN[x] + pBN[x];
-               const real_t velZ_trm = pT[x] + pTS[x] + pTW[x];
-
-               rho[x] = pC[x] + pS[x] + pW[x] + pB[x] + pSW[x] + pBS[x] + pBW[x] + velX_trm + velY_trm + velZ_trm;
-               const real_t rho_inv = real_t(1) / rho[x];
-
-               velX[x] = rho_inv * ( velX_trm - pW[x]  - pNW[x] - pSW[x] - pTW[x] - pBW[x] );
-               velY[x] = rho_inv * ( velY_trm + pNE[x] - pS[x]  - pSW[x] - pSE[x] - pTS[x] - pBS[x] );
-               velZ[x] = rho_inv * ( velZ_trm + pTN[x] + pTE[x] - pB[x]  - pBN[x] - pBS[x] - pBW[x] - pBE[x] );
-
-               dir_indep_trm[x] = one_third - real_c(0.5) * ( velX[x] * velX[x] + velY[x] * velY[x] + velZ[x] * velZ[x] );
-
-               pC[x] = omega_trm * pC[x] + omega_w0 * rho[x] * dir_indep_trm[x];
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] - velY[x];
-               const real_t vel_trm_NW_SE = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               pNW[x] = omega_trm * pNW[x] + omega_w2 * rho[x] * ( vel_trm_NW_SE - vel );
-               pSE[x] = omega_trm * pSE[x] + omega_w2 * rho[x] * ( vel_trm_NW_SE + vel );
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] + velY[x];
-               const real_t vel_trm_NE_SW = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               pNE[x] = omega_trm * pNE[x] + omega_w2 * rho[x] * ( vel_trm_NE_SW + vel );
-               pSW[x] = omega_trm * pSW[x] + omega_w2 * rho[x] * ( vel_trm_NE_SW - vel );
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] - velZ[x];
-               const real_t vel_trm_TW_BE = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               pTW[x] = omega_trm * pTW[x] + omega_w2 * rho[x] * ( vel_trm_TW_BE - vel );
-               pBE[x] = omega_trm * pBE[x] + omega_w2 * rho[x] * ( vel_trm_TW_BE + vel );
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] + velZ[x];
-               const real_t vel_trm_TE_BW = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               pTE[x] = omega_trm * pTE[x] + omega_w2 * rho[x] * ( vel_trm_TE_BW + vel );
-               pBW[x] = omega_trm * pBW[x] + omega_w2 * rho[x] * ( vel_trm_TE_BW - vel );
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velY[x] - velZ[x];
-               const real_t vel_trm_TS_BN = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               pTS[x] = omega_trm * pTS[x] + omega_w2 * rho[x] * ( vel_trm_TS_BN - vel );
-               pBN[x] = omega_trm * pBN[x] + omega_w2 * rho[x] * ( vel_trm_TS_BN + vel );
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velY[x] + velZ[x];
-               const real_t vel_trm_TN_BS = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               pTN[x] = omega_trm * pTN[x] + omega_w2 * rho[x] * ( vel_trm_TN_BS + vel );
-               pBS[x] = omega_trm * pBS[x] + omega_w2 * rho[x] * ( vel_trm_TN_BS - vel );
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_N_S = dir_indep_trm[x] + real_c(1.5) * velY[x] * velY[x];
-
-               pN[x] = omega_trm * pN[x] + omega_w1 * rho[x] * ( vel_trm_N_S + velY[x] );
-               pS[x] = omega_trm * pS[x] + omega_w1 * rho[x] * ( vel_trm_N_S - velY[x] );
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_E_W = dir_indep_trm[x] + real_c(1.5) * velX[x] * velX[x];
-
-               pE[x] = omega_trm * pE[x] + omega_w1 * rho[x] * ( vel_trm_E_W + velX[x] );
-               pW[x] = omega_trm * pW[x] + omega_w1 * rho[x] * ( vel_trm_E_W - velX[x] );
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_T_B = dir_indep_trm[x] + real_c(1.5) * velZ[x] * velZ[x];
-
-               pT[x] = omega_trm * pT[x] + omega_w1 * rho[x] * ( vel_trm_T_B + velZ[x] );
-               pB[x] = omega_trm * pB[x] + omega_w1 * rho[x] * ( vel_trm_T_B - velZ[x] );
-            }
-         }
-      }
-   }
-   else // ==> src->layout() == field::zyxf
-   {
-      #ifdef _OPENMP
-      const int izSize = int_c( zSize );
-      #pragma omp for schedule(static)
-      for( int iz = 0; iz < izSize; ++iz ) {
-         cell_idx_t z = cell_idx_c( iz );
-      #else
-      for( cell_idx_t z = 0; z < zSize; ++z ) {
-      #endif
-         for( cell_idx_t y = 0; y != ySize; ++y )
-         {
-            using namespace stencil;
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_C  = src->get( x, y, z, Stencil::idx[C]  );
-               const real_t dd_tmp_N  = src->get( x, y, z, Stencil::idx[N]  );
-               const real_t dd_tmp_S  = src->get( x, y, z, Stencil::idx[S]  );
-               const real_t dd_tmp_W  = src->get( x, y, z, Stencil::idx[W]  );
-               const real_t dd_tmp_E  = src->get( x, y, z, Stencil::idx[E]  );
-               const real_t dd_tmp_T  = src->get( x, y, z, Stencil::idx[T]  );
-               const real_t dd_tmp_B  = src->get( x, y, z, Stencil::idx[B]  );
-               const real_t dd_tmp_NW = src->get( x, y, z, Stencil::idx[NW] );
-               const real_t dd_tmp_NE = src->get( x, y, z, Stencil::idx[NE] );
-               const real_t dd_tmp_SW = src->get( x, y, z, Stencil::idx[SW] );
-               const real_t dd_tmp_SE = src->get( x, y, z, Stencil::idx[SE] );
-               const real_t dd_tmp_TN = src->get( x, y, z, Stencil::idx[TN] );
-               const real_t dd_tmp_TS = src->get( x, y, z, Stencil::idx[TS] );
-               const real_t dd_tmp_TW = src->get( x, y, z, Stencil::idx[TW] );
-               const real_t dd_tmp_TE = src->get( x, y, z, Stencil::idx[TE] );
-               const real_t dd_tmp_BN = src->get( x, y, z, Stencil::idx[BN] );
-               const real_t dd_tmp_BS = src->get( x, y, z, Stencil::idx[BS] );
-               const real_t dd_tmp_BW = src->get( x, y, z, Stencil::idx[BW] );
-               const real_t dd_tmp_BE = src->get( x, y, z, Stencil::idx[BE] );
-
-               const real_t velX_trm = dd_tmp_E + dd_tmp_NE + dd_tmp_SE + dd_tmp_TE + dd_tmp_BE;
-               const real_t velY_trm = dd_tmp_N + dd_tmp_NW + dd_tmp_TN + dd_tmp_BN;
-               const real_t velZ_trm = dd_tmp_T + dd_tmp_TS + dd_tmp_TW;
-
-               rho[x] = dd_tmp_C + dd_tmp_S + dd_tmp_W + dd_tmp_B + dd_tmp_SW + dd_tmp_BS + dd_tmp_BW + velX_trm + velY_trm + velZ_trm;
-               const real_t rho_inv = real_t(1) / rho[x];
-
-               velX[x] = rho_inv * ( velX_trm - dd_tmp_W  - dd_tmp_NW - dd_tmp_SW - dd_tmp_TW - dd_tmp_BW );
-               velY[x] = rho_inv * ( velY_trm + dd_tmp_NE - dd_tmp_S  - dd_tmp_SW - dd_tmp_SE - dd_tmp_TS - dd_tmp_BS );
-               velZ[x] = rho_inv * ( velZ_trm + dd_tmp_TN + dd_tmp_TE - dd_tmp_B  - dd_tmp_BN - dd_tmp_BS - dd_tmp_BW - dd_tmp_BE );
-
-               dir_indep_trm[x] = one_third - real_c(0.5) * ( velX[x] * velX[x] + velY[x] * velY[x] + velZ[x] * velZ[x] );
-
-               src->get(x,y,z,Stencil::idx[C]) = omega_trm * dd_tmp_C + omega_w0 * rho[x] * dir_indep_trm[x];
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] - velY[x];
-               const real_t vel_trm_NW_SE = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               src->get(x,y,z,Stencil::idx[NW]) = omega_trm * src->get(x,y,z,Stencil::idx[NW]) + omega_w2 * rho[x] * ( vel_trm_NW_SE - vel );
-               src->get(x,y,z,Stencil::idx[SE]) = omega_trm * src->get(x,y,z,Stencil::idx[SE]) + omega_w2 * rho[x] * ( vel_trm_NW_SE + vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] + velY[x];
-               const real_t vel_trm_NE_SW = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               src->get(x,y,z,Stencil::idx[NE]) = omega_trm * src->get(x,y,z,Stencil::idx[NE]) + omega_w2 * rho[x] * ( vel_trm_NE_SW + vel );
-               src->get(x,y,z,Stencil::idx[SW]) = omega_trm * src->get(x,y,z,Stencil::idx[SW]) + omega_w2 * rho[x] * ( vel_trm_NE_SW - vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] - velZ[x];
-               const real_t vel_trm_TW_BE = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               src->get(x,y,z,Stencil::idx[TW]) = omega_trm * src->get(x,y,z,Stencil::idx[TW]) + omega_w2 * rho[x] * ( vel_trm_TW_BE - vel );
-               src->get(x,y,z,Stencil::idx[BE]) = omega_trm * src->get(x,y,z,Stencil::idx[BE]) + omega_w2 * rho[x] * ( vel_trm_TW_BE + vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velX[x] + velZ[x];
-               const real_t vel_trm_TE_BW = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               src->get(x,y,z,Stencil::idx[TE]) = omega_trm * src->get(x,y,z,Stencil::idx[TE]) + omega_w2 * rho[x] * ( vel_trm_TE_BW + vel );
-               src->get(x,y,z,Stencil::idx[BW]) = omega_trm * src->get(x,y,z,Stencil::idx[BW]) + omega_w2 * rho[x] * ( vel_trm_TE_BW - vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velY[x] - velZ[x];
-               const real_t vel_trm_TS_BN = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               src->get(x,y,z,Stencil::idx[TS]) = omega_trm * src->get(x,y,z,Stencil::idx[TS]) + omega_w2 * rho[x] * ( vel_trm_TS_BN - vel );
-               src->get(x,y,z,Stencil::idx[BN]) = omega_trm * src->get(x,y,z,Stencil::idx[BN]) + omega_w2 * rho[x] * ( vel_trm_TS_BN + vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel = velY[x] + velZ[x];
-               const real_t vel_trm_TN_BS = dir_indep_trm[x] + real_c(1.5) * vel * vel;
-
-               src->get(x,y,z,Stencil::idx[TN]) = omega_trm * src->get(x,y,z,Stencil::idx[TN]) + omega_w2 * rho[x] * ( vel_trm_TN_BS + vel );
-               src->get(x,y,z,Stencil::idx[BS]) = omega_trm * src->get(x,y,z,Stencil::idx[BS]) + omega_w2 * rho[x] * ( vel_trm_TN_BS - vel );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_N_S = dir_indep_trm[x] + real_c(1.5) * velY[x] * velY[x];
-
-               src->get(x,y,z,Stencil::idx[N]) = omega_trm * src->get(x,y,z,Stencil::idx[N]) + omega_w1 * rho[x] * ( vel_trm_N_S + velY[x] );
-               src->get(x,y,z,Stencil::idx[S]) = omega_trm * src->get(x,y,z,Stencil::idx[S]) + omega_w1 * rho[x] * ( vel_trm_N_S - velY[x] );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_E_W = dir_indep_trm[x] + real_c(1.5) * velX[x] * velX[x];
-
-               src->get(x,y,z,Stencil::idx[E]) = omega_trm * src->get(x,y,z,Stencil::idx[E]) + omega_w1 * rho[x] * ( vel_trm_E_W + velX[x] );
-               src->get(x,y,z,Stencil::idx[W]) = omega_trm * src->get(x,y,z,Stencil::idx[W]) + omega_w1 * rho[x] * ( vel_trm_E_W - velX[x] );
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t vel_trm_T_B = dir_indep_trm[x] + real_c(1.5) * velZ[x] * velZ[x];
-
-               src->get(x,y,z,Stencil::idx[T]) = omega_trm * src->get(x,y,z,Stencil::idx[T]) + omega_w1 * rho[x] * ( vel_trm_T_B + velZ[x] );
-               src->get(x,y,z,Stencil::idx[B]) = omega_trm * src->get(x,y,z,Stencil::idx[B]) + omega_w1 * rho[x] * ( vel_trm_T_B - velZ[x] );
-            }
-         }
-      }
-   }
-
-   delete[] velX;
-   delete[] velY;
-   delete[] velZ;
-   delete[] rho;
-   delete[] dir_indep_trm;
-
-#ifdef _OPENMP
-   }
-#endif
-}
-
-
-
-} // namespace lbm
-} // namespace walberla
-
-#endif
-
diff --git a/src/lbm/trt/SplitPureSweep.impl.h b/src/lbm/trt/SplitPureSweep.impl.h
index 3594863a9..d10ab4087 100644
--- a/src/lbm/trt/SplitPureSweep.impl.h
+++ b/src/lbm/trt/SplitPureSweep.impl.h
@@ -21,8 +21,6 @@
 
 
 
-#ifndef __IBMCPP__
-
 #pragma once
 
 #include "lbm/lattice_model/LatticeModelBase.h"
@@ -1677,10 +1675,3 @@ void SplitPureSweep< LatticeModel_T, typename std::enable_if< std::is_same< type
 
 } // namespace lbm
 } // namespace walberla
-
-
-#else // #ifdef __IBMCPP__
-
-#include "lbm/trt/bluegeneq/SplitPureSweep.impl.h"
-
-#endif
diff --git a/src/lbm/trt/bluegeneq/SplitPureSweep.impl.h b/src/lbm/trt/bluegeneq/SplitPureSweep.impl.h
deleted file mode 100644
index 383b697a3..000000000
--- a/src/lbm/trt/bluegeneq/SplitPureSweep.impl.h
+++ /dev/null
@@ -1,2078 +0,0 @@
-//======================================================================================================================
-//
-//  This file is part of waLBerla. waLBerla is free software: you can 
-//  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
-//  the License, or (at your option) any later version.
-//  
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
-//  for more details.
-//  
-//  You should have received a copy of the GNU General Public License along
-//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
-//
-//! \file SplitPureSweep.impl.h
-//! \ingroup lbm
-//! \author Florian Schornbaum <florian.schornbaum@fau.de>
-//
-//======================================================================================================================
-
-
-
-#ifdef __IBMCPP__
-
-#pragma once
-
-#include "lbm/lattice_model/LatticeModelBase.h"
-#include "lbm/lattice_model/D3Q19.h"
-#include "lbm/sweeps/Streaming.h"
-#include "lbm/sweeps/SweepBase.h"
-
-#include <builtins.h>
-#include <type_traits>
-
-// OpenMP
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-namespace walberla {
-namespace lbm {
-
-
-///////////////////////////////////////////////////////
-// Available TRT implementations:                    //
-//                                                   //
-// There are no generic (D*Q*) versions!             //
-//                                                   //
-// Optimized D3Q19 implementation:                   //
-//                     incompressible | compressible //
-//          no forces:       x               x       //
-///////////////////////////////////////////////////////
-
-
-///////////////////////////////
-// Specialization for D3Q19: //
-// - incompressible          //
-// - no additional forces    //
-///////////////////////////////
-
-template< typename LatticeModel_T >
-class SplitPureSweep< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag, collision_model::TRT_tag >::value &&
-                                                               std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value &&
-                                                               ! LatticeModel_T::compressible &&
-                                                               std::is_same< typename LatticeModel_T::ForceModel::tag, force_model::None_tag >::value
-                                                               >::type > :
-   public SweepBase< LatticeModel_T >
-{
-public:
-
-   static_assert( (std::is_same< typename LatticeModel_T::CollisionModel::tag, collision_model::TRT_tag >::value), "Only works with TRT!" );
-   static_assert( (std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value),                       "Only works with D3Q19!" );
-   static_assert( LatticeModel_T::compressible == false,                                                             "Only works with incompressible models!" );
-   static_assert( (std::is_same< typename LatticeModel_T::ForceModel::tag, force_model::None_tag >::value),        "Only works without additional forces!" );
-   static_assert( LatticeModel_T::equilibriumAccuracyOrder == 2, "Only works for lattice models that require the equilibrium distribution to be order 2 accurate!" );
-
-   typedef typename SweepBase<LatticeModel_T>::PdfField_T  PdfField_T;
-   typedef typename LatticeModel_T::Stencil                Stencil;
-
-   // block has NO dst pdf field
-   SplitPureSweep( const BlockDataID & pdfField ) :
-      SweepBase<LatticeModel_T>( pdfField ) {}
-
-   // every block has a dedicated dst pdf field
-   SplitPureSweep( const BlockDataID & src, const BlockDataID & dst ) :
-      SweepBase<LatticeModel_T>( src, dst ) {}
-
-   void operator()( IBlock * const block );
-
-   void stream ( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) );
-   void collide( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) );
-};
-
-template< typename LatticeModel_T >
-void SplitPureSweep< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag, collision_model::TRT_tag >::value &&
-                                                              std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value &&
-                                                              ! LatticeModel_T::compressible >::value &&
-                                                              std::is_same< typename LatticeModel_T::ForceModel::tag, force_model::None_tag >::value
-                                                              >::type
-   >::operator()( IBlock * const block )
-{
-   PdfField_T * src( NULL );
-   PdfField_T * dst( NULL );
-
-   this->getFields( block, src, dst );
-
-   WALBERLA_ASSERT_NOT_NULLPTR( src );
-   WALBERLA_ASSERT_NOT_NULLPTR( dst );
-
-   WALBERLA_ASSERT_GREATER_EQUAL( src->nrOfGhostLayers(), 1 );
-
-   // constants used during stream/collide
-
-   const real_t lambda_e =  src->latticeModel().collisionModel().lambda_e();
-   const real_t lambda_d =  src->latticeModel().collisionModel().lambda_d();
-
-   // common prefactors for calculating the equilibrium parts
-   const real_t t0   = real_t(1.0) / real_t(3.0);                 // 1/3      for C
-   const real_t t1x2 = real_t(1.0) / real_t(18.0) * real_t(2.0);  // 1/18 * 2 for N, S, W, E, T, B
-   const real_t t2x2 = real_t(1.0) / real_t(36.0) * real_t(2.0);  // 1/36 * 2 else
-
-   const real_t inv2csq2 = real_t(1.0) / ( real_t(2.0) * ( real_t(1.0) / real_t(3.0) ) * ( real_t(1.0) / real_t(3.0) ) ); //speed of sound related factor for equilibrium distribution function
-   const real_t fac1     = t1x2 * inv2csq2;
-   const real_t fac2     = t2x2 * inv2csq2;
-
-   // relaxation parameter variables
-   const real_t lambda_e_scaled = real_t(0.5) * lambda_e; // 0.5 times the usual value ...
-   const real_t lambda_d_scaled = real_t(0.5) * lambda_d; // ... due to the way of calculations
-
-   // loop constants
-
-   const cell_idx_t xSize = cell_idx_c( src->xSize() );
-   const cell_idx_t ySize = cell_idx_c( src->ySize() );
-   const cell_idx_t zSize = cell_idx_c( src->zSize() );
-
-#ifdef _OPENMP
-   #pragma omp parallel
-   {
-#endif
-   // temporaries, calculated by the first innermost loop
-
-   real_t* velX = new real_t[ uint_c(xSize) ];
-   real_t* velY = new real_t[ uint_c(xSize) ];
-   real_t* velZ = new real_t[ uint_c(xSize) ];
-
-   real_t* feq_common = new real_t[ uint_c(xSize) ];
-
-   if( src->layout() == field::fzyx && dst->layout() == field::fzyx )
-   {
-      #ifdef _OPENMP
-      const int izSize = int_c( zSize );
-      #pragma omp for schedule(static)
-      for( int iz = 0; iz < izSize; ++iz ) {
-         cell_idx_t z = cell_idx_c( iz );
-      #else
-      for( cell_idx_t z = 0; z < zSize; ++z ) {
-      #endif
-         for( cell_idx_t y = 0; y != ySize; ++y )
-         {
-            using namespace stencil;
-
-            real_t* pNE = &src->get(-1, y-1, z  , Stencil::idx[NE]);
-            real_t* pN  = &src->get(0 , y-1, z  , Stencil::idx[N]);
-            real_t* pNW = &src->get(+1, y-1, z  , Stencil::idx[NW]);
-            real_t* pW  = &src->get(+1, y  , z  , Stencil::idx[W]);
-            real_t* pSW = &src->get(+1, y+1, z  , Stencil::idx[SW]);
-            real_t* pS  = &src->get(0 , y+1, z  , Stencil::idx[S]);
-            real_t* pSE = &src->get(-1, y+1, z  , Stencil::idx[SE]);
-            real_t* pE  = &src->get(-1, y  , z  , Stencil::idx[E]);
-            real_t* pT  = &src->get(0 , y  , z-1, Stencil::idx[T]);
-            real_t* pTE = &src->get(-1, y  , z-1, Stencil::idx[TE]);
-            real_t* pTN = &src->get(0 , y-1, z-1, Stencil::idx[TN]);
-            real_t* pTW = &src->get(+1, y  , z-1, Stencil::idx[TW]);
-            real_t* pTS = &src->get(0 , y+1, z-1, Stencil::idx[TS]);
-            real_t* pB  = &src->get(0 , y  , z+1, Stencil::idx[B]);
-            real_t* pBE = &src->get(-1, y  , z+1, Stencil::idx[BE]);
-            real_t* pBN = &src->get(0 , y-1, z+1, Stencil::idx[BN]);
-            real_t* pBW = &src->get(+1, y  , z+1, Stencil::idx[BW]);
-            real_t* pBS = &src->get(0 , y+1, z+1, Stencil::idx[BS]);
-            real_t* pC  = &src->get(0 , y  , z  , Stencil::idx[C]);
-
-            real_t* dC = &dst->get(0,y,z,Stencil::idx[C]);
-
-            __alignx( 32, velX );
-            __alignx( 32, velY );
-            __alignx( 32, velZ );
-            __alignx( 32, feq_common  );
-
-            __alignx( 32, pNE );
-            __alignx( 32, pN  );
-            __alignx( 32, pNW );
-            __alignx( 32, pW  );
-            __alignx( 32, pSW );
-            __alignx( 32, pS  );
-            __alignx( 32, pSE );
-            __alignx( 32, pE  );
-            __alignx( 32, pT  );
-            __alignx( 32, pTE );
-            __alignx( 32, pTN );
-            __alignx( 32, pTW );
-            __alignx( 32, pTS );
-            __alignx( 32, pB  );
-            __alignx( 32, pBE );
-            __alignx( 32, pBN );
-            __alignx( 32, pBW );
-            __alignx( 32, pBS );
-            __alignx( 32, pC  );
-
-            __alignx( 32, dC  );
-
-            #pragma disjoint( *velX, *velY, *velZ, *feq_common, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              t0, t1x2, t2x2, fac1, fac2, lambda_e_scaled, lambda_d_scaled )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velX_trm = pE[x] + pNE[x] + pSE[x] + pTE[x] + pBE[x];
-               const real_t velY_trm = pN[x] + pNW[x] + pTN[x] + pBN[x];
-               const real_t velZ_trm = pT[x] + pTS[x] + pTW[x];
-
-               const real_t rho = pC[x] + pS[x] + pW[x] + pB[x] + pSW[x] + pBS[x] + pBW[x] + velX_trm + velY_trm + velZ_trm;
-
-               velX[x] = velX_trm - pW[x]  - pNW[x] - pSW[x] - pTW[x] - pBW[x];
-               velY[x] = velY_trm + pNE[x] - pS[x]  - pSW[x] - pSE[x] - pTS[x] - pBS[x];
-               velZ[x] = velZ_trm + pTN[x] + pTE[x] - pB[x]  - pBN[x] - pBS[x] - pBW[x] - pBE[x];
-
-               feq_common[x] = rho - real_t(1.5) * ( velX[x] * velX[x] + velY[x] * velY[x] + velZ[x] * velZ[x] );
-
-               dC[x] = pC[x] * (real_t(1.0) - lambda_e) + lambda_e * t0 * feq_common[x];
-            }
-
-            real_t* dNE = &dst->get(0,y,z,Stencil::idx[NE]);
-            real_t* dSW = &dst->get(0,y,z,Stencil::idx[SW]);
-
-            __alignx( 32, dNE );
-            __alignx( 32, dSW );
-
-            #pragma disjoint( *dNE, *dSW, \
-                              *velX, *velY, *velZ, *feq_common, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              t0, t1x2, t2x2, fac1, fac2, lambda_e_scaled, lambda_d_scaled )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velXPY = velX[x] + velY[x];
-               const real_t  sym_NE_SW = lambda_e_scaled * ( pNE[x] + pSW[x] - fac2 * velXPY * velXPY - t2x2 * feq_common[x] );
-               const real_t asym_NE_SW = lambda_d_scaled * ( pNE[x] - pSW[x] - real_t(3.0) * t2x2 * velXPY );
-
-               dNE[x] = pNE[x] - sym_NE_SW - asym_NE_SW;
-               dSW[x] = pSW[x] - sym_NE_SW + asym_NE_SW;
-            }
-
-            real_t* dSE = &dst->get(0,y,z,Stencil::idx[SE]);
-            real_t* dNW = &dst->get(0,y,z,Stencil::idx[NW]);
-
-            __alignx( 32, dSE );
-            __alignx( 32, dNW );
-
-            #pragma disjoint( *dSE, *dNW, *dNE, *dSW, \
-                              *velX, *velY, *velZ, *feq_common, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              t0, t1x2, t2x2, fac1, fac2, lambda_e_scaled, lambda_d_scaled )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velXMY = velX[x] - velY[x];
-               const real_t  sym_SE_NW = lambda_e_scaled * ( pSE[x] + pNW[x] - fac2 * velXMY * velXMY - t2x2 * feq_common[x] );
-               const real_t asym_SE_NW = lambda_d_scaled * ( pSE[x] - pNW[x] - real_t(3.0) * t2x2 * velXMY );
-
-               dSE[x] = pSE[x] - sym_SE_NW - asym_SE_NW;
-               dNW[x] = pNW[x] - sym_SE_NW + asym_SE_NW;
-            }
-
-            real_t* dTE = &dst->get(0,y,z,Stencil::idx[TE]);
-            real_t* dBW = &dst->get(0,y,z,Stencil::idx[BW]);
-
-            __alignx( 32, dTE );
-            __alignx( 32, dBW );
-
-            #pragma disjoint( *dTE, *dBW, *dSE, *dNW, *dNE, *dSW, \
-                              *velX, *velY, *velZ, *feq_common, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              t0, t1x2, t2x2, fac1, fac2, lambda_e_scaled, lambda_d_scaled )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velXPZ = velX[x] + velZ[x];
-               const real_t  sym_TE_BW = lambda_e_scaled * ( pTE[x] + pBW[x] - fac2 * velXPZ * velXPZ - t2x2 * feq_common[x] );
-               const real_t asym_TE_BW = lambda_d_scaled * ( pTE[x] - pBW[x] - real_t(3.0) * t2x2 * velXPZ );
-
-               dTE[x] = pTE[x] - sym_TE_BW - asym_TE_BW;
-               dBW[x] = pBW[x] - sym_TE_BW + asym_TE_BW;
-            }
-
-            real_t* dBE = &dst->get(0,y,z,Stencil::idx[BE]);
-            real_t* dTW = &dst->get(0,y,z,Stencil::idx[TW]);
-
-            __alignx( 32, dBE );
-            __alignx( 32, dTW );
-
-            #pragma disjoint( *dBE, *dTW, *dTE, *dBW, *dSE, *dNW, *dNE, *dSW, \
-                              *velX, *velY, *velZ, *feq_common, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              t0, t1x2, t2x2, fac1, fac2, lambda_e_scaled, lambda_d_scaled )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velXMZ = velX[x] - velZ[x];
-               const real_t  sym_BE_TW = lambda_e_scaled * ( pBE[x] + pTW[x] - fac2 * velXMZ * velXMZ - t2x2 * feq_common[x] );
-               const real_t asym_BE_TW = lambda_d_scaled * ( pBE[x] - pTW[x] - real_t(3.0) * t2x2 * velXMZ );
-
-               dBE[x] = pBE[x] - sym_BE_TW - asym_BE_TW;
-               dTW[x] = pTW[x] - sym_BE_TW + asym_BE_TW;
-            }
-
-            real_t* dTN = &dst->get(0,y,z,Stencil::idx[TN]);
-            real_t* dBS = &dst->get(0,y,z,Stencil::idx[BS]);
-
-            __alignx( 32, dTN );
-            __alignx( 32, dBS );
-
-            #pragma disjoint( *dTN, *dBS, *dBE, *dTW, *dTE, *dBW, *dSE, *dNW, *dNE, *dSW, \
-                              *velX, *velY, *velZ, *feq_common, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              t0, t1x2, t2x2, fac1, fac2, lambda_e_scaled, lambda_d_scaled )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velYPZ = velY[x] + velZ[x];
-               const real_t  sym_TN_BS = lambda_e_scaled * ( pTN[x] + pBS[x] - fac2 * velYPZ * velYPZ - t2x2 * feq_common[x] );
-               const real_t asym_TN_BS = lambda_d_scaled * ( pTN[x] - pBS[x] - real_t(3.0) * t2x2 * velYPZ );
-
-               dTN[x] = pTN[x] - sym_TN_BS - asym_TN_BS;
-               dBS[x] = pBS[x] - sym_TN_BS + asym_TN_BS;
-            }
-
-            real_t* dBN = &dst->get(0,y,z,Stencil::idx[BN]);
-            real_t* dTS = &dst->get(0,y,z,Stencil::idx[TS]);
-
-            __alignx( 32, dBN );
-            __alignx( 32, dTS );
-
-            #pragma disjoint( *dBN, *dTS, *dTN, *dBS, *dBE, *dTW, *dTE, *dBW, *dSE, *dNW, *dNE, *dSW, \
-                              *velX, *velY, *velZ, *feq_common, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              t0, t1x2, t2x2, fac1, fac2, lambda_e_scaled, lambda_d_scaled )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velYMZ = velY[x] - velZ[x];
-               const real_t  sym_BN_TS = lambda_e_scaled * ( pBN[x] + pTS[x] - fac2 * velYMZ * velYMZ - t2x2 * feq_common[x] );
-               const real_t asym_BN_TS = lambda_d_scaled * ( pBN[x] - pTS[x] - real_t(3.0) * t2x2 * velYMZ );
-
-               dBN[x] = pBN[x] - sym_BN_TS - asym_BN_TS;
-               dTS[x] = pTS[x] - sym_BN_TS + asym_BN_TS;
-            }
-
-            real_t* dN = &dst->get(0,y,z,Stencil::idx[N]);
-            real_t* dS = &dst->get(0,y,z,Stencil::idx[S]);
-
-            __alignx( 32, dN );
-            __alignx( 32, dS );
-
-            #pragma disjoint( *dN, *dS, *dBN, *dTS, *dTN, *dBS, *dBE, *dTW, *dTE, *dBW, *dSE, *dNW, *dNE, *dSW, \
-                              *velX, *velY, *velZ, *feq_common, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              t0, t1x2, t2x2, fac1, fac2, lambda_e_scaled, lambda_d_scaled )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t  sym_N_S = lambda_e_scaled * ( pN[x] + pS[x] - fac1 * velY[x] * velY[x] - t1x2 * feq_common[x] );
-               const real_t asym_N_S = lambda_d_scaled * ( pN[x] - pS[x] - real_t(3.0) * t1x2 * velY[x] );
-
-               dN[x] = pN[x] - sym_N_S - asym_N_S;
-               dS[x] = pS[x] - sym_N_S + asym_N_S;
-            }
-
-            real_t* dE = &dst->get(0,y,z,Stencil::idx[E]);
-            real_t* dW = &dst->get(0,y,z,Stencil::idx[W]);
-
-            __alignx( 32, dE );
-            __alignx( 32, dW );
-
-            #pragma disjoint( *dE, *dW, *dN, *dS, *dBN, *dTS, *dTN, *dBS, *dBE, *dTW, *dTE, *dBW, *dSE, *dNW, *dNE, *dSW, \
-                              *velX, *velY, *velZ, *feq_common, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              t0, t1x2, t2x2, fac1, fac2, lambda_e_scaled, lambda_d_scaled )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t  sym_E_W = lambda_e_scaled * ( pE[x] + pW[x] - fac1 * velX[x] * velX[x] - t1x2 * feq_common[x] );
-               const real_t asym_E_W = lambda_d_scaled * ( pE[x] - pW[x] - real_t(3.0) * t1x2 * velX[x] );
-
-               dE[x] = pE[x] - sym_E_W - asym_E_W;
-               dW[x] = pW[x] - sym_E_W + asym_E_W;
-            }
-
-            real_t* dT = &dst->get(0,y,z,Stencil::idx[T]);
-            real_t* dB = &dst->get(0,y,z,Stencil::idx[B]);
-
-            __alignx( 32, dT );
-            __alignx( 32, dB );
-
-            #pragma disjoint( *dT, *dB, *dE, *dW, *dN, *dS, *dBN, *dTS, *dTN, *dBS, *dBE, *dTW, *dTE, *dBW, *dSE, *dNW, *dNE, *dSW, \
-                              *velX, *velY, *velZ, *feq_common, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              t0, t1x2, t2x2, fac1, fac2, lambda_e_scaled, lambda_d_scaled )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t  sym_T_B = lambda_e_scaled * ( pT[x] + pB[x]  - fac1 * velZ[x] * velZ[x] - t1x2 * feq_common[x] );
-               const real_t asym_T_B = lambda_d_scaled * ( pT[x] - pB[x] - real_t(3.0) * t1x2 * velZ[x] );
-
-               dT[x] = pT[x] - sym_T_B - asym_T_B;
-               dB[x] = pB[x] - sym_T_B + asym_T_B;
-            }
-         }
-      }
-   }
-   else // ==> src->layout() == field::zyxf || dst->layout() == field::zyxf
-   {
-      #ifdef _OPENMP
-      const int izSize = int_c( zSize );
-      #pragma omp for schedule(static)
-      for( int iz = 0; iz < izSize; ++iz ) {
-         cell_idx_t z = cell_idx_c( iz );
-      #else
-      for( cell_idx_t z = 0; z < zSize; ++z ) {
-      #endif
-         for( cell_idx_t y = 0; y != ySize; ++y )
-         {
-            using namespace stencil;
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_NE = src->get(x-1, y-1, z  , Stencil::idx[NE]);
-               const real_t dd_tmp_N  = src->get(x  , y-1, z  , Stencil::idx[N]);
-               const real_t dd_tmp_NW = src->get(x+1, y-1, z  , Stencil::idx[NW]);
-               const real_t dd_tmp_W  = src->get(x+1, y  , z  , Stencil::idx[W]);
-               const real_t dd_tmp_SW = src->get(x+1, y+1, z  , Stencil::idx[SW]);
-               const real_t dd_tmp_S  = src->get(x  , y+1, z  , Stencil::idx[S]);
-               const real_t dd_tmp_SE = src->get(x-1, y+1, z  , Stencil::idx[SE]);
-               const real_t dd_tmp_E  = src->get(x-1, y  , z  , Stencil::idx[E]);
-               const real_t dd_tmp_T  = src->get(x  , y  , z-1, Stencil::idx[T]);
-               const real_t dd_tmp_TE = src->get(x-1, y  , z-1, Stencil::idx[TE]);
-               const real_t dd_tmp_TN = src->get(x  , y-1, z-1, Stencil::idx[TN]);
-               const real_t dd_tmp_TW = src->get(x+1, y  , z-1, Stencil::idx[TW]);
-               const real_t dd_tmp_TS = src->get(x  , y+1, z-1, Stencil::idx[TS]);
-               const real_t dd_tmp_B  = src->get(x  , y  , z+1, Stencil::idx[B]);
-               const real_t dd_tmp_BE = src->get(x-1, y  , z+1, Stencil::idx[BE]);
-               const real_t dd_tmp_BN = src->get(x  , y-1, z+1, Stencil::idx[BN]);
-               const real_t dd_tmp_BW = src->get(x+1, y  , z+1, Stencil::idx[BW]);
-               const real_t dd_tmp_BS = src->get(x  , y+1, z+1, Stencil::idx[BS]);
-               const real_t dd_tmp_C  = src->get(x  , y  , z  , Stencil::idx[C]);
-
-               const real_t velX_trm = dd_tmp_E + dd_tmp_NE + dd_tmp_SE + dd_tmp_TE + dd_tmp_BE;
-               const real_t velY_trm = dd_tmp_N + dd_tmp_NW + dd_tmp_TN + dd_tmp_BN;
-               const real_t velZ_trm = dd_tmp_T + dd_tmp_TS + dd_tmp_TW;
-
-               const real_t rho = dd_tmp_C + dd_tmp_S + dd_tmp_W + dd_tmp_B + dd_tmp_SW + dd_tmp_BS + dd_tmp_BW + velX_trm + velY_trm + velZ_trm;
-
-               velX[x] = velX_trm - dd_tmp_W  - dd_tmp_NW - dd_tmp_SW - dd_tmp_TW - dd_tmp_BW;
-               velY[x] = velY_trm + dd_tmp_NE - dd_tmp_S  - dd_tmp_SW - dd_tmp_SE - dd_tmp_TS - dd_tmp_BS;
-               velZ[x] = velZ_trm + dd_tmp_TN + dd_tmp_TE - dd_tmp_B  - dd_tmp_BN - dd_tmp_BS - dd_tmp_BW - dd_tmp_BE;
-
-               feq_common[x] = rho - real_t(1.5) * ( velX[x] * velX[x] + velY[x] * velY[x] + velZ[x] * velZ[x] );
-
-               dst->get( x, y, z, Stencil::idx[C] ) = dd_tmp_C * (real_t(1.0) - lambda_e) + lambda_e * t0 * feq_common[x];
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_NE = src->get(x-1, y-1, z, Stencil::idx[NE]);
-               const real_t dd_tmp_SW = src->get(x+1, y+1, z, Stencil::idx[SW]);
-
-               const real_t velXPY = velX[x] + velY[x];
-               const real_t  sym_NE_SW = lambda_e_scaled * ( dd_tmp_NE + dd_tmp_SW - fac2 * velXPY * velXPY - t2x2 * feq_common[x] );
-               const real_t asym_NE_SW = lambda_d_scaled * ( dd_tmp_NE - dd_tmp_SW - real_t(3.0) * t2x2 * velXPY );
-
-               dst->get( x, y, z, Stencil::idx[NE] ) = dd_tmp_NE - sym_NE_SW - asym_NE_SW;
-               dst->get( x, y, z, Stencil::idx[SW] ) = dd_tmp_SW - sym_NE_SW + asym_NE_SW;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_SE = src->get(x-1, y+1, z, Stencil::idx[SE]);
-               const real_t dd_tmp_NW = src->get(x+1, y-1, z, Stencil::idx[NW]);
-
-               const real_t velXMY = velX[x] - velY[x];
-               const real_t  sym_SE_NW = lambda_e_scaled * ( dd_tmp_SE + dd_tmp_NW - fac2 * velXMY * velXMY - t2x2 * feq_common[x] );
-               const real_t asym_SE_NW = lambda_d_scaled * ( dd_tmp_SE - dd_tmp_NW - real_t(3.0) * t2x2 * velXMY );
-
-               dst->get( x, y, z, Stencil::idx[SE] ) = dd_tmp_SE - sym_SE_NW - asym_SE_NW;
-               dst->get( x, y, z, Stencil::idx[NW] ) = dd_tmp_NW - sym_SE_NW + asym_SE_NW;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_TE = src->get(x-1, y, z-1, Stencil::idx[TE]);
-               const real_t dd_tmp_BW = src->get(x+1, y, z+1, Stencil::idx[BW]);
-
-               const real_t velXPZ = velX[x] + velZ[x];
-               const real_t  sym_TE_BW = lambda_e_scaled * ( dd_tmp_TE + dd_tmp_BW - fac2 * velXPZ * velXPZ - t2x2 * feq_common[x] );
-               const real_t asym_TE_BW = lambda_d_scaled * ( dd_tmp_TE - dd_tmp_BW - real_t(3.0) * t2x2 * velXPZ );
-
-               dst->get( x, y, z, Stencil::idx[TE] ) = dd_tmp_TE - sym_TE_BW - asym_TE_BW;
-               dst->get( x, y, z, Stencil::idx[BW] ) = dd_tmp_BW - sym_TE_BW + asym_TE_BW;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_BE = src->get(x-1, y, z+1, Stencil::idx[BE]);
-               const real_t dd_tmp_TW = src->get(x+1, y, z-1, Stencil::idx[TW]);
-
-               const real_t velXMZ = velX[x] - velZ[x];
-               const real_t  sym_BE_TW = lambda_e_scaled * ( dd_tmp_BE + dd_tmp_TW - fac2 * velXMZ * velXMZ - t2x2 * feq_common[x] );
-               const real_t asym_BE_TW = lambda_d_scaled * ( dd_tmp_BE - dd_tmp_TW - real_t(3.0) * t2x2 * velXMZ );
-
-               dst->get( x, y, z, Stencil::idx[BE] ) = dd_tmp_BE - sym_BE_TW - asym_BE_TW;
-               dst->get( x, y, z, Stencil::idx[TW] ) = dd_tmp_TW - sym_BE_TW + asym_BE_TW;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_TN = src->get(x, y-1, z-1, Stencil::idx[TN]);
-               const real_t dd_tmp_BS = src->get(x, y+1, z+1, Stencil::idx[BS]);
-
-               const real_t velYPZ = velY[x] + velZ[x];
-               const real_t  sym_TN_BS = lambda_e_scaled * ( dd_tmp_TN + dd_tmp_BS - fac2 * velYPZ * velYPZ - t2x2 * feq_common[x] );
-               const real_t asym_TN_BS = lambda_d_scaled * ( dd_tmp_TN - dd_tmp_BS - real_t(3.0) * t2x2 * velYPZ );
-
-               dst->get( x, y, z, Stencil::idx[TN] ) = dd_tmp_TN - sym_TN_BS - asym_TN_BS;
-               dst->get( x, y, z, Stencil::idx[BS] ) = dd_tmp_BS - sym_TN_BS + asym_TN_BS;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_BN = src->get(x, y-1, z+1, Stencil::idx[BN]);
-               const real_t dd_tmp_TS = src->get(x, y+1, z-1, Stencil::idx[TS]);
-
-               const real_t velYMZ = velY[x] - velZ[x];
-               const real_t  sym_BN_TS = lambda_e_scaled * ( dd_tmp_BN + dd_tmp_TS - fac2 * velYMZ * velYMZ - t2x2 * feq_common[x] );
-               const real_t asym_BN_TS = lambda_d_scaled * ( dd_tmp_BN - dd_tmp_TS - real_t(3.0) * t2x2 * velYMZ );
-
-               dst->get( x, y, z, Stencil::idx[BN] ) = dd_tmp_BN - sym_BN_TS - asym_BN_TS;
-               dst->get( x, y, z, Stencil::idx[TS] ) = dd_tmp_TS - sym_BN_TS + asym_BN_TS;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_N  = src->get(x, y-1, z, Stencil::idx[N]);
-               const real_t dd_tmp_S  = src->get(x, y+1, z, Stencil::idx[S]);
-
-               const real_t  sym_N_S = lambda_e_scaled * ( dd_tmp_N + dd_tmp_S - fac1 * velY[x] * velY[x] - t1x2 * feq_common[x] );
-               const real_t asym_N_S = lambda_d_scaled * ( dd_tmp_N - dd_tmp_S - real_t(3.0) * t1x2 * velY[x] );
-
-               dst->get( x, y, z, Stencil::idx[N] ) = dd_tmp_N - sym_N_S - asym_N_S;
-               dst->get( x, y, z, Stencil::idx[S] ) = dd_tmp_S - sym_N_S + asym_N_S;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_E  = src->get(x-1, y, z, Stencil::idx[E]);
-               const real_t dd_tmp_W  = src->get(x+1, y, z, Stencil::idx[W]);
-
-               const real_t  sym_E_W = lambda_e_scaled * ( dd_tmp_E + dd_tmp_W - fac1 * velX[x] * velX[x] - t1x2 * feq_common[x] );
-               const real_t asym_E_W = lambda_d_scaled * ( dd_tmp_E - dd_tmp_W - real_t(3.0) * t1x2 * velX[x] );
-
-               dst->get( x, y, z, Stencil::idx[E] ) = dd_tmp_E - sym_E_W - asym_E_W;
-               dst->get( x, y, z, Stencil::idx[W] ) = dd_tmp_W - sym_E_W + asym_E_W;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_T  = src->get(x, y, z-1, Stencil::idx[T]);
-               const real_t dd_tmp_B  = src->get(x, y, z+1, Stencil::idx[B]);
-
-               const real_t  sym_T_B = lambda_e_scaled * ( dd_tmp_T + dd_tmp_B  - fac1 * velZ[x] * velZ[x] - t1x2 * feq_common[x] );
-               const real_t asym_T_B = lambda_d_scaled * ( dd_tmp_T - dd_tmp_B - real_t(3.0) * t1x2 * velZ[x] );
-
-               dst->get( x, y, z, Stencil::idx[T] ) = dd_tmp_T - sym_T_B - asym_T_B;
-               dst->get( x, y, z, Stencil::idx[B] ) = dd_tmp_B - sym_T_B + asym_T_B;
-            }
-         }
-      }
-   }
-
-   delete[] velX;
-   delete[] velY;
-   delete[] velZ;
-   delete[] feq_common;
-
-#ifdef _OPENMP
-   }
-#endif
-
-   src->swapDataPointers( dst );
-}
-
-template< typename LatticeModel_T >
-void SplitPureSweep< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag, collision_model::TRT_tag >::value &&
-                                                              std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value &&
-                                                              ! LatticeModel_T::compressible >::value &&
-                                                              std::is_same< typename LatticeModel_T::ForceModel::tag, force_model::None_tag >::value
-                                                              >::type
-   >::stream( IBlock * const block, const uint_t numberOfGhostLayersToInclude )
-{
-   PdfField_T * src( NULL );
-   PdfField_T * dst( NULL );
-
-   this->getFields( block, src, dst );
-
-   StreamEverything< LatticeModel_T >::execute( src, dst, numberOfGhostLayersToInclude );
-}
-
-template< typename LatticeModel_T >
-void SplitPureSweep< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag, collision_model::TRT_tag >::value &&
-                                                              std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value &&
-                                                              ! LatticeModel_T::compressible >::value &&
-                                                              std::is_same< typename LatticeModel_T::ForceModel::tag, force_model::None_tag >::value
-                                                              >::type
-#ifdef NDEBUG
-   >::collide( IBlock * const block, const uint_t /*numberOfGhostLayersToInclude*/ )
-#else
-   >::collide( IBlock * const block, const uint_t numberOfGhostLayersToInclude )
-#endif
-{
-   WALBERLA_ASSERT_EQUAL( numberOfGhostLayersToInclude, uint_t(0) ); // the implementation right now doesn't support inclusion of ghost layers in collide step!
-
-   PdfField_T * src = this->getSrcField( block );
-
-   WALBERLA_ASSERT_NOT_NULLPTR( src );
-
-   WALBERLA_ASSERT_GREATER_EQUAL( src->nrOfGhostLayers(), numberOfGhostLayersToInclude );
-
-   // constants used during stream/collide
-
-   const real_t lambda_e =  src->latticeModel().collisionModel().lambda_e();
-   const real_t lambda_d =  src->latticeModel().collisionModel().lambda_d();
-
-   // common prefactors for calculating the equilibrium parts
-   const real_t t0   = real_t(1.0) / real_t(3.0);                 // 1/3      for C
-   const real_t t1x2 = real_t(1.0) / real_t(18.0) * real_t(2.0);  // 1/18 * 2 for N, S, W, E, T, B
-   const real_t t2x2 = real_t(1.0) / real_t(36.0) * real_t(2.0);  // 1/36 * 2 else
-
-   const real_t inv2csq2 = real_t(1.0) / ( real_t(2.0) * ( real_t(1.0) / real_t(3.0) ) * ( real_t(1.0) / real_t(3.0) ) ); //speed of sound related factor for equilibrium distribution function
-   const real_t fac1     = t1x2 * inv2csq2;
-   const real_t fac2     = t2x2 * inv2csq2;
-
-   // relaxation parameter variables
-   const real_t lambda_e_scaled = real_t(0.5) * lambda_e; // 0.5 times the usual value ...
-   const real_t lambda_d_scaled = real_t(0.5) * lambda_d; // ... due to the way of calculations
-
-   // loop constants
-
-   const cell_idx_t xSize = cell_idx_c( src->xSize() );
-   const cell_idx_t ySize = cell_idx_c( src->ySize() );
-   const cell_idx_t zSize = cell_idx_c( src->zSize() );
-
-#ifdef _OPENMP
-   #pragma omp parallel
-   {
-#endif
-   // temporaries, calculated by the first innermost loop
-
-   real_t* velX = new real_t[ uint_c(xSize) ];
-   real_t* velY = new real_t[ uint_c(xSize) ];
-   real_t* velZ = new real_t[ uint_c(xSize) ];
-
-   real_t* feq_common = new real_t[ uint_c(xSize) ];
-
-   if( src->layout() == field::fzyx )
-   {
-      #ifdef _OPENMP
-      const int izSize = int_c( zSize );
-      #pragma omp for schedule(static)
-      for( int iz = 0; iz < izSize; ++iz ) {
-         cell_idx_t z = cell_idx_c( iz );
-      #else
-      for( cell_idx_t z = 0; z < zSize; ++z ) {
-      #endif
-         for( cell_idx_t y = 0; y != ySize; ++y )
-         {
-            using namespace stencil;
-
-            real_t* pC  = &src->get( 0, y, z, Stencil::idx[C]);
-            real_t* pN  = &src->get( 0, y, z, Stencil::idx[N]);
-            real_t* pS  = &src->get( 0, y, z, Stencil::idx[S]);
-            real_t* pW  = &src->get( 0, y, z, Stencil::idx[W]);
-            real_t* pE  = &src->get( 0, y, z, Stencil::idx[E]);
-            real_t* pT  = &src->get( 0, y, z, Stencil::idx[T]);
-            real_t* pB  = &src->get( 0, y, z, Stencil::idx[B]);
-            real_t* pNW = &src->get( 0, y, z, Stencil::idx[NW]);
-            real_t* pNE = &src->get( 0, y, z, Stencil::idx[NE]);
-            real_t* pSW = &src->get( 0, y, z, Stencil::idx[SW]);
-            real_t* pSE = &src->get( 0, y, z, Stencil::idx[SE]);
-            real_t* pTN = &src->get( 0, y, z, Stencil::idx[TN]);
-            real_t* pTS = &src->get( 0, y, z, Stencil::idx[TS]);
-            real_t* pTW = &src->get( 0, y, z, Stencil::idx[TW]);
-            real_t* pTE = &src->get( 0, y, z, Stencil::idx[TE]);
-            real_t* pBN = &src->get( 0, y, z, Stencil::idx[BN]);
-            real_t* pBS = &src->get( 0, y, z, Stencil::idx[BS]);
-            real_t* pBW = &src->get( 0, y, z, Stencil::idx[BW]);
-            real_t* pBE = &src->get( 0, y, z, Stencil::idx[BE]);
-
-            __alignx( 32, velX );
-            __alignx( 32, velY );
-            __alignx( 32, velZ );
-            __alignx( 32, feq_common  );
-
-            __alignx( 32, pNE );
-            __alignx( 32, pN  );
-            __alignx( 32, pNW );
-            __alignx( 32, pW  );
-            __alignx( 32, pSW );
-            __alignx( 32, pS  );
-            __alignx( 32, pSE );
-            __alignx( 32, pE  );
-            __alignx( 32, pT  );
-            __alignx( 32, pTE );
-            __alignx( 32, pTN );
-            __alignx( 32, pTW );
-            __alignx( 32, pTS );
-            __alignx( 32, pB  );
-            __alignx( 32, pBE );
-            __alignx( 32, pBN );
-            __alignx( 32, pBW );
-            __alignx( 32, pBS );
-            __alignx( 32, pC  );
-
-            #pragma disjoint( *velX, *velY, *velZ, *feq_common, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, \
-                              t0, t1x2, t2x2, fac1, fac2, lambda_e_scaled, lambda_d_scaled )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velX_trm = pE[x] + pNE[x] + pSE[x] + pTE[x] + pBE[x];
-               const real_t velY_trm = pN[x] + pNW[x] + pTN[x] + pBN[x];
-               const real_t velZ_trm = pT[x] + pTS[x] + pTW[x];
-
-               const real_t rho = pC[x] + pS[x] + pW[x] + pB[x] + pSW[x] + pBS[x] + pBW[x] + velX_trm + velY_trm + velZ_trm;
-
-               velX[x] = velX_trm - pW[x]  - pNW[x] - pSW[x] - pTW[x] - pBW[x];
-               velY[x] = velY_trm + pNE[x] - pS[x]  - pSW[x] - pSE[x] - pTS[x] - pBS[x];
-               velZ[x] = velZ_trm + pTN[x] + pTE[x] - pB[x]  - pBN[x] - pBS[x] - pBW[x] - pBE[x];
-
-               feq_common[x] = rho - real_t(1.5) * ( velX[x] * velX[x] + velY[x] * velY[x] + velZ[x] * velZ[x] );
-
-               pC[x] = pC[x] * (real_t(1.0) - lambda_e) + lambda_e * t0 * feq_common[x];
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velXPY = velX[x] + velY[x];
-               const real_t  sym_NE_SW = lambda_e_scaled * ( pNE[x] + pSW[x] - fac2 * velXPY * velXPY - t2x2 * feq_common[x] );
-               const real_t asym_NE_SW = lambda_d_scaled * ( pNE[x] - pSW[x] - real_t(3.0) * t2x2 * velXPY );
-
-               pNE[x] = pNE[x] - sym_NE_SW - asym_NE_SW;
-               pSW[x] = pSW[x] - sym_NE_SW + asym_NE_SW;
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velXMY = velX[x] - velY[x];
-               const real_t  sym_SE_NW = lambda_e_scaled * ( pSE[x] + pNW[x] - fac2 * velXMY * velXMY - t2x2 * feq_common[x] );
-               const real_t asym_SE_NW = lambda_d_scaled * ( pSE[x] - pNW[x] - real_t(3.0) * t2x2 * velXMY );
-
-               pSE[x] = pSE[x] - sym_SE_NW - asym_SE_NW;
-               pNW[x] = pNW[x] - sym_SE_NW + asym_SE_NW;
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velXPZ = velX[x] + velZ[x];
-               const real_t  sym_TE_BW = lambda_e_scaled * ( pTE[x] + pBW[x] - fac2 * velXPZ * velXPZ - t2x2 * feq_common[x] );
-               const real_t asym_TE_BW = lambda_d_scaled * ( pTE[x] - pBW[x] - real_t(3.0) * t2x2 * velXPZ );
-
-               pTE[x] = pTE[x] - sym_TE_BW - asym_TE_BW;
-               pBW[x] = pBW[x] - sym_TE_BW + asym_TE_BW;
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velXMZ = velX[x] - velZ[x];
-               const real_t  sym_BE_TW = lambda_e_scaled * ( pBE[x] + pTW[x] - fac2 * velXMZ * velXMZ - t2x2 * feq_common[x] );
-               const real_t asym_BE_TW = lambda_d_scaled * ( pBE[x] - pTW[x] - real_t(3.0) * t2x2 * velXMZ );
-
-               pBE[x] = pBE[x] - sym_BE_TW - asym_BE_TW;
-               pTW[x] = pTW[x] - sym_BE_TW + asym_BE_TW;
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velYPZ = velY[x] + velZ[x];
-               const real_t  sym_TN_BS = lambda_e_scaled * ( pTN[x] + pBS[x] - fac2 * velYPZ * velYPZ - t2x2 * feq_common[x] );
-               const real_t asym_TN_BS = lambda_d_scaled * ( pTN[x] - pBS[x] - real_t(3.0) * t2x2 * velYPZ );
-
-               pTN[x] = pTN[x] - sym_TN_BS - asym_TN_BS;
-               pBS[x] = pBS[x] - sym_TN_BS + asym_TN_BS;
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velYMZ = velY[x] - velZ[x];
-               const real_t  sym_BN_TS = lambda_e_scaled * ( pBN[x] + pTS[x] - fac2 * velYMZ * velYMZ - t2x2 * feq_common[x] );
-               const real_t asym_BN_TS = lambda_d_scaled * ( pBN[x] - pTS[x] - real_t(3.0) * t2x2 * velYMZ );
-
-               pBN[x] = pBN[x] - sym_BN_TS - asym_BN_TS;
-               pTS[x] = pTS[x] - sym_BN_TS + asym_BN_TS;
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t  sym_N_S = lambda_e_scaled * ( pN[x] + pS[x] - fac1 * velY[x] * velY[x] - t1x2 * feq_common[x] );
-               const real_t asym_N_S = lambda_d_scaled * ( pN[x] - pS[x] - real_t(3.0) * t1x2 * velY[x] );
-
-               pN[x] = pN[x] - sym_N_S - asym_N_S;
-               pS[x] = pS[x] - sym_N_S + asym_N_S;
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t  sym_E_W = lambda_e_scaled * ( pE[x] + pW[x] - fac1 * velX[x] * velX[x] - t1x2 * feq_common[x] );
-               const real_t asym_E_W = lambda_d_scaled * ( pE[x] - pW[x] - real_t(3.0) * t1x2 * velX[x] );
-
-               pE[x] = pE[x] - sym_E_W - asym_E_W;
-               pW[x] = pW[x] - sym_E_W + asym_E_W;
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t  sym_T_B = lambda_e_scaled * ( pT[x] + pB[x]  - fac1 * velZ[x] * velZ[x] - t1x2 * feq_common[x] );
-               const real_t asym_T_B = lambda_d_scaled * ( pT[x] - pB[x] - real_t(3.0) * t1x2 * velZ[x] );
-
-               pT[x] = pT[x] - sym_T_B - asym_T_B;
-               pB[x] = pB[x] - sym_T_B + asym_T_B;
-            }
-         }
-      }
-   }
-   else // ==> src->layout() == field::zyxf
-   {
-      #ifdef _OPENMP
-      const int izSize = int_c( zSize );
-      #pragma omp for schedule(static)
-      for( int iz = 0; iz < izSize; ++iz ) {
-         cell_idx_t z = cell_idx_c( iz );
-      #else
-      for( cell_idx_t z = 0; z < zSize; ++z ) {
-      #endif
-         for( cell_idx_t y = 0; y != ySize; ++y )
-         {
-            using namespace stencil;
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_C  = src->get( x, y, z, Stencil::idx[C]  );
-               const real_t dd_tmp_N  = src->get( x, y, z, Stencil::idx[N]  );
-               const real_t dd_tmp_S  = src->get( x, y, z, Stencil::idx[S]  );
-               const real_t dd_tmp_W  = src->get( x, y, z, Stencil::idx[W]  );
-               const real_t dd_tmp_E  = src->get( x, y, z, Stencil::idx[E]  );
-               const real_t dd_tmp_T  = src->get( x, y, z, Stencil::idx[T]  );
-               const real_t dd_tmp_B  = src->get( x, y, z, Stencil::idx[B]  );
-               const real_t dd_tmp_NW = src->get( x, y, z, Stencil::idx[NW] );
-               const real_t dd_tmp_NE = src->get( x, y, z, Stencil::idx[NE] );
-               const real_t dd_tmp_SW = src->get( x, y, z, Stencil::idx[SW] );
-               const real_t dd_tmp_SE = src->get( x, y, z, Stencil::idx[SE] );
-               const real_t dd_tmp_TN = src->get( x, y, z, Stencil::idx[TN] );
-               const real_t dd_tmp_TS = src->get( x, y, z, Stencil::idx[TS] );
-               const real_t dd_tmp_TW = src->get( x, y, z, Stencil::idx[TW] );
-               const real_t dd_tmp_TE = src->get( x, y, z, Stencil::idx[TE] );
-               const real_t dd_tmp_BN = src->get( x, y, z, Stencil::idx[BN] );
-               const real_t dd_tmp_BS = src->get( x, y, z, Stencil::idx[BS] );
-               const real_t dd_tmp_BW = src->get( x, y, z, Stencil::idx[BW] );
-               const real_t dd_tmp_BE = src->get( x, y, z, Stencil::idx[BE] );
-
-               const real_t velX_trm = dd_tmp_E + dd_tmp_NE + dd_tmp_SE + dd_tmp_TE + dd_tmp_BE;
-               const real_t velY_trm = dd_tmp_N + dd_tmp_NW + dd_tmp_TN + dd_tmp_BN;
-               const real_t velZ_trm = dd_tmp_T + dd_tmp_TS + dd_tmp_TW;
-
-               const real_t rho = dd_tmp_C + dd_tmp_S + dd_tmp_W + dd_tmp_B + dd_tmp_SW + dd_tmp_BS + dd_tmp_BW + velX_trm + velY_trm + velZ_trm;
-
-               velX[x] = velX_trm - dd_tmp_W  - dd_tmp_NW - dd_tmp_SW - dd_tmp_TW - dd_tmp_BW;
-               velY[x] = velY_trm + dd_tmp_NE - dd_tmp_S  - dd_tmp_SW - dd_tmp_SE - dd_tmp_TS - dd_tmp_BS;
-               velZ[x] = velZ_trm + dd_tmp_TN + dd_tmp_TE - dd_tmp_B  - dd_tmp_BN - dd_tmp_BS - dd_tmp_BW - dd_tmp_BE;
-
-               feq_common[x] = rho - real_t(1.5) * ( velX[x] * velX[x] + velY[x] * velY[x] + velZ[x] * velZ[x] );
-
-               src->get( x, y, z, Stencil::idx[C] ) = dd_tmp_C * (real_t(1.0) - lambda_e) + lambda_e * t0 * feq_common[x];
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_NE = src->get( x, y, z, Stencil::idx[NE]);
-               const real_t dd_tmp_SW = src->get( x, y, z, Stencil::idx[SW]);
-
-               const real_t velXPY = velX[x] + velY[x];
-               const real_t  sym_NE_SW = lambda_e_scaled * ( dd_tmp_NE + dd_tmp_SW - fac2 * velXPY * velXPY - t2x2 * feq_common[x] );
-               const real_t asym_NE_SW = lambda_d_scaled * ( dd_tmp_NE - dd_tmp_SW - real_t(3.0) * t2x2 * velXPY );
-
-               src->get( x, y, z, Stencil::idx[NE] ) = dd_tmp_NE - sym_NE_SW - asym_NE_SW;
-               src->get( x, y, z, Stencil::idx[SW] ) = dd_tmp_SW - sym_NE_SW + asym_NE_SW;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_SE = src->get( x, y, z, Stencil::idx[SE]);
-               const real_t dd_tmp_NW = src->get( x, y, z, Stencil::idx[NW]);
-
-               const real_t velXMY = velX[x] - velY[x];
-               const real_t  sym_SE_NW = lambda_e_scaled * ( dd_tmp_SE + dd_tmp_NW - fac2 * velXMY * velXMY - t2x2 * feq_common[x] );
-               const real_t asym_SE_NW = lambda_d_scaled * ( dd_tmp_SE - dd_tmp_NW - real_t(3.0) * t2x2 * velXMY );
-
-               src->get( x, y, z, Stencil::idx[SE] ) = dd_tmp_SE - sym_SE_NW - asym_SE_NW;
-               src->get( x, y, z, Stencil::idx[NW] ) = dd_tmp_NW - sym_SE_NW + asym_SE_NW;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_TE = src->get( x, y, z, Stencil::idx[TE]);
-               const real_t dd_tmp_BW = src->get( x, y, z, Stencil::idx[BW]);
-
-               const real_t velXPZ = velX[x] + velZ[x];
-               const real_t  sym_TE_BW = lambda_e_scaled * ( dd_tmp_TE + dd_tmp_BW - fac2 * velXPZ * velXPZ - t2x2 * feq_common[x] );
-               const real_t asym_TE_BW = lambda_d_scaled * ( dd_tmp_TE - dd_tmp_BW - real_t(3.0) * t2x2 * velXPZ );
-
-               src->get( x, y, z, Stencil::idx[TE] ) = dd_tmp_TE - sym_TE_BW - asym_TE_BW;
-               src->get( x, y, z, Stencil::idx[BW] ) = dd_tmp_BW - sym_TE_BW + asym_TE_BW;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_BE = src->get( x, y, z, Stencil::idx[BE]);
-               const real_t dd_tmp_TW = src->get( x, y, z, Stencil::idx[TW]);
-
-               const real_t velXMZ = velX[x] - velZ[x];
-               const real_t  sym_BE_TW = lambda_e_scaled * ( dd_tmp_BE + dd_tmp_TW - fac2 * velXMZ * velXMZ - t2x2 * feq_common[x] );
-               const real_t asym_BE_TW = lambda_d_scaled * ( dd_tmp_BE - dd_tmp_TW - real_t(3.0) * t2x2 * velXMZ );
-
-               src->get( x, y, z, Stencil::idx[BE] ) = dd_tmp_BE - sym_BE_TW - asym_BE_TW;
-               src->get( x, y, z, Stencil::idx[TW] ) = dd_tmp_TW - sym_BE_TW + asym_BE_TW;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_TN = src->get( x, y, z, Stencil::idx[TN]);
-               const real_t dd_tmp_BS = src->get( x, y, z, Stencil::idx[BS]);
-
-               const real_t velYPZ = velY[x] + velZ[x];
-               const real_t  sym_TN_BS = lambda_e_scaled * ( dd_tmp_TN + dd_tmp_BS - fac2 * velYPZ * velYPZ - t2x2 * feq_common[x] );
-               const real_t asym_TN_BS = lambda_d_scaled * ( dd_tmp_TN - dd_tmp_BS - real_t(3.0) * t2x2 * velYPZ );
-
-               src->get( x, y, z, Stencil::idx[TN] ) = dd_tmp_TN - sym_TN_BS - asym_TN_BS;
-               src->get( x, y, z, Stencil::idx[BS] ) = dd_tmp_BS - sym_TN_BS + asym_TN_BS;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_BN = src->get( x, y, z, Stencil::idx[BN]);
-               const real_t dd_tmp_TS = src->get( x, y, z, Stencil::idx[TS]);
-
-               const real_t velYMZ = velY[x] - velZ[x];
-               const real_t  sym_BN_TS = lambda_e_scaled * ( dd_tmp_BN + dd_tmp_TS - fac2 * velYMZ * velYMZ - t2x2 * feq_common[x] );
-               const real_t asym_BN_TS = lambda_d_scaled * ( dd_tmp_BN - dd_tmp_TS - real_t(3.0) * t2x2 * velYMZ );
-
-               src->get( x, y, z, Stencil::idx[BN] ) = dd_tmp_BN - sym_BN_TS - asym_BN_TS;
-               src->get( x, y, z, Stencil::idx[TS] ) = dd_tmp_TS - sym_BN_TS + asym_BN_TS;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_N  = src->get( x, y, z, Stencil::idx[N]);
-               const real_t dd_tmp_S  = src->get( x, y, z, Stencil::idx[S]);
-
-               const real_t  sym_N_S = lambda_e_scaled * ( dd_tmp_N + dd_tmp_S - fac1 * velY[x] * velY[x] - t1x2 * feq_common[x] );
-               const real_t asym_N_S = lambda_d_scaled * ( dd_tmp_N - dd_tmp_S - real_t(3.0) * t1x2 * velY[x] );
-
-               src->get( x, y, z, Stencil::idx[N] ) = dd_tmp_N - sym_N_S - asym_N_S;
-               src->get( x, y, z, Stencil::idx[S] ) = dd_tmp_S - sym_N_S + asym_N_S;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_E  = src->get( x, y, z, Stencil::idx[E]);
-               const real_t dd_tmp_W  = src->get( x, y, z, Stencil::idx[W]);
-
-               const real_t  sym_E_W = lambda_e_scaled * ( dd_tmp_E + dd_tmp_W - fac1 * velX[x] * velX[x] - t1x2 * feq_common[x] );
-               const real_t asym_E_W = lambda_d_scaled * ( dd_tmp_E - dd_tmp_W - real_t(3.0) * t1x2 * velX[x] );
-
-               src->get( x, y, z, Stencil::idx[E] ) = dd_tmp_E - sym_E_W - asym_E_W;
-               src->get( x, y, z, Stencil::idx[W] ) = dd_tmp_W - sym_E_W + asym_E_W;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_T  = src->get( x, y, z, Stencil::idx[T]);
-               const real_t dd_tmp_B  = src->get( x, y, z, Stencil::idx[B]);
-
-               const real_t  sym_T_B = lambda_e_scaled * ( dd_tmp_T + dd_tmp_B  - fac1 * velZ[x] * velZ[x] - t1x2 * feq_common[x] );
-               const real_t asym_T_B = lambda_d_scaled * ( dd_tmp_T - dd_tmp_B - real_t(3.0) * t1x2 * velZ[x] );
-
-               src->get( x, y, z, Stencil::idx[T] ) = dd_tmp_T - sym_T_B - asym_T_B;
-               src->get( x, y, z, Stencil::idx[B] ) = dd_tmp_B - sym_T_B + asym_T_B;
-            }
-         }
-      }
-   }
-
-   delete[] velX;
-   delete[] velY;
-   delete[] velZ;
-   delete[] feq_common;
-
-#ifdef _OPENMP
-   }
-#endif
-}
-
-
-
-///////////////////////////////
-// Specialization for D3Q19: //
-// - compressible            //
-// - no additional forces    //
-///////////////////////////////
-
-template< typename LatticeModel_T >
-class SplitPureSweep< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag, collision_model::TRT_tag >::value &&
-                                                               std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value &&
-                                                               LatticeModel_T::compressible &&
-                                                               std::is_same< typename LatticeModel_T::ForceModel::tag, force_model::None_tag >::value
-                                                               >::type > :
-   public SweepBase< LatticeModel_T >
-{
-public:
-
-   static_assert( (std::is_same< typename LatticeModel_T::CollisionModel::tag, collision_model::TRT_tag >::value), "Only works with TRT!" );
-   static_assert( (std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value),                       "Only works with D3Q19!" );
-   static_assert( LatticeModel_T::compressible,                                                                      "Only works with compressible models!" );
-   static_assert( (std::is_same< typename LatticeModel_T::ForceModel::tag, force_model::None_tag >::value),        "Only works without additional forces!" );
-   static_assert( LatticeModel_T::equilibriumAccuracyOrder == 2, "Only works for lattice models that require the equilibrium distribution to be order 2 accurate!" );
-
-   typedef typename SweepBase<LatticeModel_T>::PdfField_T  PdfField_T;
-   typedef typename LatticeModel_T::Stencil                Stencil;
-
-   // block has NO dst pdf field
-   SplitPureSweep( const BlockDataID & pdfField ) :
-      SweepBase<LatticeModel_T>( pdfField ) {}
-
-   // every block has a dedicated dst pdf field
-   SplitPureSweep( const BlockDataID & src, const BlockDataID & dst ) :
-      SweepBase<LatticeModel_T>( src, dst ) {}
-
-   void operator()( IBlock * const block );
-
-   void stream ( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) );
-   void collide( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) );
-};
-
-template< typename LatticeModel_T >
-void SplitPureSweep< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag, collision_model::TRT_tag >::value &&
-                                                              std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value &&
-                                                              LatticeModel_T::compressible,
-                                                              std::is_same< typename LatticeModel_T::ForceModel::tag, force_model::None_tag >::value
-                                                              >::type
-   >::operator()( IBlock * const block )
-{
-   PdfField_T * src( NULL );
-   PdfField_T * dst( NULL );
-
-   this->getFields( block, src, dst );
-
-   WALBERLA_ASSERT_NOT_NULLPTR( src );
-   WALBERLA_ASSERT_NOT_NULLPTR( dst );
-
-   WALBERLA_ASSERT_GREATER_EQUAL( src->nrOfGhostLayers(), 1 );
-
-   // constants used during stream/collide
-
-   const real_t lambda_e =  src->latticeModel().collisionModel().lambda_e();
-   const real_t lambda_d =  src->latticeModel().collisionModel().lambda_d();
-
-   // common prefactors for calculating the equilibrium parts
-   const real_t t0_0   = real_t(1.0) / real_t(3.0);                 // 1/3      for C
-   const real_t t1x2_0 = real_t(1.0) / real_t(18.0) * real_t(2.0);  // 1/18 * 2 for N, S, W, E, T, B
-   const real_t t2x2_0 = real_t(1.0) / real_t(36.0) * real_t(2.0);  // 1/36 * 2 else
-
-   const real_t inv2csq2 = real_t(1.0) / ( real_t(2.0) * ( real_t(1.0) / real_t(3.0) ) * ( real_t(1.0) / real_t(3.0) ) ); //speed of sound related factor for equilibrium distribution function
-
-   // relaxation parameter variables
-   const real_t lambda_e_scaled = real_t(0.5) * lambda_e; // 0.5 times the usual value ...
-   const real_t lambda_d_scaled = real_t(0.5) * lambda_d; // ... due to the way of calculations
-
-   // loop constants
-
-   const cell_idx_t xSize = cell_idx_c( src->xSize() );
-   const cell_idx_t ySize = cell_idx_c( src->ySize() );
-   const cell_idx_t zSize = cell_idx_c( src->zSize() );
-
-#ifdef _OPENMP
-   #pragma omp parallel
-   {
-#endif
-   // temporaries, calculated by the first innermost loop
-
-   real_t* velX = new real_t[ uint_c(xSize) ];
-   real_t* velY = new real_t[ uint_c(xSize) ];
-   real_t* velZ = new real_t[ uint_c(xSize) ];
-
-   real_t* t1x2 = new real_t[ uint_c(xSize) ];
-   real_t* t2x2 = new real_t[ uint_c(xSize) ];
-   real_t* fac1 = new real_t[ uint_c(xSize) ];
-   real_t* fac2 = new real_t[ uint_c(xSize) ];
-
-   real_t* feq_common = new real_t[ uint_c(xSize) ];
-
-   if( src->layout() == field::fzyx && dst->layout() == field::fzyx )
-   {
-      #ifdef _OPENMP
-      const int izSize = int_c( zSize );
-      #pragma omp for schedule(static)
-      for( int iz = 0; iz < izSize; ++iz ) {
-         cell_idx_t z = cell_idx_c( iz );
-      #else
-      for( cell_idx_t z = 0; z < zSize; ++z ) {
-      #endif
-         for( cell_idx_t y = 0; y != ySize; ++y )
-         {
-            using namespace stencil;
-
-            real_t* pNE = &src->get(-1, y-1, z  , Stencil::idx[NE]);
-            real_t* pN  = &src->get(0 , y-1, z  , Stencil::idx[N]);
-            real_t* pNW = &src->get(+1, y-1, z  , Stencil::idx[NW]);
-            real_t* pW  = &src->get(+1, y  , z  , Stencil::idx[W]);
-            real_t* pSW = &src->get(+1, y+1, z  , Stencil::idx[SW]);
-            real_t* pS  = &src->get(0 , y+1, z  , Stencil::idx[S]);
-            real_t* pSE = &src->get(-1, y+1, z  , Stencil::idx[SE]);
-            real_t* pE  = &src->get(-1, y  , z  , Stencil::idx[E]);
-            real_t* pT  = &src->get(0 , y  , z-1, Stencil::idx[T]);
-            real_t* pTE = &src->get(-1, y  , z-1, Stencil::idx[TE]);
-            real_t* pTN = &src->get(0 , y-1, z-1, Stencil::idx[TN]);
-            real_t* pTW = &src->get(+1, y  , z-1, Stencil::idx[TW]);
-            real_t* pTS = &src->get(0 , y+1, z-1, Stencil::idx[TS]);
-            real_t* pB  = &src->get(0 , y  , z+1, Stencil::idx[B]);
-            real_t* pBE = &src->get(-1, y  , z+1, Stencil::idx[BE]);
-            real_t* pBN = &src->get(0 , y-1, z+1, Stencil::idx[BN]);
-            real_t* pBW = &src->get(+1, y  , z+1, Stencil::idx[BW]);
-            real_t* pBS = &src->get(0 , y+1, z+1, Stencil::idx[BS]);
-            real_t* pC  = &src->get(0 , y  , z  , Stencil::idx[C]);
-
-            real_t* dC = &dst->get(0,y,z,Stencil::idx[C]);
-
-            __alignx( 32, velX );
-            __alignx( 32, velY );
-            __alignx( 32, velZ );
-            __alignx( 32, t1x2 );
-            __alignx( 32, t2x2 );
-            __alignx( 32, fac1 );
-            __alignx( 32, fac2 );
-            __alignx( 32, feq_common  );
-
-            __alignx( 32, pNE );
-            __alignx( 32, pN  );
-            __alignx( 32, pNW );
-            __alignx( 32, pW  );
-            __alignx( 32, pSW );
-            __alignx( 32, pS  );
-            __alignx( 32, pSE );
-            __alignx( 32, pE  );
-            __alignx( 32, pT  );
-            __alignx( 32, pTE );
-            __alignx( 32, pTN );
-            __alignx( 32, pTW );
-            __alignx( 32, pTS );
-            __alignx( 32, pB  );
-            __alignx( 32, pBE );
-            __alignx( 32, pBN );
-            __alignx( 32, pBW );
-            __alignx( 32, pBS );
-            __alignx( 32, pC  );
-
-            __alignx( 32, dC  );
-
-            #pragma disjoint( *velX, *velY, *velZ, *t1x2, *t2x2, *fac1, *fac2, *feq_common, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              t0_0, t1x2_0, t2x2_0, inv2csq2, lambda_e_scaled, lambda_d_scaled )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velX_trm = pE[x] + pNE[x] + pSE[x] + pTE[x] + pBE[x];
-               const real_t velY_trm = pN[x] + pNW[x] + pTN[x] + pBN[x];
-               const real_t velZ_trm = pT[x] + pTS[x] + pTW[x];
-
-               const real_t rho = pC[x] + pS[x] + pW[x] + pB[x] + pSW[x] + pBS[x] + pBW[x] + velX_trm + velY_trm + velZ_trm;
-               const real_t invRho = real_t(1.0) / rho;
-
-               velX[x] = invRho * ( velX_trm - pW[x]  - pNW[x] - pSW[x] - pTW[x] - pBW[x] );
-               velY[x] = invRho * ( velY_trm + pNE[x] - pS[x]  - pSW[x] - pSE[x] - pTS[x] - pBS[x] );
-               velZ[x] = invRho * ( velZ_trm + pTN[x] + pTE[x] - pB[x]  - pBN[x] - pBS[x] - pBW[x] - pBE[x] );
-
-               t1x2[x] = t1x2_0 * rho;
-               t2x2[x] = t2x2_0 * rho;
-               fac1[x] = t1x2_0 * rho * inv2csq2;
-               fac2[x] = t2x2_0 * rho * inv2csq2;
-
-               feq_common[x] = real_t(1.0) - real_t(1.5) * ( velX[x] * velX[x] + velY[x] * velY[x] + velZ[x] * velZ[x] );
-
-               dC[x] = pC[x] * (real_t(1.0) - lambda_e) + lambda_e * t0_0 * rho * feq_common[x];
-            }
-
-            real_t* dNE = &dst->get(0,y,z,Stencil::idx[NE]);
-            real_t* dSW = &dst->get(0,y,z,Stencil::idx[SW]);
-
-            __alignx( 32, dNE );
-            __alignx( 32, dSW );
-
-            #pragma disjoint( *dNE, *dSW, \
-                              *velX, *velY, *velZ, *t1x2, *t2x2, *fac1, *fac2, *feq_common, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              t0_0, t1x2_0, t2x2_0, inv2csq2, lambda_e_scaled, lambda_d_scaled )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velXPY = velX[x] + velY[x];
-               const real_t  sym_NE_SW = lambda_e_scaled * ( pNE[x] + pSW[x] - fac2[x] * velXPY * velXPY - t2x2[x] * feq_common[x] );
-               const real_t asym_NE_SW = lambda_d_scaled * ( pNE[x] - pSW[x] - real_t(3.0) * t2x2[x] * velXPY );
-
-               dNE[x] = pNE[x] - sym_NE_SW - asym_NE_SW;
-               dSW[x] = pSW[x] - sym_NE_SW + asym_NE_SW;
-            }
-
-            real_t* dSE = &dst->get(0,y,z,Stencil::idx[SE]);
-            real_t* dNW = &dst->get(0,y,z,Stencil::idx[NW]);
-
-            __alignx( 32, dSE );
-            __alignx( 32, dNW );
-
-            #pragma disjoint( *dSE, *dNW, *dNE, *dSW, \
-                              *velX, *velY, *velZ, *t1x2, *t2x2, *fac1, *fac2, *feq_common, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              t0_0, t1x2_0, t2x2_0, inv2csq2, lambda_e_scaled, lambda_d_scaled )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velXMY = velX[x] - velY[x];
-               const real_t  sym_SE_NW = lambda_e_scaled * ( pSE[x] + pNW[x] - fac2[x] * velXMY * velXMY - t2x2[x] * feq_common[x] );
-               const real_t asym_SE_NW = lambda_d_scaled * ( pSE[x] - pNW[x] - real_t(3.0) * t2x2[x] * velXMY );
-
-               dSE[x] = pSE[x] - sym_SE_NW - asym_SE_NW;
-               dNW[x] = pNW[x] - sym_SE_NW + asym_SE_NW;
-            }
-
-            real_t* dTE = &dst->get(0,y,z,Stencil::idx[TE]);
-            real_t* dBW = &dst->get(0,y,z,Stencil::idx[BW]);
-
-            __alignx( 32, dTE );
-            __alignx( 32, dBW );
-
-            #pragma disjoint( *dTE, *dBW, *dSE, *dNW, *dNE, *dSW, \
-                              *velX, *velY, *velZ, *t1x2, *t2x2, *fac1, *fac2, *feq_common, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              t0_0, t1x2_0, t2x2_0, inv2csq2, lambda_e_scaled, lambda_d_scaled )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velXPZ = velX[x] + velZ[x];
-               const real_t  sym_TE_BW = lambda_e_scaled * ( pTE[x] + pBW[x] - fac2[x] * velXPZ * velXPZ - t2x2[x] * feq_common[x] );
-               const real_t asym_TE_BW = lambda_d_scaled * ( pTE[x] - pBW[x] - real_t(3.0) * t2x2[x] * velXPZ );
-
-               dTE[x] = pTE[x] - sym_TE_BW - asym_TE_BW;
-               dBW[x] = pBW[x] - sym_TE_BW + asym_TE_BW;
-            }
-
-            real_t* dBE = &dst->get(0,y,z,Stencil::idx[BE]);
-            real_t* dTW = &dst->get(0,y,z,Stencil::idx[TW]);
-
-            __alignx( 32, dBE );
-            __alignx( 32, dTW );
-
-            #pragma disjoint( *dBE, *dTW, *dTE, *dBW, *dSE, *dNW, *dNE, *dSW, \
-                              *velX, *velY, *velZ, *t1x2, *t2x2, *fac1, *fac2, *feq_common, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              t0_0, t1x2_0, t2x2_0, inv2csq2, lambda_e_scaled, lambda_d_scaled )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velXMZ = velX[x] - velZ[x];
-               const real_t  sym_BE_TW = lambda_e_scaled * ( pBE[x] + pTW[x] - fac2[x] * velXMZ * velXMZ - t2x2[x] * feq_common[x] );
-               const real_t asym_BE_TW = lambda_d_scaled * ( pBE[x] - pTW[x] - real_t(3.0) * t2x2[x] * velXMZ );
-
-               dBE[x] = pBE[x] - sym_BE_TW - asym_BE_TW;
-               dTW[x] = pTW[x] - sym_BE_TW + asym_BE_TW;
-            }
-
-            real_t* dTN = &dst->get(0,y,z,Stencil::idx[TN]);
-            real_t* dBS = &dst->get(0,y,z,Stencil::idx[BS]);
-
-            __alignx( 32, dTN );
-            __alignx( 32, dBS );
-
-            #pragma disjoint( *dTN, *dBS, *dBE, *dTW, *dTE, *dBW, *dSE, *dNW, *dNE, *dSW, \
-                              *velX, *velY, *velZ, *t1x2, *t2x2, *fac1, *fac2, *feq_common, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              t0_0, t1x2_0, t2x2_0, inv2csq2, lambda_e_scaled, lambda_d_scaled )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velYPZ = velY[x] + velZ[x];
-               const real_t  sym_TN_BS = lambda_e_scaled * ( pTN[x] + pBS[x] - fac2[x] * velYPZ * velYPZ - t2x2[x] * feq_common[x] );
-               const real_t asym_TN_BS = lambda_d_scaled * ( pTN[x] - pBS[x] - real_t(3.0) * t2x2[x] * velYPZ );
-
-               dTN[x] = pTN[x] - sym_TN_BS - asym_TN_BS;
-               dBS[x] = pBS[x] - sym_TN_BS + asym_TN_BS;
-            }
-
-            real_t* dBN = &dst->get(0,y,z,Stencil::idx[BN]);
-            real_t* dTS = &dst->get(0,y,z,Stencil::idx[TS]);
-
-            __alignx( 32, dBN );
-            __alignx( 32, dTS );
-
-            #pragma disjoint( *dBN, *dTS, *dTN, *dBS, *dBE, *dTW, *dTE, *dBW, *dSE, *dNW, *dNE, *dSW, \
-                              *velX, *velY, *velZ, *t1x2, *t2x2, *fac1, *fac2, *feq_common, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              t0_0, t1x2_0, t2x2_0, inv2csq2, lambda_e_scaled, lambda_d_scaled )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velYMZ = velY[x] - velZ[x];
-               const real_t  sym_BN_TS = lambda_e_scaled * ( pBN[x] + pTS[x] - fac2[x] * velYMZ * velYMZ - t2x2[x] * feq_common[x] );
-               const real_t asym_BN_TS = lambda_d_scaled * ( pBN[x] - pTS[x] - real_t(3.0) * t2x2[x] * velYMZ );
-
-               dBN[x] = pBN[x] - sym_BN_TS - asym_BN_TS;
-               dTS[x] = pTS[x] - sym_BN_TS + asym_BN_TS;
-            }
-
-            real_t* dN = &dst->get(0,y,z,Stencil::idx[N]);
-            real_t* dS = &dst->get(0,y,z,Stencil::idx[S]);
-
-            __alignx( 32, dN );
-            __alignx( 32, dS );
-
-            #pragma disjoint( *dN, *dS, *dBN, *dTS, *dTN, *dBS, *dBE, *dTW, *dTE, *dBW, *dSE, *dNW, *dNE, *dSW, \
-                              *velX, *velY, *velZ, *t1x2, *t2x2, *fac1, *fac2, *feq_common, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              t0_0, t1x2_0, t2x2_0, inv2csq2, lambda_e_scaled, lambda_d_scaled )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t  sym_N_S = lambda_e_scaled * ( pN[x] + pS[x] - fac1[x] * velY[x] * velY[x] - t1x2[x] * feq_common[x] );
-               const real_t asym_N_S = lambda_d_scaled * ( pN[x] - pS[x] - real_t(3.0) * t1x2[x] * velY[x] );
-
-               dN[x] = pN[x] - sym_N_S - asym_N_S;
-               dS[x] = pS[x] - sym_N_S + asym_N_S;
-            }
-
-            real_t* dE = &dst->get(0,y,z,Stencil::idx[E]);
-            real_t* dW = &dst->get(0,y,z,Stencil::idx[W]);
-
-            __alignx( 32, dE );
-            __alignx( 32, dW );
-
-            #pragma disjoint( *dE, *dW, *dN, *dS, *dBN, *dTS, *dTN, *dBS, *dBE, *dTW, *dTE, *dBW, *dSE, *dNW, *dNE, *dSW, \
-                              *velX, *velY, *velZ, *t1x2, *t2x2, *fac1, *fac2, *feq_common, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              t0_0, t1x2_0, t2x2_0, inv2csq2, lambda_e_scaled, lambda_d_scaled )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t  sym_E_W = lambda_e_scaled * ( pE[x] + pW[x] - fac1[x] * velX[x] * velX[x] - t1x2[x] * feq_common[x] );
-               const real_t asym_E_W = lambda_d_scaled * ( pE[x] - pW[x] - real_t(3.0) * t1x2[x] * velX[x] );
-
-               dE[x] = pE[x] - sym_E_W - asym_E_W;
-               dW[x] = pW[x] - sym_E_W + asym_E_W;
-            }
-
-            real_t* dT = &dst->get(0,y,z,Stencil::idx[T]);
-            real_t* dB = &dst->get(0,y,z,Stencil::idx[B]);
-
-            __alignx( 32, dT );
-            __alignx( 32, dB );
-
-            #pragma disjoint( *dT, *dB, *dE, *dW, *dN, *dS, *dBN, *dTS, *dTN, *dBS, *dBE, *dTW, *dTE, *dBW, *dSE, *dNW, *dNE, *dSW, \
-                              *velX, *velY, *velZ, *t1x2, *t2x2, *fac1, *fac2, *feq_common, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, *dC, \
-                              t0_0, t1x2_0, t2x2_0, inv2csq2, lambda_e_scaled, lambda_d_scaled )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t  sym_T_B = lambda_e_scaled * ( pT[x] + pB[x]  - fac1[x] * velZ[x] * velZ[x] - t1x2[x] * feq_common[x] );
-               const real_t asym_T_B = lambda_d_scaled * ( pT[x] - pB[x] - real_t(3.0) * t1x2[x] * velZ[x] );
-
-               dT[x] = pT[x] - sym_T_B - asym_T_B;
-               dB[x] = pB[x] - sym_T_B + asym_T_B;
-            }
-         }
-      }
-   }
-   else // ==> src->layout() == field::zyxf || dst->layout() == field::zyxf
-   {
-      #ifdef _OPENMP
-      const int izSize = int_c( zSize );
-      #pragma omp for schedule(static)
-      for( int iz = 0; iz < izSize; ++iz ) {
-         cell_idx_t z = cell_idx_c( iz );
-      #else
-      for( cell_idx_t z = 0; z < zSize; ++z ) {
-      #endif
-         for( cell_idx_t y = 0; y != ySize; ++y )
-         {
-            using namespace stencil;
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_NE = src->get(x-1, y-1, z  , Stencil::idx[NE]);
-               const real_t dd_tmp_N  = src->get(x  , y-1, z  , Stencil::idx[N]);
-               const real_t dd_tmp_NW = src->get(x+1, y-1, z  , Stencil::idx[NW]);
-               const real_t dd_tmp_W  = src->get(x+1, y  , z  , Stencil::idx[W]);
-               const real_t dd_tmp_SW = src->get(x+1, y+1, z  , Stencil::idx[SW]);
-               const real_t dd_tmp_S  = src->get(x  , y+1, z  , Stencil::idx[S]);
-               const real_t dd_tmp_SE = src->get(x-1, y+1, z  , Stencil::idx[SE]);
-               const real_t dd_tmp_E  = src->get(x-1, y  , z  , Stencil::idx[E]);
-               const real_t dd_tmp_T  = src->get(x  , y  , z-1, Stencil::idx[T]);
-               const real_t dd_tmp_TE = src->get(x-1, y  , z-1, Stencil::idx[TE]);
-               const real_t dd_tmp_TN = src->get(x  , y-1, z-1, Stencil::idx[TN]);
-               const real_t dd_tmp_TW = src->get(x+1, y  , z-1, Stencil::idx[TW]);
-               const real_t dd_tmp_TS = src->get(x  , y+1, z-1, Stencil::idx[TS]);
-               const real_t dd_tmp_B  = src->get(x  , y  , z+1, Stencil::idx[B]);
-               const real_t dd_tmp_BE = src->get(x-1, y  , z+1, Stencil::idx[BE]);
-               const real_t dd_tmp_BN = src->get(x  , y-1, z+1, Stencil::idx[BN]);
-               const real_t dd_tmp_BW = src->get(x+1, y  , z+1, Stencil::idx[BW]);
-               const real_t dd_tmp_BS = src->get(x  , y+1, z+1, Stencil::idx[BS]);
-               const real_t dd_tmp_C  = src->get(x  , y  , z  , Stencil::idx[C]);
-
-               const real_t velX_trm = dd_tmp_E + dd_tmp_NE + dd_tmp_SE + dd_tmp_TE + dd_tmp_BE;
-               const real_t velY_trm = dd_tmp_N + dd_tmp_NW + dd_tmp_TN + dd_tmp_BN;
-               const real_t velZ_trm = dd_tmp_T + dd_tmp_TS + dd_tmp_TW;
-
-               const real_t rho = dd_tmp_C + dd_tmp_S + dd_tmp_W + dd_tmp_B + dd_tmp_SW + dd_tmp_BS + dd_tmp_BW + velX_trm + velY_trm + velZ_trm;
-               const real_t invRho = real_t(1.0) / rho;
-
-               velX[x] = invRho * ( velX_trm - dd_tmp_W  - dd_tmp_NW - dd_tmp_SW - dd_tmp_TW - dd_tmp_BW );
-               velY[x] = invRho * ( velY_trm + dd_tmp_NE - dd_tmp_S  - dd_tmp_SW - dd_tmp_SE - dd_tmp_TS - dd_tmp_BS );
-               velZ[x] = invRho * ( velZ_trm + dd_tmp_TN + dd_tmp_TE - dd_tmp_B  - dd_tmp_BN - dd_tmp_BS - dd_tmp_BW - dd_tmp_BE );
-
-               t1x2[x] = t1x2_0 * rho;
-               t2x2[x] = t2x2_0 * rho;
-               fac1[x] = t1x2_0 * rho * inv2csq2;
-               fac2[x] = t2x2_0 * rho * inv2csq2;
-
-               feq_common[x] = real_t(1.0) - real_t(1.5) * ( velX[x] * velX[x] + velY[x] * velY[x] + velZ[x] * velZ[x] );
-
-               dst->get( x, y, z, Stencil::idx[C] ) = dd_tmp_C * (real_t(1.0) - lambda_e) + lambda_e * t0_0 * rho * feq_common[x];
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_NE = src->get(x-1, y-1, z, Stencil::idx[NE]);
-               const real_t dd_tmp_SW = src->get(x+1, y+1, z, Stencil::idx[SW]);
-
-               const real_t velXPY = velX[x] + velY[x];
-               const real_t  sym_NE_SW = lambda_e_scaled * ( dd_tmp_NE + dd_tmp_SW - fac2[x] * velXPY * velXPY - t2x2[x] * feq_common[x] );
-               const real_t asym_NE_SW = lambda_d_scaled * ( dd_tmp_NE - dd_tmp_SW - real_t(3.0) * t2x2[x] * velXPY );
-
-               dst->get( x, y, z, Stencil::idx[NE] ) = dd_tmp_NE - sym_NE_SW - asym_NE_SW;
-               dst->get( x, y, z, Stencil::idx[SW] ) = dd_tmp_SW - sym_NE_SW + asym_NE_SW;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_SE = src->get(x-1, y+1, z, Stencil::idx[SE]);
-               const real_t dd_tmp_NW = src->get(x+1, y-1, z, Stencil::idx[NW]);
-
-               const real_t velXMY = velX[x] - velY[x];
-               const real_t  sym_SE_NW = lambda_e_scaled * ( dd_tmp_SE + dd_tmp_NW - fac2[x] * velXMY * velXMY - t2x2[x] * feq_common[x] );
-               const real_t asym_SE_NW = lambda_d_scaled * ( dd_tmp_SE - dd_tmp_NW - real_t(3.0) * t2x2[x] * velXMY );
-
-               dst->get( x, y, z, Stencil::idx[SE] ) = dd_tmp_SE - sym_SE_NW - asym_SE_NW;
-               dst->get( x, y, z, Stencil::idx[NW] ) = dd_tmp_NW - sym_SE_NW + asym_SE_NW;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_TE = src->get(x-1, y, z-1, Stencil::idx[TE]);
-               const real_t dd_tmp_BW = src->get(x+1, y, z+1, Stencil::idx[BW]);
-
-               const real_t velXPZ = velX[x] + velZ[x];
-               const real_t  sym_TE_BW = lambda_e_scaled * ( dd_tmp_TE + dd_tmp_BW - fac2[x] * velXPZ * velXPZ - t2x2[x] * feq_common[x] );
-               const real_t asym_TE_BW = lambda_d_scaled * ( dd_tmp_TE - dd_tmp_BW - real_t(3.0) * t2x2[x] * velXPZ );
-
-               dst->get( x, y, z, Stencil::idx[TE] ) = dd_tmp_TE - sym_TE_BW - asym_TE_BW;
-               dst->get( x, y, z, Stencil::idx[BW] ) = dd_tmp_BW - sym_TE_BW + asym_TE_BW;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_BE = src->get(x-1, y, z+1, Stencil::idx[BE]);
-               const real_t dd_tmp_TW = src->get(x+1, y, z-1, Stencil::idx[TW]);
-
-               const real_t velXMZ = velX[x] - velZ[x];
-               const real_t  sym_BE_TW = lambda_e_scaled * ( dd_tmp_BE + dd_tmp_TW - fac2[x] * velXMZ * velXMZ - t2x2[x] * feq_common[x] );
-               const real_t asym_BE_TW = lambda_d_scaled * ( dd_tmp_BE - dd_tmp_TW - real_t(3.0) * t2x2[x] * velXMZ );
-
-               dst->get( x, y, z, Stencil::idx[BE] ) = dd_tmp_BE - sym_BE_TW - asym_BE_TW;
-               dst->get( x, y, z, Stencil::idx[TW] ) = dd_tmp_TW - sym_BE_TW + asym_BE_TW;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_TN = src->get(x, y-1, z-1, Stencil::idx[TN]);
-               const real_t dd_tmp_BS = src->get(x, y+1, z+1, Stencil::idx[BS]);
-
-               const real_t velYPZ = velY[x] + velZ[x];
-               const real_t  sym_TN_BS = lambda_e_scaled * ( dd_tmp_TN + dd_tmp_BS - fac2[x] * velYPZ * velYPZ - t2x2[x] * feq_common[x] );
-               const real_t asym_TN_BS = lambda_d_scaled * ( dd_tmp_TN - dd_tmp_BS - real_t(3.0) * t2x2[x] * velYPZ );
-
-               dst->get( x, y, z, Stencil::idx[TN] ) = dd_tmp_TN - sym_TN_BS - asym_TN_BS;
-               dst->get( x, y, z, Stencil::idx[BS] ) = dd_tmp_BS - sym_TN_BS + asym_TN_BS;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_BN = src->get(x, y-1, z+1, Stencil::idx[BN]);
-               const real_t dd_tmp_TS = src->get(x, y+1, z-1, Stencil::idx[TS]);
-
-               const real_t velYMZ = velY[x] - velZ[x];
-               const real_t  sym_BN_TS = lambda_e_scaled * ( dd_tmp_BN + dd_tmp_TS - fac2[x] * velYMZ * velYMZ - t2x2[x] * feq_common[x] );
-               const real_t asym_BN_TS = lambda_d_scaled * ( dd_tmp_BN - dd_tmp_TS - real_t(3.0) * t2x2[x] * velYMZ );
-
-               dst->get( x, y, z, Stencil::idx[BN] ) = dd_tmp_BN - sym_BN_TS - asym_BN_TS;
-               dst->get( x, y, z, Stencil::idx[TS] ) = dd_tmp_TS - sym_BN_TS + asym_BN_TS;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_N  = src->get(x, y-1, z, Stencil::idx[N]);
-               const real_t dd_tmp_S  = src->get(x, y+1, z, Stencil::idx[S]);
-
-               const real_t  sym_N_S = lambda_e_scaled * ( dd_tmp_N + dd_tmp_S - fac1[x] * velY[x] * velY[x] - t1x2[x] * feq_common[x] );
-               const real_t asym_N_S = lambda_d_scaled * ( dd_tmp_N - dd_tmp_S - real_t(3.0) * t1x2[x] * velY[x] );
-
-               dst->get( x, y, z, Stencil::idx[N] ) = dd_tmp_N - sym_N_S - asym_N_S;
-               dst->get( x, y, z, Stencil::idx[S] ) = dd_tmp_S - sym_N_S + asym_N_S;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_E  = src->get(x-1, y, z, Stencil::idx[E]);
-               const real_t dd_tmp_W  = src->get(x+1, y, z, Stencil::idx[W]);
-
-               const real_t  sym_E_W = lambda_e_scaled * ( dd_tmp_E + dd_tmp_W - fac1[x] * velX[x] * velX[x] - t1x2[x] * feq_common[x] );
-               const real_t asym_E_W = lambda_d_scaled * ( dd_tmp_E - dd_tmp_W - real_t(3.0) * t1x2[x] * velX[x] );
-
-               dst->get( x, y, z, Stencil::idx[E] ) = dd_tmp_E - sym_E_W - asym_E_W;
-               dst->get( x, y, z, Stencil::idx[W] ) = dd_tmp_W - sym_E_W + asym_E_W;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_T  = src->get(x, y, z-1, Stencil::idx[T]);
-               const real_t dd_tmp_B  = src->get(x, y, z+1, Stencil::idx[B]);
-
-               const real_t  sym_T_B = lambda_e_scaled * ( dd_tmp_T + dd_tmp_B  - fac1[x] * velZ[x] * velZ[x] - t1x2[x] * feq_common[x] );
-               const real_t asym_T_B = lambda_d_scaled * ( dd_tmp_T - dd_tmp_B - real_t(3.0) * t1x2[x] * velZ[x] );
-
-               dst->get( x, y, z, Stencil::idx[T] ) = dd_tmp_T - sym_T_B - asym_T_B;
-               dst->get( x, y, z, Stencil::idx[B] ) = dd_tmp_B - sym_T_B + asym_T_B;
-            }
-         }
-      }
-   }
-
-   delete[] velX;
-   delete[] velY;
-   delete[] velZ;
-   delete[] t1x2;
-   delete[] t2x2;
-   delete[] fac1;
-   delete[] fac2;
-   delete[] feq_common;
-
-#ifdef _OPENMP
-   }
-#endif
-
-   src->swapDataPointers( dst );
-}
-
-template< typename LatticeModel_T >
-void SplitPureSweep< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag, collision_model::TRT_tag >::value &&
-                                                              std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value &&
-                                                              LatticeModel_T::compressible &&
-                                                              std::is_same< typename LatticeModel_T::ForceModel::tag &&
-                                                              force_model::None_tag
-                                                              >::type
-   >::stream( IBlock * const block, const uint_t numberOfGhostLayersToInclude )
-{
-   PdfField_T * src( NULL );
-   PdfField_T * dst( NULL );
-
-   this->getFields( block, src, dst );
-
-   StreamEverything< LatticeModel_T >::execute( src, dst, numberOfGhostLayersToInclude );
-}
-
-template< typename LatticeModel_T >
-void SplitPureSweep< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag, collision_model::TRT_tag >::value &&
-                                                                           std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value &&
-                                                                           LatticeModel_T::compressible &&
-                                                                           std::is_same< typename LatticeModel_T::ForceModel::tag, force_model::None_tag >::value
-                                                                           >::type
-#ifdef NDEBUG
-   >::collide( IBlock * const block, const uint_t /*numberOfGhostLayersToInclude*/ )
-#else
-   >::collide( IBlock * const block, const uint_t numberOfGhostLayersToInclude )
-#endif
-{
-   WALBERLA_ASSERT_EQUAL( numberOfGhostLayersToInclude, uint_t(0) ); // the implementation right now doesn't support inclusion of ghost layers in collide step!
-
-   PdfField_T * src = this->getSrcField( block );
-
-   WALBERLA_ASSERT_NOT_NULLPTR( src );
-
-   WALBERLA_ASSERT_GREATER_EQUAL( src->nrOfGhostLayers(), numberOfGhostLayersToInclude );
-
-   // constants used during stream/collide
-
-   const real_t lambda_e =  src->latticeModel().collisionModel().lambda_e();
-   const real_t lambda_d =  src->latticeModel().collisionModel().lambda_d();
-
-   // common prefactors for calculating the equilibrium parts
-   const real_t t0_0   = real_t(1.0) / real_t(3.0);                 // 1/3      for C
-   const real_t t1x2_0 = real_t(1.0) / real_t(18.0) * real_t(2.0);  // 1/18 * 2 for N, S, W, E, T, B
-   const real_t t2x2_0 = real_t(1.0) / real_t(36.0) * real_t(2.0);  // 1/36 * 2 else
-
-   const real_t inv2csq2 = real_t(1.0) / ( real_t(2.0) * ( real_t(1.0) / real_t(3.0) ) * ( real_t(1.0) / real_t(3.0) ) ); //speed of sound related factor for equilibrium distribution function
-
-   // relaxation parameter variables
-   const real_t lambda_e_scaled = real_t(0.5) * lambda_e; // 0.5 times the usual value ...
-   const real_t lambda_d_scaled = real_t(0.5) * lambda_d; // ... due to the way of calculations
-
-   // loop constants
-
-   const cell_idx_t xSize = cell_idx_c( src->xSize() );
-   const cell_idx_t ySize = cell_idx_c( src->ySize() );
-   const cell_idx_t zSize = cell_idx_c( src->zSize() );
-
-#ifdef _OPENMP
-   #pragma omp parallel
-   {
-#endif
-   // temporaries, calculated by the first innermost loop
-
-   real_t* velX = new real_t[ uint_c(xSize) ];
-   real_t* velY = new real_t[ uint_c(xSize) ];
-   real_t* velZ = new real_t[ uint_c(xSize) ];
-
-   real_t* t1x2 = new real_t[ uint_c(xSize) ];
-   real_t* t2x2 = new real_t[ uint_c(xSize) ];
-   real_t* fac1 = new real_t[ uint_c(xSize) ];
-   real_t* fac2 = new real_t[ uint_c(xSize) ];
-
-   real_t* feq_common = new real_t[ uint_c(xSize) ];
-
-   if( src->layout() == field::fzyx )
-   {
-      #ifdef _OPENMP
-      const int izSize = int_c( zSize );
-      #pragma omp for schedule(static)
-      for( int iz = 0; iz < izSize; ++iz ) {
-         cell_idx_t z = cell_idx_c( iz );
-      #else
-      for( cell_idx_t z = 0; z < zSize; ++z ) {
-      #endif
-         for( cell_idx_t y = 0; y != ySize; ++y )
-         {
-            using namespace stencil;
-
-            real_t* pC  = &src->get( 0, y, z, Stencil::idx[C]);
-            real_t* pN  = &src->get( 0, y, z, Stencil::idx[N]);
-            real_t* pS  = &src->get( 0, y, z, Stencil::idx[S]);
-            real_t* pW  = &src->get( 0, y, z, Stencil::idx[W]);
-            real_t* pE  = &src->get( 0, y, z, Stencil::idx[E]);
-            real_t* pT  = &src->get( 0, y, z, Stencil::idx[T]);
-            real_t* pB  = &src->get( 0, y, z, Stencil::idx[B]);
-            real_t* pNW = &src->get( 0, y, z, Stencil::idx[NW]);
-            real_t* pNE = &src->get( 0, y, z, Stencil::idx[NE]);
-            real_t* pSW = &src->get( 0, y, z, Stencil::idx[SW]);
-            real_t* pSE = &src->get( 0, y, z, Stencil::idx[SE]);
-            real_t* pTN = &src->get( 0, y, z, Stencil::idx[TN]);
-            real_t* pTS = &src->get( 0, y, z, Stencil::idx[TS]);
-            real_t* pTW = &src->get( 0, y, z, Stencil::idx[TW]);
-            real_t* pTE = &src->get( 0, y, z, Stencil::idx[TE]);
-            real_t* pBN = &src->get( 0, y, z, Stencil::idx[BN]);
-            real_t* pBS = &src->get( 0, y, z, Stencil::idx[BS]);
-            real_t* pBW = &src->get( 0, y, z, Stencil::idx[BW]);
-            real_t* pBE = &src->get( 0, y, z, Stencil::idx[BE]);
-
-            __alignx( 32, velX );
-            __alignx( 32, velY );
-            __alignx( 32, velZ );
-            __alignx( 32, t1x2 );
-            __alignx( 32, t2x2 );
-            __alignx( 32, fac1 );
-            __alignx( 32, fac2 );
-            __alignx( 32, feq_common  );
-
-            __alignx( 32, pNE );
-            __alignx( 32, pN  );
-            __alignx( 32, pNW );
-            __alignx( 32, pW  );
-            __alignx( 32, pSW );
-            __alignx( 32, pS  );
-            __alignx( 32, pSE );
-            __alignx( 32, pE  );
-            __alignx( 32, pT  );
-            __alignx( 32, pTE );
-            __alignx( 32, pTN );
-            __alignx( 32, pTW );
-            __alignx( 32, pTS );
-            __alignx( 32, pB  );
-            __alignx( 32, pBE );
-            __alignx( 32, pBN );
-            __alignx( 32, pBW );
-            __alignx( 32, pBS );
-            __alignx( 32, pC  );
-
-            #pragma disjoint( *velX, *velY, *velZ, *t1x2, *t2x2, *fac1, *fac2, *feq_common, *pNE, *pN, *pNW, *pW, *pSW, *pS, *pSE, \
-                              *pE, *pT, *pTE, *pTN, *pTW, *pTS, *pB, *pBE, *pBN, *pBW, *pBS, *pC, \
-                              t0_0, t1x2_0, t2x2_0, inv2csq2, lambda_e_scaled, lambda_d_scaled )
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velX_trm = pE[x] + pNE[x] + pSE[x] + pTE[x] + pBE[x];
-               const real_t velY_trm = pN[x] + pNW[x] + pTN[x] + pBN[x];
-               const real_t velZ_trm = pT[x] + pTS[x] + pTW[x];
-
-               const real_t rho = pC[x] + pS[x] + pW[x] + pB[x] + pSW[x] + pBS[x] + pBW[x] + velX_trm + velY_trm + velZ_trm;
-               const real_t invRho = real_t(1.0) / rho;
-
-               velX[x] = invRho * ( velX_trm - pW[x]  - pNW[x] - pSW[x] - pTW[x] - pBW[x] );
-               velY[x] = invRho * ( velY_trm + pNE[x] - pS[x]  - pSW[x] - pSE[x] - pTS[x] - pBS[x] );
-               velZ[x] = invRho * ( velZ_trm + pTN[x] + pTE[x] - pB[x]  - pBN[x] - pBS[x] - pBW[x] - pBE[x] );
-
-               t1x2[x] = t1x2_0 * rho;
-               t2x2[x] = t2x2_0 * rho;
-               fac1[x] = t1x2_0 * rho * inv2csq2;
-               fac2[x] = t2x2_0 * rho * inv2csq2;
-
-               feq_common[x] = real_t(1.0) - real_t(1.5) * ( velX[x] * velX[x] + velY[x] * velY[x] + velZ[x] * velZ[x] );
-
-               pC[x] = pC[x] * (real_t(1.0) - lambda_e) + lambda_e * t0_0 * rho * feq_common[x];
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velXPY = velX[x] + velY[x];
-               const real_t  sym_NE_SW = lambda_e_scaled * ( pNE[x] + pSW[x] - fac2[x] * velXPY * velXPY - t2x2[x] * feq_common[x] );
-               const real_t asym_NE_SW = lambda_d_scaled * ( pNE[x] - pSW[x] - real_t(3.0) * t2x2[x] * velXPY );
-
-               pNE[x] = pNE[x] - sym_NE_SW - asym_NE_SW;
-               pSW[x] = pSW[x] - sym_NE_SW + asym_NE_SW;
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velXMY = velX[x] - velY[x];
-               const real_t  sym_SE_NW = lambda_e_scaled * ( pSE[x] + pNW[x] - fac2[x] * velXMY * velXMY - t2x2[x] * feq_common[x] );
-               const real_t asym_SE_NW = lambda_d_scaled * ( pSE[x] - pNW[x] - real_t(3.0) * t2x2[x] * velXMY );
-
-               pSE[x] = pSE[x] - sym_SE_NW - asym_SE_NW;
-               pNW[x] = pNW[x] - sym_SE_NW + asym_SE_NW;
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velXPZ = velX[x] + velZ[x];
-               const real_t  sym_TE_BW = lambda_e_scaled * ( pTE[x] + pBW[x] - fac2[x] * velXPZ * velXPZ - t2x2[x] * feq_common[x] );
-               const real_t asym_TE_BW = lambda_d_scaled * ( pTE[x] - pBW[x] - real_t(3.0) * t2x2[x] * velXPZ );
-
-               pTE[x] = pTE[x] - sym_TE_BW - asym_TE_BW;
-               pBW[x] = pBW[x] - sym_TE_BW + asym_TE_BW;
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velXMZ = velX[x] - velZ[x];
-               const real_t  sym_BE_TW = lambda_e_scaled * ( pBE[x] + pTW[x] - fac2[x] * velXMZ * velXMZ - t2x2[x] * feq_common[x] );
-               const real_t asym_BE_TW = lambda_d_scaled * ( pBE[x] - pTW[x] - real_t(3.0) * t2x2[x] * velXMZ );
-
-               pBE[x] = pBE[x] - sym_BE_TW - asym_BE_TW;
-               pTW[x] = pTW[x] - sym_BE_TW + asym_BE_TW;
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velYPZ = velY[x] + velZ[x];
-               const real_t  sym_TN_BS = lambda_e_scaled * ( pTN[x] + pBS[x] - fac2[x] * velYPZ * velYPZ - t2x2[x] * feq_common[x] );
-               const real_t asym_TN_BS = lambda_d_scaled * ( pTN[x] - pBS[x] - real_t(3.0) * t2x2[x] * velYPZ );
-
-               pTN[x] = pTN[x] - sym_TN_BS - asym_TN_BS;
-               pBS[x] = pBS[x] - sym_TN_BS + asym_TN_BS;
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t velYMZ = velY[x] - velZ[x];
-               const real_t  sym_BN_TS = lambda_e_scaled * ( pBN[x] + pTS[x] - fac2[x] * velYMZ * velYMZ - t2x2[x] * feq_common[x] );
-               const real_t asym_BN_TS = lambda_d_scaled * ( pBN[x] - pTS[x] - real_t(3.0) * t2x2[x] * velYMZ );
-
-               pBN[x] = pBN[x] - sym_BN_TS - asym_BN_TS;
-               pTS[x] = pTS[x] - sym_BN_TS + asym_BN_TS;
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t  sym_N_S = lambda_e_scaled * ( pN[x] + pS[x] - fac1[x] * velY[x] * velY[x] - t1x2[x] * feq_common[x] );
-               const real_t asym_N_S = lambda_d_scaled * ( pN[x] - pS[x] - real_t(3.0) * t1x2[x] * velY[x] );
-
-               pN[x] = pN[x] - sym_N_S - asym_N_S;
-               pS[x] = pS[x] - sym_N_S + asym_N_S;
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t  sym_E_W = lambda_e_scaled * ( pE[x] + pW[x] - fac1[x] * velX[x] * velX[x] - t1x2[x] * feq_common[x] );
-               const real_t asym_E_W = lambda_d_scaled * ( pE[x] - pW[x] - real_t(3.0) * t1x2[x] * velX[x] );
-
-               pE[x] = pE[x] - sym_E_W - asym_E_W;
-               pW[x] = pW[x] - sym_E_W + asym_E_W;
-            }
-
-            #pragma ibm iterations(100)
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t  sym_T_B = lambda_e_scaled * ( pT[x] + pB[x]  - fac1[x] * velZ[x] * velZ[x] - t1x2[x] * feq_common[x] );
-               const real_t asym_T_B = lambda_d_scaled * ( pT[x] - pB[x] - real_t(3.0) * t1x2[x] * velZ[x] );
-
-               pT[x] = pT[x] - sym_T_B - asym_T_B;
-               pB[x] = pB[x] - sym_T_B + asym_T_B;
-            }
-         }
-      }
-   }
-   else // ==> src->layout() == field::zyxf
-   {
-      #ifdef _OPENMP
-      const int izSize = int_c( zSize );
-      #pragma omp for schedule(static)
-      for( int iz = 0; iz < izSize; ++iz ) {
-         cell_idx_t z = cell_idx_c( iz );
-      #else
-      for( cell_idx_t z = 0; z < zSize; ++z ) {
-      #endif
-         for( cell_idx_t y = 0; y != ySize; ++y )
-         {
-            using namespace stencil;
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_C  = src->get( x, y, z, Stencil::idx[C]  );
-               const real_t dd_tmp_N  = src->get( x, y, z, Stencil::idx[N]  );
-               const real_t dd_tmp_S  = src->get( x, y, z, Stencil::idx[S]  );
-               const real_t dd_tmp_W  = src->get( x, y, z, Stencil::idx[W]  );
-               const real_t dd_tmp_E  = src->get( x, y, z, Stencil::idx[E]  );
-               const real_t dd_tmp_T  = src->get( x, y, z, Stencil::idx[T]  );
-               const real_t dd_tmp_B  = src->get( x, y, z, Stencil::idx[B]  );
-               const real_t dd_tmp_NW = src->get( x, y, z, Stencil::idx[NW] );
-               const real_t dd_tmp_NE = src->get( x, y, z, Stencil::idx[NE] );
-               const real_t dd_tmp_SW = src->get( x, y, z, Stencil::idx[SW] );
-               const real_t dd_tmp_SE = src->get( x, y, z, Stencil::idx[SE] );
-               const real_t dd_tmp_TN = src->get( x, y, z, Stencil::idx[TN] );
-               const real_t dd_tmp_TS = src->get( x, y, z, Stencil::idx[TS] );
-               const real_t dd_tmp_TW = src->get( x, y, z, Stencil::idx[TW] );
-               const real_t dd_tmp_TE = src->get( x, y, z, Stencil::idx[TE] );
-               const real_t dd_tmp_BN = src->get( x, y, z, Stencil::idx[BN] );
-               const real_t dd_tmp_BS = src->get( x, y, z, Stencil::idx[BS] );
-               const real_t dd_tmp_BW = src->get( x, y, z, Stencil::idx[BW] );
-               const real_t dd_tmp_BE = src->get( x, y, z, Stencil::idx[BE] );
-
-               const real_t velX_trm = dd_tmp_E + dd_tmp_NE + dd_tmp_SE + dd_tmp_TE + dd_tmp_BE;
-               const real_t velY_trm = dd_tmp_N + dd_tmp_NW + dd_tmp_TN + dd_tmp_BN;
-               const real_t velZ_trm = dd_tmp_T + dd_tmp_TS + dd_tmp_TW;
-
-               const real_t rho = dd_tmp_C + dd_tmp_S + dd_tmp_W + dd_tmp_B + dd_tmp_SW + dd_tmp_BS + dd_tmp_BW + velX_trm + velY_trm + velZ_trm;
-               const real_t invRho = real_t(1.0) / rho;
-
-               velX[x] = invRho * ( velX_trm - dd_tmp_W  - dd_tmp_NW - dd_tmp_SW - dd_tmp_TW - dd_tmp_BW );
-               velY[x] = invRho * ( velY_trm + dd_tmp_NE - dd_tmp_S  - dd_tmp_SW - dd_tmp_SE - dd_tmp_TS - dd_tmp_BS );
-               velZ[x] = invRho * ( velZ_trm + dd_tmp_TN + dd_tmp_TE - dd_tmp_B  - dd_tmp_BN - dd_tmp_BS - dd_tmp_BW - dd_tmp_BE );
-
-               t1x2[x] = t1x2_0 * rho;
-               t2x2[x] = t2x2_0 * rho;
-               fac1[x] = t1x2_0 * rho * inv2csq2;
-               fac2[x] = t2x2_0 * rho * inv2csq2;
-
-               feq_common[x] = real_t(1.0) - real_t(1.5) * ( velX[x] * velX[x] + velY[x] * velY[x] + velZ[x] * velZ[x] );
-
-               src->get( x, y, z, Stencil::idx[C] ) = dd_tmp_C * (real_t(1.0) - lambda_e) + lambda_e * t0_0 * rho * feq_common[x];
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_NE = src->get( x, y, z, Stencil::idx[NE]);
-               const real_t dd_tmp_SW = src->get( x, y, z, Stencil::idx[SW]);
-
-               const real_t velXPY = velX[x] + velY[x];
-               const real_t  sym_NE_SW = lambda_e_scaled * ( dd_tmp_NE + dd_tmp_SW - fac2[x] * velXPY * velXPY - t2x2[x] * feq_common[x] );
-               const real_t asym_NE_SW = lambda_d_scaled * ( dd_tmp_NE - dd_tmp_SW - real_t(3.0) * t2x2[x] * velXPY );
-
-               src->get( x, y, z, Stencil::idx[NE] ) = dd_tmp_NE - sym_NE_SW - asym_NE_SW;
-               src->get( x, y, z, Stencil::idx[SW] ) = dd_tmp_SW - sym_NE_SW + asym_NE_SW;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_SE = src->get( x, y, z, Stencil::idx[SE]);
-               const real_t dd_tmp_NW = src->get( x, y, z, Stencil::idx[NW]);
-
-               const real_t velXMY = velX[x] - velY[x];
-               const real_t  sym_SE_NW = lambda_e_scaled * ( dd_tmp_SE + dd_tmp_NW - fac2[x] * velXMY * velXMY - t2x2[x] * feq_common[x] );
-               const real_t asym_SE_NW = lambda_d_scaled * ( dd_tmp_SE - dd_tmp_NW - real_t(3.0) * t2x2[x] * velXMY );
-
-               src->get( x, y, z, Stencil::idx[SE] ) = dd_tmp_SE - sym_SE_NW - asym_SE_NW;
-               src->get( x, y, z, Stencil::idx[NW] ) = dd_tmp_NW - sym_SE_NW + asym_SE_NW;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_TE = src->get( x, y, z, Stencil::idx[TE]);
-               const real_t dd_tmp_BW = src->get( x, y, z, Stencil::idx[BW]);
-
-               const real_t velXPZ = velX[x] + velZ[x];
-               const real_t  sym_TE_BW = lambda_e_scaled * ( dd_tmp_TE + dd_tmp_BW - fac2[x] * velXPZ * velXPZ - t2x2[x] * feq_common[x] );
-               const real_t asym_TE_BW = lambda_d_scaled * ( dd_tmp_TE - dd_tmp_BW - real_t(3.0) * t2x2[x] * velXPZ );
-
-               src->get( x, y, z, Stencil::idx[TE] ) = dd_tmp_TE - sym_TE_BW - asym_TE_BW;
-               src->get( x, y, z, Stencil::idx[BW] ) = dd_tmp_BW - sym_TE_BW + asym_TE_BW;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_BE = src->get( x, y, z, Stencil::idx[BE]);
-               const real_t dd_tmp_TW = src->get( x, y, z, Stencil::idx[TW]);
-
-               const real_t velXMZ = velX[x] - velZ[x];
-               const real_t  sym_BE_TW = lambda_e_scaled * ( dd_tmp_BE + dd_tmp_TW - fac2[x] * velXMZ * velXMZ - t2x2[x] * feq_common[x] );
-               const real_t asym_BE_TW = lambda_d_scaled * ( dd_tmp_BE - dd_tmp_TW - real_t(3.0) * t2x2[x] * velXMZ );
-
-               src->get( x, y, z, Stencil::idx[BE] ) = dd_tmp_BE - sym_BE_TW - asym_BE_TW;
-               src->get( x, y, z, Stencil::idx[TW] ) = dd_tmp_TW - sym_BE_TW + asym_BE_TW;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_TN = src->get( x, y, z, Stencil::idx[TN]);
-               const real_t dd_tmp_BS = src->get( x, y, z, Stencil::idx[BS]);
-
-               const real_t velYPZ = velY[x] + velZ[x];
-               const real_t  sym_TN_BS = lambda_e_scaled * ( dd_tmp_TN + dd_tmp_BS - fac2[x] * velYPZ * velYPZ - t2x2[x] * feq_common[x] );
-               const real_t asym_TN_BS = lambda_d_scaled * ( dd_tmp_TN - dd_tmp_BS - real_t(3.0) * t2x2[x] * velYPZ );
-
-               src->get( x, y, z, Stencil::idx[TN] ) = dd_tmp_TN - sym_TN_BS - asym_TN_BS;
-               src->get( x, y, z, Stencil::idx[BS] ) = dd_tmp_BS - sym_TN_BS + asym_TN_BS;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_BN = src->get( x, y, z, Stencil::idx[BN]);
-               const real_t dd_tmp_TS = src->get( x, y, z, Stencil::idx[TS]);
-
-               const real_t velYMZ = velY[x] - velZ[x];
-               const real_t  sym_BN_TS = lambda_e_scaled * ( dd_tmp_BN + dd_tmp_TS - fac2[x] * velYMZ * velYMZ - t2x2[x] * feq_common[x] );
-               const real_t asym_BN_TS = lambda_d_scaled * ( dd_tmp_BN - dd_tmp_TS - real_t(3.0) * t2x2[x] * velYMZ );
-
-               src->get( x, y, z, Stencil::idx[BN] ) = dd_tmp_BN - sym_BN_TS - asym_BN_TS;
-               src->get( x, y, z, Stencil::idx[TS] ) = dd_tmp_TS - sym_BN_TS + asym_BN_TS;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_N  = src->get( x, y, z, Stencil::idx[N]);
-               const real_t dd_tmp_S  = src->get( x, y, z, Stencil::idx[S]);
-
-               const real_t  sym_N_S = lambda_e_scaled * ( dd_tmp_N + dd_tmp_S - fac1[x] * velY[x] * velY[x] - t1x2[x] * feq_common[x] );
-               const real_t asym_N_S = lambda_d_scaled * ( dd_tmp_N - dd_tmp_S - real_t(3.0) * t1x2[x] * velY[x] );
-
-               src->get( x, y, z, Stencil::idx[N] ) = dd_tmp_N - sym_N_S - asym_N_S;
-               src->get( x, y, z, Stencil::idx[S] ) = dd_tmp_S - sym_N_S + asym_N_S;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_E  = src->get( x, y, z, Stencil::idx[E]);
-               const real_t dd_tmp_W  = src->get( x, y, z, Stencil::idx[W]);
-
-               const real_t  sym_E_W = lambda_e_scaled * ( dd_tmp_E + dd_tmp_W - fac1[x] * velX[x] * velX[x] - t1x2[x] * feq_common[x] );
-               const real_t asym_E_W = lambda_d_scaled * ( dd_tmp_E - dd_tmp_W - real_t(3.0) * t1x2[x] * velX[x] );
-
-               src->get( x, y, z, Stencil::idx[E] ) = dd_tmp_E - sym_E_W - asym_E_W;
-               src->get( x, y, z, Stencil::idx[W] ) = dd_tmp_W - sym_E_W + asym_E_W;
-            }
-
-            for( cell_idx_t x = 0; x != xSize; ++x )
-            {
-               const real_t dd_tmp_T  = src->get( x, y, z, Stencil::idx[T]);
-               const real_t dd_tmp_B  = src->get( x, y, z, Stencil::idx[B]);
-
-               const real_t  sym_T_B = lambda_e_scaled * ( dd_tmp_T + dd_tmp_B  - fac1[x] * velZ[x] * velZ[x] - t1x2[x] * feq_common[x] );
-               const real_t asym_T_B = lambda_d_scaled * ( dd_tmp_T - dd_tmp_B - real_t(3.0) * t1x2[x] * velZ[x] );
-
-               src->get( x, y, z, Stencil::idx[T] ) = dd_tmp_T - sym_T_B - asym_T_B;
-               src->get( x, y, z, Stencil::idx[B] ) = dd_tmp_B - sym_T_B + asym_T_B;
-            }
-         }
-      }
-   }
-
-   delete[] velX;
-   delete[] velY;
-   delete[] velZ;
-   delete[] t1x2;
-   delete[] t2x2;
-   delete[] fac1;
-   delete[] fac2;
-   delete[] feq_common;
-
-#ifdef _OPENMP
-   }
-#endif
-}
-
-
-
-} // namespace lbm
-} // namespace walberla
-
-#endif
diff --git a/src/simd/QPX.h b/src/simd/QPX.h
deleted file mode 100644
index cccada7b3..000000000
--- a/src/simd/QPX.h
+++ /dev/null
@@ -1,203 +0,0 @@
-//======================================================================================================================
-//
-//  This file is part of waLBerla. waLBerla is free software: you can
-//  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of
-//  the License, or (at your option) any later version.
-//
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-//  for more details.
-//
-//  You should have received a copy of the GNU General Public License along
-//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
-//
-//! \file QPX.h
-//! \ingroup phasefield
-//! \author Martin Bauer <martin.bauer@fau.de>
-//
-//======================================================================================================================
-
-#pragma once
-
-
-namespace walberla {
-namespace simd {
-namespace qpx {
-
-struct double4_t
-{
-   double4_t() {}
-   double4_t( vector4double init ) : v(init) {}
-   
-   operator vector4double() const { return v; }  
-   double   operator[]( int i ) const { return v[i]; }
-   double & operator[]( int i )       { return v[i]; }
-   
-   vector4double v;
-};
-//typedef vector4double double4_t;
-
-inline const char * usedInstructionSet() { return "QPX"; }
-
-inline double4_t make_double4   ( double d, double c, double b, double a ) { return double4_t( (vector4double) {a,b,c,d} ); }
-inline double4_t make_double4_r ( double a, double b, double c, double d ) { return double4_t( (vector4double) {a,b,c,d} ); }
-
-inline double4_t make_double4   ( double a                               ) { return vec_splats(a); }
-
-inline double4_t make_zero()   { return vec_splats(0.0); }
-
-inline double4_t load_aligned  ( const double * mem_addr )          { return vec_ld(0ul, const_cast<double*>(mem_addr) ); }
-inline void      store_aligned ( double * mem_addr, double4_t a )   { vec_st( a, 0ul, mem_addr);                          }
-
-inline double4_t load_unaligned  ( const double * mem_addr )
-{
-   double * p =  const_cast<double*>( mem_addr );
-   vector4double v1,v2,xv,pctl;
-   v1 = vec_ld ( 0ul, p   );
-   v2 = vec_ld ( 0ul, p+4 );
-
-   pctl = vec_lvsl( 0, p );
-   xv   = vec_perm( v1, v2, pctl );
-   return xv;
-}
-
-inline void loadNeighbors( const double * mem_addr, double4_t & r_left, double4_t & r_center, double4_t & r_right )
-{
-    double * p =  const_cast<double*>( mem_addr );
-
-    vector4double left   = vec_ld(0, const_cast<double*>( p-4 ) );
-    vector4double center = vec_ld(0, const_cast<double*>( p   ) );
-    vector4double right  = vec_ld(0, const_cast<double*>( p+4 ) );
-
-    vector4double ctrlLeft  = vec_lvsl( 0, p-1 );
-    vector4double ctrlRight = vec_lvsl( 0, p+1 );
-    r_left   = vec_perm( left, center, ctrlLeft  );
-    r_center = center;
-    r_right  = vec_perm( center,right, ctrlRight );
-}
-
-inline double getComponent ( const double4_t & v, int i )           { return v[i]; }
-inline double getComponent ( const double4_t & v, unsigned long i ) { return v[i]; }
-
-inline bool   getBoolComponent ( const double4_t & v, int i           ) { return (reinterpret_cast<const uint64_t*>(&v)[i]) != 0; }
-inline bool   getBoolComponent ( const double4_t & v, unsigned long i ) { return (reinterpret_cast<const uint64_t*>(&v)[i]) != 0; }
-
-inline double4_t hadd( double4_t a,  double4_t b )
-{
-   //TODO is there an instruction for this?
-   return make_double4_r( a[0] + a[1],
-                          b[0] + b[1],
-                          a[2] + a[3],
-                          b[2] + b[3] );
-}
-
-inline double4_t horizontalSum ( double4_t a )
-{
-   //TODO is there an instruction for this?
-   return vec_splats( a[0] + a[1] + a[2] + a[3] );
-}
-
-inline double4_t exchangeLowerUpperHalf ( double4_t a )
-{
-   static const vector4double permCode = vec_gpci( 0x4c1 ); // hex for octal 2301
-   return vec_perm( a, a, permCode );
-}
-
-
-inline void extract( double4_t in, double4_t &d, double4_t &c, double4_t &b, double4_t & a )
-{
-   a = vec_splat( in, 0) ;
-   b = vec_splat( in, 1 );
-   c = vec_splat( in, 2 );
-   d = vec_splat( in, 3 );
-}
-
-inline double4_t rotateRight( double4_t a )
-{
-   static const vector4double permCode = vec_gpci( 0x298 ); // hex for octal 1230
-   return vec_perm( a, a, permCode );
-}
-
-inline double4_t rotateLeft( double4_t a )
-{
-   static const vector4double permCode = vec_gpci( 0x60a ); // hex for octal 3012
-   return vec_perm( a, a, permCode );
-}
-
-
-inline double4_t compareEQ( double4_t a, double4_t b ) {
-   return vec_cmpeq( a,b );
-}
-inline double4_t compareNEQ( double4_t a, double4_t b ) {
-   return vec_not( vec_cmpeq(a,b) );
-}
-inline double4_t compareGE( double4_t a, double4_t b ) {
-   return vec_not( vec_cmplt(a,b) );
-}
-inline double4_t compareLE( double4_t a, double4_t b ) {
-   return vec_not( vec_cmpgt(a,b) );
-}
-
-inline double4_t logicalAND( double4_t a, double4_t b ) {
-   return vec_and( a,b );
-}
-inline double4_t logicalOR( double4_t a, double4_t b ) {
-   return vec_or( a,b );
-}
-
-
-
-inline int movemask( double4_t a )
-{
-   int result = 0;
-   if ( a[3] == 1.0 ) result |= ( 1 << 3);
-   if ( a[2] == 1.0 ) result |= ( 1 << 2);
-   if ( a[1] == 1.0 ) result |= ( 1 << 1);
-   if ( a[0] == 1.0 ) result |= ( 1 << 0);
-   return result;
-}
-
-inline double4_t blendv( double4_t a, double4_t b, double4_t mask)
-{
-   return vec_sel( a,b, mask );
-}
-
-template<int mask>
-inline double4_t blend( double4_t a, double4_t b )
-{
-   const double m0 = mask & ( 1<<0 ) ? 1.0 : -1.0;
-   const double m1 = mask & ( 1<<1 ) ? 1.0 : -1.0;
-   const double m2 = mask & ( 1<<2 ) ? 1.0 : -1.0;
-   const double m3 = mask & ( 1<<3 ) ? 1.0 : -1.0;
-
-   const vector4double vec_mask = { m0,m1,m2,m3};
-   return vec_sel( a,b,vec_mask);
-}
-
-
-inline double4_t sqrt( double4_t a) {
-   return vec_swsqrt_nochk (a );
-}
-
-template< unsigned int numIter>
-inline double4_t invSqrt( double4_t y )
-{
-   return vec_rsqrte( y);
-}
-
-
-inline double4_t operator+( const double4_t & a, const double4_t & b ) { return vec_add ( a, b); }
-inline double4_t operator-( const double4_t & a, const double4_t & b ) { return vec_sub ( a, b); }
-inline double4_t operator*( const double4_t & a, const double4_t & b ) { return vec_mul  ( a, b); }
-inline double4_t operator/( const double4_t & a, const double4_t & b ) { return vec_swdivs_nochk ( a, b); }
-
-
-
-} // namespace qpx
-} // namespace simd
-} // namespace walberla
-
-
-
diff --git a/src/simd/SIMD.h b/src/simd/SIMD.h
index 2998cef34..52ac3ffd1 100644
--- a/src/simd/SIMD.h
+++ b/src/simd/SIMD.h
@@ -50,10 +50,6 @@
 #define WALBERLA_SIMD_SSE2_AVAILABLE 1
 #endif
 
-#ifdef __VECTOR4DOUBLE__
-#define WALBERLA_SIMD_QPX_AVAILABLE 1
-#endif
-
 
 
 
@@ -66,13 +62,6 @@
 
 #ifndef WALBERLA_SIMD_FORCE_SCALAR
 
-// QPX (IBM BlueGeneQ)
-#if defined( WALBERLA_SIMD_QPX_AVAILABLE ) && !defined( WALBERLA_USE_SIMD )
-#include "QPX.h"
-#define WALBERLA_USE_QPX 1
-#define WALBERLA_USE_SIMD 1
-#endif
-
 // AVX2 ( Intel Haswell )
 #if defined( WALBERLA_SIMD_AVX2_AVAILABLE )
 #include "AVX2.h"
@@ -156,12 +145,6 @@ template<typename T> struct is_vector4_type {  static const bool value = false;
 #endif
 
 
-#ifdef WALBERLA_USE_QPX
-   using namespace qpx;
-   template<> struct is_vector4_type<qpx::double4_t> {  static const bool value = true; };
-#endif
-
-
 #ifdef WALBERLA_USE_SCALAR_SIMD_EMULATION
    using namespace scalar;
    template<> struct is_vector4_type<scalar::double4_t> {  static const bool value = true; };
diff --git a/tests/simd/CMakeLists.txt b/tests/simd/CMakeLists.txt
index 843dd9315..64436a413 100644
--- a/tests/simd/CMakeLists.txt
+++ b/tests/simd/CMakeLists.txt
@@ -34,9 +34,3 @@ waLBerla_execute_test( NAME   SSE4_SSE2_Equivalence )
 waLBerla_compile_test( NAME   AVX_Scalar_Equivalence FILES SIMD_Equivalence.cpp  )
 set_property         ( TARGET AVX_Scalar_Equivalence PROPERTY COMPILE_FLAGS "${MarchNativeString} -DIS0_AVX -DIS1_SCALAR" )
 waLBerla_execute_test( NAME   AVX_Scalar_Equivalence )
-
-
-waLBerla_compile_test( NAME   QPX_Scalar_Equivalence FILE SIMD_Equivalence.cpp  )
-set_property         ( TARGET QPX_Scalar_Equivalence PROPERTY COMPILE_FLAGS "-DIS0_QPX -DIS1_SCALAR" )
-waLBerla_execute_test( NAME   QPX_Scalar_Equivalence )
-
diff --git a/tests/simd/SIMD_Equivalence.cpp b/tests/simd/SIMD_Equivalence.cpp
index 5aeec8ef8..f90c69f4f 100644
--- a/tests/simd/SIMD_Equivalence.cpp
+++ b/tests/simd/SIMD_Equivalence.cpp
@@ -52,10 +52,6 @@
 #include "simd/SSE2.h"
 #endif
 
-#ifdef WALBERLA_SIMD_QPX_AVAILABLE
-#include "simd/QPX.h"
-#endif
-
 #include "simd/Scalar.h"
 
 #include "simd/StreamOutput.h"
@@ -137,23 +133,6 @@
 #endif
 
 
-// ---------------- QPX   ------------
-#ifdef IS0_QPX
-#ifdef WALBERLA_SIMD_QPX_AVAILABLE
-#define is0 qpx
-#else
-#define is0 scalar
-#endif
-#endif
-
-#ifdef IS1_QPX
-#ifdef WALBERLA_SIMD_QPX_AVAILABLE
-#define is1 qpx
-#else
-#define is1 scalar
-#endif
-#endif
-
 // ---------------- Scalar -----------
 #ifdef IS0_SCALAR
 #define is0 scalar
-- 
GitLab