diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt index 0698ea06c8016d770db8ef29b690d5bde07e8e54..74d5e5c714ff810f60a454f7062c230635e928c2 100644 --- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt +++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt @@ -2,5 +2,7 @@ waLBerla_link_files_to_builddir( "*.prm" ) waLBerla_add_executable ( NAME UniformGridBenchmarkGPU - FILES UniformGridGPU.cpp UniformGridGPU.gen.py + FILES UniformGridGPU.cpp UniformGridGPU_LatticeModel.cpp + UniformGridGPU_LbKernel.cu UniformGridGPU_NoSlip.cu UniformGridGPU_UBB.cu + UniformGridGPU_PackInfo.cu DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk ) diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm index 53a3c051288f666379b4055e9e57e1b90827730b..2b340e0ef9434cc9fdc60c8bae2e643bb96670fc 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm @@ -2,7 +2,7 @@ Parameters { omega 1.8; - timesteps 10; + timesteps 1000; remainingTimeLoggerFrequency 3; vtkWriteFrequency 0; diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_LatticeModel.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LatticeModel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9f7a55cd5b3715a8c3c96f65fbd5b648caf38fee --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LatticeModel.cpp @@ -0,0 +1,552 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! 
\\author Martin Bauer <martin.bauer@fau.de> +//====================================================================================================================== + +#include <cmath> + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "lbm/field/PdfField.h" +#include "lbm/sweeps/Streaming.h" +#include "UniformGridGPU_LatticeModel.h" + +#ifdef _MSC_VER +# pragma warning( disable : 4458 ) +#endif + +#define FUNC_PREFIX + +using namespace std; + +namespace walberla { +namespace lbm { + +namespace internal_kernel_streamCollide { +static FUNC_PREFIX void kernel_streamCollide(double * const _data_pdfs, double * _data_pdfs_tmp, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, double omega) +{ + const double xi_1 = omega*0.166666666666667; + const double xi_5 = omega*0.0416666666666667; + for (int ctr_2 = 1; ctr_2 < _size_pdfs_2 - 1; ctr_2 += 1) + { + double * const _data_pdfs_21_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 17*_stride_pdfs_3; + double * const _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * const _data_pdfs_21_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 16*_stride_pdfs_3; + double * const _data_pdfs_2m1_314 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 14*_stride_pdfs_3; + double * const _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * const _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * const _data_pdfs_21_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 18*_stride_pdfs_3; + double * const _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * const _data_pdfs_2m1_311 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 11*_stride_pdfs_3; + double * const _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * const _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * const _data_pdfs_21_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 15*_stride_pdfs_3; + double * const _data_pdfs_2m1_313 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 13*_stride_pdfs_3; + double * const _data_pdfs_2m1_312 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 12*_stride_pdfs_3; + double * const _data_pdfs_2m1_35 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 5*_stride_pdfs_3; + double * const _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; + double * const _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * const _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * const _data_pdfs_21_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 6*_stride_pdfs_3; + double * _data_pdfs_tmp_20_30 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2; + double * _data_pdfs_tmp_20_31 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * _data_pdfs_tmp_20_32 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * _data_pdfs_tmp_20_33 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * _data_pdfs_tmp_20_34 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * _data_pdfs_tmp_20_35 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3; + double * _data_pdfs_tmp_20_36 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3; + double * 
_data_pdfs_tmp_20_37 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * _data_pdfs_tmp_20_38 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * _data_pdfs_tmp_20_39 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * _data_pdfs_tmp_20_310 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * _data_pdfs_tmp_20_311 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + double * _data_pdfs_tmp_20_312 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + double * _data_pdfs_tmp_20_313 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + double * _data_pdfs_tmp_20_314 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + double * _data_pdfs_tmp_20_315 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + double * _data_pdfs_tmp_20_316 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + double * _data_pdfs_tmp_20_317 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + double * _data_pdfs_tmp_20_318 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + for (int ctr_1 = 1; ctr_1 < _size_pdfs_1 - 1; ctr_1 += 1) + { + double * const _data_pdfs_21_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_317; + double * const _data_pdfs_20_39_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_39; + double * const _data_pdfs_21_316_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_316; + double * const _data_pdfs_2m1_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_314; + double * const _data_pdfs_20_310_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_310; + double * const _data_pdfs_20_38_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_38; + double * const _data_pdfs_21_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_318; + double * const _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double * const _data_pdfs_2m1_311_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_311; + double * const _data_pdfs_20_37_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_37; + double * const _data_pdfs_20_31_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_31; + double * const _data_pdfs_21_315_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_315; + double * const _data_pdfs_2m1_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_313; + double * const _data_pdfs_2m1_312_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_312; + double * const _data_pdfs_2m1_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_35; + double * const _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * const _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + double * const _data_pdfs_20_32_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_32; + double * const _data_pdfs_21_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_36; + double * _data_pdfs_tmp_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_30; + double * _data_pdfs_tmp_20_31_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_31; + double * _data_pdfs_tmp_20_32_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_32; + double * _data_pdfs_tmp_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_33; + double * _data_pdfs_tmp_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_34; + double * _data_pdfs_tmp_20_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_35; + double * _data_pdfs_tmp_20_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_36; + double * _data_pdfs_tmp_20_37_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_37; 
+ double * _data_pdfs_tmp_20_38_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_38; + double * _data_pdfs_tmp_20_39_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_39; + double * _data_pdfs_tmp_20_310_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_310; + double * _data_pdfs_tmp_20_311_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_311; + double * _data_pdfs_tmp_20_312_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_312; + double * _data_pdfs_tmp_20_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_313; + double * _data_pdfs_tmp_20_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_314; + double * _data_pdfs_tmp_20_315_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_315; + double * _data_pdfs_tmp_20_316_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_316; + double * _data_pdfs_tmp_20_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_317; + double * _data_pdfs_tmp_20_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_318; + for (int ctr_0 = 1; ctr_0 < _size_pdfs_0 - 1; ctr_0 += 1) + { + const double xi_18 = -_data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double xi_19 = -_data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double xi_20 = -_data_pdfs_21_316_11[_stride_pdfs_0*ctr_0]; + const double vel0Term = _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double vel1Term = _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0] + _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]; + const double vel2Term = _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; + const double rho = vel0Term + vel1Term + vel2Term + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0] + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0] + _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; + const double xi_27 = rho*-0.333333333333333; + const double u_0 = vel0Term + xi_18 + xi_19 - _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double xi_23 = (u_0*u_0); + const double u_1 = vel1Term + xi_19 + xi_20 - _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0] + _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0]; + const double xi_21 = -u_1; + const double xi_24 = (u_1*u_1); + const double u_2 = vel2Term + xi_18 + xi_20 - _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0] - _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double xi_22 = -u_2; + const double xi_25 = (u_2*u_2); + const double u0Mu1 = u_0 + xi_21; + const double u0Pu1 = u_0 + u_1; + const double u1Pu2 = u_1 + u_2; + const double u1Mu2 = u_1 + xi_22; + const double u0Mu2 = u_0 + 
xi_22; + const double u0Pu2 = u_0 + u_2; + const double f_eq_common = rho - xi_23 - xi_24 - xi_25; + const double xi_26 = f_eq_common + rho*-0.666666666666667; + const double xi_28 = f_eq_common + xi_25 + xi_27; + const double xi_29 = f_eq_common + xi_23 + xi_27; + const double xi_30 = f_eq_common + xi_24 + xi_27; + const double xi_2 = xi_24*2 + xi_26; + const double xi_3 = xi_23*2 + xi_26; + const double xi_4 = xi_25*2 + xi_26; + const double xi_6 = u0Mu1*2; + const double xi_7 = (u0Mu1*u0Mu1)*3 + xi_28; + const double xi_8 = u0Pu1*2; + const double xi_9 = (u0Pu1*u0Pu1)*3 + xi_28; + const double xi_10 = u1Pu2*2; + const double xi_11 = (u1Pu2*u1Pu2)*3 + xi_29; + const double xi_12 = u1Mu2*2; + const double xi_13 = (u1Mu2*u1Mu2)*3 + xi_29; + const double xi_14 = u0Mu2*2; + const double xi_15 = (u0Mu2*u0Mu2)*3 + xi_30; + const double xi_16 = u0Pu2*2; + const double xi_17 = (u0Pu2*u0Pu2)*3 + xi_30; + _data_pdfs_tmp_20_30_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.333333333333333 - _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_31_10[_stride_pdfs_0*ctr_0] = xi_1*(u_1 + xi_2 - 6*_data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_32_10[_stride_pdfs_0*ctr_0] = xi_1*(xi_2 + xi_21 - 6*_data_pdfs_20_32_11[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_33_10[_stride_pdfs_0*ctr_0] = xi_1*(-u_0 + xi_3 - 6*_data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_34_10[_stride_pdfs_0*ctr_0] = xi_1*(u_0 + xi_3 - 6*_data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_35_10[_stride_pdfs_0*ctr_0] = xi_1*(u_2 + xi_4 - 6*_data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_36_10[_stride_pdfs_0*ctr_0] = xi_1*(xi_22 + xi_4 - 6*_data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_37_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_6 + xi_7 - 24*_data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_38_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_8 + xi_9 - 24*_data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_39_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_8 + xi_9 - 24*_data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_310_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_6 + xi_7 - 24*_data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_311_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_10 + xi_11 - 24*_data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]) + _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_312_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_12 + xi_13 - 24*_data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0]) + _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_313_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_14 + xi_15 - 24*_data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_314_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_16 + xi_17 - 
24*_data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_315_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_12 + xi_13 - 24*_data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0]) + _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_316_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_10 + xi_11 - 24*_data_pdfs_21_316_11[_stride_pdfs_0*ctr_0]) + _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_317_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_16 + xi_17 - 24*_data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_318_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_14 + xi_15 - 24*_data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + } + } + } +} +} +namespace internal_kernel_collide { +static FUNC_PREFIX void kernel_collide(double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, double omega) +{ + const double xi_1 = omega*0.166666666666667; + const double xi_5 = omega*0.0416666666666667; + for (int ctr_2 = 1; ctr_2 < _size_pdfs_2 - 1; ctr_2 += 1) + { + double * _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + double * _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * _data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + double * _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + double * _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + double * _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + double * _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + double * _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + double * _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + double * _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3; + double * _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; + double * _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3; + for (int ctr_1 = 1; ctr_1 < _size_pdfs_1 - 1; ctr_1 += 1) + { + double * _data_pdfs_20_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_317; + double * _data_pdfs_20_39_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_39; + double * _data_pdfs_20_316_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_316; + double * _data_pdfs_20_310_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_310; + double * _data_pdfs_20_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_314; + double * _data_pdfs_20_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_318; + double * _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double 
* _data_pdfs_20_38_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_38; + double * _data_pdfs_20_31_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_31; + double * _data_pdfs_20_311_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_311; + double * _data_pdfs_20_315_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_315; + double * _data_pdfs_20_37_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_37; + double * _data_pdfs_20_312_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_312; + double * _data_pdfs_20_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_313; + double * _data_pdfs_20_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_35; + double * _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * _data_pdfs_20_32_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_32; + double * _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + double * _data_pdfs_20_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_36; + for (int ctr_0 = 1; ctr_0 < _size_pdfs_0 - 1; ctr_0 += 1) + { + const double xi_18 = -_data_pdfs_20_317_10[_stride_pdfs_0*ctr_0]; + const double xi_19 = -_data_pdfs_20_39_10[_stride_pdfs_0*ctr_0]; + const double xi_20 = -_data_pdfs_20_316_10[_stride_pdfs_0*ctr_0]; + const double vel0Term = _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_318_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0]; + const double vel1Term = _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0]; + const double vel2Term = _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0]; + const double rho = vel0Term + vel1Term + vel2Term + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0]; + const double xi_27 = rho*-0.333333333333333; + const double u_0 = vel0Term + xi_18 + xi_19 - _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0] - _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0] - _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0]; + const double xi_23 = (u_0*u_0); + const double u_1 = vel1Term + xi_19 + xi_20 - _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0] - _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0] - _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0]; + const double xi_21 = -u_1; + const double xi_24 = (u_1*u_1); + const double u_2 = vel2Term + xi_18 + xi_20 + _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0] - _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0] - _data_pdfs_20_318_10[_stride_pdfs_0*ctr_0] - _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0]; + const double xi_22 = -u_2; + const double xi_25 = (u_2*u_2); + const double u0Mu1 = u_0 + xi_21; + const double u0Pu1 = u_0 + u_1; + const double u1Pu2 = u_1 + u_2; + const double u1Mu2 = u_1 + xi_22; + const double u0Mu2 = u_0 + xi_22; + const double u0Pu2 = u_0 + u_2; + const double f_eq_common = rho - xi_23 - xi_24 - xi_25; + const double xi_26 = f_eq_common + rho*-0.666666666666667; + const double xi_28 = f_eq_common + xi_25 + xi_27; + const double xi_29 = f_eq_common + xi_23 + xi_27; + const double xi_30 = f_eq_common + xi_24 + xi_27; + const double xi_2 = xi_24*2 + xi_26; + const double xi_3 
= xi_23*2 + xi_26; + const double xi_4 = xi_25*2 + xi_26; + const double xi_6 = u0Mu1*2; + const double xi_7 = (u0Mu1*u0Mu1)*3 + xi_28; + const double xi_8 = u0Pu1*2; + const double xi_9 = (u0Pu1*u0Pu1)*3 + xi_28; + const double xi_10 = u1Pu2*2; + const double xi_11 = (u1Pu2*u1Pu2)*3 + xi_29; + const double xi_12 = u1Mu2*2; + const double xi_13 = (u1Mu2*u1Mu2)*3 + xi_29; + const double xi_14 = u0Mu2*2; + const double xi_15 = (u0Mu2*u0Mu2)*3 + xi_30; + const double xi_16 = u0Pu2*2; + const double xi_17 = (u0Pu2*u0Pu2)*3 + xi_30; + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.333333333333333 - _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0] = xi_1*(u_1 + xi_2 - 6*_data_pdfs_20_31_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0] = xi_1*(xi_2 + xi_21 - 6*_data_pdfs_20_32_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0] = xi_1*(-u_0 + xi_3 - 6*_data_pdfs_20_33_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0] = xi_1*(u_0 + xi_3 - 6*_data_pdfs_20_34_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0] = xi_1*(u_2 + xi_4 - 6*_data_pdfs_20_35_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0] = xi_1*(xi_22 + xi_4 - 6*_data_pdfs_20_36_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_6 + xi_7 - 24*_data_pdfs_20_37_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_8 + xi_9 - 24*_data_pdfs_20_38_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_8 + xi_9 - 24*_data_pdfs_20_39_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_6 + xi_7 - 24*_data_pdfs_20_310_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_10 + xi_11 - 24*_data_pdfs_20_311_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_12 + xi_13 - 24*_data_pdfs_20_312_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_14 + xi_15 - 24*_data_pdfs_20_313_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_16 + xi_17 - 24*_data_pdfs_20_314_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_12 + xi_13 - 24*_data_pdfs_20_315_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_10 + xi_11 - 24*_data_pdfs_20_316_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_16 + xi_17 - 24*_data_pdfs_20_317_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_318_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_14 + xi_15 - 24*_data_pdfs_20_318_10[_stride_pdfs_0*ctr_0]) + 
_data_pdfs_20_318_10[_stride_pdfs_0*ctr_0]; + } + } + } +} +} +namespace internal_kernel_stream { +static FUNC_PREFIX void kernel_stream(double * const _data_pdfs, double * _data_pdfs_tmp, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_pdfs_tmp_0, int64_t const _stride_pdfs_tmp_1, int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3) +{ + for (int ctr_2 = 1; ctr_2 < _size_pdfs_2 - 1; ctr_2 += 1) + { + double * _data_pdfs_tmp_20_30 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2; + double * const _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; + double * _data_pdfs_tmp_20_31 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + _stride_pdfs_tmp_3; + double * const _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * _data_pdfs_tmp_20_32 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 2*_stride_pdfs_tmp_3; + double * const _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * _data_pdfs_tmp_20_33 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 3*_stride_pdfs_tmp_3; + double * const _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * _data_pdfs_tmp_20_34 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 4*_stride_pdfs_tmp_3; + double * const _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * _data_pdfs_tmp_20_35 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 5*_stride_pdfs_tmp_3; + double * const _data_pdfs_2m1_35 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 5*_stride_pdfs_3; + double * _data_pdfs_tmp_20_36 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 6*_stride_pdfs_tmp_3; + double * const _data_pdfs_21_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 6*_stride_pdfs_3; + double * _data_pdfs_tmp_20_37 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 7*_stride_pdfs_tmp_3; + double * const _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * _data_pdfs_tmp_20_38 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 8*_stride_pdfs_tmp_3; + double * const _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * _data_pdfs_tmp_20_39 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 9*_stride_pdfs_tmp_3; + double * const _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * _data_pdfs_tmp_20_310 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 10*_stride_pdfs_tmp_3; + double * const _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * _data_pdfs_tmp_20_311 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 11*_stride_pdfs_tmp_3; + double * const _data_pdfs_2m1_311 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 11*_stride_pdfs_3; + double * _data_pdfs_tmp_20_312 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 12*_stride_pdfs_tmp_3; + double * const _data_pdfs_2m1_312 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 12*_stride_pdfs_3; + double * _data_pdfs_tmp_20_313 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 13*_stride_pdfs_tmp_3; + double * const _data_pdfs_2m1_313 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 13*_stride_pdfs_3; + double * _data_pdfs_tmp_20_314 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 14*_stride_pdfs_tmp_3; + double * const _data_pdfs_2m1_314 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 14*_stride_pdfs_3; + double * 
_data_pdfs_tmp_20_315 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 15*_stride_pdfs_tmp_3; + double * const _data_pdfs_21_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 15*_stride_pdfs_3; + double * _data_pdfs_tmp_20_316 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 16*_stride_pdfs_tmp_3; + double * const _data_pdfs_21_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 16*_stride_pdfs_3; + double * _data_pdfs_tmp_20_317 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 17*_stride_pdfs_tmp_3; + double * const _data_pdfs_21_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 17*_stride_pdfs_3; + double * _data_pdfs_tmp_20_318 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 18*_stride_pdfs_tmp_3; + double * const _data_pdfs_21_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 18*_stride_pdfs_3; + for (int ctr_1 = 1; ctr_1 < _size_pdfs_1 - 1; ctr_1 += 1) + { + double * _data_pdfs_tmp_20_30_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_30; + double * const _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * _data_pdfs_tmp_20_31_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_31; + double * const _data_pdfs_20_31_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_31; + double * _data_pdfs_tmp_20_32_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_32; + double * const _data_pdfs_20_32_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_32; + double * _data_pdfs_tmp_20_33_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_33; + double * const _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + double * _data_pdfs_tmp_20_34_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_34; + double * const _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double * _data_pdfs_tmp_20_35_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_35; + double * const _data_pdfs_2m1_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_35; + double * _data_pdfs_tmp_20_36_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_36; + double * const _data_pdfs_21_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_36; + double * _data_pdfs_tmp_20_37_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_37; + double * const _data_pdfs_20_37_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_37; + double * _data_pdfs_tmp_20_38_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_38; + double * const _data_pdfs_20_38_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_38; + double * _data_pdfs_tmp_20_39_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_39; + double * const _data_pdfs_20_39_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_39; + double * _data_pdfs_tmp_20_310_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_310; + double * const _data_pdfs_20_310_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_310; + double * _data_pdfs_tmp_20_311_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_311; + double * const _data_pdfs_2m1_311_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_311; + double * _data_pdfs_tmp_20_312_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_312; + double * const _data_pdfs_2m1_312_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_312; + double * _data_pdfs_tmp_20_313_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_313; + double * const _data_pdfs_2m1_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_313; + double * _data_pdfs_tmp_20_314_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_314; + double * const _data_pdfs_2m1_314_10 = _stride_pdfs_1*ctr_1 + 
_data_pdfs_2m1_314; + double * _data_pdfs_tmp_20_315_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_315; + double * const _data_pdfs_21_315_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_315; + double * _data_pdfs_tmp_20_316_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_316; + double * const _data_pdfs_21_316_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_316; + double * _data_pdfs_tmp_20_317_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_317; + double * const _data_pdfs_21_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_317; + double * _data_pdfs_tmp_20_318_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_318; + double * const _data_pdfs_21_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_318; + for (int ctr_0 = 1; ctr_0 < _size_pdfs_0 - 1; ctr_0 += 1) + { + _data_pdfs_tmp_20_30_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_31_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_32_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_33_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_34_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_35_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_36_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_37_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_38_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_39_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_310_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_311_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_312_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_313_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_314_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_315_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_316_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_317_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_318_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + } + } + } +} +} + + +const real_t UniformGridGPU_LatticeModel::w[19] = { 0.333333333333333,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778 }; +const real_t UniformGridGPU_LatticeModel::wInv[19] = { 
3.00000000000000,18.0000000000000,18.0000000000000,18.0000000000000,18.0000000000000,18.0000000000000,18.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000 }; + +void UniformGridGPU_LatticeModel::Sweep::streamCollide( IBlock * block, const uint_t numberOfGhostLayersToInclude ) +{ + auto pdfs = block->getData< GhostLayerField<double, 19> >(pdfsID); + GhostLayerField<double, 19> * pdfs_tmp; + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + + + auto & lm = dynamic_cast< lbm::PdfField<UniformGridGPU_LatticeModel> * > (pdfs)->latticeModel(); + lm.configureBlock(block); + + auto & omega = lm.omega; + WALBERLA_ASSERT_GREATER_EQUAL(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -cell_idx_c(numberOfGhostLayersToInclude) - 1, -cell_idx_c(numberOfGhostLayersToInclude) - 1, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); + double * _data_pdfs_tmp = pdfs_tmp->dataAt(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -cell_idx_c(numberOfGhostLayersToInclude) - 1, -cell_idx_c(numberOfGhostLayersToInclude) - 1, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(pdfs->xSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); + const int64_t _size_pdfs_0 = int64_t(pdfs->xSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(pdfs->ySize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); + const int64_t _size_pdfs_1 = int64_t(pdfs->ySize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(pdfs->zSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); + const int64_t _size_pdfs_2 = int64_t(pdfs->zSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + internal_kernel_streamCollide::kernel_streamCollide(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, omega); + pdfs->swapDataPointers(pdfs_tmp); + +} + +void UniformGridGPU_LatticeModel::Sweep::collide( IBlock * block, const uint_t numberOfGhostLayersToInclude ) +{ + auto pdfs = block->getData< GhostLayerField<double, 19> >(pdfsID); + + + auto & lm = dynamic_cast< lbm::PdfField<UniformGridGPU_LatticeModel> * > (pdfs)->latticeModel(); + lm.configureBlock(block); + + auto & omega = lm.omega; + WALBERLA_ASSERT_GREATER_EQUAL(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -cell_idx_c(numberOfGhostLayersToInclude) - 1, -cell_idx_c(numberOfGhostLayersToInclude) - 1, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(pdfs->xSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); + const int64_t _size_pdfs_0 = 
int64_t(pdfs->xSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(pdfs->ySize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); + const int64_t _size_pdfs_1 = int64_t(pdfs->ySize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(pdfs->zSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); + const int64_t _size_pdfs_2 = int64_t(pdfs->zSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + internal_kernel_collide::kernel_collide(_data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, omega); +} + + +void UniformGridGPU_LatticeModel::Sweep::stream( IBlock * block, const uint_t numberOfGhostLayersToInclude ) +{ + auto pdfs = block->getData< GhostLayerField<double, 19> >(pdfsID); + GhostLayerField<double, 19> * pdfs_tmp; + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + + + WALBERLA_ASSERT_GREATER_EQUAL(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -cell_idx_c(numberOfGhostLayersToInclude) - 1, -cell_idx_c(numberOfGhostLayersToInclude) - 1, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); + double * _data_pdfs_tmp = pdfs_tmp->dataAt(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -cell_idx_c(numberOfGhostLayersToInclude) - 1, -cell_idx_c(numberOfGhostLayersToInclude) - 1, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(pdfs->xSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); + const int64_t _size_pdfs_0 = int64_t(pdfs->xSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(pdfs->ySize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); + const int64_t _size_pdfs_1 = int64_t(pdfs->ySize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(pdfs->zSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); + const int64_t _size_pdfs_2 = int64_t(pdfs->zSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride()); + const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride()); + const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride()); + const int64_t _stride_pdfs_tmp_3 = int64_t(pdfs_tmp->fStride()); + internal_kernel_stream::kernel_stream(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3); + + pdfs->swapDataPointers(pdfs_tmp); + +} + + +} // 
namespace lbm +} // namespace walberla + + + + +// Buffer Packing + +namespace walberla { +namespace mpi { + +mpi::SendBuffer & operator<< (mpi::SendBuffer & buf, const ::walberla::lbm::UniformGridGPU_LatticeModel & lm) +{ + buf << lm.currentLevel; + return buf; +} + +mpi::RecvBuffer & operator>> (mpi::RecvBuffer & buf, ::walberla::lbm::UniformGridGPU_LatticeModel & lm) +{ + buf >> lm.currentLevel; + return buf; +} + + +} // namespace mpi +} // namespace walberla diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_LatticeModel.h b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LatticeModel.h new file mode 100644 index 0000000000000000000000000000000000000000..f48737a156a49ce4e2b60f7241a4aaafd581ff1c --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LatticeModel.h @@ -0,0 +1,737 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + + +#include "core/DataTypes.h" +#include "core/logging/Logging.h" + +#include "field/GhostLayerField.h" +#include "field/SwapableCompare.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "stencil/D3Q19.h" + +#include "lbm/lattice_model/EquilibriumDistribution.h" +#include "lbm/field/Density.h" +#include "lbm/field/DensityAndMomentumDensity.h" +#include "lbm/field/DensityAndVelocity.h" +#include "lbm/field/PressureTensor.h" +#include "lbm/field/ShearRate.h" + +#include <set> + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +#ifdef WALBERLA_CXX_COMPILER_IS_GNU +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-parameter" +#endif + + + + + +// Forward declarations +namespace walberla{ +namespace lbm { + class UniformGridGPU_LatticeModel; +}} +namespace walberla { +namespace mpi { + mpi::SendBuffer & operator<< (mpi::SendBuffer & buf, const ::walberla::lbm::UniformGridGPU_LatticeModel & lm); + mpi::RecvBuffer & operator>> (mpi::RecvBuffer & buf, ::walberla::lbm::UniformGridGPU_LatticeModel & lm); +}} + + + + +namespace walberla { +namespace lbm { + + +/** +UniformGridGPU_LatticeModel was generated with lbmpy. Do not edit this file directly. Instead modify UniformGridGPU_LatticeModel.py. +For details see the documentation of lbmpy. + +Usage: + - Create an instance of this lattice model class: the constructor parameters vary depending on the configured + lattice model. A model with constant force needs a single force vector, while a model with variable forces needs + a force field.
All constructor parameters are ordered alphabetically. + - Create a PDFField with the lattice model as template argument to store the particle distribution functions. + Use the PDFField to get and modify macroscopic values. + - The internal class UniformGridGPU_LatticeModel::Sweep is a functor to execute one LB time step. + The stream and collide steps can be executed separately, or together in an optimized stream-pull-collide scheme. + +*/ +class UniformGridGPU_LatticeModel +{ + +public: + typedef stencil::D3Q19 Stencil; + typedef stencil::D3Q19 CommunicationStencil; + static const real_t w[19]; + static const real_t wInv[19]; + + static const bool compressible = false; + static const int equilibriumAccuracyOrder = 2; + + class Sweep + { + public: + Sweep( BlockDataID _pdfsID ) : pdfsID(_pdfsID) {}; + + //void stream ( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) ); + void collide ( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) ); + void streamCollide( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) ); + void stream ( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) ); + + void operator() ( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) ) + { + streamCollide( block, numberOfGhostLayersToInclude ); + } + + private: + BlockDataID pdfsID; + + std::set< GhostLayerField<double, 19> *, field::SwapableCompare< GhostLayerField<double, 19> * > > cache_pdfs_; + }; + + UniformGridGPU_LatticeModel( double omega_ ) + : omega(omega_), currentLevel(0) + {}; + + void configure( IBlock & block, StructuredBlockStorage &) { configureBlock( &block ); } + +private: + void configureBlock(IBlock * block) + { + + + + } + + // Parameters: + double omega; + + // Updated by configureBlock: + + + uint_t currentLevel; + + // Backend classes can access private members: + friend class UniformGridGPU_LatticeModel::Sweep; + template<class LM, class Enable> friend class EquilibriumDistribution; + template<class LM, class Enable> friend struct Equilibrium; + template<class LM, class Enable> friend struct internal::AdaptVelocityToForce; + template<class LM, class Enable> friend struct Density; + template<class LM> friend struct DensityAndVelocity; + template<class LM, class Enable> friend struct DensityAndMomentumDensity; + template<class LM, class Enable> friend struct MomentumDensity; + template<class LM, class It, class Enable> friend struct DensityAndVelocityRange; + + friend mpi::SendBuffer & ::walberla::mpi::operator<< (mpi::SendBuffer & , const UniformGridGPU_LatticeModel & ); + friend mpi::RecvBuffer & ::walberla::mpi::operator>> (mpi::RecvBuffer & , UniformGridGPU_LatticeModel & ); + +}; + + + + +//====================================================================================================================== +// +// Implementation of macroscopic value backend +// +//====================================================================================================================== + + + +template<> +class EquilibriumDistribution< UniformGridGPU_LatticeModel, void> +{ +public: + typedef typename UniformGridGPU_LatticeModel::Stencil Stencil; + + static real_t get( const stencil::Direction direction, + const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ), + real_t rho = real_t(1.0) ) + { + + rho -= real_t(1.0); + + + using namespace stencil; + switch( direction ) { + case C: return rho*0.333333333333333 - 0.333333333333333*(u[0]*u[0]) - 
0.333333333333333*(u[1]*u[1]) - 0.333333333333333*(u[2]*u[2]); + case N: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*u[1]; + case S: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*u[1] - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]); + case W: return rho*0.0555555555555556 - 0.166666666666667*u[0] - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]); + case E: return rho*0.0555555555555556 - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]) + 0.166666666666667*u[0]; + case T: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*u[2]; + case B: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*u[2] + 0.166666666666667*(u[2]*u[2]); + case NW: return rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1]; + case NE: return rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.25*u[0]*u[1]; + case SW: return rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.0833333333333333*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.25*u[0]*u[1]; + case SE: return rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[1]*u[1]); + case TN: return rho*0.0277777777777778 + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2] + 0.25*u[1]*u[2]; + case TS: return rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2]; + case TW: return rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2]; + case TE: return rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2] + 0.25*u[0]*u[2]; + case BN: return rho*0.0277777777777778 - 0.0833333333333333*u[2] - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.0833333333333333*(u[2]*u[2]); + case BS: return rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.0833333333333333*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[1]*u[2]; + case BW: return rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.0833333333333333*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[0]*u[2]; + case BE: return rho*0.0277777777777778 - 0.0833333333333333*u[2] - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[2]*u[2]); + default: + WALBERLA_ABORT("Invalid Direction"); + } + + } + + static real_t getSymmetricPart( const stencil::Direction direction, + const Vector3<real_t> & u = Vector3< real_t >(real_t(0.0)), + real_t rho = real_t(1.0) ) + { + + rho -= real_t(1.0); + + + using namespace stencil; + switch( direction ) { + case C: return rho*0.333333333333333 - 0.333333333333333*(u[0]*u[0]) - 
0.333333333333333*(u[1]*u[1]) - 0.333333333333333*(u[2]*u[2]); + case N: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]); + case S: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]); + case W: return rho*0.0555555555555556 - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]); + case E: return rho*0.0555555555555556 - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]); + case T: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*(u[2]*u[2]); + case B: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*(u[2]*u[2]); + case NW: return rho*0.0277777777777778 - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]); + case NE: return rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.25*u[0]*u[1]; + case SW: return rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.25*u[0]*u[1]; + case SE: return rho*0.0277777777777778 - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]); + case TN: return rho*0.0277777777777778 + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[1]*u[2]; + case TS: return rho*0.0277777777777778 - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]); + case TW: return rho*0.0277777777777778 - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]); + case TE: return rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[0]*u[2]; + case BN: return rho*0.0277777777777778 - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]); + case BS: return rho*0.0277777777777778 + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[1]*u[2]; + case BW: return rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[0]*u[2]; + case BE: return rho*0.0277777777777778 - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]); + default: + WALBERLA_ABORT("Invalid Direction"); + } + + } + + static real_t getAsymmetricPart( const stencil::Direction direction, + const Vector3< real_t > & u = Vector3<real_t>( real_t(0.0) ), + real_t rho = real_t(1.0) ) + { + + rho -= real_t(1.0); + + + using namespace stencil; + switch( direction ) { + case C: return 0; + case N: return 0.166666666666667*u[1]; + case S: return -0.166666666666667*u[1]; + case W: return -0.166666666666667*u[0]; + case E: return 0.166666666666667*u[0]; + case T: return 0.166666666666667*u[2]; + case B: return -0.166666666666667*u[2]; + case NW: return -0.0833333333333333*u[0] + 0.0833333333333333*u[1]; + case NE: return 0.0833333333333333*u[0] + 0.0833333333333333*u[1]; + case SW: return -0.0833333333333333*u[0] - 0.0833333333333333*u[1]; + case SE: return -0.0833333333333333*u[1] + 0.0833333333333333*u[0]; + case TN: return 0.0833333333333333*u[1] + 0.0833333333333333*u[2]; + case TS: return -0.0833333333333333*u[1] + 0.0833333333333333*u[2]; + case TW: return -0.0833333333333333*u[0] + 0.0833333333333333*u[2]; + case TE: return 0.0833333333333333*u[0] + 0.0833333333333333*u[2]; + case BN: return 
-0.0833333333333333*u[2] + 0.0833333333333333*u[1]; + case BS: return -0.0833333333333333*u[1] - 0.0833333333333333*u[2]; + case BW: return -0.0833333333333333*u[0] - 0.0833333333333333*u[2]; + case BE: return -0.0833333333333333*u[2] + 0.0833333333333333*u[0]; + default: + WALBERLA_ABORT("Invalid Direction"); + } + + } + + static std::vector< real_t > get( const Vector3< real_t > & u = Vector3<real_t>( real_t(0.0) ), + real_t rho = real_t(1.0) ) + { + + rho -= real_t(1.0); + + + std::vector< real_t > equilibrium( Stencil::Size ); + for( auto d = Stencil::begin(); d != Stencil::end(); ++d ) + { + equilibrium[d.toIdx()] = get(*d, u, rho); + } + return equilibrium; + } +}; + + +namespace internal { + +template<> +struct AdaptVelocityToForce<UniformGridGPU_LatticeModel, void> +{ + template< typename FieldPtrOrIterator > + static Vector3<real_t> get( FieldPtrOrIterator & it, const UniformGridGPU_LatticeModel & lm, + const Vector3< real_t > & velocity, const real_t rho ) + { + auto x = it.x(); + auto y = it.y(); + auto z = it.z(); + + return velocity; + + } + + static Vector3<real_t> get( const cell_idx_t x, const cell_idx_t y, const cell_idx_t z, const UniformGridGPU_LatticeModel & lm, + const Vector3< real_t > & velocity, const real_t rho ) + { + + return velocity; + + } +}; +} // namespace internal + + + +template<> +struct Equilibrium< UniformGridGPU_LatticeModel, void > +{ + + template< typename FieldPtrOrIterator > + static void set( FieldPtrOrIterator & it, + const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ), real_t rho = real_t(1.0) ) + { + + rho -= real_t(1.0); + + + it[0] = rho*0.333333333333333 - 0.333333333333333*(u[0]*u[0]) - 0.333333333333333*(u[1]*u[1]) - 0.333333333333333*(u[2]*u[2]); + it[1] = rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*u[1]; + it[2] = rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*u[1] - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]); + it[3] = rho*0.0555555555555556 - 0.166666666666667*u[0] - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]); + it[4] = rho*0.0555555555555556 - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]) + 0.166666666666667*u[0]; + it[5] = rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*u[2]; + it[6] = rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*u[2] + 0.166666666666667*(u[2]*u[2]); + it[7] = rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1]; + it[8] = rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.25*u[0]*u[1]; + it[9] = rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.0833333333333333*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.25*u[0]*u[1]; + it[10] = rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[1]*u[1]); + it[11] = rho*0.0277777777777778 + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2] + 0.25*u[1]*u[2]; + it[12] = rho*0.0277777777777778 - 
0.0833333333333333*u[1] - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2]; + it[13] = rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2]; + it[14] = rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2] + 0.25*u[0]*u[2]; + it[15] = rho*0.0277777777777778 - 0.0833333333333333*u[2] - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.0833333333333333*(u[2]*u[2]); + it[16] = rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.0833333333333333*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[1]*u[2]; + it[17] = rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.0833333333333333*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[0]*u[2]; + it[18] = rho*0.0277777777777778 - 0.0833333333333333*u[2] - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[2]*u[2]); + } + + template< typename PdfField_T > + static void set( PdfField_T & pdf, const cell_idx_t x, const cell_idx_t y, const cell_idx_t z, + const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ), real_t rho = real_t(1.0) ) + { + + rho -= real_t(1.0); + + + real_t & xyz0 = pdf(x,y,z,0); + pdf.getF( &xyz0, 0)= rho*0.333333333333333 - 0.333333333333333*(u[0]*u[0]) - 0.333333333333333*(u[1]*u[1]) - 0.333333333333333*(u[2]*u[2]); + pdf.getF( &xyz0, 1)= rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*u[1]; + pdf.getF( &xyz0, 2)= rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*u[1] - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]); + pdf.getF( &xyz0, 3)= rho*0.0555555555555556 - 0.166666666666667*u[0] - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]); + pdf.getF( &xyz0, 4)= rho*0.0555555555555556 - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]) + 0.166666666666667*u[0]; + pdf.getF( &xyz0, 5)= rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*u[2]; + pdf.getF( &xyz0, 6)= rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*u[2] + 0.166666666666667*(u[2]*u[2]); + pdf.getF( &xyz0, 7)= rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1]; + pdf.getF( &xyz0, 8)= rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.25*u[0]*u[1]; + pdf.getF( &xyz0, 9)= rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.0833333333333333*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.25*u[0]*u[1]; + pdf.getF( &xyz0, 10)= rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[1]*u[1]); + pdf.getF( &xyz0, 11)= rho*0.0277777777777778 + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2] + 0.25*u[1]*u[2]; + pdf.getF( &xyz0, 12)= 
rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2]; + pdf.getF( &xyz0, 13)= rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2]; + pdf.getF( &xyz0, 14)= rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2] + 0.25*u[0]*u[2]; + pdf.getF( &xyz0, 15)= rho*0.0277777777777778 - 0.0833333333333333*u[2] - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.0833333333333333*(u[2]*u[2]); + pdf.getF( &xyz0, 16)= rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.0833333333333333*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[1]*u[2]; + pdf.getF( &xyz0, 17)= rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.0833333333333333*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[0]*u[2]; + pdf.getF( &xyz0, 18)= rho*0.0277777777777778 - 0.0833333333333333*u[2] - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[2]*u[2]); + } +}; + + +template<> +struct Density<UniformGridGPU_LatticeModel, void> +{ + template< typename FieldPtrOrIterator > + static inline real_t get( const UniformGridGPU_LatticeModel & , const FieldPtrOrIterator & it ) + { + const real_t f_0 = it[0]; + const real_t f_1 = it[1]; + const real_t f_2 = it[2]; + const real_t f_3 = it[3]; + const real_t f_4 = it[4]; + const real_t f_5 = it[5]; + const real_t f_6 = it[6]; + const real_t f_7 = it[7]; + const real_t f_8 = it[8]; + const real_t f_9 = it[9]; + const real_t f_10 = it[10]; + const real_t f_11 = it[11]; + const real_t f_12 = it[12]; + const real_t f_13 = it[13]; + const real_t f_14 = it[14]; + const real_t f_15 = it[15]; + const real_t f_16 = it[16]; + const real_t f_17 = it[17]; + const real_t f_18 = it[18]; + const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; + const double vel1Term = f_1 + f_11 + f_15 + f_7; + const double vel2Term = f_12 + f_13 + f_5; + const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term + 1; + return rho; + } + + template< typename PdfField_T > + static inline real_t get( const UniformGridGPU_LatticeModel & , + const PdfField_T & pdf, const cell_idx_t x, const cell_idx_t y, const cell_idx_t z ) + { + const real_t & xyz0 = pdf(x,y,z,0); + const real_t f_0 = pdf.getF( &xyz0, 0); + const real_t f_1 = pdf.getF( &xyz0, 1); + const real_t f_2 = pdf.getF( &xyz0, 2); + const real_t f_3 = pdf.getF( &xyz0, 3); + const real_t f_4 = pdf.getF( &xyz0, 4); + const real_t f_5 = pdf.getF( &xyz0, 5); + const real_t f_6 = pdf.getF( &xyz0, 6); + const real_t f_7 = pdf.getF( &xyz0, 7); + const real_t f_8 = pdf.getF( &xyz0, 8); + const real_t f_9 = pdf.getF( &xyz0, 9); + const real_t f_10 = pdf.getF( &xyz0, 10); + const real_t f_11 = pdf.getF( &xyz0, 11); + const real_t f_12 = pdf.getF( &xyz0, 12); + const real_t f_13 = pdf.getF( &xyz0, 13); + const real_t f_14 = pdf.getF( &xyz0, 14); + const real_t f_15 = pdf.getF( &xyz0, 15); + const real_t f_16 = pdf.getF( &xyz0, 16); + const real_t f_17 = pdf.getF( &xyz0, 17); + const real_t f_18 = pdf.getF( &xyz0, 18); + const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; + const double vel1Term = f_1 + f_11 + f_15 + f_7; + const double vel2Term = f_12 + f_13 + f_5; + const double rho = f_0 + f_16 + f_17 + f_2 + f_3 
+ f_6 + f_9 + vel0Term + vel1Term + vel2Term + 1; + return rho; + } +}; + + +template<> +struct DensityAndVelocity<UniformGridGPU_LatticeModel> +{ + template< typename FieldPtrOrIterator > + static void set( FieldPtrOrIterator & it, const UniformGridGPU_LatticeModel & lm, + const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ), const real_t rho_in = real_t(1.0) ) + { + auto x = it.x(); + auto y = it.y(); + auto z = it.z(); + + const double rho = rho_in - 1; + const double u_0 = u[0]; + const double u_1 = u[1]; + const double u_2 = u[2]; + + + Equilibrium<UniformGridGPU_LatticeModel>::set(it, Vector3<real_t>(u_0, u_1, u_2), rho + real_t(1) ); + } + + template< typename PdfField_T > + static void set( PdfField_T & pdf, const cell_idx_t x, const cell_idx_t y, const cell_idx_t z, const UniformGridGPU_LatticeModel & lm, + const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ), const real_t rho_in = real_t(1.0) ) + { + const double rho = rho_in - 1; + const double u_0 = u[0]; + const double u_1 = u[1]; + const double u_2 = u[2]; + + + Equilibrium<UniformGridGPU_LatticeModel>::set(pdf, x, y, z, Vector3<real_t>(u_0, u_1, u_2), rho + real_t(1) ); + } +}; + + +template<typename FieldIteratorXYZ > +struct DensityAndVelocityRange<UniformGridGPU_LatticeModel, FieldIteratorXYZ> +{ + + static void set( FieldIteratorXYZ & begin, const FieldIteratorXYZ & end, const UniformGridGPU_LatticeModel & lm, + const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ), const real_t rho_in = real_t(1.0) ) + { + for( auto cellIt = begin; cellIt != end; ++cellIt ) + { + const auto x = cellIt.x(); + const auto y = cellIt.y(); + const auto z = cellIt.z(); + const double rho = rho_in - 1; + const double u_0 = u[0]; + const double u_1 = u[1]; + const double u_2 = u[2]; + + + Equilibrium<UniformGridGPU_LatticeModel>::set(cellIt, Vector3<real_t>(u_0, u_1, u_2), rho + real_t(1) ); + } + } +}; + + + +template<> +struct DensityAndMomentumDensity<UniformGridGPU_LatticeModel> +{ + template< typename FieldPtrOrIterator > + static real_t get( Vector3< real_t > & momentumDensity, const UniformGridGPU_LatticeModel & lm, + const FieldPtrOrIterator & it ) + { + const auto x = it.x(); + const auto y = it.y(); + const auto z = it.z(); + + const real_t f_0 = it[0]; + const real_t f_1 = it[1]; + const real_t f_2 = it[2]; + const real_t f_3 = it[3]; + const real_t f_4 = it[4]; + const real_t f_5 = it[5]; + const real_t f_6 = it[6]; + const real_t f_7 = it[7]; + const real_t f_8 = it[8]; + const real_t f_9 = it[9]; + const real_t f_10 = it[10]; + const real_t f_11 = it[11]; + const real_t f_12 = it[12]; + const real_t f_13 = it[13]; + const real_t f_14 = it[14]; + const real_t f_15 = it[15]; + const real_t f_16 = it[16]; + const real_t f_17 = it[17]; + const real_t f_18 = it[18]; + const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; + const double vel1Term = f_1 + f_11 + f_15 + f_7; + const double vel2Term = f_12 + f_13 + f_5; + const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term + 1; + const double md_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term; + const double md_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term; + const double md_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term; + momentumDensity[0] = md_0; + momentumDensity[1] = md_1; + momentumDensity[2] = md_2; + + return rho; + } + + template< typename PdfField_T > + static real_t get( Vector3< real_t > & momentumDensity, const UniformGridGPU_LatticeModel & lm, const PdfField_T & pdf, + const cell_idx_t 
x, const cell_idx_t y, const cell_idx_t z ) + { + const real_t & xyz0 = pdf(x,y,z,0); + const real_t f_0 = pdf.getF( &xyz0, 0); + const real_t f_1 = pdf.getF( &xyz0, 1); + const real_t f_2 = pdf.getF( &xyz0, 2); + const real_t f_3 = pdf.getF( &xyz0, 3); + const real_t f_4 = pdf.getF( &xyz0, 4); + const real_t f_5 = pdf.getF( &xyz0, 5); + const real_t f_6 = pdf.getF( &xyz0, 6); + const real_t f_7 = pdf.getF( &xyz0, 7); + const real_t f_8 = pdf.getF( &xyz0, 8); + const real_t f_9 = pdf.getF( &xyz0, 9); + const real_t f_10 = pdf.getF( &xyz0, 10); + const real_t f_11 = pdf.getF( &xyz0, 11); + const real_t f_12 = pdf.getF( &xyz0, 12); + const real_t f_13 = pdf.getF( &xyz0, 13); + const real_t f_14 = pdf.getF( &xyz0, 14); + const real_t f_15 = pdf.getF( &xyz0, 15); + const real_t f_16 = pdf.getF( &xyz0, 16); + const real_t f_17 = pdf.getF( &xyz0, 17); + const real_t f_18 = pdf.getF( &xyz0, 18); + const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; + const double vel1Term = f_1 + f_11 + f_15 + f_7; + const double vel2Term = f_12 + f_13 + f_5; + const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term + 1; + const double md_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term; + const double md_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term; + const double md_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term; + momentumDensity[0] = md_0; + momentumDensity[1] = md_1; + momentumDensity[2] = md_2; + + return rho; + } +}; + + +template<> +struct MomentumDensity< UniformGridGPU_LatticeModel> +{ + template< typename FieldPtrOrIterator > + static void get( Vector3< real_t > & momentumDensity, const UniformGridGPU_LatticeModel & lm, const FieldPtrOrIterator & it ) + { + const auto x = it.x(); + const auto y = it.y(); + const auto z = it.z(); + + const real_t f_0 = it[0]; + const real_t f_1 = it[1]; + const real_t f_2 = it[2]; + const real_t f_3 = it[3]; + const real_t f_4 = it[4]; + const real_t f_5 = it[5]; + const real_t f_6 = it[6]; + const real_t f_7 = it[7]; + const real_t f_8 = it[8]; + const real_t f_9 = it[9]; + const real_t f_10 = it[10]; + const real_t f_11 = it[11]; + const real_t f_12 = it[12]; + const real_t f_13 = it[13]; + const real_t f_14 = it[14]; + const real_t f_15 = it[15]; + const real_t f_16 = it[16]; + const real_t f_17 = it[17]; + const real_t f_18 = it[18]; + const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; + const double vel1Term = f_1 + f_11 + f_15 + f_7; + const double vel2Term = f_12 + f_13 + f_5; + const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term + 1; + const double md_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term; + const double md_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term; + const double md_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term; + momentumDensity[0] = md_0; + momentumDensity[1] = md_1; + momentumDensity[2] = md_2; + + } + + template< typename PdfField_T > + static void get( Vector3< real_t > & momentumDensity, const UniformGridGPU_LatticeModel & lm, const PdfField_T & pdf, + const cell_idx_t x, const cell_idx_t y, const cell_idx_t z ) + { + const real_t & xyz0 = pdf(x,y,z,0); + const real_t f_0 = pdf.getF( &xyz0, 0); + const real_t f_1 = pdf.getF( &xyz0, 1); + const real_t f_2 = pdf.getF( &xyz0, 2); + const real_t f_3 = pdf.getF( &xyz0, 3); + const real_t f_4 = pdf.getF( &xyz0, 4); + const real_t f_5 = pdf.getF( &xyz0, 5); + const real_t f_6 = pdf.getF( &xyz0, 6); + const real_t f_7 = pdf.getF( &xyz0, 7); + const real_t f_8 = pdf.getF( 
&xyz0, 8); + const real_t f_9 = pdf.getF( &xyz0, 9); + const real_t f_10 = pdf.getF( &xyz0, 10); + const real_t f_11 = pdf.getF( &xyz0, 11); + const real_t f_12 = pdf.getF( &xyz0, 12); + const real_t f_13 = pdf.getF( &xyz0, 13); + const real_t f_14 = pdf.getF( &xyz0, 14); + const real_t f_15 = pdf.getF( &xyz0, 15); + const real_t f_16 = pdf.getF( &xyz0, 16); + const real_t f_17 = pdf.getF( &xyz0, 17); + const real_t f_18 = pdf.getF( &xyz0, 18); + const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; + const double vel1Term = f_1 + f_11 + f_15 + f_7; + const double vel2Term = f_12 + f_13 + f_5; + const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term + 1; + const double md_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term; + const double md_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term; + const double md_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term; + momentumDensity[0] = md_0; + momentumDensity[1] = md_1; + momentumDensity[2] = md_2; + + } +}; + + +template<> +struct PressureTensor<UniformGridGPU_LatticeModel> +{ + template< typename FieldPtrOrIterator > + static void get( Matrix3< real_t > & /* pressureTensor */, const UniformGridGPU_LatticeModel & /* latticeModel */, const FieldPtrOrIterator & /* it */ ) + { + WALBERLA_ABORT("Not implemented"); + } + + template< typename PdfField_T > + static void get( Matrix3< real_t > & /* pressureTensor */, const UniformGridGPU_LatticeModel & /* latticeModel */, const PdfField_T & /* pdf */, + const cell_idx_t /* x */, const cell_idx_t /* y */, const cell_idx_t /* z */ ) + { + WALBERLA_ABORT("Not implemented"); + } +}; + + +template<> +struct ShearRate<UniformGridGPU_LatticeModel> +{ + template< typename FieldPtrOrIterator > + static inline real_t get( const UniformGridGPU_LatticeModel & /* latticeModel */, const FieldPtrOrIterator & /* it */, + const Vector3< real_t > & /* velocity */, const real_t /* rho */) + { + WALBERLA_ABORT("Not implemented"); + return real_t(0.0); + } + + template< typename PdfField_T > + static inline real_t get( const UniformGridGPU_LatticeModel & latticeModel, + const PdfField_T & /* pdf */, const cell_idx_t /* x */, const cell_idx_t /* y */, const cell_idx_t /* z */, + const Vector3< real_t > & /* velocity */, const real_t /* rho */ ) + { + WALBERLA_ABORT("Not implemented"); + return real_t(0.0); + } + + static inline real_t get( const std::vector< real_t > & /* nonEquilibrium */, const real_t /* relaxationParam */, + const real_t /* rho */ = real_t(1) ) + { + WALBERLA_ABORT("Not implemented"); + return real_t(0.0); + } +}; + + +} // namespace lbm +} // namespace walberla + + + +#ifdef WALBERLA_CXX_COMPILER_IS_GNU +#pragma GCC diagnostic pop +#endif \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_LbKernel.cu b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LbKernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..6bc2cb24ef5c3c215334a6db9bb0381e2642fcc3 --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LbKernel.cu @@ -0,0 +1,324 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
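+// Note on the PDF storage convention shared by these generated files: the PDFs
+// are stored relative to the lattice weights, i.e. f_i holds the deviation
+// from w_i. The macroscopic getters in UniformGridGPU_LatticeModel.cpp above
+// therefore reconstruct the physical density as rho = 1 + sum_i f_i, while the
+// stream-collide kernel below works with the deviation density directly (its
+// local "rho" carries no "+ 1").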
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file .cpp +//! \\ingroup lbm +//! \\author lbmpy +//====================================================================================================================== + +#include <cmath> + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "UniformGridGPU_LbKernel.h" + + +#define FUNC_PREFIX __global__ + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wfloat-equal" +# pragma GCC diagnostic ignored "-Wshadow" +# pragma GCC diagnostic ignored "-Wconversion" +#endif + +using namespace std; + +namespace walberla { +namespace pystencils { + +namespace internal_UniformGridGPU_LbKernel { +static FUNC_PREFIX void UniformGridGPU_LbKernel(double * const _data_pdfs, double * _data_pdfs_tmp, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, double omega) +{ + if (blockDim.x*blockIdx.x + threadIdx.x + 1 < _size_pdfs_0 - 1 && blockDim.y*blockIdx.y + threadIdx.y + 1 < _size_pdfs_1 - 1 && blockDim.z*blockIdx.z + threadIdx.z + 1 < _size_pdfs_2 - 1) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x + 1; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y + 1; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z + 1; + double * const _data_pdfs_10_21_317 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 17*_stride_pdfs_3; + const double xi_18 = -_data_pdfs_10_21_317[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + double * const _data_pdfs_11_20_39 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + const double xi_19 = -_data_pdfs_11_20_39[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + double * const _data_pdfs_11_21_316 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 16*_stride_pdfs_3; + const double xi_20 = -_data_pdfs_11_21_316[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_2m1_314 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 14*_stride_pdfs_3; + double * const _data_pdfs_11_20_310 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * const _data_pdfs_1m1_20_38 = _data_pdfs + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * const _data_pdfs_10_21_318 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 18*_stride_pdfs_3; + double * const _data_pdfs_10_20_34 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + const double vel0Term = _data_pdfs_10_20_34[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_10_21_318[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_10_2m1_314[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_11_20_310[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_1m1_20_38[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + double * const _data_pdfs_1m1_2m1_311 = _data_pdfs + 
_stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 11*_stride_pdfs_3; + double * const _data_pdfs_1m1_20_37 = _data_pdfs + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * const _data_pdfs_1m1_20_31 = _data_pdfs + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * const _data_pdfs_1m1_21_315 = _data_pdfs + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 15*_stride_pdfs_3; + const double vel1Term = _data_pdfs_1m1_20_31[_stride_pdfs_0*ctr_0] + _data_pdfs_1m1_20_37[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_1m1_21_315[_stride_pdfs_0*ctr_0] + _data_pdfs_1m1_2m1_311[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_2m1_313 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 13*_stride_pdfs_3; + double * const _data_pdfs_11_2m1_312 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 12*_stride_pdfs_3; + double * const _data_pdfs_10_2m1_35 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 5*_stride_pdfs_3; + const double vel2Term = _data_pdfs_10_2m1_313[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_10_2m1_35[_stride_pdfs_0*ctr_0] + _data_pdfs_11_2m1_312[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_30 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2; + double * const _data_pdfs_10_20_33 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * const _data_pdfs_11_20_32 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * const _data_pdfs_10_21_36 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 6*_stride_pdfs_3; + const double rho = vel0Term + vel1Term + vel2Term + _data_pdfs_10_20_30[_stride_pdfs_0*ctr_0] + _data_pdfs_10_20_33[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_10_21_317[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_10_21_36[_stride_pdfs_0*ctr_0] + _data_pdfs_11_20_32[_stride_pdfs_0*ctr_0] + _data_pdfs_11_20_39[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_11_21_316[_stride_pdfs_0*ctr_0]; + const double xi_27 = rho*-0.333333333333333; + const double u_0 = vel0Term + xi_18 + xi_19 - _data_pdfs_10_20_33[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - _data_pdfs_10_2m1_313[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - _data_pdfs_1m1_20_37[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double xi_23 = (u_0*u_0); + const double u_1 = vel1Term + xi_19 + xi_20 - _data_pdfs_11_20_310[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - _data_pdfs_11_20_32[_stride_pdfs_0*ctr_0] - _data_pdfs_11_2m1_312[_stride_pdfs_0*ctr_0] + _data_pdfs_1m1_20_38[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double xi_21 = -u_1; + const double xi_24 = (u_1*u_1); + const double u_2 = vel2Term + xi_18 + xi_20 - _data_pdfs_10_21_318[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - _data_pdfs_10_21_36[_stride_pdfs_0*ctr_0] + _data_pdfs_10_2m1_314[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - _data_pdfs_1m1_21_315[_stride_pdfs_0*ctr_0] + _data_pdfs_1m1_2m1_311[_stride_pdfs_0*ctr_0]; + const double xi_22 = -u_2; + const double xi_25 = (u_2*u_2); + const double u0Mu1 = u_0 + xi_21; + const double u0Pu1 = u_0 + u_1; + const double u1Pu2 = u_1 + u_2; + const double u1Mu2 = u_1 + xi_22; + const double u0Mu2 = u_0 + xi_22; + const double u0Pu2 = u_0 + u_2; + const double f_eq_common = rho - xi_23 - xi_24 - xi_25; + const double xi_26 = f_eq_common + 
rho*-0.666666666666667; + const double xi_28 = f_eq_common + xi_25 + xi_27; + const double xi_29 = f_eq_common + xi_23 + xi_27; + const double xi_30 = f_eq_common + xi_24 + xi_27; + const double xi_2 = xi_24*2 + xi_26; + const double xi_3 = xi_23*2 + xi_26; + const double xi_4 = xi_25*2 + xi_26; + const double xi_6 = u0Mu1*2; + const double xi_7 = (u0Mu1*u0Mu1)*3 + xi_28; + const double xi_8 = u0Pu1*2; + const double xi_9 = (u0Pu1*u0Pu1)*3 + xi_28; + const double xi_10 = u1Pu2*2; + const double xi_11 = (u1Pu2*u1Pu2)*3 + xi_29; + const double xi_12 = u1Mu2*2; + const double xi_13 = (u1Mu2*u1Mu2)*3 + xi_29; + const double xi_14 = u0Mu2*2; + const double xi_15 = (u0Mu2*u0Mu2)*3 + xi_30; + const double xi_16 = u0Pu2*2; + const double xi_17 = (u0Pu2*u0Pu2)*3 + xi_30; + const double xi_1 = omega*0.166666666666667; + const double xi_5 = omega*0.0416666666666667; + double * _data_pdfs_tmp_10_20_30 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2; + _data_pdfs_tmp_10_20_30[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.333333333333333 - _data_pdfs_10_20_30[_stride_pdfs_0*ctr_0]) + _data_pdfs_10_20_30[_stride_pdfs_0*ctr_0]; + double * _data_pdfs_tmp_10_20_31 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + _data_pdfs_tmp_10_20_31[_stride_pdfs_0*ctr_0] = xi_1*(u_1 + xi_2 - 6*_data_pdfs_1m1_20_31[_stride_pdfs_0*ctr_0]) + _data_pdfs_1m1_20_31[_stride_pdfs_0*ctr_0]; + double * _data_pdfs_tmp_10_20_32 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + _data_pdfs_tmp_10_20_32[_stride_pdfs_0*ctr_0] = xi_1*(xi_2 + xi_21 - 6*_data_pdfs_11_20_32[_stride_pdfs_0*ctr_0]) + _data_pdfs_11_20_32[_stride_pdfs_0*ctr_0]; + double * _data_pdfs_tmp_10_20_33 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + _data_pdfs_tmp_10_20_33[_stride_pdfs_0*ctr_0] = xi_1*(-u_0 + xi_3 - 6*_data_pdfs_10_20_33[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_10_20_33[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + double * _data_pdfs_tmp_10_20_34 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + _data_pdfs_tmp_10_20_34[_stride_pdfs_0*ctr_0] = xi_1*(u_0 + xi_3 - 6*_data_pdfs_10_20_34[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_10_20_34[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + double * _data_pdfs_tmp_10_20_35 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3; + _data_pdfs_tmp_10_20_35[_stride_pdfs_0*ctr_0] = xi_1*(u_2 + xi_4 - 6*_data_pdfs_10_2m1_35[_stride_pdfs_0*ctr_0]) + _data_pdfs_10_2m1_35[_stride_pdfs_0*ctr_0]; + double * _data_pdfs_tmp_10_20_36 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3; + _data_pdfs_tmp_10_20_36[_stride_pdfs_0*ctr_0] = xi_1*(xi_22 + xi_4 - 6*_data_pdfs_10_21_36[_stride_pdfs_0*ctr_0]) + _data_pdfs_10_21_36[_stride_pdfs_0*ctr_0]; + double * _data_pdfs_tmp_10_20_37 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + _data_pdfs_tmp_10_20_37[_stride_pdfs_0*ctr_0] = xi_5*(-xi_6 + xi_7 - 24*_data_pdfs_1m1_20_37[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_1m1_20_37[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + double * _data_pdfs_tmp_10_20_38 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + _data_pdfs_tmp_10_20_38[_stride_pdfs_0*ctr_0] = xi_5*(xi_8 + xi_9 - 24*_data_pdfs_1m1_20_38[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_1m1_20_38[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + double * _data_pdfs_tmp_10_20_39 = 
_data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + _data_pdfs_tmp_10_20_39[_stride_pdfs_0*ctr_0] = xi_5*(-xi_8 + xi_9 - 24*_data_pdfs_11_20_39[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_11_20_39[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + double * _data_pdfs_tmp_10_20_310 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + _data_pdfs_tmp_10_20_310[_stride_pdfs_0*ctr_0] = xi_5*(xi_6 + xi_7 - 24*_data_pdfs_11_20_310[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_11_20_310[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + double * _data_pdfs_tmp_10_20_311 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + _data_pdfs_tmp_10_20_311[_stride_pdfs_0*ctr_0] = xi_5*(xi_10 + xi_11 - 24*_data_pdfs_1m1_2m1_311[_stride_pdfs_0*ctr_0]) + _data_pdfs_1m1_2m1_311[_stride_pdfs_0*ctr_0]; + double * _data_pdfs_tmp_10_20_312 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + _data_pdfs_tmp_10_20_312[_stride_pdfs_0*ctr_0] = xi_5*(-xi_12 + xi_13 - 24*_data_pdfs_11_2m1_312[_stride_pdfs_0*ctr_0]) + _data_pdfs_11_2m1_312[_stride_pdfs_0*ctr_0]; + double * _data_pdfs_tmp_10_20_313 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + _data_pdfs_tmp_10_20_313[_stride_pdfs_0*ctr_0] = xi_5*(-xi_14 + xi_15 - 24*_data_pdfs_10_2m1_313[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_10_2m1_313[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + double * _data_pdfs_tmp_10_20_314 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + _data_pdfs_tmp_10_20_314[_stride_pdfs_0*ctr_0] = xi_5*(xi_16 + xi_17 - 24*_data_pdfs_10_2m1_314[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_10_2m1_314[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + double * _data_pdfs_tmp_10_20_315 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + _data_pdfs_tmp_10_20_315[_stride_pdfs_0*ctr_0] = xi_5*(xi_12 + xi_13 - 24*_data_pdfs_1m1_21_315[_stride_pdfs_0*ctr_0]) + _data_pdfs_1m1_21_315[_stride_pdfs_0*ctr_0]; + double * _data_pdfs_tmp_10_20_316 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + _data_pdfs_tmp_10_20_316[_stride_pdfs_0*ctr_0] = xi_5*(-xi_10 + xi_11 - 24*_data_pdfs_11_21_316[_stride_pdfs_0*ctr_0]) + _data_pdfs_11_21_316[_stride_pdfs_0*ctr_0]; + double * _data_pdfs_tmp_10_20_317 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + _data_pdfs_tmp_10_20_317[_stride_pdfs_0*ctr_0] = xi_5*(-xi_16 + xi_17 - 24*_data_pdfs_10_21_317[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_10_21_317[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + double * _data_pdfs_tmp_10_20_318 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + _data_pdfs_tmp_10_20_318[_stride_pdfs_0*ctr_0] = xi_5*(xi_14 + xi_15 - 24*_data_pdfs_10_21_318[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_10_21_318[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + } +} +} + +void UniformGridGPU_LbKernel::operator() ( IBlock * block , cudaStream_t stream ) +{ + auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID); + cuda::GPUField<double> * pdfs_tmp; + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + + WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs->nrOfGhostLayers())); + double * const 
_data_pdfs = pdfs->dataAt(-1, -1, -1, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs_tmp->nrOfGhostLayers())); + double * _data_pdfs_tmp = pdfs_tmp->dataAt(-1, -1, -1, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(pdfs->xSize() + 2)); + const int64_t _size_pdfs_0 = int64_t(pdfs->xSize() + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(pdfs->ySize() + 2)); + const int64_t _size_pdfs_1 = int64_t(pdfs->ySize() + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(pdfs->zSize() + 2)); + const int64_t _size_pdfs_2 = int64_t(pdfs->zSize() + 2); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)), int(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)), int(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2))); + dim3 _grid(int(( (_size_pdfs_0 - 2) % int(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) == 0 ? (int64_t)(_size_pdfs_0 - 2) / (int64_t)(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) : ( (int64_t)(_size_pdfs_0 - 2) / (int64_t)(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) ) +1 )), int(( (_size_pdfs_1 - 2) % int(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) == 0 ? (int64_t)(_size_pdfs_1 - 2) / (int64_t)(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) : ( (int64_t)(_size_pdfs_1 - 2) / (int64_t)(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) ) +1 )), int(( (_size_pdfs_2 - 2) % int(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2)) == 0 ? (int64_t)(_size_pdfs_2 - 2) / (int64_t)(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2)) : ( (int64_t)(_size_pdfs_2 - 2) / (int64_t)(((1 < _size_pdfs_2 - 2) ? 
1 : _size_pdfs_2 - 2)) ) +1 ))); + internal_UniformGridGPU_LbKernel::UniformGridGPU_LbKernel<<<_grid, _block, 0, stream>>>(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, omega); + pdfs->swapDataPointers(pdfs_tmp); + +} + + + +void UniformGridGPU_LbKernel::inner( IBlock * block , cudaStream_t stream ) +{ + auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID); + cuda::GPUField<double> * pdfs_tmp; + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + + + CellInterval inner = pdfs->xyzSize(); + inner.expand(-1); + + WALBERLA_ASSERT_GREATER_EQUAL(inner.xMin() - 1, -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(inner.zMin() - 1, -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(inner.yMin() - 1, -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(inner.xMin() - 1, inner.yMin() - 1, inner.zMin() - 1, 0); + WALBERLA_ASSERT_GREATER_EQUAL(inner.xMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(inner.zMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(inner.yMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); + double * _data_pdfs_tmp = pdfs_tmp->dataAt(inner.xMin() - 1, inner.yMin() - 1, inner.zMin() - 1, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(inner.xSize() + 2)); + const int64_t _size_pdfs_0 = int64_t(inner.xSize() + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(inner.ySize() + 2)); + const int64_t _size_pdfs_1 = int64_t(inner.ySize() + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(inner.zSize() + 2)); + const int64_t _size_pdfs_2 = int64_t(inner.zSize() + 2); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)), int(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)), int(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2))); + dim3 _grid(int(( (_size_pdfs_0 - 2) % int(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) == 0 ? (int64_t)(_size_pdfs_0 - 2) / (int64_t)(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) : ( (int64_t)(_size_pdfs_0 - 2) / (int64_t)(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) ) +1 )), int(( (_size_pdfs_1 - 2) % int(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) == 0 ? (int64_t)(_size_pdfs_1 - 2) / (int64_t)(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) : ( (int64_t)(_size_pdfs_1 - 2) / (int64_t)(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) ) +1 )), int(( (_size_pdfs_2 - 2) % int(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2)) == 0 ? (int64_t)(_size_pdfs_2 - 2) / (int64_t)(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2)) : ( (int64_t)(_size_pdfs_2 - 2) / (int64_t)(((1 < _size_pdfs_2 - 2) ? 
1 : _size_pdfs_2 - 2)) ) +1 ))); + internal_UniformGridGPU_LbKernel::UniformGridGPU_LbKernel<<<_grid, _block, 0, stream>>>(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, omega); +} + + +void UniformGridGPU_LbKernel::outer( IBlock * block , cudaStream_t stream ) +{ + static std::vector<CellInterval> layers; + + auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID); + cuda::GPUField<double> * pdfs_tmp; + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + + + if( layers.size() == 0 ) + { + CellInterval ci; + + pdfs->getSliceBeforeGhostLayer(stencil::T, ci, 1, false); + layers.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::B, ci, 1, false); + layers.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::N, ci, 1, false); + ci.expand(Cell(0, 0, -1)); + layers.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::S, ci, 1, false); + ci.expand(Cell(0, 0, -1)); + layers.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::E, ci, 1, false); + ci.expand(Cell(0, -1, -1)); + layers.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::W, ci, 1, false); + ci.expand(Cell(0, -1, -1)); + layers.push_back(ci); + } + + + { + auto parallelSection_ = parallelStreams_.parallelSection( stream ); + for( auto & ci: layers ) + { + parallelSection_.run([&]( auto s ) { + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); + double * _data_pdfs_tmp = pdfs_tmp->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 2)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 2)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 2)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 2); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)), int(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)), int(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2))); + dim3 _grid(int(( (_size_pdfs_0 - 2) % int(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) == 0 ? (int64_t)(_size_pdfs_0 - 2) / (int64_t)(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) : ( (int64_t)(_size_pdfs_0 - 2) / (int64_t)(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) ) +1 )), int(( (_size_pdfs_1 - 2) % int(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) == 0 ? (int64_t)(_size_pdfs_1 - 2) / (int64_t)(((1 < _size_pdfs_1 - 2) ? 
1 : _size_pdfs_1 - 2)) : ( (int64_t)(_size_pdfs_1 - 2) / (int64_t)(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) ) +1 )), int(( (_size_pdfs_2 - 2) % int(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2)) == 0 ? (int64_t)(_size_pdfs_2 - 2) / (int64_t)(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2)) : ( (int64_t)(_size_pdfs_2 - 2) / (int64_t)(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2)) ) +1 ))); + internal_UniformGridGPU_LbKernel::UniformGridGPU_LbKernel<<<_grid, _block, 0, s>>>(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, omega); + }); + } + } + + + pdfs->swapDataPointers(pdfs_tmp); + +} + + +} // namespace pystencils +} // namespace walberla + + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic pop +#endif \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_LbKernel.h b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LbKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..fc4e13d65e2dc3a43fc41a3afa1924b23dfbdd5f --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LbKernel.h @@ -0,0 +1,82 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file UniformGridGPU_LbKernel.h +//! 
\\author pystencils +//====================================================================================================================== + +#include "core/DataTypes.h" + +#include "cuda/GPUField.h" +#include "cuda/ParallelStreams.h" +#include "field/SwapableCompare.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" + +#include <set> + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-parameter" +#endif + +namespace walberla { +namespace pystencils { + + +class UniformGridGPU_LbKernel +{ +public: + UniformGridGPU_LbKernel( BlockDataID pdfsID_, double omega_) + : pdfsID(pdfsID_), omega(omega_) + {}; + + void operator() ( IBlock * block , cudaStream_t stream = 0 ); + + void inner( IBlock * block , cudaStream_t stream = 0 ); + void outer( IBlock * block , cudaStream_t stream = 0 ); + + void setOuterPriority(int priority ) { + + parallelStreams_.setStreamPriority(priority); + + } +private: + BlockDataID pdfsID; + double omega; + + std::set< cuda::GPUField<double> *, field::SwapableCompare< cuda::GPUField<double> * > > cache_pdfs_; + + + cuda::ParallelStreams parallelStreams_; + +}; + + +} // namespace pystencils +} // namespace walberla + + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic pop +#endif \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_NoSlip.cu b/apps/benchmarks/UniformGridGPU/UniformGridGPU_NoSlip.cu new file mode 100644 index 0000000000000000000000000000000000000000..78d9848e0533fa812d4ec28adbc310fbafa0202b --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_NoSlip.cu @@ -0,0 +1,121 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file UniformGridGPU_NoSlip.cpp +//! \\ingroup lbm +//! 
\\author lbmpy +//====================================================================================================================== + +#include <cmath> + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "UniformGridGPU_NoSlip.h" +#include "cuda/ErrorChecking.h" + + +#define FUNC_PREFIX __global__ + +using namespace std; + +namespace walberla { +namespace lbm { + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +namespace internal_boundary_UniformGridGPU_NoSlip { +static FUNC_PREFIX void boundary_UniformGridGPU_NoSlip(uint8_t * const _data_indexVector, double * _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t indexVectorSize) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < indexVectorSize) + { + uint8_t * const _data_indexVector_10 = _data_indexVector; + const int32_t x = *((int32_t *)(& _data_indexVector_10[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); + uint8_t * const _data_indexVector_14 = _data_indexVector + 4; + const int32_t y = *((int32_t *)(& _data_indexVector_14[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); + uint8_t * const _data_indexVector_18 = _data_indexVector + 8; + const int32_t z = *((int32_t *)(& _data_indexVector_18[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); + + + const int64_t cx [] = { 0, 0, 0, -1, 1, 0, 0, -1, 1, -1, 1, 0, 0, -1, 1, 0, 0, -1, 1 }; + const int64_t cy [] = { 0, 1, -1, 0, 0, 0, 0, 1, 1, -1, -1, 1, -1, 0, 0, 1, -1, 0, 0 }; + const int64_t cz [] = { 0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, -1 }; + const int invdir [] = { 0, 2, 1, 4, 3, 6, 5, 10, 9, 8, 7, 16, 15, 18, 17, 12, 11, 14, 13 }; + + + const double weights [] = { 0.333333333333333,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778 }; + + uint8_t * const _data_indexVector_112 = _data_indexVector + 12; + const int32_t dir = *((int32_t *)(& _data_indexVector_112[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); + double * _data_pdfs_m3B5BEDEA5094B12F = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_1*cy[dir] + _stride_pdfs_2*z + _stride_pdfs_2*cz[dir] + _stride_pdfs_3*invdir[dir]; + double * _data_pdfs_10_20_m2227275638DDD757 = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3*dir; + _data_pdfs_m3B5BEDEA5094B12F[_stride_pdfs_0*x + _stride_pdfs_0*cx[dir]] = _data_pdfs_10_20_m2227275638DDD757[_stride_pdfs_0*x]; + } +} +} + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + + +void UniformGridGPU_NoSlip::run( IBlock * block, IndexVectors::Type type , cudaStream_t stream ) +{ + auto * indexVectors = block->getData<IndexVectors>(indexVectorID); + + auto pointer = indexVectors->pointerGpu(type); + + + int64_t indexVectorSize = int64_c( indexVectors->indexVector(type).size() ); + if( indexVectorSize == 0) + return; + + uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); + + auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID); + + WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(0, 0, 0, 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); 
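+ // The remaining strides feed the kernel's flat pointer arithmetic. The launch
+ // configuration below then assigns one CUDA thread per boundary link: at most
+ // 256 threads per block in x, and a 1D grid sized by a ceiling division of
+ // indexVectorSize over the block size.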
+ const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((256 < indexVectorSize) ? 256 : indexVectorSize)), int(1), int(1)); + dim3 _grid(int(( (indexVectorSize) % int(((256 < indexVectorSize) ? 256 : indexVectorSize)) == 0 ? (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) : ( (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) ) +1 )), int(1), int(1)); + internal_boundary_UniformGridGPU_NoSlip::boundary_UniformGridGPU_NoSlip<<<_grid, _block, 0, stream>>>(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); +} + +void UniformGridGPU_NoSlip::operator() ( IBlock * block, cudaStream_t stream ) +{ + run( block, IndexVectors::ALL, stream ); +} + +void UniformGridGPU_NoSlip::inner( IBlock * block, cudaStream_t stream ) +{ + run( block, IndexVectors::INNER, stream ); +} + +void UniformGridGPU_NoSlip::outer( IBlock * block, cudaStream_t stream ) +{ + run( block, IndexVectors::OUTER, stream ); +} + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_NoSlip.h b/apps/benchmarks/UniformGridGPU/UniformGridGPU_NoSlip.h new file mode 100644 index 0000000000000000000000000000000000000000..536a99a66fd3172c977a2c3acb3bed889fe04c08 --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_NoSlip.h @@ -0,0 +1,364 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file UniformGridGPU_NoSlip.h +//! 
\\author pystencils +//====================================================================================================================== + + +#include "core/DataTypes.h" + +#include "cuda/GPUField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" + +#include <set> +#include <vector> + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class UniformGridGPU_NoSlip +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir; + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + IndexVectors() : cpuVectors_(NUM_TYPES) {} + bool operator==(IndexVectors & other) { return other.cpuVectors_ == cpuVectors_; } + + ~IndexVectors() { + for( auto & gpuVec: gpuVectors_) + cudaFree( gpuVec ); + } + + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return &(cpuVectors_[t][0]); } + + IndexInfo * pointerGpu(Type t) { return gpuVectors_[t]; } + + + void syncGPU() + { + gpuVectors_.resize( cpuVectors_.size() ); + for(int i=0; i < NUM_TYPES; ++i ) + { + auto & gpuVec = gpuVectors_[i]; + auto & cpuVec = cpuVectors_[i]; + cudaFree( gpuVec ); + cudaMalloc( &gpuVec, sizeof(IndexInfo) * cpuVec.size() ); + cudaMemcpy( gpuVec, &cpuVec[0], sizeof(IndexInfo) * cpuVec.size(), cudaMemcpyHostToDevice ); + } + } + + private: + std::vector<CpuIndexVector> cpuVectors_; + + using GpuIndexVector = IndexInfo *; + std::vector<GpuIndexVector> gpuVectors_; + + }; + + + UniformGridGPU_NoSlip( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_ ) + : pdfsID(pdfsID_) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_UniformGridGPU_NoSlip"); + }; + + void operator() ( IBlock * block , cudaStream_t stream = 0 ); + void inner( IBlock * block , cudaStream_t stream = 0 ); + void outer( IBlock * block , cudaStream_t stream = 0 ); + + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>( &*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField( IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER); + + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = 
flagField->getFlag(domainFlagUID); + + auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + for( auto it = flagField->begin(); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 0 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 1 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 2 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 3 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 5 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 6 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 7 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 9 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + indexVectorAll.push_back( element ); + if( 
inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 11 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 12 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 13 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 15 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 16 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 17 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + } + + indexVectors->syncGPU(); + } + +private: + void run( IBlock * block, IndexVectors::Type type, cudaStream_t stream = 0 ); + + BlockDataID indexVectorID; + + BlockDataID pdfsID; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_PackInfo.cu b/apps/benchmarks/UniformGridGPU/UniformGridGPU_PackInfo.cu new file mode 100644 index 0000000000000000000000000000000000000000..ada9933626237d4e889b071408e737733521124d --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_PackInfo.cu @@ -0,0 +1,1656 @@ +#include "stencil/Directions.h" +#include "core/cell/CellInterval.h" +#include "cuda/GPUField.h" +#include "core/DataTypes.h" +#include "UniformGridGPU_PackInfo.h" + + +#define FUNC_PREFIX __global__ + + +namespace walberla { +namespace pystencils { + +using walberla::cell::CellInterval; +using walberla::stencil::Direction; + + + +namespace 
internal_pack_N { +static FUNC_PREFIX void pack_N(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_37 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x] = _data_pdfs_10_20_37[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_38 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1] = _data_pdfs_10_20_38[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_31 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2] = _data_pdfs_10_20_31[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_311 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3] = _data_pdfs_10_20_311[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_315 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4] = _data_pdfs_10_20_315[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_BE { +static FUNC_PREFIX void pack_BE(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_318 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_318[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_B { +static FUNC_PREFIX void pack_B(double * _data_buffer, double * const _data_pdfs, int64_t 
const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_36 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x] = _data_pdfs_10_20_36[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_317 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1] = _data_pdfs_10_20_317[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_318 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2] = _data_pdfs_10_20_318[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_315 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3] = _data_pdfs_10_20_315[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_316 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4] = _data_pdfs_10_20_316[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_E { +static FUNC_PREFIX void pack_E(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_38 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x] = _data_pdfs_10_20_38[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_34 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + 
_size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1] = _data_pdfs_10_20_34[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_318 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2] = _data_pdfs_10_20_318[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_314 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3] = _data_pdfs_10_20_314[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_310 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4] = _data_pdfs_10_20_310[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_W { +static FUNC_PREFIX void pack_W(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_313 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x] = _data_pdfs_10_20_313[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_37 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1] = _data_pdfs_10_20_37[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_33 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2] = _data_pdfs_10_20_33[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_317 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3] = _data_pdfs_10_20_317[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_39 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 
5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4] = _data_pdfs_10_20_39[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_TN { +static FUNC_PREFIX void pack_TN(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_311 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_311[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_T { +static FUNC_PREFIX void pack_T(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_313 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x] = _data_pdfs_10_20_313[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_35 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1] = _data_pdfs_10_20_35[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_312 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2] = _data_pdfs_10_20_312[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_311 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3] = _data_pdfs_10_20_311[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_314 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4] = _data_pdfs_10_20_314[_stride_pdfs_0*ctr_0]; + } +} +} + 
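+
+// Note on the generated kernels (explanatory comment; the index expressions it
+// summarizes are taken from the generated code above and below): in the D3Q19
+// stencil five PDF components stream across each of the six axis-aligned
+// faces, while exactly one component crosses each of the twelve edges. The
+// face kernels (pack_N/S/E/W/T/B and their unpack counterparts) therefore
+// copy five doubles per slice cell in an array-of-structs layout,
+//
+//     buffer[5*(ctr_2*_size_pdfs_1*_size_pdfs_0 + ctr_1*_size_pdfs_0 + ctr_0) + f]   // f = 0..4
+//
+// whereas the edge kernels (pack_BE, pack_TN, pack_BN, ...) drop the factor
+// of five and copy a single value per cell, and pack_C moves only the centre
+// PDF (f = 0). The pack()/unpack() dispatch further below launches each
+// kernel with thread blocks of up to 16x16x1 over a ceil-divided grid, so the
+// bounds check at the top of every kernel masks threads that fall outside the
+// communication slice.
+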
+namespace internal_pack_BN { +static FUNC_PREFIX void pack_BN(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_315 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_315[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_SE { +static FUNC_PREFIX void pack_SE(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_310 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_310[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_S { +static FUNC_PREFIX void pack_S(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_32 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x] = _data_pdfs_10_20_32[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_312 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1] = _data_pdfs_10_20_312[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_39 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + 
_data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2] = _data_pdfs_10_20_39[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_316 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3] = _data_pdfs_10_20_316[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_310 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4] = _data_pdfs_10_20_310[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_BW { +static FUNC_PREFIX void pack_BW(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_317 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_317[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_BS { +static FUNC_PREFIX void pack_BS(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_316 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_316[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_TW { +static FUNC_PREFIX void pack_TW(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t 
ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_313 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_313[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_C { +static FUNC_PREFIX void pack_C(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_30 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_30[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_NE { +static FUNC_PREFIX void pack_NE(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_38 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_38[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_TS { +static FUNC_PREFIX void pack_TS(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_312 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_312[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_NW { +static FUNC_PREFIX void pack_NW(double * _data_buffer, double * 
const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_37 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_37[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_SW { +static FUNC_PREFIX void pack_SW(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_39 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_39[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_TE { +static FUNC_PREFIX void pack_TE(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_314 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_314[_stride_pdfs_0*ctr_0]; + } +} +} + + + +namespace internal_unpack_S { +static FUNC_PREFIX void unpack_S(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = 
blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_37 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + _data_pdfs_10_20_37[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x]; + double * _data_pdfs_10_20_38 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + _data_pdfs_10_20_38[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1]; + double * _data_pdfs_10_20_31 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + _data_pdfs_10_20_31[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2]; + double * _data_pdfs_10_20_311 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + _data_pdfs_10_20_311[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3]; + double * _data_pdfs_10_20_315 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + _data_pdfs_10_20_315[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4]; + } +} +} + +namespace internal_unpack_TW { +static FUNC_PREFIX void unpack_TW(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_318 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + _data_pdfs_10_20_318[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; + } +} +} + +namespace internal_unpack_T { +static FUNC_PREFIX void unpack_T(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_36 = 
_data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3; + _data_pdfs_10_20_36[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x]; + double * _data_pdfs_10_20_317 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + _data_pdfs_10_20_317[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1]; + double * _data_pdfs_10_20_318 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + _data_pdfs_10_20_318[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2]; + double * _data_pdfs_10_20_315 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + _data_pdfs_10_20_315[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3]; + double * _data_pdfs_10_20_316 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + _data_pdfs_10_20_316[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4]; + } +} +} + +namespace internal_unpack_W { +static FUNC_PREFIX void unpack_W(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_38 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + _data_pdfs_10_20_38[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x]; + double * _data_pdfs_10_20_34 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + _data_pdfs_10_20_34[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1]; + double * _data_pdfs_10_20_318 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + _data_pdfs_10_20_318[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2]; + double * _data_pdfs_10_20_314 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + _data_pdfs_10_20_314[_stride_pdfs_0*ctr_0] 
= _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3]; + double * _data_pdfs_10_20_310 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + _data_pdfs_10_20_310[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4]; + } +} +} + +namespace internal_unpack_E { +static FUNC_PREFIX void unpack_E(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_313 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + _data_pdfs_10_20_313[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x]; + double * _data_pdfs_10_20_37 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + _data_pdfs_10_20_37[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1]; + double * _data_pdfs_10_20_33 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + _data_pdfs_10_20_33[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2]; + double * _data_pdfs_10_20_317 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + _data_pdfs_10_20_317[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3]; + double * _data_pdfs_10_20_39 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + _data_pdfs_10_20_39[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4]; + } +} +} + +namespace internal_unpack_BS { +static FUNC_PREFIX void unpack_BS(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + 
threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_311 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + _data_pdfs_10_20_311[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; + } +} +} + +namespace internal_unpack_B { +static FUNC_PREFIX void unpack_B(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_313 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + _data_pdfs_10_20_313[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x]; + double * _data_pdfs_10_20_35 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3; + _data_pdfs_10_20_35[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1]; + double * _data_pdfs_10_20_312 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + _data_pdfs_10_20_312[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2]; + double * _data_pdfs_10_20_311 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + _data_pdfs_10_20_311[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3]; + double * _data_pdfs_10_20_314 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + _data_pdfs_10_20_314[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4]; + } +} +} + +namespace internal_unpack_TS { +static FUNC_PREFIX void unpack_TS(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_315 = _data_pdfs + 
_stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + _data_pdfs_10_20_315[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; + } +} +} + +namespace internal_unpack_NW { +static FUNC_PREFIX void unpack_NW(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_310 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + _data_pdfs_10_20_310[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; + } +} +} + +namespace internal_unpack_N { +static FUNC_PREFIX void unpack_N(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_32 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + _data_pdfs_10_20_32[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x]; + double * _data_pdfs_10_20_312 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + _data_pdfs_10_20_312[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1]; + double * _data_pdfs_10_20_39 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + _data_pdfs_10_20_39[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2]; + double * _data_pdfs_10_20_316 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + _data_pdfs_10_20_316[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3]; + double * _data_pdfs_10_20_310 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + _data_pdfs_10_20_310[_stride_pdfs_0*ctr_0] = 
_data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4]; + } +} +} + +namespace internal_unpack_TE { +static FUNC_PREFIX void unpack_TE(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_317 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + _data_pdfs_10_20_317[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; + } +} +} + +namespace internal_unpack_TN { +static FUNC_PREFIX void unpack_TN(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_316 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + _data_pdfs_10_20_316[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; + } +} +} + +namespace internal_unpack_BE { +static FUNC_PREFIX void unpack_BE(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_313 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + _data_pdfs_10_20_313[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; + } +} +} + +namespace internal_unpack_C { +static FUNC_PREFIX void unpack_C(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2) +{ + if 
(blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_30 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2; + _data_pdfs_10_20_30[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; + } +} +} + +namespace internal_unpack_SW { +static FUNC_PREFIX void unpack_SW(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_38 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + _data_pdfs_10_20_38[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; + } +} +} + +namespace internal_unpack_BN { +static FUNC_PREFIX void unpack_BN(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_312 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + _data_pdfs_10_20_312[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; + } +} +} + +namespace internal_unpack_SE { +static FUNC_PREFIX void unpack_SE(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_37 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + _data_pdfs_10_20_37[_stride_pdfs_0*ctr_0] = 
_data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x];
+   }
+}
+}
+
+namespace internal_unpack_NE {
+static FUNC_PREFIX void unpack_NE(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3)
+{
+   if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2)
+   {
+      const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x;
+      const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y;
+      const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z;
+      double * _data_pdfs_10_20_39 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3;
+      _data_pdfs_10_20_39[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x];
+   }
+}
+}
+
+namespace internal_unpack_BW {
+static FUNC_PREFIX void unpack_BW(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3)
+{
+   if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2)
+   {
+      const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x;
+      const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y;
+      const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z;
+      double * _data_pdfs_10_20_314 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3;
+      _data_pdfs_10_20_314[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x];
+   }
+}
+}
+
+
+
+
+// Host-side pack dispatch: copies the PDF slice adjacent to the ghost layer in direction
+// dir into the contiguous send buffer by launching the matching direction kernel on the
+// given stream. Thread blocks are at most 16x16x1, clamped to the slice extents; the grid
+// is the ceiling division of the slice extents by the block size.
+void UniformGridGPU_PackInfo::pack(Direction dir, unsigned char * byte_buffer, IBlock * block, cudaStream_t stream)
+{
+   double * buffer = reinterpret_cast<double*>(byte_buffer);
+
+   auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID);
+
+   CellInterval ci;
+   pdfs->getSliceBeforeGhostLayer(dir, ci, 1, false);
+
+   switch( dir )
+   {
+      case stencil::N:
+      {
+         double * _data_buffer = buffer;
+         WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()));
+         WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()));
+         WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()));
+         double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
+         WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0));
+         const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0);
+         WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0));
+         const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0);
+         WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0));
+         const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0);
+         const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
+         const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
+         const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
+         const int64_t
_stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_N::pack_N<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::BE: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_pack_BE::pack_BE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::B: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_B::pack_B<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::E: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_E::pack_E<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::W: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_pack_W::pack_W<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::TN: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_TN::pack_TN<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::T: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_T::pack_T<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::BN: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_pack_BN::pack_BN<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::SE: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_SE::pack_SE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::S: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_S::pack_S<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::BW: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_pack_BW::pack_BW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::BS: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_BS::pack_BS<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::TW: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_TW::pack_TW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::C: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_pack_C::pack_C<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2); + break; + } + + case stencil::NE: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_NE::pack_NE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::TS: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_TS::pack_TS<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::NW: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_pack_NW::pack_NW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::SW: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_SW::pack_SW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::TE: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_TE::pack_TE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + + default: + WALBERLA_ASSERT(false); + } +} + + +void UniformGridGPU_PackInfo::unpack(Direction dir, unsigned char * byte_buffer, IBlock * block, cudaStream_t stream) +{ + double * buffer = reinterpret_cast<double*>(byte_buffer); + + auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID); + + CellInterval ci; + pdfs->getGhostRegion(dir, ci, 1, false); + + switch( dir ) + { + case stencil::S: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_S::unpack_S<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::TW: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_TW::unpack_TW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::T: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 
16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_T::unpack_T<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::W: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_W::unpack_W<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::E: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_E::unpack_E<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::BS: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 
16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_BS::unpack_BS<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::B: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_B::unpack_B<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::TS: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_TS::unpack_TS<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::NW: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 
16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_NW::unpack_NW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::N: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_N::unpack_N<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::TE: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_TE::unpack_TE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::TN: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 
16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_TN::unpack_TN<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::BE: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_BE::unpack_BE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::C: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_C::unpack_C<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2); + break; + } + + case stencil::SW: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_SW::unpack_SW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::BN: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_BN::unpack_BN<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::SE: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_SE::unpack_SE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::NE: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 
16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_NE::unpack_NE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::BW: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_BW::unpack_BW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + + default: + WALBERLA_ASSERT(false); + } +} + + +uint_t UniformGridGPU_PackInfo::size(stencil::Direction dir, IBlock * block) +{ + auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID); + + CellInterval ci; + pdfs->getGhostRegion(dir, ci, 1, false); + + uint_t elementsPerCell = 0; + + switch( dir ) + { + case stencil::N: + elementsPerCell = 5; + break; + + case stencil::BE: + elementsPerCell = 1; + break; + + case stencil::B: + elementsPerCell = 5; + break; + + case stencil::E: + elementsPerCell = 5; + break; + + case stencil::W: + elementsPerCell = 5; + break; + + case stencil::TN: + elementsPerCell = 1; + break; + + case stencil::T: + elementsPerCell = 5; + break; + + case stencil::BN: + elementsPerCell = 1; + break; + + case stencil::SE: + elementsPerCell = 1; + break; + + case stencil::S: + elementsPerCell = 5; + break; + + case stencil::BW: + elementsPerCell = 1; + break; + + case stencil::BS: + elementsPerCell = 1; + break; + + case stencil::TW: + elementsPerCell = 1; + break; + + case stencil::C: + elementsPerCell = 1; + break; + + case stencil::NE: + elementsPerCell = 1; + break; + + case stencil::TS: + elementsPerCell = 1; + break; + + case stencil::NW: + elementsPerCell = 1; + break; + + case stencil::SW: + elementsPerCell = 1; + break; + + case stencil::TE: + elementsPerCell = 1; + break; + + default: + elementsPerCell = 0; + } + return ci.numCells() * elementsPerCell * sizeof( double ); +} + + + +} // namespace pystencils +} // namespace walberla \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_PackInfo.h b/apps/benchmarks/UniformGridGPU/UniformGridGPU_PackInfo.h new file mode 100644 index 0000000000000000000000000000000000000000..4c9ab98656f0af45079e2e6dca0f2a6b37e5e911 --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_PackInfo.h @@ -0,0 +1,34 @@ +#include "stencil/Directions.h" +#include "core/cell/CellInterval.h" +#include "cuda/GPUField.h" +#include "core/DataTypes.h" +#include "domain_decomposition/IBlock.h" +#include "cuda/communication/GeneratedGPUPackInfo.h" + + +#define FUNC_PREFIX __global__ + + +namespace walberla { +namespace pystencils { + + +class UniformGridGPU_PackInfo : public ::walberla::cuda::GeneratedGPUPackInfo +{ +public: + UniformGridGPU_PackInfo( BlockDataID pdfsID_ ) + : pdfsID(pdfsID_) + {}; + + + virtual void pack (stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream); + virtual void unpack(stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream); + virtual uint_t size (stencil::Direction dir, IBlock * block); + +private: + BlockDataID pdfsID; +}; + + +} // namespace pystencils +} // namespace walberla \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_UBB.cu b/apps/benchmarks/UniformGridGPU/UniformGridGPU_UBB.cu new file mode 100644 index 0000000000000000000000000000000000000000..a774d25115772e39e236ecc6812e8612ded0d317 --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_UBB.cu @@ -0,0 +1,121 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file UniformGridGPU_UBB.cpp +//! \\ingroup lbm +//! \\author lbmpy +//====================================================================================================================== + +#include <cmath> + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "UniformGridGPU_UBB.h" +#include "cuda/ErrorChecking.h" + + +#define FUNC_PREFIX __global__ + +using namespace std; + +namespace walberla { +namespace lbm { + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +namespace internal_boundary_UniformGridGPU_UBB { +static FUNC_PREFIX void boundary_UniformGridGPU_UBB(uint8_t * const _data_indexVector, double * _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t indexVectorSize) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < indexVectorSize) + { + uint8_t * const _data_indexVector_10 = _data_indexVector; + const int32_t x = *((int32_t *)(& _data_indexVector_10[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); + uint8_t * const _data_indexVector_14 = _data_indexVector + 4; + const int32_t y = *((int32_t *)(& _data_indexVector_14[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); + uint8_t * const _data_indexVector_18 = _data_indexVector + 8; + const int32_t z = *((int32_t *)(& _data_indexVector_18[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); + + + const int64_t cx [] = { 0, 0, 0, -1, 1, 0, 0, -1, 1, -1, 1, 0, 0, -1, 1, 0, 0, -1, 1 }; + const int64_t cy [] = { 0, 1, -1, 0, 0, 0, 0, 1, 1, -1, -1, 1, -1, 0, 0, 1, -1, 0, 0 }; + const int64_t cz [] = { 0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, -1 }; + const int invdir [] = { 0, 2, 1, 4, 3, 6, 5, 10, 9, 8, 7, 16, 15, 18, 17, 12, 11, 14, 13 }; + + + const double weights [] = { 0.333333333333333,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778 }; + + uint8_t * const _data_indexVector_112 = _data_indexVector + 12; + const int32_t dir = *((int32_t *)(& _data_indexVector_112[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); + double * _data_pdfs_m3B5BEDEA5094B12F = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_1*cy[dir] + _stride_pdfs_2*z + _stride_pdfs_2*cz[dir] + _stride_pdfs_3*invdir[dir]; + double * _data_pdfs_10_20_m2227275638DDD757 = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3*dir; + _data_pdfs_m3B5BEDEA5094B12F[_stride_pdfs_0*x + _stride_pdfs_0*cx[dir]] = -0.30000000000000004*cx[dir]*weights[dir] + 
_data_pdfs_10_20_m2227275638DDD757[_stride_pdfs_0*x]; + } +} +} + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + + +void UniformGridGPU_UBB::run( IBlock * block, IndexVectors::Type type , cudaStream_t stream ) +{ + auto * indexVectors = block->getData<IndexVectors>(indexVectorID); + + auto pointer = indexVectors->pointerGpu(type); + + + int64_t indexVectorSize = int64_c( indexVectors->indexVector(type).size() ); + if( indexVectorSize == 0) + return; + + uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); + + auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID); + + WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(0, 0, 0, 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((256 < indexVectorSize) ? 256 : indexVectorSize)), int(1), int(1)); + dim3 _grid(int(( (indexVectorSize) % int(((256 < indexVectorSize) ? 256 : indexVectorSize)) == 0 ? (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) : ( (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) ) +1 )), int(1), int(1)); + internal_boundary_UniformGridGPU_UBB::boundary_UniformGridGPU_UBB<<<_grid, _block, 0, stream>>>(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); +} + +void UniformGridGPU_UBB::operator() ( IBlock * block, cudaStream_t stream ) +{ + run( block, IndexVectors::ALL, stream ); +} + +void UniformGridGPU_UBB::inner( IBlock * block, cudaStream_t stream ) +{ + run( block, IndexVectors::INNER, stream ); +} + +void UniformGridGPU_UBB::outer( IBlock * block, cudaStream_t stream ) +{ + run( block, IndexVectors::OUTER, stream ); +} + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_UBB.h b/apps/benchmarks/UniformGridGPU/UniformGridGPU_UBB.h new file mode 100644 index 0000000000000000000000000000000000000000..3ad393854b98e951a08a75a5632a9fa4a6b210ed --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_UBB.h @@ -0,0 +1,364 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file UniformGridGPU_UBB.h +//! 
\\author pystencils +//====================================================================================================================== + + +#include "core/DataTypes.h" + +#include "cuda/GPUField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" + +#include <set> +#include <vector> + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class UniformGridGPU_UBB +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir; + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + IndexVectors() : cpuVectors_(NUM_TYPES) {} + bool operator==(IndexVectors & other) { return other.cpuVectors_ == cpuVectors_; } + + ~IndexVectors() { + for( auto & gpuVec: gpuVectors_) + cudaFree( gpuVec ); + } + + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return &(cpuVectors_[t][0]); } + + IndexInfo * pointerGpu(Type t) { return gpuVectors_[t]; } + + + void syncGPU() + { + gpuVectors_.resize( cpuVectors_.size() ); + for(int i=0; i < NUM_TYPES; ++i ) + { + auto & gpuVec = gpuVectors_[i]; + auto & cpuVec = cpuVectors_[i]; + cudaFree( gpuVec ); + cudaMalloc( &gpuVec, sizeof(IndexInfo) * cpuVec.size() ); + cudaMemcpy( gpuVec, &cpuVec[0], sizeof(IndexInfo) * cpuVec.size(), cudaMemcpyHostToDevice ); + } + } + + private: + std::vector<CpuIndexVector> cpuVectors_; + + using GpuIndexVector = IndexInfo *; + std::vector<GpuIndexVector> gpuVectors_; + + }; + + + UniformGridGPU_UBB( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_ ) + : pdfsID(pdfsID_) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_UniformGridGPU_UBB"); + }; + + void operator() ( IBlock * block , cudaStream_t stream = 0 ); + void inner( IBlock * block , cudaStream_t stream = 0 ); + void outer( IBlock * block , cudaStream_t stream = 0 ); + + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>( &*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField( IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER); + + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = flagField->getFlag(domainFlagUID); + 
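+ // "inner" below is the cell interval of the block interior, shrunk by one
+ // cell: boundary links whose fluid cell lies inside it are collected into
+ // indexVectorInner, all remaining links into indexVectorOuter, so that the
+ // outer links can be processed while ghost-layer communication is in flight.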
+ auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + for( auto it = flagField->begin(); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 0 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 1 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 2 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 3 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 5 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 6 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 7 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 9 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + 
indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 11 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 12 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 13 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 15 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 16 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 17 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + } + + indexVectors->syncGPU(); + } + +private: + void run( IBlock * block, IndexVectors::Type type, cudaStream_t stream = 0 ); + + BlockDataID indexVectorID; + + BlockDataID pdfsID; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file
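A note on the launch configuration used by every generated launcher above: the one-line `dim3 _block(...)` / `dim3 _grid(...)` expressions are an inlined clamp-and-ceiling-divide. A minimal restatement, with hypothetical helper names `ceilDiv` and `launchConfig` that do not appear in the generated code:

```cpp
#include <cuda_runtime.h>
#include <cstdint>

// Hypothetical helpers; the generated launchers inline these expressions verbatim.
static inline int64_t ceilDiv( int64_t n, int64_t d )
{
   return ( n % d == 0 ) ? n / d : n / d + 1;
}

static inline void launchConfig( int64_t sx, int64_t sy, int64_t sz, dim3 & block, dim3 & grid )
{
   const int64_t bx = ( 16 < sx ) ? 16 : sx;   // clamp block to at most 16 x 16 x 1 threads
   const int64_t by = ( 16 < sy ) ? 16 : sy;
   const int64_t bz = ( 1  < sz ) ? 1  : sz;
   block = dim3( int(bx), int(by), int(bz) );
   grid  = dim3( int(ceilDiv(sx, bx)), int(ceilDiv(sy, by)), int(ceilDiv(sz, bz)) );
}
```

A 100x16x1 ghost-layer slice, for example, gets a 16x16x1 block and a 7x1x1 grid.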
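`UniformGridGPU_PackInfo::size()` pairs each communication direction with the number of PDFs that actually stream across it in the D3Q19 model: 5 for the six face directions (N, S, E, W, T, B) and 1 for the twelve edge directions as well as for C. The returned byte count is then simply cells x PDFs x sizeof(double); a worked example under the assumption of a 64x64x64 block with one ghost layer:

```cpp
// Buffer size for a face direction such as stencil::N on a 64x64x64 block
// (illustrative numbers only -- the real extents come from getGhostRegion()).
const uint64_t numCells        = 64ull * 64ull * 1ull;  // the ghost slice is one cell thick
const uint64_t elementsPerCell = 5;                     // PDFs streaming through a face in D3Q19
const uint64_t bufferBytes     = numCells * elementsPerCell * sizeof( double ); // 163840 bytes
```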
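The literal `-0.30000000000000004*cx[dir]*weights[dir]` in the UBB kernel is the usual velocity bounce-back correction `-6 w_d (c_d . u_w)`, specialised at code-generation time. Judging from the constant, the wall velocity baked in was u_w = (0.05, 0, 0): only the x component survives, and 6 * double(0.05) rounds to exactly the double printed as 0.30000000000000004. A small sketch that reproduces the number (the velocity value is an inference, not stated anywhere in the diff):

```cpp
#include <cstdio>

int main()
{
   const double ux = 0.05;        // assumed generation-time wall velocity, x component
   const double w  = 1.0 / 18.0;  // D3Q19 weight of an axis-aligned direction, e.g. E
   const int    cx = 1;
   // velocity bounce-back: f_invdir(x + c) = f_dir(x) - 6 * w * (c . u_w)
   printf( "%.17g\n", 6.0 * ux );            // prints 0.30000000000000004
   printf( "%.17g\n", -6.0 * w * cx * ux );  // the per-direction correction term
   return 0;
}
```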
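Finally, the raw byte arithmetic at the top of the boundary kernel (loads at offsets 0, 4, 8 and 12 with a 16-byte stride) is a hand-inlined read of the host-side `IndexInfo` struct. An equivalent, sketch-only formulation (`boundarySketch` is not part of the commit):

```cpp
#include <cstdint>

struct IndexInfo { int32_t x, y, z, dir; };  // 16 bytes -- matches the strides above

__global__ void boundarySketch( const IndexInfo * indexVector, int64_t indexVectorSize )
{
   const int64_t i = int64_t( blockDim.x ) * blockIdx.x + threadIdx.x;
   if( i >= indexVectorSize )
      return;
   const IndexInfo link = indexVector[i];  // one load instead of four pointer casts
   // ... apply the boundary rule at (link.x, link.y, link.z) along link.dir ...
   (void) link;  // silence the unused-variable warning in this sketch
}
```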