Added generated files to GPU benchmark

77aed6a0 · Martin Bauer · 997ec0b7 · 77aed6a0 · 77aed6a0 · 77aed6a0
Commit 77aed6a0 authored 6 years ago by Martin Bauer
--- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt
+++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
@@ -2,5 +2,7 @@
 waLBerla_link_files_to_builddir( "*.prm" )
 waLBerla_add_executable ( NAME UniformGridBenchmarkGPU
-                          FILES UniformGridGPU.cpp UniformGridGPU.gen.py
+                          FILES UniformGridGPU.cpp UniformGridGPU_LatticeModel.cpp
+                                UniformGridGPU_LbKernel.cu UniformGridGPU_NoSlip.cu UniformGridGPU_UBB.cu
+                                UniformGridGPU_PackInfo.cu
                          DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk )
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
@@ -2,7 +2,7 @@
 Parameters 
 {
 	omega           1.8;
-	timesteps       10;
+	timesteps       1000;
 	remainingTimeLoggerFrequency 3;
 	vtkWriteFrequency 0;

--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_LatticeModel.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LatticeModel.cpp
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_LatticeModel.h
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LatticeModel.h
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_LbKernel.cu
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LbKernel.cu
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_LbKernel.h
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LbKernel.h
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file UniformGridGPU_LbKernel.h
+//! \\author pystencils
+//======================================================================================================================
+#include "core/DataTypes.h"
+#include "cuda/GPUField.h"
+#include "cuda/ParallelStreams.h"
+#include "field/SwapableCompare.h"
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+#include <set>
+#ifdef __GNUC__
+#define RESTRICT __restrict__
+#elif _MSC_VER
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif
+#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
+#   pragma GCC diagnostic push
+#   pragma GCC diagnostic ignored "-Wunused-parameter"
+#endif
+namespace walberla {
+namespace pystencils {
+class UniformGridGPU_LbKernel
+{
+public:
+    UniformGridGPU_LbKernel( BlockDataID pdfsID_, double omega_)
+        : pdfsID(pdfsID_), omega(omega_)
+    {};
+    void operator() ( IBlock * block , cudaStream_t stream = 0 );
+    void inner( IBlock * block , cudaStream_t stream = 0 );
+    void outer( IBlock * block , cudaStream_t stream = 0 );
+    void setOuterPriority(int priority ) {
+        parallelStreams_.setStreamPriority(priority);
+    }
+private:
+    BlockDataID pdfsID;
+    double omega;
+    std::set< cuda::GPUField<double> *, field::SwapableCompare< cuda::GPUField<double> * > > cache_pdfs_;
+    cuda::ParallelStreams parallelStreams_;
+};
+} // namespace pystencils
+} // namespace walberla
+#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
+#   pragma GCC diagnostic pop
+#endif
\ No newline at end of file
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_NoSlip.cu
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_NoSlip.cu
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file UniformGridGPU_NoSlip.cpp
+//! \\ingroup lbm
+//! \\author lbmpy
+//======================================================================================================================
+#include <cmath>
+#include "core/DataTypes.h"
+#include "core/Macros.h"
+#include "UniformGridGPU_NoSlip.h"
+#include "cuda/ErrorChecking.h"
+#define FUNC_PREFIX __global__
+using namespace std;
+namespace walberla {
+namespace lbm {
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+namespace internal_boundary_UniformGridGPU_NoSlip {
+static FUNC_PREFIX void boundary_UniformGridGPU_NoSlip(uint8_t * const _data_indexVector, double * _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t indexVectorSize)
+{
+   if (blockDim.x*blockIdx.x + threadIdx.x < indexVectorSize)
+   {
+      uint8_t * const _data_indexVector_10 = _data_indexVector;
+      const int32_t x = *((int32_t *)(& _data_indexVector_10[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
+      uint8_t * const _data_indexVector_14 = _data_indexVector + 4;
+      const int32_t y = *((int32_t *)(& _data_indexVector_14[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
+      uint8_t * const _data_indexVector_18 = _data_indexVector + 8;
+      const int32_t z = *((int32_t *)(& _data_indexVector_18[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
+      const int64_t cx [] = { 0, 0, 0, -1, 1, 0, 0, -1, 1, -1, 1, 0, 0, -1, 1, 0, 0, -1, 1 };
+      const int64_t cy [] = { 0, 1, -1, 0, 0, 0, 0, 1, 1, -1, -1, 1, -1, 0, 0, 1, -1, 0, 0 };
+      const int64_t cz [] = { 0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, -1 };
+      const int invdir [] = { 0, 2, 1, 4, 3, 6, 5, 10, 9, 8, 7, 16, 15, 18, 17, 12, 11, 14, 13 };
+      const double weights [] = { 0.333333333333333,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778 };
+      uint8_t * const _data_indexVector_112 = _data_indexVector + 12;
+      const int32_t dir = *((int32_t *)(& _data_indexVector_112[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
+      double * _data_pdfs_m3B5BEDEA5094B12F = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_1*cy[dir] + _stride_pdfs_2*z + _stride_pdfs_2*cz[dir] + _stride_pdfs_3*invdir[dir];
+      double * _data_pdfs_10_20_m2227275638DDD757 = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3*dir;
+      _data_pdfs_m3B5BEDEA5094B12F[_stride_pdfs_0*x + _stride_pdfs_0*cx[dir]] = _data_pdfs_10_20_m2227275638DDD757[_stride_pdfs_0*x];
+   } 
+}
+}
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+void UniformGridGPU_NoSlip::run( IBlock * block, IndexVectors::Type type , cudaStream_t stream )
+{
+    auto * indexVectors = block->getData<IndexVectors>(indexVectorID);
+    auto pointer = indexVectors->pointerGpu(type);
+    int64_t indexVectorSize = int64_c( indexVectors->indexVector(type).size() );
+    if( indexVectorSize == 0)
+        return;
+    uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer);
+    auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID);
+    WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()));
+    double * _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
+    const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
+    const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
+    const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
+    const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride());
+    dim3 _block(int(((256 < indexVectorSize) ? 256 : indexVectorSize)), int(1), int(1));
+    dim3 _grid(int(( (indexVectorSize) % int(((256 < indexVectorSize) ? 256 : indexVectorSize)) == 0 ? (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) : ( (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) ) +1 )), int(1), int(1));
+    internal_boundary_UniformGridGPU_NoSlip::boundary_UniformGridGPU_NoSlip<<<_grid, _block, 0, stream>>>(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize);
+}
+void UniformGridGPU_NoSlip::operator() ( IBlock * block, cudaStream_t stream  )
+{
+    run( block, IndexVectors::ALL, stream );
+}
+void UniformGridGPU_NoSlip::inner( IBlock * block, cudaStream_t stream  )
+{
+    run( block, IndexVectors::INNER, stream  );
+}
+void UniformGridGPU_NoSlip::outer( IBlock * block, cudaStream_t stream  )
+{
+    run( block, IndexVectors::OUTER, stream  );
+}
+} // namespace lbm
+} // namespace walberla
\ No newline at end of file
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_NoSlip.h
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_NoSlip.h
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file UniformGridGPU_NoSlip.h
+//! \\author pystencils
+//======================================================================================================================
+#include "core/DataTypes.h"
+#include "cuda/GPUField.h"
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+#include "blockforest/StructuredBlockForest.h"
+#include "field/FlagField.h"
+#include <set>
+#include <vector>
+#ifdef __GNUC__
+#define RESTRICT __restrict__
+#elif _MSC_VER
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif
+namespace walberla {
+namespace lbm {
+class UniformGridGPU_NoSlip
+{
+public:
+    struct IndexInfo { 
+        int32_t x;
+        int32_t y;
+        int32_t z;
+        int32_t dir;
+        IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {}
+        bool operator==(const IndexInfo & o) const {
+            return x == o.x && y == o.y && z == o.z && dir == o.dir;
+        }
+    };
+    class IndexVectors
+    {
+    public:
+        using CpuIndexVector = std::vector<IndexInfo>;
+        enum Type {
+            ALL = 0,
+            INNER = 1,
+            OUTER = 2,
+            NUM_TYPES = 3
+        };
+        IndexVectors() : cpuVectors_(NUM_TYPES)  {}
+        bool operator==(IndexVectors & other) { return other.cpuVectors_ == cpuVectors_; }
+        ~IndexVectors() {
+            for( auto & gpuVec: gpuVectors_)
+                cudaFree( gpuVec );
+        }
+        CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; }
+        IndexInfo * pointerCpu(Type t)  { return &(cpuVectors_[t][0]); }
+        IndexInfo * pointerGpu(Type t)  { return gpuVectors_[t]; }
+        void syncGPU()
+        {
+            gpuVectors_.resize( cpuVectors_.size() );
+            for(int i=0; i < NUM_TYPES; ++i )
+            {
+                auto & gpuVec = gpuVectors_[i];
+                auto & cpuVec = cpuVectors_[i];
+                cudaFree( gpuVec );
+                cudaMalloc( &gpuVec, sizeof(IndexInfo) * cpuVec.size() );
+                cudaMemcpy( gpuVec, &cpuVec[0], sizeof(IndexInfo) * cpuVec.size(), cudaMemcpyHostToDevice );
+            }
+        }
+    private:
+        std::vector<CpuIndexVector> cpuVectors_;
+        using GpuIndexVector = IndexInfo *;
+        std::vector<GpuIndexVector> gpuVectors_;
+    };
+    UniformGridGPU_NoSlip( const shared_ptr<StructuredBlockForest> & blocks,
+                   BlockDataID pdfsID_ )
+        : pdfsID(pdfsID_)
+    {
+        auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); };
+        indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_UniformGridGPU_NoSlip");
+    };
+    void operator() ( IBlock * block , cudaStream_t stream = 0 );
+    void inner( IBlock * block , cudaStream_t stream = 0 );
+    void outer( IBlock * block , cudaStream_t stream = 0 );
+    template<typename FlagField_T>
+    void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID,
+                            FlagUID boundaryFlagUID, FlagUID domainFlagUID)
+    {
+        for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+            fillFromFlagField<FlagField_T>( &*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID );
+    }
+    template<typename FlagField_T>
+    void fillFromFlagField( IBlock * block, ConstBlockDataID flagFieldID,
+                            FlagUID boundaryFlagUID, FlagUID domainFlagUID )
+    {
+        auto * indexVectors = block->getData< IndexVectors > ( indexVectorID );
+        auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL);
+        auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER);
+        auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER);
+        auto * flagField = block->getData< FlagField_T > ( flagFieldID );
+        auto boundaryFlag = flagField->getFlag(boundaryFlagUID);
+        auto domainFlag = flagField->getFlag(domainFlagUID);
+        auto inner = flagField->xyzSize();
+        inner.expand( cell_idx_t(-1) );
+        indexVectorAll.clear();
+        indexVectorInner.clear();
+        indexVectorOuter.clear();
+        for( auto it = flagField->begin(); it != flagField->end(); ++it )
+        {
+            if( ! isFlagSet(it, domainFlag) )
+                continue;
+            if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  0 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  1 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  2 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  3 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  4 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  5 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  6 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  7 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  8 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  9 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  10 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  11 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  12 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  13 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  14 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  15 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  16 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  17 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  18 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+        }
+        indexVectors->syncGPU();
+    }
+private:
+    void run( IBlock * block, IndexVectors::Type type, cudaStream_t stream = 0 );
+    BlockDataID indexVectorID;
+    BlockDataID pdfsID;
+};
+} // namespace lbm
+} // namespace walberla
\ No newline at end of file
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_PackInfo.cu
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_PackInfo.cu
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_PackInfo.h
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_PackInfo.h
+#include "stencil/Directions.h"
+#include "core/cell/CellInterval.h"
+#include "cuda/GPUField.h"
+#include "core/DataTypes.h"
+#include "domain_decomposition/IBlock.h"
+#include "cuda/communication/GeneratedGPUPackInfo.h"
+#define FUNC_PREFIX __global__
+namespace walberla {
+namespace pystencils {
+class UniformGridGPU_PackInfo : public ::walberla::cuda::GeneratedGPUPackInfo
+{
+public:
+    UniformGridGPU_PackInfo( BlockDataID pdfsID_ )
+        : pdfsID(pdfsID_)
+    {};
+    virtual void pack  (stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream);
+    virtual void unpack(stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream);
+    virtual uint_t size  (stencil::Direction dir, IBlock * block);
+private:
+    BlockDataID pdfsID;
+};
+} // namespace pystencils
+} // namespace walberla
\ No newline at end of file
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_UBB.cu
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_UBB.cu
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file UniformGridGPU_UBB.cpp
+//! \\ingroup lbm
+//! \\author lbmpy
+//======================================================================================================================
+#include <cmath>
+#include "core/DataTypes.h"
+#include "core/Macros.h"
+#include "UniformGridGPU_UBB.h"
+#include "cuda/ErrorChecking.h"
+#define FUNC_PREFIX __global__
+using namespace std;
+namespace walberla {
+namespace lbm {
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+namespace internal_boundary_UniformGridGPU_UBB {
+static FUNC_PREFIX void boundary_UniformGridGPU_UBB(uint8_t * const _data_indexVector, double * _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t indexVectorSize)
+{
+   if (blockDim.x*blockIdx.x + threadIdx.x < indexVectorSize)
+   {
+      uint8_t * const _data_indexVector_10 = _data_indexVector;
+      const int32_t x = *((int32_t *)(& _data_indexVector_10[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
+      uint8_t * const _data_indexVector_14 = _data_indexVector + 4;
+      const int32_t y = *((int32_t *)(& _data_indexVector_14[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
+      uint8_t * const _data_indexVector_18 = _data_indexVector + 8;
+      const int32_t z = *((int32_t *)(& _data_indexVector_18[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
+      const int64_t cx [] = { 0, 0, 0, -1, 1, 0, 0, -1, 1, -1, 1, 0, 0, -1, 1, 0, 0, -1, 1 };
+      const int64_t cy [] = { 0, 1, -1, 0, 0, 0, 0, 1, 1, -1, -1, 1, -1, 0, 0, 1, -1, 0, 0 };
+      const int64_t cz [] = { 0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, -1 };
+      const int invdir [] = { 0, 2, 1, 4, 3, 6, 5, 10, 9, 8, 7, 16, 15, 18, 17, 12, 11, 14, 13 };
+      const double weights [] = { 0.333333333333333,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778 };
+      uint8_t * const _data_indexVector_112 = _data_indexVector + 12;
+      const int32_t dir = *((int32_t *)(& _data_indexVector_112[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
+      double * _data_pdfs_m3B5BEDEA5094B12F = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_1*cy[dir] + _stride_pdfs_2*z + _stride_pdfs_2*cz[dir] + _stride_pdfs_3*invdir[dir];
+      double * _data_pdfs_10_20_m2227275638DDD757 = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3*dir;
+      _data_pdfs_m3B5BEDEA5094B12F[_stride_pdfs_0*x + _stride_pdfs_0*cx[dir]] = -0.30000000000000004*cx[dir]*weights[dir] + _data_pdfs_10_20_m2227275638DDD757[_stride_pdfs_0*x];
+   } 
+}
+}
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+void UniformGridGPU_UBB::run( IBlock * block, IndexVectors::Type type , cudaStream_t stream )
+{
+    auto * indexVectors = block->getData<IndexVectors>(indexVectorID);
+    auto pointer = indexVectors->pointerGpu(type);
+    int64_t indexVectorSize = int64_c( indexVectors->indexVector(type).size() );
+    if( indexVectorSize == 0)
+        return;
+    uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer);
+    auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID);
+    WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()));
+    double * _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
+    const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
+    const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
+    const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
+    const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride());
+    dim3 _block(int(((256 < indexVectorSize) ? 256 : indexVectorSize)), int(1), int(1));
+    dim3 _grid(int(( (indexVectorSize) % int(((256 < indexVectorSize) ? 256 : indexVectorSize)) == 0 ? (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) : ( (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) ) +1 )), int(1), int(1));
+    internal_boundary_UniformGridGPU_UBB::boundary_UniformGridGPU_UBB<<<_grid, _block, 0, stream>>>(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize);
+}
+void UniformGridGPU_UBB::operator() ( IBlock * block, cudaStream_t stream  )
+{
+    run( block, IndexVectors::ALL, stream );
+}
+void UniformGridGPU_UBB::inner( IBlock * block, cudaStream_t stream  )
+{
+    run( block, IndexVectors::INNER, stream  );
+}
+void UniformGridGPU_UBB::outer( IBlock * block, cudaStream_t stream  )
+{
+    run( block, IndexVectors::OUTER, stream  );
+}
+} // namespace lbm
+} // namespace walberla
\ No newline at end of file
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_UBB.h
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_UBB.h
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file UniformGridGPU_UBB.h
+//! \\author pystencils
+//======================================================================================================================
+#include "core/DataTypes.h"
+#include "cuda/GPUField.h"
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+#include "blockforest/StructuredBlockForest.h"
+#include "field/FlagField.h"
+#include <set>
+#include <vector>
+#ifdef __GNUC__
+#define RESTRICT __restrict__
+#elif _MSC_VER
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif
+namespace walberla {
+namespace lbm {
+class UniformGridGPU_UBB
+{
+public:
+    struct IndexInfo { 
+        int32_t x;
+        int32_t y;
+        int32_t z;
+        int32_t dir;
+        IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {}
+        bool operator==(const IndexInfo & o) const {
+            return x == o.x && y == o.y && z == o.z && dir == o.dir;
+        }
+    };
+    class IndexVectors
+    {
+    public:
+        using CpuIndexVector = std::vector<IndexInfo>;
+        enum Type {
+            ALL = 0,
+            INNER = 1,
+            OUTER = 2,
+            NUM_TYPES = 3
+        };
+        IndexVectors() : cpuVectors_(NUM_TYPES)  {}
+        bool operator==(IndexVectors & other) { return other.cpuVectors_ == cpuVectors_; }
+        ~IndexVectors() {
+            for( auto & gpuVec: gpuVectors_)
+                cudaFree( gpuVec );
+        }
+        CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; }
+        IndexInfo * pointerCpu(Type t)  { return &(cpuVectors_[t][0]); }
+        IndexInfo * pointerGpu(Type t)  { return gpuVectors_[t]; }
+        void syncGPU()
+        {
+            gpuVectors_.resize( cpuVectors_.size() );
+            for(int i=0; i < NUM_TYPES; ++i )
+            {
+                auto & gpuVec = gpuVectors_[i];
+                auto & cpuVec = cpuVectors_[i];
+                cudaFree( gpuVec );
+                cudaMalloc( &gpuVec, sizeof(IndexInfo) * cpuVec.size() );
+                cudaMemcpy( gpuVec, &cpuVec[0], sizeof(IndexInfo) * cpuVec.size(), cudaMemcpyHostToDevice );
+            }
+        }
+    private:
+        std::vector<CpuIndexVector> cpuVectors_;
+        using GpuIndexVector = IndexInfo *;
+        std::vector<GpuIndexVector> gpuVectors_;
+    };
+    UniformGridGPU_UBB( const shared_ptr<StructuredBlockForest> & blocks,
+                   BlockDataID pdfsID_ )
+        : pdfsID(pdfsID_)
+    {
+        auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); };
+        indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_UniformGridGPU_UBB");
+    };
+    void operator() ( IBlock * block , cudaStream_t stream = 0 );
+    void inner( IBlock * block , cudaStream_t stream = 0 );
+    void outer( IBlock * block , cudaStream_t stream = 0 );
+    template<typename FlagField_T>
+    void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID,
+                            FlagUID boundaryFlagUID, FlagUID domainFlagUID)
+    {
+        for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+            fillFromFlagField<FlagField_T>( &*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID );
+    }
+    template<typename FlagField_T>
+    void fillFromFlagField( IBlock * block, ConstBlockDataID flagFieldID,
+                            FlagUID boundaryFlagUID, FlagUID domainFlagUID )
+    {
+        auto * indexVectors = block->getData< IndexVectors > ( indexVectorID );
+        auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL);
+        auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER);
+        auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER);
+        auto * flagField = block->getData< FlagField_T > ( flagFieldID );
+        auto boundaryFlag = flagField->getFlag(boundaryFlagUID);
+        auto domainFlag = flagField->getFlag(domainFlagUID);
+        auto inner = flagField->xyzSize();
+        inner.expand( cell_idx_t(-1) );
+        indexVectorAll.clear();
+        indexVectorInner.clear();
+        indexVectorOuter.clear();
+        for( auto it = flagField->begin(); it != flagField->end(); ++it )
+        {
+            if( ! isFlagSet(it, domainFlag) )
+                continue;
+            if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  0 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  1 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  2 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  3 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  4 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  5 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  6 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  7 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  8 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  9 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  10 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  11 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  12 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  13 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  14 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  15 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  16 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  17 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) )
+            {
+                auto element = IndexInfo(it.x(), it.y(),  it.z(),  18 );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+        }
+        indexVectors->syncGPU();
+    }
+private:
+    void run( IBlock * block, IndexVectors::Type type, cudaStream_t stream = 0 );
+    BlockDataID indexVectorID;
+    BlockDataID pdfsID;
+};
+} // namespace lbm
+} // namespace walberla
\ No newline at end of file