Skip to content
Snippets Groups Projects
Commit 77aed6a0 authored by Martin Bauer's avatar Martin Bauer
Browse files

Added generated files to GPU benchmark

parent 997ec0b7
Branches
Tags
No related merge requests found
Showing
with 4359 additions and 2 deletions
...@@ -2,5 +2,7 @@ ...@@ -2,5 +2,7 @@
waLBerla_link_files_to_builddir( "*.prm" ) waLBerla_link_files_to_builddir( "*.prm" )
waLBerla_add_executable ( NAME UniformGridBenchmarkGPU waLBerla_add_executable ( NAME UniformGridBenchmarkGPU
FILES UniformGridGPU.cpp UniformGridGPU.gen.py FILES UniformGridGPU.cpp UniformGridGPU_LatticeModel.cpp
UniformGridGPU_LbKernel.cu UniformGridGPU_NoSlip.cu UniformGridGPU_UBB.cu
UniformGridGPU_PackInfo.cu
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk ) DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk )
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
Parameters Parameters
{ {
omega 1.8; omega 1.8;
timesteps 10; timesteps 1000;
remainingTimeLoggerFrequency 3; remainingTimeLoggerFrequency 3;
vtkWriteFrequency 0; vtkWriteFrequency 0;
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \\file UniformGridGPU_LbKernel.h
//! \\author pystencils
//======================================================================================================================
#include "core/DataTypes.h"
#include "cuda/GPUField.h"
#include "cuda/ParallelStreams.h"
#include "field/SwapableCompare.h"
#include "domain_decomposition/BlockDataID.h"
#include "domain_decomposition/IBlock.h"
#include <set>
#ifdef __GNUC__
#define RESTRICT __restrict__
#elif _MSC_VER
#define RESTRICT __restrict
#else
#define RESTRICT
#endif
#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wunused-parameter"
#endif
namespace walberla {
namespace pystencils {
class UniformGridGPU_LbKernel
{
public:
UniformGridGPU_LbKernel( BlockDataID pdfsID_, double omega_)
: pdfsID(pdfsID_), omega(omega_)
{};
void operator() ( IBlock * block , cudaStream_t stream = 0 );
void inner( IBlock * block , cudaStream_t stream = 0 );
void outer( IBlock * block , cudaStream_t stream = 0 );
void setOuterPriority(int priority ) {
parallelStreams_.setStreamPriority(priority);
}
private:
BlockDataID pdfsID;
double omega;
std::set< cuda::GPUField<double> *, field::SwapableCompare< cuda::GPUField<double> * > > cache_pdfs_;
cuda::ParallelStreams parallelStreams_;
};
} // namespace pystencils
} // namespace walberla
#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
# pragma GCC diagnostic pop
#endif
\ No newline at end of file
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \\file UniformGridGPU_NoSlip.cpp
//! \\ingroup lbm
//! \\author lbmpy
//======================================================================================================================
#include <cmath>
#include "core/DataTypes.h"
#include "core/Macros.h"
#include "UniformGridGPU_NoSlip.h"
#include "cuda/ErrorChecking.h"
#define FUNC_PREFIX __global__
using namespace std;
namespace walberla {
namespace lbm {
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wconversion"
#endif
namespace internal_boundary_UniformGridGPU_NoSlip {
static FUNC_PREFIX void boundary_UniformGridGPU_NoSlip(uint8_t * const _data_indexVector, double * _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t indexVectorSize)
{
if (blockDim.x*blockIdx.x + threadIdx.x < indexVectorSize)
{
uint8_t * const _data_indexVector_10 = _data_indexVector;
const int32_t x = *((int32_t *)(& _data_indexVector_10[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
uint8_t * const _data_indexVector_14 = _data_indexVector + 4;
const int32_t y = *((int32_t *)(& _data_indexVector_14[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
uint8_t * const _data_indexVector_18 = _data_indexVector + 8;
const int32_t z = *((int32_t *)(& _data_indexVector_18[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
const int64_t cx [] = { 0, 0, 0, -1, 1, 0, 0, -1, 1, -1, 1, 0, 0, -1, 1, 0, 0, -1, 1 };
const int64_t cy [] = { 0, 1, -1, 0, 0, 0, 0, 1, 1, -1, -1, 1, -1, 0, 0, 1, -1, 0, 0 };
const int64_t cz [] = { 0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, -1 };
const int invdir [] = { 0, 2, 1, 4, 3, 6, 5, 10, 9, 8, 7, 16, 15, 18, 17, 12, 11, 14, 13 };
const double weights [] = { 0.333333333333333,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778 };
uint8_t * const _data_indexVector_112 = _data_indexVector + 12;
const int32_t dir = *((int32_t *)(& _data_indexVector_112[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
double * _data_pdfs_m3B5BEDEA5094B12F = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_1*cy[dir] + _stride_pdfs_2*z + _stride_pdfs_2*cz[dir] + _stride_pdfs_3*invdir[dir];
double * _data_pdfs_10_20_m2227275638DDD757 = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3*dir;
_data_pdfs_m3B5BEDEA5094B12F[_stride_pdfs_0*x + _stride_pdfs_0*cx[dir]] = _data_pdfs_10_20_m2227275638DDD757[_stride_pdfs_0*x];
}
}
}
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
void UniformGridGPU_NoSlip::run( IBlock * block, IndexVectors::Type type , cudaStream_t stream )
{
auto * indexVectors = block->getData<IndexVectors>(indexVectorID);
auto pointer = indexVectors->pointerGpu(type);
int64_t indexVectorSize = int64_c( indexVectors->indexVector(type).size() );
if( indexVectorSize == 0)
return;
uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer);
auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID);
WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()));
double * _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride());
dim3 _block(int(((256 < indexVectorSize) ? 256 : indexVectorSize)), int(1), int(1));
dim3 _grid(int(( (indexVectorSize) % int(((256 < indexVectorSize) ? 256 : indexVectorSize)) == 0 ? (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) : ( (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) ) +1 )), int(1), int(1));
internal_boundary_UniformGridGPU_NoSlip::boundary_UniformGridGPU_NoSlip<<<_grid, _block, 0, stream>>>(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize);
}
void UniformGridGPU_NoSlip::operator() ( IBlock * block, cudaStream_t stream )
{
run( block, IndexVectors::ALL, stream );
}
void UniformGridGPU_NoSlip::inner( IBlock * block, cudaStream_t stream )
{
run( block, IndexVectors::INNER, stream );
}
void UniformGridGPU_NoSlip::outer( IBlock * block, cudaStream_t stream )
{
run( block, IndexVectors::OUTER, stream );
}
} // namespace lbm
} // namespace walberla
\ No newline at end of file
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \\file UniformGridGPU_NoSlip.h
//! \\author pystencils
//======================================================================================================================
#include "core/DataTypes.h"
#include "cuda/GPUField.h"
#include "domain_decomposition/BlockDataID.h"
#include "domain_decomposition/IBlock.h"
#include "blockforest/StructuredBlockForest.h"
#include "field/FlagField.h"
#include <set>
#include <vector>
#ifdef __GNUC__
#define RESTRICT __restrict__
#elif _MSC_VER
#define RESTRICT __restrict
#else
#define RESTRICT
#endif
namespace walberla {
namespace lbm {
class UniformGridGPU_NoSlip
{
public:
struct IndexInfo {
int32_t x;
int32_t y;
int32_t z;
int32_t dir;
IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {}
bool operator==(const IndexInfo & o) const {
return x == o.x && y == o.y && z == o.z && dir == o.dir;
}
};
class IndexVectors
{
public:
using CpuIndexVector = std::vector<IndexInfo>;
enum Type {
ALL = 0,
INNER = 1,
OUTER = 2,
NUM_TYPES = 3
};
IndexVectors() : cpuVectors_(NUM_TYPES) {}
bool operator==(IndexVectors & other) { return other.cpuVectors_ == cpuVectors_; }
~IndexVectors() {
for( auto & gpuVec: gpuVectors_)
cudaFree( gpuVec );
}
CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; }
IndexInfo * pointerCpu(Type t) { return &(cpuVectors_[t][0]); }
IndexInfo * pointerGpu(Type t) { return gpuVectors_[t]; }
void syncGPU()
{
gpuVectors_.resize( cpuVectors_.size() );
for(int i=0; i < NUM_TYPES; ++i )
{
auto & gpuVec = gpuVectors_[i];
auto & cpuVec = cpuVectors_[i];
cudaFree( gpuVec );
cudaMalloc( &gpuVec, sizeof(IndexInfo) * cpuVec.size() );
cudaMemcpy( gpuVec, &cpuVec[0], sizeof(IndexInfo) * cpuVec.size(), cudaMemcpyHostToDevice );
}
}
private:
std::vector<CpuIndexVector> cpuVectors_;
using GpuIndexVector = IndexInfo *;
std::vector<GpuIndexVector> gpuVectors_;
};
UniformGridGPU_NoSlip( const shared_ptr<StructuredBlockForest> & blocks,
BlockDataID pdfsID_ )
: pdfsID(pdfsID_)
{
auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); };
indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_UniformGridGPU_NoSlip");
};
void operator() ( IBlock * block , cudaStream_t stream = 0 );
void inner( IBlock * block , cudaStream_t stream = 0 );
void outer( IBlock * block , cudaStream_t stream = 0 );
template<typename FlagField_T>
void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID,
FlagUID boundaryFlagUID, FlagUID domainFlagUID)
{
for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
fillFromFlagField<FlagField_T>( &*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID );
}
template<typename FlagField_T>
void fillFromFlagField( IBlock * block, ConstBlockDataID flagFieldID,
FlagUID boundaryFlagUID, FlagUID domainFlagUID )
{
auto * indexVectors = block->getData< IndexVectors > ( indexVectorID );
auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL);
auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER);
auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER);
auto * flagField = block->getData< FlagField_T > ( flagFieldID );
auto boundaryFlag = flagField->getFlag(boundaryFlagUID);
auto domainFlag = flagField->getFlag(domainFlagUID);
auto inner = flagField->xyzSize();
inner.expand( cell_idx_t(-1) );
indexVectorAll.clear();
indexVectorInner.clear();
indexVectorOuter.clear();
for( auto it = flagField->begin(); it != flagField->end(); ++it )
{
if( ! isFlagSet(it, domainFlag) )
continue;
if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 0 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 1 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 2 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 3 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 4 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 5 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 6 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 7 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 8 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 9 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 10 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 11 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 12 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 13 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 14 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 15 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 16 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 17 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 18 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
}
indexVectors->syncGPU();
}
private:
void run( IBlock * block, IndexVectors::Type type, cudaStream_t stream = 0 );
BlockDataID indexVectorID;
BlockDataID pdfsID;
};
} // namespace lbm
} // namespace walberla
\ No newline at end of file
This diff is collapsed.
#include "stencil/Directions.h"
#include "core/cell/CellInterval.h"
#include "cuda/GPUField.h"
#include "core/DataTypes.h"
#include "domain_decomposition/IBlock.h"
#include "cuda/communication/GeneratedGPUPackInfo.h"
#define FUNC_PREFIX __global__
namespace walberla {
namespace pystencils {
class UniformGridGPU_PackInfo : public ::walberla::cuda::GeneratedGPUPackInfo
{
public:
UniformGridGPU_PackInfo( BlockDataID pdfsID_ )
: pdfsID(pdfsID_)
{};
virtual void pack (stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream);
virtual void unpack(stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream);
virtual uint_t size (stencil::Direction dir, IBlock * block);
private:
BlockDataID pdfsID;
};
} // namespace pystencils
} // namespace walberla
\ No newline at end of file
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \\file UniformGridGPU_UBB.cpp
//! \\ingroup lbm
//! \\author lbmpy
//======================================================================================================================
#include <cmath>
#include "core/DataTypes.h"
#include "core/Macros.h"
#include "UniformGridGPU_UBB.h"
#include "cuda/ErrorChecking.h"
#define FUNC_PREFIX __global__
using namespace std;
namespace walberla {
namespace lbm {
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wconversion"
#endif
namespace internal_boundary_UniformGridGPU_UBB {
static FUNC_PREFIX void boundary_UniformGridGPU_UBB(uint8_t * const _data_indexVector, double * _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t indexVectorSize)
{
if (blockDim.x*blockIdx.x + threadIdx.x < indexVectorSize)
{
uint8_t * const _data_indexVector_10 = _data_indexVector;
const int32_t x = *((int32_t *)(& _data_indexVector_10[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
uint8_t * const _data_indexVector_14 = _data_indexVector + 4;
const int32_t y = *((int32_t *)(& _data_indexVector_14[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
uint8_t * const _data_indexVector_18 = _data_indexVector + 8;
const int32_t z = *((int32_t *)(& _data_indexVector_18[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
const int64_t cx [] = { 0, 0, 0, -1, 1, 0, 0, -1, 1, -1, 1, 0, 0, -1, 1, 0, 0, -1, 1 };
const int64_t cy [] = { 0, 1, -1, 0, 0, 0, 0, 1, 1, -1, -1, 1, -1, 0, 0, 1, -1, 0, 0 };
const int64_t cz [] = { 0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, -1 };
const int invdir [] = { 0, 2, 1, 4, 3, 6, 5, 10, 9, 8, 7, 16, 15, 18, 17, 12, 11, 14, 13 };
const double weights [] = { 0.333333333333333,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778 };
uint8_t * const _data_indexVector_112 = _data_indexVector + 12;
const int32_t dir = *((int32_t *)(& _data_indexVector_112[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
double * _data_pdfs_m3B5BEDEA5094B12F = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_1*cy[dir] + _stride_pdfs_2*z + _stride_pdfs_2*cz[dir] + _stride_pdfs_3*invdir[dir];
double * _data_pdfs_10_20_m2227275638DDD757 = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3*dir;
_data_pdfs_m3B5BEDEA5094B12F[_stride_pdfs_0*x + _stride_pdfs_0*cx[dir]] = -0.30000000000000004*cx[dir]*weights[dir] + _data_pdfs_10_20_m2227275638DDD757[_stride_pdfs_0*x];
}
}
}
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
void UniformGridGPU_UBB::run( IBlock * block, IndexVectors::Type type , cudaStream_t stream )
{
auto * indexVectors = block->getData<IndexVectors>(indexVectorID);
auto pointer = indexVectors->pointerGpu(type);
int64_t indexVectorSize = int64_c( indexVectors->indexVector(type).size() );
if( indexVectorSize == 0)
return;
uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer);
auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID);
WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()));
double * _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride());
dim3 _block(int(((256 < indexVectorSize) ? 256 : indexVectorSize)), int(1), int(1));
dim3 _grid(int(( (indexVectorSize) % int(((256 < indexVectorSize) ? 256 : indexVectorSize)) == 0 ? (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) : ( (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) ) +1 )), int(1), int(1));
internal_boundary_UniformGridGPU_UBB::boundary_UniformGridGPU_UBB<<<_grid, _block, 0, stream>>>(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize);
}
void UniformGridGPU_UBB::operator() ( IBlock * block, cudaStream_t stream )
{
run( block, IndexVectors::ALL, stream );
}
void UniformGridGPU_UBB::inner( IBlock * block, cudaStream_t stream )
{
run( block, IndexVectors::INNER, stream );
}
void UniformGridGPU_UBB::outer( IBlock * block, cudaStream_t stream )
{
run( block, IndexVectors::OUTER, stream );
}
} // namespace lbm
} // namespace walberla
\ No newline at end of file
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \\file UniformGridGPU_UBB.h
//! \\author pystencils
//======================================================================================================================
#include "core/DataTypes.h"
#include "cuda/GPUField.h"
#include "domain_decomposition/BlockDataID.h"
#include "domain_decomposition/IBlock.h"
#include "blockforest/StructuredBlockForest.h"
#include "field/FlagField.h"
#include <set>
#include <vector>
#ifdef __GNUC__
#define RESTRICT __restrict__
#elif _MSC_VER
#define RESTRICT __restrict
#else
#define RESTRICT
#endif
namespace walberla {
namespace lbm {
class UniformGridGPU_UBB
{
public:
struct IndexInfo {
int32_t x;
int32_t y;
int32_t z;
int32_t dir;
IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {}
bool operator==(const IndexInfo & o) const {
return x == o.x && y == o.y && z == o.z && dir == o.dir;
}
};
class IndexVectors
{
public:
using CpuIndexVector = std::vector<IndexInfo>;
enum Type {
ALL = 0,
INNER = 1,
OUTER = 2,
NUM_TYPES = 3
};
IndexVectors() : cpuVectors_(NUM_TYPES) {}
bool operator==(IndexVectors & other) { return other.cpuVectors_ == cpuVectors_; }
~IndexVectors() {
for( auto & gpuVec: gpuVectors_)
cudaFree( gpuVec );
}
CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; }
IndexInfo * pointerCpu(Type t) { return &(cpuVectors_[t][0]); }
IndexInfo * pointerGpu(Type t) { return gpuVectors_[t]; }
void syncGPU()
{
gpuVectors_.resize( cpuVectors_.size() );
for(int i=0; i < NUM_TYPES; ++i )
{
auto & gpuVec = gpuVectors_[i];
auto & cpuVec = cpuVectors_[i];
cudaFree( gpuVec );
cudaMalloc( &gpuVec, sizeof(IndexInfo) * cpuVec.size() );
cudaMemcpy( gpuVec, &cpuVec[0], sizeof(IndexInfo) * cpuVec.size(), cudaMemcpyHostToDevice );
}
}
private:
std::vector<CpuIndexVector> cpuVectors_;
using GpuIndexVector = IndexInfo *;
std::vector<GpuIndexVector> gpuVectors_;
};
UniformGridGPU_UBB( const shared_ptr<StructuredBlockForest> & blocks,
BlockDataID pdfsID_ )
: pdfsID(pdfsID_)
{
auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); };
indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_UniformGridGPU_UBB");
};
void operator() ( IBlock * block , cudaStream_t stream = 0 );
void inner( IBlock * block , cudaStream_t stream = 0 );
void outer( IBlock * block , cudaStream_t stream = 0 );
template<typename FlagField_T>
void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID,
FlagUID boundaryFlagUID, FlagUID domainFlagUID)
{
for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
fillFromFlagField<FlagField_T>( &*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID );
}
template<typename FlagField_T>
void fillFromFlagField( IBlock * block, ConstBlockDataID flagFieldID,
FlagUID boundaryFlagUID, FlagUID domainFlagUID )
{
auto * indexVectors = block->getData< IndexVectors > ( indexVectorID );
auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL);
auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER);
auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER);
auto * flagField = block->getData< FlagField_T > ( flagFieldID );
auto boundaryFlag = flagField->getFlag(boundaryFlagUID);
auto domainFlag = flagField->getFlag(domainFlagUID);
auto inner = flagField->xyzSize();
inner.expand( cell_idx_t(-1) );
indexVectorAll.clear();
indexVectorInner.clear();
indexVectorOuter.clear();
for( auto it = flagField->begin(); it != flagField->end(); ++it )
{
if( ! isFlagSet(it, domainFlag) )
continue;
if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 0 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 1 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 2 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 3 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 4 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 5 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 6 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 7 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 8 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 9 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 10 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 11 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 12 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 13 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 14 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 15 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 16 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 17 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 18 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
}
indexVectors->syncGPU();
}
private:
void run( IBlock * block, IndexVectors::Type type, cudaStream_t stream = 0 );
BlockDataID indexVectorID;
BlockDataID pdfsID;
};
} // namespace lbm
} // namespace walberla
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment