Commit ddc0a76e authored by Martin Bauer

Code generation: new API

- information is now passed from CMake to the code generator: double/float precision, OpenMP, ...
parent 83a55d9a
CMakeLists.txt:

waLBerla_link_files_to_builddir( "*.prm" )

# Old API (removed): every file produced by the generator script had to be
# declared explicitly:
# waLBerla_python_file_generates(UniformGridGPU.py
#     UniformGridGPU_LatticeModel.cpp UniformGridGPU_LatticeModel.h
#     UniformGridGPU_LbKernel.cu UniformGridGPU_LbKernel.h
#     UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h
#     UniformGridGPU_UBB.cu UniformGridGPU_UBB.h
#     UniformGridGPU_PackInfo.cu UniformGridGPU_PackInfo.h
# )

waLBerla_add_executable( NAME UniformGridBenchmarkGPU
                         # Old API (removed): the generated sources were listed
                         # alongside the hand-written ones:
                         #   FILES UniformGridGPU.cpp UniformGridGPU_LatticeModel.cpp
                         #         UniformGridGPU_LbKernel.cu UniformGridGPU_NoSlip.cu
                         #         UniformGridGPU_UBB.cu UniformGridGPU_PackInfo.cu
                         # New API: the generator script itself is listed as a source
                         FILES UniformGridGPU.cpp UniformGridGPU.py
                         DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk )
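With the new API the executable simply lists the generator script under FILES; CMake runs it at build time to produce the sources the target needs, and build options such as floating-point precision and OpenMP usage are forwarded into the script (see the commit message) instead of being hard-coded there.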
UniformGridGPU.py (old version):

import sympy as sp
from lbmpy.creationfunctions import create_lb_update_rule, create_lb_method
from lbmpy.boundaries import NoSlip, UBB
from lbmpy_walberla import generate_lattice_model_files
from lbmpy_walberla.boundary import create_boundary_class
from pystencils_walberla.sweep import Sweep
from pystencils_walberla.cmake_integration import codegen

dtype = 'float64'

# LB options
options = {
    'method': 'srt',
    'stencil': 'D3Q19',
    'relaxation_rate': sp.Symbol("omega"),
    'field_name': 'pdfs',
    'compressible': False,
    'temporary_field_name': 'pdfs_tmp',
    'optimization': {'cse_global': True,
                     'cse_pdfs': True,
                     'double_precision': dtype == 'float64'}
}

# GPU optimization options
inner_opt = {'gpu_indexing_params': {'block_size': (128, 1, 1)}, 'data_type': dtype}
outer_opt = {'gpu_indexing_params': {'block_size': (32, 32, 32)}, 'data_type': dtype}


def lb_assignments():
    ur = create_lb_update_rule(**options)
    return ur.all_assignments


def genBoundary():
    boundary = UBB([0.05, 0, 0], dim=3, name="UniformGridGPU_UBB")
    return create_boundary_class(boundary, create_lb_method(**options), target='gpu')


def genNoSlip():
    boundary = NoSlip(name='UniformGridGPU_NoSlip')
    return create_boundary_class(boundary, create_lb_method(**options), target='gpu')


generate_lattice_model_files(class_name='UniformGridGPU_LatticeModel', **options)

Sweep.generate_inner_outer_kernel('UniformGridGPU_LbKernel',
                                  lambda: create_lb_update_rule(**options).all_assignments,
                                  target='gpu',
                                  temporary_fields=['pdfs_tmp'],
                                  field_swaps=[('pdfs', 'pdfs_tmp')],
                                  optimization=inner_opt,
                                  outer_optimization=outer_opt)

Sweep.generate_pack_info('UniformGridGPU_PackInfo', lb_assignments, target='gpu')

codegen.register(['UniformGridGPU_UBB.h', 'UniformGridGPU_UBB.cu'], genBoundary)
codegen.register(['UniformGridGPU_NoSlip.h', 'UniformGridGPU_NoSlip.cu'], genNoSlip)
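The new version of the same script, below, replaces this register-callback machinery with a single CodeGeneration context manager: generated classes are produced by passing the context to the generate_* functions, and the explicit precision handling (dtype) disappears because that information now arrives from CMake.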
UniformGridGPU.py (new version):

import sympy as sp
from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule
from lbmpy.boundaries import NoSlip, UBB
from pystencils_walberla import CodeGeneration, generate_sweep, generate_pack_info_from_kernel
from lbmpy_walberla import generate_lattice_model, generate_boundary

with CodeGeneration() as ctx:
    # LB options
    options = {
        'method': 'srt',
        'stencil': 'D3Q19',
        'relaxation_rate': sp.Symbol("omega"),
        'field_name': 'pdfs',
        'compressible': False,
        'temporary_field_name': 'pdfs_tmp',
        'optimization': {'cse_global': True,
                         'cse_pdfs': True,
                         'gpu_indexing_params': {'block_size': (128, 1, 1)}}
    }
    lb_method = create_lb_method(**options)
    update_rule = create_lb_update_rule(lb_method=lb_method, **options)

    # CPU lattice model - required for macroscopic value computation, VTK output etc.
    generate_lattice_model(ctx, 'UniformGridGPU_LatticeModel', lb_method)

    # GPU LB sweep & boundaries
    generate_sweep(ctx, 'UniformGridGPU_LbKernel', update_rule,
                   field_swaps=[('pdfs', 'pdfs_tmp')],
                   inner_outer_split=True, target='gpu')
    generate_boundary(ctx, 'UniformGridGPU_NoSlip', NoSlip(), lb_method, target='gpu')
    generate_boundary(ctx, 'UniformGridGPU_UBB', UBB([0.05, 0, 0]), lb_method, target='gpu')

    # communication
    generate_pack_info_from_kernel(ctx, 'UniformGridGPU_PackInfo', update_rule, target='gpu')
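The CodeGeneration context is also the channel for the build information named in the commit message. As a minimal sketch of how a script can branch on the CMake configuration — assuming the context exposes attributes along the lines of double_accuracy and openmp, names that are not visible in this diff:

import sympy as sp
from lbmpy.creationfunctions import create_lb_update_rule
from pystencils_walberla import CodeGeneration, generate_sweep

with CodeGeneration() as ctx:
    # Hypothetical flag names - the diff only shows that double/float and
    # OpenMP settings travel from CMake into this context.
    optimization = {'double_precision': getattr(ctx, 'double_accuracy', True),
                    'openmp': getattr(ctx, 'openmp', False)}
    update_rule = create_lb_update_rule(method='srt', stencil='D3Q19',
                                        relaxation_rate=sp.Symbol("omega"),
                                        optimization=optimization)
    # 'ExampleKernel' is an illustrative class name, not part of this commit
    generate_sweep(ctx, 'ExampleKernel', update_rule)

The benefit is that one script serves every build configuration: switching the whole build between double and float, or toggling OpenMP, no longer requires touching the generator scripts.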
Benchmark parameter file (*.prm):

Parameters
{
    omega                        1.8;
    timesteps                    2;

    remainingTimeLoggerFrequency 3;
    vtkWriteFrequency            0;

    overlapCommunication         false;
    cudaEnabledMPI               false;
}

DomainSetup
{
    blocks        < 1, 1, 1 >;
    cellsPerBlock < 50, 20, 10 >;
    periodic      < 0, 0, 1 >;
}

Boundaries
{
    Border { direction W; walldistance -1; flag NoSlip; }
    Border { direction E; walldistance -1; flag NoSlip; }
    Border { direction S; walldistance -1; flag NoSlip; }
    Border { direction N; walldistance -1; flag UBB; }
}
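These runtime parameters connect directly to the generated code: omega is the sp.Symbol("omega") that was left free during code generation and is passed to the UniformGridGPU_LbKernel constructor, while overlapCommunication presumably switches the application between the plain operator() sweep and the inner()/outer() split that hides communication (both variants are emitted, as the header below shows).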
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file UniformGridGPU_LbKernel.h
//! \author pystencils
//======================================================================================================================
#pragma once

#include "core/DataTypes.h"
#include "cuda/GPUField.h"
#include "cuda/ParallelStreams.h"
#include "field/SwapableCompare.h"
#include "domain_decomposition/BlockDataID.h"
#include "domain_decomposition/IBlock.h"
#include <set>
#ifdef __GNUC__
#define RESTRICT __restrict__
#elif _MSC_VER
#define RESTRICT __restrict
#else
#define RESTRICT
#endif
#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wunused-parameter"
#endif
namespace walberla {
namespace pystencils {

class UniformGridGPU_LbKernel
{
public:
   UniformGridGPU_LbKernel( BlockDataID pdfsID_, double omega_ )
      : pdfsID(pdfsID_), omega(omega_)
   {}

   ~UniformGridGPU_LbKernel() {
      // temporary fields allocated for the pdfs/pdfs_tmp swap are cached
      // here and freed on destruction
      for( auto p: cache_pdfs_ ) {
         delete p;
      }
   }

   // Full sweep over a block
   void operator() ( IBlock * block, cudaStream_t stream = 0 );
   // Inner/outer split: outer() updates the layers needed by the ghost-layer
   // exchange, inner() updates the remainder so it can overlap with
   // communication.
   void inner( IBlock * block, cudaStream_t stream = 0 );
   void outer( IBlock * block, cudaStream_t stream = 0 );

   void setOuterPriority( int priority ) {
      parallelStreams_.setStreamPriority(priority);
   }

private:
   BlockDataID pdfsID;
   double omega;
   std::set< cuda::GPUField<double> *, field::SwapableCompare< cuda::GPUField<double> * > > cache_pdfs_;
   cuda::ParallelStreams parallelStreams_;
};

} // namespace pystencils
} // namespace walberla
#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
# pragma GCC diagnostic pop
#endif
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file UniformGridGPU_NoSlip.cpp
//! \ingroup lbm
//! \author lbmpy
//======================================================================================================================
#include <cmath>
#include "core/DataTypes.h"
#include "core/Macros.h"
#include "UniformGridGPU_NoSlip.h"
#include "cuda/ErrorChecking.h"
#define FUNC_PREFIX __global__
using namespace std;
namespace walberla {
namespace lbm {
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wconversion"
#endif
#ifdef __CUDACC__
#pragma push
#pragma diag_suppress = declared_but_not_referenced
#endif
namespace internal_boundary_UniformGridGPU_NoSlip {

// One GPU thread per boundary link. Each 16-byte index-vector entry stores
// four int32 values: the cell coordinates x, y, z and the link direction dir.
static FUNC_PREFIX void boundary_UniformGridGPU_NoSlip(uint8_t * const _data_indexVector, double * _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t indexVectorSize)
{
   if (blockDim.x*blockIdx.x + threadIdx.x < indexVectorSize)
   {
      uint8_t * const _data_indexVector_10 = _data_indexVector;
      const int32_t x = *((int32_t *)(& _data_indexVector_10[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
      uint8_t * const _data_indexVector_14 = _data_indexVector + 4;
      const int32_t y = *((int32_t *)(& _data_indexVector_14[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
      uint8_t * const _data_indexVector_18 = _data_indexVector + 8;
      const int32_t z = *((int32_t *)(& _data_indexVector_18[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));

      // D3Q19 direction vectors, inverse directions and lattice weights. The
      // weights are emitted for every boundary kernel and are unused by
      // NoSlip - hence the declared_but_not_referenced suppression above.
      const int64_t cx [] = { 0, 0, 0, -1, 1, 0, 0, -1, 1, -1, 1, 0, 0, -1, 1, 0, 0, -1, 1 };
      const int64_t cy [] = { 0, 1, -1, 0, 0, 0, 0, 1, 1, -1, -1, 1, -1, 0, 0, 1, -1, 0, 0 };
      const int64_t cz [] = { 0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, -1 };
      const int invdir [] = { 0, 2, 1, 4, 3, 6, 5, 10, 9, 8, 7, 16, 15, 18, 17, 12, 11, 14, 13 };
      const double weights [] = { 0.333333333333333,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778 };

      uint8_t * const _data_indexVector_112 = _data_indexVector + 12;
      const int32_t dir = *((int32_t *)(& _data_indexVector_112[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));

      // No-slip bounce-back: the pdf leaving the fluid cell in direction dir
      // is written into the inverse-direction slot of the neighboring cell.
      double * _data_pdfsf9cc34cc4e2b6261 = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_1*cy[dir] + _stride_pdfs_2*z + _stride_pdfs_2*cz[dir] + _stride_pdfs_3*invdir[dir];
      double * _data_pdfs_10_2011ac6bf6446d4afa = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3*dir;
      _data_pdfsf9cc34cc4e2b6261[_stride_pdfs_0*x + _stride_pdfs_0*cx[dir]] = _data_pdfs_10_2011ac6bf6446d4afa[_stride_pdfs_0*x];
   }
}

} // namespace internal_boundary_UniformGridGPU_NoSlip
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
#ifdef __CUDACC__
#pragma pop
#endif
void UniformGridGPU_NoSlip::run( IBlock * block, IndexVectors::Type type, cudaStream_t stream )
{
   auto * indexVectors = block->getData<IndexVectors>(indexVectorID);
   auto pointer = indexVectors->pointerGpu(type);

   int64_t indexVectorSize = int64_c( indexVectors->indexVector(type).size() );
   if( indexVectorSize == 0 )
      return;

   uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer);

   auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID);
   WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()));
   double * _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
   const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
   const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
   const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
   const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride());

   // Launch configuration: 1D blocks of at most 256 threads, grid sized as
   // ceil(indexVectorSize / blockSize) so every boundary link gets a thread.
   dim3 _block(int(((256 < indexVectorSize) ? 256 : indexVectorSize)), int(1), int(1));
   dim3 _grid(int(( (indexVectorSize) % (((256 < indexVectorSize) ? 256 : indexVectorSize)) == 0 ? (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) : ( (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) ) +1 )), int(1), int(1));
   internal_boundary_UniformGridGPU_NoSlip::boundary_UniformGridGPU_NoSlip<<<_grid, _block, 0, stream>>>(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize);
}
void UniformGridGPU_NoSlip::operator() ( IBlock * block, cudaStream_t stream )
{
   run( block, IndexVectors::ALL, stream );
}

void UniformGridGPU_NoSlip::inner( IBlock * block, cudaStream_t stream )
{
   run( block, IndexVectors::INNER, stream );
}

void UniformGridGPU_NoSlip::outer( IBlock * block, cudaStream_t stream )
{
   run( block, IndexVectors::OUTER, stream );
}
} // namespace lbm
} // namespace walberla
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file UniformGridGPU_NoSlip.h
//! \author pystencils
//======================================================================================================================
#pragma once

#include "core/DataTypes.h"
#include "cuda/GPUField.h"
#include "domain_decomposition/BlockDataID.h"
#include "domain_decomposition/IBlock.h"
#include "blockforest/StructuredBlockForest.h"
#include "field/FlagField.h"
#include <set>
#include <vector>
#ifdef __GNUC__
#define RESTRICT __restrict__
#elif _MSC_VER
#define RESTRICT __restrict
#else
#define RESTRICT
#endif
namespace walberla {
namespace lbm {

class UniformGridGPU_NoSlip
{
public:
   // One entry per boundary link: cell coordinates plus lattice direction
   struct IndexInfo {
      int32_t x;
      int32_t y;
      int32_t z;
      int32_t dir;
      IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {}
      bool operator==(const IndexInfo & o) const {
         return x == o.x && y == o.y && z == o.z && dir == o.dir;
      }
   };

   // Per-block storage of the boundary links, kept on the CPU and mirrored
   // on the GPU
   class IndexVectors
   {
   public:
      using CpuIndexVector = std::vector<IndexInfo>;

      enum Type {
         ALL = 0,
         INNER = 1,
         OUTER = 2,
         NUM_TYPES = 3
      };

      IndexVectors() : cpuVectors_(NUM_TYPES) {}
      bool operator==(IndexVectors & other) { return other.cpuVectors_ == cpuVectors_; }

      ~IndexVectors() {
         for( auto & gpuVec: gpuVectors_ )
            cudaFree( gpuVec );
      }

      CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; }
      IndexInfo * pointerCpu(Type t) { return &(cpuVectors_[t][0]); }
      IndexInfo * pointerGpu(Type t) { return gpuVectors_[t]; }

      // Re-upload the CPU index vectors to the GPU after they have changed
      void syncGPU()
      {
         gpuVectors_.resize( cpuVectors_.size() );
         for( size_t i = 0; i < size_t(NUM_TYPES); ++i )
         {
            auto & gpuVec = gpuVectors_[i];
            auto & cpuVec = cpuVectors_[i];
            cudaFree( gpuVec );
            cudaMalloc( &gpuVec, sizeof(IndexInfo) * cpuVec.size() );
            cudaMemcpy( gpuVec, &cpuVec[0], sizeof(IndexInfo) * cpuVec.size(), cudaMemcpyHostToDevice );
         }
      }

   private:
      std::vector<CpuIndexVector> cpuVectors_;

      using GpuIndexVector = IndexInfo *;
      std::vector<GpuIndexVector> gpuVectors_;
   };

   UniformGridGPU_NoSlip( const shared_ptr<StructuredBlockForest> & blocks,
                          BlockDataID pdfsID_ )
      : pdfsID(pdfsID_)
   {
      auto createIdxVector = []( IBlock * const, StructuredBlockStorage * const ) { return new IndexVectors(); };
      indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_UniformGridGPU_NoSlip" );
   }
   void operator() ( IBlock * block, cudaStream_t stream = 0 );
   void inner( IBlock * block, cudaStream_t stream = 0 );
   void outer( IBlock * block, cudaStream_t stream = 0 );

   template<typename FlagField_T>
   void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID,
                           FlagUID boundaryFlagUID, FlagUID domainFlagUID )
   {
      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
         fillFromFlagField<FlagField_T>( &*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID );
   }

   template<typename FlagField_T>
   void fillFromFlagField( IBlock * block, ConstBlockDataID flagFieldID,
                           FlagUID boundaryFlagUID, FlagUID domainFlagUID )
   {
      auto * indexVectors = block->getData< IndexVectors >( indexVectorID );

      auto & indexVectorAll   = indexVectors->indexVector(IndexVectors::ALL);
      auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER);
      auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER);

      auto * flagField = block->getData< FlagField_T >( flagFieldID );
      auto boundaryFlag = flagField->getFlag(boundaryFlagUID);
      auto domainFlag   = flagField->getFlag(domainFlagUID);

      // cells not in this shrunken interior belong to the OUTER index vector
      auto inner = flagField->xyzSize();
      inner.expand( cell_idx_t(-1) );

      indexVectorAll.clear();
      indexVectorInner.clear();
      indexVectorOuter.clear();
      // The generator unrolls one check per D3Q19 direction; directions 5-18
      // follow the same pattern as the five shown here.
      for( auto it = flagField->begin(); it != flagField->end(); ++it )
      {
         if( !isFlagSet(it, domainFlag) )
            continue;

         if( isFlagSet( it.neighbor(0, 0, 0, 0), boundaryFlag ) )
         {
            auto element = IndexInfo(it.x(), it.y(), it.z(), 0 );
            indexVectorAll.push_back( element );
            if( inner.contains( it.x(), it.y(), it.z() ) )
               indexVectorInner.push_back( element );
            else
               indexVectorOuter.push_back( element );
         }
         if( isFlagSet( it.neighbor(0, 1, 0, 0), boundaryFlag ) )
         {
            auto element = IndexInfo(it.x(), it.y(), it.z(), 1 );
            indexVectorAll.push_back( element );
            if( inner.contains( it.x(), it.y(), it.z() ) )
               indexVectorInner.push_back( element );
            else
               indexVectorOuter.push_back( element );
         }
         if( isFlagSet( it.neighbor(0, -1, 0, 0), boundaryFlag ) )
         {
            auto element = IndexInfo(it.x(), it.y(), it.z(), 2 );
            indexVectorAll.push_back( element );
            if( inner.contains( it.x(), it.y(), it.z() ) )
               indexVectorInner.push_back( element );
            else
               indexVectorOuter.push_back( element );
         }
         if( isFlagSet( it.neighbor(-1, 0, 0, 0), boundaryFlag ) )
         {
            auto element = IndexInfo(it.x(), it.y(), it.z(), 3 );
            indexVectorAll.push_back( element );
            if( inner.contains( it.x(), it.y(), it.z() ) )
               indexVectorInner.push_back( element );
            else
               indexVectorOuter.push_back( element );
         }
         if( isFlagSet( it.neighbor(1, 0, 0, 0), boundaryFlag ) )
         {
            auto element = IndexInfo(it.x(), it.y(), it.z(), 4 );
            indexVectorAll.push_back( element );
            if( inner.contains( it.x(), it.y(), it.z() ) )
               indexVectorInner.push_back( element );
            else
               indexVectorOuter.push_back( element );
         }