Commit ddc0a76e authored by Martin Bauer

Code generation: new API

- information from CMake to codegen: double/float, OpenMP, ...
parent 83a55d9a
Pipeline #13232 passed with stages in 4 hours and 13 seconds
Showing with 103 additions and 4783 deletions
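The core of the change: code generation scripts no longer build kernels through pystencils_walberla.sweep.Sweep and register their outputs via cmake_integration.codegen. Instead they open a CodeGeneration context manager, which receives the build configuration (double/float precision, OpenMP, MPI, ...) from CMake and writes all registered files when the context exits. In outline (a minimal sketch with placeholder field and class names; the real scripts follow below):

    import pystencils as ps
    from pystencils_walberla import CodeGeneration, generate_sweep

    with CodeGeneration() as ctx:                # ctx carries the CMake-provided flags
        src, dst = ps.fields("src, dst: [3D]")   # placeholder fields
        copy_rule = [ps.Assignment(dst.center, src.center)]
        generate_sweep(ctx, 'ExampleKernel', copy_rule)  # emits ExampleKernel.cpp / ExampleKernel.h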
UniformGridGPU CMakeLists.txt:

 waLBerla_link_files_to_builddir( "*.prm" )

+waLBerla_python_file_generates(UniformGridGPU.py
+        UniformGridGPU_LatticeModel.cpp UniformGridGPU_LatticeModel.h
+        UniformGridGPU_LbKernel.cu UniformGridGPU_LbKernel.h
+        UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h
+        UniformGridGPU_UBB.cu UniformGridGPU_UBB.h
+        UniformGridGPU_PackInfo.cu UniformGridGPU_PackInfo.h
+        )

 waLBerla_add_executable ( NAME UniformGridBenchmarkGPU
-                          FILES UniformGridGPU.cpp UniformGridGPU_LatticeModel.cpp
-                                UniformGridGPU_LbKernel.cu UniformGridGPU_NoSlip.cu UniformGridGPU_UBB.cu
-                                UniformGridGPU_PackInfo.cu
+                          FILES UniformGridGPU.cpp UniformGridGPU.py
                           DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk )
UniformGridGPU.py - before (old Sweep / codegen.register API):

import sympy as sp
from lbmpy_walberla import generate_lattice_model_files
from lbmpy.creationfunctions import create_lb_update_rule
from pystencils_walberla.sweep import Sweep
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.creationfunctions import create_lb_method
from lbmpy_walberla.boundary import create_boundary_class
from pystencils_walberla.cmake_integration import codegen

dtype = 'float64'

# LB options
options = {
    'method': 'srt',
    'stencil': 'D3Q19',
    'relaxation_rate': sp.Symbol("omega"),
    'field_name': 'pdfs',
    'compressible': False,
    'temporary_field_name': 'pdfs_tmp',
    'optimization': {'cse_global': True,
                     'cse_pdfs': True,
                     'double_precision': dtype == 'float64'}
}

# GPU optimization options
inner_opt = {'gpu_indexing_params': {'block_size': (128, 1, 1)}, 'data_type': dtype}
outer_opt = {'gpu_indexing_params': {'block_size': (32, 32, 32)}, 'data_type': dtype}


def lb_assignments():
    ur = create_lb_update_rule(**options)
    return ur.all_assignments


def genBoundary():
    boundary = UBB([0.05, 0, 0], dim=3, name="UniformGridGPU_UBB")
    return create_boundary_class(boundary, create_lb_method(**options), target='gpu')


def genNoSlip():
    boundary = NoSlip(name='UniformGridGPU_NoSlip')
    return create_boundary_class(boundary, create_lb_method(**options), target='gpu')


generate_lattice_model_files(class_name='UniformGridGPU_LatticeModel', **options)

Sweep.generate_inner_outer_kernel('UniformGridGPU_LbKernel',
                                  lambda: create_lb_update_rule(**options).all_assignments,
                                  target='gpu',
                                  temporary_fields=['pdfs_tmp'],
                                  field_swaps=[('pdfs', 'pdfs_tmp')],
                                  optimization=inner_opt,
                                  outer_optimization=outer_opt)

Sweep.generate_pack_info('UniformGridGPU_PackInfo', lb_assignments, target='gpu')

codegen.register(['UniformGridGPU_UBB.h', 'UniformGridGPU_UBB.cu'], genBoundary)
codegen.register(['UniformGridGPU_NoSlip.h', 'UniformGridGPU_NoSlip.cu'], genNoSlip)
UniformGridGPU.py - after (new CodeGeneration API):

import sympy as sp
from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule
from lbmpy.boundaries import NoSlip, UBB
from pystencils_walberla import generate_pack_info_from_kernel
from lbmpy_walberla import generate_lattice_model, generate_boundary
from pystencils_walberla import CodeGeneration, generate_sweep

with CodeGeneration() as ctx:
    # LB options
    options = {
        'method': 'srt',
        'stencil': 'D3Q19',
        'relaxation_rate': sp.Symbol("omega"),
        'field_name': 'pdfs',
        'compressible': False,
        'temporary_field_name': 'pdfs_tmp',
        'optimization': {'cse_global': True,
                         'cse_pdfs': True,
                         'gpu_indexing_params': {'block_size': (128, 1, 1)}}
    }
    lb_method = create_lb_method(**options)
    update_rule = create_lb_update_rule(lb_method=lb_method, **options)

    # CPU lattice model - required for macroscopic value computation, VTK output etc.
    generate_lattice_model(ctx, 'UniformGridGPU_LatticeModel', lb_method)

    # GPU LB sweep & boundaries
    generate_sweep(ctx, 'UniformGridGPU_LbKernel', update_rule, field_swaps=[('pdfs', 'pdfs_tmp')],
                   inner_outer_split=True, target='gpu')
    generate_boundary(ctx, 'UniformGridGPU_NoSlip', NoSlip(), lb_method, target='gpu')
    generate_boundary(ctx, 'UniformGridGPU_UBB', UBB([0.05, 0, 0]), lb_method, target='gpu')

    # communication
    generate_pack_info_from_kernel(ctx, 'UniformGridGPU_PackInfo', update_rule, target='gpu')
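Because the context knows the CMake configuration, a script can adapt the generated code to the build instead of hard-coding it (the old script pinned dtype = 'float64'). A hedged sketch; it assumes the context exposes WALBERLA_DOUBLE_ACCURACY as an attribute, and the attribute name is illustrative:

    with CodeGeneration() as ctx:
        # assumption: ctx.double_accuracy mirrors the CMake option WALBERLA_DOUBLE_ACCURACY
        dtype = 'float64' if ctx.double_accuracy else 'float32'
        options['optimization']['double_precision'] = (dtype == 'float64')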
UniformGridGPU.prm:

Parameters
{
    omega                        1.8;
    timesteps                    2;

    remainingTimeLoggerFrequency 3;
    vtkWriteFrequency            0;

    overlapCommunication         false;
    cudaEnabledMPI               false;
}

DomainSetup
{
    blocks        < 1, 1, 1 >;
    cellsPerBlock < 50, 20, 10 >;
    periodic      < 0, 0, 1 >;
}

Boundaries
{
    Border { direction W; walldistance -1; flag NoSlip; }
    Border { direction E; walldistance -1; flag NoSlip; }
    Border { direction S; walldistance -1; flag NoSlip; }
    Border { direction N; walldistance -1; flag UBB; }
}
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file UniformGridGPU_LbKernel.h
//! \author pystencils
//======================================================================================================================
#include "core/DataTypes.h"
#include "cuda/GPUField.h"
#include "cuda/ParallelStreams.h"
#include "field/SwapableCompare.h"
#include "domain_decomposition/BlockDataID.h"
#include "domain_decomposition/IBlock.h"
#include <set>
#ifdef __GNUC__
#define RESTRICT __restrict__
#elif _MSC_VER
#define RESTRICT __restrict
#else
#define RESTRICT
#endif
#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wunused-parameter"
#endif
namespace walberla {
namespace pystencils {
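// Generated GPU LB sweep. operator() processes a whole block; inner() and outer()
// split the block so the interior can be computed while ghost-layer communication
// for the outer shell is still in flight. setOuterPriority() raises the CUDA stream
// priority used for the outer part.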
class UniformGridGPU_LbKernel
{
public:
    UniformGridGPU_LbKernel( BlockDataID pdfsID_, double omega_)
        : pdfsID(pdfsID_), omega(omega_)
    {};

    ~UniformGridGPU_LbKernel() {
        for(auto p: cache_pdfs_) {
            delete p;
        }
    }

    void operator() ( IBlock * block , cudaStream_t stream = 0 );
    void inner( IBlock * block , cudaStream_t stream = 0 );
    void outer( IBlock * block , cudaStream_t stream = 0 );

    void setOuterPriority(int priority ) {
        parallelStreams_.setStreamPriority(priority);
    }

private:
    BlockDataID pdfsID;
    double omega;
    std::set< cuda::GPUField<double> *, field::SwapableCompare< cuda::GPUField<double> * > > cache_pdfs_;
    cuda::ParallelStreams parallelStreams_;
};
} // namespace pystencils
} // namespace walberla
#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
# pragma GCC diagnostic pop
#endif
\ No newline at end of file
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file UniformGridGPU_NoSlip.cpp
//! \ingroup lbm
//! \author lbmpy
//======================================================================================================================
#include <cmath>
#include "core/DataTypes.h"
#include "core/Macros.h"
#include "UniformGridGPU_NoSlip.h"
#include "cuda/ErrorChecking.h"
#define FUNC_PREFIX __global__
using namespace std;
namespace walberla {
namespace lbm {
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wconversion"
#endif
#ifdef __CUDACC__
#pragma push
#pragma diag_suppress = declared_but_not_referenced
#endif
namespace internal_boundary_UniformGridGPU_NoSlip {
static FUNC_PREFIX void boundary_UniformGridGPU_NoSlip(uint8_t * const _data_indexVector, double * _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t indexVectorSize)
{
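   // One thread per boundary link. Each index-vector entry is 16 bytes: four int32
   // values (x, y, z, dir) - hence the 16-byte strides in the reads below.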
if (blockDim.x*blockIdx.x + threadIdx.x < indexVectorSize)
{
uint8_t * const _data_indexVector_10 = _data_indexVector;
const int32_t x = *((int32_t *)(& _data_indexVector_10[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
uint8_t * const _data_indexVector_14 = _data_indexVector + 4;
const int32_t y = *((int32_t *)(& _data_indexVector_14[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
uint8_t * const _data_indexVector_18 = _data_indexVector + 8;
const int32_t z = *((int32_t *)(& _data_indexVector_18[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
const int64_t cx [] = { 0, 0, 0, -1, 1, 0, 0, -1, 1, -1, 1, 0, 0, -1, 1, 0, 0, -1, 1 };
const int64_t cy [] = { 0, 1, -1, 0, 0, 0, 0, 1, 1, -1, -1, 1, -1, 0, 0, 1, -1, 0, 0 };
const int64_t cz [] = { 0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, -1 };
const int invdir [] = { 0, 2, 1, 4, 3, 6, 5, 10, 9, 8, 7, 16, 15, 18, 17, 12, 11, 14, 13 };
const double weights [] = { 0.333333333333333,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778 };
uint8_t * const _data_indexVector_112 = _data_indexVector + 12;
const int32_t dir = *((int32_t *)(& _data_indexVector_112[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
double * _data_pdfsf9cc34cc4e2b6261 = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_1*cy[dir] + _stride_pdfs_2*z + _stride_pdfs_2*cz[dir] + _stride_pdfs_3*invdir[dir];
double * _data_pdfs_10_2011ac6bf6446d4afa = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3*dir;
_data_pdfsf9cc34cc4e2b6261[_stride_pdfs_0*x + _stride_pdfs_0*cx[dir]] = _data_pdfs_10_2011ac6bf6446d4afa[_stride_pdfs_0*x];
}
}
}
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
#ifdef __CUDACC__
#pragma pop
#endif
void UniformGridGPU_NoSlip::run( IBlock * block, IndexVectors::Type type , cudaStream_t stream )
{
auto * indexVectors = block->getData<IndexVectors>(indexVectorID);
auto pointer = indexVectors->pointerGpu(type);
int64_t indexVectorSize = int64_c( indexVectors->indexVector(type).size() );
if( indexVectorSize == 0)
return;
uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer);
auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID);
WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()));
double * _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride());
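   // Launch configuration: at most 256 threads per block, grid size rounded up so
   // that every index-vector entry is covered by one thread.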
dim3 _block(int(((256 < indexVectorSize) ? 256 : indexVectorSize)), int(1), int(1));
dim3 _grid(int(( (indexVectorSize) % (((256 < indexVectorSize) ? 256 : indexVectorSize)) == 0 ? (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) : ( (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) ) +1 )), int(1), int(1));
internal_boundary_UniformGridGPU_NoSlip::boundary_UniformGridGPU_NoSlip<<<_grid, _block, 0, stream>>>(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize);
}
void UniformGridGPU_NoSlip::operator() ( IBlock * block, cudaStream_t stream )
{
run( block, IndexVectors::ALL, stream );
}
void UniformGridGPU_NoSlip::inner( IBlock * block, cudaStream_t stream )
{
run( block, IndexVectors::INNER, stream );
}
void UniformGridGPU_NoSlip::outer( IBlock * block, cudaStream_t stream )
{
run( block, IndexVectors::OUTER, stream );
}
} // namespace lbm
} // namespace walberla
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file UniformGridGPU_NoSlip.h
//! \author pystencils
//======================================================================================================================
#include "core/DataTypes.h"
#include "cuda/GPUField.h"
#include "domain_decomposition/BlockDataID.h"
#include "domain_decomposition/IBlock.h"
#include "blockforest/StructuredBlockForest.h"
#include "field/FlagField.h"
#include <set>
#include <vector>
#ifdef __GNUC__
#define RESTRICT __restrict__
#elif _MSC_VER
#define RESTRICT __restrict
#else
#define RESTRICT
#endif
namespace walberla {
namespace lbm {
class UniformGridGPU_NoSlip
{
public:
struct IndexInfo {
int32_t x;
int32_t y;
int32_t z;
int32_t dir;
IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {}
bool operator==(const IndexInfo & o) const {
return x == o.x && y == o.y && z == o.z && dir == o.dir;
}
};
class IndexVectors
{
public:
using CpuIndexVector = std::vector<IndexInfo>;
enum Type {
ALL = 0,
INNER = 1,
OUTER = 2,
NUM_TYPES = 3
};
IndexVectors() : cpuVectors_(NUM_TYPES) {}
bool operator==(IndexVectors & other) { return other.cpuVectors_ == cpuVectors_; }
~IndexVectors() {
for( auto & gpuVec: gpuVectors_)
cudaFree( gpuVec );
}
CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; }
IndexInfo * pointerCpu(Type t) { return &(cpuVectors_[t][0]); }
IndexInfo * pointerGpu(Type t) { return gpuVectors_[t]; }
void syncGPU()
{
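      // Rebuild the device-side copies: free each old buffer, then allocate a new
      // one and upload the current CPU index vector.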
gpuVectors_.resize( cpuVectors_.size() );
for(size_t i=0; i < size_t(NUM_TYPES); ++i )
{
auto & gpuVec = gpuVectors_[i];
auto & cpuVec = cpuVectors_[i];
cudaFree( gpuVec );
cudaMalloc( &gpuVec, sizeof(IndexInfo) * cpuVec.size() );
cudaMemcpy( gpuVec, &cpuVec[0], sizeof(IndexInfo) * cpuVec.size(), cudaMemcpyHostToDevice );
}
}
private:
std::vector<CpuIndexVector> cpuVectors_;
using GpuIndexVector = IndexInfo *;
std::vector<GpuIndexVector> gpuVectors_;
};
UniformGridGPU_NoSlip( const shared_ptr<StructuredBlockForest> & blocks,
BlockDataID pdfsID_ )
: pdfsID(pdfsID_)
{
auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); };
indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_UniformGridGPU_NoSlip");
};
void operator() ( IBlock * block , cudaStream_t stream = 0 );
void inner( IBlock * block , cudaStream_t stream = 0 );
void outer( IBlock * block , cudaStream_t stream = 0 );
template<typename FlagField_T>
void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID,
FlagUID boundaryFlagUID, FlagUID domainFlagUID)
{
for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
fillFromFlagField<FlagField_T>( &*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID );
}
template<typename FlagField_T>
void fillFromFlagField( IBlock * block, ConstBlockDataID flagFieldID,
FlagUID boundaryFlagUID, FlagUID domainFlagUID )
{
auto * indexVectors = block->getData< IndexVectors > ( indexVectorID );
auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL);
auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER);
auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER);
auto * flagField = block->getData< FlagField_T > ( flagFieldID );
auto boundaryFlag = flagField->getFlag(boundaryFlagUID);
auto domainFlag = flagField->getFlag(domainFlagUID);
auto inner = flagField->xyzSize();
inner.expand( cell_idx_t(-1) );
indexVectorAll.clear();
indexVectorInner.clear();
indexVectorOuter.clear();
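      // Visit every domain cell and test all 19 D3Q19 neighbor directions; each boundary
      // link is recorded once and sorted into INNER or OUTER depending on whether the
      // cell lies in the block interior shrunk by one layer.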
for( auto it = flagField->begin(); it != flagField->end(); ++it )
{
if( ! isFlagSet(it, domainFlag) )
continue;
if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 0 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 1 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 2 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 3 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 4 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 5 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 6 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 7 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 8 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 9 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 10 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 11 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 12 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 13 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 14 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 15 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 16 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 17 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 18 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
}
indexVectors->syncGPU();
}
private:
void run( IBlock * block, IndexVectors::Type type, cudaStream_t stream = 0 );
BlockDataID indexVectorID;
BlockDataID pdfsID;
};
} // namespace lbm
} // namespace walberla
\ No newline at end of file
#include "stencil/Directions.h"
#include "core/cell/CellInterval.h"
#include "cuda/GPUField.h"
#include "core/DataTypes.h"
#include "domain_decomposition/IBlock.h"
#include "cuda/communication/GeneratedGPUPackInfo.h"
#define FUNC_PREFIX __global__
namespace walberla {
namespace pystencils {
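// Generated GPU pack info for ghost-layer communication: pack()/unpack() copy the
// required pdf components for a given direction directly on the device, and size()
// returns the byte count needed for that direction's buffer.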
class UniformGridGPU_PackInfo : public ::walberla::cuda::GeneratedGPUPackInfo
{
public:
    UniformGridGPU_PackInfo( BlockDataID pdfsID_ )
        : pdfsID(pdfsID_)
    {};
    virtual ~UniformGridGPU_PackInfo() {}

    virtual void pack  (stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream);
    virtual void unpack(stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream);
    virtual uint_t size(stencil::Direction dir, IBlock * block);

private:
    BlockDataID pdfsID;
};
} // namespace pystencils
} // namespace walberla
\ No newline at end of file
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file UniformGridGPU_UBB.cpp
//! \ingroup lbm
//! \author lbmpy
//======================================================================================================================
#include <cmath>
#include "core/DataTypes.h"
#include "core/Macros.h"
#include "UniformGridGPU_UBB.h"
#include "cuda/ErrorChecking.h"
#define FUNC_PREFIX __global__
using namespace std;
namespace walberla {
namespace lbm {
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wconversion"
#endif
#ifdef __CUDACC__
#pragma push
#pragma diag_suppress = declared_but_not_referenced
#endif
namespace internal_boundary_UniformGridGPU_UBB {
static FUNC_PREFIX void boundary_UniformGridGPU_UBB(uint8_t * const _data_indexVector, double * _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t indexVectorSize)
{
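   // Same indexing scheme as the NoSlip kernel, plus a momentum term for the moving
   // wall: -6 * w[dir] * (c[dir] . u) with u = (0.05, 0, 0), which the generator
   // folded into the constant -0.3 * cx[dir] * weights[dir] below.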
if (blockDim.x*blockIdx.x + threadIdx.x < indexVectorSize)
{
uint8_t * const _data_indexVector_10 = _data_indexVector;
const int32_t x = *((int32_t *)(& _data_indexVector_10[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
uint8_t * const _data_indexVector_14 = _data_indexVector + 4;
const int32_t y = *((int32_t *)(& _data_indexVector_14[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
uint8_t * const _data_indexVector_18 = _data_indexVector + 8;
const int32_t z = *((int32_t *)(& _data_indexVector_18[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
const int64_t cx [] = { 0, 0, 0, -1, 1, 0, 0, -1, 1, -1, 1, 0, 0, -1, 1, 0, 0, -1, 1 };
const int64_t cy [] = { 0, 1, -1, 0, 0, 0, 0, 1, 1, -1, -1, 1, -1, 0, 0, 1, -1, 0, 0 };
const int64_t cz [] = { 0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, -1 };
const int invdir [] = { 0, 2, 1, 4, 3, 6, 5, 10, 9, 8, 7, 16, 15, 18, 17, 12, 11, 14, 13 };
const double weights [] = { 0.333333333333333,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778 };
uint8_t * const _data_indexVector_112 = _data_indexVector + 12;
const int32_t dir = *((int32_t *)(& _data_indexVector_112[16*blockDim.x*blockIdx.x + 16*threadIdx.x]));
double * _data_pdfsf9cc34cc4e2b6261 = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_1*cy[dir] + _stride_pdfs_2*z + _stride_pdfs_2*cz[dir] + _stride_pdfs_3*invdir[dir];
double * _data_pdfs_10_2011ac6bf6446d4afa = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3*dir;
_data_pdfsf9cc34cc4e2b6261[_stride_pdfs_0*x + _stride_pdfs_0*cx[dir]] = -0.30000000000000004*cx[dir]*weights[dir] + _data_pdfs_10_2011ac6bf6446d4afa[_stride_pdfs_0*x];
}
}
}
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
#ifdef __CUDACC__
#pragma pop
#endif
void UniformGridGPU_UBB::run( IBlock * block, IndexVectors::Type type , cudaStream_t stream )
{
auto * indexVectors = block->getData<IndexVectors>(indexVectorID);
auto pointer = indexVectors->pointerGpu(type);
int64_t indexVectorSize = int64_c( indexVectors->indexVector(type).size() );
if( indexVectorSize == 0)
return;
uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer);
auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID);
WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()));
double * _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride());
dim3 _block(int(((256 < indexVectorSize) ? 256 : indexVectorSize)), int(1), int(1));
dim3 _grid(int(( (indexVectorSize) % (((256 < indexVectorSize) ? 256 : indexVectorSize)) == 0 ? (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) : ( (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) ) +1 )), int(1), int(1));
internal_boundary_UniformGridGPU_UBB::boundary_UniformGridGPU_UBB<<<_grid, _block, 0, stream>>>(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize);
}
void UniformGridGPU_UBB::operator() ( IBlock * block, cudaStream_t stream )
{
run( block, IndexVectors::ALL, stream );
}
void UniformGridGPU_UBB::inner( IBlock * block, cudaStream_t stream )
{
run( block, IndexVectors::INNER, stream );
}
void UniformGridGPU_UBB::outer( IBlock * block, cudaStream_t stream )
{
run( block, IndexVectors::OUTER, stream );
}
} // namespace lbm
} // namespace walberla
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file UniformGridGPU_UBB.h
//! \author pystencils
//======================================================================================================================
#include "core/DataTypes.h"
#include "cuda/GPUField.h"
#include "domain_decomposition/BlockDataID.h"
#include "domain_decomposition/IBlock.h"
#include "blockforest/StructuredBlockForest.h"
#include "field/FlagField.h"
#include <set>
#include <vector>
#ifdef __GNUC__
#define RESTRICT __restrict__
#elif _MSC_VER
#define RESTRICT __restrict
#else
#define RESTRICT
#endif
namespace walberla {
namespace lbm {
class UniformGridGPU_UBB
{
public:
struct IndexInfo {
int32_t x;
int32_t y;
int32_t z;
int32_t dir;
IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {}
bool operator==(const IndexInfo & o) const {
return x == o.x && y == o.y && z == o.z && dir == o.dir;
}
};
class IndexVectors
{
public:
using CpuIndexVector = std::vector<IndexInfo>;
enum Type {
ALL = 0,
INNER = 1,
OUTER = 2,
NUM_TYPES = 3
};
IndexVectors() : cpuVectors_(NUM_TYPES) {}
bool operator==(IndexVectors & other) { return other.cpuVectors_ == cpuVectors_; }
~IndexVectors() {
for( auto & gpuVec: gpuVectors_)
cudaFree( gpuVec );
}
CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; }
IndexInfo * pointerCpu(Type t) { return &(cpuVectors_[t][0]); }
IndexInfo * pointerGpu(Type t) { return gpuVectors_[t]; }
void syncGPU()
{
gpuVectors_.resize( cpuVectors_.size() );
for(size_t i=0; i < size_t(NUM_TYPES); ++i )
{
auto & gpuVec = gpuVectors_[i];
auto & cpuVec = cpuVectors_[i];
cudaFree( gpuVec );
cudaMalloc( &gpuVec, sizeof(IndexInfo) * cpuVec.size() );
cudaMemcpy( gpuVec, &cpuVec[0], sizeof(IndexInfo) * cpuVec.size(), cudaMemcpyHostToDevice );
}
}
private:
std::vector<CpuIndexVector> cpuVectors_;
using GpuIndexVector = IndexInfo *;
std::vector<GpuIndexVector> gpuVectors_;
};
UniformGridGPU_UBB( const shared_ptr<StructuredBlockForest> & blocks,
BlockDataID pdfsID_ )
: pdfsID(pdfsID_)
{
auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); };
indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_UniformGridGPU_UBB");
};
void operator() ( IBlock * block , cudaStream_t stream = 0 );
void inner( IBlock * block , cudaStream_t stream = 0 );
void outer( IBlock * block , cudaStream_t stream = 0 );
template<typename FlagField_T>
void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID,
FlagUID boundaryFlagUID, FlagUID domainFlagUID)
{
for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
fillFromFlagField<FlagField_T>( &*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID );
}
template<typename FlagField_T>
void fillFromFlagField( IBlock * block, ConstBlockDataID flagFieldID,
FlagUID boundaryFlagUID, FlagUID domainFlagUID )
{
auto * indexVectors = block->getData< IndexVectors > ( indexVectorID );
auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL);
auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER);
auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER);
auto * flagField = block->getData< FlagField_T > ( flagFieldID );
auto boundaryFlag = flagField->getFlag(boundaryFlagUID);
auto domainFlag = flagField->getFlag(domainFlagUID);
auto inner = flagField->xyzSize();
inner.expand( cell_idx_t(-1) );
indexVectorAll.clear();
indexVectorInner.clear();
indexVectorOuter.clear();
for( auto it = flagField->begin(); it != flagField->end(); ++it )
{
if( ! isFlagSet(it, domainFlag) )
continue;
if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 0 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 1 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 2 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 3 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 4 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 5 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 6 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 7 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 8 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 9 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 10 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 11 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 12 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 13 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 14 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 15 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 16 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 17 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) )
{
auto element = IndexInfo(it.x(), it.y(), it.z(), 18 );
indexVectorAll.push_back( element );
if( inner.contains( it.x(), it.y(), it.z() ) )
indexVectorInner.push_back( element );
else
indexVectorOuter.push_back( element );
}
}
indexVectors->syncGPU();
}
private:
void run( IBlock * block, IndexVectors::Type type, cudaStream_t stream = 0 );
BlockDataID indexVectorID;
BlockDataID pdfsID;
};
} // namespace lbm
} // namespace walberla
\ No newline at end of file
@@ -245,6 +245,23 @@ endfunction ( waLBerla_add_executable )
#######################################################################################################################
#
# Function to tell CMake which C/C++/CUDA files are generated by a python file
#
# Example:
#    waLBerla_python_file_generates(MyPythonCodeGenScript.py Sweep1.cpp Sweep1.h Sweep2.h Sweep2.cu)
#
#######################################################################################################################
function( waLBerla_python_file_generates pythonFile )
    get_filename_component(pythonFileAbsolutePath ${pythonFile} ABSOLUTE)
    set( "WALBERLA_CODEGEN_INFO_${pythonFileAbsolutePath}" ${ARGN}
         CACHE INTERNAL "Files generated by python script ${pythonFile}" FORCE)
endfunction(waLBerla_python_file_generates)

#######################################################################################################################
#
# Adds a waLBerla module test executable.
...
@@ -40,11 +40,10 @@ function( handle_python_codegen sourceFilesOut generatedSourceFilesOut generator
     if( ${sourceFile} MATCHES ".*\\.py$" )
         set(codeGenRequired YES)
         if( WALBERLA_BUILD_WITH_CODEGEN)
-            execute_process(COMMAND ${PYTHON_EXECUTABLE} ${sourceFile} -l
-                            OUTPUT_VARIABLE generatedSourceFiles)
-            string(REGEX REPLACE "\n$" "" generatedSourceFiles "${generatedSourceFiles}")
+            get_filename_component(pythonFileAbsolutePath ${sourceFile} ABSOLUTE )
+            set( generatedSourceFiles ${WALBERLA_CODEGEN_INFO_${pythonFileAbsolutePath}} )
-            set(generatedWithAbsolutePath )
+            set( generatedWithAbsolutePath )
             foreach( filename ${generatedSourceFiles} )
                 list(APPEND generatedWithAbsolutePath ${CMAKE_CURRENT_BINARY_DIR}/${filename})
             endforeach()
@@ -52,9 +51,19 @@ function( handle_python_codegen sourceFilesOut generatedSourceFilesOut generator
             list(APPEND generatedResult ${generatedWithAbsolutePath} )
             list(APPEND generatorsResult ${sourceFile} )
             string (REPLACE ";" "\", \"" jsonFileList "${generatedWithAbsolutePath}" )
+            set(pythonParameters
+                "{\"EXPECTED_FILES\": [\"${jsonFileList}\"], \"CMAKE_VARS\" : { "
+                "\"WALBERLA_OPTIMIZE_FOR_LOCALHOST\": \"${WALBERLA_OPTIMIZE_FOR_LOCALHOST}\","
+                "\"WALBERLA_DOUBLE_ACCURACY\": \"${WALBERLA_DOUBLE_ACCURACY}\","
+                "\"WALBERLA_BUILD_WITH_MPI\": \"${WALBERLA_BUILD_WITH_MPI}\","
+                "\"WALBERLA_BUILD_WITH_OPENMP\": \"${WALBERLA_BUILD_WITH_OPENMP}\" } }"
+            )
+            string(REPLACE "\"" "\\\"" pythonParameters ${pythonParameters}) # even one more quoting level required
+            string(REPLACE "\n" "" pythonParameters ${pythonParameters})     # remove newline characters
             add_custom_command(OUTPUT ${generatedWithAbsolutePath}
                                DEPENDS ${sourceFile}
-                               COMMAND ${PYTHON_EXECUTABLE} ${sourceFile} -g
+                               COMMAND ${PYTHON_EXECUTABLE} ${sourceFile} ${pythonParameters}
                                WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
             include_directories(${CMAKE_CURRENT_BINARY_DIR})
         endif()
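On the Python side, the generation script receives this JSON document as its command-line argument. A minimal sketch of the consuming end (illustrative only; in the new API the CodeGeneration context parses these parameters internally):

    import json
    import sys

    # sys.argv[1] is the JSON string assembled by handle_python_codegen above
    params = json.loads(sys.argv[1]) if len(sys.argv) > 1 else {}

    expected_files = params.get('EXPECTED_FILES', [])  # files CMake expects the script to write
    cmake_vars = params.get('CMAKE_VARS', {})
    use_double = cmake_vars.get('WALBERLA_DOUBLE_ACCURACY', 'ON') == 'ON'
    use_openmp = cmake_vars.get('WALBERLA_BUILD_WITH_OPENMP', 'OFF') == 'ON'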
@@ -19,6 +19,10 @@ waLBerla_execute_test( NAME SimpleKernelTest )
 waLBerla_compile_test( FILES FieldIndexing3DTest.cpp FieldIndexing3DTest.cu )
 waLBerla_execute_test( NAME FieldIndexing3DTest )

+waLBerla_python_file_generates(codegen/CudaJacobiKernel.py
+                               CudaJacobiKernel2D.cu CudaJacobiKernel2D.h
+                               CudaJacobiKernel3D.cu CudaJacobiKernel3D.h)
+
 waLBerla_compile_test( FILES codegen/CodegenJacobiGPU.cpp
                        codegen/CudaJacobiKernel.py
                        DEPENDS blockforest timeloop gui )
@@ -34,8 +38,8 @@ waLBerla_compile_test( FILES CudaMPI DEPENDS blockforest timeloop gui )
 waLBerla_compile_test( FILES AlignmentTest.cpp DEPENDS blockforest timeloop )

-waLBerla_compile_test( FILES codegen/MicroBenchmarkGpuLbm.cpp codegen/MicroBenchmarkGpuLbm.py)
-waLBerla_add_executable ( NAME CpuGpuGeneratedEquivalenceTest
-                          FILES codegen/EquivalenceTest.cpp codegen/EquivalenceTest.gen.py
-                          DEPENDS blockforest boundary core cuda field stencil timeloop vtk gui )
+waLBerla_python_file_generates(codegen/MicroBenchmarkGpuLbm.py
+                               MicroBenchmarkStreamKernel.cu MicroBenchmarkStreamKernel.h
+                               MicroBenchmarkCopyKernel.cu MicroBenchmarkCopyKernel.h)
+waLBerla_compile_test( FILES codegen/MicroBenchmarkGpuLbm.cpp codegen/MicroBenchmarkGpuLbm.py)
codegen/CudaJacobiKernel.py - before (old Sweep API):

from pystencils_walberla.sweep import Sweep


def jacobi2D(sweep):
    src = sweep.field("f1")
    dst = sweep.temporary_field(src)
    dst[0, 0] @= (src[1, 0] + src[-1, 0] + src[0, 1] + src[0, -1]) / (4 * sweep.constant("h") ** 2)


def jacobi3D(sweep):
    src = sweep.field("f1")
    dst = sweep.temporary_field(src)
    dst[0, 0, 0] @= (src[1, 0, 0] + src[-1, 0, 0] + src[0, 1, 0] + src[0, -1, 0] + src[0, 0, 1] + src[0, 0, -1]) / \
                    (6 * sweep.constant("h") ** 2)


Sweep.generate('CudaJacobiKernel2D', jacobi2D, dim=2, target='gpu')
Sweep.generate('CudaJacobiKernel3D', jacobi3D, dim=3, target='gpu')

codegen/CudaJacobiKernel.py - after (new CodeGeneration API):

import sympy as sp
import pystencils as ps
from pystencils_walberla import CodeGeneration, generate_sweep

with CodeGeneration() as ctx:
    h = sp.symbols("h")

    # ----- Jacobi 2D - created by specifying weights in nested list --------------------------
    src, dst = ps.fields("src, src_tmp: [2D]")
    stencil = [[0, -1, 0],
               [-1, 4, -1],
               [0, -1, 0]]
    assignments = ps.assignment_from_stencil(stencil, src, dst, normalization_factor=4 * h**2)
    generate_sweep(ctx, 'CudaJacobiKernel2D', assignments, field_swaps=[(src, dst)], target="gpu")

    # ----- Jacobi 3D - created by using kernel_decorator with assignments in '@=' format -----
    src, dst = ps.fields("src, src_tmp: [3D]")

    @ps.kernel
    def kernel_func():
        dst[0, 0, 0] @= (src[1, 0, 0] + src[-1, 0, 0] +
                         src[0, 1, 0] + src[0, -1, 0] +
                         src[0, 0, 1] + src[0, 0, -1]) / (6 * h ** 2)

    generate_sweep(ctx, 'CudaJacobiKernel3D', kernel_func, field_swaps=[(src, dst)], target="gpu")
#include "core/Environment.h"
#include "python_coupling/CreateConfig.h"
#include "blockforest/Initialization.h"
#include "lbm/field/PdfField.h"
#include "lbm/field/AddToStorage.h"
#include "field/FlagField.h"
#include "field/AddToStorage.h"
#include "lbm/communication/PdfFieldPackInfo.h"
#include "lbm/PerformanceLogger.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "timeloop/all.h"
#include "core/math/Random.h"
#include "geometry/all.h"
#include "cuda/HostFieldAllocator.h"
#include "cuda/communication/GPUPackInfo.h"
#include "core/timing/TimingPool.h"
#include "core/timing/RemainingTimeLogger.h"
#include "cuda/AddGPUFieldToStorage.h"
#include "cuda/communication/UniformGPUScheme.h"
#include "lbm/sweeps/CellwiseSweep.h"
#include "domain_decomposition/SharedSweep.h"
#include "EquivalenceTest_LatticeModel.h"
#include "EquivalenceTest_GPUKernel.h"
#include "EquivalenceTest_GPUPackInfo.h"
using namespace walberla;
using NativeLatticeModel_T = lbm::D3Q19<lbm::collision_model::SRT, false>;
using GeneratedLatticeModel_T = lbm::EquivalenceTest_LatticeModel;
using Stencil_T = GeneratedLatticeModel_T::Stencil;
using CommunicationStencil_T = GeneratedLatticeModel_T::CommunicationStencil;
using NativePdfField_T = lbm::PdfField<NativeLatticeModel_T>;
using GeneratedPdfField_T = lbm::PdfField<GeneratedLatticeModel_T>;
using flag_t = walberla::uint8_t;
using FlagField_T = FlagField<flag_t>;
using CpuCommScheme_T = blockforest::communication::UniformBufferedScheme<CommunicationStencil_T>;
using GpuCommScheme_T = cuda::communication::UniformGPUScheme<CommunicationStencil_T>;
template<typename PdfField_T>
void initPdfField( const shared_ptr<StructuredBlockForest> &blocks, BlockDataID pdfFieldId )
{
auto domainBB = blocks->getDomainCellBB();
for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
{
auto pdfField = blockIt->getData<PdfField_T>( pdfFieldId );
Cell offset( 0, 0, 0 );
blocks->transformBlockLocalToGlobalCell( offset, *blockIt );
WALBERLA_FOR_ALL_CELLS_XYZ( pdfField,
auto globalX = real_c( offset[0] + x );
auto globalZ = real_c( offset[2] + z );
auto xArg = real_c(std::sin(real_c(globalX) / real_t(4) * real_c(domainBB.size(0)) ));
auto zArg = real_c(std::sin(real_c(globalZ) / real_t(4) * real_c(domainBB.size(2)) ));
pdfField->setToEquilibrium( x, y, z, Vector3<real_t>( real_t(0.05) * std::sin(xArg), 0,
real_t(0.05) * std::cos(zArg)));
);
}
}
int main( int argc, char **argv )
{
mpi::Environment env( argc, argv );
for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
{
auto config = *cfg;
auto parameters = config->getOneBlock( "Parameters" );
auto blocks = blockforest::createUniformBlockGridFromConfig( config );
const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 ));
// Boundary
BlockDataID flagFieldId = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field" );
const FlagUID fluidFlagUID( "Fluid" );
geometry::setNonBoundaryCellsToDomain<FlagField_T>( *blocks, flagFieldId, fluidFlagUID );
GeneratedLatticeModel_T generatedLatticeModel = GeneratedLatticeModel_T( omega );
// Part 1 : Native walberla
NativeLatticeModel_T nativeLatticeModel = NativeLatticeModel_T( lbm::collision_model::SRT( omega ));
BlockDataID pdfFieldNativeId = lbm::addPdfFieldToStorage( blocks, "pdfNative", nativeLatticeModel, field::fzyx );
initPdfField<NativePdfField_T >( blocks, pdfFieldNativeId );
CpuCommScheme_T nativeComm( blocks );
nativeComm.addPackInfo( make_shared< lbm::PdfFieldPackInfo< NativeLatticeModel_T > >( pdfFieldNativeId ) );
auto nativeSweep = lbm::makeCellwiseSweep< NativeLatticeModel_T , FlagField_T >( pdfFieldNativeId, flagFieldId, fluidFlagUID );
SweepTimeloop nativeTimeLoop( blocks->getBlockStorage(), timesteps );
nativeTimeLoop.add() << BeforeFunction( nativeComm, "communication" )
<< Sweep(makeSharedSweep(nativeSweep), "native stream collide" );
nativeTimeLoop.run();
// Part 2: Generated CPU Version
BlockDataID pdfFieldGeneratedId = lbm::addPdfFieldToStorage( blocks, "pdfGenerated", generatedLatticeModel, field::fzyx );
initPdfField<GeneratedPdfField_T >( blocks, pdfFieldGeneratedId );
CpuCommScheme_T cpuComm( blocks );
cpuComm.addPackInfo( make_shared< lbm::PdfFieldPackInfo< GeneratedLatticeModel_T > >( pdfFieldGeneratedId ) );
SweepTimeloop cpuTimeLoop( blocks->getBlockStorage(), timesteps );
cpuTimeLoop.add() << BeforeFunction( cpuComm, "communication" )
<< Sweep(GeneratedLatticeModel_T::Sweep( pdfFieldGeneratedId ), "generated stream collide on cpu" );
cpuTimeLoop.run();
// Part 3: Generated GPU Version
bool overlapCommunication = parameters.getParameter<bool>( "overlapCommunication", true );
bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
bool oldCommunication = parameters.getParameter<bool>( "oldCommunication", false );
BlockDataID pdfShadowCPU = lbm::addPdfFieldToStorage( blocks, "cpu shadow field", generatedLatticeModel, field::fzyx );
initPdfField<GeneratedPdfField_T >( blocks, pdfShadowCPU );
BlockDataID pdfGpuFieldId = cuda::addGPUFieldToStorage<GeneratedPdfField_T >( blocks, pdfShadowCPU, "pdfs on gpu", true );
auto defaultKernelStream = overlapCommunication ? cuda::StreamRAII::newStream() : cuda::StreamRAII::defaultStream();
auto innerKernelStartedEvent = make_shared<cuda::EventRAII>();
pystencils::EquivalenceTest_GPUKernel cudaLbKernel( pdfGpuFieldId, omega, defaultKernelStream );
GpuCommScheme_T gpuComm( blocks, innerKernelStartedEvent, cudaEnabledMPI );
gpuComm.addPackInfo( make_shared<pystencils::EquivalenceTest_GPUPackInfo>( pdfGpuFieldId ));
auto runCommunication = [&]() { gpuComm(); };
CpuCommScheme_T oldGpuScheme( blocks );
std::vector<cudaStream_t > streams;
for(uint_t i=0; i < Stencil_T::Size; ++i ) {
cudaStream_t s;
cudaStreamCreate(&s);
streams.push_back(s);
}
using OldPackInfo = cuda::communication::GPUPackInfo<cuda::GPUField<real_t> >;
oldGpuScheme.addPackInfo( make_shared<OldPackInfo>(pdfGpuFieldId, streams) );
SweepTimeloop gpuTimeLoop( blocks->getBlockStorage(), timesteps );
if( !overlapCommunication )
{
gpuTimeLoop.add() << (oldCommunication ? BeforeFunction(oldGpuScheme) :
BeforeFunction( runCommunication, "gpu communication" ))
<< Sweep( cudaLbKernel, "LB stream & collide gpu" );
}
else
{
gpuTimeLoop.add() << Sweep( [&]( IBlock *b )
{
cudaEventRecord( *innerKernelStartedEvent, defaultKernelStream );
cudaLbKernel.inner( b );
}, "LBM @ inner" );
gpuTimeLoop.add() << BeforeFunction( runCommunication, "gpu communication" )
<< Sweep( [&]( IBlock *b ) { cudaLbKernel.outer( b ); }, "LBM @ outer" );
}
gpuTimeLoop.run();
cuda::fieldCpy<GeneratedPdfField_T, cuda::GPUField<real_t>> (blocks, pdfShadowCPU, pdfGpuFieldId);
// Compare all three versions
auto errorCPU = real_t(0);
auto errorGPU = real_t(0);
for( auto & block : *blocks )
{
auto native = block.getData<NativePdfField_T>( pdfFieldNativeId );
auto cpu = block.getData<GeneratedPdfField_T >( pdfFieldGeneratedId );
auto gpu = block.getData<GeneratedPdfField_T>( pdfShadowCPU );
WALBERLA_FOR_ALL_CELLS_XYZ(native,
for(cell_idx_t f = 0; f < cell_idx_c(NativeLatticeModel_T::Stencil::Q); ++f )
{
errorCPU += std::abs( native->get( x, y, z, f ) - cpu->get( x, y, z, f ));
errorGPU += std::abs( native->get( x, y, z, f ) - gpu->get( x, y, z, f ));
}
)
}
mpi::reduceInplace(errorCPU, mpi::SUM);
mpi::reduceInplace(errorGPU, mpi::SUM);
auto domainBB = blocks->getDomainCellBB();
errorCPU /= real_c(domainBB.numCells());
errorGPU /= real_c(domainBB.numCells());
WALBERLA_LOG_RESULT_ON_ROOT("CPU Error " << errorCPU );
WALBERLA_LOG_RESULT_ON_ROOT("GPU Error " << errorGPU );
WALBERLA_CHECK_FLOAT_EQUAL(errorCPU, real_c(0.0));
WALBERLA_CHECK_FLOAT_EQUAL(errorGPU, real_c(0.0));
}
return 0;
}
\ No newline at end of file
codegen/EquivalenceTest.gen.py (removed):

import sympy as sp
from lbmpy_walberla import generate_lattice_model_files
from lbmpy.creationfunctions import create_lb_update_rule
from pystencils_walberla.sweep import Sweep

dtype = 'float64'

# LB options
options = {
    'method': 'srt',
    'stencil': 'D3Q19',
    'relaxation_rate': sp.Symbol("omega"),
    'field_name': 'pdfs',
    'compressible': False,
    'maxwellian_moments': False,
    'temporary_field_name': 'pdfs_tmp',
    'optimization': {'cse_global': True,
                     'cse_pdfs': True,
                     'double_precision': dtype == 'float64'}
}

# GPU optimization options
opt = {'gpu_indexing_params': {'block_size': (128, 1, 1)}, 'data_type': dtype}
outer_opt = {'gpu_indexing_params': {'block_size': (32, 32, 32)}, 'data_type': dtype}


def lb_assignments():
    ur = create_lb_update_rule(**options)
    return ur.all_assignments


generate_lattice_model_files(class_name='EquivalenceTest_LatticeModel', **options)

Sweep.generate_inner_outer_kernel('EquivalenceTest_GPUKernel',
                                  lambda: create_lb_update_rule(**options).all_assignments,
                                  target='gpu',
                                  temporary_fields=['pdfs_tmp'],
                                  field_swaps=[('pdfs', 'pdfs_tmp')],
                                  optimization=opt,
                                  outer_optimization=outer_opt)

Sweep.generate_pack_info('EquivalenceTest_GPUPackInfo', lb_assignments, target='gpu')