diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py index 571f3129ef00a3b4a2a258059b25263170cc73fb..f616ba407e10e644878df134f0f295aa75d8fcf6 100644 --- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py +++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py @@ -1,9 +1,9 @@ from pystencils.field import fields -from lbmpy.advanced_streaming.utility import get_timesteps, Timestep +from lbmpy.advanced_streaming.utility import get_timesteps from lbmpy.macroscopic_value_kernels import macroscopic_values_setter from lbmpy.stencils import get_stencil -from lbmpy.creationfunctions import create_lb_collision_rule, create_lb_method, create_lb_update_rule +from lbmpy.creationfunctions import create_lb_collision_rule from lbmpy.boundaries import NoSlip, UBB, ExtrapolationOutflow from pystencils_walberla import CodeGeneration, generate_sweep, generate_info_header diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt index 29755644d22f0542ac29b50e848f20dd6a2c9974..b3ef93bb589a84f807ac00464fa84cd141c1eb0c 100644 --- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt +++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt @@ -4,49 +4,27 @@ waLBerla_link_files_to_builddir( "*.py" ) waLBerla_link_files_to_builddir( "simulation_setup" ) -foreach (config srt trt mrt smagorinsky entropic smagorinsky_noopt entropic_kbc_n4 - entropic_kbc_n4_noopt mrt_noopt mrt_full mrt_full_noopt - cumulant cumulant_d3q27 - srt_d3q27 mrt_d3q27 mrt_d3q27_noopt smagorinsky_d3q27 smagorinsky_d3q27_noopt mrt_full_d3q27 mrt_full_d3q27_noopt) - - waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config} - FILE UniformGridGPU.py - CODEGEN_CFG ${config} - OUT_FILES UniformGridGPU_LatticeModel.cpp UniformGridGPU_LatticeModel.h - UniformGridGPU_LbKernel.cu UniformGridGPU_LbKernel.h - UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h - UniformGridGPU_UBB.cu UniformGridGPU_UBB.h - UniformGridGPU_PackInfo.cu UniformGridGPU_PackInfo.h - UniformGridGPU_MacroSetter.cpp UniformGridGPU_MacroSetter.h - UniformGridGPU_MacroGetter.cpp UniformGridGPU_MacroGetter.h - UniformGridGPU_Defines.h - ) - - - waLBerla_add_executable(NAME UniformGridBenchmarkGPU_${config} - FILES UniformGridGPU.cpp - DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui UniformGridGPUGenerated_${config}) - set_target_properties( UniformGridBenchmarkGPU_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden) -endforeach () - - -foreach (config srt trt mrt smagorinsky entropic) - - waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_AA_${config} - FILE UniformGridGPU_AA.py - CODEGEN_CFG ${config} - OUT_FILES UniformGridGPU_AA_PackInfoPull.cu UniformGridGPU_AA_PackInfoPull.h - UniformGridGPU_AA_LbKernelOdd.cu UniformGridGPU_AA_LbKernelOdd.h - UniformGridGPU_AA_LbKernelEven.cu UniformGridGPU_AA_LbKernelEven.h - UniformGridGPU_AA_PackInfoPush.cu UniformGridGPU_AA_PackInfoPush.h - UniformGridGPU_AA_MacroSetter.cpp UniformGridGPU_AA_MacroSetter.h - UniformGridGPU_AA_MacroGetter.cpp UniformGridGPU_AA_MacroGetter.h - UniformGridGPU_AA_Defines.h - ) - - - waLBerla_add_executable(NAME UniformGridBenchmarkGPU_AA_${config} - FILES UniformGridGPU_AA.cpp - DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui UniformGridGPUGenerated_AA_${config}) - set_target_properties( UniformGridBenchmarkGPU_AA_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden) -endforeach () +foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist} + foreach(stencil d3q27) # choose from {d3q19 d3q27} + foreach (collision_setup srt trt mrt cumulant) # choose from {srt trt mrt cumulant entropic smagorinsky} + set(config ${stencil}_${streaming_pattern}_${collision_setup}) + waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config} + FILE UniformGridGPU.py + CODEGEN_CFG ${config} + OUT_FILES UniformGridGPU_LbKernel.cu UniformGridGPU_LbKernel.h + UniformGridGPU_PackInfoEven.cu UniformGridGPU_PackInfoEven.h + UniformGridGPU_PackInfoOdd.cu UniformGridGPU_PackInfoOdd.h + UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h + UniformGridGPU_UBB.cu UniformGridGPU_UBB.h + UniformGridGPU_MacroSetter.cu UniformGridGPU_MacroSetter.h + UniformGridGPU_InfoHeader.h + ) + + + waLBerla_add_executable(NAME UniformGridGPU_${config} + FILES UniformGridGPU.cpp + DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk UniformGridGPUGenerated_${config}) + set_target_properties( UniformGridGPU_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden) + endforeach () + endforeach() +endforeach() \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp index a123cbf34d4470fc4daa8943956faa7c206e9ad6..bdcffe3ad3a8c1992d7746f4e40fd34fc9325c4a 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp @@ -1,361 +1,302 @@ +#include "blockforest/Initialization.h" + #include "core/Environment.h" #include "core/logging/Initialization.h" -#include "core/math/Random.h" -#include "python_coupling/CreateConfig.h" -#include "python_coupling/PythonCallback.h" -#include "python_coupling/DictWrapper.h" -#include "blockforest/Initialization.h" -#include "field/FlagField.h" -#include "field/AddToStorage.h" -#include "field/vtk/VTKWriter.h" -#include "field/communication/PackInfo.h" -#include "lbm/PerformanceLogger.h" -#include "blockforest/communication/UniformBufferedScheme.h" -#include "timeloop/all.h" -#include "core/math/Random.h" -#include "geometry/all.h" -#include "cuda/HostFieldAllocator.h" -#include "cuda/communication/GPUPackInfo.h" -#include "cuda/ParallelStreams.h" -#include "cuda/NVTX.h" -#include "core/timing/TimingPool.h" #include "core/timing/RemainingTimeLogger.h" +#include "core/timing/TimingPool.h" + #include "cuda/AddGPUFieldToStorage.h" -#include "cuda/communication/UniformGPUScheme.h" #include "cuda/DeviceSelectMPI.h" -#include "domain_decomposition/SharedSweep.h" -#include "gui/Gui.h" -#include "lbm/gui/Connection.h" - -#include "UniformGridGPU_LatticeModel.h" -#include "UniformGridGPU_LbKernel.h" -#include "UniformGridGPU_PackInfo.h" -#include "UniformGridGPU_UBB.h" -#include "UniformGridGPU_NoSlip.h" -#include "UniformGridGPU_Communication.h" -#include "UniformGridGPU_MacroSetter.h" -#include "UniformGridGPU_MacroGetter.h" -#include "UniformGridGPU_Defines.h" +#include "cuda/ParallelStreams.h" +#include "cuda/communication/UniformGPUScheme.h" +#include "cuda/FieldCopy.h" +#include "cuda/lbm/CombinedInPlaceGpuPackInfo.h" +#include "field/AddToStorage.h" +#include "field/FlagField.h" +#include "field/communication/PackInfo.h" +#include "field/vtk/VTKWriter.h" -using namespace walberla; +#include "geometry/InitBoundaryHandling.h" -using LatticeModel_T = lbm::UniformGridGPU_LatticeModel; +#include "lbm/inplace_streaming/TimestepTracker.h" -const auto Q = LatticeModel_T::Stencil::Q; +#include "python_coupling/CreateConfig.h" +#include "python_coupling/DictWrapper.h" +#include "python_coupling/PythonCallback.h" +#include "timeloop/SweepTimeloop.h" -using Stencil_T = LatticeModel_T::Stencil; -using CommunicationStencil_T = LatticeModel_T::CommunicationStencil; -using PdfField_T = GhostLayerField<real_t, Q>; -using CommScheme_T = cuda::communication::UniformGPUScheme<CommunicationStencil_T>; -using VelocityField_T = GhostLayerField<real_t, 3>; -using flag_t = walberla::uint8_t; -using FlagField_T = FlagField<flag_t>; +#include "InitShearVelocity.h" +#include <cmath> -void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID velFieldID, - const real_t xMagnitude=real_t(0.1), const real_t fluctuationMagnitude=real_t(0.05) ) -{ - math::seedRandomGenerator(0); - auto halfZ = blocks->getDomainCellBB().zMax() / 2; - for( auto & block: *blocks) - { - auto velField = block.getData<VelocityField_T>( velFieldID ); - WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velField, - Cell globalCell; - blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z)); - real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude); - velField->get(x, y, z, 1) = real_t(0); - velField->get(x, y, z, 2) = randomReal; - - if( globalCell[2] >= halfZ ) { - velField->get(x, y, z, 0) = xMagnitude; - } else { - velField->get(x, y, z, 0) = -xMagnitude; - } - ); - } -} +#include "UniformGridGPU_InfoHeader.h" +using namespace walberla; +using FlagField_T = FlagField<uint8_t>; -int main( int argc, char **argv ) +int main(int argc, char** argv) { - mpi::Environment env( argc, argv ); + mpi::Environment env(argc, argv); cuda::selectDeviceBasedOnMpiRank(); - for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg ) + for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) { - WALBERLA_MPI_WORLD_BARRIER(); + WALBERLA_MPI_WORLD_BARRIER() + + WALBERLA_CUDA_CHECK(cudaPeekAtLastError()) + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// SETUP AND CONFIGURATION /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// auto config = *cfg; - logging::configureLogging( config ); - auto blocks = blockforest::createUniformBlockGridFromConfig( config ); + logging::configureLogging(config); + auto blocks = blockforest::createUniformBlockGridFromConfig(config); - Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t> >( "cellsPerBlock" ); + Vector3< uint_t > cellsPerBlock = + config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock"); // Reading parameters - auto parameters = config->getOneBlock( "Parameters" ); - const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal"); - const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 )); - const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 )); + auto parameters = config->getOneBlock("Parameters"); + const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4)); + const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50)); const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", true); // Creating fields - BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(0), field::fzyx); - BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t(0), field::fzyx); + BlockDataID pdfFieldCpuID = + field::addToStorage< PdfField_T >(blocks, "pdfs cpu", real_t(std::nan("")), field::fzyx); + BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >(blocks, "vel", real_t(0), field::fzyx); + + // Initialize velocity on cpu + if( initShearFlow ){ + WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow") + initShearVelocity(blocks, velFieldCpuID); + } + + BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldCpuID, "pdfs on GPU", true); + // Velocity field is copied to the GPU + BlockDataID velFieldGpuID = + cuda::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldCpuID, "velocity on GPU", true); - if( timeStepStrategy != "kernelOnlyNoInit") + pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldGpuID, velFieldGpuID); + + // Set up initial PDF values + for (auto& block : *blocks) + setterSweep(&block); + + Vector3< int > innerOuterSplit = + parameters.getParameter< Vector3< int > >("innerOuterSplit", Vector3< int >(1, 1, 1)); + + for (uint_t i = 0; i < 3; ++i) { - if ( initShearFlow ) - { - WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" ); - initShearVelocity( blocks, velFieldCpuID ); - } - - pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldCpuID, velFieldCpuID); - for( auto & block : *blocks ) - setterSweep( &block ); - - // setter sweep only initializes interior of domain - for push schemes to work a first communication is required here - blockforest::communication::UniformBufferedScheme<CommunicationStencil_T> initialComm(blocks); - initialComm.addPackInfo( make_shared< field::communication::PackInfo<PdfField_T> >( pdfFieldCpuID ) ); - initialComm(); + if (int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) + { + WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock") + } } - BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true ); - BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" ); + Cell innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]); + bool cudaEnabledMPI = parameters.getParameter< bool >("cudaEnabledMPI", false); + Vector3< int32_t > gpuBlockSize = + parameters.getParameter< Vector3< int32_t > >("gpuBlockSize", Vector3< int32_t >(256, 1, 1)); + int streamHighPriority = 0; + int streamLowPriority = 0; + WALBERLA_CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority)) + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// LB SWEEPS AND BOUNDARY HANDLING /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + using LbSweep = lbm::UniformGridGPU_LbKernel; + using PackInfoEven = lbm::UniformGridGPU_PackInfoEven; + using PackInfoOdd = lbm::UniformGridGPU_PackInfoOdd; + using cuda::communication::UniformGPUScheme; + + LbSweep lbSweep(pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], innerOuterSplitCell); + lbSweep.setOuterPriority(streamHighPriority); // Boundaries const FlagUID fluidFlagUID( "Fluid" ); + BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>(blocks, "Boundary Flag Field"); auto boundariesConfig = config->getBlock( "Boundaries" ); - bool disableBoundaries = true; + bool boundaries = false; if( boundariesConfig ) { - disableBoundaries = false; - geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig ); - geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID ); + boundaries = true; + geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig ); + geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID ); } - lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID); lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID); + noSlip.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID); - ubb.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID ); - noSlip.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID ); - - // Communication setup - bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false ); - Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1)); - const std::string communicationSchemeStr = parameters.getParameter<std::string>("communicationScheme", "UniformGPUScheme_Baseline"); - CommunicationSchemeType communicationScheme; - if( communicationSchemeStr == "GPUPackInfo_Baseline") - communicationScheme = GPUPackInfo_Baseline; - else if (communicationSchemeStr == "GPUPackInfo_Streams") - communicationScheme = GPUPackInfo_Streams; - else if (communicationSchemeStr == "UniformGPUScheme_Baseline") - communicationScheme = UniformGPUScheme_Baseline; - else if (communicationSchemeStr == "UniformGPUScheme_Memcpy") - communicationScheme = UniformGPUScheme_Memcpy; - else if (communicationSchemeStr == "MPIDatatypes") - communicationScheme = MPIDatatypes; - else if (communicationSchemeStr == "MPIDatatypesFull") - communicationScheme = MPIDatatypesFull; - else { - WALBERLA_ABORT_NO_DEBUG_INFO("Invalid choice for communicationScheme") - } + lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID); + ubb.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID); - Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1)); - for(uint_t i=0; i< 3; ++i) - { - if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) { - WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock"); - } - } + // Initial setup is the post-collision state of an even time step + auto tracker = make_shared< lbm::TimestepTracker >(0); - int streamHighPriority = 0; - int streamLowPriority = 0; - WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority) ); - WALBERLA_CHECK(gpuBlockSize[2] == 1); - pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega, - 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, - gpuBlockSize[0], gpuBlockSize[1], - Cell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]) ); - lbKernel.setOuterPriority( streamHighPriority ); - UniformGridGPU_Communication< CommunicationStencil_T, cuda::GPUField< double > > - gpuComm( blocks, pdfFieldGpuID, (CommunicationSchemeType) communicationScheme, cudaEnabledMPI ); - - auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority ); - auto innerOuterStreams = cuda::ParallelStreams( streamHighPriority ); - auto boundaryOuterStreams = cuda::ParallelStreams( streamHighPriority ); - auto boundaryInnerStreams = cuda::ParallelStreams( streamHighPriority ); - - uint_t currentTimeStep = 0; - - auto simpleOverlapTimeStep = [&] () - { - gpuComm.startCommunication(defaultStream); - for( auto &block: *blocks ) - lbKernel.inner( &block, defaultStream ); - gpuComm.wait(defaultStream); - for( auto &block: *blocks ) - lbKernel.outer( &block, defaultStream ); - }; + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// COMMUNICATION SCHEME /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - auto overlapTimeStep = [&]() - { - cuda::NvtxRange namedRange("timestep"); - auto innerOuterSection = innerOuterStreams.parallelSection( defaultStream ); + UniformGPUScheme< Stencil_T > comm(blocks, cudaEnabledMPI); + auto packInfo = make_shared< lbm::CombinedInPlaceGpuPackInfo< PackInfoEven, PackInfoOdd > >(tracker, pdfFieldGpuID); + comm.addPackInfo(packInfo); - innerOuterSection.run([&]( auto innerStream ) - { - cuda::nameStream(innerStream, "inner stream"); - for( auto &block: *blocks ) - { - if(!disableBoundaries) - { - auto p = boundaryInnerStreams.parallelSection( innerStream ); - p.run( [&block, &ubb]( cudaStream_t s ) { ubb.inner( &block, s ); } ); - p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip.inner( &block, s ); } ); - } - lbKernel.inner( &block, innerStream ); - } - }); + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// TIME STEP DEFINITIONS /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - innerOuterSection.run([&]( auto outerStream ) - { - cuda::nameStream(outerStream, "outer stream"); - gpuComm( outerStream ); + auto defaultStream = cuda::StreamRAII::newPriorityStream(streamLowPriority); - for( auto &block: *blocks ) - { - if(!disableBoundaries) - { - auto p = boundaryOuterStreams.parallelSection( outerStream ); - p.run( [&block, &ubb]( cudaStream_t s ) { ubb.outer( &block, s ); } ); - p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip.outer( &block, s ); } ); - } - lbKernel.outer( &block, outerStream ); - } - }); - currentTimeStep += 1; + auto boundarySweep = [&](IBlock * block, uint8_t t, cudaStream_t stream){ + noSlip.run(block, t, stream); + ubb.run(block, t, stream); }; + auto boundaryInner = [&](IBlock * block, uint8_t t, cudaStream_t stream){ + noSlip.inner(block, t, stream); + ubb.inner(block, t, stream); + }; - auto boundaryStreams = cuda::ParallelStreams( streamHighPriority ); - auto normalTimeStep = [&]() - { - gpuComm(); - for( auto &block: *blocks ) - { - if(!disableBoundaries) - { - auto p = boundaryStreams.parallelSection( defaultStream ); - p.run( [&block, &ubb]( cudaStream_t s ) { ubb( &block, s ); } ); - p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip( &block, s ); } ); - } - lbKernel( &block ); + auto boundaryOuter = [&](IBlock * block, uint8_t t, cudaStream_t stream){ + noSlip.outer(block, t, stream); + ubb.outer(block, t, stream); + }; + + auto simpleOverlapTimeStep = [&]() { + // Communicate post-collision values of previous timestep... + comm.startCommunication(defaultStream); + for (auto& block : *blocks){ + if(boundaries) boundaryInner(&block, tracker->getCounter(), defaultStream); + lbSweep.inner(&block, tracker->getCounterPlusOne(), defaultStream); } + comm.wait(defaultStream); + for (auto& block : *blocks){ + if(boundaries) boundaryOuter(&block, tracker->getCounter(), defaultStream); + lbSweep.outer(&block, tracker->getCounterPlusOne(), defaultStream); + } + + tracker->advance(); }; - auto kernelOnlyFunc = [&] () - { - for( auto &block: *blocks ) - lbKernel( &block ); + auto normalTimeStep = [&]() { + comm.communicate(defaultStream); + for (auto& block : *blocks){ + if(boundaries) boundarySweep(&block, tracker->getCounter(), defaultStream); + lbSweep(&block, tracker->getCounterPlusOne(), defaultStream); + } + + tracker->advance(); }; - SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps ); + // With two-fields patterns, ghost layer cells act as constant stream-in boundaries; + // with in-place patterns, ghost layer cells act as wet-node no-slip boundaries. + auto kernelOnlyFunc = [&]() { + tracker->advance(); + for (auto& block : *blocks) + lbSweep(&block, tracker->getCounter(), defaultStream); + }; - std::function<void()> timeStep; + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// TIME LOOP SETUP /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps); + + const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal"); + std::function< void() > timeStep; if (timeStepStrategy == "noOverlap") - timeStep = std::function<void()>( normalTimeStep ); - else if (timeStepStrategy == "complexOverlap") - timeStep = std::function<void()>( overlapTimeStep ); + timeStep = std::function< void() >(normalTimeStep); else if (timeStepStrategy == "simpleOverlap") - timeStep = simpleOverlapTimeStep; - else if (timeStepStrategy == "kernelOnly" or timeStepStrategy == "kernelOnlyNoInit") { - WALBERLA_LOG_INFO_ON_ROOT("Running only compute kernel without boundary - this makes only sense for benchmarking!") - timeStep = kernelOnlyFunc; + timeStep = simpleOverlapTimeStep; + else if (timeStepStrategy == "kernelOnly") + { + WALBERLA_LOG_INFO_ON_ROOT( + "Running only compute kernel without boundary - this makes only sense for benchmarking!") + // Run initial communication once to provide any missing stream-in populations + comm.communicate(); + timeStep = kernelOnlyFunc; } - else { - WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', 'complexOverlap', 'simpleOverlap', 'kernelOnly'"); + else + { + WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', " + "'simpleOverlap', 'kernelOnly'") } - timeLoop.add() << BeforeFunction( timeStep ) - << Sweep( []( IBlock * ) {}, "time step" ); - - pystencils::UniformGridGPU_MacroGetter getterSweep( pdfFieldCpuID, velFieldCpuID ); + timeLoop.add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step"); // VTK - uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 ); - if( vtkWriteFrequency > 0 ) + uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + if (vtkWriteFrequency > 0) { - auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", - "simulation_step", false, true, true, false, 0 ); - auto velWriter = make_shared< field::VTKWriter<VelocityField_T> >(velFieldCpuID, "vel"); + auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", + "simulation_step", false, true, true, false, 0); + auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >(velFieldCpuID, "vel"); vtkOutput->addCellDataWriter(velWriter); - vtkOutput->addBeforeFunction( [&]() { - cuda::fieldCpy<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID ); - for( auto & block : *blocks ) - getterSweep( &block ); + + vtkOutput->addBeforeFunction([&]() { + cuda::fieldCpy< VelocityField_T , cuda::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID); }); - timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" ); + timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); } + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// BENCHMARK /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - - int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 ); - int outerIterations = parameters.getParameter<int>( "outerIterations", 1 ); - for(int i=0; i < warmupSteps; ++i ) + int warmupSteps = parameters.getParameter< int >("warmupSteps", 2); + int outerIterations = parameters.getParameter< int >("outerIterations", 1); + for (int i = 0; i < warmupSteps; ++i) timeLoop.singleStep(); - auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds - if (remainingTimeLoggerFrequency > 0) { - auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * uint_c( outerIterations ), remainingTimeLoggerFrequency ); - timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" ); - } - - bool useGui = parameters.getParameter<bool>( "useGui", false ); - if( useGui ) + double remainingTimeLoggerFrequency = + parameters.getParameter< double >("remainingTimeLoggerFrequency", -1.0); // in seconds + if (remainingTimeLoggerFrequency > 0) { - GUI gui( timeLoop, blocks, argc, argv); - lbm::connectToGui<LatticeModel_T>(gui); - gui.run(); + auto logger = timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps() * uint_c(outerIterations), + remainingTimeLoggerFrequency); + timeLoop.addFuncAfterTimeStep(logger, "remaining time logger"); } - else + + for (int outerIteration = 0; outerIteration < outerIterations; ++outerIteration) { - for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration ) - { - timeLoop.setCurrentTimeStepToZero(); - WcTimer simTimer; - cudaDeviceSynchronize(); - WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" ); - simTimer.start(); - timeLoop.run(); - cudaDeviceSynchronize(); - simTimer.end(); - WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" ); - auto time = simTimer.last(); - auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] ); - auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6; - WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess ); - WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps )); - WALBERLA_ROOT_SECTION() - { - python_coupling::PythonCallback pythonCallbackResults( "results_callback" ); - if ( pythonCallbackResults.isCallable()) - { - const char * storagePattern = "twofield"; - pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess ); - pythonCallbackResults.data().exposeValue( "stencil", infoStencil ); - pythonCallbackResults.data().exposeValue( "configName", infoConfigName ); - pythonCallbackResults.data().exposeValue( "storagePattern", storagePattern ); - pythonCallbackResults.data().exposeValue( "cse_global", infoCseGlobal ); - pythonCallbackResults.data().exposeValue( "cse_pdfs", infoCsePdfs ); - // Call Python function to report results - pythonCallbackResults(); - } - } - } + WALBERLA_CUDA_CHECK(cudaPeekAtLastError()) + + timeLoop.setCurrentTimeStepToZero(); + WcTimer simTimer; + cudaDeviceSynchronize(); + WALBERLA_CUDA_CHECK(cudaPeekAtLastError()) + WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps") + simTimer.start(); + timeLoop.run(); + cudaDeviceSynchronize(); + simTimer.end(); + WALBERLA_LOG_INFO_ON_ROOT("Simulation finished") + auto time = simTimer.last(); + auto nrOfCells = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]); + + auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6; + WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process " << mlupsPerProcess) + WALBERLA_LOG_RESULT_ON_ROOT("Time per time step " << time / real_c(timesteps)) + WALBERLA_ROOT_SECTION() + { + python_coupling::PythonCallback pythonCallbackResults("results_callback"); + if (pythonCallbackResults.isCallable()) + { + pythonCallbackResults.data().exposeValue("mlupsPerProcess", mlupsPerProcess); + pythonCallbackResults.data().exposeValue("stencil", infoStencil); + pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern); + pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup); + pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal); + pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs); + // Call Python function to report results + pythonCallbackResults(); + } + } } } diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py index 6f9670d1d6b3609b21be57fc7d026389229fb320..7d3c8e7f21e602128564f4ea8e22a119e7080989 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py @@ -1,19 +1,22 @@ import sympy as sp import numpy as np import pystencils as ps -from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule, create_lb_collision_rule -from lbmpy.boundaries import NoSlip, UBB -from lbmpy.fieldaccess import StreamPullTwoFieldsAccessor -from pystencils_walberla import generate_pack_info_from_kernel -from lbmpy_walberla import generate_lattice_model, generate_boundary -from pystencils_walberla import CodeGeneration, generate_sweep + from pystencils.data_types import TypedSymbol from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions -from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter + +from lbmpy.advanced_streaming import Timestep, is_inplace +from lbmpy.advanced_streaming.utility import streaming_patterns +from lbmpy.boundaries import NoSlip, UBB +from lbmpy.creationfunctions import create_lb_collision_rule +from lbmpy.macroscopic_value_kernels import macroscopic_values_setter +from lbmpy.stencils import get_stencil + +from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep +from lbmpy_walberla import generate_alternating_lbm_sweep, generate_lb_pack_info, generate_alternating_lbm_boundary omega = sp.symbols("omega") omega_free = sp.Symbol("omega_free") -omega_fill = sp.symbols("omega_:10") compile_time_block_size = False if compile_time_block_size: @@ -21,156 +24,158 @@ if compile_time_block_size: else: sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32), TypedSymbol("cudaBlockSize1", np.int32), - 1) + TypedSymbol("cudaBlockSize2", np.int32)) -sweep_params = {'block_size': sweep_block_size} +gpu_indexing_params = {'block_size': sweep_block_size} options_dict = { 'srt': { 'method': 'srt', - 'stencil': 'D3Q19', 'relaxation_rate': omega, 'compressible': False, }, 'trt': { 'method': 'trt', - 'stencil': 'D3Q19', 'relaxation_rate': omega, }, 'mrt': { 'method': 'mrt', - 'stencil': 'D3Q19', - 'relaxation_rates': [omega, 1.3, 1.4, 1.2, 1.1, 1.15, 1.234, 1.4235], + 'relaxation_rates': [omega, 1, 1, 1, 1, 1, 1], }, - 'mrt_full': { + 'mrt-overrelax': { 'method': 'mrt', - 'stencil': 'D3Q19', - 'relaxation_rates': [omega_fill[0], omega, omega_fill[1], omega_fill[2], - omega_fill[3], omega_fill[4], omega_fill[5]], + 'relaxation_rates': [omega, 1.3, 1.4, omega, 1.2, 1.1], }, - 'entropic': { - 'method': 'mrt', - 'stencil': 'D3Q19', + 'cumulant': { + 'method': 'cumulant', + 'relaxation_rate': omega, 'compressible': True, - 'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free, omega_free], - 'entropic': True, }, - 'entropic_kbc_n4': { - 'method': 'trt-kbc-n4', - 'stencil': 'D3Q27', + 'cumulant-overrelax': { + 'method': 'cumulant', + 'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)], 'compressible': True, - 'relaxation_rates': [omega, omega_free], + }, + 'entropic': { + 'method': 'mrt', + 'compressible': True, + 'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free], 'entropic': True, }, 'smagorinsky': { 'method': 'srt', - 'stencil': 'D3Q19', 'smagorinsky': True, 'relaxation_rate': omega, - }, - 'cumulant': { - 'method': 'cumulant', - 'stencil': 'D3Q19', - 'compressible': True, - 'relaxation_rate': omega, - }, + } } info_header = """ -#include "stencil/D3Q{q}.h"\nusing Stencil_T = walberla::stencil::D3Q{q}; const char * infoStencil = "{stencil}"; -const char * infoConfigName = "{configName}"; +const char * infoStreamingPattern = "{streaming_pattern}"; +const char * infoCollisionSetup = "{collision_setup}"; const bool infoCseGlobal = {cse_global}; const bool infoCsePdfs = {cse_pdfs}; """ +# DEFAULTS +optimize = True with CodeGeneration() as ctx: - accessor = StreamPullTwoFieldsAccessor() - # accessor = StreamPushTwoFieldsAccessor() - assert not accessor.is_inplace, "This app does not work for inplace accessors" + config_tokens = ctx.config.split('_') + + assert len(config_tokens) >= 3 + stencil_str = config_tokens[0] + streaming_pattern = config_tokens[1] + collision_setup = config_tokens[2] + + if len(config_tokens) >= 4: + optimize = (config_tokens[3] != 'noopt') + + stencil = get_stencil(stencil_str) + assert streaming_pattern in streaming_patterns, f"Invalid streaming pattern: {streaming_pattern}" + + options = options_dict[collision_setup] + + q = len(stencil) + dim = len(stencil[0]) + assert dim == 3, "This app supports only three-dimensional stencils" + pdfs, pdfs_tmp, velocity_field = ps.fields(f"pdfs({q}), pdfs_tmp({q}), velocity(3) : double[3D]", layout='fzyx') common_options = { - 'field_name': 'pdfs', - 'temporary_field_name': 'pdfs_tmp', - 'kernel_type': accessor, - 'optimization': {'cse_global': True, - 'cse_pdfs': False} + 'stencil': stencil, + 'field_name': pdfs.name, + 'optimization': { + 'target': 'gpu', + 'cse_global': True, + 'cse_pdfs': False, + 'symbolic_field': pdfs, + 'field_layout': 'fzyx', + 'gpu_indexing_params': gpu_indexing_params, + } } - config_name = ctx.config - noopt = False - d3q27 = False - if config_name.endswith("_noopt"): - noopt = True - config_name = config_name[:-len("_noopt")] - if config_name.endswith("_d3q27"): - d3q27 = True - config_name = config_name[:-len("_d3q27")] - - options = options_dict[config_name] - options.update(common_options) - options = options.copy() - if noopt: - options['optimization']['cse_global'] = False - options['optimization']['cse_pdfs'] = False - if d3q27: - options['stencil'] = 'D3Q27' + options.update(common_options) - stencil_str = options['stencil'] - q = int(stencil_str[stencil_str.find('Q') + 1:]) - pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx') - options['optimization']['symbolic_field'] = pdfs + if not is_inplace(streaming_pattern): + options['optimization']['symbolic_temporary_field'] = pdfs_tmp + field_swaps = [(pdfs, pdfs_tmp)] + else: + field_swaps = [] vp = [ - ('double', 'omega_0'), - ('double', 'omega_1'), - ('double', 'omega_2'), - ('double', 'omega_3'), - ('double', 'omega_4'), - ('double', 'omega_5'), - ('double', 'omega_6'), ('int32_t', 'cudaBlockSize0'), ('int32_t', 'cudaBlockSize1'), + ('int32_t', 'cudaBlockSize2') ] - lb_method = create_lb_method(**options) - update_rule = create_lb_update_rule(lb_method=lb_method, **options) - - if not noopt: - update_rule = insert_fast_divisions(update_rule) - update_rule = insert_fast_sqrts(update_rule) - - # CPU lattice model - required for macroscopic value computation, VTK output etc. - options_without_opt = options.copy() - del options_without_opt['optimization'] - generate_lattice_model(ctx, 'UniformGridGPU_LatticeModel', create_lb_collision_rule(lb_method=lb_method, - **options_without_opt)) - - # gpu LB sweep & boundaries - generate_sweep(ctx, 'UniformGridGPU_LbKernel', update_rule, - field_swaps=[('pdfs', 'pdfs_tmp')], - inner_outer_split=True, target='gpu', gpu_indexing_params=sweep_params, - varying_parameters=vp) - - generate_boundary(ctx, 'UniformGridGPU_NoSlip', NoSlip(), lb_method, target='gpu') - generate_boundary(ctx, 'UniformGridGPU_UBB', UBB([0.05, 0, 0]), lb_method, target='gpu') + + # LB Sweep + collision_rule = create_lb_collision_rule(**options) + + if optimize: + collision_rule = insert_fast_divisions(collision_rule) + collision_rule = insert_fast_sqrts(collision_rule) + + lb_method = collision_rule.method + + generate_alternating_lbm_sweep(ctx, 'UniformGridGPU_LbKernel', collision_rule, streaming_pattern, + optimization=options['optimization'], + inner_outer_split=True, varying_parameters=vp, field_swaps=field_swaps) # getter & setter - setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector, - pdfs=pdfs.center_vector, density=1.0) - getter_assignments = macroscopic_values_getter(lb_method, velocity=velocity_field.center_vector, - pdfs=pdfs.center_vector, density=None) - generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments) - generate_sweep(ctx, 'UniformGridGPU_MacroGetter', getter_assignments) + setter_assignments = macroscopic_values_setter(lb_method, density=1.0, velocity=velocity_field.center_vector, + pdfs=pdfs, + streaming_pattern=streaming_pattern, + previous_timestep=Timestep.EVEN) + generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments, target='gpu') + + # Boundaries + noslip = NoSlip() + ubb = UBB((0.05, 0, 0)) + + generate_alternating_lbm_boundary(ctx, 'UniformGridGPU_NoSlip', noslip, lb_method, field_name=pdfs.name, + streaming_pattern=streaming_pattern, target='gpu') + generate_alternating_lbm_boundary(ctx, 'UniformGridGPU_UBB', ubb, lb_method, field_name=pdfs.name, + streaming_pattern=streaming_pattern, target='gpu') # communication - generate_pack_info_from_kernel(ctx, 'UniformGridGPU_PackInfo', update_rule, target='gpu') + generate_lb_pack_info(ctx, 'UniformGridGPU_PackInfo', stencil, pdfs, + streaming_pattern=streaming_pattern, target='gpu', + always_generate_separate_classes=True) infoHeaderParams = { 'stencil': stencil_str, - 'q': q, - 'configName': ctx.config, + 'streaming_pattern': streaming_pattern, + 'collision_setup': collision_setup, 'cse_global': int(options['optimization']['cse_global']), 'cse_pdfs': int(options['optimization']['cse_pdfs']), } - ctx.write_file("UniformGridGPU_Defines.h", info_header.format(**infoHeaderParams)) + + stencil_typedefs = {'Stencil_T': stencil, + 'CommunicationStencil_T': stencil} + field_typedefs = {'PdfField_T': pdfs, + 'VelocityField_T': velocity_field} + + # Info header containing correct template definitions for stencil and field + generate_info_header(ctx, 'UniformGridGPU_InfoHeader', + stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs, + additional_code=info_header.format(**infoHeaderParams)) diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp deleted file mode 100644 index dbda68b72a5f2990965e2ff2c1e6e759c6fe0d2e..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp +++ /dev/null @@ -1,294 +0,0 @@ -#include "core/Environment.h" -#include "core/logging/Initialization.h" -#include "python_coupling/CreateConfig.h" -#include "python_coupling/PythonCallback.h" -#include "python_coupling/DictWrapper.h" -#include "blockforest/Initialization.h" -#include "field/FlagField.h" -#include "field/AddToStorage.h" -#include "field/vtk/VTKWriter.h" -#include "field/communication/PackInfo.h" -#include "lbm/PerformanceLogger.h" -#include "blockforest/communication/UniformBufferedScheme.h" -#include "timeloop/all.h" -#include "geometry/all.h" -#include "cuda/HostFieldAllocator.h" -#include "cuda/communication/GPUPackInfo.h" -#include "cuda/ParallelStreams.h" -#include "core/timing/TimingPool.h" -#include "core/timing/RemainingTimeLogger.h" -#include "cuda/AddGPUFieldToStorage.h" -#include "cuda/communication/UniformGPUScheme.h" -#include "cuda/DeviceSelectMPI.h" -#include "domain_decomposition/SharedSweep.h" -#include "InitShearVelocity.h" -#include "gui/Gui.h" - -#ifdef WALBERLA_ENABLE_GUI -#include "lbm/gui/PdfFieldDisplayAdaptor.h" -#endif - - -#include "UniformGridGPU_AA_PackInfoPush.h" -#include "UniformGridGPU_AA_PackInfoPull.h" -#include "UniformGridGPU_AA_MacroSetter.h" -#include "UniformGridGPU_AA_MacroGetter.h" -#include "UniformGridGPU_AA_LbKernelEven.h" -#include "UniformGridGPU_AA_LbKernelOdd.h" -#include "UniformGridGPU_AA_Defines.h" - -#include <cmath> - -using namespace walberla; - -using CommunicationStencil_T = Stencil_T; -using PdfField_T = GhostLayerField< real_t, Stencil_T::Q >; -using VelocityField_T = GhostLayerField< real_t, 3 >; - - -int main( int argc, char **argv ) -{ - mpi::Environment env( argc, argv ); - cuda::selectDeviceBasedOnMpiRank(); - - for ( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg ) - { - WALBERLA_MPI_WORLD_BARRIER(); - - WALBERLA_CUDA_CHECK( cudaPeekAtLastError() ); - - auto config = *cfg; - logging::configureLogging( config ); - auto blocks = blockforest::createUniformBlockGridFromConfig( config ); - - Vector3< uint_t > cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter< Vector3< uint_t > >( "cellsPerBlock" ); - // Reading parameters - auto parameters = config->getOneBlock( "Parameters" ); - const real_t omega = parameters.getParameter< real_t >( "omega", real_c( 1.4 )); - const uint_t timesteps = parameters.getParameter< uint_t >( "timesteps", uint_c( 50 )); - - // Creating fields - BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t( std::nan("") ), field::fzyx ); - BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx ); - - WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" ); - initShearVelocity( blocks, velFieldCpuID ); - - pystencils::UniformGridGPU_AA_MacroGetter getterSweep( pdfFieldCpuID, velFieldCpuID ); - pystencils::UniformGridGPU_AA_MacroSetter setterSweep( pdfFieldCpuID, velFieldCpuID ); - - for ( auto &block : *blocks ) - setterSweep( &block ); - - BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage< PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true ); - - Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1)); - - for(uint_t i=0; i< 3; ++i) - { - if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) { - WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock"); - } - } - - Cell innerOuterSplitCell (innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]); - bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false ); - Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1)); - - int streamHighPriority = 0; - int streamLowPriority = 0; - WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange( &streamLowPriority, &streamHighPriority )); - WALBERLA_CHECK( gpuBlockSize[2] == 1 ); - - - using KernelEven = pystencils::UniformGridGPU_AA_LbKernelEven; - using KernelOdd = pystencils::UniformGridGPU_AA_LbKernelOdd; - using PackInfoPull = pystencils::UniformGridGPU_AA_PackInfoPull; - using PackInfoPush = pystencils::UniformGridGPU_AA_PackInfoPush; - using cuda::communication::UniformGPUScheme; - - KernelEven kernelEven( pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], innerOuterSplitCell ); - KernelOdd kernelOdd ( pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], innerOuterSplitCell ); - - kernelEven.setOuterPriority( streamHighPriority ); - kernelOdd .setOuterPriority( streamHighPriority ); - - auto pullScheme = make_shared< UniformGPUScheme< Stencil_T > >( blocks, cudaEnabledMPI ); - auto pushScheme = make_shared< UniformGPUScheme< Stencil_T > >( blocks, cudaEnabledMPI ); - pullScheme->addPackInfo( make_shared< PackInfoPull >( pdfFieldGpuID ) ); - pushScheme->addPackInfo( make_shared< PackInfoPush >( pdfFieldGpuID ) ); - - - auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority ); - - auto setupPhase = [&]() { - for ( auto &block: *blocks ) - kernelEven( &block ); - - pullScheme->communicate(); - - for ( auto &block: *blocks ) - kernelOdd( &block ); - }; - - - auto tearDownPhase = [&]() { - pushScheme->communicate(); - cuda::fieldCpy< PdfField_T, cuda::GPUField< real_t > >( blocks, pdfFieldCpuID, pdfFieldGpuID ); - for ( auto &block : *blocks ) - getterSweep( &block ); - }; - - - auto simpleOverlapTimeStep = [&]() - { - // Even - pushScheme->startCommunication( defaultStream ); - for ( auto &block: *blocks ) - kernelEven.inner( &block, defaultStream ); - pushScheme->wait( defaultStream ); - for ( auto &block: *blocks ) - kernelEven.outer( &block, defaultStream ); - - // Odd - pullScheme->startCommunication( defaultStream ); - for ( auto &block: *blocks ) - kernelOdd.inner( &block, defaultStream ); - pullScheme->wait( defaultStream ); - for ( auto &block: *blocks ) - kernelOdd.outer( &block, defaultStream ); - }; - - auto normalTimeStep = [&]() - { - pushScheme->communicate( defaultStream ); - for ( auto &block: *blocks ) - kernelEven( &block, defaultStream ); - - pullScheme->communicate( defaultStream ); - for ( auto &block: *blocks ) - kernelOdd( &block, defaultStream ); - }; - - auto kernelOnlyFunc = [&]() - { - for ( auto &block: *blocks ) - kernelEven( &block, defaultStream ); - for ( auto &block: *blocks ) - kernelOdd( &block, defaultStream ); - }; - - SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps / 2 ); - - const std::string timeStepStrategy = parameters.getParameter< std::string >( "timeStepStrategy", "normal" ); - std::function< void() > timeStep; - if ( timeStepStrategy == "noOverlap" ) - timeStep = std::function< void() >( normalTimeStep ); - else if ( timeStepStrategy == "simpleOverlap" ) - timeStep = simpleOverlapTimeStep; - else if ( timeStepStrategy == "kernelOnly" ) - { - WALBERLA_LOG_INFO_ON_ROOT( "Running only compute kernel without boundary - this makes only sense for benchmarking!" ) - timeStep = kernelOnlyFunc; - } - else - { - WALBERLA_ABORT_NO_DEBUG_INFO( - "Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', 'complexOverlap', 'simpleOverlap', 'kernelOnly'" ); - } - - timeLoop.add() << BeforeFunction( timeStep ) - << Sweep( []( IBlock * ) {}, "time step" ); - - - // VTK - uint_t vtkWriteFrequency = parameters.getParameter< uint_t >( "vtkWriteFrequency", 0 ); - if ( vtkWriteFrequency > 0 ) - { - auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", - "simulation_step", false, true, true, false, 0 ); - auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >( velFieldCpuID, "vel" ); - vtkOutput->addCellDataWriter( velWriter ); - vtkOutput->addBeforeFunction( [&]() - { - tearDownPhase(); - setupPhase(); - } ); - timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" ); - } - - int warmupSteps = parameters.getParameter< int >( "warmupSteps", 2 ); - int outerIterations = parameters.getParameter< int >( "outerIterations", 1 ); - setupPhase(); - for ( int i = 0; i < warmupSteps; ++i ) - timeLoop.singleStep(); - - double remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds - if ( remainingTimeLoggerFrequency > 0 ) - { - auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * uint_c(outerIterations), remainingTimeLoggerFrequency ); - timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" ); - } - - bool useGui = parameters.getParameter<bool>( "useGui", false ); - if( useGui ) - { -#ifdef WALBERLA_ENABLE_GUI - cuda::fieldCpy< PdfField_T, cuda::GPUField< real_t > >( blocks, pdfFieldCpuID, pdfFieldGpuID ); - timeLoop.addFuncAfterTimeStep( cuda::fieldCpyFunctor<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID ), "copy to CPU" ); - GUI gui( timeLoop, blocks, argc, argv); - gui.registerDisplayAdaptorCreator( - [&](const IBlock & block, ConstBlockDataID blockDataID) -> gui::DisplayAdaptor * { - if ( block.isDataOfType< PdfField_T >( blockDataID) ) - return new lbm::PdfFieldDisplayAdaptor<GhostLayerField<real_t, Stencil_T::Q>, Stencil_T >( blockDataID ); - return nullptr; - }); - gui.run(); -#else - WALBERLA_ABORT_NO_DEBUG_INFO("Application was built without GUI. Set useGui to false or re-compile with GUI.") -#endif - } - else - { - for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration ) - { - WALBERLA_CUDA_CHECK( cudaPeekAtLastError() ); - - timeLoop.setCurrentTimeStepToZero(); - WcTimer simTimer; - cudaDeviceSynchronize(); - WALBERLA_CUDA_CHECK( cudaPeekAtLastError() ); - WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" ); - simTimer.start(); - timeLoop.run(); - cudaDeviceSynchronize(); - simTimer.end(); - WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" ); - auto time = simTimer.last(); - auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] ); - - auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6; - WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess ); - WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps )); - WALBERLA_ROOT_SECTION() - { - python_coupling::PythonCallback pythonCallbackResults( "results_callback" ); - if ( pythonCallbackResults.isCallable()) - { - const char * storagePattern = "aa"; - pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess ); - pythonCallbackResults.data().exposeValue( "stencil", infoStencil ); - pythonCallbackResults.data().exposeValue( "configName", infoConfigName ); - pythonCallbackResults.data().exposeValue( "storagePattern", storagePattern ); - pythonCallbackResults.data().exposeValue( "cse_global", infoCseGlobal ); - pythonCallbackResults.data().exposeValue( "cse_pdfs", infoCsePdfs ); - // Call Python function to report results - pythonCallbackResults(); - } - } - } - } - } - - return 0; -} diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py deleted file mode 100644 index c7e6341ae6d9a125e09427dec23f044e212709f8..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py +++ /dev/null @@ -1,128 +0,0 @@ -import sympy as sp -import numpy as np -import pystencils as ps -from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule -from lbmpy.fieldaccess import AAEvenTimeStepAccessor, AAOddTimeStepAccessor -from pystencils_walberla import generate_pack_info_from_kernel -from pystencils_walberla import CodeGeneration, generate_sweep -from pystencils.data_types import TypedSymbol -from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions -from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter - -omega = sp.symbols("omega") -omega_free = sp.Symbol("omega_free") -compile_time_block_size = False - -if compile_time_block_size: - sweep_block_size = (128, 1, 1) -else: - sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32), - TypedSymbol("cudaBlockSize1", np.int32), - 1) - -sweep_params = {'block_size': sweep_block_size} - -options_dict = { - 'srt': { - 'method': 'srt', - 'stencil': 'D3Q19', - 'relaxation_rate': omega, - 'compressible': False, - }, - 'trt': { - 'method': 'trt', - 'stencil': 'D3Q19', - 'relaxation_rate': omega, - }, - 'mrt': { - 'method': 'mrt', - 'stencil': 'D3Q19', - 'relaxation_rates': [omega, 1.3, 1.4, omega, 1.2, 1.1], - }, - 'entropic': { - 'method': 'mrt', - 'stencil': 'D3Q19', - 'compressible': True, - 'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free], - 'entropic': True, - }, - 'smagorinsky': { - 'method': 'srt', - 'stencil': 'D3Q19', - 'smagorinsky': True, - 'relaxation_rate': omega, - } -} - - -info_header = """ -#include "stencil/D3Q{q}.h"\nusing Stencil_T = walberla::stencil::D3Q{q}; -const char * infoStencil = "{stencil}"; -const char * infoConfigName = "{configName}"; -const bool infoCseGlobal = {cse_global}; -const bool infoCsePdfs = {cse_pdfs}; -""" - - -with CodeGeneration() as ctx: - accessors = { - 'Even': AAEvenTimeStepAccessor(), - 'Odd': AAOddTimeStepAccessor() - } - - common_options = { - 'field_name': 'pdfs', - 'optimization': {'cse_global': True, - 'cse_pdfs': False, - 'field_layout': 'fzyx', - } - } - options = options_dict.get(ctx.config, options_dict['srt']) - options.update(common_options) - - stencil_str = options['stencil'] - q = int(stencil_str[stencil_str.find('Q') + 1:]) - pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx') - options['optimization']['symbolic_field'] = pdfs - - vp = [ - ('int32_t', 'cudaBlockSize0'), - ('int32_t', 'cudaBlockSize1') - ] - lb_method = create_lb_method(**options) - - # Kernels - options_without_opt = options.copy() - del options_without_opt['optimization'] - update_rules = {} - for name, accessor in accessors.items(): - update_rule = create_lb_update_rule(lb_method=lb_method, kernel_type=accessor, **options) - update_rule = insert_fast_divisions(update_rule) - update_rule = insert_fast_sqrts(update_rule) - update_rules[name] = update_rule - generate_sweep(ctx, 'UniformGridGPU_AA_LbKernel' + name, update_rule, - inner_outer_split=True, target='gpu', gpu_indexing_params=sweep_params, - varying_parameters=vp) - - # getter & setter - setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector, - pdfs=pdfs.center_vector, density=1.0) - getter_assignments = macroscopic_values_getter(lb_method, velocity=velocity_field.center_vector, - pdfs=pdfs.center_vector, density=None) - generate_sweep(ctx, 'UniformGridGPU_AA_MacroSetter', setter_assignments) - generate_sweep(ctx, 'UniformGridGPU_AA_MacroGetter', getter_assignments) - - # communication - generate_pack_info_from_kernel(ctx, 'UniformGridGPU_AA_PackInfoPull', update_rules['Odd'], - kind='pull', target='gpu') - generate_pack_info_from_kernel(ctx, 'UniformGridGPU_AA_PackInfoPush', update_rules['Odd'], - kind='push', target='gpu') - - infoHeaderParams = { - 'stencil': stencil_str, - 'q': q, - 'configName': ctx.config, - 'cse_global': int(options['optimization']['cse_global']), - 'cse_pdfs': int(options['optimization']['cse_pdfs']), - } - ctx.write_file("UniformGridGPU_AA_Defines.h", info_header.format(**infoHeaderParams)) diff --git a/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.cpp new file mode 100644 index 0000000000000000000000000000000000000000..48fca7135ba30b8f546b421c1329e847e0b6d129 --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.cpp @@ -0,0 +1,328 @@ +#include "core/Environment.h" +#include "core/logging/Initialization.h" +#include "core/math/Random.h" +#include "python_coupling/CreateConfig.h" +#include "python_coupling/PythonCallback.h" +#include "python_coupling/DictWrapper.h" +#include "blockforest/Initialization.h" +#include "field/FlagField.h" +#include "field/AddToStorage.h" +#include "field/vtk/VTKWriter.h" +#include "field/communication/PackInfo.h" +#include "lbm/PerformanceLogger.h" +#include "blockforest/communication/UniformBufferedScheme.h" +#include "timeloop/all.h" +#include "core/math/Random.h" +#include "geometry/all.h" +#include "cuda/HostFieldAllocator.h" +#include "cuda/communication/GPUPackInfo.h" +#include "cuda/ParallelStreams.h" +#include "cuda/NVTX.h" +#include "core/timing/TimingPool.h" +#include "core/timing/RemainingTimeLogger.h" +#include "cuda/AddGPUFieldToStorage.h" +#include "cuda/communication/UniformGPUScheme.h" +#include "cuda/DeviceSelectMPI.h" +#include "domain_decomposition/SharedSweep.h" + +#include "UniformGridGPU_LatticeModel.h" +#include "UniformGridGPU_LbKernel.h" +#include "UniformGridGPU_PackInfo.h" +#include "UniformGridGPU_UBB.h" +#include "UniformGridGPU_NoSlip.h" +#include "UniformGridGPU_Communication.h" +#include "UniformGridGPU_MacroSetter.h" +#include "UniformGridGPU_MacroGetter.h" +#include "UniformGridGPU_Defines.h" + +#include "InitShearVelocity.h" + + +using namespace walberla; + +using LatticeModel_T = lbm::UniformGridGPU_LatticeModel; + +const auto Q = LatticeModel_T::Stencil::Q; + + +using Stencil_T = LatticeModel_T::Stencil; +using CommunicationStencil_T = LatticeModel_T::CommunicationStencil; +using PdfField_T = GhostLayerField<real_t, Q>; +using CommScheme_T = cuda::communication::UniformGPUScheme<CommunicationStencil_T>; +using VelocityField_T = GhostLayerField<real_t, 3>; +using flag_t = walberla::uint8_t; +using FlagField_T = FlagField<flag_t>; + +int main( int argc, char **argv ) +{ + mpi::Environment env( argc, argv ); + cuda::selectDeviceBasedOnMpiRank(); + + for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg ) + { + WALBERLA_MPI_WORLD_BARRIER(); + + auto config = *cfg; + logging::configureLogging( config ); + auto blocks = blockforest::createUniformBlockGridFromConfig( config ); + + Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t> >( "cellsPerBlock" ); + // Reading parameters + auto parameters = config->getOneBlock( "Parameters" ); + const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal"); + const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 )); + const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 )); + const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", true); + + // Creating fields + BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(0), field::fzyx); + BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t(0), field::fzyx); + + if( timeStepStrategy != "kernelOnlyNoInit") + { + if ( initShearFlow ) + { + WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" ); + initShearVelocity( blocks, velFieldCpuID ); + } + + pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldCpuID, velFieldCpuID); + for( auto & block : *blocks ) + setterSweep( &block ); + + // setter sweep only initializes interior of domain - for push schemes to work a first communication is required here + blockforest::communication::UniformBufferedScheme<CommunicationStencil_T> initialComm(blocks); + initialComm.addPackInfo( make_shared< field::communication::PackInfo<PdfField_T> >( pdfFieldCpuID ) ); + initialComm(); + } + + BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true ); + BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" ); + + + // Boundaries + const FlagUID fluidFlagUID( "Fluid" ); + auto boundariesConfig = config->getBlock( "Boundaries" ); + bool disableBoundaries = true; + if( boundariesConfig ) + { + disableBoundaries = false; + geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig ); + geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID ); + } + + lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID); + lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID); + + ubb.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID ); + noSlip.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID ); + + // Communication setup + bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false ); + Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1)); + const std::string communicationSchemeStr = parameters.getParameter<std::string>("communicationScheme", "UniformGPUScheme_Baseline"); + CommunicationSchemeType communicationScheme; + if( communicationSchemeStr == "GPUPackInfo_Baseline") + communicationScheme = GPUPackInfo_Baseline; + else if (communicationSchemeStr == "GPUPackInfo_Streams") + communicationScheme = GPUPackInfo_Streams; + else if (communicationSchemeStr == "UniformGPUScheme_Baseline") + communicationScheme = UniformGPUScheme_Baseline; + else if (communicationSchemeStr == "UniformGPUScheme_Memcpy") + communicationScheme = UniformGPUScheme_Memcpy; + else if (communicationSchemeStr == "MPIDatatypes") + communicationScheme = MPIDatatypes; + else if (communicationSchemeStr == "MPIDatatypesFull") + communicationScheme = MPIDatatypesFull; + else { + WALBERLA_ABORT_NO_DEBUG_INFO("Invalid choice for communicationScheme") + } + + Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1)); + for(uint_t i=0; i< 3; ++i) + { + if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) { + WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock"); + } + } + + int streamHighPriority = 0; + int streamLowPriority = 0; + WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority) ); + WALBERLA_CHECK(gpuBlockSize[2] == 1); + pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega, + 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, + gpuBlockSize[0], gpuBlockSize[1], + Cell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]) ); + lbKernel.setOuterPriority( streamHighPriority ); + UniformGridGPU_Communication< CommunicationStencil_T, cuda::GPUField< double > > + gpuComm( blocks, pdfFieldGpuID, (CommunicationSchemeType) communicationScheme, cudaEnabledMPI ); + + auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority ); + auto innerOuterStreams = cuda::ParallelStreams( streamHighPriority ); + auto boundaryOuterStreams = cuda::ParallelStreams( streamHighPriority ); + auto boundaryInnerStreams = cuda::ParallelStreams( streamHighPriority ); + + uint_t currentTimeStep = 0; + + auto simpleOverlapTimeStep = [&] () + { + gpuComm.startCommunication(defaultStream); + for( auto &block: *blocks ) + lbKernel.inner( &block, defaultStream ); + gpuComm.wait(defaultStream); + for( auto &block: *blocks ) + lbKernel.outer( &block, defaultStream ); + }; + + auto overlapTimeStep = [&]() + { + cuda::NvtxRange namedRange("timestep"); + auto innerOuterSection = innerOuterStreams.parallelSection( defaultStream ); + + innerOuterSection.run([&]( auto innerStream ) + { + cuda::nameStream(innerStream, "inner stream"); + for( auto &block: *blocks ) + { + if(!disableBoundaries) + { + auto p = boundaryInnerStreams.parallelSection( innerStream ); + p.run( [&block, &ubb]( cudaStream_t s ) { ubb.inner( &block, s ); } ); + p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip.inner( &block, s ); } ); + } + lbKernel.inner( &block, innerStream ); + } + }); + + innerOuterSection.run([&]( auto outerStream ) + { + cuda::nameStream(outerStream, "outer stream"); + gpuComm( outerStream ); + + for( auto &block: *blocks ) + { + if(!disableBoundaries) + { + auto p = boundaryOuterStreams.parallelSection( outerStream ); + p.run( [&block, &ubb]( cudaStream_t s ) { ubb.outer( &block, s ); } ); + p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip.outer( &block, s ); } ); + } + lbKernel.outer( &block, outerStream ); + } + }); + currentTimeStep += 1; + }; + + + auto boundaryStreams = cuda::ParallelStreams( streamHighPriority ); + auto normalTimeStep = [&]() + { + gpuComm(); + for( auto &block: *blocks ) + { + if(!disableBoundaries) + { + auto p = boundaryStreams.parallelSection( defaultStream ); + p.run( [&block, &ubb]( cudaStream_t s ) { ubb( &block, s ); } ); + p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip( &block, s ); } ); + } + lbKernel( &block ); + } + }; + + auto kernelOnlyFunc = [&] () + { + for( auto &block: *blocks ) + lbKernel( &block ); + }; + + SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps ); + + std::function<void()> timeStep; + if (timeStepStrategy == "noOverlap") + timeStep = std::function<void()>( normalTimeStep ); + else if (timeStepStrategy == "complexOverlap") + timeStep = std::function<void()>( overlapTimeStep ); + else if (timeStepStrategy == "simpleOverlap") + timeStep = simpleOverlapTimeStep; + else if (timeStepStrategy == "kernelOnly" or timeStepStrategy == "kernelOnlyNoInit") { + WALBERLA_LOG_INFO_ON_ROOT("Running only compute kernel without boundary - this makes only sense for benchmarking!") + timeStep = kernelOnlyFunc; + } + else { + WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', 'complexOverlap', 'simpleOverlap', 'kernelOnly'"); + } + + timeLoop.add() << BeforeFunction( timeStep ) + << Sweep( []( IBlock * ) {}, "time step" ); + + pystencils::UniformGridGPU_MacroGetter getterSweep( pdfFieldCpuID, velFieldCpuID ); + + // VTK + uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 ); + if( vtkWriteFrequency > 0 ) + { + auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", + "simulation_step", false, true, true, false, 0 ); + auto velWriter = make_shared< field::VTKWriter<VelocityField_T> >(velFieldCpuID, "vel"); + vtkOutput->addCellDataWriter(velWriter); + vtkOutput->addBeforeFunction( [&]() { + cuda::fieldCpy<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID ); + for( auto & block : *blocks ) + getterSweep( &block ); + }); + timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" ); + } + + + + int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 ); + int outerIterations = parameters.getParameter<int>( "outerIterations", 1 ); + for(int i=0; i < warmupSteps; ++i ) + timeLoop.singleStep(); + + auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds + if (remainingTimeLoggerFrequency > 0) { + auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * uint_c( outerIterations ), remainingTimeLoggerFrequency ); + timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" ); + } + + for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration ) + { + timeLoop.setCurrentTimeStepToZero(); + WcTimer simTimer; + cudaDeviceSynchronize(); + WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" ); + simTimer.start(); + timeLoop.run(); + cudaDeviceSynchronize(); + simTimer.end(); + WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" ); + auto time = simTimer.last(); + auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] ); + auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6; + WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess ); + WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps )); + WALBERLA_ROOT_SECTION() + { + python_coupling::PythonCallback pythonCallbackResults( "results_callback" ); + if ( pythonCallbackResults.isCallable()) + { + const char * storagePattern = "twofield"; + pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess ); + pythonCallbackResults.data().exposeValue( "stencil", infoStencil ); + pythonCallbackResults.data().exposeValue( "configName", infoConfigName ); + pythonCallbackResults.data().exposeValue( "storagePattern", storagePattern ); + pythonCallbackResults.data().exposeValue( "cse_global", infoCseGlobal ); + pythonCallbackResults.data().exposeValue( "cse_pdfs", infoCsePdfs ); + // Call Python function to report results + pythonCallbackResults(); + } + } + } + + } + + return 0; +} diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.prm similarity index 100% rename from apps/benchmarks/UniformGridGPU/UniformGridGPU.prm rename to apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.prm diff --git a/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.py new file mode 100644 index 0000000000000000000000000000000000000000..c861cb15578a7af152e382c0b7d4a607c52181f5 --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.py @@ -0,0 +1,175 @@ +import sympy as sp +import numpy as np +import pystencils as ps +from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule, create_lb_collision_rule +from lbmpy.boundaries import NoSlip, UBB +from lbmpy.fieldaccess import StreamPullTwoFieldsAccessor +from pystencils_walberla import generate_pack_info_from_kernel +from lbmpy_walberla import generate_lattice_model, generate_boundary +from pystencils_walberla import CodeGeneration, generate_sweep +from pystencils.data_types import TypedSymbol +from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions +from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter + +omega = sp.symbols("omega") +omega_free = sp.Symbol("omega_free") +omega_fill = sp.symbols("omega_:10") +compile_time_block_size = False + +if compile_time_block_size: + sweep_block_size = (128, 1, 1) +else: + sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32), + TypedSymbol("cudaBlockSize1", np.int32), + 1) + +sweep_params = {'block_size': sweep_block_size} + +options_dict = { + 'srt': { + 'method': 'srt', + 'stencil': 'D3Q19', + 'relaxation_rate': omega, + 'compressible': False, + }, + 'trt': { + 'method': 'trt', + 'stencil': 'D3Q19', + 'relaxation_rate': omega, + }, + 'mrt': { + 'method': 'mrt', + 'stencil': 'D3Q19', + 'relaxation_rates': [omega, 1.3, 1.4, 1.2, 1.1, 1.15, 1.234, 1.4235], + }, + 'mrt_full': { + 'method': 'mrt', + 'stencil': 'D3Q19', + 'relaxation_rates': [omega_fill[0], omega, omega_fill[1], omega_fill[2], + omega_fill[3], omega_fill[4], omega_fill[5]], + }, + 'entropic': { + 'method': 'mrt', + 'stencil': 'D3Q19', + 'compressible': True, + 'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free, omega_free], + 'entropic': True, + }, + 'entropic_kbc_n4': { + 'method': 'trt-kbc-n4', + 'stencil': 'D3Q27', + 'compressible': True, + 'relaxation_rates': [omega, omega_free], + 'entropic': True, + }, + 'smagorinsky': { + 'method': 'srt', + 'stencil': 'D3Q19', + 'smagorinsky': True, + 'relaxation_rate': omega, + }, + 'cumulant': { + 'method': 'cumulant', + 'stencil': 'D3Q19', + 'compressible': True, + 'relaxation_rate': omega, + }, +} + +info_header = """ +#include "stencil/D3Q{q}.h"\nusing Stencil_T = walberla::stencil::D3Q{q}; +const char * infoStencil = "{stencil}"; +const char * infoConfigName = "{configName}"; +const bool infoCseGlobal = {cse_global}; +const bool infoCsePdfs = {cse_pdfs}; +""" + +with CodeGeneration() as ctx: + accessor = StreamPullTwoFieldsAccessor() + # accessor = StreamPushTwoFieldsAccessor() + assert not accessor.is_inplace, "This app does not work for inplace accessors" + + common_options = { + 'field_name': 'pdfs', + 'temporary_field_name': 'pdfs_tmp', + 'kernel_type': accessor, + 'optimization': {'cse_global': True, + 'cse_pdfs': False} + } + config_name = ctx.config + noopt = False + d3q27 = False + if config_name.endswith("_noopt"): + noopt = True + config_name = config_name[:-len("_noopt")] + if config_name.endswith("_d3q27"): + d3q27 = True + config_name = config_name[:-len("_d3q27")] + + options = options_dict[config_name] + options.update(common_options) + options = options.copy() + + if noopt: + options['optimization']['cse_global'] = False + options['optimization']['cse_pdfs'] = False + if d3q27: + options['stencil'] = 'D3Q27' + + stencil_str = options['stencil'] + q = int(stencil_str[stencil_str.find('Q') + 1:]) + pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx') + options['optimization']['symbolic_field'] = pdfs + + vp = [ + ('double', 'omega_0'), + ('double', 'omega_1'), + ('double', 'omega_2'), + ('double', 'omega_3'), + ('double', 'omega_4'), + ('double', 'omega_5'), + ('double', 'omega_6'), + ('int32_t', 'cudaBlockSize0'), + ('int32_t', 'cudaBlockSize1'), + ] + lb_method = create_lb_method(**options) + update_rule = create_lb_update_rule(lb_method=lb_method, **options) + + if not noopt: + update_rule = insert_fast_divisions(update_rule) + update_rule = insert_fast_sqrts(update_rule) + + # CPU lattice model - required for macroscopic value computation, VTK output etc. + options_without_opt = options.copy() + del options_without_opt['optimization'] + generate_lattice_model(ctx, 'UniformGridGPU_LatticeModel', create_lb_collision_rule(lb_method=lb_method, + **options_without_opt)) + + # gpu LB sweep & boundaries + generate_sweep(ctx, 'UniformGridGPU_LbKernel', update_rule, + field_swaps=[('pdfs', 'pdfs_tmp')], + inner_outer_split=True, target='gpu', gpu_indexing_params=sweep_params, + varying_parameters=vp) + + generate_boundary(ctx, 'UniformGridGPU_NoSlip', NoSlip(), lb_method, target='gpu') + generate_boundary(ctx, 'UniformGridGPU_UBB', UBB([0.05, 0, 0]), lb_method, target='gpu') + + # getter & setter + setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector, + pdfs=pdfs.center_vector, density=1.0) + getter_assignments = macroscopic_values_getter(lb_method, velocity=velocity_field.center_vector, + pdfs=pdfs.center_vector, density=None) + generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments) + generate_sweep(ctx, 'UniformGridGPU_MacroGetter', getter_assignments) + + # communication + generate_pack_info_from_kernel(ctx, 'UniformGridGPU_PackInfo', update_rule, target='gpu') + + infoHeaderParams = { + 'stencil': stencil_str, + 'q': q, + 'configName': ctx.config, + 'cse_global': int(options['optimization']['cse_global']), + 'cse_pdfs': int(options['optimization']['cse_pdfs']), + } + ctx.write_file("UniformGridGPU_Defines.h", info_header.format(**infoHeaderParams)) diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU_Communication.h similarity index 100% rename from apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h rename to apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU_Communication.h diff --git a/apps/benchmarks/UniformGridGPU/old_ideas/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridGPU/old_ideas/simulation_setup/benchmark_configs.py new file mode 100755 index 0000000000000000000000000000000000000000..d13e155f7dd5903465d7a9274bf7ae8200148582 --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/old_ideas/simulation_setup/benchmark_configs.py @@ -0,0 +1,281 @@ +#!/usr/bin/env python3 +""" +This is a waLBerla parameter file that tests (almost) all parameter combinations for GPU communication. +Build waLBerla with -DWALBERLA_BUILD_WITH_PYTHON=1 then run e.g. + ./UniformGridGPU_d3q27_aa_srt simulation_setup/benchmark_configs.py + +Look at the end of the file to select the benchmark to run +""" + +import os +import waLBerla as wlb +from waLBerla.tools.config import block_decomposition +from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle +from copy import deepcopy +from functools import reduce +import operator +import sys +import sqlite3 + +# Number of time steps run for a workload of 128^3 per GPU +# if double as many cells are on the GPU, half as many time steps are run etc. +# increase this to get more reliable measurements +TIME_STEPS_FOR_128_BLOCK = 200 +DB_FILE = "gpu_benchmark.sqlite3" + +BASE_CONFIG = { + 'DomainSetup': { + 'cellsPerBlock': (256, 128, 128), + 'periodic': (1, 1, 1), + }, + 'Parameters': { + 'omega': 1.8, + 'cudaEnabledMPI': False, + 'warmupSteps': 5, + 'outerIterations': 3, + } +} + + +def prod(seq): + return reduce(operator.mul, seq, 1) + + +def num_time_steps(block_size, time_steps_for_128_block=200): + cells = block_size[0] * block_size[1] * block_size[2] + time_steps = (128 ** 3 / cells) * time_steps_for_128_block + return int(time_steps) + + +def cuda_block_size_ok(block_size, regs_per_threads=168): + """Checks if a given CUDA block size does not exceed the SM register limit. + 168 registers per thread was obtained using cuobjdump on both SRT and Cumulant + kernels. You might want to validate that for your own kernels.""" + + return prod(block_size) * regs_per_threads < 64 * (2 ** 10) + + +def domain_block_size_ok(block_size, total_mem, gls=1, q=27, size_per_value=8): + """Checks if a single block of given size fits into GPU memory""" + return prod(b + 2 * gls for b in block_size) * q * size_per_value < total_mem + + +class Scenario: + def __init__(self, cells_per_block=(256, 128, 128), db_file=DB_FILE, **kwargs): + self.db_file = db_file + self.config_dict = deepcopy(BASE_CONFIG) + self.config_dict['Parameters'].update(kwargs) + self.config_dict['DomainSetup']['blocks'] = block_decomposition(wlb.mpi.numProcesses()) + self.config_dict['DomainSetup']['cellsPerBlock'] = cells_per_block + + @wlb.member_callback + def config(self): + from pprint import pformat + wlb.log_info_on_root("Scenario:\n" + pformat(self.config_dict)) + # Write out the configuration as text-based prm: + # print(toPrm(self.config_dict)) + return self.config_dict + + @wlb.member_callback + def results_callback(self, **kwargs): + data = {} + data.update(self.config_dict['Parameters']) + data.update(self.config_dict['DomainSetup']) + data.update(kwargs) + data['executable'] = sys.argv[0] + data['compile_flags'] = wlb.build_info.compiler_flags + data['walberla_version'] = wlb.build_info.version + data['build_machine'] = wlb.build_info.build_machine + sequenceValuesToScalars(data) + + result = data + sequenceValuesToScalars(result) + num_tries = 4 + # check multiple times e.g. may fail when multiple benchmark processes are running + for num_try in range(num_tries): + try: + checkAndUpdateSchema(result, "runs", self.db_file) + storeSingle(result, "runs", self.db_file) + break + except sqlite3.OperationalError as e: + wlb.log_warning("Sqlite DB writing failed: try {}/{} {}".format(num_try + 1, num_tries, str(e))) + + +# -------------------------------------- Functions trying different parameter sets ----------------------------------- + + +def overlap_benchmark(): + """Tests different communication overlapping strategies""" + wlb.log_info_on_root("Running different communication overlap strategies") + wlb.log_info_on_root("") + + scenarios = wlb.ScenarioManager() + inner_outer_splits = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1), + (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1), + (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)] + + # 'GPUPackInfo_Baseline', 'GPUPackInfo_Streams' + for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']: + # no overlap + scenarios.add(Scenario(timeStepStrategy='noOverlap', communicationScheme=comm_strategy, + innerOuterSplit=(1, 1, 1))) + + # overlap + for overlap_strategy in ['simpleOverlap', 'complexOverlap']: + for inner_outer_split in inner_outer_splits: + scenario = Scenario(timeStepStrategy=overlap_strategy, + communicationScheme=comm_strategy, + innerOuterSplit=inner_outer_split, + timesteps=num_time_steps((256, 128, 128))) + scenarios.add(scenario) + + +def communication_compare(): + """Tests different communication strategies""" + wlb.log_info_on_root("Running benchmarks to compare communication strategies") + wlb.log_info_on_root("") + + scenarios = wlb.ScenarioManager() + for block_size in [(128, 128, 128), (32, 32, 32), (64, 64, 64), (256, 256, 256)]: + for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']: + + sc = Scenario(cells_per_block=block_size, + gpuBlockSize=(128, 1, 1), + timeStepStrategy='noOverlap', + communicationScheme=comm_strategy, + timesteps=num_time_steps(block_size)) + scenarios.add(sc) + for inner_outer_split in [(4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1)]: + # ensure that the inner part of the domain is still large enough + if 3 * inner_outer_split[0] > block_size[0]: + continue + sc = Scenario(cells_per_block=block_size, + gpuBlockSize=(128, 1, 1), + timeStepStrategy='simpleOverlap', + innerOuterSplit=inner_outer_split, + communicationScheme=comm_strategy, + timesteps=num_time_steps(block_size)) + scenarios.add(sc) + + +def single_gpu_benchmark(): + """Benchmarks only the LBM compute kernel""" + wlb.log_info_on_root("Running single GPU benchmarks") + wlb.log_info_on_root("") + + gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8)) + gpu_mem = gpu_mem_gb * (2 ** 30) + gpu_type = os.environ.get('GPU_TYPE') + + kwargs = {} + if gpu_type is not None: + kwargs['gpu_type'] = gpu_type + + scenarios = wlb.ScenarioManager() + block_sizes = [(i, i, i) for i in (64, 128, 256, 384)] + [(512, 512, 128)] + cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1), + (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1), + (32, 4, 1), (64, 4, 1), (128, 4, 1), + (32, 8, 1), (64, 8, 1), + (32, 16, 1)] + for block_size in block_sizes: + for cuda_block_size in cuda_blocks: + if not cuda_block_size_ok(cuda_block_size): + wlb.log_info_on_root(f"Cuda block size {cuda_block_size} would exceed register limit. Skipping.") + continue + if not domain_block_size_ok(block_size, gpu_mem): + wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.") + continue + scenario = Scenario(cells_per_block=block_size, + gpuBlockSize=cuda_block_size, + timeStepStrategy='kernelOnly', + timesteps=num_time_steps(block_size), + **kwargs) + scenarios.add(scenario) + + +# -------------------------------------- Optional job script generation for PizDaint --------------------------------- + + +job_script_header = """ +#!/bin/bash -l +#SBATCH --job-name=scaling +#SBATCH --time=0:30:00 +#SBATCH --nodes={nodes} +#SBATCH -o out_scaling_{nodes}_%j.txt +#SBATCH -e err_scaling_{nodes}_%j.txt +#SBATCH --ntasks-per-core=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=1 +#SBATCH --partition=normal +#SBATCH --constraint=gpu +#SBATCH --account=d105 + +cd {folder} + +source ~/env.sh + +module load daint-gpu +module load craype-accel-nvidia60 +export MPICH_RDMA_ENABLED_CUDA=1 # allow GPU-GPU data transfer +export CRAY_CUDA_MPS=1 # allow GPU sharing +export MPICH_G2G_PIPELINE=256 # adapt maximum number of concurrent in-flight messages + +export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK +export CRAY_CUDA_MPS=1 + +export MPICH_RANK_REORDER_METHOD=3 +export PMI_MMAP_SYNC_WAIT_TIME=300 + + +# grid_order -R -H -c 1,1,8 -g 16,16,8 + +ulimit -c 0 +""" + +job_script_exe_part = """ + +export WALBERLA_SCENARIO_IDX=0 +while srun -n {nodes} ./{app} {config} +do + ((WALBERLA_SCENARIO_IDX++)) +done +""" + +all_executables = ('UniformGridBenchmarkGPU_mrt_d3q27', + 'UniformGridBenchmarkGPU_smagorinsky_d3q27', + 'UniformGridBenchmarkGPU_cumulant' + 'UniformGridBenchmarkGPU_cumulant_d3q27') + + +def generate_jobscripts(exe_names=all_executables): + for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]: + folder_name = "scaling_{:04d}".format(node_count) + os.makedirs(folder_name, exist_ok=True) + + # run grid_order + import subprocess + decomposition = block_decomposition(node_count) + decomposition_str = ",".join(str(e) for e in decomposition) + subprocess.check_call(['grid_order', '-R', '-H', '-g', decomposition_str]) + + job_script = job_script_header.format(nodes=node_count, folder=os.path.join(os.getcwd(), folder_name)) + for exe in exe_names: + job_script += job_script_exe_part.format(app="../" + exe, nodes=node_count, + config='../communication_compare.py') + + with open(os.path.join(folder_name, 'job.sh'), 'w') as f: + f.write(job_script) + + +if __name__ == '__main__': + print("Called without waLBerla - generating job scripts for PizDaint") + generate_jobscripts() +else: + wlb.log_info_on_root("Batch run of benchmark scenarios, saving result to {}".format(DB_FILE)) + # Select the benchmark you want to run + single_gpu_benchmark() + # benchmarks different CUDA block sizes and domain sizes and measures single + # GPU performance of compute kernel (no communication) + # communication_compare(): benchmarks different communication routines, with and without overlap + # overlap_benchmark(): benchmarks different communication overlap options diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/single_node.prm b/apps/benchmarks/UniformGridGPU/old_ideas/simulation_setup/single_node.prm similarity index 89% rename from apps/benchmarks/UniformGridGPU/simulation_setup/single_node.prm rename to apps/benchmarks/UniformGridGPU/old_ideas/simulation_setup/single_node.prm index ef257af8bc013da08ad5d04765e786c44a756a5e..1b22fcaaf2fe65fff1f32f162c959bd012995b97 100644 --- a/apps/benchmarks/UniformGridGPU/simulation_setup/single_node.prm +++ b/apps/benchmarks/UniformGridGPU/old_ideas/simulation_setup/single_node.prm @@ -8,6 +8,7 @@ DomainSetup Parameters { + initShearFlow False; cudaEnabledMPI False; // set to true, if MPI was compiled with CUDA outerIterations 3; // number of measurements to make timeStepStrategy simpleOverlap; // one of simpleOverlap, noOverlap, the non-AA version also supports complexOverlap @@ -35,3 +36,10 @@ Parameters // GPUPackInfo_Baseline: old implementation based on communication mechanism for CPUs // GPUPackInfo_Streams: same as above but with CUDA streams } + +Boundaries { + Border { direction T; walldistance -1; flag UBB; } + Border { direction B; walldistance -1; flag NoSlip; } + Border { direction W; walldistance -1; flag NoSlip; } + Border { direction E; walldistance -1; flag NoSlip; } +} diff --git a/apps/benchmarks/UniformGridGPU/roofline_model/roofline.ods b/apps/benchmarks/UniformGridGPU/roofline_model/roofline.ods new file mode 100644 index 0000000000000000000000000000000000000000..99ce7d0a33ff710ea40be7735533adcbb06b91d1 Binary files /dev/null and b/apps/benchmarks/UniformGridGPU/roofline_model/roofline.ods differ diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py index 5fbf36a9488874e87318ca402c747378f5a15f46..b3e7ac03396100609e509149f35ab7179712f945 100755 --- a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py +++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py @@ -2,7 +2,7 @@ """ This is a waLBerla parameter file that tests (almost) all parameter combinations for GPU communication. Build waLBerla with -DWALBERLA_BUILD_WITH_PYTHON=1 then run e.g. - ./UniformGridBenchmarkGPU_AA_trt simulation_setup/benchmark_configs.py + ./UniformGridGPU_d3q27_aa_srt simulation_setup/benchmark_configs.py Look at the end of the file to select the benchmark to run """ @@ -11,14 +11,15 @@ import os import waLBerla as wlb from waLBerla.tools.config import block_decomposition from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle -from copy import deepcopy +from functools import reduce +import operator import sys import sqlite3 # Number of time steps run for a workload of 128^3 per GPU # if double as many cells are on the GPU, half as many time steps are run etc. # increase this to get more reliable measurements -TIME_STEPS_FOR_128_BLOCK = 200 +TIME_STEPS_FOR_128_BLOCK = 500 DB_FILE = "gpu_benchmark.sqlite3" BASE_CONFIG = { @@ -35,26 +36,82 @@ BASE_CONFIG = { } -def num_time_steps(block_size): +def prod(seq): + return reduce(operator.mul, seq, 1) + + +def num_time_steps(block_size, time_steps_for_128_block=200): cells = block_size[0] * block_size[1] * block_size[2] - time_steps = (128 ** 3 / cells) * TIME_STEPS_FOR_128_BLOCK + time_steps = (128 ** 3 / cells) * time_steps_for_128_block return int(time_steps) +def cuda_block_size_ok(block_size, regs_per_threads=168): + """Checks if a given CUDA block size does not exceed the SM register limit. + 168 registers per thread was obtained using cuobjdump on both SRT and Cumulant + kernels. You might want to validate that for your own kernels.""" + + return prod(block_size) * regs_per_threads < 64 * (2 ** 10) + + +def domain_block_size_ok(block_size, total_mem, gls=1, q=27, size_per_value=8): + """Checks if a single block of given size fits into GPU memory""" + return prod(b + 2 * gls for b in block_size) * q * size_per_value < total_mem + + class Scenario: - def __init__(self, cells_per_block=(256, 128, 128), **kwargs): - self.config_dict = deepcopy(BASE_CONFIG) - self.config_dict['Parameters'].update(kwargs) - self.config_dict['DomainSetup']['blocks'] = block_decomposition(wlb.mpi.numProcesses()) - self.config_dict['DomainSetup']['cellsPerBlock'] = cells_per_block + def __init__(self, cells_per_block=(256, 128, 128), periodic=(1, 1, 1), cuda_blocks=(256, 1, 1), + timesteps=None, time_step_strategy="normal", omega=1.8, cuda_enabled_mpi=False, + inner_outer_split=(1, 1, 1), warmup_steps=5, outer_iterations=3, init_shear_flow=False, + additional_info=None): + + self.blocks = block_decomposition(wlb.mpi.numProcesses()) + + self.cells_per_block = cells_per_block + self.periodic = periodic + + self.time_step_strategy = time_step_strategy + self.omega = omega + self.timesteps = timesteps if timesteps else num_time_steps(cells_per_block) + self.cuda_enabled_mpi = cuda_enabled_mpi + self.inner_outer_split = inner_outer_split + self.init_shear_flow = init_shear_flow + self.warmup_steps = warmup_steps + self.outer_iterations = outer_iterations + self.cuda_blocks = cuda_blocks + + self.vtk_write_frequency = 0 + + self.config_dict = self.config(print_dict=False) + self.additional_info = additional_info @wlb.member_callback - def config(self, **kwargs): + def config(self, print_dict=True): from pprint import pformat - wlb.log_info_on_root("Scenario:\n" + pformat(self.config_dict)) - # Write out the configuration as text-based prm: - # print(toPrm(self.config_dict)) - return self.config_dict + config_dict = { + 'DomainSetup': { + 'blocks': self.blocks, + 'cellsPerBlock': self.cells_per_block, + 'periodic': self.periodic, + }, + 'Parameters': { + 'omega': self.omega, + 'cudaEnabledMPI': self.cuda_enabled_mpi, + 'warmupSteps': self.warmup_steps, + 'outerIterations': self.outer_iterations, + 'timeStepStrategy': self.time_step_strategy, + 'timesteps': self.timesteps, + 'initShearFlow': self.init_shear_flow, + 'gpuBlockSize': self.cuda_blocks, + 'innerOuterSplit': self.inner_outer_split, + 'vtkWriteFrequency': self.vtk_write_frequency + } + } + if print_dict: + wlb.log_info_on_root("Scenario:\n" + pformat(config_dict)) + if self.additional_info: + wlb.log_info_on_root("Additional Info:\n" + pformat(self.additional_info)) + return config_dict @wlb.member_callback def results_callback(self, **kwargs): @@ -62,6 +119,10 @@ class Scenario: data.update(self.config_dict['Parameters']) data.update(self.config_dict['DomainSetup']) data.update(kwargs) + + if self.additional_info is not None: + data.update(self.additional_info) + data['executable'] = sys.argv[0] data['compile_flags'] = wlb.build_info.compiler_flags data['walberla_version'] = wlb.build_info.version @@ -78,7 +139,22 @@ class Scenario: storeSingle(result, "runs", DB_FILE) break except sqlite3.OperationalError as e: - wlb.log_warning("Sqlite DB writing failed: try {}/{} {}".format(num_try + 1, num_tries, str(e))) + wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries} {str(e)}") + + +# -------------------------------------- Profiling ----------------------------------- +def profiling(): + """Tests different communication overlapping strategies""" + wlb.log_info_on_root("Running 2 timesteps for profiling") + wlb.log_info_on_root("") + + scenarios = wlb.ScenarioManager() + cells = (128, 128, 128) + cuda_enabled_mpi = False + + scenarios.add(Scenario(cells_per_block=cells, time_step_strategy='kernelOnly', + inner_outer_split=(1, 1, 1), timesteps=2, cuda_enabled_mpi=cuda_enabled_mpi, + outer_iterations=1, warmup_steps=0)) # -------------------------------------- Functions trying different parameter sets ----------------------------------- @@ -90,52 +166,21 @@ def overlap_benchmark(): wlb.log_info_on_root("") scenarios = wlb.ScenarioManager() + cuda_enabled_mpi = False inner_outer_splits = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1), (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1), (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)] - # 'GPUPackInfo_Baseline', 'GPUPackInfo_Streams' - for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']: - # no overlap - scenarios.add(Scenario(timeStepStrategy='noOverlap', communicationScheme=comm_strategy, - innerOuterSplit=(1, 1, 1))) - - # overlap - for overlap_strategy in ['simpleOverlap', 'complexOverlap']: - for inner_outer_split in inner_outer_splits: - scenario = Scenario(timeStepStrategy=overlap_strategy, - communicationScheme=comm_strategy, - innerOuterSplit=inner_outer_split, - timesteps=num_time_steps((256, 128, 128))) - scenarios.add(scenario) - - -def communication_compare(): - """Tests different communication strategies""" - wlb.log_info_on_root("Running benchmarks to compare communication strategies") - wlb.log_info_on_root("") + # no overlap + scenarios.add(Scenario(time_step_strategy='noOverlap', + inner_outer_split=(1, 1, 1), + cuda_enabled_mpi=cuda_enabled_mpi)) - scenarios = wlb.ScenarioManager() - for block_size in [(128, 128, 128), (32, 32, 32), (64, 64, 64), (256, 256, 256)]: - for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']: - - sc = Scenario(cells_per_block=block_size, - gpuBlockSize=(128, 1, 1), - timeStepStrategy='noOverlap', - communicationScheme=comm_strategy, - timesteps=num_time_steps(block_size)) - scenarios.add(sc) - for inner_outer_split in [(4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1)]: - # ensure that the inner part of the domain is still large enough - if 3 * inner_outer_split[0] > block_size[0]: - continue - sc = Scenario(cells_per_block=block_size, - gpuBlockSize=(128, 1, 1), - timeStepStrategy='simpleOverlap', - innerOuterSplit=inner_outer_split, - communicationScheme=comm_strategy, - timesteps=num_time_steps(block_size)) - scenarios.add(sc) + for inner_outer_split in inner_outer_splits: + scenario = Scenario(time_step_strategy='simpleOverlap', + inner_outer_split=inner_outer_split, + cuda_enabled_mpi=cuda_enabled_mpi) + scenarios.add(scenario) def single_gpu_benchmark(): @@ -143,8 +188,16 @@ def single_gpu_benchmark(): wlb.log_info_on_root("Running single GPU benchmarks") wlb.log_info_on_root("") + gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8)) + gpu_mem = gpu_mem_gb * (2 ** 30) + gpu_type = os.environ.get('GPU_TYPE') + + additional_info = {} + if gpu_type is not None: + additional_info['gpu_type'] = gpu_type + scenarios = wlb.ScenarioManager() - block_sizes = [(i, i, i) for i in (64, 128, 256, 384)] + [(512, 512, 128)] + block_sizes = [(i, i, i) for i in (64, 128, 256, 320, 384, 448, 512)] cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1), (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1), (32, 4, 1), (64, 4, 1), (128, 4, 1), @@ -152,10 +205,17 @@ def single_gpu_benchmark(): (32, 16, 1)] for block_size in block_sizes: for cuda_block_size in cuda_blocks: + if not cuda_block_size_ok(cuda_block_size): + wlb.log_info_on_root(f"Cuda block size {cuda_block_size} would exceed register limit. Skipping.") + continue + if not domain_block_size_ok(block_size, gpu_mem): + wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.") + continue scenario = Scenario(cells_per_block=block_size, - gpuBlockSize=cuda_block_size, - timeStepStrategy='kernelOnly', - timesteps=num_time_steps(block_size)) + cuda_blocks=cuda_block_size, + time_step_strategy='kernelOnly', + timesteps=num_time_steps(block_size), + additional_info=additional_info) scenarios.add(scenario) @@ -207,7 +267,6 @@ do done """ - all_executables = ('UniformGridBenchmarkGPU_mrt_d3q27', 'UniformGridBenchmarkGPU_smagorinsky_d3q27', 'UniformGridBenchmarkGPU_cumulant' @@ -238,10 +297,9 @@ if __name__ == '__main__': print("Called without waLBerla - generating job scripts for PizDaint") generate_jobscripts() else: - wlb.log_info_on_root("Batch run of benchmark scenarios, saving result to {}".format(DB_FILE)) + wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}") # Select the benchmark you want to run - single_gpu_benchmark() - # benchmarks different CUDA block sizes and domain sizes and measures single - # GPU performance of compute kernel (no communication) - # communication_compare(): benchmarks different communication routines, with and without overlap - # overlap_benchmark(): benchmarks different communication overlap options + single_gpu_benchmark() # benchmarks different CUDA block sizes and domain sizes and measures single GPU + # performance of compute kernel (no communication) + # overlap_benchmark() # benchmarks different communication overlap options + # profiling() # run only two timesteps on a smaller domain for profiling only diff --git a/python/pystencils_walberla/utility.py b/python/pystencils_walberla/utility.py index 0bdf6e9923fe4fb106ab2975109a7c31a7f0d633..cda1ddbec3a6f98a9ea75d96037866c5018875b8 100644 --- a/python/pystencils_walberla/utility.py +++ b/python/pystencils_walberla/utility.py @@ -11,7 +11,8 @@ def generate_info_header(ctx: CodeGenerationContext, field_typedefs: dict = None, additional_headers: set = None, headers_to_ignore: set = None, - additional_typedefs: dict = None): + additional_typedefs: dict = None, + additional_code: str = ""): """Generates an info header, consolidating required information about the generated code. The info header #includes all generated header files, and is thus the only header the application needs to #include. It can also contain aliases for waLBerla stencil types and @@ -24,7 +25,8 @@ def generate_info_header(ctx: CodeGenerationContext, field_typedefs: dict mapping type names to pystencils `Field` instances additional_headers: additional header files to be included headers_to_ignore: headers which should not be included - additional_typedefs: dict mapping aliases to types. + additional_typedefs: dict mapping aliases to types + additional_code: additional code which gets appended on the file """ stencil_typedefs = stencil_typedefs if stencil_typedefs is not None else dict() field_typedefs = field_typedefs if field_typedefs is not None else dict() @@ -50,7 +52,7 @@ def generate_info_header(ctx: CodeGenerationContext, if path.splitext(filename)[1] not in HEADER_EXTENSIONS: filename += '.h' - ctx.write_file(filename, lines) + ctx.write_file(filename, lines + additional_code) # ------------------------------------- INTERNAL ------------------------------------------------------------- diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index b701f144f972b8eb496849d260968f21306542eb..46b22c43a0056afc407c98341384e8d22b5147f8 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -4,7 +4,7 @@ # ################################################################################################### -waLBerla_add_module( DEPENDS blockforest core communication domain_decomposition executiontree field stencil +waLBerla_add_module( DEPENDS blockforest core communication domain_decomposition executiontree field stencil lbm BUILD_ONLY_IF_FOUND CUDA ) ################################################################################################### \ No newline at end of file diff --git a/src/cuda/lbm/CombinedInPlaceGpuPackInfo.h b/src/cuda/lbm/CombinedInPlaceGpuPackInfo.h new file mode 100644 index 0000000000000000000000000000000000000000..dc7c7d2c0fbd9f1d152793bcbe84d577dc708ed9 --- /dev/null +++ b/src/cuda/lbm/CombinedInPlaceGpuPackInfo.h @@ -0,0 +1,86 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file CombinedInPlaceGpuPackInfo.h +//! \author Frederik Hennig <frederik.hennig@fau.de> +// +//====================================================================================================================== + +#pragma once + +#define IS_EVEN(x) ((x & 1) ^ 1) + +#include "cuda/communication/GeneratedGPUPackInfo.h" + +#include "lbm/inplace_streaming/TimestepTracker.h" + +namespace walberla { +namespace lbm { + +template< typename EvenPackInfo, typename OddPackInfo > +class CombinedInPlaceGpuPackInfo : public cuda::GeneratedGPUPackInfo +{ + public: + template< typename... Args > + CombinedInPlaceGpuPackInfo(std::shared_ptr< lbm::TimestepTracker >& tracker, Args&&... args) + : tracker_(tracker), evenPackInfo_(std::forward< Args >(args)...), oddPackInfo_(std::forward< Args >(args)...) + {} + + virtual ~CombinedInPlaceGpuPackInfo() = default; + + void pack(stencil::Direction dir, unsigned char* buffer, IBlock* block, cudaStream_t stream) override + { + if (IS_EVEN(tracker_->getCounter())) + { + evenPackInfo_.pack(dir, buffer, block, stream); + } + else + { + oddPackInfo_.pack(dir, buffer, block, stream); + } + } + + void unpack(stencil::Direction dir, unsigned char* buffer, IBlock* block, cudaStream_t stream) override { + if (IS_EVEN(tracker_->getCounter())) + { + evenPackInfo_.unpack(dir, buffer, block, stream); + } + else + { + oddPackInfo_.unpack(dir, buffer, block, stream); + } + } + + uint_t size(stencil::Direction dir, IBlock* block) override { + if (IS_EVEN(tracker_->getCounter())) + { + return evenPackInfo_.size(dir, block); + } + else + { + return oddPackInfo_.size(dir, block); + } + } + + private: + const std::shared_ptr< lbm::TimestepTracker >& tracker_; + EvenPackInfo evenPackInfo_; + OddPackInfo oddPackInfo_; +}; + +} // namespace lbm +} // namespace walberla + + diff --git a/src/field/Field.h b/src/field/Field.h index a47fe47e7314011fe0bae2df37ca090facabb321..a01fc76cddc32539c7e482e0159a249f1189c54f 100644 --- a/src/field/Field.h +++ b/src/field/Field.h @@ -253,10 +253,10 @@ namespace field { inline Layout layout() const { return layout_; } - cell_idx_t xStride() const { return xfact_; } - cell_idx_t yStride() const { return yfact_; } - cell_idx_t zStride() const { return zfact_; } - cell_idx_t fStride() const { return ffact_; } + int64_t xStride() const { return xfact_; } + int64_t yStride() const { return yfact_; } + int64_t zStride() const { return zfact_; } + int64_t fStride() const { return ffact_; } cell_idx_t xOff() const { return xOff_; } cell_idx_t yOff() const { return yOff_; } @@ -369,10 +369,10 @@ namespace field { Layout layout_; //!< Determines in which order the values are stored uint_t allocSize_; //!< The overall size of the T* (padding included) - cell_idx_t ffact_; //!< Access multiplication factor for the f-dimension. - cell_idx_t xfact_; //!< Access multiplication factor for the x-dimension. - cell_idx_t yfact_; //!< Access multiplication factor for the y-dimension. - cell_idx_t zfact_; //!< Access multiplication factor for the z-dimension. + int64_t ffact_; //!< Access multiplication factor for the f-dimension. + int64_t xfact_; //!< Access multiplication factor for the x-dimension. + int64_t yfact_; //!< Access multiplication factor for the y-dimension. + int64_t zfact_; //!< Access multiplication factor for the z-dimension. shared_ptr<FieldAllocator<T> > allocator_; //!< Allocator for the field diff --git a/src/field/Field.impl.h b/src/field/Field.impl.h index 20ee14bb95a2df3b50975f42c7ce96fb047c1dcf..c5fec45c77194c58cf10f2e0ba633740d54d89eb 100644 --- a/src/field/Field.impl.h +++ b/src/field/Field.impl.h @@ -322,14 +322,14 @@ namespace field { const uint_t alignment = 64; #elif defined(__ARM_NEON) const uint_t alignment = 16; +#elif defined(__BIGGEST_ALIGNMENT__) + const uint_t alignment = __BIGGEST_ALIGNMENT__; #elif defined(__AVX512F__) const uint_t alignment = 64; #elif defined(__AVX__) const uint_t alignment = 32; #elif defined(__SSE__) || defined(_MSC_VER) const uint_t alignment = 16; -#elif defined(__BIGGEST_ALIGNMENT__) - const uint_t alignment = __BIGGEST_ALIGNMENT__; #else const uint_t alignment = 64; #endif @@ -361,24 +361,24 @@ namespace field { fAllocSize_ = fSize_; WALBERLA_CHECK_LESS_EQUAL( fSize_ * xAllocSize_ * yAllocSize_ * zAllocSize_ + xSize_ + ySize_ * xAllocSize_ + zSize_ * xAllocSize_ * yAllocSize_, - std::numeric_limits< cell_idx_t >::max(), - "The data type 'cell_idx_t' is too small for your field size! Your field is too large.\nYou may have to set 'cell_idx_t' to an 'int64_t'." ); + std::numeric_limits< int64_t >::max(), + "The data type 'int64_t' is too small for your field size! Your field is too large." ); - ffact_ = cell_idx_c(xAllocSize_ * yAllocSize_ * zAllocSize_); - zfact_ = cell_idx_c(xAllocSize_ * yAllocSize_); - yfact_ = cell_idx_c(xAllocSize_); + ffact_ = int64_t(xAllocSize_) * int64_t(yAllocSize_) * int64_t(zAllocSize_); + zfact_ = int64_t(xAllocSize_) * int64_t(yAllocSize_); + yfact_ = int64_t(xAllocSize_); xfact_ = 1; } else { values_ = allocator_->allocate(zSize_, ySize_, xSize_, fSize_, yAllocSize_, xAllocSize_, fAllocSize_); zAllocSize_ = zSize_; WALBERLA_CHECK_LESS_EQUAL( fSize_ + xSize_ * fAllocSize_ + ySize_ * fAllocSize_ * xAllocSize_ + zSize_ * fAllocSize_ * xAllocSize_ * yAllocSize_, - std::numeric_limits< cell_idx_t >::max(), - "The data type 'cell_idx_t' is too small for your field size! Your field is too large.\nYou may have to set 'cell_idx_t' to an 'int64_t'." ); + std::numeric_limits< int64_t >::max(), + "The data type 'int64_t' is too small for your field size! Your field is too large." ); - zfact_ = cell_idx_c(fAllocSize_ * xAllocSize_ * yAllocSize_); - yfact_ = cell_idx_c(fAllocSize_ * xAllocSize_); - xfact_ = cell_idx_c(fAllocSize_); + zfact_ = int64_t (fAllocSize_) * int64_t(xAllocSize_) * int64_t(yAllocSize_); + yfact_ = int64_t(fAllocSize_) * int64_t(xAllocSize_); + xfact_ = int64_t (fAllocSize_); ffact_ = 1; } @@ -721,7 +721,7 @@ namespace field { { assertValidCoordinates( x, y, z, f ); - const cell_idx_t index = f*ffact_+ x*xfact_+ y*yfact_+ z*zfact_; + const int64_t index = f*int64_t(ffact_) + int64_t(x)*int64_t(xfact_) + int64_t(y)*int64_t(yfact_) + int64_t(z)*int64_t(zfact_); WALBERLA_ASSERT_LESS( int64_c(index) + int64_c(valuesWithOffset_ - values_), int64_c(allocSize_) ); WALBERLA_ASSERT_GREATER_EQUAL( int64_c(index) + int64_c(valuesWithOffset_ - values_), int64_c(0) ); @@ -1102,7 +1102,7 @@ namespace field { xSize_ = xs; ySize_ = ys; zSize_ = zs; - const auto offset = xOff_*xfact_+ yOff_*yfact_+ zOff_*zfact_; + const int64_t offset = int64_t(xOff_)*int64_t(xfact_) + int64_t(yOff_)*int64_t(yfact_) + int64_t(zOff_)*int64_t(zfact_); valuesWithOffset_ = values_ + offset; } diff --git a/src/lbm/all.h b/src/lbm/all.h index d4b0a4f1559eaa977cb17e43b41ef75f48323f7b..dc581a251d6f31ce10032c74ccbf2147a7f90780 100644 --- a/src/lbm/all.h +++ b/src/lbm/all.h @@ -33,6 +33,7 @@ #include "field/all.h" #include "geometry/all.h" #include "gui/all.h" +#include "inplace_streaming/all.h" #include "lattice_model/all.h" #include "refinement/all.h" #include "sweeps/all.h" diff --git a/src/lbm/inplace_streaming/TimestepTracker.h b/src/lbm/inplace_streaming/TimestepTracker.h index c0ca22635fea5d9a454f99e2d4fe08a519578eef..daba8db2380af5895f15c1f5688cf59f06b28f8f 100644 --- a/src/lbm/inplace_streaming/TimestepTracker.h +++ b/src/lbm/inplace_streaming/TimestepTracker.h @@ -29,10 +29,10 @@ namespace lbm class TimestepTracker { private: - uint8_t counter_; + uint8_t counter_{ 0 }; public: - TimestepTracker() : counter_(0) {} + TimestepTracker() = default; TimestepTracker(uint8_t zeroth_timestep) : counter_(zeroth_timestep & 1) {} void advance() { counter_ = (counter_ + 1) & 1; } @@ -43,6 +43,7 @@ class TimestepTracker } uint8_t getCounter() const { return counter_; } + uint8_t getCounterPlusOne() const { return (counter_ + 1) & 1; } }; // class TimestepTracker diff --git a/src/lbm/inplace_streaming/all.h b/src/lbm/inplace_streaming/all.h new file mode 100644 index 0000000000000000000000000000000000000000..b984e282387c2b39c07819ddeebc58c0bf9327fb --- /dev/null +++ b/src/lbm/inplace_streaming/all.h @@ -0,0 +1,23 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file all.h +//! \author Frederik Hennig <frederik.hennig@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "TimestepTracker.h"