diff --git a/apps/benchmarks/UniformGridGenerated/CMakeLists.txt b/apps/benchmarks/UniformGridGenerated/CMakeLists.txt index b3f9795f7fc5d78b17e70fc5c80722647f852156..3204db7a4a97cefbcae23a8ef8fffd8792f76602 100644 --- a/apps/benchmarks/UniformGridGenerated/CMakeLists.txt +++ b/apps/benchmarks/UniformGridGenerated/CMakeLists.txt @@ -10,10 +10,9 @@ waLBerla_python_file_generates(UniformGridGenerated.py GenDefines.h) -foreach(config trt ) +foreach(config trt smagorinsky mrt3 mrt entropic_kbc_n4 cumulant ) waLBerla_add_executable ( NAME UniformGridBenchmarkGenerated_${config} FILES UniformGridGenerated.cpp UniformGridGenerated.py DEPENDS blockforest boundary core domain_decomposition field geometry timeloop vtk gui CODEGEN_CFG ${config}) endforeach() - diff --git a/apps/benchmarks/UniformGridGenerated/ManualKernels.h b/apps/benchmarks/UniformGridGenerated/ManualKernels.h index 72bbf8fbff4ede57fbdcaa28d1fccca0c21f6c5f..efc7b59d77066541f546415f67c452dc12b15c73 100644 --- a/apps/benchmarks/UniformGridGenerated/ManualKernels.h +++ b/apps/benchmarks/UniformGridGenerated/ManualKernels.h @@ -57,4 +57,127 @@ private: + +template<typename LM> +class StreamPullCollideD3Q19 +{ +public: + StreamPullCollideD3Q19(BlockDataID src, BlockDataID dst, real_t omega) + : src_(src), dst_(dst), omega_(omega) + {} + + void operator()(IBlock * block) + { + using PdfField_T = GhostLayerField<real_t, LM::Stencil::Q>; + auto src = block->getData<PdfField_T>( src_ ); + auto dst = block->getData<PdfField_T>( dst_ ); + using namespace stencil; + using Stencil = typename LM::Stencil; + + real_t omega_trm_ = real_t(1) - omega_; + real_t omega_w0_ = real_t(3) * ( real_t(1) / real_t( 3) ) * omega_; + real_t omega_w1_ = real_t(3) * ( real_t(1) / real_t(18) ) * omega_; + real_t omega_w2_ = real_t(3) * ( real_t(1) / real_t(36) ) * omega_; + + WALBERLA_FOR_ALL_CELLS_XYZ(src, { + + const real_t dd_tmp_NE = src->get(x-1, y-1, z , Stencil::idx[NE]); + const real_t dd_tmp_N = src->get(x , y-1, z , Stencil::idx[N]); + const real_t dd_tmp_NW = src->get(x+1, y-1, z , Stencil::idx[NW]); + const real_t dd_tmp_W = src->get(x+1, y , z , Stencil::idx[W]); + const real_t dd_tmp_SW = src->get(x+1, y+1, z , Stencil::idx[SW]); + const real_t dd_tmp_S = src->get(x , y+1, z , Stencil::idx[S]); + const real_t dd_tmp_SE = src->get(x-1, y+1, z , Stencil::idx[SE]); + const real_t dd_tmp_E = src->get(x-1, y , z , Stencil::idx[E]); + const real_t dd_tmp_T = src->get(x , y , z-1, Stencil::idx[T]); + const real_t dd_tmp_TE = src->get(x-1, y , z-1, Stencil::idx[TE]); + const real_t dd_tmp_TN = src->get(x , y-1, z-1, Stencil::idx[TN]); + const real_t dd_tmp_TW = src->get(x+1, y , z-1, Stencil::idx[TW]); + const real_t dd_tmp_TS = src->get(x , y+1, z-1, Stencil::idx[TS]); + const real_t dd_tmp_B = src->get(x , y , z+1, Stencil::idx[B]); + const real_t dd_tmp_BE = src->get(x-1, y , z+1, Stencil::idx[BE]); + const real_t dd_tmp_BN = src->get(x , y-1, z+1, Stencil::idx[BN]); + const real_t dd_tmp_BW = src->get(x+1, y , z+1, Stencil::idx[BW]); + const real_t dd_tmp_BS = src->get(x , y+1, z+1, Stencil::idx[BS]); + const real_t dd_tmp_C = src->get(x , y , z , Stencil::idx[C]); + + const real_t velX_trm = dd_tmp_E + dd_tmp_NE + dd_tmp_SE + dd_tmp_TE + dd_tmp_BE; + const real_t velY_trm = dd_tmp_N + dd_tmp_NW + dd_tmp_TN + dd_tmp_BN; + const real_t velZ_trm = dd_tmp_T + dd_tmp_TS + dd_tmp_TW; + + const real_t rho = dd_tmp_C + dd_tmp_S + dd_tmp_W + dd_tmp_B + dd_tmp_SW + dd_tmp_BS + dd_tmp_BW + velX_trm + velY_trm + velZ_trm; + + const real_t velX = velX_trm - dd_tmp_W - dd_tmp_NW - dd_tmp_SW - dd_tmp_TW - dd_tmp_BW; + const real_t velY = velY_trm + dd_tmp_NE - dd_tmp_S - dd_tmp_SW - dd_tmp_SE - dd_tmp_TS - dd_tmp_BS; + const real_t velZ = velZ_trm + dd_tmp_TN + dd_tmp_TE - dd_tmp_B - dd_tmp_BN - dd_tmp_BS - dd_tmp_BW - dd_tmp_BE; + + const real_t velXX = velX * velX; + const real_t velYY = velY * velY; + const real_t velZZ = velZ * velZ; + + const real_t dir_indep_trm = ( real_t(1) / real_t(3) ) * rho - real_t(0.5) * ( velXX + velYY + velZZ ); + + dst->get(x,y,z,Stencil::idx[C]) = omega_trm_ * dd_tmp_C + omega_w0_ * dir_indep_trm; + + const real_t vel_trm_E_W = dir_indep_trm + real_t(1.5) * velXX; + const real_t vel_trm_N_S = dir_indep_trm + real_t(1.5) * velYY; + const real_t vel_trm_T_B = dir_indep_trm + real_t(1.5) * velZZ; + + dst->get(x,y,z,Stencil::idx[E]) = omega_trm_ * dd_tmp_E + omega_w1_ * ( vel_trm_E_W + velX ); + dst->get(x,y,z,Stencil::idx[W]) = omega_trm_ * dd_tmp_W + omega_w1_ * ( vel_trm_E_W - velX ); + dst->get(x,y,z,Stencil::idx[N]) = omega_trm_ * dd_tmp_N + omega_w1_ * ( vel_trm_N_S + velY ); + dst->get(x,y,z,Stencil::idx[S]) = omega_trm_ * dd_tmp_S + omega_w1_ * ( vel_trm_N_S - velY ); + dst->get(x,y,z,Stencil::idx[T]) = omega_trm_ * dd_tmp_T + omega_w1_ * ( vel_trm_T_B + velZ ); + dst->get(x,y,z,Stencil::idx[B]) = omega_trm_ * dd_tmp_B + omega_w1_ * ( vel_trm_T_B - velZ ); + + const real_t velXmY = velX - velY; + const real_t vel_trm_NW_SE = dir_indep_trm + real_t(1.5) * velXmY * velXmY; + + dst->get(x,y,z,Stencil::idx[NW]) = omega_trm_ * dd_tmp_NW + omega_w2_ * ( vel_trm_NW_SE - velXmY ); + dst->get(x,y,z,Stencil::idx[SE]) = omega_trm_ * dd_tmp_SE + omega_w2_ * ( vel_trm_NW_SE + velXmY ); + + const real_t velXpY = velX + velY; + const real_t vel_trm_NE_SW = dir_indep_trm + real_t(1.5) * velXpY * velXpY; + + dst->get(x,y,z,Stencil::idx[NE]) = omega_trm_ * dd_tmp_NE + omega_w2_ * ( vel_trm_NE_SW + velXpY ); + dst->get(x,y,z,Stencil::idx[SW]) = omega_trm_ * dd_tmp_SW + omega_w2_ * ( vel_trm_NE_SW - velXpY ); + + const real_t velXmZ = velX - velZ; + const real_t vel_trm_TW_BE = dir_indep_trm + real_t(1.5) * velXmZ * velXmZ; + + dst->get(x,y,z,Stencil::idx[TW]) = omega_trm_ * dd_tmp_TW + omega_w2_ * ( vel_trm_TW_BE - velXmZ ); + dst->get(x,y,z,Stencil::idx[BE]) = omega_trm_ * dd_tmp_BE + omega_w2_ * ( vel_trm_TW_BE + velXmZ ); + + const real_t velXpZ = velX + velZ; + const real_t vel_trm_TE_BW = dir_indep_trm + real_t(1.5) * velXpZ * velXpZ; + + dst->get(x,y,z,Stencil::idx[TE]) = omega_trm_ * dd_tmp_TE + omega_w2_ * ( vel_trm_TE_BW + velXpZ ); + dst->get(x,y,z,Stencil::idx[BW]) = omega_trm_ * dd_tmp_BW + omega_w2_ * ( vel_trm_TE_BW - velXpZ ); + + const real_t velYmZ = velY - velZ; + const real_t vel_trm_TS_BN = dir_indep_trm + real_t(1.5) * velYmZ * velYmZ; + + dst->get(x,y,z,Stencil::idx[TS]) = omega_trm_ * dd_tmp_TS + omega_w2_ * ( vel_trm_TS_BN - velYmZ ); + dst->get(x,y,z,Stencil::idx[BN]) = omega_trm_ * dd_tmp_BN + omega_w2_ * ( vel_trm_TS_BN + velYmZ ); + + const real_t velYpZ = velY + velZ; + const real_t vel_trm_TN_BS = dir_indep_trm + real_t(1.5) * velYpZ * velYpZ; + + dst->get(x,y,z,Stencil::idx[TN]) = omega_trm_ * dd_tmp_TN + omega_w2_ * ( vel_trm_TN_BS + velYpZ ); + dst->get(x,y,z,Stencil::idx[BS]) = omega_trm_ * dd_tmp_BS + omega_w2_ * ( vel_trm_TN_BS - velYpZ ); + + }); + src->swapDataPointers(*dst); + } + +private: + BlockDataID src_; + BlockDataID dst_; + real_t omega_; +}; + + + + + } // namespace walberla \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGenerated/UniformGrid.prm b/apps/benchmarks/UniformGridGenerated/UniformGrid.prm index 48fa743393d2ebc7dbb05ea1e080abb42351a3cb..702d06252c13908fe98f99757acf4d4a4e77f799 100644 --- a/apps/benchmarks/UniformGridGenerated/UniformGrid.prm +++ b/apps/benchmarks/UniformGridGenerated/UniformGrid.prm @@ -8,17 +8,17 @@ DomainSetup Parameters { - timesteps 1000; // time steps of one performance measurement + timesteps 200; // time steps of one performance measurement warmupSteps 1; // number of steps to run before measurement starts - outerIterations 1; // how many measurements to conduct + outerIterations 4; // how many measurements to conduct - vtkWriteFrequency 100; // write a VTK file every n'th step, if zero VTK output is disabled + vtkWriteFrequency 0; // write a VTK file every n'th step, if zero VTK output is disabled timeStepMode twoField; - //twoFieldKernelType manual_generic; - remainingTimeLoggerFrequency 0; // interval in seconds to log the estimated remaining time + //twoFieldKernelType manualD3Q19; + remainingTimeLoggerFrequency 6; // interval in seconds to log the estimated remaining time directComm 0; omega 1.8; - shearVelocityMagnitude 0.05; + shearVelocityMagnitude 0.02; useGui 0; } diff --git a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp index 13d261a42c835496336c323bb24111d7c5b9082f..bcb87d151db8a169f86757acdaf9d68486fe560e 100644 --- a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp +++ b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp @@ -110,12 +110,17 @@ int main( int argc, char **argv ) std::function<void(IBlock*)> twoFieldKernel; if( twoFieldKernelType == "generated") { twoFieldKernel = pystencils::GenLbKernel(pdfFieldId, omega); - } else if (twoFieldKernelType == "manual_generic") { + } else if (twoFieldKernelType == "manualGeneric") { using MyLM = lbm::D3Q19<lbm::collision_model::SRT>; BlockDataID tmpPdfFieldId = blocks->addStructuredBlockData<PdfField_T>(pdfFieldAdder, "pdfs"); twoFieldKernel = StreamPullCollideGeneric<MyLM>(pdfFieldId, tmpPdfFieldId, omega); - } else if (twoFieldKernelType == "manual_d3q19") { - + } else if (twoFieldKernelType == "manualD3Q19") { + using MyLM = lbm::D3Q19<lbm::collision_model::SRT>; + BlockDataID tmpPdfFieldId = blocks->addStructuredBlockData<PdfField_T>(pdfFieldAdder, "pdfs"); + twoFieldKernel = StreamPullCollideD3Q19<MyLM>(pdfFieldId, tmpPdfFieldId, omega); + } else { + WALBERLA_ABORT_NO_DEBUG_INFO("Invalid option for \"twoFieldKernelType\", " + "valid options are \"generated\", \"manualGeneric\", \"manualD3Q19\""); } using F = std::function<void()>; diff --git a/apps/benchmarks/UniformGridGenerated/params.py b/apps/benchmarks/UniformGridGenerated/params.py index 399d29fd0456475c15d0e04acfab5046f43014b7..766fd20b5580de640352c34cbc2831bbf4e2a0cd 100644 --- a/apps/benchmarks/UniformGridGenerated/params.py +++ b/apps/benchmarks/UniformGridGenerated/params.py @@ -3,6 +3,7 @@ import os import operator import waLBerla as wlb from waLBerla.tools.sqlitedb import * +from waLBerla.tools.setup import block_decomposition from functools import reduce @@ -32,11 +33,30 @@ def calculate_time_steps(runtime, expected_mlups, domain_size): return int(time_steps_per_second * runtime) +def domain_decomposition_func_z(processes, threads, block_size): + return { + 'blocks': (1, 1, processes), + 'cellsPerBlock': (block_size[0], block_size[1], block_size[2] * threads) + } + + +def domain_decomposition_func_full(processes, threads, block_size): + return { + 'blocks': block_decomposition(processes), + 'cellsPerBlock': (block_size[0], block_size[1], block_size[2] * threads) + } + + class BenchmarkScenario: - def __init__(self, block_size=(256, 128, 128), direct_comm=True, time_step_mode='aa', db_file_name='uniform_grid_gen.sqlite'): + def __init__(self, block_size=(256, 128, 128), direct_comm=True, + time_step_mode='aa', two_field_kernel_type='generated', + domain_decomposition_func=domain_decomposition_func_z, + db_file_name='uniform_grid_gen.sqlite'): self.block_size = block_size self.direct_comm = direct_comm self.time_step_mode = time_step_mode + self.two_field_kernel_type = two_field_kernel_type + self.domain_decomposition_func = domain_decomposition_func self.threads = int(os.environ['OMP_NUM_THREADS']) self.processes = wlb.mpi.numProcesses() self.db_file_name = db_file_name @@ -48,8 +68,6 @@ class BenchmarkScenario: time_steps = max(10, time_steps) cfg = { 'DomainSetup': { - 'blocks': (1, 1, self.processes), - 'cellsPerBlock': (self.block_size[0], self.block_size[1], self.block_size[2] * self.threads), 'periodic': (1, 1, 1), }, 'Parameters': { @@ -60,9 +78,11 @@ class BenchmarkScenario: 'remainingTimeLoggerFrequency': 0, 'omega': 1.6, 'timeStepMode': self.time_step_mode, + 'twoFieldKernelType': self.two_field_kernel_type, 'directComm': self.direct_comm, } } + cfg['DomainSetup'].update(self.domain_decomposition_func(self.processes, self.threads, self.block_size)) return cfg @wlb.member_callback @@ -88,13 +108,38 @@ class BenchmarkScenario: storeSingle(result, "runs", self.db_file_name) -def benchmark(): +def single_node_benchmark(): scenarios = wlb.ScenarioManager() - for block_size in [(128, 128, 128), (128, 64, 64), (64, 64, 128), (64, 64, 64), (64, 32, 32), (32, 32, 32), (16, 16, 16), (256, 128, 64), (512, 128, 32)]: + for block_size in [(128, 128, 128), (128, 64, 64), (64, 64, 128), (64, 128, 64), (64, 64, 64), + (1024, 64, 32), (2048, 64, 16), + (64, 32, 32), (32, 32, 32), (16, 16, 16), (256, 128, 64), (512, 128, 32)]: for direct_comm in (True, False): for time_step_mode in ['aa', 'aaKernelOnly', 'twoField']: - sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm, time_step_mode=time_step_mode) - scenarios.add(sc) + if time_step_mode == 'twoField': + for kt in ['generated', 'manualGeneric', 'manualD3Q19']: + sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm, + time_step_mode=time_step_mode, two_field_kernel_type=kt, + domain_decomposition_func=domain_decomposition_func_z + ) + scenarios.add(sc) + + else: + sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm, + domain_decomposition_func=domain_decomposition_func_z, + time_step_mode=time_step_mode) + scenarios.add(sc) + + +def weak_scaling(): + scenarios = wlb.ScenarioManager() + for block_size in [(128, 128, 128), (128, 64, 64), (64, 64, 128), (64, 128, 64), (64, 64, 64), + (1024, 64, 32), (2048, 64, 16), + (64, 32, 32), (32, 32, 32), (16, 16, 16), (256, 128, 64), (512, 128, 32)]: + for direct_comm in (True, False): + sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm, + domain_decomposition_func=domain_decomposition_func_full) + scenarios.add(sc) + -benchmark() +single_node_benchmark()