diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt index 74d5e5c714ff810f60a454f7062c230635e928c2..9372c0b5275d6057ea188cc5667883a5d8e30d30 100644 --- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt +++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt @@ -1,8 +1,14 @@ waLBerla_link_files_to_builddir( "*.prm" ) +waLBerla_python_file_generates(UniformGridGPU.py + UniformGridGPU_LatticeModel.cpp UniformGridGPU_LatticeModel.h + UniformGridGPU_LbKernel.cu UniformGridGPU_LbKernel.h + UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h + UniformGridGPU_UBB.cu UniformGridGPU_UBB.h + UniformGridGPU_PackInfo.cu UniformGridGPU_PackInfo.h + ) + waLBerla_add_executable ( NAME UniformGridBenchmarkGPU - FILES UniformGridGPU.cpp UniformGridGPU_LatticeModel.cpp - UniformGridGPU_LbKernel.cu UniformGridGPU_NoSlip.cu UniformGridGPU_UBB.cu - UniformGridGPU_PackInfo.cu + FILES UniformGridGPU.cpp UniformGridGPU.py DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk ) diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.gen.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU.gen.py deleted file mode 100644 index 731897463f30a8b9d8d005ec64f0dce36e954321..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.gen.py +++ /dev/null @@ -1,59 +0,0 @@ -import sympy as sp -from lbmpy_walberla import generate_lattice_model_files -from lbmpy.creationfunctions import create_lb_update_rule -from pystencils_walberla.sweep import Sweep -from lbmpy.boundaries import NoSlip, UBB -from lbmpy.creationfunctions import create_lb_method -from lbmpy_walberla.boundary import create_boundary_class -from pystencils_walberla.cmake_integration import codegen - - -dtype = 'float64' - -# LB options -options = { - 'method': 'srt', - 'stencil': 'D3Q19', - 'relaxation_rate': sp.Symbol("omega"), - 'field_name': 'pdfs', - 'compressible': False, - 'temporary_field_name': 'pdfs_tmp', - 'optimization': {'cse_global': True, - 'cse_pdfs': True, - 'double_precision': dtype == 'float64'} -} - -# GPU optimization options -inner_opt = {'gpu_indexing_params': {'block_size': (128, 1, 1)}, 'data_type': dtype} -outer_opt = {'gpu_indexing_params': {'block_size': (32, 32, 32)}, 'data_type': dtype} - - -def lb_assignments(): - ur = create_lb_update_rule(**options) - return ur.all_assignments - - -def genBoundary(): - boundary = UBB([0.05, 0, 0], dim=3, name="UniformGridGPU_UBB") - return create_boundary_class(boundary, create_lb_method(**options), target='gpu') - - -def genNoSlip(): - boundary = NoSlip(name='UniformGridGPU_NoSlip') - return create_boundary_class(boundary, create_lb_method(**options), target='gpu') - - -generate_lattice_model_files(class_name='UniformGridGPU_LatticeModel', **options) - -Sweep.generate_inner_outer_kernel('UniformGridGPU_LbKernel', - lambda: create_lb_update_rule(**options).all_assignments, - target='gpu', - temporary_fields=['pdfs_tmp'], - field_swaps=[('pdfs', 'pdfs_tmp')], - optimization=inner_opt, - outer_optimization=outer_opt) - -Sweep.generate_pack_info('UniformGridGPU_PackInfo', lb_assignments, target='gpu') - -codegen.register(['UniformGridGPU_UBB.h', 'UniformGridGPU_UBB.cu'], genBoundary) -codegen.register(['UniformGridGPU_NoSlip.h', 'UniformGridGPU_NoSlip.cu'], genNoSlip) diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py new file mode 100644 index 0000000000000000000000000000000000000000..a4619226eabd2da6ce38bf233d67e91e5bf4ccaa --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py @@ -0,0 +1,35 @@ +import sympy as sp +from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule +from lbmpy.boundaries import NoSlip, UBB +from pystencils_walberla import generate_pack_info_from_kernel +from lbmpy_walberla import generate_lattice_model, generate_boundary +from pystencils_walberla import CodeGeneration, generate_sweep + + +with CodeGeneration() as ctx: + # LB options + options = { + 'method': 'srt', + 'stencil': 'D3Q19', + 'relaxation_rate': sp.Symbol("omega"), + 'field_name': 'pdfs', + 'compressible': False, + 'temporary_field_name': 'pdfs_tmp', + 'optimization': {'cse_global': True, + 'cse_pdfs': True, + 'gpu_indexing_params': {'block_size': (128, 1, 1)}} + } + lb_method = create_lb_method(**options) + update_rule = create_lb_update_rule(lb_method=lb_method, **options) + + # CPU lattice model - required for macroscopic value computation, VTK output etc. + generate_lattice_model(ctx, 'UniformGridGPU_LatticeModel', lb_method) + + # gpu LB sweep & boundaries + generate_sweep(ctx, 'UniformGridGPU_LbKernel', update_rule, field_swaps=[('pdfs', 'pdfs_tmp')], + inner_outer_split=True, target='gpu') + generate_boundary(ctx, 'UniformGridGPU_NoSlip', NoSlip(), lb_method, target='gpu') + generate_boundary(ctx, 'UniformGridGPU_UBB', UBB([0.05, 0, 0]), lb_method, target='gpu') + + # communication + generate_pack_info_from_kernel(ctx, 'UniformGridGPU_PackInfo', update_rule, target='gpu') diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPUSmall.prm b/apps/benchmarks/UniformGridGPU/UniformGridGPUSmall.prm deleted file mode 100644 index c6b8ae931524c474bcc60bce1711e69f011f6a53..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPUSmall.prm +++ /dev/null @@ -1,27 +0,0 @@ - -Parameters -{ - omega 1.8; - timesteps 2; - - remainingTimeLoggerFrequency 3; - vtkWriteFrequency 0; - - overlapCommunication false; - cudaEnabledMPI false; -} - -DomainSetup -{ - blocks < 1, 1, 1 >; - cellsPerBlock < 50, 20, 10 >; - periodic < 0, 0, 1 >; -} - -Boundaries -{ - Border { direction W; walldistance -1; flag NoSlip; } - Border { direction E; walldistance -1; flag NoSlip; } - Border { direction S; walldistance -1; flag NoSlip; } - Border { direction N; walldistance -1; flag UBB; } -} diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_LatticeModel.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LatticeModel.cpp deleted file mode 100644 index 20712a5bc6e37826a12e6e8f1fb011a08df43a13..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_LatticeModel.cpp +++ /dev/null @@ -1,594 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \\author Martin Bauer <martin.bauer@fau.de> -//====================================================================================================================== - -#include <cmath> - -#include "core/DataTypes.h" -#include "core/Macros.h" -#include "lbm/field/PdfField.h" -#include "lbm/sweeps/Streaming.h" -#include "UniformGridGPU_LatticeModel.h" - -#ifdef _MSC_VER -# pragma warning( disable : 4458 ) -#endif - -#define FUNC_PREFIX - -#ifdef WALBERLA_CXX_COMPILER_IS_GNU -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-variable" -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wshadow" -#endif - -#ifdef WALBERLA_CXX_COMPILER_IS_CLANG -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wunused-variable" -#pragma clang diagnostic ignored "-Wunused-parameter" -#pragma clang diagnostic ignored "-Wshadow" -#endif - - -using namespace std; - -namespace walberla { -namespace lbm { - -namespace internal_kernel_streamCollide { -static FUNC_PREFIX void kernel_streamCollide(double * const _data_pdfs, double * _data_pdfs_tmp, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, double omega) -{ - const double xi_1 = omega*0.166666666666667; - const double xi_5 = omega*0.0416666666666667; - for (int ctr_2 = 1; ctr_2 < _size_pdfs_2 - 1; ctr_2 += 1) - { - double * const _data_pdfs_21_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 17*_stride_pdfs_3; - double * const _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; - double * const _data_pdfs_21_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 16*_stride_pdfs_3; - double * const _data_pdfs_2m1_314 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 14*_stride_pdfs_3; - double * const _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; - double * const _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; - double * const _data_pdfs_21_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 18*_stride_pdfs_3; - double * const _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; - double * const _data_pdfs_2m1_311 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 11*_stride_pdfs_3; - double * const _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; - double * const _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; - double * const _data_pdfs_21_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 15*_stride_pdfs_3; - double * const _data_pdfs_2m1_313 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 13*_stride_pdfs_3; - double * const _data_pdfs_2m1_312 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 12*_stride_pdfs_3; - double * const _data_pdfs_2m1_35 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 5*_stride_pdfs_3; - double * const _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; - double * const _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; - double * const _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; - double * const _data_pdfs_21_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 6*_stride_pdfs_3; - double * _data_pdfs_tmp_20_30 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2; - double * _data_pdfs_tmp_20_31 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; - double * _data_pdfs_tmp_20_32 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; - double * _data_pdfs_tmp_20_33 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; - double * _data_pdfs_tmp_20_34 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; - double * _data_pdfs_tmp_20_35 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3; - double * _data_pdfs_tmp_20_36 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3; - double * _data_pdfs_tmp_20_37 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; - double * _data_pdfs_tmp_20_38 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; - double * _data_pdfs_tmp_20_39 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; - double * _data_pdfs_tmp_20_310 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; - double * _data_pdfs_tmp_20_311 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; - double * _data_pdfs_tmp_20_312 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; - double * _data_pdfs_tmp_20_313 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; - double * _data_pdfs_tmp_20_314 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; - double * _data_pdfs_tmp_20_315 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; - double * _data_pdfs_tmp_20_316 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; - double * _data_pdfs_tmp_20_317 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; - double * _data_pdfs_tmp_20_318 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; - for (int ctr_1 = 1; ctr_1 < _size_pdfs_1 - 1; ctr_1 += 1) - { - double * const _data_pdfs_21_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_317; - double * const _data_pdfs_20_39_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_39; - double * const _data_pdfs_21_316_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_316; - double * const _data_pdfs_2m1_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_314; - double * const _data_pdfs_20_310_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_310; - double * const _data_pdfs_20_38_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_38; - double * const _data_pdfs_21_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_318; - double * const _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; - double * const _data_pdfs_2m1_311_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_311; - double * const _data_pdfs_20_37_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_37; - double * const _data_pdfs_20_31_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_31; - double * const _data_pdfs_21_315_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_315; - double * const _data_pdfs_2m1_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_313; - double * const _data_pdfs_2m1_312_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_312; - double * const _data_pdfs_2m1_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_35; - double * const _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; - double * const _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; - double * const _data_pdfs_20_32_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_32; - double * const _data_pdfs_21_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_36; - double * _data_pdfs_tmp_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_30; - double * _data_pdfs_tmp_20_31_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_31; - double * _data_pdfs_tmp_20_32_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_32; - double * _data_pdfs_tmp_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_33; - double * _data_pdfs_tmp_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_34; - double * _data_pdfs_tmp_20_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_35; - double * _data_pdfs_tmp_20_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_36; - double * _data_pdfs_tmp_20_37_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_37; - double * _data_pdfs_tmp_20_38_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_38; - double * _data_pdfs_tmp_20_39_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_39; - double * _data_pdfs_tmp_20_310_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_310; - double * _data_pdfs_tmp_20_311_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_311; - double * _data_pdfs_tmp_20_312_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_312; - double * _data_pdfs_tmp_20_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_313; - double * _data_pdfs_tmp_20_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_314; - double * _data_pdfs_tmp_20_315_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_315; - double * _data_pdfs_tmp_20_316_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_316; - double * _data_pdfs_tmp_20_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_317; - double * _data_pdfs_tmp_20_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_318; - for (int ctr_0 = 1; ctr_0 < _size_pdfs_0 - 1; ctr_0 += 1) - { - const double xi_18 = -_data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; - const double xi_19 = -_data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; - const double xi_20 = -_data_pdfs_21_316_11[_stride_pdfs_0*ctr_0]; - const double vel0Term = _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; - const double vel1Term = _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0] + _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]; - const double vel2Term = _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; - const double rho = vel0Term + vel1Term + vel2Term + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0] + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0] + _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; - const double u_0 = vel0Term + xi_18 + xi_19 - _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; - const double xi_23 = (u_0*u_0); - const double u_1 = vel1Term + xi_19 + xi_20 - _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0] + _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0]; - const double xi_21 = -u_1; - const double xi_24 = (u_1*u_1); - const double u_2 = vel2Term + xi_18 + xi_20 - _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0] - _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; - const double xi_22 = -u_2; - const double xi_25 = (u_2*u_2); - const double u0Mu1 = u_0 + xi_21; - const double u0Pu1 = u_0 + u_1; - const double u1Pu2 = u_1 + u_2; - const double u1Mu2 = u_1 + xi_22; - const double u0Mu2 = u_0 + xi_22; - const double u0Pu2 = u_0 + u_2; - const double f_eq_common = rho - xi_23 - xi_24 - xi_25; - const double xi_26 = f_eq_common + rho*-0.666666666666667; - const double xi_27 = f_eq_common + rho*-0.333333333333333; - const double xi_28 = xi_25 + xi_27; - const double xi_29 = xi_23 + xi_27; - const double xi_30 = xi_24 + xi_27; - const double xi_2 = xi_24*2 + xi_26; - const double xi_3 = xi_23*2 + xi_26; - const double xi_4 = xi_25*2 + xi_26; - const double xi_6 = u0Mu1*2; - const double xi_7 = (u0Mu1*u0Mu1)*3 + xi_28; - const double xi_8 = u0Pu1*2; - const double xi_9 = (u0Pu1*u0Pu1)*3 + xi_28; - const double xi_10 = u1Pu2*2; - const double xi_11 = (u1Pu2*u1Pu2)*3 + xi_29; - const double xi_12 = u1Mu2*2; - const double xi_13 = (u1Mu2*u1Mu2)*3 + xi_29; - const double xi_14 = u0Mu2*2; - const double xi_15 = (u0Mu2*u0Mu2)*3 + xi_30; - const double xi_16 = u0Pu2*2; - const double xi_17 = (u0Pu2*u0Pu2)*3 + xi_30; - _data_pdfs_tmp_20_30_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.333333333333333 - _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; - _data_pdfs_tmp_20_31_10[_stride_pdfs_0*ctr_0] = xi_1*(u_1 + xi_2 - 6*_data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0]; - _data_pdfs_tmp_20_32_10[_stride_pdfs_0*ctr_0] = xi_1*(xi_2 + xi_21 - 6*_data_pdfs_20_32_11[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0]; - _data_pdfs_tmp_20_33_10[_stride_pdfs_0*ctr_0] = xi_1*(-u_0 + xi_3 - 6*_data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; - _data_pdfs_tmp_20_34_10[_stride_pdfs_0*ctr_0] = xi_1*(u_0 + xi_3 - 6*_data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; - _data_pdfs_tmp_20_35_10[_stride_pdfs_0*ctr_0] = xi_1*(u_2 + xi_4 - 6*_data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; - _data_pdfs_tmp_20_36_10[_stride_pdfs_0*ctr_0] = xi_1*(xi_22 + xi_4 - 6*_data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; - _data_pdfs_tmp_20_37_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_6 + xi_7 - 24*_data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; - _data_pdfs_tmp_20_38_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_8 + xi_9 - 24*_data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; - _data_pdfs_tmp_20_39_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_8 + xi_9 - 24*_data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; - _data_pdfs_tmp_20_310_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_6 + xi_7 - 24*_data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; - _data_pdfs_tmp_20_311_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_10 + xi_11 - 24*_data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]) + _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]; - _data_pdfs_tmp_20_312_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_12 + xi_13 - 24*_data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0]) + _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0]; - _data_pdfs_tmp_20_313_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_14 + xi_15 - 24*_data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; - _data_pdfs_tmp_20_314_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_16 + xi_17 - 24*_data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; - _data_pdfs_tmp_20_315_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_12 + xi_13 - 24*_data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0]) + _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0]; - _data_pdfs_tmp_20_316_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_10 + xi_11 - 24*_data_pdfs_21_316_11[_stride_pdfs_0*ctr_0]) + _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0]; - _data_pdfs_tmp_20_317_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_16 + xi_17 - 24*_data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; - _data_pdfs_tmp_20_318_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_14 + xi_15 - 24*_data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; - } - } - } -} -} -namespace internal_kernel_collide { -static FUNC_PREFIX void kernel_collide(double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, double omega) -{ - const double xi_1 = omega*0.166666666666667; - const double xi_5 = omega*0.0416666666666667; - for (int ctr_2 = 1; ctr_2 < _size_pdfs_2 - 1; ctr_2 += 1) - { - double * _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; - double * _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; - double * _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; - double * _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; - double * _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; - double * _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; - double * _data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; - double * _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; - double * _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; - double * _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; - double * _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; - double * _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; - double * _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; - double * _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; - double * _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3; - double * _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; - double * _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; - double * _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; - double * _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3; - for (int ctr_1 = 1; ctr_1 < _size_pdfs_1 - 1; ctr_1 += 1) - { - double * _data_pdfs_20_312_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_312; - double * _data_pdfs_20_31_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_31; - double * _data_pdfs_20_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_314; - double * _data_pdfs_20_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_318; - double * _data_pdfs_20_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_317; - double * _data_pdfs_20_315_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_315; - double * _data_pdfs_20_316_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_316; - double * _data_pdfs_20_311_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_311; - double * _data_pdfs_20_39_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_39; - double * _data_pdfs_20_32_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_32; - double * _data_pdfs_20_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_313; - double * _data_pdfs_20_37_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_37; - double * _data_pdfs_20_310_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_310; - double * _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; - double * _data_pdfs_20_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_35; - double * _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; - double * _data_pdfs_20_38_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_38; - double * _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; - double * _data_pdfs_20_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_36; - for (int ctr_0 = 1; ctr_0 < _size_pdfs_0 - 1; ctr_0 += 1) - { - const double Dummy_18 = _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0]; - const double Dummy_19 = _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0]; - const double Dummy_20 = _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0]; - const double Dummy_21 = _data_pdfs_20_318_10[_stride_pdfs_0*ctr_0]; - const double Dummy_22 = _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0]; - const double Dummy_23 = _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0]; - const double Dummy_24 = _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0]; - const double Dummy_25 = _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0]; - const double Dummy_26 = _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0]; - const double Dummy_27 = _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0]; - const double Dummy_28 = _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0]; - const double Dummy_29 = _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0]; - const double Dummy_30 = _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0]; - const double Dummy_31 = _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0]; - const double Dummy_32 = _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0]; - const double Dummy_33 = _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0]; - const double Dummy_34 = _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0]; - const double Dummy_35 = _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; - const double Dummy_36 = _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0]; - const double xi_18 = -Dummy_22; - const double xi_19 = -Dummy_26; - const double xi_20 = -Dummy_24; - const double vel0Term = Dummy_20 + Dummy_21 + Dummy_30 + Dummy_31 + Dummy_34; - const double vel1Term = Dummy_19 + Dummy_23 + Dummy_25 + Dummy_29; - const double vel2Term = Dummy_18 + Dummy_28 + Dummy_32; - const double rho = Dummy_22 + Dummy_24 + Dummy_26 + Dummy_27 + Dummy_33 + Dummy_35 + Dummy_36 + vel0Term + vel1Term + vel2Term; - const double u_0 = -Dummy_28 - Dummy_29 - Dummy_33 + vel0Term + xi_18 + xi_19; - const double xi_23 = (u_0*u_0); - const double u_1 = -Dummy_18 - Dummy_27 - Dummy_30 + Dummy_34 + vel1Term + xi_19 + xi_20; - const double xi_21 = -u_1; - const double xi_24 = (u_1*u_1); - const double u_2 = Dummy_20 - Dummy_21 - Dummy_23 + Dummy_25 - Dummy_36 + vel2Term + xi_18 + xi_20; - const double xi_22 = -u_2; - const double xi_25 = (u_2*u_2); - const double u0Mu1 = u_0 + xi_21; - const double u0Pu1 = u_0 + u_1; - const double u1Pu2 = u_1 + u_2; - const double u1Mu2 = u_1 + xi_22; - const double u0Mu2 = u_0 + xi_22; - const double u0Pu2 = u_0 + u_2; - const double f_eq_common = rho - xi_23 - xi_24 - xi_25; - const double xi_26 = f_eq_common + rho*-0.666666666666667; - const double xi_27 = f_eq_common + rho*-0.333333333333333; - const double xi_28 = xi_25 + xi_27; - const double xi_29 = xi_23 + xi_27; - const double xi_30 = xi_24 + xi_27; - const double xi_2 = xi_24*2 + xi_26; - const double xi_3 = xi_23*2 + xi_26; - const double xi_4 = xi_25*2 + xi_26; - const double xi_6 = u0Mu1*2; - const double xi_7 = (u0Mu1*u0Mu1)*3 + xi_28; - const double xi_8 = u0Pu1*2; - const double xi_9 = (u0Pu1*u0Pu1)*3 + xi_28; - const double xi_10 = u1Pu2*2; - const double xi_11 = (u1Pu2*u1Pu2)*3 + xi_29; - const double xi_12 = u1Mu2*2; - const double xi_13 = (u1Mu2*u1Mu2)*3 + xi_29; - const double xi_14 = u0Mu2*2; - const double xi_15 = (u0Mu2*u0Mu2)*3 + xi_30; - const double xi_16 = u0Pu2*2; - const double xi_17 = (u0Pu2*u0Pu2)*3 + xi_30; - _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] = Dummy_35 + omega*(-Dummy_35 + f_eq_common*0.333333333333333); - _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0] = Dummy_19 + xi_1*(Dummy_19*-6 + u_1 + xi_2); - _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0] = Dummy_27 + xi_1*(Dummy_27*-6 + xi_2 + xi_21); - _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0] = Dummy_33 + xi_1*(Dummy_33*-6 - u_0 + xi_3); - _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0] = Dummy_31 + xi_1*(Dummy_31*-6 + u_0 + xi_3); - _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0] = Dummy_32 + xi_1*(Dummy_32*-6 + u_2 + xi_4); - _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0] = Dummy_36 + xi_1*(Dummy_36*-6 + xi_22 + xi_4); - _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0] = Dummy_29 + xi_5*(Dummy_29*-24 - xi_6 + xi_7); - _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0] = Dummy_34 + xi_5*(Dummy_34*-24 + xi_8 + xi_9); - _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0] = Dummy_26 + xi_5*(Dummy_26*-24 - xi_8 + xi_9); - _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0] = Dummy_30 + xi_5*(Dummy_30*-24 + xi_6 + xi_7); - _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0] = Dummy_25 + xi_5*(Dummy_25*-24 + xi_10 + xi_11); - _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0] = Dummy_18 + xi_5*(Dummy_18*-24 - xi_12 + xi_13); - _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0] = Dummy_28 + xi_5*(Dummy_28*-24 - xi_14 + xi_15); - _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0] = Dummy_20 + xi_5*(Dummy_20*-24 + xi_16 + xi_17); - _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0] = Dummy_23 + xi_5*(Dummy_23*-24 + xi_12 + xi_13); - _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] = Dummy_24 + xi_5*(Dummy_24*-24 - xi_10 + xi_11); - _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] = Dummy_22 + xi_5*(Dummy_22*-24 - xi_16 + xi_17); - _data_pdfs_20_318_10[_stride_pdfs_0*ctr_0] = Dummy_21 + xi_5*(Dummy_21*-24 + xi_14 + xi_15); - } - } - } -} -} -namespace internal_kernel_stream { -static FUNC_PREFIX void kernel_stream(double * const _data_pdfs, double * _data_pdfs_tmp, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_pdfs_tmp_0, int64_t const _stride_pdfs_tmp_1, int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3) -{ - for (int ctr_2 = 1; ctr_2 < _size_pdfs_2 - 1; ctr_2 += 1) - { - double * _data_pdfs_tmp_20_30 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2; - double * const _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; - double * _data_pdfs_tmp_20_31 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + _stride_pdfs_tmp_3; - double * const _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; - double * _data_pdfs_tmp_20_32 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 2*_stride_pdfs_tmp_3; - double * const _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; - double * _data_pdfs_tmp_20_33 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 3*_stride_pdfs_tmp_3; - double * const _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; - double * _data_pdfs_tmp_20_34 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 4*_stride_pdfs_tmp_3; - double * const _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; - double * _data_pdfs_tmp_20_35 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 5*_stride_pdfs_tmp_3; - double * const _data_pdfs_2m1_35 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 5*_stride_pdfs_3; - double * _data_pdfs_tmp_20_36 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 6*_stride_pdfs_tmp_3; - double * const _data_pdfs_21_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 6*_stride_pdfs_3; - double * _data_pdfs_tmp_20_37 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 7*_stride_pdfs_tmp_3; - double * const _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; - double * _data_pdfs_tmp_20_38 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 8*_stride_pdfs_tmp_3; - double * const _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; - double * _data_pdfs_tmp_20_39 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 9*_stride_pdfs_tmp_3; - double * const _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; - double * _data_pdfs_tmp_20_310 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 10*_stride_pdfs_tmp_3; - double * const _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; - double * _data_pdfs_tmp_20_311 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 11*_stride_pdfs_tmp_3; - double * const _data_pdfs_2m1_311 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 11*_stride_pdfs_3; - double * _data_pdfs_tmp_20_312 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 12*_stride_pdfs_tmp_3; - double * const _data_pdfs_2m1_312 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 12*_stride_pdfs_3; - double * _data_pdfs_tmp_20_313 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 13*_stride_pdfs_tmp_3; - double * const _data_pdfs_2m1_313 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 13*_stride_pdfs_3; - double * _data_pdfs_tmp_20_314 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 14*_stride_pdfs_tmp_3; - double * const _data_pdfs_2m1_314 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 14*_stride_pdfs_3; - double * _data_pdfs_tmp_20_315 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 15*_stride_pdfs_tmp_3; - double * const _data_pdfs_21_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 15*_stride_pdfs_3; - double * _data_pdfs_tmp_20_316 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 16*_stride_pdfs_tmp_3; - double * const _data_pdfs_21_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 16*_stride_pdfs_3; - double * _data_pdfs_tmp_20_317 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 17*_stride_pdfs_tmp_3; - double * const _data_pdfs_21_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 17*_stride_pdfs_3; - double * _data_pdfs_tmp_20_318 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 18*_stride_pdfs_tmp_3; - double * const _data_pdfs_21_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 18*_stride_pdfs_3; - for (int ctr_1 = 1; ctr_1 < _size_pdfs_1 - 1; ctr_1 += 1) - { - double * _data_pdfs_tmp_20_30_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_30; - double * const _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; - double * _data_pdfs_tmp_20_31_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_31; - double * const _data_pdfs_20_31_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_31; - double * _data_pdfs_tmp_20_32_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_32; - double * const _data_pdfs_20_32_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_32; - double * _data_pdfs_tmp_20_33_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_33; - double * const _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; - double * _data_pdfs_tmp_20_34_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_34; - double * const _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; - double * _data_pdfs_tmp_20_35_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_35; - double * const _data_pdfs_2m1_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_35; - double * _data_pdfs_tmp_20_36_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_36; - double * const _data_pdfs_21_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_36; - double * _data_pdfs_tmp_20_37_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_37; - double * const _data_pdfs_20_37_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_37; - double * _data_pdfs_tmp_20_38_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_38; - double * const _data_pdfs_20_38_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_38; - double * _data_pdfs_tmp_20_39_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_39; - double * const _data_pdfs_20_39_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_39; - double * _data_pdfs_tmp_20_310_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_310; - double * const _data_pdfs_20_310_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_310; - double * _data_pdfs_tmp_20_311_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_311; - double * const _data_pdfs_2m1_311_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_311; - double * _data_pdfs_tmp_20_312_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_312; - double * const _data_pdfs_2m1_312_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_312; - double * _data_pdfs_tmp_20_313_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_313; - double * const _data_pdfs_2m1_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_313; - double * _data_pdfs_tmp_20_314_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_314; - double * const _data_pdfs_2m1_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_314; - double * _data_pdfs_tmp_20_315_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_315; - double * const _data_pdfs_21_315_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_315; - double * _data_pdfs_tmp_20_316_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_316; - double * const _data_pdfs_21_316_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_316; - double * _data_pdfs_tmp_20_317_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_317; - double * const _data_pdfs_21_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_317; - double * _data_pdfs_tmp_20_318_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_318; - double * const _data_pdfs_21_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_318; - for (int ctr_0 = 1; ctr_0 < _size_pdfs_0 - 1; ctr_0 += 1) - { - _data_pdfs_tmp_20_30_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; - _data_pdfs_tmp_20_31_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0]; - _data_pdfs_tmp_20_32_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0]; - _data_pdfs_tmp_20_33_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; - _data_pdfs_tmp_20_34_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; - _data_pdfs_tmp_20_35_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; - _data_pdfs_tmp_20_36_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; - _data_pdfs_tmp_20_37_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; - _data_pdfs_tmp_20_38_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; - _data_pdfs_tmp_20_39_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; - _data_pdfs_tmp_20_310_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; - _data_pdfs_tmp_20_311_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]; - _data_pdfs_tmp_20_312_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0]; - _data_pdfs_tmp_20_313_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; - _data_pdfs_tmp_20_314_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; - _data_pdfs_tmp_20_315_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0]; - _data_pdfs_tmp_20_316_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0]; - _data_pdfs_tmp_20_317_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; - _data_pdfs_tmp_20_318_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; - } - } - } -} -} - - -const real_t UniformGridGPU_LatticeModel::w[19] = { 0.333333333333333,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778 }; -const real_t UniformGridGPU_LatticeModel::wInv[19] = { 3.00000000000000,18.0000000000000,18.0000000000000,18.0000000000000,18.0000000000000,18.0000000000000,18.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000 }; - -void UniformGridGPU_LatticeModel::Sweep::streamCollide( IBlock * block, const uint_t numberOfGhostLayersToInclude ) -{ - auto pdfs = block->getData< GhostLayerField<double, 19> >(pdfsID); - GhostLayerField<double, 19> * pdfs_tmp; - // Getting temporary field pdfs_tmp - auto it = cache_pdfs_.find( pdfs ); - if( it != cache_pdfs_.end() ) - { - pdfs_tmp = *it; - } - else - { - pdfs_tmp = pdfs->cloneUninitialized(); - cache_pdfs_.insert(pdfs_tmp); - } - - - auto & lm = dynamic_cast< lbm::PdfField<UniformGridGPU_LatticeModel> * > (pdfs)->latticeModel(); - lm.configureBlock(block); - - auto & omega = lm.omega; - WALBERLA_ASSERT_GREATER_EQUAL(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(-cell_idx_c(numberOfGhostLayersToInclude) - 1, 0, 0, 0); - WALBERLA_ASSERT_GREATER_EQUAL(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); - double * _data_pdfs_tmp = pdfs_tmp->dataAt(-cell_idx_c(numberOfGhostLayersToInclude) - 1, 0, 0, 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(pdfs->xSize()) + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(pdfs->xSize()) + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(pdfs->ySize()) + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(pdfs->ySize()) + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(pdfs->zSize()) + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(pdfs->zSize()) + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - internal_kernel_streamCollide::kernel_streamCollide(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, omega); - pdfs->swapDataPointers(pdfs_tmp); - -} - -void UniformGridGPU_LatticeModel::Sweep::collide( IBlock * block, const uint_t numberOfGhostLayersToInclude ) -{ - auto pdfs = block->getData< GhostLayerField<double, 19> >(pdfsID); - - - auto & lm = dynamic_cast< lbm::PdfField<UniformGridGPU_LatticeModel> * > (pdfs)->latticeModel(); - lm.configureBlock(block); - - auto & omega = lm.omega; - WALBERLA_ASSERT_GREATER_EQUAL(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -int_c(pdfs->nrOfGhostLayers())); - double * _data_pdfs = pdfs->dataAt(-cell_idx_c(numberOfGhostLayersToInclude) - 1, 0, 0, 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(pdfs->xSize()) + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(pdfs->xSize()) + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(pdfs->ySize()) + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(pdfs->ySize()) + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(pdfs->zSize()) + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(pdfs->zSize()) + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - internal_kernel_collide::kernel_collide(_data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, omega); -} - - -void UniformGridGPU_LatticeModel::Sweep::stream( IBlock * block, const uint_t numberOfGhostLayersToInclude ) -{ - auto pdfs = block->getData< GhostLayerField<double, 19> >(pdfsID); - GhostLayerField<double, 19> * pdfs_tmp; - // Getting temporary field pdfs_tmp - auto it = cache_pdfs_.find( pdfs ); - if( it != cache_pdfs_.end() ) - { - pdfs_tmp = *it; - } - else - { - pdfs_tmp = pdfs->cloneUninitialized(); - cache_pdfs_.insert(pdfs_tmp); - } - - - WALBERLA_ASSERT_GREATER_EQUAL(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(-cell_idx_c(numberOfGhostLayersToInclude) - 1, 0, 0, 0); - WALBERLA_ASSERT_GREATER_EQUAL(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); - double * _data_pdfs_tmp = pdfs_tmp->dataAt(-cell_idx_c(numberOfGhostLayersToInclude) - 1, 0, 0, 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(pdfs->xSize()) + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(pdfs->xSize()) + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(pdfs->ySize()) + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(pdfs->ySize()) + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(pdfs->zSize()) + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(pdfs->zSize()) + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride()); - const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride()); - const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride()); - const int64_t _stride_pdfs_tmp_3 = int64_t(pdfs_tmp->fStride()); - internal_kernel_stream::kernel_stream(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3); - - pdfs->swapDataPointers(pdfs_tmp); - -} - - -} // namespace lbm -} // namespace walberla - - - - -// Buffer Packing - -namespace walberla { -namespace mpi { - -mpi::SendBuffer & operator<< (mpi::SendBuffer & buf, const ::walberla::lbm::UniformGridGPU_LatticeModel & lm) -{ - buf << lm.currentLevel; - return buf; -} - -mpi::RecvBuffer & operator>> (mpi::RecvBuffer & buf, ::walberla::lbm::UniformGridGPU_LatticeModel & lm) -{ - buf >> lm.currentLevel; - return buf; -} - - -} // namespace mpi -} // namespace walberla - -#ifdef WALBERLA_CXX_COMPILER_IS_GNU -#pragma GCC diagnostic pop -#endif - -#ifdef WALBERLA_CXX_COMPILER_IS_CLANG -#pragma clang diagnostic pop -#endif \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_LatticeModel.h b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LatticeModel.h deleted file mode 100644 index 02a6c7cf869062c38e8ad513dc3ddf69bafb2158..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_LatticeModel.h +++ /dev/null @@ -1,746 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \\author Martin Bauer <martin.bauer@fau.de> -// -//====================================================================================================================== - - -#include "core/DataTypes.h" -#include "core/logging/Logging.h" - -#include "field/GhostLayerField.h" -#include "field/SwapableCompare.h" -#include "domain_decomposition/BlockDataID.h" -#include "domain_decomposition/IBlock.h" -#include "stencil/D3Q19.h" - -#include "lbm/lattice_model/EquilibriumDistribution.h" -#include "lbm/field/Density.h" -#include "lbm/field/DensityAndMomentumDensity.h" -#include "lbm/field/DensityAndVelocity.h" -#include "lbm/field/PressureTensor.h" -#include "lbm/field/ShearRate.h" - -#include <set> - -#ifdef __GNUC__ -#define RESTRICT __restrict__ -#elif _MSC_VER -#define RESTRICT __restrict -#else -#define RESTRICT -#endif - -#ifdef WALBERLA_CXX_COMPILER_IS_GNU -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-variable" -#pragma GCC diagnostic ignored "-Wunused-parameter" -#endif - -#ifdef WALBERLA_CXX_COMPILER_IS_CLANG -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wunused-variable" -#pragma clang diagnostic ignored "-Wunused-parameter" -#endif - - - - -// Forward declarations -namespace walberla{ -namespace lbm { - class UniformGridGPU_LatticeModel; -}} -namespace walberla { -namespace mpi { - mpi::SendBuffer & operator<< (mpi::SendBuffer & buf, const ::walberla::lbm::UniformGridGPU_LatticeModel & lm); - mpi::RecvBuffer & operator>> (mpi::RecvBuffer & buf, ::walberla::lbm::UniformGridGPU_LatticeModel & lm); -}} - - - - -namespace walberla { -namespace lbm { - - -/** -UniformGridGPU_LatticeModel was generated with lbmpy. Do not edit this file directly. Instead modify UniformGridGPU_LatticeModel.py. -For details see documentation of lbmpy. - -Usage: - - Create an instance of this lattice model class: the constructor parameters vary depending on the configure - lattice model. A model with constant force needs a single force vector, while a model with variable forces needs - a force field. All constructor parameters are ordered alphabetically. - - Create a PDFField with the lattice model as template argument to store the particle distribution functions. - Use the PDFField to get and modify macroscopic values. - - The internal class UniformGridGPU_LatticeModel::Sweep is a functor to execute one LB time step. - Stream, collide steps can be executed separately, or together in an optimized stream-pull-collide scheme - -*/ -class UniformGridGPU_LatticeModel -{ - -public: - typedef stencil::D3Q19 Stencil; - typedef stencil::D3Q19 CommunicationStencil; - static const real_t w[19]; - static const real_t wInv[19]; - - static const bool compressible = false; - static const int equilibriumAccuracyOrder = 2; - - class Sweep - { - public: - Sweep( BlockDataID _pdfsID ) : pdfsID(_pdfsID) {}; - - //void stream ( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) ); - void collide ( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) ); - void streamCollide( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) ); - void stream ( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) ); - - void operator() ( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) ) - { - streamCollide( block, numberOfGhostLayersToInclude ); - } - - private: - BlockDataID pdfsID; - - std::set< GhostLayerField<double, 19> *, field::SwapableCompare< GhostLayerField<double, 19> * > > cache_pdfs_; - }; - - UniformGridGPU_LatticeModel( double omega_ ) - : omega(omega_), currentLevel(0) - {}; - - void configure( IBlock & block, StructuredBlockStorage &) { configureBlock( &block ); } - -private: - void configureBlock(IBlock * block) - { - - - - } - - // Parameters: - double omega; - - // Updated by configureBlock: - - - uint_t currentLevel; - - // Backend classes can access private members: - friend class UniformGridGPU_LatticeModel::Sweep; - template<class LM, class Enable> friend class EquilibriumDistribution; - template<class LM, class Enable> friend struct Equilibrium; - template<class LM, class Enable> friend struct internal::AdaptVelocityToForce; - template<class LM, class Enable> friend struct Density; - template<class LM> friend struct DensityAndVelocity; - template<class LM, class Enable> friend struct DensityAndMomentumDensity; - template<class LM, class Enable> friend struct MomentumDensity; - template<class LM, class It, class Enable> friend struct DensityAndVelocityRange; - - friend mpi::SendBuffer & ::walberla::mpi::operator<< (mpi::SendBuffer & , const UniformGridGPU_LatticeModel & ); - friend mpi::RecvBuffer & ::walberla::mpi::operator>> (mpi::RecvBuffer & , UniformGridGPU_LatticeModel & ); - -}; - - - - -//====================================================================================================================== -// -// Implementation of macroscopic value backend -// -//====================================================================================================================== - - - -template<> -class EquilibriumDistribution< UniformGridGPU_LatticeModel, void> -{ -public: - typedef typename UniformGridGPU_LatticeModel::Stencil Stencil; - - static real_t get( const stencil::Direction direction, - const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ), - real_t rho = real_t(1.0) ) - { - - rho -= real_t(1.0); - - - using namespace stencil; - switch( direction ) { - case C: return rho*0.333333333333333 - 0.333333333333333*(u[0]*u[0]) - 0.333333333333333*(u[1]*u[1]) - 0.333333333333333*(u[2]*u[2]); - case N: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*u[1]; - case S: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*u[1] - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]); - case W: return rho*0.0555555555555556 - 0.166666666666667*u[0] - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]); - case E: return rho*0.0555555555555556 - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]) + 0.166666666666667*u[0]; - case T: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*u[2]; - case B: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*u[2] + 0.166666666666667*(u[2]*u[2]); - case NW: return rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1]; - case NE: return rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.25*u[0]*u[1]; - case SW: return rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.0833333333333333*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.25*u[0]*u[1]; - case SE: return rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[1]*u[1]); - case TN: return rho*0.0277777777777778 + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2] + 0.25*u[1]*u[2]; - case TS: return rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2]; - case TW: return rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2]; - case TE: return rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2] + 0.25*u[0]*u[2]; - case BN: return rho*0.0277777777777778 - 0.0833333333333333*u[2] - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.0833333333333333*(u[2]*u[2]); - case BS: return rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.0833333333333333*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[1]*u[2]; - case BW: return rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.0833333333333333*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[0]*u[2]; - case BE: return rho*0.0277777777777778 - 0.0833333333333333*u[2] - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[2]*u[2]); - default: - WALBERLA_ABORT("Invalid Direction"); - } - - } - - static real_t getSymmetricPart( const stencil::Direction direction, - const Vector3<real_t> & u = Vector3< real_t >(real_t(0.0)), - real_t rho = real_t(1.0) ) - { - - rho -= real_t(1.0); - - - using namespace stencil; - switch( direction ) { - case C: return rho*0.333333333333333 - 0.333333333333333*(u[0]*u[0]) - 0.333333333333333*(u[1]*u[1]) - 0.333333333333333*(u[2]*u[2]); - case N: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]); - case S: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]); - case W: return rho*0.0555555555555556 - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]); - case E: return rho*0.0555555555555556 - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]); - case T: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*(u[2]*u[2]); - case B: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*(u[2]*u[2]); - case NW: return rho*0.0277777777777778 - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]); - case NE: return rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.25*u[0]*u[1]; - case SW: return rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.25*u[0]*u[1]; - case SE: return rho*0.0277777777777778 - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]); - case TN: return rho*0.0277777777777778 + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[1]*u[2]; - case TS: return rho*0.0277777777777778 - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]); - case TW: return rho*0.0277777777777778 - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]); - case TE: return rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[0]*u[2]; - case BN: return rho*0.0277777777777778 - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]); - case BS: return rho*0.0277777777777778 + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[1]*u[2]; - case BW: return rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[0]*u[2]; - case BE: return rho*0.0277777777777778 - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]); - default: - WALBERLA_ABORT("Invalid Direction"); - } - - } - - static real_t getAsymmetricPart( const stencil::Direction direction, - const Vector3< real_t > & u = Vector3<real_t>( real_t(0.0) ), - real_t rho = real_t(1.0) ) - { - - rho -= real_t(1.0); - - - using namespace stencil; - switch( direction ) { - case C: return 0; - case N: return 0.166666666666667*u[1]; - case S: return -0.166666666666667*u[1]; - case W: return -0.166666666666667*u[0]; - case E: return 0.166666666666667*u[0]; - case T: return 0.166666666666667*u[2]; - case B: return -0.166666666666667*u[2]; - case NW: return -0.0833333333333333*u[0] + 0.0833333333333333*u[1]; - case NE: return 0.0833333333333333*u[0] + 0.0833333333333333*u[1]; - case SW: return -0.0833333333333333*u[0] - 0.0833333333333333*u[1]; - case SE: return -0.0833333333333333*u[1] + 0.0833333333333333*u[0]; - case TN: return 0.0833333333333333*u[1] + 0.0833333333333333*u[2]; - case TS: return -0.0833333333333333*u[1] + 0.0833333333333333*u[2]; - case TW: return -0.0833333333333333*u[0] + 0.0833333333333333*u[2]; - case TE: return 0.0833333333333333*u[0] + 0.0833333333333333*u[2]; - case BN: return -0.0833333333333333*u[2] + 0.0833333333333333*u[1]; - case BS: return -0.0833333333333333*u[1] - 0.0833333333333333*u[2]; - case BW: return -0.0833333333333333*u[0] - 0.0833333333333333*u[2]; - case BE: return -0.0833333333333333*u[2] + 0.0833333333333333*u[0]; - default: - WALBERLA_ABORT("Invalid Direction"); - } - - } - - static std::vector< real_t > get( const Vector3< real_t > & u = Vector3<real_t>( real_t(0.0) ), - real_t rho = real_t(1.0) ) - { - - rho -= real_t(1.0); - - - std::vector< real_t > equilibrium( Stencil::Size ); - for( auto d = Stencil::begin(); d != Stencil::end(); ++d ) - { - equilibrium[d.toIdx()] = get(*d, u, rho); - } - return equilibrium; - } -}; - - -namespace internal { - -template<> -struct AdaptVelocityToForce<UniformGridGPU_LatticeModel, void> -{ - template< typename FieldPtrOrIterator > - static Vector3<real_t> get( FieldPtrOrIterator & it, const UniformGridGPU_LatticeModel & lm, - const Vector3< real_t > & velocity, const real_t rho ) - { - auto x = it.x(); - auto y = it.y(); - auto z = it.z(); - - return velocity; - - } - - static Vector3<real_t> get( const cell_idx_t x, const cell_idx_t y, const cell_idx_t z, const UniformGridGPU_LatticeModel & lm, - const Vector3< real_t > & velocity, const real_t rho ) - { - - return velocity; - - } -}; -} // namespace internal - - - -template<> -struct Equilibrium< UniformGridGPU_LatticeModel, void > -{ - - template< typename FieldPtrOrIterator > - static void set( FieldPtrOrIterator & it, - const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ), real_t rho = real_t(1.0) ) - { - - rho -= real_t(1.0); - - - it[0] = rho*0.333333333333333 - 0.333333333333333*(u[0]*u[0]) - 0.333333333333333*(u[1]*u[1]) - 0.333333333333333*(u[2]*u[2]); - it[1] = rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*u[1]; - it[2] = rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*u[1] - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]); - it[3] = rho*0.0555555555555556 - 0.166666666666667*u[0] - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]); - it[4] = rho*0.0555555555555556 - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]) + 0.166666666666667*u[0]; - it[5] = rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*u[2]; - it[6] = rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*u[2] + 0.166666666666667*(u[2]*u[2]); - it[7] = rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1]; - it[8] = rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.25*u[0]*u[1]; - it[9] = rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.0833333333333333*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.25*u[0]*u[1]; - it[10] = rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[1]*u[1]); - it[11] = rho*0.0277777777777778 + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2] + 0.25*u[1]*u[2]; - it[12] = rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2]; - it[13] = rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2]; - it[14] = rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2] + 0.25*u[0]*u[2]; - it[15] = rho*0.0277777777777778 - 0.0833333333333333*u[2] - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.0833333333333333*(u[2]*u[2]); - it[16] = rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.0833333333333333*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[1]*u[2]; - it[17] = rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.0833333333333333*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[0]*u[2]; - it[18] = rho*0.0277777777777778 - 0.0833333333333333*u[2] - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[2]*u[2]); - } - - template< typename PdfField_T > - static void set( PdfField_T & pdf, const cell_idx_t x, const cell_idx_t y, const cell_idx_t z, - const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ), real_t rho = real_t(1.0) ) - { - - rho -= real_t(1.0); - - - real_t & xyz0 = pdf(x,y,z,0); - pdf.getF( &xyz0, 0)= rho*0.333333333333333 - 0.333333333333333*(u[0]*u[0]) - 0.333333333333333*(u[1]*u[1]) - 0.333333333333333*(u[2]*u[2]); - pdf.getF( &xyz0, 1)= rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*u[1]; - pdf.getF( &xyz0, 2)= rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*u[1] - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]); - pdf.getF( &xyz0, 3)= rho*0.0555555555555556 - 0.166666666666667*u[0] - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]); - pdf.getF( &xyz0, 4)= rho*0.0555555555555556 - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]) + 0.166666666666667*u[0]; - pdf.getF( &xyz0, 5)= rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*u[2]; - pdf.getF( &xyz0, 6)= rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*u[2] + 0.166666666666667*(u[2]*u[2]); - pdf.getF( &xyz0, 7)= rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1]; - pdf.getF( &xyz0, 8)= rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.25*u[0]*u[1]; - pdf.getF( &xyz0, 9)= rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.0833333333333333*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.25*u[0]*u[1]; - pdf.getF( &xyz0, 10)= rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[1]*u[1]); - pdf.getF( &xyz0, 11)= rho*0.0277777777777778 + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2] + 0.25*u[1]*u[2]; - pdf.getF( &xyz0, 12)= rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2]; - pdf.getF( &xyz0, 13)= rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2]; - pdf.getF( &xyz0, 14)= rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2] + 0.25*u[0]*u[2]; - pdf.getF( &xyz0, 15)= rho*0.0277777777777778 - 0.0833333333333333*u[2] - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.0833333333333333*(u[2]*u[2]); - pdf.getF( &xyz0, 16)= rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.0833333333333333*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[1]*u[2]; - pdf.getF( &xyz0, 17)= rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.0833333333333333*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[0]*u[2]; - pdf.getF( &xyz0, 18)= rho*0.0277777777777778 - 0.0833333333333333*u[2] - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[2]*u[2]); - } -}; - - -template<> -struct Density<UniformGridGPU_LatticeModel, void> -{ - template< typename FieldPtrOrIterator > - static inline real_t get( const UniformGridGPU_LatticeModel & , const FieldPtrOrIterator & it ) - { - const real_t f_0 = it[0]; - const real_t f_1 = it[1]; - const real_t f_2 = it[2]; - const real_t f_3 = it[3]; - const real_t f_4 = it[4]; - const real_t f_5 = it[5]; - const real_t f_6 = it[6]; - const real_t f_7 = it[7]; - const real_t f_8 = it[8]; - const real_t f_9 = it[9]; - const real_t f_10 = it[10]; - const real_t f_11 = it[11]; - const real_t f_12 = it[12]; - const real_t f_13 = it[13]; - const real_t f_14 = it[14]; - const real_t f_15 = it[15]; - const real_t f_16 = it[16]; - const real_t f_17 = it[17]; - const real_t f_18 = it[18]; - const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; - const double vel1Term = f_1 + f_11 + f_15 + f_7; - const double vel2Term = f_12 + f_13 + f_5; - const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term + 1; - return rho; - } - - template< typename PdfField_T > - static inline real_t get( const UniformGridGPU_LatticeModel & , - const PdfField_T & pdf, const cell_idx_t x, const cell_idx_t y, const cell_idx_t z ) - { - const real_t & xyz0 = pdf(x,y,z,0); - const real_t f_0 = pdf.getF( &xyz0, 0); - const real_t f_1 = pdf.getF( &xyz0, 1); - const real_t f_2 = pdf.getF( &xyz0, 2); - const real_t f_3 = pdf.getF( &xyz0, 3); - const real_t f_4 = pdf.getF( &xyz0, 4); - const real_t f_5 = pdf.getF( &xyz0, 5); - const real_t f_6 = pdf.getF( &xyz0, 6); - const real_t f_7 = pdf.getF( &xyz0, 7); - const real_t f_8 = pdf.getF( &xyz0, 8); - const real_t f_9 = pdf.getF( &xyz0, 9); - const real_t f_10 = pdf.getF( &xyz0, 10); - const real_t f_11 = pdf.getF( &xyz0, 11); - const real_t f_12 = pdf.getF( &xyz0, 12); - const real_t f_13 = pdf.getF( &xyz0, 13); - const real_t f_14 = pdf.getF( &xyz0, 14); - const real_t f_15 = pdf.getF( &xyz0, 15); - const real_t f_16 = pdf.getF( &xyz0, 16); - const real_t f_17 = pdf.getF( &xyz0, 17); - const real_t f_18 = pdf.getF( &xyz0, 18); - const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; - const double vel1Term = f_1 + f_11 + f_15 + f_7; - const double vel2Term = f_12 + f_13 + f_5; - const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term + 1; - return rho; - } -}; - - -template<> -struct DensityAndVelocity<UniformGridGPU_LatticeModel> -{ - template< typename FieldPtrOrIterator > - static void set( FieldPtrOrIterator & it, const UniformGridGPU_LatticeModel & lm, - const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ), const real_t rho_in = real_t(1.0) ) - { - auto x = it.x(); - auto y = it.y(); - auto z = it.z(); - - const double rho = rho_in - 1; - const double u_0 = u[0]; - const double u_1 = u[1]; - const double u_2 = u[2]; - - - Equilibrium<UniformGridGPU_LatticeModel>::set(it, Vector3<real_t>(u_0, u_1, u_2), rho + real_t(1) ); - } - - template< typename PdfField_T > - static void set( PdfField_T & pdf, const cell_idx_t x, const cell_idx_t y, const cell_idx_t z, const UniformGridGPU_LatticeModel & lm, - const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ), const real_t rho_in = real_t(1.0) ) - { - const double rho = rho_in - 1; - const double u_0 = u[0]; - const double u_1 = u[1]; - const double u_2 = u[2]; - - - Equilibrium<UniformGridGPU_LatticeModel>::set(pdf, x, y, z, Vector3<real_t>(u_0, u_1, u_2), rho + real_t(1) ); - } -}; - - -template<typename FieldIteratorXYZ > -struct DensityAndVelocityRange<UniformGridGPU_LatticeModel, FieldIteratorXYZ> -{ - - static void set( FieldIteratorXYZ & begin, const FieldIteratorXYZ & end, const UniformGridGPU_LatticeModel & lm, - const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ), const real_t rho_in = real_t(1.0) ) - { - for( auto cellIt = begin; cellIt != end; ++cellIt ) - { - const auto x = cellIt.x(); - const auto y = cellIt.y(); - const auto z = cellIt.z(); - const double rho = rho_in - 1; - const double u_0 = u[0]; - const double u_1 = u[1]; - const double u_2 = u[2]; - - - Equilibrium<UniformGridGPU_LatticeModel>::set(cellIt, Vector3<real_t>(u_0, u_1, u_2), rho + real_t(1) ); - } - } -}; - - - -template<> -struct DensityAndMomentumDensity<UniformGridGPU_LatticeModel> -{ - template< typename FieldPtrOrIterator > - static real_t get( Vector3< real_t > & momentumDensity, const UniformGridGPU_LatticeModel & lm, - const FieldPtrOrIterator & it ) - { - const auto x = it.x(); - const auto y = it.y(); - const auto z = it.z(); - - const real_t f_0 = it[0]; - const real_t f_1 = it[1]; - const real_t f_2 = it[2]; - const real_t f_3 = it[3]; - const real_t f_4 = it[4]; - const real_t f_5 = it[5]; - const real_t f_6 = it[6]; - const real_t f_7 = it[7]; - const real_t f_8 = it[8]; - const real_t f_9 = it[9]; - const real_t f_10 = it[10]; - const real_t f_11 = it[11]; - const real_t f_12 = it[12]; - const real_t f_13 = it[13]; - const real_t f_14 = it[14]; - const real_t f_15 = it[15]; - const real_t f_16 = it[16]; - const real_t f_17 = it[17]; - const real_t f_18 = it[18]; - const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; - const double vel1Term = f_1 + f_11 + f_15 + f_7; - const double vel2Term = f_12 + f_13 + f_5; - const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term + 1; - const double md_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term; - const double md_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term; - const double md_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term; - momentumDensity[0] = md_0; - momentumDensity[1] = md_1; - momentumDensity[2] = md_2; - - return rho; - } - - template< typename PdfField_T > - static real_t get( Vector3< real_t > & momentumDensity, const UniformGridGPU_LatticeModel & lm, const PdfField_T & pdf, - const cell_idx_t x, const cell_idx_t y, const cell_idx_t z ) - { - const real_t & xyz0 = pdf(x,y,z,0); - const real_t f_0 = pdf.getF( &xyz0, 0); - const real_t f_1 = pdf.getF( &xyz0, 1); - const real_t f_2 = pdf.getF( &xyz0, 2); - const real_t f_3 = pdf.getF( &xyz0, 3); - const real_t f_4 = pdf.getF( &xyz0, 4); - const real_t f_5 = pdf.getF( &xyz0, 5); - const real_t f_6 = pdf.getF( &xyz0, 6); - const real_t f_7 = pdf.getF( &xyz0, 7); - const real_t f_8 = pdf.getF( &xyz0, 8); - const real_t f_9 = pdf.getF( &xyz0, 9); - const real_t f_10 = pdf.getF( &xyz0, 10); - const real_t f_11 = pdf.getF( &xyz0, 11); - const real_t f_12 = pdf.getF( &xyz0, 12); - const real_t f_13 = pdf.getF( &xyz0, 13); - const real_t f_14 = pdf.getF( &xyz0, 14); - const real_t f_15 = pdf.getF( &xyz0, 15); - const real_t f_16 = pdf.getF( &xyz0, 16); - const real_t f_17 = pdf.getF( &xyz0, 17); - const real_t f_18 = pdf.getF( &xyz0, 18); - const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; - const double vel1Term = f_1 + f_11 + f_15 + f_7; - const double vel2Term = f_12 + f_13 + f_5; - const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term + 1; - const double md_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term; - const double md_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term; - const double md_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term; - momentumDensity[0] = md_0; - momentumDensity[1] = md_1; - momentumDensity[2] = md_2; - - return rho; - } -}; - - -template<> -struct MomentumDensity< UniformGridGPU_LatticeModel> -{ - template< typename FieldPtrOrIterator > - static void get( Vector3< real_t > & momentumDensity, const UniformGridGPU_LatticeModel & lm, const FieldPtrOrIterator & it ) - { - const auto x = it.x(); - const auto y = it.y(); - const auto z = it.z(); - - const real_t f_0 = it[0]; - const real_t f_1 = it[1]; - const real_t f_2 = it[2]; - const real_t f_3 = it[3]; - const real_t f_4 = it[4]; - const real_t f_5 = it[5]; - const real_t f_6 = it[6]; - const real_t f_7 = it[7]; - const real_t f_8 = it[8]; - const real_t f_9 = it[9]; - const real_t f_10 = it[10]; - const real_t f_11 = it[11]; - const real_t f_12 = it[12]; - const real_t f_13 = it[13]; - const real_t f_14 = it[14]; - const real_t f_15 = it[15]; - const real_t f_16 = it[16]; - const real_t f_17 = it[17]; - const real_t f_18 = it[18]; - const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; - const double vel1Term = f_1 + f_11 + f_15 + f_7; - const double vel2Term = f_12 + f_13 + f_5; - const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term + 1; - const double md_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term; - const double md_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term; - const double md_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term; - momentumDensity[0] = md_0; - momentumDensity[1] = md_1; - momentumDensity[2] = md_2; - - } - - template< typename PdfField_T > - static void get( Vector3< real_t > & momentumDensity, const UniformGridGPU_LatticeModel & lm, const PdfField_T & pdf, - const cell_idx_t x, const cell_idx_t y, const cell_idx_t z ) - { - const real_t & xyz0 = pdf(x,y,z,0); - const real_t f_0 = pdf.getF( &xyz0, 0); - const real_t f_1 = pdf.getF( &xyz0, 1); - const real_t f_2 = pdf.getF( &xyz0, 2); - const real_t f_3 = pdf.getF( &xyz0, 3); - const real_t f_4 = pdf.getF( &xyz0, 4); - const real_t f_5 = pdf.getF( &xyz0, 5); - const real_t f_6 = pdf.getF( &xyz0, 6); - const real_t f_7 = pdf.getF( &xyz0, 7); - const real_t f_8 = pdf.getF( &xyz0, 8); - const real_t f_9 = pdf.getF( &xyz0, 9); - const real_t f_10 = pdf.getF( &xyz0, 10); - const real_t f_11 = pdf.getF( &xyz0, 11); - const real_t f_12 = pdf.getF( &xyz0, 12); - const real_t f_13 = pdf.getF( &xyz0, 13); - const real_t f_14 = pdf.getF( &xyz0, 14); - const real_t f_15 = pdf.getF( &xyz0, 15); - const real_t f_16 = pdf.getF( &xyz0, 16); - const real_t f_17 = pdf.getF( &xyz0, 17); - const real_t f_18 = pdf.getF( &xyz0, 18); - const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; - const double vel1Term = f_1 + f_11 + f_15 + f_7; - const double vel2Term = f_12 + f_13 + f_5; - const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term + 1; - const double md_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term; - const double md_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term; - const double md_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term; - momentumDensity[0] = md_0; - momentumDensity[1] = md_1; - momentumDensity[2] = md_2; - - } -}; - - -template<> -struct PressureTensor<UniformGridGPU_LatticeModel> -{ - template< typename FieldPtrOrIterator > - static void get( Matrix3< real_t > & /* pressureTensor */, const UniformGridGPU_LatticeModel & /* latticeModel */, const FieldPtrOrIterator & /* it */ ) - { - WALBERLA_ABORT("Not implemented"); - } - - template< typename PdfField_T > - static void get( Matrix3< real_t > & /* pressureTensor */, const UniformGridGPU_LatticeModel & /* latticeModel */, const PdfField_T & /* pdf */, - const cell_idx_t /* x */, const cell_idx_t /* y */, const cell_idx_t /* z */ ) - { - WALBERLA_ABORT("Not implemented"); - } -}; - - -template<> -struct ShearRate<UniformGridGPU_LatticeModel> -{ - template< typename FieldPtrOrIterator > - static inline real_t get( const UniformGridGPU_LatticeModel & /* latticeModel */, const FieldPtrOrIterator & /* it */, - const Vector3< real_t > & /* velocity */, const real_t /* rho */) - { - WALBERLA_ABORT("Not implemented"); - return real_t(0.0); - } - - template< typename PdfField_T > - static inline real_t get( const UniformGridGPU_LatticeModel & latticeModel, - const PdfField_T & /* pdf */, const cell_idx_t /* x */, const cell_idx_t /* y */, const cell_idx_t /* z */, - const Vector3< real_t > & /* velocity */, const real_t /* rho */ ) - { - WALBERLA_ABORT("Not implemented"); - return real_t(0.0); - } - - static inline real_t get( const std::vector< real_t > & /* nonEquilibrium */, const real_t /* relaxationParam */, - const real_t /* rho */ = real_t(1) ) - { - WALBERLA_ABORT("Not implemented"); - return real_t(0.0); - } -}; - - -} // namespace lbm -} // namespace walberla - - - -#ifdef WALBERLA_CXX_COMPILER_IS_GNU -#pragma GCC diagnostic pop -#endif - -#ifdef WALBERLA_CXX_COMPILER_IS_CLANG -#pragma clang diagnostic pop -#endif \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_LbKernel.cu b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LbKernel.cu deleted file mode 100644 index a650f8b3edae3b607ec4bb8f13329cab33e862ef..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_LbKernel.cu +++ /dev/null @@ -1,324 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \\file .cpp -//! \\ingroup lbm -//! \\author lbmpy -//====================================================================================================================== - -#include <cmath> - -#include "core/DataTypes.h" -#include "core/Macros.h" -#include "UniformGridGPU_LbKernel.h" - - -#define FUNC_PREFIX __global__ - -#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wfloat-equal" -# pragma GCC diagnostic ignored "-Wshadow" -# pragma GCC diagnostic ignored "-Wconversion" -#endif - -using namespace std; - -namespace walberla { -namespace pystencils { - -namespace internal_UniformGridGPU_LbKernel { -static FUNC_PREFIX void UniformGridGPU_LbKernel(double * const _data_pdfs, double * _data_pdfs_tmp, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, double omega) -{ - if (blockDim.x*blockIdx.x + threadIdx.x + 1 < _size_pdfs_0 - 1 && blockDim.y*blockIdx.y + threadIdx.y + 1 < _size_pdfs_1 - 1 && blockDim.z*blockIdx.z + threadIdx.z + 1 < _size_pdfs_2 - 1) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x + 1; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y + 1; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z + 1; - double * const _data_pdfs_10_21_317 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 17*_stride_pdfs_3; - const double xi_18 = -_data_pdfs_10_21_317[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; - double * const _data_pdfs_11_20_39 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; - const double xi_19 = -_data_pdfs_11_20_39[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; - double * const _data_pdfs_11_21_316 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 16*_stride_pdfs_3; - const double xi_20 = -_data_pdfs_11_21_316[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_2m1_314 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 14*_stride_pdfs_3; - double * const _data_pdfs_11_20_310 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; - double * const _data_pdfs_1m1_20_38 = _data_pdfs + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; - double * const _data_pdfs_10_21_318 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 18*_stride_pdfs_3; - double * const _data_pdfs_10_20_34 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; - const double vel0Term = _data_pdfs_10_20_34[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_10_21_318[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_10_2m1_314[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_11_20_310[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_1m1_20_38[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; - double * const _data_pdfs_1m1_2m1_311 = _data_pdfs + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 11*_stride_pdfs_3; - double * const _data_pdfs_1m1_20_37 = _data_pdfs + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; - double * const _data_pdfs_1m1_20_31 = _data_pdfs + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; - double * const _data_pdfs_1m1_21_315 = _data_pdfs + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 15*_stride_pdfs_3; - const double vel1Term = _data_pdfs_1m1_20_31[_stride_pdfs_0*ctr_0] + _data_pdfs_1m1_20_37[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_1m1_21_315[_stride_pdfs_0*ctr_0] + _data_pdfs_1m1_2m1_311[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_2m1_313 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 13*_stride_pdfs_3; - double * const _data_pdfs_11_2m1_312 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 12*_stride_pdfs_3; - double * const _data_pdfs_10_2m1_35 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 5*_stride_pdfs_3; - const double vel2Term = _data_pdfs_10_2m1_313[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_10_2m1_35[_stride_pdfs_0*ctr_0] + _data_pdfs_11_2m1_312[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_30 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2; - double * const _data_pdfs_10_20_33 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; - double * const _data_pdfs_11_20_32 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; - double * const _data_pdfs_10_21_36 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 6*_stride_pdfs_3; - const double rho = vel0Term + vel1Term + vel2Term + _data_pdfs_10_20_30[_stride_pdfs_0*ctr_0] + _data_pdfs_10_20_33[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_10_21_317[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_10_21_36[_stride_pdfs_0*ctr_0] + _data_pdfs_11_20_32[_stride_pdfs_0*ctr_0] + _data_pdfs_11_20_39[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_11_21_316[_stride_pdfs_0*ctr_0]; - const double u_0 = vel0Term + xi_18 + xi_19 - _data_pdfs_10_20_33[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - _data_pdfs_10_2m1_313[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - _data_pdfs_1m1_20_37[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; - const double xi_23 = (u_0*u_0); - const double u_1 = vel1Term + xi_19 + xi_20 - _data_pdfs_11_20_310[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - _data_pdfs_11_20_32[_stride_pdfs_0*ctr_0] - _data_pdfs_11_2m1_312[_stride_pdfs_0*ctr_0] + _data_pdfs_1m1_20_38[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; - const double xi_21 = -u_1; - const double xi_24 = (u_1*u_1); - const double u_2 = vel2Term + xi_18 + xi_20 - _data_pdfs_10_21_318[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - _data_pdfs_10_21_36[_stride_pdfs_0*ctr_0] + _data_pdfs_10_2m1_314[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - _data_pdfs_1m1_21_315[_stride_pdfs_0*ctr_0] + _data_pdfs_1m1_2m1_311[_stride_pdfs_0*ctr_0]; - const double xi_22 = -u_2; - const double xi_25 = (u_2*u_2); - const double u0Mu1 = u_0 + xi_21; - const double u0Pu1 = u_0 + u_1; - const double u1Pu2 = u_1 + u_2; - const double u1Mu2 = u_1 + xi_22; - const double u0Mu2 = u_0 + xi_22; - const double u0Pu2 = u_0 + u_2; - const double f_eq_common = rho - xi_23 - xi_24 - xi_25; - const double xi_26 = f_eq_common + rho*-0.666666666666667; - const double xi_27 = f_eq_common + rho*-0.333333333333333; - const double xi_28 = xi_25 + xi_27; - const double xi_29 = xi_23 + xi_27; - const double xi_30 = xi_24 + xi_27; - const double xi_2 = xi_24*2 + xi_26; - const double xi_3 = xi_23*2 + xi_26; - const double xi_4 = xi_25*2 + xi_26; - const double xi_6 = u0Mu1*2; - const double xi_7 = (u0Mu1*u0Mu1)*3 + xi_28; - const double xi_8 = u0Pu1*2; - const double xi_9 = (u0Pu1*u0Pu1)*3 + xi_28; - const double xi_10 = u1Pu2*2; - const double xi_11 = (u1Pu2*u1Pu2)*3 + xi_29; - const double xi_12 = u1Mu2*2; - const double xi_13 = (u1Mu2*u1Mu2)*3 + xi_29; - const double xi_14 = u0Mu2*2; - const double xi_15 = (u0Mu2*u0Mu2)*3 + xi_30; - const double xi_16 = u0Pu2*2; - const double xi_17 = (u0Pu2*u0Pu2)*3 + xi_30; - const double xi_1 = omega*0.166666666666667; - const double xi_5 = omega*0.0416666666666667; - double * _data_pdfs_tmp_10_20_30 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2; - _data_pdfs_tmp_10_20_30[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.333333333333333 - _data_pdfs_10_20_30[_stride_pdfs_0*ctr_0]) + _data_pdfs_10_20_30[_stride_pdfs_0*ctr_0]; - double * _data_pdfs_tmp_10_20_31 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; - _data_pdfs_tmp_10_20_31[_stride_pdfs_0*ctr_0] = xi_1*(u_1 + xi_2 - 6*_data_pdfs_1m1_20_31[_stride_pdfs_0*ctr_0]) + _data_pdfs_1m1_20_31[_stride_pdfs_0*ctr_0]; - double * _data_pdfs_tmp_10_20_32 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; - _data_pdfs_tmp_10_20_32[_stride_pdfs_0*ctr_0] = xi_1*(xi_2 + xi_21 - 6*_data_pdfs_11_20_32[_stride_pdfs_0*ctr_0]) + _data_pdfs_11_20_32[_stride_pdfs_0*ctr_0]; - double * _data_pdfs_tmp_10_20_33 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; - _data_pdfs_tmp_10_20_33[_stride_pdfs_0*ctr_0] = xi_1*(-u_0 + xi_3 - 6*_data_pdfs_10_20_33[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_10_20_33[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; - double * _data_pdfs_tmp_10_20_34 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; - _data_pdfs_tmp_10_20_34[_stride_pdfs_0*ctr_0] = xi_1*(u_0 + xi_3 - 6*_data_pdfs_10_20_34[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_10_20_34[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; - double * _data_pdfs_tmp_10_20_35 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3; - _data_pdfs_tmp_10_20_35[_stride_pdfs_0*ctr_0] = xi_1*(u_2 + xi_4 - 6*_data_pdfs_10_2m1_35[_stride_pdfs_0*ctr_0]) + _data_pdfs_10_2m1_35[_stride_pdfs_0*ctr_0]; - double * _data_pdfs_tmp_10_20_36 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3; - _data_pdfs_tmp_10_20_36[_stride_pdfs_0*ctr_0] = xi_1*(xi_22 + xi_4 - 6*_data_pdfs_10_21_36[_stride_pdfs_0*ctr_0]) + _data_pdfs_10_21_36[_stride_pdfs_0*ctr_0]; - double * _data_pdfs_tmp_10_20_37 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; - _data_pdfs_tmp_10_20_37[_stride_pdfs_0*ctr_0] = xi_5*(-xi_6 + xi_7 - 24*_data_pdfs_1m1_20_37[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_1m1_20_37[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; - double * _data_pdfs_tmp_10_20_38 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; - _data_pdfs_tmp_10_20_38[_stride_pdfs_0*ctr_0] = xi_5*(xi_8 + xi_9 - 24*_data_pdfs_1m1_20_38[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_1m1_20_38[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; - double * _data_pdfs_tmp_10_20_39 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; - _data_pdfs_tmp_10_20_39[_stride_pdfs_0*ctr_0] = xi_5*(-xi_8 + xi_9 - 24*_data_pdfs_11_20_39[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_11_20_39[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; - double * _data_pdfs_tmp_10_20_310 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; - _data_pdfs_tmp_10_20_310[_stride_pdfs_0*ctr_0] = xi_5*(xi_6 + xi_7 - 24*_data_pdfs_11_20_310[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_11_20_310[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; - double * _data_pdfs_tmp_10_20_311 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; - _data_pdfs_tmp_10_20_311[_stride_pdfs_0*ctr_0] = xi_5*(xi_10 + xi_11 - 24*_data_pdfs_1m1_2m1_311[_stride_pdfs_0*ctr_0]) + _data_pdfs_1m1_2m1_311[_stride_pdfs_0*ctr_0]; - double * _data_pdfs_tmp_10_20_312 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; - _data_pdfs_tmp_10_20_312[_stride_pdfs_0*ctr_0] = xi_5*(-xi_12 + xi_13 - 24*_data_pdfs_11_2m1_312[_stride_pdfs_0*ctr_0]) + _data_pdfs_11_2m1_312[_stride_pdfs_0*ctr_0]; - double * _data_pdfs_tmp_10_20_313 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; - _data_pdfs_tmp_10_20_313[_stride_pdfs_0*ctr_0] = xi_5*(-xi_14 + xi_15 - 24*_data_pdfs_10_2m1_313[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_10_2m1_313[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; - double * _data_pdfs_tmp_10_20_314 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; - _data_pdfs_tmp_10_20_314[_stride_pdfs_0*ctr_0] = xi_5*(xi_16 + xi_17 - 24*_data_pdfs_10_2m1_314[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_10_2m1_314[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; - double * _data_pdfs_tmp_10_20_315 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; - _data_pdfs_tmp_10_20_315[_stride_pdfs_0*ctr_0] = xi_5*(xi_12 + xi_13 - 24*_data_pdfs_1m1_21_315[_stride_pdfs_0*ctr_0]) + _data_pdfs_1m1_21_315[_stride_pdfs_0*ctr_0]; - double * _data_pdfs_tmp_10_20_316 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; - _data_pdfs_tmp_10_20_316[_stride_pdfs_0*ctr_0] = xi_5*(-xi_10 + xi_11 - 24*_data_pdfs_11_21_316[_stride_pdfs_0*ctr_0]) + _data_pdfs_11_21_316[_stride_pdfs_0*ctr_0]; - double * _data_pdfs_tmp_10_20_317 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; - _data_pdfs_tmp_10_20_317[_stride_pdfs_0*ctr_0] = xi_5*(-xi_16 + xi_17 - 24*_data_pdfs_10_21_317[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_10_21_317[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; - double * _data_pdfs_tmp_10_20_318 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; - _data_pdfs_tmp_10_20_318[_stride_pdfs_0*ctr_0] = xi_5*(xi_14 + xi_15 - 24*_data_pdfs_10_21_318[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_10_21_318[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; - } -} -} - -void UniformGridGPU_LbKernel::operator() ( IBlock * block , cudaStream_t stream ) -{ - auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID); - cuda::GPUField<double> * pdfs_tmp; - // Getting temporary field pdfs_tmp - auto it = cache_pdfs_.find( pdfs ); - if( it != cache_pdfs_.end() ) - { - pdfs_tmp = *it; - } - else - { - pdfs_tmp = pdfs->cloneUninitialized(); - cache_pdfs_.insert(pdfs_tmp); - } - - WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(-1, 0, 0, 0); - WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs_tmp->nrOfGhostLayers())); - double * _data_pdfs_tmp = pdfs_tmp->dataAt(-1, 0, 0, 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(pdfs->xSize()) + 2)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(pdfs->xSize()) + 2); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(pdfs->ySize()) + 2)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(pdfs->ySize()) + 2); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(pdfs->zSize()) + 2)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(pdfs->zSize()) + 2); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)), int(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)), int(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2))); - dim3 _grid(int(( (_size_pdfs_0 - 2) % (((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) == 0 ? (int64_t)(_size_pdfs_0 - 2) / (int64_t)(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) : ( (int64_t)(_size_pdfs_0 - 2) / (int64_t)(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) ) +1 )), int(( (_size_pdfs_1 - 2) % (((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) == 0 ? (int64_t)(_size_pdfs_1 - 2) / (int64_t)(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) : ( (int64_t)(_size_pdfs_1 - 2) / (int64_t)(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) ) +1 )), int(( (_size_pdfs_2 - 2) % (((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2)) == 0 ? (int64_t)(_size_pdfs_2 - 2) / (int64_t)(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2)) : ( (int64_t)(_size_pdfs_2 - 2) / (int64_t)(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2)) ) +1 ))); - internal_UniformGridGPU_LbKernel::UniformGridGPU_LbKernel<<<_grid, _block, 0, stream>>>(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, omega); - pdfs->swapDataPointers(pdfs_tmp); - -} - - - -void UniformGridGPU_LbKernel::inner( IBlock * block , cudaStream_t stream ) -{ - auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID); - cuda::GPUField<double> * pdfs_tmp; - // Getting temporary field pdfs_tmp - auto it = cache_pdfs_.find( pdfs ); - if( it != cache_pdfs_.end() ) - { - pdfs_tmp = *it; - } - else - { - pdfs_tmp = pdfs->cloneUninitialized(); - cache_pdfs_.insert(pdfs_tmp); - } - - - CellInterval inner = pdfs->xyzSize(); - inner.expand(-1); - - WALBERLA_ASSERT_GREATER_EQUAL(inner.xMin() - 1, -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(inner.yMin() - 1, -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(inner.zMin() - 1, -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(inner.xMin() - 1, inner.yMin() - 1, inner.zMin() - 1, 0); - WALBERLA_ASSERT_GREATER_EQUAL(inner.xMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(inner.yMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(inner.zMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); - double * _data_pdfs_tmp = pdfs_tmp->dataAt(inner.xMin() - 1, inner.yMin() - 1, inner.zMin() - 1, 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(inner.xSize()) + 2)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(inner.xSize()) + 2); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(inner.ySize()) + 2)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(inner.ySize()) + 2); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(inner.zSize()) + 2)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(inner.zSize()) + 2); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)), int(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)), int(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2))); - dim3 _grid(int(( (_size_pdfs_0 - 2) % (((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) == 0 ? (int64_t)(_size_pdfs_0 - 2) / (int64_t)(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) : ( (int64_t)(_size_pdfs_0 - 2) / (int64_t)(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) ) +1 )), int(( (_size_pdfs_1 - 2) % (((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) == 0 ? (int64_t)(_size_pdfs_1 - 2) / (int64_t)(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) : ( (int64_t)(_size_pdfs_1 - 2) / (int64_t)(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) ) +1 )), int(( (_size_pdfs_2 - 2) % (((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2)) == 0 ? (int64_t)(_size_pdfs_2 - 2) / (int64_t)(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2)) : ( (int64_t)(_size_pdfs_2 - 2) / (int64_t)(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2)) ) +1 ))); - internal_UniformGridGPU_LbKernel::UniformGridGPU_LbKernel<<<_grid, _block, 0, stream>>>(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, omega); -} - - -void UniformGridGPU_LbKernel::outer( IBlock * block , cudaStream_t stream ) -{ - static std::vector<CellInterval> layers; - - auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID); - cuda::GPUField<double> * pdfs_tmp; - // Getting temporary field pdfs_tmp - auto it = cache_pdfs_.find( pdfs ); - if( it != cache_pdfs_.end() ) - { - pdfs_tmp = *it; - } - else - { - pdfs_tmp = pdfs->cloneUninitialized(); - cache_pdfs_.insert(pdfs_tmp); - } - - - if( layers.size() == 0 ) - { - CellInterval ci; - - pdfs->getSliceBeforeGhostLayer(stencil::T, ci, 1, false); - layers.push_back(ci); - pdfs->getSliceBeforeGhostLayer(stencil::B, ci, 1, false); - layers.push_back(ci); - - pdfs->getSliceBeforeGhostLayer(stencil::N, ci, 1, false); - ci.expand(Cell(0, 0, -1)); - layers.push_back(ci); - pdfs->getSliceBeforeGhostLayer(stencil::S, ci, 1, false); - ci.expand(Cell(0, 0, -1)); - layers.push_back(ci); - - pdfs->getSliceBeforeGhostLayer(stencil::E, ci, 1, false); - ci.expand(Cell(0, -1, -1)); - layers.push_back(ci); - pdfs->getSliceBeforeGhostLayer(stencil::W, ci, 1, false); - ci.expand(Cell(0, -1, -1)); - layers.push_back(ci); - } - - - { - auto parallelSection_ = parallelStreams_.parallelSection( stream ); - for( auto & ci: layers ) - { - parallelSection_.run([&]( auto s ) { - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0); - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); - double * _data_pdfs_tmp = pdfs_tmp->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 2)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 2); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 2)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 2); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 2)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 2); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)), int(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)), int(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2))); - dim3 _grid(int(( (_size_pdfs_0 - 2) % (((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) == 0 ? (int64_t)(_size_pdfs_0 - 2) / (int64_t)(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) : ( (int64_t)(_size_pdfs_0 - 2) / (int64_t)(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) ) +1 )), int(( (_size_pdfs_1 - 2) % (((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) == 0 ? (int64_t)(_size_pdfs_1 - 2) / (int64_t)(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) : ( (int64_t)(_size_pdfs_1 - 2) / (int64_t)(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) ) +1 )), int(( (_size_pdfs_2 - 2) % (((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2)) == 0 ? (int64_t)(_size_pdfs_2 - 2) / (int64_t)(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2)) : ( (int64_t)(_size_pdfs_2 - 2) / (int64_t)(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2)) ) +1 ))); - internal_UniformGridGPU_LbKernel::UniformGridGPU_LbKernel<<<_grid, _block, 0, s>>>(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, omega); - }); - } - } - - - pdfs->swapDataPointers(pdfs_tmp); - -} - - -} // namespace pystencils -} // namespace walberla - - -#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) -# pragma GCC diagnostic pop -#endif \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_LbKernel.h b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LbKernel.h deleted file mode 100644 index def06eb548c5ad007b9408dd56e178ce9609a5b0..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_LbKernel.h +++ /dev/null @@ -1,91 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \\file UniformGridGPU_LbKernel.h -//! \\author pystencils -//====================================================================================================================== - -#include "core/DataTypes.h" - -#include "cuda/GPUField.h" -#include "cuda/ParallelStreams.h" -#include "field/SwapableCompare.h" -#include "domain_decomposition/BlockDataID.h" -#include "domain_decomposition/IBlock.h" - -#include <set> - -#ifdef __GNUC__ -#define RESTRICT __restrict__ -#elif _MSC_VER -#define RESTRICT __restrict -#else -#define RESTRICT -#endif - -#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wunused-parameter" -#endif - -namespace walberla { -namespace pystencils { - - -class UniformGridGPU_LbKernel -{ -public: - UniformGridGPU_LbKernel( BlockDataID pdfsID_, double omega_) - : pdfsID(pdfsID_), omega(omega_) - {}; - - - ~UniformGridGPU_LbKernel() { - for(auto p: cache_pdfs_) { - delete p; - } - } - - - - void operator() ( IBlock * block , cudaStream_t stream = 0 ); - - void inner( IBlock * block , cudaStream_t stream = 0 ); - void outer( IBlock * block , cudaStream_t stream = 0 ); - - void setOuterPriority(int priority ) { - - parallelStreams_.setStreamPriority(priority); - - } -private: - BlockDataID pdfsID; - double omega; - - std::set< cuda::GPUField<double> *, field::SwapableCompare< cuda::GPUField<double> * > > cache_pdfs_; - - - cuda::ParallelStreams parallelStreams_; - -}; - - -} // namespace pystencils -} // namespace walberla - - -#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) -# pragma GCC diagnostic pop -#endif \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_NoSlip.cu b/apps/benchmarks/UniformGridGPU/UniformGridGPU_NoSlip.cu deleted file mode 100644 index acabe1d0c9a5fa59aa15a9f32b15a41b25190a27..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_NoSlip.cu +++ /dev/null @@ -1,132 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \\file UniformGridGPU_NoSlip.cpp -//! \\ingroup lbm -//! \\author lbmpy -//====================================================================================================================== - -#include <cmath> - -#include "core/DataTypes.h" -#include "core/Macros.h" -#include "UniformGridGPU_NoSlip.h" -#include "cuda/ErrorChecking.h" - - -#define FUNC_PREFIX __global__ - -using namespace std; - -namespace walberla { -namespace lbm { - -#ifdef __GNUC__ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wunused-variable" -#pragma GCC diagnostic ignored "-Wconversion" -#endif - -#ifdef __CUDACC__ -#pragma push -#pragma diag_suppress = declared_but_not_referenced -#endif - - -namespace internal_boundary_UniformGridGPU_NoSlip { -static FUNC_PREFIX void boundary_UniformGridGPU_NoSlip(uint8_t * const _data_indexVector, double * _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t indexVectorSize) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < indexVectorSize) - { - uint8_t * const _data_indexVector_10 = _data_indexVector; - const int32_t x = *((int32_t *)(& _data_indexVector_10[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); - uint8_t * const _data_indexVector_14 = _data_indexVector + 4; - const int32_t y = *((int32_t *)(& _data_indexVector_14[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); - uint8_t * const _data_indexVector_18 = _data_indexVector + 8; - const int32_t z = *((int32_t *)(& _data_indexVector_18[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); - - - const int64_t cx [] = { 0, 0, 0, -1, 1, 0, 0, -1, 1, -1, 1, 0, 0, -1, 1, 0, 0, -1, 1 }; - const int64_t cy [] = { 0, 1, -1, 0, 0, 0, 0, 1, 1, -1, -1, 1, -1, 0, 0, 1, -1, 0, 0 }; - const int64_t cz [] = { 0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, -1 }; - const int invdir [] = { 0, 2, 1, 4, 3, 6, 5, 10, 9, 8, 7, 16, 15, 18, 17, 12, 11, 14, 13 }; - - - const double weights [] = { 0.333333333333333,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778 }; - - uint8_t * const _data_indexVector_112 = _data_indexVector + 12; - const int32_t dir = *((int32_t *)(& _data_indexVector_112[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); - double * _data_pdfsf9cc34cc4e2b6261 = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_1*cy[dir] + _stride_pdfs_2*z + _stride_pdfs_2*cz[dir] + _stride_pdfs_3*invdir[dir]; - double * _data_pdfs_10_2011ac6bf6446d4afa = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3*dir; - _data_pdfsf9cc34cc4e2b6261[_stride_pdfs_0*x + _stride_pdfs_0*cx[dir]] = _data_pdfs_10_2011ac6bf6446d4afa[_stride_pdfs_0*x]; - } -} -} - -#ifdef __GNUC__ -#pragma GCC diagnostic pop -#endif - -#ifdef __CUDACC__ -#pragma pop -#endif - - -void UniformGridGPU_NoSlip::run( IBlock * block, IndexVectors::Type type , cudaStream_t stream ) -{ - auto * indexVectors = block->getData<IndexVectors>(indexVectorID); - - auto pointer = indexVectors->pointerGpu(type); - - - int64_t indexVectorSize = int64_c( indexVectors->indexVector(type).size() ); - if( indexVectorSize == 0) - return; - - uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); - - auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID); - - WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers())); - double * _data_pdfs = pdfs->dataAt(0, 0, 0, 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((256 < indexVectorSize) ? 256 : indexVectorSize)), int(1), int(1)); - dim3 _grid(int(( (indexVectorSize) % (((256 < indexVectorSize) ? 256 : indexVectorSize)) == 0 ? (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) : ( (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) ) +1 )), int(1), int(1)); - internal_boundary_UniformGridGPU_NoSlip::boundary_UniformGridGPU_NoSlip<<<_grid, _block, 0, stream>>>(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); -} - -void UniformGridGPU_NoSlip::operator() ( IBlock * block, cudaStream_t stream ) -{ - run( block, IndexVectors::ALL, stream ); -} - -void UniformGridGPU_NoSlip::inner( IBlock * block, cudaStream_t stream ) -{ - run( block, IndexVectors::INNER, stream ); -} - -void UniformGridGPU_NoSlip::outer( IBlock * block, cudaStream_t stream ) -{ - run( block, IndexVectors::OUTER, stream ); -} - - -} // namespace lbm -} // namespace walberla - diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_NoSlip.h b/apps/benchmarks/UniformGridGPU/UniformGridGPU_NoSlip.h deleted file mode 100644 index fa64a19841e63f8c1a579e420a819f5bd4644153..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_NoSlip.h +++ /dev/null @@ -1,364 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \\file UniformGridGPU_NoSlip.h -//! \\author pystencils -//====================================================================================================================== - - -#include "core/DataTypes.h" - -#include "cuda/GPUField.h" -#include "domain_decomposition/BlockDataID.h" -#include "domain_decomposition/IBlock.h" -#include "blockforest/StructuredBlockForest.h" -#include "field/FlagField.h" - -#include <set> -#include <vector> - -#ifdef __GNUC__ -#define RESTRICT __restrict__ -#elif _MSC_VER -#define RESTRICT __restrict -#else -#define RESTRICT -#endif - -namespace walberla { -namespace lbm { - - -class UniformGridGPU_NoSlip -{ -public: - struct IndexInfo { - int32_t x; - int32_t y; - int32_t z; - int32_t dir; - IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {} - bool operator==(const IndexInfo & o) const { - return x == o.x && y == o.y && z == o.z && dir == o.dir; - } - }; - - - - class IndexVectors - { - public: - using CpuIndexVector = std::vector<IndexInfo>; - - enum Type { - ALL = 0, - INNER = 1, - OUTER = 2, - NUM_TYPES = 3 - }; - - IndexVectors() : cpuVectors_(NUM_TYPES) {} - bool operator==(IndexVectors & other) { return other.cpuVectors_ == cpuVectors_; } - - ~IndexVectors() { - for( auto & gpuVec: gpuVectors_) - cudaFree( gpuVec ); - } - - - CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } - IndexInfo * pointerCpu(Type t) { return &(cpuVectors_[t][0]); } - - IndexInfo * pointerGpu(Type t) { return gpuVectors_[t]; } - - - void syncGPU() - { - gpuVectors_.resize( cpuVectors_.size() ); - for(size_t i=0; i < size_t(NUM_TYPES); ++i ) - { - auto & gpuVec = gpuVectors_[i]; - auto & cpuVec = cpuVectors_[i]; - cudaFree( gpuVec ); - cudaMalloc( &gpuVec, sizeof(IndexInfo) * cpuVec.size() ); - cudaMemcpy( gpuVec, &cpuVec[0], sizeof(IndexInfo) * cpuVec.size(), cudaMemcpyHostToDevice ); - } - } - - private: - std::vector<CpuIndexVector> cpuVectors_; - - using GpuIndexVector = IndexInfo *; - std::vector<GpuIndexVector> gpuVectors_; - - }; - - - UniformGridGPU_NoSlip( const shared_ptr<StructuredBlockForest> & blocks, - BlockDataID pdfsID_ ) - : pdfsID(pdfsID_) - { - auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; - indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_UniformGridGPU_NoSlip"); - }; - - void operator() ( IBlock * block , cudaStream_t stream = 0 ); - void inner( IBlock * block , cudaStream_t stream = 0 ); - void outer( IBlock * block , cudaStream_t stream = 0 ); - - - template<typename FlagField_T> - void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, - FlagUID boundaryFlagUID, FlagUID domainFlagUID) - { - for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) - fillFromFlagField<FlagField_T>( &*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); - } - - - template<typename FlagField_T> - void fillFromFlagField( IBlock * block, ConstBlockDataID flagFieldID, - FlagUID boundaryFlagUID, FlagUID domainFlagUID ) - { - auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); - auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); - auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); - auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER); - - - auto * flagField = block->getData< FlagField_T > ( flagFieldID ); - - auto boundaryFlag = flagField->getFlag(boundaryFlagUID); - auto domainFlag = flagField->getFlag(domainFlagUID); - - auto inner = flagField->xyzSize(); - inner.expand( cell_idx_t(-1) ); - - - indexVectorAll.clear(); - indexVectorInner.clear(); - indexVectorOuter.clear(); - - for( auto it = flagField->begin(); it != flagField->end(); ++it ) - { - if( ! isFlagSet(it, domainFlag) ) - continue; - if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 0 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 1 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 2 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 3 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 5 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 6 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 7 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 9 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 11 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 12 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 13 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 15 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 16 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 17 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - } - - indexVectors->syncGPU(); - } - -private: - void run( IBlock * block, IndexVectors::Type type, cudaStream_t stream = 0 ); - - BlockDataID indexVectorID; - - BlockDataID pdfsID; -}; - - - -} // namespace lbm -} // namespace walberla \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_PackInfo.cu b/apps/benchmarks/UniformGridGPU/UniformGridGPU_PackInfo.cu deleted file mode 100644 index 27df1f0d920ad163063462c604c849578e10caee..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_PackInfo.cu +++ /dev/null @@ -1,1656 +0,0 @@ -#include "stencil/Directions.h" -#include "core/cell/CellInterval.h" -#include "cuda/GPUField.h" -#include "core/DataTypes.h" -#include "UniformGridGPU_PackInfo.h" - - -#define FUNC_PREFIX __global__ - - -namespace walberla { -namespace pystencils { - -using walberla::cell::CellInterval; -using walberla::stencil::Direction; - - - -namespace internal_pack_SW { -static FUNC_PREFIX void pack_SW(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * const _data_pdfs_10_20_39 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_39[_stride_pdfs_0*ctr_0]; - } -} -} - -namespace internal_pack_BW { -static FUNC_PREFIX void pack_BW(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * const _data_pdfs_10_20_317 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_317[_stride_pdfs_0*ctr_0]; - } -} -} - -namespace internal_pack_W { -static FUNC_PREFIX void pack_W(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * const _data_pdfs_10_20_313 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x] = _data_pdfs_10_20_313[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_317 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1] = _data_pdfs_10_20_317[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_33 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2] = _data_pdfs_10_20_33[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_37 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3] = _data_pdfs_10_20_37[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_39 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4] = _data_pdfs_10_20_39[_stride_pdfs_0*ctr_0]; - } -} -} - -namespace internal_pack_TW { -static FUNC_PREFIX void pack_TW(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * const _data_pdfs_10_20_313 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_313[_stride_pdfs_0*ctr_0]; - } -} -} - -namespace internal_pack_NW { -static FUNC_PREFIX void pack_NW(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * const _data_pdfs_10_20_37 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_37[_stride_pdfs_0*ctr_0]; - } -} -} - -namespace internal_pack_BS { -static FUNC_PREFIX void pack_BS(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * const _data_pdfs_10_20_316 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_316[_stride_pdfs_0*ctr_0]; - } -} -} - -namespace internal_pack_S { -static FUNC_PREFIX void pack_S(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * const _data_pdfs_10_20_310 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x] = _data_pdfs_10_20_310[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_312 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1] = _data_pdfs_10_20_312[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_316 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2] = _data_pdfs_10_20_316[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_32 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3] = _data_pdfs_10_20_32[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_39 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4] = _data_pdfs_10_20_39[_stride_pdfs_0*ctr_0]; - } -} -} - -namespace internal_pack_TS { -static FUNC_PREFIX void pack_TS(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * const _data_pdfs_10_20_312 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_312[_stride_pdfs_0*ctr_0]; - } -} -} - -namespace internal_pack_B { -static FUNC_PREFIX void pack_B(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * const _data_pdfs_10_20_315 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x] = _data_pdfs_10_20_315[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_316 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1] = _data_pdfs_10_20_316[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_317 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2] = _data_pdfs_10_20_317[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_318 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3] = _data_pdfs_10_20_318[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_36 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4] = _data_pdfs_10_20_36[_stride_pdfs_0*ctr_0]; - } -} -} - -namespace internal_pack_C { -static FUNC_PREFIX void pack_C(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * const _data_pdfs_10_20_30 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_30[_stride_pdfs_0*ctr_0]; - } -} -} - -namespace internal_pack_T { -static FUNC_PREFIX void pack_T(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * const _data_pdfs_10_20_311 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x] = _data_pdfs_10_20_311[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_312 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1] = _data_pdfs_10_20_312[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_313 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2] = _data_pdfs_10_20_313[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_314 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3] = _data_pdfs_10_20_314[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_35 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4] = _data_pdfs_10_20_35[_stride_pdfs_0*ctr_0]; - } -} -} - -namespace internal_pack_BN { -static FUNC_PREFIX void pack_BN(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * const _data_pdfs_10_20_315 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_315[_stride_pdfs_0*ctr_0]; - } -} -} - -namespace internal_pack_N { -static FUNC_PREFIX void pack_N(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * const _data_pdfs_10_20_31 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x] = _data_pdfs_10_20_31[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_311 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1] = _data_pdfs_10_20_311[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_315 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2] = _data_pdfs_10_20_315[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_37 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3] = _data_pdfs_10_20_37[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_38 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4] = _data_pdfs_10_20_38[_stride_pdfs_0*ctr_0]; - } -} -} - -namespace internal_pack_TN { -static FUNC_PREFIX void pack_TN(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * const _data_pdfs_10_20_311 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_311[_stride_pdfs_0*ctr_0]; - } -} -} - -namespace internal_pack_SE { -static FUNC_PREFIX void pack_SE(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * const _data_pdfs_10_20_310 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_310[_stride_pdfs_0*ctr_0]; - } -} -} - -namespace internal_pack_BE { -static FUNC_PREFIX void pack_BE(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * const _data_pdfs_10_20_318 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_318[_stride_pdfs_0*ctr_0]; - } -} -} - -namespace internal_pack_E { -static FUNC_PREFIX void pack_E(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * const _data_pdfs_10_20_310 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x] = _data_pdfs_10_20_310[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_314 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1] = _data_pdfs_10_20_314[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_318 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2] = _data_pdfs_10_20_318[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_34 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3] = _data_pdfs_10_20_34[_stride_pdfs_0*ctr_0]; - double * const _data_pdfs_10_20_38 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4] = _data_pdfs_10_20_38[_stride_pdfs_0*ctr_0]; - } -} -} - -namespace internal_pack_TE { -static FUNC_PREFIX void pack_TE(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * const _data_pdfs_10_20_314 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_314[_stride_pdfs_0*ctr_0]; - } -} -} - -namespace internal_pack_NE { -static FUNC_PREFIX void pack_NE(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * const _data_pdfs_10_20_38 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; - _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_38[_stride_pdfs_0*ctr_0]; - } -} -} - - - -namespace internal_unpack_NE { -static FUNC_PREFIX void unpack_NE(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * _data_pdfs_10_20_39 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; - _data_pdfs_10_20_39[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; - } -} -} - -namespace internal_unpack_TE { -static FUNC_PREFIX void unpack_TE(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * _data_pdfs_10_20_317 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; - _data_pdfs_10_20_317[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; - } -} -} - -namespace internal_unpack_E { -static FUNC_PREFIX void unpack_E(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * _data_pdfs_10_20_313 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; - _data_pdfs_10_20_313[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x]; - double * _data_pdfs_10_20_317 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; - _data_pdfs_10_20_317[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1]; - double * _data_pdfs_10_20_33 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; - _data_pdfs_10_20_33[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2]; - double * _data_pdfs_10_20_37 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; - _data_pdfs_10_20_37[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3]; - double * _data_pdfs_10_20_39 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; - _data_pdfs_10_20_39[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4]; - } -} -} - -namespace internal_unpack_BE { -static FUNC_PREFIX void unpack_BE(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * _data_pdfs_10_20_313 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; - _data_pdfs_10_20_313[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; - } -} -} - -namespace internal_unpack_SE { -static FUNC_PREFIX void unpack_SE(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * _data_pdfs_10_20_37 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; - _data_pdfs_10_20_37[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; - } -} -} - -namespace internal_unpack_TN { -static FUNC_PREFIX void unpack_TN(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * _data_pdfs_10_20_316 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; - _data_pdfs_10_20_316[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; - } -} -} - -namespace internal_unpack_N { -static FUNC_PREFIX void unpack_N(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * _data_pdfs_10_20_310 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; - _data_pdfs_10_20_310[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x]; - double * _data_pdfs_10_20_312 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; - _data_pdfs_10_20_312[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1]; - double * _data_pdfs_10_20_316 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; - _data_pdfs_10_20_316[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2]; - double * _data_pdfs_10_20_32 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; - _data_pdfs_10_20_32[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3]; - double * _data_pdfs_10_20_39 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; - _data_pdfs_10_20_39[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4]; - } -} -} - -namespace internal_unpack_BN { -static FUNC_PREFIX void unpack_BN(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * _data_pdfs_10_20_312 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; - _data_pdfs_10_20_312[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; - } -} -} - -namespace internal_unpack_T { -static FUNC_PREFIX void unpack_T(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * _data_pdfs_10_20_315 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; - _data_pdfs_10_20_315[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x]; - double * _data_pdfs_10_20_316 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; - _data_pdfs_10_20_316[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1]; - double * _data_pdfs_10_20_317 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; - _data_pdfs_10_20_317[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2]; - double * _data_pdfs_10_20_318 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; - _data_pdfs_10_20_318[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3]; - double * _data_pdfs_10_20_36 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3; - _data_pdfs_10_20_36[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4]; - } -} -} - -namespace internal_unpack_C { -static FUNC_PREFIX void unpack_C(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * _data_pdfs_10_20_30 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2; - _data_pdfs_10_20_30[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; - } -} -} - -namespace internal_unpack_B { -static FUNC_PREFIX void unpack_B(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * _data_pdfs_10_20_311 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; - _data_pdfs_10_20_311[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x]; - double * _data_pdfs_10_20_312 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; - _data_pdfs_10_20_312[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1]; - double * _data_pdfs_10_20_313 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; - _data_pdfs_10_20_313[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2]; - double * _data_pdfs_10_20_314 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; - _data_pdfs_10_20_314[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3]; - double * _data_pdfs_10_20_35 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3; - _data_pdfs_10_20_35[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4]; - } -} -} - -namespace internal_unpack_TS { -static FUNC_PREFIX void unpack_TS(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * _data_pdfs_10_20_315 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; - _data_pdfs_10_20_315[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; - } -} -} - -namespace internal_unpack_S { -static FUNC_PREFIX void unpack_S(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * _data_pdfs_10_20_31 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; - _data_pdfs_10_20_31[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x]; - double * _data_pdfs_10_20_311 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; - _data_pdfs_10_20_311[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1]; - double * _data_pdfs_10_20_315 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; - _data_pdfs_10_20_315[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2]; - double * _data_pdfs_10_20_37 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; - _data_pdfs_10_20_37[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3]; - double * _data_pdfs_10_20_38 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; - _data_pdfs_10_20_38[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4]; - } -} -} - -namespace internal_unpack_BS { -static FUNC_PREFIX void unpack_BS(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * _data_pdfs_10_20_311 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; - _data_pdfs_10_20_311[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; - } -} -} - -namespace internal_unpack_NW { -static FUNC_PREFIX void unpack_NW(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * _data_pdfs_10_20_310 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; - _data_pdfs_10_20_310[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; - } -} -} - -namespace internal_unpack_TW { -static FUNC_PREFIX void unpack_TW(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * _data_pdfs_10_20_318 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; - _data_pdfs_10_20_318[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; - } -} -} - -namespace internal_unpack_W { -static FUNC_PREFIX void unpack_W(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * _data_pdfs_10_20_310 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; - _data_pdfs_10_20_310[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x]; - double * _data_pdfs_10_20_314 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; - _data_pdfs_10_20_314[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1]; - double * _data_pdfs_10_20_318 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; - _data_pdfs_10_20_318[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2]; - double * _data_pdfs_10_20_34 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; - _data_pdfs_10_20_34[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3]; - double * _data_pdfs_10_20_38 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; - _data_pdfs_10_20_38[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4]; - } -} -} - -namespace internal_unpack_BW { -static FUNC_PREFIX void unpack_BW(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * _data_pdfs_10_20_314 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; - _data_pdfs_10_20_314[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; - } -} -} - -namespace internal_unpack_SW { -static FUNC_PREFIX void unpack_SW(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) - { - const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; - const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; - const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; - double * _data_pdfs_10_20_38 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; - _data_pdfs_10_20_38[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; - } -} -} - - - - -void UniformGridGPU_PackInfo::pack(Direction dir, unsigned char * byte_buffer, IBlock * block, cudaStream_t stream) -{ - double * buffer = reinterpret_cast<double*>(byte_buffer); - - auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID); - - CellInterval ci; - pdfs->getSliceBeforeGhostLayer(dir, ci, 1, false); - - switch( dir ) - { - case stencil::SW: - { - double * _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_pack_SW::pack_SW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::BW: - { - double * _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_pack_BW::pack_BW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::W: - { - double * _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_pack_W::pack_W<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::TW: - { - double * _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_pack_TW::pack_TW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::NW: - { - double * _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_pack_NW::pack_NW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::BS: - { - double * _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_pack_BS::pack_BS<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::S: - { - double * _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_pack_S::pack_S<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::TS: - { - double * _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_pack_TS::pack_TS<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::B: - { - double * _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_pack_B::pack_B<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::C: - { - double * _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_pack_C::pack_C<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2); - break; - } - - case stencil::T: - { - double * _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_pack_T::pack_T<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::BN: - { - double * _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_pack_BN::pack_BN<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::N: - { - double * _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_pack_N::pack_N<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::TN: - { - double * _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_pack_TN::pack_TN<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::SE: - { - double * _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_pack_SE::pack_SE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::BE: - { - double * _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_pack_BE::pack_BE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::E: - { - double * _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_pack_E::pack_E<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::TE: - { - double * _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_pack_TE::pack_TE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::NE: - { - double * _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_pack_NE::pack_NE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - - default: - WALBERLA_ASSERT(false); - } -} - - -void UniformGridGPU_PackInfo::unpack(Direction dir, unsigned char * byte_buffer, IBlock * block, cudaStream_t stream) -{ - double * buffer = reinterpret_cast<double*>(byte_buffer); - - auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID); - - CellInterval ci; - pdfs->getGhostRegion(dir, ci, 1, false); - - switch( dir ) - { - case stencil::NE: - { - double * const _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_unpack_NE::unpack_NE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::TE: - { - double * const _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_unpack_TE::unpack_TE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::E: - { - double * const _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_unpack_E::unpack_E<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::BE: - { - double * const _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_unpack_BE::unpack_BE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::SE: - { - double * const _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_unpack_SE::unpack_SE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::TN: - { - double * const _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_unpack_TN::unpack_TN<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::N: - { - double * const _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_unpack_N::unpack_N<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::BN: - { - double * const _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_unpack_BN::unpack_BN<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::T: - { - double * const _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_unpack_T::unpack_T<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::C: - { - double * const _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_unpack_C::unpack_C<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2); - break; - } - - case stencil::B: - { - double * const _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_unpack_B::unpack_B<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::TS: - { - double * const _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_unpack_TS::unpack_TS<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::S: - { - double * const _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_unpack_S::unpack_S<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::BS: - { - double * const _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_unpack_BS::unpack_BS<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::NW: - { - double * const _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_unpack_NW::unpack_NW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::TW: - { - double * const _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_unpack_TW::unpack_TW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::W: - { - double * const _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_unpack_W::unpack_W<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::BW: - { - double * const _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_unpack_BW::unpack_BW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - case stencil::SW: - { - double * const _data_buffer = buffer; - WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); - WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); - double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(cell_idx_c(ci.xSize()) + 0)); - const int64_t _size_pdfs_0 = int64_t(cell_idx_c(ci.xSize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(cell_idx_c(ci.ySize()) + 0)); - const int64_t _size_pdfs_1 = int64_t(cell_idx_c(ci.ySize()) + 0); - WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(cell_idx_c(ci.zSize()) + 0)); - const int64_t _size_pdfs_2 = int64_t(cell_idx_c(ci.zSize()) + 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); - dim3 _grid(int(( (_size_pdfs_0) % (((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % (((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % (((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); - internal_unpack_SW::unpack_SW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); - break; - } - - - default: - WALBERLA_ASSERT(false); - } -} - - -uint_t UniformGridGPU_PackInfo::size(stencil::Direction dir, IBlock * block) -{ - auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID); - - CellInterval ci; - pdfs->getGhostRegion(dir, ci, 1, false); - - uint_t elementsPerCell = 0; - - switch( dir ) - { - case stencil::SW: - elementsPerCell = 1; - break; - - case stencil::BW: - elementsPerCell = 1; - break; - - case stencil::W: - elementsPerCell = 5; - break; - - case stencil::TW: - elementsPerCell = 1; - break; - - case stencil::NW: - elementsPerCell = 1; - break; - - case stencil::BS: - elementsPerCell = 1; - break; - - case stencil::S: - elementsPerCell = 5; - break; - - case stencil::TS: - elementsPerCell = 1; - break; - - case stencil::B: - elementsPerCell = 5; - break; - - case stencil::C: - elementsPerCell = 1; - break; - - case stencil::T: - elementsPerCell = 5; - break; - - case stencil::BN: - elementsPerCell = 1; - break; - - case stencil::N: - elementsPerCell = 5; - break; - - case stencil::TN: - elementsPerCell = 1; - break; - - case stencil::SE: - elementsPerCell = 1; - break; - - case stencil::BE: - elementsPerCell = 1; - break; - - case stencil::E: - elementsPerCell = 5; - break; - - case stencil::TE: - elementsPerCell = 1; - break; - - case stencil::NE: - elementsPerCell = 1; - break; - - default: - elementsPerCell = 0; - } - return ci.numCells() * elementsPerCell * sizeof( double ); -} - - - -} // namespace pystencils -} // namespace walberla \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_PackInfo.h b/apps/benchmarks/UniformGridGPU/UniformGridGPU_PackInfo.h deleted file mode 100644 index c68a7b063fd2585cead948d771e7f8e012fccbda..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_PackInfo.h +++ /dev/null @@ -1,34 +0,0 @@ -#include "stencil/Directions.h" -#include "core/cell/CellInterval.h" -#include "cuda/GPUField.h" -#include "core/DataTypes.h" -#include "domain_decomposition/IBlock.h" -#include "cuda/communication/GeneratedGPUPackInfo.h" - - -#define FUNC_PREFIX __global__ - - -namespace walberla { -namespace pystencils { - - -class UniformGridGPU_PackInfo : public ::walberla::cuda::GeneratedGPUPackInfo -{ -public: - UniformGridGPU_PackInfo( BlockDataID pdfsID_ ) - : pdfsID(pdfsID_) - {}; - virtual ~UniformGridGPU_PackInfo() {} - - virtual void pack (stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream); - virtual void unpack(stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream); - virtual uint_t size (stencil::Direction dir, IBlock * block); - -private: - BlockDataID pdfsID; -}; - - -} // namespace pystencils -} // namespace walberla \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_UBB.cu b/apps/benchmarks/UniformGridGPU/UniformGridGPU_UBB.cu deleted file mode 100644 index b6fcbbe3cadcf4093451d091f60c5dae181f0db5..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_UBB.cu +++ /dev/null @@ -1,132 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \\file UniformGridGPU_UBB.cpp -//! \\ingroup lbm -//! \\author lbmpy -//====================================================================================================================== - -#include <cmath> - -#include "core/DataTypes.h" -#include "core/Macros.h" -#include "UniformGridGPU_UBB.h" -#include "cuda/ErrorChecking.h" - - -#define FUNC_PREFIX __global__ - -using namespace std; - -namespace walberla { -namespace lbm { - -#ifdef __GNUC__ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wunused-variable" -#pragma GCC diagnostic ignored "-Wconversion" -#endif - -#ifdef __CUDACC__ -#pragma push -#pragma diag_suppress = declared_but_not_referenced -#endif - - -namespace internal_boundary_UniformGridGPU_UBB { -static FUNC_PREFIX void boundary_UniformGridGPU_UBB(uint8_t * const _data_indexVector, double * _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t indexVectorSize) -{ - if (blockDim.x*blockIdx.x + threadIdx.x < indexVectorSize) - { - uint8_t * const _data_indexVector_10 = _data_indexVector; - const int32_t x = *((int32_t *)(& _data_indexVector_10[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); - uint8_t * const _data_indexVector_14 = _data_indexVector + 4; - const int32_t y = *((int32_t *)(& _data_indexVector_14[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); - uint8_t * const _data_indexVector_18 = _data_indexVector + 8; - const int32_t z = *((int32_t *)(& _data_indexVector_18[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); - - - const int64_t cx [] = { 0, 0, 0, -1, 1, 0, 0, -1, 1, -1, 1, 0, 0, -1, 1, 0, 0, -1, 1 }; - const int64_t cy [] = { 0, 1, -1, 0, 0, 0, 0, 1, 1, -1, -1, 1, -1, 0, 0, 1, -1, 0, 0 }; - const int64_t cz [] = { 0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, -1 }; - const int invdir [] = { 0, 2, 1, 4, 3, 6, 5, 10, 9, 8, 7, 16, 15, 18, 17, 12, 11, 14, 13 }; - - - const double weights [] = { 0.333333333333333,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778 }; - - uint8_t * const _data_indexVector_112 = _data_indexVector + 12; - const int32_t dir = *((int32_t *)(& _data_indexVector_112[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); - double * _data_pdfsf9cc34cc4e2b6261 = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_1*cy[dir] + _stride_pdfs_2*z + _stride_pdfs_2*cz[dir] + _stride_pdfs_3*invdir[dir]; - double * _data_pdfs_10_2011ac6bf6446d4afa = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3*dir; - _data_pdfsf9cc34cc4e2b6261[_stride_pdfs_0*x + _stride_pdfs_0*cx[dir]] = -0.30000000000000004*cx[dir]*weights[dir] + _data_pdfs_10_2011ac6bf6446d4afa[_stride_pdfs_0*x]; - } -} -} - -#ifdef __GNUC__ -#pragma GCC diagnostic pop -#endif - -#ifdef __CUDACC__ -#pragma pop -#endif - - -void UniformGridGPU_UBB::run( IBlock * block, IndexVectors::Type type , cudaStream_t stream ) -{ - auto * indexVectors = block->getData<IndexVectors>(indexVectorID); - - auto pointer = indexVectors->pointerGpu(type); - - - int64_t indexVectorSize = int64_c( indexVectors->indexVector(type).size() ); - if( indexVectorSize == 0) - return; - - uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); - - auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID); - - WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers())); - double * _data_pdfs = pdfs->dataAt(0, 0, 0, 0); - const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); - const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); - const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); - const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); - dim3 _block(int(((256 < indexVectorSize) ? 256 : indexVectorSize)), int(1), int(1)); - dim3 _grid(int(( (indexVectorSize) % (((256 < indexVectorSize) ? 256 : indexVectorSize)) == 0 ? (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) : ( (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) ) +1 )), int(1), int(1)); - internal_boundary_UniformGridGPU_UBB::boundary_UniformGridGPU_UBB<<<_grid, _block, 0, stream>>>(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); -} - -void UniformGridGPU_UBB::operator() ( IBlock * block, cudaStream_t stream ) -{ - run( block, IndexVectors::ALL, stream ); -} - -void UniformGridGPU_UBB::inner( IBlock * block, cudaStream_t stream ) -{ - run( block, IndexVectors::INNER, stream ); -} - -void UniformGridGPU_UBB::outer( IBlock * block, cudaStream_t stream ) -{ - run( block, IndexVectors::OUTER, stream ); -} - - -} // namespace lbm -} // namespace walberla - diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_UBB.h b/apps/benchmarks/UniformGridGPU/UniformGridGPU_UBB.h deleted file mode 100644 index 0b0017759462d2120752acf7d2db2b85ec85b7b9..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_UBB.h +++ /dev/null @@ -1,364 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \\file UniformGridGPU_UBB.h -//! \\author pystencils -//====================================================================================================================== - - -#include "core/DataTypes.h" - -#include "cuda/GPUField.h" -#include "domain_decomposition/BlockDataID.h" -#include "domain_decomposition/IBlock.h" -#include "blockforest/StructuredBlockForest.h" -#include "field/FlagField.h" - -#include <set> -#include <vector> - -#ifdef __GNUC__ -#define RESTRICT __restrict__ -#elif _MSC_VER -#define RESTRICT __restrict -#else -#define RESTRICT -#endif - -namespace walberla { -namespace lbm { - - -class UniformGridGPU_UBB -{ -public: - struct IndexInfo { - int32_t x; - int32_t y; - int32_t z; - int32_t dir; - IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {} - bool operator==(const IndexInfo & o) const { - return x == o.x && y == o.y && z == o.z && dir == o.dir; - } - }; - - - - class IndexVectors - { - public: - using CpuIndexVector = std::vector<IndexInfo>; - - enum Type { - ALL = 0, - INNER = 1, - OUTER = 2, - NUM_TYPES = 3 - }; - - IndexVectors() : cpuVectors_(NUM_TYPES) {} - bool operator==(IndexVectors & other) { return other.cpuVectors_ == cpuVectors_; } - - ~IndexVectors() { - for( auto & gpuVec: gpuVectors_) - cudaFree( gpuVec ); - } - - - CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } - IndexInfo * pointerCpu(Type t) { return &(cpuVectors_[t][0]); } - - IndexInfo * pointerGpu(Type t) { return gpuVectors_[t]; } - - - void syncGPU() - { - gpuVectors_.resize( cpuVectors_.size() ); - for(size_t i=0; i < size_t(NUM_TYPES); ++i ) - { - auto & gpuVec = gpuVectors_[i]; - auto & cpuVec = cpuVectors_[i]; - cudaFree( gpuVec ); - cudaMalloc( &gpuVec, sizeof(IndexInfo) * cpuVec.size() ); - cudaMemcpy( gpuVec, &cpuVec[0], sizeof(IndexInfo) * cpuVec.size(), cudaMemcpyHostToDevice ); - } - } - - private: - std::vector<CpuIndexVector> cpuVectors_; - - using GpuIndexVector = IndexInfo *; - std::vector<GpuIndexVector> gpuVectors_; - - }; - - - UniformGridGPU_UBB( const shared_ptr<StructuredBlockForest> & blocks, - BlockDataID pdfsID_ ) - : pdfsID(pdfsID_) - { - auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; - indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_UniformGridGPU_UBB"); - }; - - void operator() ( IBlock * block , cudaStream_t stream = 0 ); - void inner( IBlock * block , cudaStream_t stream = 0 ); - void outer( IBlock * block , cudaStream_t stream = 0 ); - - - template<typename FlagField_T> - void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, - FlagUID boundaryFlagUID, FlagUID domainFlagUID) - { - for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) - fillFromFlagField<FlagField_T>( &*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); - } - - - template<typename FlagField_T> - void fillFromFlagField( IBlock * block, ConstBlockDataID flagFieldID, - FlagUID boundaryFlagUID, FlagUID domainFlagUID ) - { - auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); - auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); - auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); - auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER); - - - auto * flagField = block->getData< FlagField_T > ( flagFieldID ); - - auto boundaryFlag = flagField->getFlag(boundaryFlagUID); - auto domainFlag = flagField->getFlag(domainFlagUID); - - auto inner = flagField->xyzSize(); - inner.expand( cell_idx_t(-1) ); - - - indexVectorAll.clear(); - indexVectorInner.clear(); - indexVectorOuter.clear(); - - for( auto it = flagField->begin(); it != flagField->end(); ++it ) - { - if( ! isFlagSet(it, domainFlag) ) - continue; - if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 0 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 1 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 2 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 3 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 5 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 6 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 7 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 9 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 11 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 12 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 13 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 15 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 16 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 17 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) - { - auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); - indexVectorAll.push_back( element ); - if( inner.contains( it.x(), it.y(), it.z() ) ) - indexVectorInner.push_back( element ); - else - indexVectorOuter.push_back( element ); - } - - } - - indexVectors->syncGPU(); - } - -private: - void run( IBlock * block, IndexVectors::Type type, cudaStream_t stream = 0 ); - - BlockDataID indexVectorID; - - BlockDataID pdfsID; -}; - - - -} // namespace lbm -} // namespace walberla \ No newline at end of file diff --git a/cmake/waLBerlaFunctions.cmake b/cmake/waLBerlaFunctions.cmake index 196d3f4777ad4b4efd6d688f5c99361c107ef2a6..d62c6824b202820eabda59bc3bd61d463ac6f31e 100644 --- a/cmake/waLBerlaFunctions.cmake +++ b/cmake/waLBerlaFunctions.cmake @@ -245,6 +245,23 @@ endfunction ( waLBerla_add_executable ) +####################################################################################################################### +# +# Function to tell CMake which C/C++/CUDA files are generated by a python file +# +# Example: +# waLBerla_python_file_generates(MyPythonCodeGenScript.py Sweep1.cpp Sweep1.h Sweep2.h Sweep2.cu) +# +# +####################################################################################################################### +function( waLBerla_python_file_generates pythonFile ) + get_filename_component(pythonFileAbsolutePath ${pythonFile} ABSOLUTE) + set( "WALBERLA_CODEGEN_INFO_${pythonFileAbsolutePath}" ${ARGN} + CACHE INTERNAL "Files generated by python script ${pythonFile}" FORCE) +endfunction(waLBerla_python_file_generates) + + + ####################################################################################################################### # # Adds a waLBerla module test executable. diff --git a/cmake/waLBerlaHelperFunctions.cmake b/cmake/waLBerlaHelperFunctions.cmake index 4d1dafe0e5ce02f35d55151550750fcf3dcdcef2..5268c2ab1b2c20071ea31658faf72cebef738a1d 100644 --- a/cmake/waLBerlaHelperFunctions.cmake +++ b/cmake/waLBerlaHelperFunctions.cmake @@ -40,11 +40,10 @@ function( handle_python_codegen sourceFilesOut generatedSourceFilesOut generator if( ${sourceFile} MATCHES ".*\\.py$" ) set(codeGenRequired YES) if( WALBERLA_BUILD_WITH_CODEGEN) - execute_process(COMMAND ${PYTHON_EXECUTABLE} ${sourceFile} -l - OUTPUT_VARIABLE generatedSourceFiles) - string(REGEX REPLACE "\n$" "" generatedSourceFiles "${generatedSourceFiles}") + get_filename_component(pythonFileAbsolutePath ${sourceFile} ABSOLUTE ) + set( generatedSourceFiles ${WALBERLA_CODEGEN_INFO_${pythonFileAbsolutePath}} ) - set(generatedWithAbsolutePath ) + set( generatedWithAbsolutePath ) foreach( filename ${generatedSourceFiles} ) list(APPEND generatedWithAbsolutePath ${CMAKE_CURRENT_BINARY_DIR}/${filename}) endforeach() @@ -52,9 +51,19 @@ function( handle_python_codegen sourceFilesOut generatedSourceFilesOut generator list(APPEND generatedResult ${generatedWithAbsolutePath} ) list(APPEND generatorsResult ${sourceFile} ) + string (REPLACE ";" "\", \"" jsonFileList "${generatedWithAbsolutePath}" ) + set(pythonParameters + "{\"EXPECTED_FILES\": [\"${jsonFileList}\"], \"CMAKE_VARS\" : { " + "\"WALBERLA_OPTIMIZE_FOR_LOCALHOST\": \"${WALBERLA_OPTIMIZE_FOR_LOCALHOST}\"," + "\"WALBERLA_DOUBLE_ACCURACY\": \"${WALBERLA_DOUBLE_ACCURACY}\"," + "\"WALBERLA_BUILD_WITH_MPI\": \"${WALBERLA_BUILD_WITH_MPI}\"," + "\"WALBERLA_BUILD_WITH_OPENMP\": \"${WALBERLA_BUILD_WITH_OPENMP}\" } }" + ) + string(REPLACE "\"" "\\\"" pythonParameters ${pythonParameters}) # even one more quoting level required + string(REPLACE "\n" "" pythonParameters ${pythonParameters}) # remove newline characters add_custom_command(OUTPUT ${generatedWithAbsolutePath} DEPENDS ${sourceFile} - COMMAND ${PYTHON_EXECUTABLE} ${sourceFile} -g + COMMAND ${PYTHON_EXECUTABLE} ${sourceFile} ${pythonParameters} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) include_directories(${CMAKE_CURRENT_BINARY_DIR}) endif() diff --git a/tests/cuda/CMakeLists.txt b/tests/cuda/CMakeLists.txt index dac2f4a1c3a34e54fbeaa6eedfd894d65be0acce..11213e0111459bdc2aaf9bdeb719a37e3cb255a8 100644 --- a/tests/cuda/CMakeLists.txt +++ b/tests/cuda/CMakeLists.txt @@ -19,6 +19,10 @@ waLBerla_execute_test( NAME SimpleKernelTest ) waLBerla_compile_test( FILES FieldIndexing3DTest.cpp FieldIndexing3DTest.cu ) waLBerla_execute_test( NAME FieldIndexing3DTest ) + +waLBerla_python_file_generates(codegen/CudaJacobiKernel.py + CudaJacobiKernel2D.cu CudaJacobiKernel2D.h + CudaJacobiKernel3D.cu CudaJacobiKernel3D.h) waLBerla_compile_test( FILES codegen/CodegenJacobiGPU.cpp codegen/CudaJacobiKernel.py DEPENDS blockforest timeloop gui ) @@ -34,8 +38,8 @@ waLBerla_compile_test( FILES CudaMPI DEPENDS blockforest timeloop gui ) waLBerla_compile_test( FILES AlignmentTest.cpp DEPENDS blockforest timeloop ) -waLBerla_compile_test( FILES codegen/MicroBenchmarkGpuLbm.cpp codegen/MicroBenchmarkGpuLbm.py) -waLBerla_add_executable ( NAME CpuGpuGeneratedEquivalenceTest - FILES codegen/EquivalenceTest.cpp codegen/EquivalenceTest.gen.py - DEPENDS blockforest boundary core cuda field stencil timeloop vtk gui ) +waLBerla_python_file_generates(codegen/MicroBenchmarkGpuLbm.py + MicroBenchmarkStreamKernel.cu MicroBenchmarkStreamKernel.h + MicroBenchmarkCopyKernel.cu MicroBenchmarkCopyKernel.h) +waLBerla_compile_test( FILES codegen/MicroBenchmarkGpuLbm.cpp codegen/MicroBenchmarkGpuLbm.py) diff --git a/tests/cuda/codegen/CudaJacobiKernel.py b/tests/cuda/codegen/CudaJacobiKernel.py index 14e46d2b6ef017b8728b0106e480c3e56301ac8e..7ec84032a5c7941ddfee67f1484a08dca2014193 100644 --- a/tests/cuda/codegen/CudaJacobiKernel.py +++ b/tests/cuda/codegen/CudaJacobiKernel.py @@ -1,20 +1,26 @@ -from pystencils_walberla.sweep import Sweep +import sympy as sp +import pystencils as ps +from pystencils_walberla import CodeGeneration, generate_sweep -def jacobi2D(sweep): - src = sweep.field("f1") - dst = sweep.temporary_field(src) +with CodeGeneration() as ctx: + h = sp.symbols("h") - dst[0, 0] @= (src[1, 0] + src[-1, 0] + src[0, 1] + src[0, -1]) / (4 * sweep.constant("h") ** 2) + # ----- Jacobi 2D - created by specifying weights in nested list -------------------------- + src, dst = ps.fields("src, src_tmp: [2D]") + stencil = [[0, -1, 0], + [-1, 4, -1], + [0, -1, 0]] + assignments = ps.assignment_from_stencil(stencil, src, dst, normalization_factor=4 * h**2) + generate_sweep(ctx, 'CudaJacobiKernel2D', assignments, field_swaps=[(src, dst)], target="gpu") + # ----- Jacobi 3D - created by using kernel_decorator with assignments in '@=' format ----- + src, dst = ps.fields("src, src_tmp: [3D]") -def jacobi3D(sweep): - src = sweep.field("f1") - dst = sweep.temporary_field(src) + @ps.kernel + def kernel_func(): + dst[0, 0, 0] @= (src[1, 0, 0] + src[-1, 0, 0] + + src[0, 1, 0] + src[0, -1, 0] + + src[0, 0, 1] + src[0, 0, -1]) / (6 * h ** 2) - dst[0, 0, 0] @= (src[1, 0, 0] + src[-1, 0, 0] + src[0, 1, 0] + src[0, -1, 0] + src[0, 0, 1] + src[0, 0, -1]) / \ - (6 * sweep.constant("h") ** 2) - - -Sweep.generate('CudaJacobiKernel2D', jacobi2D, dim=2, target='gpu') -Sweep.generate('CudaJacobiKernel3D', jacobi3D, dim=3, target='gpu') + generate_sweep(ctx, 'CudaJacobiKernel3D', kernel_func, field_swaps=[(src, dst)], target="gpu") diff --git a/tests/cuda/codegen/EquivalenceTest.cpp b/tests/cuda/codegen/EquivalenceTest.cpp deleted file mode 100644 index 75bbd273ef043a2b3efecc9c17256d11fec2ad29..0000000000000000000000000000000000000000 --- a/tests/cuda/codegen/EquivalenceTest.cpp +++ /dev/null @@ -1,192 +0,0 @@ -#include "core/Environment.h" -#include "python_coupling/CreateConfig.h" -#include "blockforest/Initialization.h" -#include "lbm/field/PdfField.h" -#include "lbm/field/AddToStorage.h" -#include "field/FlagField.h" -#include "field/AddToStorage.h" -#include "lbm/communication/PdfFieldPackInfo.h" -#include "lbm/PerformanceLogger.h" -#include "blockforest/communication/UniformBufferedScheme.h" -#include "timeloop/all.h" -#include "core/math/Random.h" -#include "geometry/all.h" -#include "cuda/HostFieldAllocator.h" -#include "cuda/communication/GPUPackInfo.h" -#include "core/timing/TimingPool.h" -#include "core/timing/RemainingTimeLogger.h" -#include "cuda/AddGPUFieldToStorage.h" -#include "cuda/communication/UniformGPUScheme.h" -#include "lbm/sweeps/CellwiseSweep.h" -#include "domain_decomposition/SharedSweep.h" - -#include "EquivalenceTest_LatticeModel.h" -#include "EquivalenceTest_GPUKernel.h" -#include "EquivalenceTest_GPUPackInfo.h" - -using namespace walberla; - -using NativeLatticeModel_T = lbm::D3Q19<lbm::collision_model::SRT, false>; -using GeneratedLatticeModel_T = lbm::EquivalenceTest_LatticeModel; - -using Stencil_T = GeneratedLatticeModel_T::Stencil; -using CommunicationStencil_T = GeneratedLatticeModel_T::CommunicationStencil; -using NativePdfField_T = lbm::PdfField<NativeLatticeModel_T>; -using GeneratedPdfField_T = lbm::PdfField<GeneratedLatticeModel_T>; - -using flag_t = walberla::uint8_t; -using FlagField_T = FlagField<flag_t>; - -using CpuCommScheme_T = blockforest::communication::UniformBufferedScheme<CommunicationStencil_T>; -using GpuCommScheme_T = cuda::communication::UniformGPUScheme<CommunicationStencil_T>; - - -template<typename PdfField_T> -void initPdfField( const shared_ptr<StructuredBlockForest> &blocks, BlockDataID pdfFieldId ) -{ - auto domainBB = blocks->getDomainCellBB(); - - for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) - { - auto pdfField = blockIt->getData<PdfField_T>( pdfFieldId ); - Cell offset( 0, 0, 0 ); - blocks->transformBlockLocalToGlobalCell( offset, *blockIt ); - - WALBERLA_FOR_ALL_CELLS_XYZ( pdfField, - auto globalX = real_c( offset[0] + x ); - auto globalZ = real_c( offset[2] + z ); - auto xArg = real_c(std::sin(real_c(globalX) / real_t(4) * real_c(domainBB.size(0)) )); - auto zArg = real_c(std::sin(real_c(globalZ) / real_t(4) * real_c(domainBB.size(2)) )); - pdfField->setToEquilibrium( x, y, z, Vector3<real_t>( real_t(0.05) * std::sin(xArg), 0, - real_t(0.05) * std::cos(zArg))); - ); - } -} - - -int main( int argc, char **argv ) -{ - mpi::Environment env( argc, argv ); - - for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg ) - { - auto config = *cfg; - auto parameters = config->getOneBlock( "Parameters" ); - - auto blocks = blockforest::createUniformBlockGridFromConfig( config ); - - const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 )); - const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 )); - - // Boundary - BlockDataID flagFieldId = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field" ); - const FlagUID fluidFlagUID( "Fluid" ); - geometry::setNonBoundaryCellsToDomain<FlagField_T>( *blocks, flagFieldId, fluidFlagUID ); - GeneratedLatticeModel_T generatedLatticeModel = GeneratedLatticeModel_T( omega ); - - - // Part 1 : Native walberla - NativeLatticeModel_T nativeLatticeModel = NativeLatticeModel_T( lbm::collision_model::SRT( omega )); - BlockDataID pdfFieldNativeId = lbm::addPdfFieldToStorage( blocks, "pdfNative", nativeLatticeModel, field::fzyx ); - initPdfField<NativePdfField_T >( blocks, pdfFieldNativeId ); - CpuCommScheme_T nativeComm( blocks ); - nativeComm.addPackInfo( make_shared< lbm::PdfFieldPackInfo< NativeLatticeModel_T > >( pdfFieldNativeId ) ); - auto nativeSweep = lbm::makeCellwiseSweep< NativeLatticeModel_T , FlagField_T >( pdfFieldNativeId, flagFieldId, fluidFlagUID ); - - SweepTimeloop nativeTimeLoop( blocks->getBlockStorage(), timesteps ); - nativeTimeLoop.add() << BeforeFunction( nativeComm, "communication" ) - << Sweep(makeSharedSweep(nativeSweep), "native stream collide" ); - nativeTimeLoop.run(); - - - // Part 2: Generated CPU Version - BlockDataID pdfFieldGeneratedId = lbm::addPdfFieldToStorage( blocks, "pdfGenerated", generatedLatticeModel, field::fzyx ); - initPdfField<GeneratedPdfField_T >( blocks, pdfFieldGeneratedId ); - CpuCommScheme_T cpuComm( blocks ); - cpuComm.addPackInfo( make_shared< lbm::PdfFieldPackInfo< GeneratedLatticeModel_T > >( pdfFieldGeneratedId ) ); - SweepTimeloop cpuTimeLoop( blocks->getBlockStorage(), timesteps ); - cpuTimeLoop.add() << BeforeFunction( cpuComm, "communication" ) - << Sweep(GeneratedLatticeModel_T::Sweep( pdfFieldGeneratedId ), "generated stream collide on cpu" ); - cpuTimeLoop.run(); - - - // Part 3: Generated GPU Version - bool overlapCommunication = parameters.getParameter<bool>( "overlapCommunication", true ); - bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false ); - bool oldCommunication = parameters.getParameter<bool>( "oldCommunication", false ); - - BlockDataID pdfShadowCPU = lbm::addPdfFieldToStorage( blocks, "cpu shadow field", generatedLatticeModel, field::fzyx ); - initPdfField<GeneratedPdfField_T >( blocks, pdfShadowCPU ); - - BlockDataID pdfGpuFieldId = cuda::addGPUFieldToStorage<GeneratedPdfField_T >( blocks, pdfShadowCPU, "pdfs on gpu", true ); - auto defaultKernelStream = overlapCommunication ? cuda::StreamRAII::newStream() : cuda::StreamRAII::defaultStream(); - auto innerKernelStartedEvent = make_shared<cuda::EventRAII>(); - - pystencils::EquivalenceTest_GPUKernel cudaLbKernel( pdfGpuFieldId, omega, defaultKernelStream ); - GpuCommScheme_T gpuComm( blocks, innerKernelStartedEvent, cudaEnabledMPI ); - gpuComm.addPackInfo( make_shared<pystencils::EquivalenceTest_GPUPackInfo>( pdfGpuFieldId )); - auto runCommunication = [&]() { gpuComm(); }; - - CpuCommScheme_T oldGpuScheme( blocks ); - - std::vector<cudaStream_t > streams; - for(uint_t i=0; i < Stencil_T::Size; ++i ) { - cudaStream_t s; - cudaStreamCreate(&s); - streams.push_back(s); - } - using OldPackInfo = cuda::communication::GPUPackInfo<cuda::GPUField<real_t> >; - oldGpuScheme.addPackInfo( make_shared<OldPackInfo>(pdfGpuFieldId, streams) ); - - - SweepTimeloop gpuTimeLoop( blocks->getBlockStorage(), timesteps ); - if( !overlapCommunication ) - { - gpuTimeLoop.add() << (oldCommunication ? BeforeFunction(oldGpuScheme) : - BeforeFunction( runCommunication, "gpu communication" )) - << Sweep( cudaLbKernel, "LB stream & collide gpu" ); - } - else - { - gpuTimeLoop.add() << Sweep( [&]( IBlock *b ) - { - cudaEventRecord( *innerKernelStartedEvent, defaultKernelStream ); - cudaLbKernel.inner( b ); - }, "LBM @ inner" ); - gpuTimeLoop.add() << BeforeFunction( runCommunication, "gpu communication" ) - << Sweep( [&]( IBlock *b ) { cudaLbKernel.outer( b ); }, "LBM @ outer" ); - } - gpuTimeLoop.run(); - cuda::fieldCpy<GeneratedPdfField_T, cuda::GPUField<real_t>> (blocks, pdfShadowCPU, pdfGpuFieldId); - - // Compare all three versions - auto errorCPU = real_t(0); - auto errorGPU = real_t(0); - - for( auto & block : *blocks ) - { - auto native = block.getData<NativePdfField_T>( pdfFieldNativeId ); - auto cpu = block.getData<GeneratedPdfField_T >( pdfFieldGeneratedId ); - auto gpu = block.getData<GeneratedPdfField_T>( pdfShadowCPU ); - - WALBERLA_FOR_ALL_CELLS_XYZ(native, - for(cell_idx_t f = 0; f < cell_idx_c(NativeLatticeModel_T::Stencil::Q); ++f ) - { - errorCPU += std::abs( native->get( x, y, z, f ) - cpu->get( x, y, z, f )); - errorGPU += std::abs( native->get( x, y, z, f ) - gpu->get( x, y, z, f )); - } - ) - } - mpi::reduceInplace(errorCPU, mpi::SUM); - mpi::reduceInplace(errorGPU, mpi::SUM); - auto domainBB = blocks->getDomainCellBB(); - errorCPU /= real_c(domainBB.numCells()); - errorGPU /= real_c(domainBB.numCells()); - WALBERLA_LOG_RESULT_ON_ROOT("CPU Error " << errorCPU ); - WALBERLA_LOG_RESULT_ON_ROOT("GPU Error " << errorGPU ); - WALBERLA_CHECK_FLOAT_EQUAL(errorCPU, real_c(0.0)); - WALBERLA_CHECK_FLOAT_EQUAL(errorGPU, real_c(0.0)); - } - - return 0; -} \ No newline at end of file diff --git a/tests/cuda/codegen/EquivalenceTest.gen.py b/tests/cuda/codegen/EquivalenceTest.gen.py deleted file mode 100644 index 43140ca53ee3396456d6c17d591b4e1cf0e2deb0..0000000000000000000000000000000000000000 --- a/tests/cuda/codegen/EquivalenceTest.gen.py +++ /dev/null @@ -1,42 +0,0 @@ -import sympy as sp -from lbmpy_walberla import generate_lattice_model_files -from lbmpy.creationfunctions import create_lb_update_rule -from pystencils_walberla.sweep import Sweep - -dtype = 'float64' - -# LB options -options = { - 'method': 'srt', - 'stencil': 'D3Q19', - 'relaxation_rate': sp.Symbol("omega"), - 'field_name': 'pdfs', - 'compressible': False, - 'maxwellian_moments': False, - 'temporary_field_name': 'pdfs_tmp', - 'optimization': {'cse_global': True, - 'cse_pdfs': True, - 'double_precision': dtype == 'float64'} -} - -# GPU optimization options -opt = {'gpu_indexing_params': {'block_size': (128, 1, 1)}, 'data_type': dtype} -outer_opt = {'gpu_indexing_params': {'block_size': (32, 32, 32)}, 'data_type': dtype} - - -def lb_assignments(): - ur = create_lb_update_rule(**options) - return ur.all_assignments - - -generate_lattice_model_files(class_name='EquivalenceTest_LatticeModel', **options) - -Sweep.generate_inner_outer_kernel('EquivalenceTest_GPUKernel', - lambda: create_lb_update_rule(**options).all_assignments, - target='gpu', - temporary_fields=['pdfs_tmp'], - field_swaps=[('pdfs', 'pdfs_tmp')], - optimization=opt, - outer_optimization=outer_opt) - -Sweep.generate_pack_info('EquivalenceTest_GPUPackInfo', lb_assignments, target='gpu') diff --git a/tests/cuda/codegen/MicroBenchmarkGpuLbm.py b/tests/cuda/codegen/MicroBenchmarkGpuLbm.py index b722f0510b95e9b8e01d6bddd5947ef97f2a74c9..298727b46c428384eeef7f755e8bfe4881d53d60 100644 --- a/tests/cuda/codegen/MicroBenchmarkGpuLbm.py +++ b/tests/cuda/codegen/MicroBenchmarkGpuLbm.py @@ -1,27 +1,22 @@ import pystencils as ps -from pystencils_walberla.sweep import Sweep from lbmpy.updatekernels import create_stream_pull_only_kernel from lbmpy.stencils import get_stencil +from pystencils_walberla import CodeGeneration, generate_sweep -dtype = 'float64' -f_size = 19 +with CodeGeneration() as ctx: + f_size = 19 + dtype = 'float64' if ctx.double_accuracy else 'float32' - -def copy_only(): + # Copy sweep src, dst = ps.fields("src({f_size}), dst({f_size}) : {dtype}[3D]".format(dtype=dtype, f_size=f_size), layout='fzyx') - return [ps.Assignment(dst(i), src(i)) for i in range(f_size)] - + copy_only = [ps.Assignment(dst(i), src(i)) for i in range(f_size)] + generate_sweep(ctx, 'MicroBenchmarkCopyKernel', copy_only, + target='gpu', gpu_indexing_params={'block_size': (128, 1, 1)}) -def stream_only(): + # Stream-only sweep stencil = get_stencil("D3Q19") - return create_stream_pull_only_kernel(stencil, src_field_name='src', - dst_field_name='dst', - generic_field_type=dtype, - generic_layout='fzyx') - - -opt = {'gpu_indexing_params': {'block_size': (128, 1, 1)}, 'data_type': dtype} - -Sweep.generate_from_equations('MicroBenchmarkCopyKernel', copy_only, target='gpu', optimization=opt) -Sweep.generate_from_equations('MicroBenchmarkStreamKernel', stream_only, target='gpu', optimization=opt) + stream_only = create_stream_pull_only_kernel(stencil, src_field_name='src', dst_field_name='dst', + generic_field_type=dtype, generic_layout='fzyx') + generate_sweep(ctx, 'MicroBenchmarkStreamKernel', stream_only, + target='gpu', gpu_indexing_params={'block_size': (128, 1, 1)}) diff --git a/tests/field/CMakeLists.txt b/tests/field/CMakeLists.txt index e22e9fe331e54008f42d328b799123c3ea5b3960..57114a6d1acda0386de2cdf7da480b707f244624 100644 --- a/tests/field/CMakeLists.txt +++ b/tests/field/CMakeLists.txt @@ -60,6 +60,9 @@ endif( WALBERLA_BUILD_WITH_MPI ) # CodeGen Tests +waLBerla_python_file_generates(codegen/JacobiKernel.py + JacobiKernel2D.cpp JacobiKernel2D.h + JacobiKernel3D.cpp JacobiKernel3D.h) waLBerla_compile_test( FILES codegen/CodegenJacobiCPU.cpp codegen/JacobiKernel.py DEPENDS gui timeloop ) waLBerla_execute_test( NAME CodegenJacobiCPU ) diff --git a/tests/field/codegen/JacobiKernel.py b/tests/field/codegen/JacobiKernel.py index bcdc4c72e5f3999fb66ccc98bc827fbeb9991eac..b375d5447c4009a0527580853c242c545a6a2e71 100644 --- a/tests/field/codegen/JacobiKernel.py +++ b/tests/field/codegen/JacobiKernel.py @@ -1,16 +1,26 @@ -from pystencils_walberla.sweep import Sweep +import sympy as sp +import pystencils as ps +from pystencils_walberla import CodeGeneration, generate_sweep -def jacobi2D(sweep): - src = sweep.field("f1") - dst = sweep.temporaryField(src) - dst[0, 0] @= (src[1, 0] + src[-1, 0] + src[0, 1] + src[0, -1]) / (4 * S.h ** 2) +with CodeGeneration() as ctx: + h = sp.symbols("h") -def jacobi3D(sweep): - src = sweep.field("f1") - dst = sweep.temporaryField(src) + # ----- Jacobi 2D - created by specifying weights in nested list -------------------------- + src, dst = ps.fields("src, src_tmp: [2D]") + stencil = [[0, -1, 0], + [-1, 4, -1], + [0, -1, 0]] + assignments = ps.assignment_from_stencil(stencil, src, dst, normalization_factor=4 * h**2) + generate_sweep(ctx, 'JacobiKernel2D', assignments, field_swaps=[(src, dst)]) - dst[0,0,0] @= (src[1,0,0] + src[-1,0,0] + src[0,1,0] + src[0, -1, 0] + src[0, 0, 1] + src[0, 0 , -1] ) / (6 * S.h**2) + # ----- Jacobi 3D - created by using kernel_decorator with assignments in '@=' format ----- + src, dst = ps.fields("src, src_tmp: [3D]") -Sweep.generate('JacobiKernel2D', jacobi2D, dim=2) -Sweep.generate('JacobiKernel3D', jacobi3D, dim=3) \ No newline at end of file + @ps.kernel + def kernel_func(): + dst[0, 0, 0] @= (src[1, 0, 0] + src[-1, 0, 0] + + src[0, 1, 0] + src[0, -1, 0] + + src[0, 0, 1] + src[0, 0, -1]) / (6 * h ** 2) + + generate_sweep(ctx, 'JacobiKernel3D', kernel_func, field_swaps=[(src, dst)]) diff --git a/tests/lbm/CMakeLists.txt b/tests/lbm/CMakeLists.txt index 6593cbabf44ae16398df21d79048d637bcaa47aa..795636733fa71b846d3e954429169e6f1e0c28a1 100644 --- a/tests/lbm/CMakeLists.txt +++ b/tests/lbm/CMakeLists.txt @@ -64,5 +64,9 @@ waLBerla_execute_test( NAME PdfFieldInitializerTest COMMAND $<TARGET_FILE:PdfFie # Code Generation -waLBerla_compile_test( FILES codegen/SrtWithForceFieldModel.gen.py +waLBerla_python_file_generates(codegen/SrtWithForceFieldModel.py + SrtWithForceFieldModel.cpp SrtWithForceFieldModel.h + MyNoSlip.cpp MyNoSlip.h + MyUBB.cpp MyUBB.h) +waLBerla_compile_test( FILES codegen/SrtWithForceFieldModel.py codegen/SrtWithForceField.cpp ) diff --git a/tests/lbm/codegen/SrtWithForceFieldModel.gen.py b/tests/lbm/codegen/SrtWithForceFieldModel.gen.py deleted file mode 100644 index 72e10eb354141be9d2e2ec55438a22d27d8bffe5..0000000000000000000000000000000000000000 --- a/tests/lbm/codegen/SrtWithForceFieldModel.gen.py +++ /dev/null @@ -1,36 +0,0 @@ -import sympy as sp -from lbmpy.boundaries import NoSlip, UBB -from lbmpy_walberla import generate_lattice_model_files, RefinementScaling -from lbmpy.creationfunctions import create_lb_method -from lbmpy_walberla.boundary import create_boundary_class -from pystencils_walberla.cmake_integration import codegen -import pystencils as ps - -# ------------- Lattice Model ------------------------------ -force_field = ps.fields("force(3): [3D]", layout='fzyx') - -omega = sp.Symbol("omega") - -scaling = RefinementScaling() -scaling.add_standard_relaxation_rate_scaling(omega) -scaling.add_force_scaling(force_field) - -generate_lattice_model_files(class_name='SrtWithForceFieldModel', - method='srt', stencil='D3Q19', force_model='guo', force=force_field.center_vector, - relaxation_rates=[omega], refinement_scaling=scaling) - - -def genBoundary(): - boundary = UBB([0.05, 0, 0], dim=3, name="MyUBB") - method = create_lb_method(stencil='D3Q19', method='srt') - return create_boundary_class(boundary, method) - - -def genNoSlip(): - boundary = NoSlip(name='MyNoSlip') - method = create_lb_method(stencil='D3Q19', method='srt') - return create_boundary_class(boundary, method) - - -codegen.register(['MyUBB.h', 'MyUBB.cpp'], genBoundary) -codegen.register(['MyNoSlip.h', 'MyNoSlip.cpp'], genNoSlip) diff --git a/tests/lbm/codegen/SrtWithForceFieldModel.py b/tests/lbm/codegen/SrtWithForceFieldModel.py new file mode 100644 index 0000000000000000000000000000000000000000..f68ec173ffeb81a4c36c70016b18fa703a4aa664 --- /dev/null +++ b/tests/lbm/codegen/SrtWithForceFieldModel.py @@ -0,0 +1,24 @@ +import sympy as sp +import pystencils as ps +from lbmpy.creationfunctions import create_lb_method +from lbmpy.boundaries import NoSlip, UBB +from pystencils_walberla import CodeGeneration +from lbmpy_walberla import generate_lattice_model, RefinementScaling, generate_boundary + + +with CodeGeneration() as ctx: + omega = sp.Symbol("omega") + force_field = ps.fields("force(3): [3D]", layout='fzyx') + + # lattice Boltzmann method + lb_method = create_lb_method(stencil='D3Q19', method='srt', relaxation_rates=[omega], + force_model='guo', force=force_field.center_vector) + + scaling = RefinementScaling() + scaling.add_standard_relaxation_rate_scaling(omega) + scaling.add_force_scaling(force_field) + + # generate components + generate_lattice_model(ctx, 'SrtWithForceFieldModel', lb_method, refinement_scaling=scaling) + generate_boundary(ctx, 'MyUBB', UBB([0.05, 0, 0]), lb_method) + generate_boundary(ctx, 'MyNoSlip', NoSlip(), lb_method)