diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt index 0698ea06c8016d770db8ef29b690d5bde07e8e54..74d5e5c714ff810f60a454f7062c230635e928c2 100644 --- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt +++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt @@ -2,5 +2,7 @@ waLBerla_link_files_to_builddir( "*.prm" ) waLBerla_add_executable ( NAME UniformGridBenchmarkGPU - FILES UniformGridGPU.cpp UniformGridGPU.gen.py + FILES UniformGridGPU.cpp UniformGridGPU_LatticeModel.cpp + UniformGridGPU_LbKernel.cu UniformGridGPU_NoSlip.cu UniformGridGPU_UBB.cu + UniformGridGPU_PackInfo.cu DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk ) diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm index 53a3c051288f666379b4055e9e57e1b90827730b..2b340e0ef9434cc9fdc60c8bae2e643bb96670fc 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm @@ -2,7 +2,7 @@ Parameters { omega 1.8; - timesteps 10; + timesteps 1000; remainingTimeLoggerFrequency 3; vtkWriteFrequency 0; diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_LatticeModel.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LatticeModel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9f7a55cd5b3715a8c3c96f65fbd5b648caf38fee --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LatticeModel.cpp @@ -0,0 +1,552 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! 
\\author Martin Bauer <martin.bauer@fau.de> +//====================================================================================================================== + +#include <cmath> + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "lbm/field/PdfField.h" +#include "lbm/sweeps/Streaming.h" +#include "UniformGridGPU_LatticeModel.h" + +#ifdef _MSC_VER +# pragma warning( disable : 4458 ) +#endif + +#define FUNC_PREFIX + +using namespace std; + +namespace walberla { +namespace lbm { + +namespace internal_kernel_streamCollide { +static FUNC_PREFIX void kernel_streamCollide(double * const _data_pdfs, double * _data_pdfs_tmp, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, double omega) +{ + const double xi_1 = omega*0.166666666666667; + const double xi_5 = omega*0.0416666666666667; + for (int ctr_2 = 1; ctr_2 < _size_pdfs_2 - 1; ctr_2 += 1) + { + double * const _data_pdfs_21_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 17*_stride_pdfs_3; + double * const _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * const _data_pdfs_21_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 16*_stride_pdfs_3; + double * const _data_pdfs_2m1_314 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 14*_stride_pdfs_3; + double * const _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * const _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * const _data_pdfs_21_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 18*_stride_pdfs_3; + double * const _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * const _data_pdfs_2m1_311 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 11*_stride_pdfs_3; + double * const _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * const _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * const _data_pdfs_21_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 15*_stride_pdfs_3; + double * const _data_pdfs_2m1_313 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 13*_stride_pdfs_3; + double * const _data_pdfs_2m1_312 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 12*_stride_pdfs_3; + double * const _data_pdfs_2m1_35 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 5*_stride_pdfs_3; + double * const _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; + double * const _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * const _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * const _data_pdfs_21_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 6*_stride_pdfs_3; + double * _data_pdfs_tmp_20_30 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2; + double * _data_pdfs_tmp_20_31 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * _data_pdfs_tmp_20_32 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * _data_pdfs_tmp_20_33 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * _data_pdfs_tmp_20_34 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * _data_pdfs_tmp_20_35 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3; + double * _data_pdfs_tmp_20_36 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3; + double * 
_data_pdfs_tmp_20_37 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * _data_pdfs_tmp_20_38 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * _data_pdfs_tmp_20_39 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * _data_pdfs_tmp_20_310 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * _data_pdfs_tmp_20_311 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + double * _data_pdfs_tmp_20_312 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + double * _data_pdfs_tmp_20_313 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + double * _data_pdfs_tmp_20_314 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + double * _data_pdfs_tmp_20_315 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + double * _data_pdfs_tmp_20_316 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + double * _data_pdfs_tmp_20_317 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + double * _data_pdfs_tmp_20_318 = _data_pdfs_tmp + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + for (int ctr_1 = 1; ctr_1 < _size_pdfs_1 - 1; ctr_1 += 1) + { + double * const _data_pdfs_21_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_317; + double * const _data_pdfs_20_39_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_39; + double * const _data_pdfs_21_316_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_316; + double * const _data_pdfs_2m1_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_314; + double * const _data_pdfs_20_310_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_310; + double * const _data_pdfs_20_38_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_38; + double * const _data_pdfs_21_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_318; + double * const _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double * const _data_pdfs_2m1_311_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_311; + double * const _data_pdfs_20_37_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_37; + double * const _data_pdfs_20_31_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_31; + double * const _data_pdfs_21_315_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_315; + double * const _data_pdfs_2m1_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_313; + double * const _data_pdfs_2m1_312_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_312; + double * const _data_pdfs_2m1_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_35; + double * const _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * const _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + double * const _data_pdfs_20_32_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_32; + double * const _data_pdfs_21_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_36; + double * _data_pdfs_tmp_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_30; + double * _data_pdfs_tmp_20_31_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_31; + double * _data_pdfs_tmp_20_32_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_32; + double * _data_pdfs_tmp_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_33; + double * _data_pdfs_tmp_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_34; + double * _data_pdfs_tmp_20_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_35; + double * _data_pdfs_tmp_20_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_36; + double * _data_pdfs_tmp_20_37_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_37; 
+ double * _data_pdfs_tmp_20_38_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_38; + double * _data_pdfs_tmp_20_39_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_39; + double * _data_pdfs_tmp_20_310_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_310; + double * _data_pdfs_tmp_20_311_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_311; + double * _data_pdfs_tmp_20_312_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_312; + double * _data_pdfs_tmp_20_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_313; + double * _data_pdfs_tmp_20_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_314; + double * _data_pdfs_tmp_20_315_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_315; + double * _data_pdfs_tmp_20_316_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_316; + double * _data_pdfs_tmp_20_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_317; + double * _data_pdfs_tmp_20_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_tmp_20_318; + for (int ctr_0 = 1; ctr_0 < _size_pdfs_0 - 1; ctr_0 += 1) + { + const double xi_18 = -_data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double xi_19 = -_data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double xi_20 = -_data_pdfs_21_316_11[_stride_pdfs_0*ctr_0]; + const double vel0Term = _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double vel1Term = _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0] + _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]; + const double vel2Term = _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; + const double rho = vel0Term + vel1Term + vel2Term + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0] + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0] + _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; + const double xi_27 = rho*-0.333333333333333; + const double u_0 = vel0Term + xi_18 + xi_19 - _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double xi_23 = (u_0*u_0); + const double u_1 = vel1Term + xi_19 + xi_20 - _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0] + _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0]; + const double xi_21 = -u_1; + const double xi_24 = (u_1*u_1); + const double u_2 = vel2Term + xi_18 + xi_20 - _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0] - _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double xi_22 = -u_2; + const double xi_25 = (u_2*u_2); + const double u0Mu1 = u_0 + xi_21; + const double u0Pu1 = u_0 + u_1; + const double u1Pu2 = u_1 + u_2; + const double u1Mu2 = u_1 + xi_22; + const double u0Mu2 = u_0 + 
xi_22; + const double u0Pu2 = u_0 + u_2; + const double f_eq_common = rho - xi_23 - xi_24 - xi_25; + const double xi_26 = f_eq_common + rho*-0.666666666666667; + const double xi_28 = f_eq_common + xi_25 + xi_27; + const double xi_29 = f_eq_common + xi_23 + xi_27; + const double xi_30 = f_eq_common + xi_24 + xi_27; + const double xi_2 = xi_24*2 + xi_26; + const double xi_3 = xi_23*2 + xi_26; + const double xi_4 = xi_25*2 + xi_26; + const double xi_6 = u0Mu1*2; + const double xi_7 = (u0Mu1*u0Mu1)*3 + xi_28; + const double xi_8 = u0Pu1*2; + const double xi_9 = (u0Pu1*u0Pu1)*3 + xi_28; + const double xi_10 = u1Pu2*2; + const double xi_11 = (u1Pu2*u1Pu2)*3 + xi_29; + const double xi_12 = u1Mu2*2; + const double xi_13 = (u1Mu2*u1Mu2)*3 + xi_29; + const double xi_14 = u0Mu2*2; + const double xi_15 = (u0Mu2*u0Mu2)*3 + xi_30; + const double xi_16 = u0Pu2*2; + const double xi_17 = (u0Pu2*u0Pu2)*3 + xi_30; + _data_pdfs_tmp_20_30_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.333333333333333 - _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_31_10[_stride_pdfs_0*ctr_0] = xi_1*(u_1 + xi_2 - 6*_data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_32_10[_stride_pdfs_0*ctr_0] = xi_1*(xi_2 + xi_21 - 6*_data_pdfs_20_32_11[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_33_10[_stride_pdfs_0*ctr_0] = xi_1*(-u_0 + xi_3 - 6*_data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_34_10[_stride_pdfs_0*ctr_0] = xi_1*(u_0 + xi_3 - 6*_data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_35_10[_stride_pdfs_0*ctr_0] = xi_1*(u_2 + xi_4 - 6*_data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_36_10[_stride_pdfs_0*ctr_0] = xi_1*(xi_22 + xi_4 - 6*_data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_37_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_6 + xi_7 - 24*_data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_38_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_8 + xi_9 - 24*_data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_39_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_8 + xi_9 - 24*_data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_310_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_6 + xi_7 - 24*_data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_311_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_10 + xi_11 - 24*_data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]) + _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_312_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_12 + xi_13 - 24*_data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0]) + _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_313_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_14 + xi_15 - 24*_data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_314_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_16 + xi_17 - 
24*_data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_315_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_12 + xi_13 - 24*_data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0]) + _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_316_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_10 + xi_11 - 24*_data_pdfs_21_316_11[_stride_pdfs_0*ctr_0]) + _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_317_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_16 + xi_17 - 24*_data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_318_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_14 + xi_15 - 24*_data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + } + } + } +} +} +namespace internal_kernel_collide { +static FUNC_PREFIX void kernel_collide(double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, double omega) +{ + const double xi_1 = omega*0.166666666666667; + const double xi_5 = omega*0.0416666666666667; + for (int ctr_2 = 1; ctr_2 < _size_pdfs_2 - 1; ctr_2 += 1) + { + double * _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + double * _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * _data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + double * _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + double * _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + double * _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + double * _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + double * _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + double * _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + double * _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3; + double * _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; + double * _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3; + for (int ctr_1 = 1; ctr_1 < _size_pdfs_1 - 1; ctr_1 += 1) + { + double * _data_pdfs_20_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_317; + double * _data_pdfs_20_39_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_39; + double * _data_pdfs_20_316_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_316; + double * _data_pdfs_20_310_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_310; + double * _data_pdfs_20_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_314; + double * _data_pdfs_20_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_318; + double * _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double 
* _data_pdfs_20_38_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_38; + double * _data_pdfs_20_31_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_31; + double * _data_pdfs_20_311_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_311; + double * _data_pdfs_20_315_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_315; + double * _data_pdfs_20_37_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_37; + double * _data_pdfs_20_312_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_312; + double * _data_pdfs_20_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_313; + double * _data_pdfs_20_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_35; + double * _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * _data_pdfs_20_32_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_32; + double * _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + double * _data_pdfs_20_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_36; + for (int ctr_0 = 1; ctr_0 < _size_pdfs_0 - 1; ctr_0 += 1) + { + const double xi_18 = -_data_pdfs_20_317_10[_stride_pdfs_0*ctr_0]; + const double xi_19 = -_data_pdfs_20_39_10[_stride_pdfs_0*ctr_0]; + const double xi_20 = -_data_pdfs_20_316_10[_stride_pdfs_0*ctr_0]; + const double vel0Term = _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_318_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0]; + const double vel1Term = _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0]; + const double vel2Term = _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0]; + const double rho = vel0Term + vel1Term + vel2Term + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0]; + const double xi_27 = rho*-0.333333333333333; + const double u_0 = vel0Term + xi_18 + xi_19 - _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0] - _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0] - _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0]; + const double xi_23 = (u_0*u_0); + const double u_1 = vel1Term + xi_19 + xi_20 - _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0] - _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0] - _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0]; + const double xi_21 = -u_1; + const double xi_24 = (u_1*u_1); + const double u_2 = vel2Term + xi_18 + xi_20 + _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0] - _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0] - _data_pdfs_20_318_10[_stride_pdfs_0*ctr_0] - _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0]; + const double xi_22 = -u_2; + const double xi_25 = (u_2*u_2); + const double u0Mu1 = u_0 + xi_21; + const double u0Pu1 = u_0 + u_1; + const double u1Pu2 = u_1 + u_2; + const double u1Mu2 = u_1 + xi_22; + const double u0Mu2 = u_0 + xi_22; + const double u0Pu2 = u_0 + u_2; + const double f_eq_common = rho - xi_23 - xi_24 - xi_25; + const double xi_26 = f_eq_common + rho*-0.666666666666667; + const double xi_28 = f_eq_common + xi_25 + xi_27; + const double xi_29 = f_eq_common + xi_23 + xi_27; + const double xi_30 = f_eq_common + xi_24 + xi_27; + const double xi_2 = xi_24*2 + xi_26; + const double xi_3 
= xi_23*2 + xi_26; + const double xi_4 = xi_25*2 + xi_26; + const double xi_6 = u0Mu1*2; + const double xi_7 = (u0Mu1*u0Mu1)*3 + xi_28; + const double xi_8 = u0Pu1*2; + const double xi_9 = (u0Pu1*u0Pu1)*3 + xi_28; + const double xi_10 = u1Pu2*2; + const double xi_11 = (u1Pu2*u1Pu2)*3 + xi_29; + const double xi_12 = u1Mu2*2; + const double xi_13 = (u1Mu2*u1Mu2)*3 + xi_29; + const double xi_14 = u0Mu2*2; + const double xi_15 = (u0Mu2*u0Mu2)*3 + xi_30; + const double xi_16 = u0Pu2*2; + const double xi_17 = (u0Pu2*u0Pu2)*3 + xi_30; + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.333333333333333 - _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0] = xi_1*(u_1 + xi_2 - 6*_data_pdfs_20_31_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0] = xi_1*(xi_2 + xi_21 - 6*_data_pdfs_20_32_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0] = xi_1*(-u_0 + xi_3 - 6*_data_pdfs_20_33_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0] = xi_1*(u_0 + xi_3 - 6*_data_pdfs_20_34_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0] = xi_1*(u_2 + xi_4 - 6*_data_pdfs_20_35_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0] = xi_1*(xi_22 + xi_4 - 6*_data_pdfs_20_36_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_6 + xi_7 - 24*_data_pdfs_20_37_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_8 + xi_9 - 24*_data_pdfs_20_38_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_8 + xi_9 - 24*_data_pdfs_20_39_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_6 + xi_7 - 24*_data_pdfs_20_310_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_10 + xi_11 - 24*_data_pdfs_20_311_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_12 + xi_13 - 24*_data_pdfs_20_312_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_14 + xi_15 - 24*_data_pdfs_20_313_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_16 + xi_17 - 24*_data_pdfs_20_314_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_12 + xi_13 - 24*_data_pdfs_20_315_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_10 + xi_11 - 24*_data_pdfs_20_316_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] = xi_5*(-xi_16 + xi_17 - 24*_data_pdfs_20_317_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_20_318_10[_stride_pdfs_0*ctr_0] = xi_5*(xi_14 + xi_15 - 24*_data_pdfs_20_318_10[_stride_pdfs_0*ctr_0]) + 
_data_pdfs_20_318_10[_stride_pdfs_0*ctr_0]; + } + } + } +} +} +namespace internal_kernel_stream { +static FUNC_PREFIX void kernel_stream(double * const _data_pdfs, double * _data_pdfs_tmp, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_pdfs_tmp_0, int64_t const _stride_pdfs_tmp_1, int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3) +{ + for (int ctr_2 = 1; ctr_2 < _size_pdfs_2 - 1; ctr_2 += 1) + { + double * _data_pdfs_tmp_20_30 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2; + double * const _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; + double * _data_pdfs_tmp_20_31 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + _stride_pdfs_tmp_3; + double * const _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * _data_pdfs_tmp_20_32 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 2*_stride_pdfs_tmp_3; + double * const _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * _data_pdfs_tmp_20_33 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 3*_stride_pdfs_tmp_3; + double * const _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * _data_pdfs_tmp_20_34 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 4*_stride_pdfs_tmp_3; + double * const _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * _data_pdfs_tmp_20_35 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 5*_stride_pdfs_tmp_3; + double * const _data_pdfs_2m1_35 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 5*_stride_pdfs_3; + double * _data_pdfs_tmp_20_36 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 6*_stride_pdfs_tmp_3; + double * const _data_pdfs_21_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 6*_stride_pdfs_3; + double * _data_pdfs_tmp_20_37 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 7*_stride_pdfs_tmp_3; + double * const _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * _data_pdfs_tmp_20_38 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 8*_stride_pdfs_tmp_3; + double * const _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * _data_pdfs_tmp_20_39 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 9*_stride_pdfs_tmp_3; + double * const _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * _data_pdfs_tmp_20_310 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 10*_stride_pdfs_tmp_3; + double * const _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * _data_pdfs_tmp_20_311 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 11*_stride_pdfs_tmp_3; + double * const _data_pdfs_2m1_311 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 11*_stride_pdfs_3; + double * _data_pdfs_tmp_20_312 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 12*_stride_pdfs_tmp_3; + double * const _data_pdfs_2m1_312 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 12*_stride_pdfs_3; + double * _data_pdfs_tmp_20_313 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 13*_stride_pdfs_tmp_3; + double * const _data_pdfs_2m1_313 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 13*_stride_pdfs_3; + double * _data_pdfs_tmp_20_314 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 14*_stride_pdfs_tmp_3; + double * const _data_pdfs_2m1_314 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 14*_stride_pdfs_3; + double * 
_data_pdfs_tmp_20_315 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 15*_stride_pdfs_tmp_3; + double * const _data_pdfs_21_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 15*_stride_pdfs_3; + double * _data_pdfs_tmp_20_316 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 16*_stride_pdfs_tmp_3; + double * const _data_pdfs_21_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 16*_stride_pdfs_3; + double * _data_pdfs_tmp_20_317 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 17*_stride_pdfs_tmp_3; + double * const _data_pdfs_21_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 17*_stride_pdfs_3; + double * _data_pdfs_tmp_20_318 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 18*_stride_pdfs_tmp_3; + double * const _data_pdfs_21_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 18*_stride_pdfs_3; + for (int ctr_1 = 1; ctr_1 < _size_pdfs_1 - 1; ctr_1 += 1) + { + double * _data_pdfs_tmp_20_30_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_30; + double * const _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * _data_pdfs_tmp_20_31_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_31; + double * const _data_pdfs_20_31_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_31; + double * _data_pdfs_tmp_20_32_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_32; + double * const _data_pdfs_20_32_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_32; + double * _data_pdfs_tmp_20_33_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_33; + double * const _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + double * _data_pdfs_tmp_20_34_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_34; + double * const _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double * _data_pdfs_tmp_20_35_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_35; + double * const _data_pdfs_2m1_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_35; + double * _data_pdfs_tmp_20_36_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_36; + double * const _data_pdfs_21_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_36; + double * _data_pdfs_tmp_20_37_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_37; + double * const _data_pdfs_20_37_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_37; + double * _data_pdfs_tmp_20_38_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_38; + double * const _data_pdfs_20_38_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_38; + double * _data_pdfs_tmp_20_39_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_39; + double * const _data_pdfs_20_39_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_39; + double * _data_pdfs_tmp_20_310_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_310; + double * const _data_pdfs_20_310_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_310; + double * _data_pdfs_tmp_20_311_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_311; + double * const _data_pdfs_2m1_311_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_311; + double * _data_pdfs_tmp_20_312_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_312; + double * const _data_pdfs_2m1_312_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_312; + double * _data_pdfs_tmp_20_313_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_313; + double * const _data_pdfs_2m1_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_313; + double * _data_pdfs_tmp_20_314_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_314; + double * const _data_pdfs_2m1_314_10 = _stride_pdfs_1*ctr_1 + 
_data_pdfs_2m1_314; + double * _data_pdfs_tmp_20_315_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_315; + double * const _data_pdfs_21_315_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_315; + double * _data_pdfs_tmp_20_316_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_316; + double * const _data_pdfs_21_316_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_316; + double * _data_pdfs_tmp_20_317_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_317; + double * const _data_pdfs_21_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_317; + double * _data_pdfs_tmp_20_318_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_318; + double * const _data_pdfs_21_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_318; + for (int ctr_0 = 1; ctr_0 < _size_pdfs_0 - 1; ctr_0 += 1) + { + _data_pdfs_tmp_20_30_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_31_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_32_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_33_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_34_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_35_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_36_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_37_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_38_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_39_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_310_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_311_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_312_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_313_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_314_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_315_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_316_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_317_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_318_10[_stride_pdfs_tmp_0*ctr_0] = _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + } + } + } +} +} + + +const real_t UniformGridGPU_LatticeModel::w[19] = { 0.333333333333333,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778 }; +const real_t UniformGridGPU_LatticeModel::wInv[19] = { 
3.00000000000000,18.0000000000000,18.0000000000000,18.0000000000000,18.0000000000000,18.0000000000000,18.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000 }; + +void UniformGridGPU_LatticeModel::Sweep::streamCollide( IBlock * block, const uint_t numberOfGhostLayersToInclude ) +{ + auto pdfs = block->getData< GhostLayerField<double, 19> >(pdfsID); + GhostLayerField<double, 19> * pdfs_tmp; + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + + + auto & lm = dynamic_cast< lbm::PdfField<UniformGridGPU_LatticeModel> * > (pdfs)->latticeModel(); + lm.configureBlock(block); + + auto & omega = lm.omega; + WALBERLA_ASSERT_GREATER_EQUAL(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -cell_idx_c(numberOfGhostLayersToInclude) - 1, -cell_idx_c(numberOfGhostLayersToInclude) - 1, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); + double * _data_pdfs_tmp = pdfs_tmp->dataAt(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -cell_idx_c(numberOfGhostLayersToInclude) - 1, -cell_idx_c(numberOfGhostLayersToInclude) - 1, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(pdfs->xSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); + const int64_t _size_pdfs_0 = int64_t(pdfs->xSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(pdfs->ySize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); + const int64_t _size_pdfs_1 = int64_t(pdfs->ySize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(pdfs->zSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); + const int64_t _size_pdfs_2 = int64_t(pdfs->zSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + internal_kernel_streamCollide::kernel_streamCollide(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, omega); + pdfs->swapDataPointers(pdfs_tmp); + +} + +void UniformGridGPU_LatticeModel::Sweep::collide( IBlock * block, const uint_t numberOfGhostLayersToInclude ) +{ + auto pdfs = block->getData< GhostLayerField<double, 19> >(pdfsID); + + + auto & lm = dynamic_cast< lbm::PdfField<UniformGridGPU_LatticeModel> * > (pdfs)->latticeModel(); + lm.configureBlock(block); + + auto & omega = lm.omega; + WALBERLA_ASSERT_GREATER_EQUAL(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -cell_idx_c(numberOfGhostLayersToInclude) - 1, -cell_idx_c(numberOfGhostLayersToInclude) - 1, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(pdfs->xSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); + const int64_t _size_pdfs_0 = 
int64_t(pdfs->xSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(pdfs->ySize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); + const int64_t _size_pdfs_1 = int64_t(pdfs->ySize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(pdfs->zSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); + const int64_t _size_pdfs_2 = int64_t(pdfs->zSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + internal_kernel_collide::kernel_collide(_data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, omega); +} + + +void UniformGridGPU_LatticeModel::Sweep::stream( IBlock * block, const uint_t numberOfGhostLayersToInclude ) +{ + auto pdfs = block->getData< GhostLayerField<double, 19> >(pdfsID); + GhostLayerField<double, 19> * pdfs_tmp; + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + + + WALBERLA_ASSERT_GREATER_EQUAL(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -cell_idx_c(numberOfGhostLayersToInclude) - 1, -cell_idx_c(numberOfGhostLayersToInclude) - 1, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); + double * _data_pdfs_tmp = pdfs_tmp->dataAt(-cell_idx_c(numberOfGhostLayersToInclude) - 1, -cell_idx_c(numberOfGhostLayersToInclude) - 1, -cell_idx_c(numberOfGhostLayersToInclude) - 1, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(pdfs->xSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); + const int64_t _size_pdfs_0 = int64_t(pdfs->xSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(pdfs->ySize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); + const int64_t _size_pdfs_1 = int64_t(pdfs->ySize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(pdfs->zSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2)); + const int64_t _size_pdfs_2 = int64_t(pdfs->zSize() + 2*cell_idx_c(numberOfGhostLayersToInclude) + 2); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride()); + const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride()); + const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride()); + const int64_t _stride_pdfs_tmp_3 = int64_t(pdfs_tmp->fStride()); + internal_kernel_stream::kernel_stream(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3); + + pdfs->swapDataPointers(pdfs_tmp); + +} + + +} // 
namespace lbm +} // namespace walberla + + + + +// Buffer Packing + +namespace walberla { +namespace mpi { + +mpi::SendBuffer & operator<< (mpi::SendBuffer & buf, const ::walberla::lbm::UniformGridGPU_LatticeModel & lm) +{ + buf << lm.currentLevel; + return buf; +} + +mpi::RecvBuffer & operator>> (mpi::RecvBuffer & buf, ::walberla::lbm::UniformGridGPU_LatticeModel & lm) +{ + buf >> lm.currentLevel; + return buf; +} + + +} // namespace mpi +} // namespace walberla diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_LatticeModel.h b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LatticeModel.h new file mode 100644 index 0000000000000000000000000000000000000000..f48737a156a49ce4e2b60f7241a4aaafd581ff1c --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LatticeModel.h @@ -0,0 +1,737 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + + +#include "core/DataTypes.h" +#include "core/logging/Logging.h" + +#include "field/GhostLayerField.h" +#include "field/SwapableCompare.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "stencil/D3Q19.h" + +#include "lbm/lattice_model/EquilibriumDistribution.h" +#include "lbm/field/Density.h" +#include "lbm/field/DensityAndMomentumDensity.h" +#include "lbm/field/DensityAndVelocity.h" +#include "lbm/field/PressureTensor.h" +#include "lbm/field/ShearRate.h" + +#include <set> + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +#ifdef WALBERLA_CXX_COMPILER_IS_GNU +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-parameter" +#endif + + + + + +// Forward declarations +namespace walberla{ +namespace lbm { + class UniformGridGPU_LatticeModel; +}} +namespace walberla { +namespace mpi { + mpi::SendBuffer & operator<< (mpi::SendBuffer & buf, const ::walberla::lbm::UniformGridGPU_LatticeModel & lm); + mpi::RecvBuffer & operator>> (mpi::RecvBuffer & buf, ::walberla::lbm::UniformGridGPU_LatticeModel & lm); +}} + + + + +namespace walberla { +namespace lbm { + + +/** +UniformGridGPU_LatticeModel was generated with lbmpy. Do not edit this file directly. Instead modify UniformGridGPU_LatticeModel.py. +For details see the documentation of lbmpy. + +Usage: + - Create an instance of this lattice model class: the constructor parameters vary depending on the configured + lattice model. A model with constant force needs a single force vector, while a model with variable forces needs + a force field.
All constructor parameters are ordered alphabetically. + - Create a PDFField with the lattice model as template argument to store the particle distribution functions. + Use the PDFField to get and modify macroscopic values. + - The internal class UniformGridGPU_LatticeModel::Sweep is a functor to execute one LB time step. + The stream and collide steps can be executed separately, or together in an optimized stream-pull-collide scheme. + +*/ +class UniformGridGPU_LatticeModel +{ + +public: + typedef stencil::D3Q19 Stencil; + typedef stencil::D3Q19 CommunicationStencil; + static const real_t w[19]; + static const real_t wInv[19]; + + static const bool compressible = false; + static const int equilibriumAccuracyOrder = 2; + + class Sweep + { + public: + Sweep( BlockDataID _pdfsID ) : pdfsID(_pdfsID) {}; + + //void stream ( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) ); + void collide ( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) ); + void streamCollide( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) ); + void stream ( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) ); + + void operator() ( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) ) + { + streamCollide( block, numberOfGhostLayersToInclude ); + } + + private: + BlockDataID pdfsID; + + std::set< GhostLayerField<double, 19> *, field::SwapableCompare< GhostLayerField<double, 19> * > > cache_pdfs_; + }; + + UniformGridGPU_LatticeModel( double omega_ ) + : omega(omega_), currentLevel(0) + {}; + + void configure( IBlock & block, StructuredBlockStorage &) { configureBlock( &block ); } + +private: + void configureBlock(IBlock * block) + { + + + + } + + // Parameters: + double omega; + + // Updated by configureBlock: + + + uint_t currentLevel; + + // Backend classes can access private members: + friend class UniformGridGPU_LatticeModel::Sweep; + template<class LM, class Enable> friend class EquilibriumDistribution; + template<class LM, class Enable> friend struct Equilibrium; + template<class LM, class Enable> friend struct internal::AdaptVelocityToForce; + template<class LM, class Enable> friend struct Density; + template<class LM> friend struct DensityAndVelocity; + template<class LM, class Enable> friend struct DensityAndMomentumDensity; + template<class LM, class Enable> friend struct MomentumDensity; + template<class LM, class It, class Enable> friend struct DensityAndVelocityRange; + + friend mpi::SendBuffer & ::walberla::mpi::operator<< (mpi::SendBuffer & , const UniformGridGPU_LatticeModel & ); + friend mpi::RecvBuffer & ::walberla::mpi::operator>> (mpi::RecvBuffer & , UniformGridGPU_LatticeModel & ); + +}; + + + + +//====================================================================================================================== +// +// Implementation of macroscopic value backend +// +//====================================================================================================================== + + + +template<> +class EquilibriumDistribution< UniformGridGPU_LatticeModel, void> +{ +public: + typedef typename UniformGridGPU_LatticeModel::Stencil Stencil; + + static real_t get( const stencil::Direction direction, + const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ), + real_t rho = real_t(1.0) ) + { + + rho -= real_t(1.0); + + + using namespace stencil; + switch( direction ) { + case C: return rho*0.333333333333333 - 0.333333333333333*(u[0]*u[0]) - 
0.333333333333333*(u[1]*u[1]) - 0.333333333333333*(u[2]*u[2]); + case N: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*u[1]; + case S: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*u[1] - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]); + case W: return rho*0.0555555555555556 - 0.166666666666667*u[0] - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]); + case E: return rho*0.0555555555555556 - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]) + 0.166666666666667*u[0]; + case T: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*u[2]; + case B: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*u[2] + 0.166666666666667*(u[2]*u[2]); + case NW: return rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1]; + case NE: return rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.25*u[0]*u[1]; + case SW: return rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.0833333333333333*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.25*u[0]*u[1]; + case SE: return rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[1]*u[1]); + case TN: return rho*0.0277777777777778 + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2] + 0.25*u[1]*u[2]; + case TS: return rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2]; + case TW: return rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2]; + case TE: return rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2] + 0.25*u[0]*u[2]; + case BN: return rho*0.0277777777777778 - 0.0833333333333333*u[2] - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.0833333333333333*(u[2]*u[2]); + case BS: return rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.0833333333333333*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[1]*u[2]; + case BW: return rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.0833333333333333*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[0]*u[2]; + case BE: return rho*0.0277777777777778 - 0.0833333333333333*u[2] - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[2]*u[2]); + default: + WALBERLA_ABORT("Invalid Direction"); + } + + } + + static real_t getSymmetricPart( const stencil::Direction direction, + const Vector3<real_t> & u = Vector3< real_t >(real_t(0.0)), + real_t rho = real_t(1.0) ) + { + + rho -= real_t(1.0); + + + using namespace stencil; + switch( direction ) { + case C: return rho*0.333333333333333 - 0.333333333333333*(u[0]*u[0]) - 
0.333333333333333*(u[1]*u[1]) - 0.333333333333333*(u[2]*u[2]); + case N: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]); + case S: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]); + case W: return rho*0.0555555555555556 - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]); + case E: return rho*0.0555555555555556 - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]); + case T: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*(u[2]*u[2]); + case B: return rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*(u[2]*u[2]); + case NW: return rho*0.0277777777777778 - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]); + case NE: return rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.25*u[0]*u[1]; + case SW: return rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.25*u[0]*u[1]; + case SE: return rho*0.0277777777777778 - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]); + case TN: return rho*0.0277777777777778 + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[1]*u[2]; + case TS: return rho*0.0277777777777778 - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]); + case TW: return rho*0.0277777777777778 - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]); + case TE: return rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[0]*u[2]; + case BN: return rho*0.0277777777777778 - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]); + case BS: return rho*0.0277777777777778 + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[1]*u[2]; + case BW: return rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[0]*u[2]; + case BE: return rho*0.0277777777777778 - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]); + default: + WALBERLA_ABORT("Invalid Direction"); + } + + } + + static real_t getAsymmetricPart( const stencil::Direction direction, + const Vector3< real_t > & u = Vector3<real_t>( real_t(0.0) ), + real_t rho = real_t(1.0) ) + { + + rho -= real_t(1.0); + + + using namespace stencil; + switch( direction ) { + case C: return 0; + case N: return 0.166666666666667*u[1]; + case S: return -0.166666666666667*u[1]; + case W: return -0.166666666666667*u[0]; + case E: return 0.166666666666667*u[0]; + case T: return 0.166666666666667*u[2]; + case B: return -0.166666666666667*u[2]; + case NW: return -0.0833333333333333*u[0] + 0.0833333333333333*u[1]; + case NE: return 0.0833333333333333*u[0] + 0.0833333333333333*u[1]; + case SW: return -0.0833333333333333*u[0] - 0.0833333333333333*u[1]; + case SE: return -0.0833333333333333*u[1] + 0.0833333333333333*u[0]; + case TN: return 0.0833333333333333*u[1] + 0.0833333333333333*u[2]; + case TS: return -0.0833333333333333*u[1] + 0.0833333333333333*u[2]; + case TW: return -0.0833333333333333*u[0] + 0.0833333333333333*u[2]; + case TE: return 0.0833333333333333*u[0] + 0.0833333333333333*u[2]; + case BN: return 
-0.0833333333333333*u[2] + 0.0833333333333333*u[1]; + case BS: return -0.0833333333333333*u[1] - 0.0833333333333333*u[2]; + case BW: return -0.0833333333333333*u[0] - 0.0833333333333333*u[2]; + case BE: return -0.0833333333333333*u[2] + 0.0833333333333333*u[0]; + default: + WALBERLA_ABORT("Invalid Direction"); + } + + } + + static std::vector< real_t > get( const Vector3< real_t > & u = Vector3<real_t>( real_t(0.0) ), + real_t rho = real_t(1.0) ) + { + + rho -= real_t(1.0); + + + std::vector< real_t > equilibrium( Stencil::Size ); + for( auto d = Stencil::begin(); d != Stencil::end(); ++d ) + { + equilibrium[d.toIdx()] = get(*d, u, rho); + } + return equilibrium; + } +}; + + +namespace internal { + +template<> +struct AdaptVelocityToForce<UniformGridGPU_LatticeModel, void> +{ + template< typename FieldPtrOrIterator > + static Vector3<real_t> get( FieldPtrOrIterator & it, const UniformGridGPU_LatticeModel & lm, + const Vector3< real_t > & velocity, const real_t rho ) + { + auto x = it.x(); + auto y = it.y(); + auto z = it.z(); + + return velocity; + + } + + static Vector3<real_t> get( const cell_idx_t x, const cell_idx_t y, const cell_idx_t z, const UniformGridGPU_LatticeModel & lm, + const Vector3< real_t > & velocity, const real_t rho ) + { + + return velocity; + + } +}; +} // namespace internal + + + +template<> +struct Equilibrium< UniformGridGPU_LatticeModel, void > +{ + + template< typename FieldPtrOrIterator > + static void set( FieldPtrOrIterator & it, + const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ), real_t rho = real_t(1.0) ) + { + + rho -= real_t(1.0); + + + it[0] = rho*0.333333333333333 - 0.333333333333333*(u[0]*u[0]) - 0.333333333333333*(u[1]*u[1]) - 0.333333333333333*(u[2]*u[2]); + it[1] = rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*u[1]; + it[2] = rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*u[1] - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]); + it[3] = rho*0.0555555555555556 - 0.166666666666667*u[0] - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]); + it[4] = rho*0.0555555555555556 - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]) + 0.166666666666667*u[0]; + it[5] = rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*u[2]; + it[6] = rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*u[2] + 0.166666666666667*(u[2]*u[2]); + it[7] = rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1]; + it[8] = rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.25*u[0]*u[1]; + it[9] = rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.0833333333333333*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.25*u[0]*u[1]; + it[10] = rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[1]*u[1]); + it[11] = rho*0.0277777777777778 + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2] + 0.25*u[1]*u[2]; + it[12] = rho*0.0277777777777778 - 
0.0833333333333333*u[1] - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2]; + it[13] = rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2]; + it[14] = rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2] + 0.25*u[0]*u[2]; + it[15] = rho*0.0277777777777778 - 0.0833333333333333*u[2] - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.0833333333333333*(u[2]*u[2]); + it[16] = rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.0833333333333333*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[1]*u[2]; + it[17] = rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.0833333333333333*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[0]*u[2]; + it[18] = rho*0.0277777777777778 - 0.0833333333333333*u[2] - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[2]*u[2]); + } + + template< typename PdfField_T > + static void set( PdfField_T & pdf, const cell_idx_t x, const cell_idx_t y, const cell_idx_t z, + const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ), real_t rho = real_t(1.0) ) + { + + rho -= real_t(1.0); + + + real_t & xyz0 = pdf(x,y,z,0); + pdf.getF( &xyz0, 0)= rho*0.333333333333333 - 0.333333333333333*(u[0]*u[0]) - 0.333333333333333*(u[1]*u[1]) - 0.333333333333333*(u[2]*u[2]); + pdf.getF( &xyz0, 1)= rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*u[1]; + pdf.getF( &xyz0, 2)= rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*u[1] - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[1]*u[1]); + pdf.getF( &xyz0, 3)= rho*0.0555555555555556 - 0.166666666666667*u[0] - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]); + pdf.getF( &xyz0, 4)= rho*0.0555555555555556 - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*(u[0]*u[0]) + 0.166666666666667*u[0]; + pdf.getF( &xyz0, 5)= rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) + 0.166666666666667*(u[2]*u[2]) + 0.166666666666667*u[2]; + pdf.getF( &xyz0, 6)= rho*0.0555555555555556 - 0.166666666666667*(u[0]*u[0]) - 0.166666666666667*(u[1]*u[1]) - 0.166666666666667*u[2] + 0.166666666666667*(u[2]*u[2]); + pdf.getF( &xyz0, 7)= rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1]; + pdf.getF( &xyz0, 8)= rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.25*u[0]*u[1]; + pdf.getF( &xyz0, 9)= rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.0833333333333333*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[1]*u[1]) + 0.25*u[0]*u[1]; + pdf.getF( &xyz0, 10)= rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.25*u[0]*u[1] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[1]*u[1]); + pdf.getF( &xyz0, 11)= rho*0.0277777777777778 + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2] + 0.25*u[1]*u[2]; + pdf.getF( &xyz0, 12)= 
rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2]; + pdf.getF( &xyz0, 13)= rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2]; + pdf.getF( &xyz0, 14)= rho*0.0277777777777778 + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[2]*u[2]) + 0.0833333333333333*u[2] + 0.25*u[0]*u[2]; + pdf.getF( &xyz0, 15)= rho*0.0277777777777778 - 0.0833333333333333*u[2] - 0.25*u[1]*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*u[1] + 0.0833333333333333*(u[2]*u[2]); + pdf.getF( &xyz0, 16)= rho*0.0277777777777778 - 0.0833333333333333*u[1] - 0.0833333333333333*u[2] + 0.0833333333333333*(u[1]*u[1]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[1]*u[2]; + pdf.getF( &xyz0, 17)= rho*0.0277777777777778 - 0.0833333333333333*u[0] - 0.0833333333333333*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*(u[2]*u[2]) + 0.25*u[0]*u[2]; + pdf.getF( &xyz0, 18)= rho*0.0277777777777778 - 0.0833333333333333*u[2] - 0.25*u[0]*u[2] + 0.0833333333333333*(u[0]*u[0]) + 0.0833333333333333*u[0] + 0.0833333333333333*(u[2]*u[2]); + } +}; + + +template<> +struct Density<UniformGridGPU_LatticeModel, void> +{ + template< typename FieldPtrOrIterator > + static inline real_t get( const UniformGridGPU_LatticeModel & , const FieldPtrOrIterator & it ) + { + const real_t f_0 = it[0]; + const real_t f_1 = it[1]; + const real_t f_2 = it[2]; + const real_t f_3 = it[3]; + const real_t f_4 = it[4]; + const real_t f_5 = it[5]; + const real_t f_6 = it[6]; + const real_t f_7 = it[7]; + const real_t f_8 = it[8]; + const real_t f_9 = it[9]; + const real_t f_10 = it[10]; + const real_t f_11 = it[11]; + const real_t f_12 = it[12]; + const real_t f_13 = it[13]; + const real_t f_14 = it[14]; + const real_t f_15 = it[15]; + const real_t f_16 = it[16]; + const real_t f_17 = it[17]; + const real_t f_18 = it[18]; + const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; + const double vel1Term = f_1 + f_11 + f_15 + f_7; + const double vel2Term = f_12 + f_13 + f_5; + const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term + 1; + return rho; + } + + template< typename PdfField_T > + static inline real_t get( const UniformGridGPU_LatticeModel & , + const PdfField_T & pdf, const cell_idx_t x, const cell_idx_t y, const cell_idx_t z ) + { + const real_t & xyz0 = pdf(x,y,z,0); + const real_t f_0 = pdf.getF( &xyz0, 0); + const real_t f_1 = pdf.getF( &xyz0, 1); + const real_t f_2 = pdf.getF( &xyz0, 2); + const real_t f_3 = pdf.getF( &xyz0, 3); + const real_t f_4 = pdf.getF( &xyz0, 4); + const real_t f_5 = pdf.getF( &xyz0, 5); + const real_t f_6 = pdf.getF( &xyz0, 6); + const real_t f_7 = pdf.getF( &xyz0, 7); + const real_t f_8 = pdf.getF( &xyz0, 8); + const real_t f_9 = pdf.getF( &xyz0, 9); + const real_t f_10 = pdf.getF( &xyz0, 10); + const real_t f_11 = pdf.getF( &xyz0, 11); + const real_t f_12 = pdf.getF( &xyz0, 12); + const real_t f_13 = pdf.getF( &xyz0, 13); + const real_t f_14 = pdf.getF( &xyz0, 14); + const real_t f_15 = pdf.getF( &xyz0, 15); + const real_t f_16 = pdf.getF( &xyz0, 16); + const real_t f_17 = pdf.getF( &xyz0, 17); + const real_t f_18 = pdf.getF( &xyz0, 18); + const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; + const double vel1Term = f_1 + f_11 + f_15 + f_7; + const double vel2Term = f_12 + f_13 + f_5; + const double rho = f_0 + f_16 + f_17 + f_2 + f_3 
+ f_6 + f_9 + vel0Term + vel1Term + vel2Term + 1; + return rho; + } +}; + + +template<> +struct DensityAndVelocity<UniformGridGPU_LatticeModel> +{ + template< typename FieldPtrOrIterator > + static void set( FieldPtrOrIterator & it, const UniformGridGPU_LatticeModel & lm, + const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ), const real_t rho_in = real_t(1.0) ) + { + auto x = it.x(); + auto y = it.y(); + auto z = it.z(); + + const double rho = rho_in - 1; + const double u_0 = u[0]; + const double u_1 = u[1]; + const double u_2 = u[2]; + + + Equilibrium<UniformGridGPU_LatticeModel>::set(it, Vector3<real_t>(u_0, u_1, u_2), rho + real_t(1) ); + } + + template< typename PdfField_T > + static void set( PdfField_T & pdf, const cell_idx_t x, const cell_idx_t y, const cell_idx_t z, const UniformGridGPU_LatticeModel & lm, + const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ), const real_t rho_in = real_t(1.0) ) + { + const double rho = rho_in - 1; + const double u_0 = u[0]; + const double u_1 = u[1]; + const double u_2 = u[2]; + + + Equilibrium<UniformGridGPU_LatticeModel>::set(pdf, x, y, z, Vector3<real_t>(u_0, u_1, u_2), rho + real_t(1) ); + } +}; + + +template<typename FieldIteratorXYZ > +struct DensityAndVelocityRange<UniformGridGPU_LatticeModel, FieldIteratorXYZ> +{ + + static void set( FieldIteratorXYZ & begin, const FieldIteratorXYZ & end, const UniformGridGPU_LatticeModel & lm, + const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ), const real_t rho_in = real_t(1.0) ) + { + for( auto cellIt = begin; cellIt != end; ++cellIt ) + { + const auto x = cellIt.x(); + const auto y = cellIt.y(); + const auto z = cellIt.z(); + const double rho = rho_in - 1; + const double u_0 = u[0]; + const double u_1 = u[1]; + const double u_2 = u[2]; + + + Equilibrium<UniformGridGPU_LatticeModel>::set(cellIt, Vector3<real_t>(u_0, u_1, u_2), rho + real_t(1) ); + } + } +}; + + + +template<> +struct DensityAndMomentumDensity<UniformGridGPU_LatticeModel> +{ + template< typename FieldPtrOrIterator > + static real_t get( Vector3< real_t > & momentumDensity, const UniformGridGPU_LatticeModel & lm, + const FieldPtrOrIterator & it ) + { + const auto x = it.x(); + const auto y = it.y(); + const auto z = it.z(); + + const real_t f_0 = it[0]; + const real_t f_1 = it[1]; + const real_t f_2 = it[2]; + const real_t f_3 = it[3]; + const real_t f_4 = it[4]; + const real_t f_5 = it[5]; + const real_t f_6 = it[6]; + const real_t f_7 = it[7]; + const real_t f_8 = it[8]; + const real_t f_9 = it[9]; + const real_t f_10 = it[10]; + const real_t f_11 = it[11]; + const real_t f_12 = it[12]; + const real_t f_13 = it[13]; + const real_t f_14 = it[14]; + const real_t f_15 = it[15]; + const real_t f_16 = it[16]; + const real_t f_17 = it[17]; + const real_t f_18 = it[18]; + const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; + const double vel1Term = f_1 + f_11 + f_15 + f_7; + const double vel2Term = f_12 + f_13 + f_5; + const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term + 1; + const double md_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term; + const double md_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term; + const double md_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term; + momentumDensity[0] = md_0; + momentumDensity[1] = md_1; + momentumDensity[2] = md_2; + + return rho; + } + + template< typename PdfField_T > + static real_t get( Vector3< real_t > & momentumDensity, const UniformGridGPU_LatticeModel & lm, const PdfField_T & pdf, + const cell_idx_t 
x, const cell_idx_t y, const cell_idx_t z ) + { + const real_t & xyz0 = pdf(x,y,z,0); + const real_t f_0 = pdf.getF( &xyz0, 0); + const real_t f_1 = pdf.getF( &xyz0, 1); + const real_t f_2 = pdf.getF( &xyz0, 2); + const real_t f_3 = pdf.getF( &xyz0, 3); + const real_t f_4 = pdf.getF( &xyz0, 4); + const real_t f_5 = pdf.getF( &xyz0, 5); + const real_t f_6 = pdf.getF( &xyz0, 6); + const real_t f_7 = pdf.getF( &xyz0, 7); + const real_t f_8 = pdf.getF( &xyz0, 8); + const real_t f_9 = pdf.getF( &xyz0, 9); + const real_t f_10 = pdf.getF( &xyz0, 10); + const real_t f_11 = pdf.getF( &xyz0, 11); + const real_t f_12 = pdf.getF( &xyz0, 12); + const real_t f_13 = pdf.getF( &xyz0, 13); + const real_t f_14 = pdf.getF( &xyz0, 14); + const real_t f_15 = pdf.getF( &xyz0, 15); + const real_t f_16 = pdf.getF( &xyz0, 16); + const real_t f_17 = pdf.getF( &xyz0, 17); + const real_t f_18 = pdf.getF( &xyz0, 18); + const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; + const double vel1Term = f_1 + f_11 + f_15 + f_7; + const double vel2Term = f_12 + f_13 + f_5; + const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term + 1; + const double md_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term; + const double md_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term; + const double md_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term; + momentumDensity[0] = md_0; + momentumDensity[1] = md_1; + momentumDensity[2] = md_2; + + return rho; + } +}; + + +template<> +struct MomentumDensity< UniformGridGPU_LatticeModel> +{ + template< typename FieldPtrOrIterator > + static void get( Vector3< real_t > & momentumDensity, const UniformGridGPU_LatticeModel & lm, const FieldPtrOrIterator & it ) + { + const auto x = it.x(); + const auto y = it.y(); + const auto z = it.z(); + + const real_t f_0 = it[0]; + const real_t f_1 = it[1]; + const real_t f_2 = it[2]; + const real_t f_3 = it[3]; + const real_t f_4 = it[4]; + const real_t f_5 = it[5]; + const real_t f_6 = it[6]; + const real_t f_7 = it[7]; + const real_t f_8 = it[8]; + const real_t f_9 = it[9]; + const real_t f_10 = it[10]; + const real_t f_11 = it[11]; + const real_t f_12 = it[12]; + const real_t f_13 = it[13]; + const real_t f_14 = it[14]; + const real_t f_15 = it[15]; + const real_t f_16 = it[16]; + const real_t f_17 = it[17]; + const real_t f_18 = it[18]; + const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; + const double vel1Term = f_1 + f_11 + f_15 + f_7; + const double vel2Term = f_12 + f_13 + f_5; + const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term + 1; + const double md_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term; + const double md_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term; + const double md_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term; + momentumDensity[0] = md_0; + momentumDensity[1] = md_1; + momentumDensity[2] = md_2; + + } + + template< typename PdfField_T > + static void get( Vector3< real_t > & momentumDensity, const UniformGridGPU_LatticeModel & lm, const PdfField_T & pdf, + const cell_idx_t x, const cell_idx_t y, const cell_idx_t z ) + { + const real_t & xyz0 = pdf(x,y,z,0); + const real_t f_0 = pdf.getF( &xyz0, 0); + const real_t f_1 = pdf.getF( &xyz0, 1); + const real_t f_2 = pdf.getF( &xyz0, 2); + const real_t f_3 = pdf.getF( &xyz0, 3); + const real_t f_4 = pdf.getF( &xyz0, 4); + const real_t f_5 = pdf.getF( &xyz0, 5); + const real_t f_6 = pdf.getF( &xyz0, 6); + const real_t f_7 = pdf.getF( &xyz0, 7); + const real_t f_8 = pdf.getF( 
&xyz0, 8); + const real_t f_9 = pdf.getF( &xyz0, 9); + const real_t f_10 = pdf.getF( &xyz0, 10); + const real_t f_11 = pdf.getF( &xyz0, 11); + const real_t f_12 = pdf.getF( &xyz0, 12); + const real_t f_13 = pdf.getF( &xyz0, 13); + const real_t f_14 = pdf.getF( &xyz0, 14); + const real_t f_15 = pdf.getF( &xyz0, 15); + const real_t f_16 = pdf.getF( &xyz0, 16); + const real_t f_17 = pdf.getF( &xyz0, 17); + const real_t f_18 = pdf.getF( &xyz0, 18); + const double vel0Term = f_10 + f_14 + f_18 + f_4 + f_8; + const double vel1Term = f_1 + f_11 + f_15 + f_7; + const double vel2Term = f_12 + f_13 + f_5; + const double rho = f_0 + f_16 + f_17 + f_2 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term + 1; + const double md_0 = -f_13 - f_17 - f_3 - f_7 - f_9 + vel0Term; + const double md_1 = -f_10 - f_12 - f_16 - f_2 + f_8 - f_9 + vel1Term; + const double md_2 = f_11 + f_14 - f_15 - f_16 - f_17 - f_18 - f_6 + vel2Term; + momentumDensity[0] = md_0; + momentumDensity[1] = md_1; + momentumDensity[2] = md_2; + + } +}; + + +template<> +struct PressureTensor<UniformGridGPU_LatticeModel> +{ + template< typename FieldPtrOrIterator > + static void get( Matrix3< real_t > & /* pressureTensor */, const UniformGridGPU_LatticeModel & /* latticeModel */, const FieldPtrOrIterator & /* it */ ) + { + WALBERLA_ABORT("Not implemented"); + } + + template< typename PdfField_T > + static void get( Matrix3< real_t > & /* pressureTensor */, const UniformGridGPU_LatticeModel & /* latticeModel */, const PdfField_T & /* pdf */, + const cell_idx_t /* x */, const cell_idx_t /* y */, const cell_idx_t /* z */ ) + { + WALBERLA_ABORT("Not implemented"); + } +}; + + +template<> +struct ShearRate<UniformGridGPU_LatticeModel> +{ + template< typename FieldPtrOrIterator > + static inline real_t get( const UniformGridGPU_LatticeModel & /* latticeModel */, const FieldPtrOrIterator & /* it */, + const Vector3< real_t > & /* velocity */, const real_t /* rho */) + { + WALBERLA_ABORT("Not implemented"); + return real_t(0.0); + } + + template< typename PdfField_T > + static inline real_t get( const UniformGridGPU_LatticeModel & latticeModel, + const PdfField_T & /* pdf */, const cell_idx_t /* x */, const cell_idx_t /* y */, const cell_idx_t /* z */, + const Vector3< real_t > & /* velocity */, const real_t /* rho */ ) + { + WALBERLA_ABORT("Not implemented"); + return real_t(0.0); + } + + static inline real_t get( const std::vector< real_t > & /* nonEquilibrium */, const real_t /* relaxationParam */, + const real_t /* rho */ = real_t(1) ) + { + WALBERLA_ABORT("Not implemented"); + return real_t(0.0); + } +}; + + +} // namespace lbm +} // namespace walberla + + + +#ifdef WALBERLA_CXX_COMPILER_IS_GNU +#pragma GCC diagnostic pop +#endif \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_LbKernel.cu b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LbKernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..6bc2cb24ef5c3c215334a6db9bb0381e2642fcc3 --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LbKernel.cu @@ -0,0 +1,324 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
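+// Note on the PDF storage convention shared by these generated files: the PDFs
+// are stored relative to the lattice weights, i.e. f_i holds the deviation
+// from w_i. The macroscopic getters in UniformGridGPU_LatticeModel.cpp above
+// therefore reconstruct the physical density as rho = 1 + sum_i f_i, while the
+// stream-collide kernel below works with the deviation density directly (its
+// local "rho" carries no "+ 1").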
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file .cpp +//! \\ingroup lbm +//! \\author lbmpy +//====================================================================================================================== + +#include <cmath> + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "UniformGridGPU_LbKernel.h" + + +#define FUNC_PREFIX __global__ + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wfloat-equal" +# pragma GCC diagnostic ignored "-Wshadow" +# pragma GCC diagnostic ignored "-Wconversion" +#endif + +using namespace std; + +namespace walberla { +namespace pystencils { + +namespace internal_UniformGridGPU_LbKernel { +static FUNC_PREFIX void UniformGridGPU_LbKernel(double * const _data_pdfs, double * _data_pdfs_tmp, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, double omega) +{ + if (blockDim.x*blockIdx.x + threadIdx.x + 1 < _size_pdfs_0 - 1 && blockDim.y*blockIdx.y + threadIdx.y + 1 < _size_pdfs_1 - 1 && blockDim.z*blockIdx.z + threadIdx.z + 1 < _size_pdfs_2 - 1) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x + 1; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y + 1; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z + 1; + double * const _data_pdfs_10_21_317 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 17*_stride_pdfs_3; + const double xi_18 = -_data_pdfs_10_21_317[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + double * const _data_pdfs_11_20_39 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + const double xi_19 = -_data_pdfs_11_20_39[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + double * const _data_pdfs_11_21_316 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 16*_stride_pdfs_3; + const double xi_20 = -_data_pdfs_11_21_316[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_2m1_314 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 14*_stride_pdfs_3; + double * const _data_pdfs_11_20_310 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * const _data_pdfs_1m1_20_38 = _data_pdfs + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * const _data_pdfs_10_21_318 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 18*_stride_pdfs_3; + double * const _data_pdfs_10_20_34 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + const double vel0Term = _data_pdfs_10_20_34[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_10_21_318[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_10_2m1_314[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_11_20_310[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_1m1_20_38[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + double * const _data_pdfs_1m1_2m1_311 = _data_pdfs + 
_stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 11*_stride_pdfs_3; + double * const _data_pdfs_1m1_20_37 = _data_pdfs + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * const _data_pdfs_1m1_20_31 = _data_pdfs + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * const _data_pdfs_1m1_21_315 = _data_pdfs + _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 15*_stride_pdfs_3; + const double vel1Term = _data_pdfs_1m1_20_31[_stride_pdfs_0*ctr_0] + _data_pdfs_1m1_20_37[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_1m1_21_315[_stride_pdfs_0*ctr_0] + _data_pdfs_1m1_2m1_311[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_2m1_313 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 13*_stride_pdfs_3; + double * const _data_pdfs_11_2m1_312 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 12*_stride_pdfs_3; + double * const _data_pdfs_10_2m1_35 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 5*_stride_pdfs_3; + const double vel2Term = _data_pdfs_10_2m1_313[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_10_2m1_35[_stride_pdfs_0*ctr_0] + _data_pdfs_11_2m1_312[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_30 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2; + double * const _data_pdfs_10_20_33 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * const _data_pdfs_11_20_32 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * const _data_pdfs_10_21_36 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 6*_stride_pdfs_3; + const double rho = vel0Term + vel1Term + vel2Term + _data_pdfs_10_20_30[_stride_pdfs_0*ctr_0] + _data_pdfs_10_20_33[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_10_21_317[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_10_21_36[_stride_pdfs_0*ctr_0] + _data_pdfs_11_20_32[_stride_pdfs_0*ctr_0] + _data_pdfs_11_20_39[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_11_21_316[_stride_pdfs_0*ctr_0]; + const double xi_27 = rho*-0.333333333333333; + const double u_0 = vel0Term + xi_18 + xi_19 - _data_pdfs_10_20_33[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - _data_pdfs_10_2m1_313[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - _data_pdfs_1m1_20_37[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double xi_23 = (u_0*u_0); + const double u_1 = vel1Term + xi_19 + xi_20 - _data_pdfs_11_20_310[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - _data_pdfs_11_20_32[_stride_pdfs_0*ctr_0] - _data_pdfs_11_2m1_312[_stride_pdfs_0*ctr_0] + _data_pdfs_1m1_20_38[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double xi_21 = -u_1; + const double xi_24 = (u_1*u_1); + const double u_2 = vel2Term + xi_18 + xi_20 - _data_pdfs_10_21_318[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - _data_pdfs_10_21_36[_stride_pdfs_0*ctr_0] + _data_pdfs_10_2m1_314[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - _data_pdfs_1m1_21_315[_stride_pdfs_0*ctr_0] + _data_pdfs_1m1_2m1_311[_stride_pdfs_0*ctr_0]; + const double xi_22 = -u_2; + const double xi_25 = (u_2*u_2); + const double u0Mu1 = u_0 + xi_21; + const double u0Pu1 = u_0 + u_1; + const double u1Pu2 = u_1 + u_2; + const double u1Mu2 = u_1 + xi_22; + const double u0Mu2 = u_0 + xi_22; + const double u0Pu2 = u_0 + u_2; + const double f_eq_common = rho - xi_23 - xi_24 - xi_25; + const double xi_26 = f_eq_common + 
rho*-0.666666666666667; + const double xi_28 = f_eq_common + xi_25 + xi_27; + const double xi_29 = f_eq_common + xi_23 + xi_27; + const double xi_30 = f_eq_common + xi_24 + xi_27; + const double xi_2 = xi_24*2 + xi_26; + const double xi_3 = xi_23*2 + xi_26; + const double xi_4 = xi_25*2 + xi_26; + const double xi_6 = u0Mu1*2; + const double xi_7 = (u0Mu1*u0Mu1)*3 + xi_28; + const double xi_8 = u0Pu1*2; + const double xi_9 = (u0Pu1*u0Pu1)*3 + xi_28; + const double xi_10 = u1Pu2*2; + const double xi_11 = (u1Pu2*u1Pu2)*3 + xi_29; + const double xi_12 = u1Mu2*2; + const double xi_13 = (u1Mu2*u1Mu2)*3 + xi_29; + const double xi_14 = u0Mu2*2; + const double xi_15 = (u0Mu2*u0Mu2)*3 + xi_30; + const double xi_16 = u0Pu2*2; + const double xi_17 = (u0Pu2*u0Pu2)*3 + xi_30; + const double xi_1 = omega*0.166666666666667; + const double xi_5 = omega*0.0416666666666667; + double * _data_pdfs_tmp_10_20_30 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2; + _data_pdfs_tmp_10_20_30[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.333333333333333 - _data_pdfs_10_20_30[_stride_pdfs_0*ctr_0]) + _data_pdfs_10_20_30[_stride_pdfs_0*ctr_0]; + double * _data_pdfs_tmp_10_20_31 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + _data_pdfs_tmp_10_20_31[_stride_pdfs_0*ctr_0] = xi_1*(u_1 + xi_2 - 6*_data_pdfs_1m1_20_31[_stride_pdfs_0*ctr_0]) + _data_pdfs_1m1_20_31[_stride_pdfs_0*ctr_0]; + double * _data_pdfs_tmp_10_20_32 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + _data_pdfs_tmp_10_20_32[_stride_pdfs_0*ctr_0] = xi_1*(xi_2 + xi_21 - 6*_data_pdfs_11_20_32[_stride_pdfs_0*ctr_0]) + _data_pdfs_11_20_32[_stride_pdfs_0*ctr_0]; + double * _data_pdfs_tmp_10_20_33 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + _data_pdfs_tmp_10_20_33[_stride_pdfs_0*ctr_0] = xi_1*(-u_0 + xi_3 - 6*_data_pdfs_10_20_33[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_10_20_33[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + double * _data_pdfs_tmp_10_20_34 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + _data_pdfs_tmp_10_20_34[_stride_pdfs_0*ctr_0] = xi_1*(u_0 + xi_3 - 6*_data_pdfs_10_20_34[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_10_20_34[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + double * _data_pdfs_tmp_10_20_35 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3; + _data_pdfs_tmp_10_20_35[_stride_pdfs_0*ctr_0] = xi_1*(u_2 + xi_4 - 6*_data_pdfs_10_2m1_35[_stride_pdfs_0*ctr_0]) + _data_pdfs_10_2m1_35[_stride_pdfs_0*ctr_0]; + double * _data_pdfs_tmp_10_20_36 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3; + _data_pdfs_tmp_10_20_36[_stride_pdfs_0*ctr_0] = xi_1*(xi_22 + xi_4 - 6*_data_pdfs_10_21_36[_stride_pdfs_0*ctr_0]) + _data_pdfs_10_21_36[_stride_pdfs_0*ctr_0]; + double * _data_pdfs_tmp_10_20_37 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + _data_pdfs_tmp_10_20_37[_stride_pdfs_0*ctr_0] = xi_5*(-xi_6 + xi_7 - 24*_data_pdfs_1m1_20_37[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_1m1_20_37[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + double * _data_pdfs_tmp_10_20_38 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + _data_pdfs_tmp_10_20_38[_stride_pdfs_0*ctr_0] = xi_5*(xi_8 + xi_9 - 24*_data_pdfs_1m1_20_38[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_1m1_20_38[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + double * _data_pdfs_tmp_10_20_39 = 
_data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + _data_pdfs_tmp_10_20_39[_stride_pdfs_0*ctr_0] = xi_5*(-xi_8 + xi_9 - 24*_data_pdfs_11_20_39[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_11_20_39[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + double * _data_pdfs_tmp_10_20_310 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + _data_pdfs_tmp_10_20_310[_stride_pdfs_0*ctr_0] = xi_5*(xi_6 + xi_7 - 24*_data_pdfs_11_20_310[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_11_20_310[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + double * _data_pdfs_tmp_10_20_311 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + _data_pdfs_tmp_10_20_311[_stride_pdfs_0*ctr_0] = xi_5*(xi_10 + xi_11 - 24*_data_pdfs_1m1_2m1_311[_stride_pdfs_0*ctr_0]) + _data_pdfs_1m1_2m1_311[_stride_pdfs_0*ctr_0]; + double * _data_pdfs_tmp_10_20_312 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + _data_pdfs_tmp_10_20_312[_stride_pdfs_0*ctr_0] = xi_5*(-xi_12 + xi_13 - 24*_data_pdfs_11_2m1_312[_stride_pdfs_0*ctr_0]) + _data_pdfs_11_2m1_312[_stride_pdfs_0*ctr_0]; + double * _data_pdfs_tmp_10_20_313 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + _data_pdfs_tmp_10_20_313[_stride_pdfs_0*ctr_0] = xi_5*(-xi_14 + xi_15 - 24*_data_pdfs_10_2m1_313[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_10_2m1_313[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + double * _data_pdfs_tmp_10_20_314 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + _data_pdfs_tmp_10_20_314[_stride_pdfs_0*ctr_0] = xi_5*(xi_16 + xi_17 - 24*_data_pdfs_10_2m1_314[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_10_2m1_314[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + double * _data_pdfs_tmp_10_20_315 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + _data_pdfs_tmp_10_20_315[_stride_pdfs_0*ctr_0] = xi_5*(xi_12 + xi_13 - 24*_data_pdfs_1m1_21_315[_stride_pdfs_0*ctr_0]) + _data_pdfs_1m1_21_315[_stride_pdfs_0*ctr_0]; + double * _data_pdfs_tmp_10_20_316 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + _data_pdfs_tmp_10_20_316[_stride_pdfs_0*ctr_0] = xi_5*(-xi_10 + xi_11 - 24*_data_pdfs_11_21_316[_stride_pdfs_0*ctr_0]) + _data_pdfs_11_21_316[_stride_pdfs_0*ctr_0]; + double * _data_pdfs_tmp_10_20_317 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + _data_pdfs_tmp_10_20_317[_stride_pdfs_0*ctr_0] = xi_5*(-xi_16 + xi_17 - 24*_data_pdfs_10_21_317[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]) + _data_pdfs_10_21_317[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + double * _data_pdfs_tmp_10_20_318 = _data_pdfs_tmp + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + _data_pdfs_tmp_10_20_318[_stride_pdfs_0*ctr_0] = xi_5*(xi_14 + xi_15 - 24*_data_pdfs_10_21_318[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]) + _data_pdfs_10_21_318[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + } +} +} + +void UniformGridGPU_LbKernel::operator() ( IBlock * block , cudaStream_t stream ) +{ + auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID); + cuda::GPUField<double> * pdfs_tmp; + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + + WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs->nrOfGhostLayers())); + double * const 
_data_pdfs = pdfs->dataAt(-1, -1, -1, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-1, -int_c(pdfs_tmp->nrOfGhostLayers())); + double * _data_pdfs_tmp = pdfs_tmp->dataAt(-1, -1, -1, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(pdfs->xSize() + 2)); + const int64_t _size_pdfs_0 = int64_t(pdfs->xSize() + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(pdfs->ySize() + 2)); + const int64_t _size_pdfs_1 = int64_t(pdfs->ySize() + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(pdfs->zSize() + 2)); + const int64_t _size_pdfs_2 = int64_t(pdfs->zSize() + 2); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)), int(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)), int(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2))); + dim3 _grid(int(( (_size_pdfs_0 - 2) % int(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) == 0 ? (int64_t)(_size_pdfs_0 - 2) / (int64_t)(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) : ( (int64_t)(_size_pdfs_0 - 2) / (int64_t)(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) ) +1 )), int(( (_size_pdfs_1 - 2) % int(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) == 0 ? (int64_t)(_size_pdfs_1 - 2) / (int64_t)(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) : ( (int64_t)(_size_pdfs_1 - 2) / (int64_t)(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) ) +1 )), int(( (_size_pdfs_2 - 2) % int(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2)) == 0 ? (int64_t)(_size_pdfs_2 - 2) / (int64_t)(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2)) : ( (int64_t)(_size_pdfs_2 - 2) / (int64_t)(((1 < _size_pdfs_2 - 2) ? 
1 : _size_pdfs_2 - 2)) ) +1 ))); + internal_UniformGridGPU_LbKernel::UniformGridGPU_LbKernel<<<_grid, _block, 0, stream>>>(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, omega); + pdfs->swapDataPointers(pdfs_tmp); + +} + + + +void UniformGridGPU_LbKernel::inner( IBlock * block , cudaStream_t stream ) +{ + auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID); + cuda::GPUField<double> * pdfs_tmp; + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + + + CellInterval inner = pdfs->xyzSize(); + inner.expand(-1); + + WALBERLA_ASSERT_GREATER_EQUAL(inner.xMin() - 1, -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(inner.zMin() - 1, -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(inner.yMin() - 1, -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(inner.xMin() - 1, inner.yMin() - 1, inner.zMin() - 1, 0); + WALBERLA_ASSERT_GREATER_EQUAL(inner.xMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(inner.zMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(inner.yMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); + double * _data_pdfs_tmp = pdfs_tmp->dataAt(inner.xMin() - 1, inner.yMin() - 1, inner.zMin() - 1, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(inner.xSize() + 2)); + const int64_t _size_pdfs_0 = int64_t(inner.xSize() + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(inner.ySize() + 2)); + const int64_t _size_pdfs_1 = int64_t(inner.ySize() + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(inner.zSize() + 2)); + const int64_t _size_pdfs_2 = int64_t(inner.zSize() + 2); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)), int(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)), int(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2))); + dim3 _grid(int(( (_size_pdfs_0 - 2) % int(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) == 0 ? (int64_t)(_size_pdfs_0 - 2) / (int64_t)(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) : ( (int64_t)(_size_pdfs_0 - 2) / (int64_t)(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) ) +1 )), int(( (_size_pdfs_1 - 2) % int(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) == 0 ? (int64_t)(_size_pdfs_1 - 2) / (int64_t)(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) : ( (int64_t)(_size_pdfs_1 - 2) / (int64_t)(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) ) +1 )), int(( (_size_pdfs_2 - 2) % int(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2)) == 0 ? (int64_t)(_size_pdfs_2 - 2) / (int64_t)(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2)) : ( (int64_t)(_size_pdfs_2 - 2) / (int64_t)(((1 < _size_pdfs_2 - 2) ? 
1 : _size_pdfs_2 - 2)) ) +1 ))); + internal_UniformGridGPU_LbKernel::UniformGridGPU_LbKernel<<<_grid, _block, 0, stream>>>(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, omega); +} + + +void UniformGridGPU_LbKernel::outer( IBlock * block , cudaStream_t stream ) +{ + static std::vector<CellInterval> layers; + + auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID); + cuda::GPUField<double> * pdfs_tmp; + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + + + if( layers.size() == 0 ) + { + CellInterval ci; + + pdfs->getSliceBeforeGhostLayer(stencil::T, ci, 1, false); + layers.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::B, ci, 1, false); + layers.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::N, ci, 1, false); + ci.expand(Cell(0, 0, -1)); + layers.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::S, ci, 1, false); + ci.expand(Cell(0, 0, -1)); + layers.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::E, ci, 1, false); + ci.expand(Cell(0, -1, -1)); + layers.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::W, ci, 1, false); + ci.expand(Cell(0, -1, -1)); + layers.push_back(ci); + } + + + { + auto parallelSection_ = parallelStreams_.parallelSection( stream ); + for( auto & ci: layers ) + { + parallelSection_.run([&]( auto s ) { + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin() - 1, -int_c(pdfs_tmp->nrOfGhostLayers())); + double * _data_pdfs_tmp = pdfs_tmp->dataAt(ci.xMin() - 1, ci.yMin() - 1, ci.zMin() - 1, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 2)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 2)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 2); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 2)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 2); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)), int(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)), int(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2))); + dim3 _grid(int(( (_size_pdfs_0 - 2) % int(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) == 0 ? (int64_t)(_size_pdfs_0 - 2) / (int64_t)(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) : ( (int64_t)(_size_pdfs_0 - 2) / (int64_t)(((128 < _size_pdfs_0 - 2) ? 128 : _size_pdfs_0 - 2)) ) +1 )), int(( (_size_pdfs_1 - 2) % int(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) == 0 ? (int64_t)(_size_pdfs_1 - 2) / (int64_t)(((1 < _size_pdfs_1 - 2) ? 
1 : _size_pdfs_1 - 2)) : ( (int64_t)(_size_pdfs_1 - 2) / (int64_t)(((1 < _size_pdfs_1 - 2) ? 1 : _size_pdfs_1 - 2)) ) +1 )), int(( (_size_pdfs_2 - 2) % int(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2)) == 0 ? (int64_t)(_size_pdfs_2 - 2) / (int64_t)(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2)) : ( (int64_t)(_size_pdfs_2 - 2) / (int64_t)(((1 < _size_pdfs_2 - 2) ? 1 : _size_pdfs_2 - 2)) ) +1 ))); + internal_UniformGridGPU_LbKernel::UniformGridGPU_LbKernel<<<_grid, _block, 0, s>>>(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, omega); + }); + } + } + + + pdfs->swapDataPointers(pdfs_tmp); + +} + + +} // namespace pystencils +} // namespace walberla + + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic pop +#endif \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_LbKernel.h b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LbKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..fc4e13d65e2dc3a43fc41a3afa1924b23dfbdd5f --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_LbKernel.h @@ -0,0 +1,82 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file UniformGridGPU_LbKernel.h +//! 
\\author pystencils +//====================================================================================================================== + +#include "core/DataTypes.h" + +#include "cuda/GPUField.h" +#include "cuda/ParallelStreams.h" +#include "field/SwapableCompare.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" + +#include <set> + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-parameter" +#endif + +namespace walberla { +namespace pystencils { + + +class UniformGridGPU_LbKernel +{ +public: + UniformGridGPU_LbKernel( BlockDataID pdfsID_, double omega_) + : pdfsID(pdfsID_), omega(omega_) + {}; + + void operator() ( IBlock * block , cudaStream_t stream = 0 ); + + void inner( IBlock * block , cudaStream_t stream = 0 ); + void outer( IBlock * block , cudaStream_t stream = 0 ); + + void setOuterPriority(int priority ) { + + parallelStreams_.setStreamPriority(priority); + + } +private: + BlockDataID pdfsID; + double omega; + + std::set< cuda::GPUField<double> *, field::SwapableCompare< cuda::GPUField<double> * > > cache_pdfs_; + + + cuda::ParallelStreams parallelStreams_; + +}; + + +} // namespace pystencils +} // namespace walberla + + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic pop +#endif \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_NoSlip.cu b/apps/benchmarks/UniformGridGPU/UniformGridGPU_NoSlip.cu new file mode 100644 index 0000000000000000000000000000000000000000..78d9848e0533fa812d4ec28adbc310fbafa0202b --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_NoSlip.cu @@ -0,0 +1,121 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file UniformGridGPU_NoSlip.cpp +//! \\ingroup lbm +//! 
\\author lbmpy +//====================================================================================================================== + +#include <cmath> + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "UniformGridGPU_NoSlip.h" +#include "cuda/ErrorChecking.h" + + +#define FUNC_PREFIX __global__ + +using namespace std; + +namespace walberla { +namespace lbm { + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +namespace internal_boundary_UniformGridGPU_NoSlip { +static FUNC_PREFIX void boundary_UniformGridGPU_NoSlip(uint8_t * const _data_indexVector, double * _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t indexVectorSize) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < indexVectorSize) + { + uint8_t * const _data_indexVector_10 = _data_indexVector; + const int32_t x = *((int32_t *)(& _data_indexVector_10[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); + uint8_t * const _data_indexVector_14 = _data_indexVector + 4; + const int32_t y = *((int32_t *)(& _data_indexVector_14[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); + uint8_t * const _data_indexVector_18 = _data_indexVector + 8; + const int32_t z = *((int32_t *)(& _data_indexVector_18[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); + + + const int64_t cx [] = { 0, 0, 0, -1, 1, 0, 0, -1, 1, -1, 1, 0, 0, -1, 1, 0, 0, -1, 1 }; + const int64_t cy [] = { 0, 1, -1, 0, 0, 0, 0, 1, 1, -1, -1, 1, -1, 0, 0, 1, -1, 0, 0 }; + const int64_t cz [] = { 0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, -1 }; + const int invdir [] = { 0, 2, 1, 4, 3, 6, 5, 10, 9, 8, 7, 16, 15, 18, 17, 12, 11, 14, 13 }; + + + const double weights [] = { 0.333333333333333,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778 }; + + uint8_t * const _data_indexVector_112 = _data_indexVector + 12; + const int32_t dir = *((int32_t *)(& _data_indexVector_112[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); + double * _data_pdfs_m3B5BEDEA5094B12F = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_1*cy[dir] + _stride_pdfs_2*z + _stride_pdfs_2*cz[dir] + _stride_pdfs_3*invdir[dir]; + double * _data_pdfs_10_20_m2227275638DDD757 = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3*dir; + _data_pdfs_m3B5BEDEA5094B12F[_stride_pdfs_0*x + _stride_pdfs_0*cx[dir]] = _data_pdfs_10_20_m2227275638DDD757[_stride_pdfs_0*x]; + } +} +} + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + + +void UniformGridGPU_NoSlip::run( IBlock * block, IndexVectors::Type type , cudaStream_t stream ) +{ + auto * indexVectors = block->getData<IndexVectors>(indexVectorID); + + auto pointer = indexVectors->pointerGpu(type); + + + int64_t indexVectorSize = int64_c( indexVectors->indexVector(type).size() ); + if( indexVectorSize == 0) + return; + + uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); + + auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID); + + WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(0, 0, 0, 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); 
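+ // The remaining strides feed the kernel's flat pointer arithmetic. The launch
+ // configuration below then assigns one CUDA thread per boundary link: at most
+ // 256 threads per block in x, and a 1D grid sized by a ceiling division of
+ // indexVectorSize over the block size.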
+ const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((256 < indexVectorSize) ? 256 : indexVectorSize)), int(1), int(1)); + dim3 _grid(int(( (indexVectorSize) % int(((256 < indexVectorSize) ? 256 : indexVectorSize)) == 0 ? (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) : ( (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) ) +1 )), int(1), int(1)); + internal_boundary_UniformGridGPU_NoSlip::boundary_UniformGridGPU_NoSlip<<<_grid, _block, 0, stream>>>(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); +} + +void UniformGridGPU_NoSlip::operator() ( IBlock * block, cudaStream_t stream ) +{ + run( block, IndexVectors::ALL, stream ); +} + +void UniformGridGPU_NoSlip::inner( IBlock * block, cudaStream_t stream ) +{ + run( block, IndexVectors::INNER, stream ); +} + +void UniformGridGPU_NoSlip::outer( IBlock * block, cudaStream_t stream ) +{ + run( block, IndexVectors::OUTER, stream ); +} + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_NoSlip.h b/apps/benchmarks/UniformGridGPU/UniformGridGPU_NoSlip.h new file mode 100644 index 0000000000000000000000000000000000000000..536a99a66fd3172c977a2c3acb3bed889fe04c08 --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_NoSlip.h @@ -0,0 +1,364 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file UniformGridGPU_NoSlip.h +//! 
\\author pystencils +//====================================================================================================================== + + +#include "core/DataTypes.h" + +#include "cuda/GPUField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" + +#include <set> +#include <vector> + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class UniformGridGPU_NoSlip +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir; + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + IndexVectors() : cpuVectors_(NUM_TYPES) {} + bool operator==(IndexVectors & other) { return other.cpuVectors_ == cpuVectors_; } + + ~IndexVectors() { + for( auto & gpuVec: gpuVectors_) + cudaFree( gpuVec ); + } + + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return &(cpuVectors_[t][0]); } + + IndexInfo * pointerGpu(Type t) { return gpuVectors_[t]; } + + + void syncGPU() + { + gpuVectors_.resize( cpuVectors_.size() ); + for(int i=0; i < NUM_TYPES; ++i ) + { + auto & gpuVec = gpuVectors_[i]; + auto & cpuVec = cpuVectors_[i]; + cudaFree( gpuVec ); + cudaMalloc( &gpuVec, sizeof(IndexInfo) * cpuVec.size() ); + cudaMemcpy( gpuVec, &cpuVec[0], sizeof(IndexInfo) * cpuVec.size(), cudaMemcpyHostToDevice ); + } + } + + private: + std::vector<CpuIndexVector> cpuVectors_; + + using GpuIndexVector = IndexInfo *; + std::vector<GpuIndexVector> gpuVectors_; + + }; + + + UniformGridGPU_NoSlip( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_ ) + : pdfsID(pdfsID_) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_UniformGridGPU_NoSlip"); + }; + + void operator() ( IBlock * block , cudaStream_t stream = 0 ); + void inner( IBlock * block , cudaStream_t stream = 0 ); + void outer( IBlock * block , cudaStream_t stream = 0 ); + + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>( &*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField( IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER); + + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = 
flagField->getFlag(domainFlagUID); + + auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + for( auto it = flagField->begin(); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 0 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 1 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 2 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 3 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 5 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 6 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 7 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 9 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + indexVectorAll.push_back( element ); + if( 
inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 11 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 12 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 13 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 15 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 16 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 17 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + } + + indexVectors->syncGPU(); + } + +private: + void run( IBlock * block, IndexVectors::Type type, cudaStream_t stream = 0 ); + + BlockDataID indexVectorID; + + BlockDataID pdfsID; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_PackInfo.cu b/apps/benchmarks/UniformGridGPU/UniformGridGPU_PackInfo.cu new file mode 100644 index 0000000000000000000000000000000000000000..ada9933626237d4e889b071408e737733521124d --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_PackInfo.cu @@ -0,0 +1,1656 @@ +#include "stencil/Directions.h" +#include "core/cell/CellInterval.h" +#include "cuda/GPUField.h" +#include "core/DataTypes.h" +#include "UniformGridGPU_PackInfo.h" + + +#define FUNC_PREFIX __global__ + + +namespace walberla { +namespace pystencils { + +using walberla::cell::CellInterval; +using walberla::stencil::Direction; + + + +namespace 
internal_pack_N { +static FUNC_PREFIX void pack_N(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_37 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x] = _data_pdfs_10_20_37[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_38 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1] = _data_pdfs_10_20_38[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_31 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2] = _data_pdfs_10_20_31[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_311 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3] = _data_pdfs_10_20_311[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_315 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4] = _data_pdfs_10_20_315[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_BE { +static FUNC_PREFIX void pack_BE(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_318 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_318[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_B { +static FUNC_PREFIX void pack_B(double * _data_buffer, double * const _data_pdfs, int64_t 
const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_36 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x] = _data_pdfs_10_20_36[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_317 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1] = _data_pdfs_10_20_317[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_318 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2] = _data_pdfs_10_20_318[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_315 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3] = _data_pdfs_10_20_315[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_316 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4] = _data_pdfs_10_20_316[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_E { +static FUNC_PREFIX void pack_E(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_38 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x] = _data_pdfs_10_20_38[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_34 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + 
_size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1] = _data_pdfs_10_20_34[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_318 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2] = _data_pdfs_10_20_318[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_314 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3] = _data_pdfs_10_20_314[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_310 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4] = _data_pdfs_10_20_310[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_W { +static FUNC_PREFIX void pack_W(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_313 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x] = _data_pdfs_10_20_313[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_37 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1] = _data_pdfs_10_20_37[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_33 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2] = _data_pdfs_10_20_33[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_317 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3] = _data_pdfs_10_20_317[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_39 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 
5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4] = _data_pdfs_10_20_39[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_TN { +static FUNC_PREFIX void pack_TN(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_311 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_311[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_T { +static FUNC_PREFIX void pack_T(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_313 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x] = _data_pdfs_10_20_313[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_35 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1] = _data_pdfs_10_20_35[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_312 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2] = _data_pdfs_10_20_312[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_311 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3] = _data_pdfs_10_20_311[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_314 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4] = _data_pdfs_10_20_314[_stride_pdfs_0*ctr_0]; + } +} +} + 
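+
+// Note on the generated kernels (explanatory comment; the index expressions it
+// summarizes are taken from the generated code above and below): in the D3Q19
+// stencil five PDF components stream across each of the six axis-aligned
+// faces, while exactly one component crosses each of the twelve edges. The
+// face kernels (pack_N/S/E/W/T/B and their unpack counterparts) therefore
+// copy five doubles per slice cell in an array-of-structs layout,
+//
+//     buffer[5*(ctr_2*_size_pdfs_1*_size_pdfs_0 + ctr_1*_size_pdfs_0 + ctr_0) + f]   // f = 0..4
+//
+// whereas the edge kernels (pack_BE, pack_TN, pack_BN, ...) drop the factor
+// of five and copy a single value per cell, and pack_C moves only the centre
+// PDF (f = 0). The pack()/unpack() dispatch further below launches each
+// kernel with thread blocks of up to 16x16x1 over a ceil-divided grid, so the
+// bounds check at the top of every kernel masks threads that fall outside the
+// communication slice.
+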
+namespace internal_pack_BN { +static FUNC_PREFIX void pack_BN(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_315 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_315[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_SE { +static FUNC_PREFIX void pack_SE(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_310 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_310[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_S { +static FUNC_PREFIX void pack_S(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_32 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x] = _data_pdfs_10_20_32[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_312 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1] = _data_pdfs_10_20_312[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_39 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + 
_data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2] = _data_pdfs_10_20_39[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_316 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3] = _data_pdfs_10_20_316[_stride_pdfs_0*ctr_0]; + double * const _data_pdfs_10_20_310 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4] = _data_pdfs_10_20_310[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_BW { +static FUNC_PREFIX void pack_BW(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_317 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_317[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_BS { +static FUNC_PREFIX void pack_BS(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_316 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_316[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_TW { +static FUNC_PREFIX void pack_TW(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t 
ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_313 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_313[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_C { +static FUNC_PREFIX void pack_C(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_30 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_30[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_NE { +static FUNC_PREFIX void pack_NE(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_38 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_38[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_TS { +static FUNC_PREFIX void pack_TS(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_312 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_312[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_NW { +static FUNC_PREFIX void pack_NW(double * _data_buffer, double * 
const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_37 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_37[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_SW { +static FUNC_PREFIX void pack_SW(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_39 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_39[_stride_pdfs_0*ctr_0]; + } +} +} + +namespace internal_pack_TE { +static FUNC_PREFIX void pack_TE(double * _data_buffer, double * const _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * const _data_pdfs_10_20_314 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x] = _data_pdfs_10_20_314[_stride_pdfs_0*ctr_0]; + } +} +} + + + +namespace internal_unpack_S { +static FUNC_PREFIX void unpack_S(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = 
blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_37 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + _data_pdfs_10_20_37[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x]; + double * _data_pdfs_10_20_38 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + _data_pdfs_10_20_38[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1]; + double * _data_pdfs_10_20_31 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + _data_pdfs_10_20_31[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2]; + double * _data_pdfs_10_20_311 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + _data_pdfs_10_20_311[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3]; + double * _data_pdfs_10_20_315 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + _data_pdfs_10_20_315[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4]; + } +} +} + +namespace internal_unpack_TW { +static FUNC_PREFIX void unpack_TW(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_318 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + _data_pdfs_10_20_318[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; + } +} +} + +namespace internal_unpack_T { +static FUNC_PREFIX void unpack_T(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_36 = 
_data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3; + _data_pdfs_10_20_36[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x]; + double * _data_pdfs_10_20_317 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + _data_pdfs_10_20_317[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1]; + double * _data_pdfs_10_20_318 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + _data_pdfs_10_20_318[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2]; + double * _data_pdfs_10_20_315 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + _data_pdfs_10_20_315[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3]; + double * _data_pdfs_10_20_316 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + _data_pdfs_10_20_316[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4]; + } +} +} + +namespace internal_unpack_W { +static FUNC_PREFIX void unpack_W(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_38 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + _data_pdfs_10_20_38[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x]; + double * _data_pdfs_10_20_34 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + _data_pdfs_10_20_34[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1]; + double * _data_pdfs_10_20_318 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + _data_pdfs_10_20_318[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2]; + double * _data_pdfs_10_20_314 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + _data_pdfs_10_20_314[_stride_pdfs_0*ctr_0] 
= _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3]; + double * _data_pdfs_10_20_310 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + _data_pdfs_10_20_310[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4]; + } +} +} + +namespace internal_unpack_E { +static FUNC_PREFIX void unpack_E(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_313 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + _data_pdfs_10_20_313[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x]; + double * _data_pdfs_10_20_37 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + _data_pdfs_10_20_37[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1]; + double * _data_pdfs_10_20_33 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + _data_pdfs_10_20_33[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2]; + double * _data_pdfs_10_20_317 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + _data_pdfs_10_20_317[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3]; + double * _data_pdfs_10_20_39 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + _data_pdfs_10_20_39[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4]; + } +} +} + +namespace internal_unpack_BS { +static FUNC_PREFIX void unpack_BS(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + 
threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_311 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + _data_pdfs_10_20_311[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; + } +} +} + +namespace internal_unpack_B { +static FUNC_PREFIX void unpack_B(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_313 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + _data_pdfs_10_20_313[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x]; + double * _data_pdfs_10_20_35 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3; + _data_pdfs_10_20_35[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1]; + double * _data_pdfs_10_20_312 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + _data_pdfs_10_20_312[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2]; + double * _data_pdfs_10_20_311 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + _data_pdfs_10_20_311[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3]; + double * _data_pdfs_10_20_314 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + _data_pdfs_10_20_314[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4]; + } +} +} + +namespace internal_unpack_TS { +static FUNC_PREFIX void unpack_TS(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_315 = _data_pdfs + 
_stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + _data_pdfs_10_20_315[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; + } +} +} + +namespace internal_unpack_NW { +static FUNC_PREFIX void unpack_NW(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_310 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + _data_pdfs_10_20_310[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; + } +} +} + +namespace internal_unpack_N { +static FUNC_PREFIX void unpack_N(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_32 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + _data_pdfs_10_20_32[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x]; + double * _data_pdfs_10_20_312 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + _data_pdfs_10_20_312[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 1]; + double * _data_pdfs_10_20_39 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + _data_pdfs_10_20_39[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 2]; + double * _data_pdfs_10_20_316 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + _data_pdfs_10_20_316[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 3]; + double * _data_pdfs_10_20_310 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + _data_pdfs_10_20_310[_stride_pdfs_0*ctr_0] = 
_data_buffer[_size_pdfs_0*_size_pdfs_1*(5*blockDim.z*blockIdx.z + 5*threadIdx.z) + _size_pdfs_0*(5*blockDim.y*blockIdx.y + 5*threadIdx.y) + 5*blockDim.x*blockIdx.x + 5*threadIdx.x + 4]; + } +} +} + +namespace internal_unpack_TE { +static FUNC_PREFIX void unpack_TE(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_317 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + _data_pdfs_10_20_317[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; + } +} +} + +namespace internal_unpack_TN { +static FUNC_PREFIX void unpack_TN(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_316 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + _data_pdfs_10_20_316[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; + } +} +} + +namespace internal_unpack_BE { +static FUNC_PREFIX void unpack_BE(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_313 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + _data_pdfs_10_20_313[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; + } +} +} + +namespace internal_unpack_C { +static FUNC_PREFIX void unpack_C(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2) +{ + if 
(blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_30 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2; + _data_pdfs_10_20_30[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; + } +} +} + +namespace internal_unpack_SW { +static FUNC_PREFIX void unpack_SW(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_38 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + _data_pdfs_10_20_38[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; + } +} +} + +namespace internal_unpack_BN { +static FUNC_PREFIX void unpack_BN(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_312 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + _data_pdfs_10_20_312[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x]; + } +} +} + +namespace internal_unpack_SE { +static FUNC_PREFIX void unpack_SE(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2) + { + const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x; + const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z; + double * _data_pdfs_10_20_37 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + _data_pdfs_10_20_37[_stride_pdfs_0*ctr_0] = 
_data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x];
+   }
+}
+}
+
+namespace internal_unpack_NE {
+static FUNC_PREFIX void unpack_NE(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3)
+{
+   if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2)
+   {
+      const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x;
+      const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y;
+      const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z;
+      double * _data_pdfs_10_20_39 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3;
+      _data_pdfs_10_20_39[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x];
+   }
+}
+}
+
+namespace internal_unpack_BW {
+static FUNC_PREFIX void unpack_BW(double * const _data_buffer, double * _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3)
+{
+   if (blockDim.x*blockIdx.x + threadIdx.x < _size_pdfs_0 && blockDim.y*blockIdx.y + threadIdx.y < _size_pdfs_1 && blockDim.z*blockIdx.z + threadIdx.z < _size_pdfs_2)
+   {
+      const int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x;
+      const int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y;
+      const int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z;
+      double * _data_pdfs_10_20_314 = _data_pdfs + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3;
+      _data_pdfs_10_20_314[_stride_pdfs_0*ctr_0] = _data_buffer[_size_pdfs_0*_size_pdfs_1*(blockDim.z*blockIdx.z + threadIdx.z) + _size_pdfs_0*(blockDim.y*blockIdx.y + threadIdx.y) + blockDim.x*blockIdx.x + threadIdx.x];
+   }
+}
+}
+
+
+
+
+// Host-side pack dispatch: copies the PDF slice adjacent to the ghost layer in direction
+// dir into the contiguous send buffer by launching the matching direction kernel on the
+// given stream. Thread blocks are at most 16x16x1, clamped to the slice extents; the grid
+// is the ceiling division of the slice extents by the block size.
+void UniformGridGPU_PackInfo::pack(Direction dir, unsigned char * byte_buffer, IBlock * block, cudaStream_t stream)
+{
+   double * buffer = reinterpret_cast<double*>(byte_buffer);
+
+   auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID);
+
+   CellInterval ci;
+   pdfs->getSliceBeforeGhostLayer(dir, ci, 1, false);
+
+   switch( dir )
+   {
+      case stencil::N:
+      {
+         double * _data_buffer = buffer;
+         WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()));
+         WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()));
+         WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()));
+         double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
+         WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0));
+         const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0);
+         WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0));
+         const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0);
+         WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0));
+         const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0);
+         const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
+         const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
+         const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
+         const int64_t
_stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_N::pack_N<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::BE: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_pack_BE::pack_BE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::B: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_B::pack_B<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::E: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_E::pack_E<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::W: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_pack_W::pack_W<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::TN: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_TN::pack_TN<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::T: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_T::pack_T<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::BN: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_pack_BN::pack_BN<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::SE: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_SE::pack_SE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::S: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_S::pack_S<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::BW: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_pack_BW::pack_BW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::BS: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_BS::pack_BS<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::TW: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_TW::pack_TW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::C: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_pack_C::pack_C<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2); + break; + } + + case stencil::NE: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_NE::pack_NE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::TS: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_TS::pack_TS<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::NW: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_pack_NW::pack_NW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::SW: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_SW::pack_SW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::TE: + { + double * _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_pack_TE::pack_TE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + + default: + WALBERLA_ASSERT(false); + } +} + + +void UniformGridGPU_PackInfo::unpack(Direction dir, unsigned char * byte_buffer, IBlock * block, cudaStream_t stream) +{ + double * buffer = reinterpret_cast<double*>(byte_buffer); + + auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID); + + CellInterval ci; + pdfs->getGhostRegion(dir, ci, 1, false); + + switch( dir ) + { + case stencil::S: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_S::unpack_S<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::TW: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_TW::unpack_TW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::T: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 
16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_T::unpack_T<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::W: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_W::unpack_W<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::E: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_E::unpack_E<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::BS: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 
16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_BS::unpack_BS<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::B: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_B::unpack_B<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::TS: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_TS::unpack_TS<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::NW: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 
16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_NW::unpack_NW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::N: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_N::unpack_N<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::TE: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_TE::unpack_TE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::TN: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 
16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_TN::unpack_TN<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::BE: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_BE::unpack_BE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::C: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_C::unpack_C<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2); + break; + } + + case stencil::SW: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_SW::unpack_SW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::BN: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_BN::unpack_BN<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::SE: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_SE::unpack_SE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::NE: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 
16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_NE::unpack_NE<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + case stencil::BW: + { + double * const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())); + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(ci.xSize() + 0)); + const int64_t _size_pdfs_0 = int64_t(ci.xSize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(ci.ySize() + 0)); + const int64_t _size_pdfs_1 = int64_t(ci.ySize() + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(ci.zSize() + 0)); + const int64_t _size_pdfs_2 = int64_t(ci.zSize() + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)), int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)), int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2))); + dim3 _grid(int(( (_size_pdfs_0) % int(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) == 0 ? (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) : ( (int64_t)(_size_pdfs_0) / (int64_t)(((16 < _size_pdfs_0) ? 16 : _size_pdfs_0)) ) +1 )), int(( (_size_pdfs_1) % int(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) == 0 ? (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) : ( (int64_t)(_size_pdfs_1) / (int64_t)(((16 < _size_pdfs_1) ? 16 : _size_pdfs_1)) ) +1 )), int(( (_size_pdfs_2) % int(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) == 0 ? (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 1 : _size_pdfs_2)) : ( (int64_t)(_size_pdfs_2) / (int64_t)(((1 < _size_pdfs_2) ? 
1 : _size_pdfs_2)) ) +1 ))); + internal_unpack_BW::unpack_BW<<<_grid, _block, 0, stream>>>(_data_buffer, _data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3); + break; + } + + + default: + WALBERLA_ASSERT(false); + } +} + + +uint_t UniformGridGPU_PackInfo::size(stencil::Direction dir, IBlock * block) +{ + auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID); + + CellInterval ci; + pdfs->getGhostRegion(dir, ci, 1, false); + + uint_t elementsPerCell = 0; + + switch( dir ) + { + case stencil::N: + elementsPerCell = 5; + break; + + case stencil::BE: + elementsPerCell = 1; + break; + + case stencil::B: + elementsPerCell = 5; + break; + + case stencil::E: + elementsPerCell = 5; + break; + + case stencil::W: + elementsPerCell = 5; + break; + + case stencil::TN: + elementsPerCell = 1; + break; + + case stencil::T: + elementsPerCell = 5; + break; + + case stencil::BN: + elementsPerCell = 1; + break; + + case stencil::SE: + elementsPerCell = 1; + break; + + case stencil::S: + elementsPerCell = 5; + break; + + case stencil::BW: + elementsPerCell = 1; + break; + + case stencil::BS: + elementsPerCell = 1; + break; + + case stencil::TW: + elementsPerCell = 1; + break; + + case stencil::C: + elementsPerCell = 1; + break; + + case stencil::NE: + elementsPerCell = 1; + break; + + case stencil::TS: + elementsPerCell = 1; + break; + + case stencil::NW: + elementsPerCell = 1; + break; + + case stencil::SW: + elementsPerCell = 1; + break; + + case stencil::TE: + elementsPerCell = 1; + break; + + default: + elementsPerCell = 0; + } + return ci.numCells() * elementsPerCell * sizeof( double ); +} + + + +} // namespace pystencils +} // namespace walberla \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_PackInfo.h b/apps/benchmarks/UniformGridGPU/UniformGridGPU_PackInfo.h new file mode 100644 index 0000000000000000000000000000000000000000..4c9ab98656f0af45079e2e6dca0f2a6b37e5e911 --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_PackInfo.h @@ -0,0 +1,34 @@ +#include "stencil/Directions.h" +#include "core/cell/CellInterval.h" +#include "cuda/GPUField.h" +#include "core/DataTypes.h" +#include "domain_decomposition/IBlock.h" +#include "cuda/communication/GeneratedGPUPackInfo.h" + + +#define FUNC_PREFIX __global__ + + +namespace walberla { +namespace pystencils { + + +class UniformGridGPU_PackInfo : public ::walberla::cuda::GeneratedGPUPackInfo +{ +public: + UniformGridGPU_PackInfo( BlockDataID pdfsID_ ) + : pdfsID(pdfsID_) + {}; + + + virtual void pack (stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream); + virtual void unpack(stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream); + virtual uint_t size (stencil::Direction dir, IBlock * block); + +private: + BlockDataID pdfsID; +}; + + +} // namespace pystencils +} // namespace walberla \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_UBB.cu b/apps/benchmarks/UniformGridGPU/UniformGridGPU_UBB.cu new file mode 100644 index 0000000000000000000000000000000000000000..a774d25115772e39e236ecc6812e8612ded0d317 --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_UBB.cu @@ -0,0 +1,121 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file UniformGridGPU_UBB.cpp +//! \\ingroup lbm +//! \\author lbmpy +//====================================================================================================================== + +#include <cmath> + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "UniformGridGPU_UBB.h" +#include "cuda/ErrorChecking.h" + + +#define FUNC_PREFIX __global__ + +using namespace std; + +namespace walberla { +namespace lbm { + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +namespace internal_boundary_UniformGridGPU_UBB { +static FUNC_PREFIX void boundary_UniformGridGPU_UBB(uint8_t * const _data_indexVector, double * _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t indexVectorSize) +{ + if (blockDim.x*blockIdx.x + threadIdx.x < indexVectorSize) + { + uint8_t * const _data_indexVector_10 = _data_indexVector; + const int32_t x = *((int32_t *)(& _data_indexVector_10[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); + uint8_t * const _data_indexVector_14 = _data_indexVector + 4; + const int32_t y = *((int32_t *)(& _data_indexVector_14[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); + uint8_t * const _data_indexVector_18 = _data_indexVector + 8; + const int32_t z = *((int32_t *)(& _data_indexVector_18[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); + + + const int64_t cx [] = { 0, 0, 0, -1, 1, 0, 0, -1, 1, -1, 1, 0, 0, -1, 1, 0, 0, -1, 1 }; + const int64_t cy [] = { 0, 1, -1, 0, 0, 0, 0, 1, 1, -1, -1, 1, -1, 0, 0, 1, -1, 0, 0 }; + const int64_t cz [] = { 0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, -1 }; + const int invdir [] = { 0, 2, 1, 4, 3, 6, 5, 10, 9, 8, 7, 16, 15, 18, 17, 12, 11, 14, 13 }; + + + const double weights [] = { 0.333333333333333,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778 }; + + uint8_t * const _data_indexVector_112 = _data_indexVector + 12; + const int32_t dir = *((int32_t *)(& _data_indexVector_112[16*blockDim.x*blockIdx.x + 16*threadIdx.x])); + double * _data_pdfs_m3B5BEDEA5094B12F = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_1*cy[dir] + _stride_pdfs_2*z + _stride_pdfs_2*cz[dir] + _stride_pdfs_3*invdir[dir]; + double * _data_pdfs_10_20_m2227275638DDD757 = _data_pdfs + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3*dir; + _data_pdfs_m3B5BEDEA5094B12F[_stride_pdfs_0*x + _stride_pdfs_0*cx[dir]] = -0.30000000000000004*cx[dir]*weights[dir] + 
_data_pdfs_10_20_m2227275638DDD757[_stride_pdfs_0*x]; + } +} +} + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + + +void UniformGridGPU_UBB::run( IBlock * block, IndexVectors::Type type , cudaStream_t stream ) +{ + auto * indexVectors = block->getData<IndexVectors>(indexVectorID); + + auto pointer = indexVectors->pointerGpu(type); + + + int64_t indexVectorSize = int64_c( indexVectors->indexVector(type).size() ); + if( indexVectorSize == 0) + return; + + uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); + + auto pdfs = block->getData< cuda::GPUField<double> >(pdfsID); + + WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers())); + double * _data_pdfs = pdfs->dataAt(0, 0, 0, 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(pdfs->fStride()); + dim3 _block(int(((256 < indexVectorSize) ? 256 : indexVectorSize)), int(1), int(1)); + dim3 _grid(int(( (indexVectorSize) % int(((256 < indexVectorSize) ? 256 : indexVectorSize)) == 0 ? (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) : ( (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) ) +1 )), int(1), int(1)); + internal_boundary_UniformGridGPU_UBB::boundary_UniformGridGPU_UBB<<<_grid, _block, 0, stream>>>(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); +} + +void UniformGridGPU_UBB::operator() ( IBlock * block, cudaStream_t stream ) +{ + run( block, IndexVectors::ALL, stream ); +} + +void UniformGridGPU_UBB::inner( IBlock * block, cudaStream_t stream ) +{ + run( block, IndexVectors::INNER, stream ); +} + +void UniformGridGPU_UBB::outer( IBlock * block, cudaStream_t stream ) +{ + run( block, IndexVectors::OUTER, stream ); +} + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_UBB.h b/apps/benchmarks/UniformGridGPU/UniformGridGPU_UBB.h new file mode 100644 index 0000000000000000000000000000000000000000..3ad393854b98e951a08a75a5632a9fa4a6b210ed --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_UBB.h @@ -0,0 +1,364 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file UniformGridGPU_UBB.h +//! 
\\author pystencils +//====================================================================================================================== + + +#include "core/DataTypes.h" + +#include "cuda/GPUField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" + +#include <set> +#include <vector> + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class UniformGridGPU_UBB +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir; + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + IndexVectors() : cpuVectors_(NUM_TYPES) {} + bool operator==(IndexVectors & other) { return other.cpuVectors_ == cpuVectors_; } + + ~IndexVectors() { + for( auto & gpuVec: gpuVectors_) + cudaFree( gpuVec ); + } + + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return &(cpuVectors_[t][0]); } + + IndexInfo * pointerGpu(Type t) { return gpuVectors_[t]; } + + + void syncGPU() + { + gpuVectors_.resize( cpuVectors_.size() ); + for(int i=0; i < NUM_TYPES; ++i ) + { + auto & gpuVec = gpuVectors_[i]; + auto & cpuVec = cpuVectors_[i]; + cudaFree( gpuVec ); + cudaMalloc( &gpuVec, sizeof(IndexInfo) * cpuVec.size() ); + cudaMemcpy( gpuVec, &cpuVec[0], sizeof(IndexInfo) * cpuVec.size(), cudaMemcpyHostToDevice ); + } + } + + private: + std::vector<CpuIndexVector> cpuVectors_; + + using GpuIndexVector = IndexInfo *; + std::vector<GpuIndexVector> gpuVectors_; + + }; + + + UniformGridGPU_UBB( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_ ) + : pdfsID(pdfsID_) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_UniformGridGPU_UBB"); + }; + + void operator() ( IBlock * block , cudaStream_t stream = 0 ); + void inner( IBlock * block , cudaStream_t stream = 0 ); + void outer( IBlock * block , cudaStream_t stream = 0 ); + + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>( &*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField( IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER); + + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = flagField->getFlag(domainFlagUID); + 
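+ // "inner" below is the cell interval of the block interior, shrunk by one
+ // cell: boundary links whose fluid cell lies inside it are collected into
+ // indexVectorInner, all remaining links into indexVectorOuter, so that the
+ // outer links can be processed while ghost-layer communication is in flight.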
+ auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + for( auto it = flagField->begin(); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 0 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 1 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 2 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 3 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 5 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 6 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 7 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 9 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + 
indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 11 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 12 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 13 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 15 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 16 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 17 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + + } + + indexVectors->syncGPU(); + } + +private: + void run( IBlock * block, IndexVectors::Type type, cudaStream_t stream = 0 ); + + BlockDataID indexVectorID; + + BlockDataID pdfsID; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file
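A note on the launch configuration used by every generated launcher above: the one-line `dim3 _block(...)` / `dim3 _grid(...)` expressions are an inlined clamp-and-ceiling-divide. A minimal restatement, with hypothetical helper names `ceilDiv` and `launchConfig` that do not appear in the generated code:

```cpp
#include <cuda_runtime.h>
#include <cstdint>

// Hypothetical helpers; the generated launchers inline these expressions verbatim.
static inline int64_t ceilDiv( int64_t n, int64_t d )
{
   return ( n % d == 0 ) ? n / d : n / d + 1;
}

static inline void launchConfig( int64_t sx, int64_t sy, int64_t sz, dim3 & block, dim3 & grid )
{
   const int64_t bx = ( 16 < sx ) ? 16 : sx;   // clamp block to at most 16 x 16 x 1 threads
   const int64_t by = ( 16 < sy ) ? 16 : sy;
   const int64_t bz = ( 1  < sz ) ? 1  : sz;
   block = dim3( int(bx), int(by), int(bz) );
   grid  = dim3( int(ceilDiv(sx, bx)), int(ceilDiv(sy, by)), int(ceilDiv(sz, bz)) );
}
```

A 100x16x1 ghost-layer slice, for example, gets a 16x16x1 block and a 7x1x1 grid.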
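`UniformGridGPU_PackInfo::size()` pairs each communication direction with the number of PDFs that actually stream across it in the D3Q19 model: 5 for the six face directions (N, S, E, W, T, B) and 1 for the twelve edge directions as well as for C. The returned byte count is then simply cells x PDFs x sizeof(double); a worked example under the assumption of a 64x64x64 block with one ghost layer:

```cpp
// Buffer size for a face direction such as stencil::N on a 64x64x64 block
// (illustrative numbers only -- the real extents come from getGhostRegion()).
const uint64_t numCells        = 64ull * 64ull * 1ull;  // the ghost slice is one cell thick
const uint64_t elementsPerCell = 5;                     // PDFs streaming through a face in D3Q19
const uint64_t bufferBytes     = numCells * elementsPerCell * sizeof( double ); // 163840 bytes
```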
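The literal `-0.30000000000000004*cx[dir]*weights[dir]` in the UBB kernel is the usual velocity bounce-back correction `-6 w_d (c_d . u_w)`, specialised at code-generation time. Judging from the constant, the wall velocity baked in was u_w = (0.05, 0, 0): only the x component survives, and 6 * double(0.05) rounds to exactly the double printed as 0.30000000000000004. A small sketch that reproduces the number (the velocity value is an inference, not stated anywhere in the diff):

```cpp
#include <cstdio>

int main()
{
   const double ux = 0.05;        // assumed generation-time wall velocity, x component
   const double w  = 1.0 / 18.0;  // D3Q19 weight of an axis-aligned direction, e.g. E
   const int    cx = 1;
   // velocity bounce-back: f_invdir(x + c) = f_dir(x) - 6 * w * (c . u_w)
   printf( "%.17g\n", 6.0 * ux );            // prints 0.30000000000000004
   printf( "%.17g\n", -6.0 * w * cx * ux );  // the per-direction correction term
   return 0;
}
```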
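Finally, the raw byte arithmetic at the top of the boundary kernel (loads at offsets 0, 4, 8 and 12 with a 16-byte stride) is a hand-inlined read of the host-side `IndexInfo` struct. An equivalent, sketch-only formulation (`boundarySketch` is not part of the commit):

```cpp
#include <cstdint>

struct IndexInfo { int32_t x, y, z, dir; };  // 16 bytes -- matches the strides above

__global__ void boundarySketch( const IndexInfo * indexVector, int64_t indexVectorSize )
{
   const int64_t i = int64_t( blockDim.x ) * blockIdx.x + threadIdx.x;
   if( i >= indexVectorSize )
      return;
   const IndexInfo link = indexVector[i];  // one load instead of four pointer casts
   // ... apply the boundary rule at (link.x, link.y, link.z) along link.dir ...
   (void) link;  // silence the unused-variable warning in this sketch
}
```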