diff --git a/.clang-tidy b/.clang-tidy index 7127535c8e66a377978897492b26cb954201dc20..f0e5933ad55dc18c06f14f2c6ef06dc3226eda22 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -8,6 +8,7 @@ boost-*, bugprone-*, -bugprone-branch-clone, -bugprone-exception-escape, +-bugprone-easily-swappable-parameters, misc-*, -misc-misplaced-const, diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a924b0fb33ec4aed389d23f8039fc5fbbde170c0..7a9ba191330df560d19193b583226e268feedca7 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -413,7 +413,7 @@ gcc_8_serial: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:8 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -437,7 +437,7 @@ gcc_8_mpionly: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:8 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -459,7 +459,7 @@ gcc_8_hybrid: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:8 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -480,7 +480,7 @@ gcc_8_serial_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:8 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -505,7 +505,7 @@ gcc_8_mpionly_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:8 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python 
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -528,7 +528,7 @@ gcc_8_hybrid_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:8 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -550,7 +550,7 @@ gcc_8_hybrid_dbg_sp: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:8 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -575,7 +575,7 @@ gcc_9_serial: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:9 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -599,7 +599,7 @@ gcc_9_mpionly: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:9 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -621,7 +621,7 @@ gcc_9_hybrid: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:9 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -642,7 +642,7 @@ gcc_9_serial_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:9 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -667,7 +667,7 
@@ gcc_9_mpionly_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:9 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -690,7 +690,7 @@ gcc_9_hybrid_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:9 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -712,7 +712,7 @@ gcc_9_hybrid_dbg_sp: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:9 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -737,7 +737,7 @@ gcc_10_serial: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:10 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -761,7 +761,7 @@ gcc_10_mpionly: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:10 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -783,7 +783,7 @@ gcc_10_hybrid: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:10 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -804,7 +804,7 @@ gcc_10_serial_dbg: extends: .build_template image: 
i10git.cs.fau.de:5005/walberla/buildenvs/gcc:10 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -829,7 +829,7 @@ gcc_10_mpionly_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:10 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -852,7 +852,7 @@ gcc_10_hybrid_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:10 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -874,7 +874,7 @@ gcc_10_hybrid_dbg_sp: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:10 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -899,7 +899,7 @@ gcc_11_serial: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:11 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -923,7 +923,7 @@ gcc_11_mpionly: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:11 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -945,7 +945,7 @@ gcc_11_hybrid: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:11 before_script: - - pip3 install lbmpy==1.1 
jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -966,7 +966,7 @@ gcc_11_serial_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:11 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -991,7 +991,7 @@ gcc_11_mpionly_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:11 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1014,7 +1014,7 @@ gcc_11_hybrid_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:11 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1036,7 +1036,7 @@ gcc_11_hybrid_dbg_sp: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:11 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1341,7 +1341,7 @@ clang_11.0_serial: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:11.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1365,7 +1365,7 @@ clang_11.0_mpionly: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:11.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd 
python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1387,7 +1387,7 @@ clang_11.0_hybrid: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:11.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1408,7 +1408,7 @@ clang_11.0_serial_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:11.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1433,7 +1433,7 @@ clang_11.0_mpionly_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:11.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1456,7 +1456,7 @@ clang_11.0_hybrid_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:11.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1478,7 +1478,7 @@ clang_11.0_hybrid_dbg_sp: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:11.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1503,7 +1503,7 @@ clang_12.0_serial: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:12.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest 
--junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1527,7 +1527,7 @@ clang_12.0_mpionly: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:12.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1549,7 +1549,7 @@ clang_12.0_hybrid: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:12.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1570,7 +1570,7 @@ clang_12.0_serial_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:12.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1595,7 +1595,7 @@ clang_12.0_mpionly_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:12.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1618,7 +1618,7 @@ clang_12.0_hybrid_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:12.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1640,7 +1640,7 @@ clang_12.0_hybrid_dbg_sp: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:12.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml 
pystencils_walberla lbmpy_walberla - pip3 list @@ -1665,7 +1665,7 @@ clang_13.0_serial: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:13.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1689,7 +1689,7 @@ clang_13.0_mpionly: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:13.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1711,7 +1711,7 @@ clang_13.0_hybrid: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:13.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1729,7 +1729,7 @@ clang_13.0_serial_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:13.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1751,7 +1751,7 @@ clang_13.0_mpionly_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:13.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1771,7 +1771,7 @@ clang_13.0_hybrid_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:13.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla 
- pip3 list @@ -1791,7 +1791,7 @@ clang_13.0_hybrid_dbg_sp: image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:13.0 stage: pretest before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list diff --git a/CMakeLists.txt b/CMakeLists.txt index 429e36ea2c67d660fb5428c5bfa8960a4fce70a1..a0a977795fac8b9f486d7f0cf82e025e2300858f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -603,7 +603,7 @@ endif () ## ############################################################################################################################# if ( WALBERLA_BUILD_WITH_CODEGEN ) - set(LBMPY_MIN_VERSION 1.1) + set(LBMPY_MIN_VERSION 1.2) execute_process(COMMAND ${Python_EXECUTABLE} -c "import lbmpy; print(lbmpy.__version__)" RESULT_VARIABLE LBMPY_FOUND OUTPUT_VARIABLE LBMPY_VERSION) if(NOT LBMPY_FOUND EQUAL 0) diff --git a/apps/benchmarks/CMakeLists.txt b/apps/benchmarks/CMakeLists.txt index 4b95602d6daca9adec9a4932e4f12707f1fb0878..f37d24767eb383e55b1ff2764770ee525bb54c68 100644 --- a/apps/benchmarks/CMakeLists.txt +++ b/apps/benchmarks/CMakeLists.txt @@ -25,10 +25,12 @@ if ( WALBERLA_BUILD_WITH_PYTHON ) add_subdirectory( FlowAroundSphereCodeGen ) add_subdirectory( UniformGridCPU ) add_subdirectory( PhaseFieldAllenCahn ) + add_subdirectory( NonUniformGridCPU ) endif() if ( WALBERLA_BUILD_WITH_CODEGEN AND WALBERLA_BUILD_WITH_GPU_SUPPORT ) add_subdirectory( UniformGridGPU ) + add_subdirectory( NonUniformGridGPU ) endif() endif() diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt b/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt index 4010341a3d5a4ba93558eae60e95f2fcd292bcbc..40a17bda2180db64d3e7887ae8d195e8e85d7656 100644 --- a/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt +++ b/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt @@ -15,6 +15,6 @@ if (WALBERLA_BUILD_WITH_CUDA) waLBerla_add_executable( NAME 
FlowAroundSphereCodeGen FILE FlowAroundSphereCodeGen.cpp DEPENDS blockforest boundary core gpu domain_decomposition field geometry python_coupling timeloop vtk FlowAroundSphereGenerated) else () - waLBerla_add_executable( NAME FlowAroundSphereCodeGen FILE FlowAroundSphereCodeGen.cpp - DEPENDS blockforest boundary core domain_decomposition field geometry python_coupling timeloop vtk FlowAroundSphereGenerated) + waLBerla_add_executable( NAME FlowAroundSphereCodeGen FILE FlowAroundSphereCodeGen.cpp + DEPENDS blockforest boundary core domain_decomposition field geometry python_coupling timeloop vtk FlowAroundSphereGenerated) endif (WALBERLA_BUILD_WITH_CUDA) diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py index c170a8101422dadce166196b011444f4faf08ccb..7dd9d531b9730e9851e0f8cf53b7b48c4ae930a0 100644 --- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py +++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py @@ -47,6 +47,7 @@ with CodeGeneration() as ctx: pdfs=pdfs, density=1.0, streaming_pattern=streaming_pattern, previous_timestep=timesteps[0]) + setter_assignments = setter_assignments.new_without_unused_subexpressions() # opt = {'instruction_set': 'sse', 'assume_aligned': True, 'nontemporal': False, 'assume_inner_stride_one': True} diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py index 41d38d16218d97a633ccca62c951356b16c2f446..673c10e4d7a2a04117d2cb3a25ab1999d94311bd 100644 --- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py +++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py @@ -4,10 +4,10 @@ from lbmpy.relaxationrates import relaxation_rate_from_lattice_viscosity class Scenario: def __init__(self): - self.timesteps = 1001 + self.timesteps = 10 
self.vtkWriteFrequency = 100 - self.cells = (384, 128, 128) + self.cells = (64, 32, 32) self.blocks = (1, 1, 1) self.periodic = (0, 0, 0) diff --git a/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt b/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f332e065fed35fa99367127e9d44b211849cc7b3 --- /dev/null +++ b/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt @@ -0,0 +1,15 @@ +waLBerla_link_files_to_builddir( "*.prm" ) +waLBerla_link_files_to_builddir( "*.py" ) +waLBerla_link_files_to_builddir( "simulation_setup" ) + +waLBerla_generate_target_from_python(NAME NonUniformGridCPUGenerated + FILE NonUniformGridCPU.py + OUT_FILES NonUniformGridCPUStorageSpecification.h NonUniformGridCPUStorageSpecification.cpp + NonUniformGridCPUSweepCollection.h NonUniformGridCPUSweepCollection.cpp + NoSlip.h NoSlip.cpp + UBB.h UBB.cpp + NonUniformGridCPUBoundaryCollection.h + NonUniformGridCPUInfoHeader.h) +waLBerla_add_executable( NAME NonUniformGridCPU + FILES NonUniformGridCPU.cpp + DEPENDS blockforest boundary core domain_decomposition field geometry python_coupling timeloop vtk NonUniformGridCPUGenerated ) \ No newline at end of file diff --git a/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3a44867523a9b94f8f2f9b57bc8b5aeb6aac6819 --- /dev/null +++ b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp @@ -0,0 +1,311 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file NonUniformGridCPU.cpp +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#include "blockforest/Initialization.h" +#include "blockforest/SetupBlockForest.h" +#include "blockforest/loadbalancing/StaticCurve.h" + +#include "core/Environment.h" +#include "core/logging/Initialization.h" +#include "core/timing/RemainingTimeLogger.h" +#include "core/timing/TimingPool.h" + +#include "field/AddToStorage.h" +#include "field/FlagField.h" +#include "field/vtk/VTKWriter.h" + +#include "geometry/InitBoundaryHandling.h" + +#include "lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h" +#include "lbm_generated/field/AddToStorage.h" +#include "lbm_generated/field/PdfField.h" +#include "lbm_generated/refinement/BasicRecursiveTimeStep.h" +#include "lbm_generated/evaluation/PerformanceEvaluation.h" + +#include "python_coupling/CreateConfig.h" +#include "python_coupling/PythonCallback.h" + +#include "timeloop/SweepTimeloop.h" + +#include <cmath> + +#include "NonUniformGridCPUInfoHeader.h" + +using namespace walberla; + +using StorageSpecification_T = lbm::NonUniformGridCPUStorageSpecification; +using Stencil_T = StorageSpecification_T::Stencil; +using CommunicationStencil_T = StorageSpecification_T::CommunicationStencil; + +using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >; +using FlagField_T = FlagField< uint8_t >; +using BoundaryCollection_T = lbm::NonUniformGridCPUBoundaryCollection< FlagField_T >; + +using 
SweepCollection_T = lbm::NonUniformGridCPUSweepCollection; + +using blockforest::communication::NonUniformBufferedScheme; +using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction; + + +class LDCRefinement +{ + private: + const uint_t refinementDepth_; + + public: + LDCRefinement(const uint_t depth) : refinementDepth_(depth){}; + + void operator()(SetupBlockForest& forest) + { + std::vector< SetupBlock* > blocks; + forest.getBlocks(blocks); + + for (auto block : blocks) + { + if (forest.atDomainYMaxBorder(*block)) + { + if (block->getLevel() < refinementDepth_) { block->setMarker(true); } + } + } + } +}; + +class LDC +{ + public: + LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){}; + + Vector3< real_t > acceleration() const { return Vector3< real_t >(0.0); } + RefinementSelectionFunctor refinementSelector() { return LDCRefinement(refinementDepth_); } + + void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID) + { + for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt) + { + Block& b = dynamic_cast< Block& >(*bIt); + const uint_t level = b.getLevel(); + auto flagField = b.getData< FlagField_T >(flagFieldID); + const uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_); + const uint8_t ubbFlag = flagField->registerFlag(ubbFlagUID_); + for (auto cIt = flagField->beginWithGhostLayerXYZ(2); cIt != flagField->end(); ++cIt) + { + const Cell localCell = cIt.cell(); + Cell globalCell(localCell); + sbfs.transformBlockLocalToGlobalCell(globalCell, b); + if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, ubbFlag); } + else if (globalCell.z() < 0 || globalCell.y() < 0 || globalCell.x() < 0 || + globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)) || globalCell.z() >= cell_idx_c(sbfs.getNumberOfZCells(level))) + { + flagField->addFlag(localCell, noslipFlag); + } + } + } + } + private: + const std::string 
refinementProfile_; + const uint_t refinementDepth_; + + const FlagUID noSlipFlagUID_; + const FlagUID ubbFlagUID_; +}; + +static void createSetupBlockForest(SetupBlockForest& setupBfs, const Config::BlockHandle& domainSetup, LDC& ldcSetup, const uint_t numProcesses=uint_c(MPIManager::instance()->numProcesses())) +{ + Vector3< real_t > domainSize = domainSetup.getParameter< Vector3< real_t > >("domainSize"); + Vector3< uint_t > rootBlocks = domainSetup.getParameter< Vector3< uint_t > >("rootBlocks"); + Vector3< bool > periodic = domainSetup.getParameter< Vector3< bool > >("periodic"); + + auto refSelection = ldcSetup.refinementSelector(); + setupBfs.addRefinementSelectionFunction(std::function< void(SetupBlockForest&) >(refSelection)); + const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]); + setupBfs.addWorkloadMemorySUIDAssignmentFunction( blockforest::uniformWorkloadAndMemoryAssignment ); + setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]); + setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalance(true), numProcesses); +} + +int main(int argc, char** argv) +{ + const mpi::Environment env(argc, argv); + mpi::MPIManager::instance()->useWorldComm(); + + for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) + { + WALBERLA_MPI_WORLD_BARRIER() + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// SETUP AND CONFIGURATION /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + auto config = *cfg; + logging::configureLogging(config); + auto domainSetup = config->getOneBlock("DomainSetup"); + + // Reading parameters + auto parameters = config->getOneBlock("Parameters"); + const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4)); + const uint_t refinementDepth = 
parameters.getParameter< uint_t >("refinementDepth", uint_c(1)); + const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50)); + const bool writeSetupForestAndReturn = parameters.getParameter< bool >("writeSetupForestAndReturn", false); + const bool benchmarkKernelOnly = parameters.getParameter< bool >("benchmarkKernelOnly", false); + const uint_t numProcesses = parameters.getParameter< uint_t >( "numProcesses"); + + auto ldc = std::make_shared< LDC >(refinementDepth); + SetupBlockForest setupBfs; + if (writeSetupForestAndReturn) + { + WALBERLA_LOG_INFO_ON_ROOT("Creating SetupBlockForest for " << numProcesses << " processes") + WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...") + createSetupBlockForest(setupBfs, domainSetup, *ldc, numProcesses); + + WALBERLA_ROOT_SECTION() { setupBfs.writeVTKOutput("SetupBlockForest"); } + + WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks()) + for (uint_t level = 0; level <= refinementDepth; level++) + { + const uint_t numberOfBlocks = setupBfs.getNumberOfBlocks(level); + WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks) + } + + WALBERLA_LOG_INFO_ON_ROOT("Ending program") + return EXIT_SUCCESS; + } + + WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...") + createSetupBlockForest(setupBfs, domainSetup, *ldc); + + // Create structured block forest + Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock"); + WALBERLA_LOG_INFO_ON_ROOT("Creating structured block forest...") + auto bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs); + auto blocks = + std::make_shared< StructuredBlockForest >(bfs, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]); + blocks->createCellBoundingBoxes(); + + WALBERLA_ROOT_SECTION() { vtk::writeDomainDecomposition(blocks, "domainDecomposition", "vtk_out"); } + + WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << 
blocks->getNumberOfBlocks()) + for (uint_t level = 0; level <= refinementDepth; level++) + { + WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << blocks->getNumberOfBlocks(level)) + } + + // Creating fields + const StorageSpecification_T StorageSpec = StorageSpecification_T(); + const BlockDataID pdfFieldID = + lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, uint_c(2), field::fzyx); + const BlockDataID velFieldID = + field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx, uint_c(2)); + const BlockDataID densityFieldID = + field::addToStorage< ScalarField_T >(blocks, "density", real_c(1.0), field::fzyx, uint_c(2)); + const BlockDataID flagFieldID = + field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field", uint_c(3)); + + const Cell innerOuterSplit = + Cell(parameters.getParameter< Vector3< cell_idx_t > >("innerOuterSplit", Vector3< cell_idx_t >(1, 1, 1))); + SweepCollection_T sweepCollection(blocks, pdfFieldID, densityFieldID, velFieldID, omega, innerOuterSplit); + for (auto& block : *blocks) + { + sweepCollection.initialise(&block, 2); + } + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// LB SWEEPS AND BOUNDARY HANDLING /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + const FlagUID fluidFlagUID("Fluid"); + ldc->setupBoundaryFlagField(*blocks, flagFieldID); + geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID, 2); + BoundaryCollection_T boundaryCollection(blocks, flagFieldID, pdfFieldID, fluidFlagUID); + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// COMMUNICATION SCHEME /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + WALBERLA_LOG_INFO_ON_ROOT("Setting up 
communication...") + auto communication = std::make_shared< NonUniformBufferedScheme< CommunicationStencil_T > >(blocks); + auto packInfo = lbm_generated::setupNonuniformPdfCommunication< PdfField_T >(blocks, pdfFieldID); + communication->addPackInfo(packInfo); + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// TIME STEP DEFINITIONS /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + lbm_generated::BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T > LBMMeshRefinement( + blocks, pdfFieldID, sweepCollection, boundaryCollection, communication, packInfo); + + SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps); + + if(benchmarkKernelOnly){ + timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide"); + } + else{ + LBMMeshRefinement.addRefinementToTimeLoop(timeLoop); + } + + // VTK + const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + if (vtkWriteFrequency > 0) + { + auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", + "simulation_step", false, true, true, false, 0); + auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >(velFieldID, "vel"); + vtkOutput->addCellDataWriter(velWriter); + + vtkOutput->addBeforeFunction([&]() { + for (auto& block : *blocks) + sweepCollection.calculateMacroscopicParameters(&block); + }); + timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); + } + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// BENCHMARK /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + auto remainingTimeLoggerFrequency = + parameters.getParameter< real_t 
>("remainingTimeLoggerFrequency", real_c(-1.0)); // in seconds + if (remainingTimeLoggerFrequency > 0) + { + auto logger = timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps(), remainingTimeLoggerFrequency); + timeLoop.addFuncAfterTimeStep(logger, "remaining time logger"); + } + + lbm_generated::PerformanceEvaluation<FlagField_T> const performance(blocks, flagFieldID, fluidFlagUID); + field::CellCounter< FlagField_T > fluidCells( blocks, flagFieldID, fluidFlagUID ); + fluidCells(); + + WALBERLA_LOG_INFO_ON_ROOT( "Non uniform Grid benchmark with " << fluidCells.numberOfCells() << " fluid cells (in total on all levels)") + + WcTimingPool timeloopTiming; + WcTimer simTimer; + + WALBERLA_LOG_INFO_ON_ROOT("Starting benchmark with " << timesteps << " time steps") + simTimer.start(); + timeLoop.run(timeloopTiming); + simTimer.end(); + + WALBERLA_LOG_INFO_ON_ROOT("Benchmark finished") + double time = simTimer.max(); + WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); } + performance.logResultOnRoot(timesteps, time); + + const auto reducedTimeloopTiming = timeloopTiming.getReduced(); + WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming) + } + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.py b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.py new file mode 100644 index 0000000000000000000000000000000000000000..3b350b6c9c48e0418244101cb3de1daec26c34ce --- /dev/null +++ b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.py @@ -0,0 +1,68 @@ +import sympy as sp + +import pystencils as ps + +from lbmpy.advanced_streaming.utility import get_timesteps +from lbmpy.boundaries import NoSlip, UBB +from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule +from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil + +from pystencils_walberla import CodeGeneration, generate_info_header +from lbmpy_walberla import 
generate_lbm_package, lbm_boundary_generator + +omega = sp.symbols("omega") +omega_free = sp.Symbol("omega_free") + +info_header = """ +const char * infoStencil = "{stencil}"; +const char * infoStreamingPattern = "{streaming_pattern}"; +const char * infoCollisionSetup = "{collision_setup}"; +const bool infoCseGlobal = {cse_global}; +const bool infoCsePdfs = {cse_pdfs}; +""" + +with CodeGeneration() as ctx: + field_type = "float64" if ctx.double_accuracy else "float32" + + streaming_pattern = 'pull' + timesteps = get_timesteps(streaming_pattern) + stencil = LBStencil(Stencil.D3Q19) + + assert stencil.D == 3, "This application supports only three-dimensional stencils" + pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx') + density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx') + macroscopic_fields = {'density': density_field, 'velocity': velocity_field} + + lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega, + streaming_pattern=streaming_pattern) + lbm_opt = LBMOptimisation(cse_global=False, field_layout="fzyx") + + method = create_lb_method(lbm_config=lbm_config) + collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt) + + no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip', + boundary_object=NoSlip()) + ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB', + boundary_object=UBB([0.05, 0, 0], data_type=field_type)) + + generate_lbm_package(ctx, name="NonUniformGridCPU", + collision_rule=collision_rule, + lbm_config=lbm_config, lbm_optimisation=lbm_opt, + nonuniform=True, boundaries=[no_slip, ubb], + macroscopic_fields=macroscopic_fields, + target=ps.Target.CPU) + + infoHeaderParams = { + 'stencil': stencil.name.lower(), + 'streaming_pattern': streaming_pattern, + 'collision_setup': lbm_config.method.name.lower(), + 'cse_global': int(lbm_opt.cse_global), + 'cse_pdfs': 
int(lbm_opt.cse_pdfs), + } + + field_typedefs = {'VelocityField_T': velocity_field, + 'ScalarField_T': density_field} + + generate_info_header(ctx, 'NonUniformGridCPUInfoHeader', + field_typedefs=field_typedefs, + additional_code=info_header.format(**infoHeaderParams)) diff --git a/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py new file mode 100644 index 0000000000000000000000000000000000000000..1de18d9f0f6ed8ef684eddee74f4712c8f72c852 --- /dev/null +++ b/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py @@ -0,0 +1,57 @@ +import waLBerla as wlb + + +class Scenario: + def __init__(self, domain_size=(32, 32, 32), root_blocks=(2, 2, 2), + cells_per_block=(16, 16, 16)): + + self.domain_size = domain_size + self.root_blocks = root_blocks + self.cells_per_block = cells_per_block + + self.periodic = (0, 0, 0) + + self.config_dict = self.config(print_dict=False) + + @wlb.member_callback + def config(self, print_dict=True): + from pprint import pformat + config_dict = { + 'DomainSetup': { + 'domainSize': self.domain_size, + 'rootBlocks': self.root_blocks, + 'cellsPerBlock': self.cells_per_block, + 'periodic': self.periodic + }, + 'Parameters': { + 'omega': 1.95, + 'timesteps': 101, + + 'refinementDepth': 1, + 'writeSetupForestAndReturn': False, + 'numProcesses': 1, + + 'benchmarkKernelOnly': False, + + 'remainingTimeLoggerFrequency': 3, + + 'vtkWriteFrequency': 50, + } + } + + if print_dict: + wlb.log_info_on_root("Scenario:\n" + pformat(config_dict)) + return config_dict + + +def validation_run(): + """Run with full periodic shear flow or boundary scenario (ldc) to check if the code works""" + wlb.log_info_on_root("Validation run") + wlb.log_info_on_root("") + + scenarios = wlb.ScenarioManager() + scenario = Scenario() + scenarios.add(scenario) + + +validation_run() diff --git a/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt 
b/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6840007e14d5f5af685bb5b262c8bcfd6138d6e --- /dev/null +++ b/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt @@ -0,0 +1,15 @@ +waLBerla_link_files_to_builddir( "*.prm" ) +waLBerla_link_files_to_builddir( "*.py" ) +waLBerla_link_files_to_builddir( "simulation_setup" ) + +waLBerla_generate_target_from_python(NAME NonUniformGridGPUGenerated + FILE NonUniformGridGPU.py + OUT_FILES NonUniformGridGPUStorageSpecification.h NonUniformGridGPUStorageSpecification.${CODEGEN_FILE_SUFFIX} + NonUniformGridGPUSweepCollection.h NonUniformGridGPUSweepCollection.${CODEGEN_FILE_SUFFIX} + NoSlip.h NoSlip.${CODEGEN_FILE_SUFFIX} + UBB.h UBB.${CODEGEN_FILE_SUFFIX} + NonUniformGridGPUBoundaryCollection.h + NonUniformGridGPUInfoHeader.h) +waLBerla_add_executable( NAME NonUniformGridGPU + FILES NonUniformGridGPU.cpp + DEPENDS blockforest boundary core gpu domain_decomposition field geometry python_coupling timeloop vtk NonUniformGridGPUGenerated ) \ No newline at end of file diff --git a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fa3905b4236295275d82e2e4aad91be4ddcbb5ba --- /dev/null +++ b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp @@ -0,0 +1,361 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file NonUniformGridGPU.cpp +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#include "blockforest/Initialization.h" +#include "blockforest/SetupBlockForest.h" +#include "blockforest/loadbalancing/StaticCurve.h" + +#include "core/Environment.h" +#include "core/logging/Initialization.h" +#include "core/timing/RemainingTimeLogger.h" +#include "core/timing/TimingPool.h" + +#include "field/AddToStorage.h" +#include "field/FlagField.h" +#include "field/vtk/VTKWriter.h" + +#include "geometry/InitBoundaryHandling.h" + +#include "gpu/AddGPUFieldToStorage.h" +#include "gpu/DeviceSelectMPI.h" +#include "gpu/FieldCopy.h" +#include "gpu/ErrorChecking.h" +#include "gpu/HostFieldAllocator.h" +#include "gpu/ParallelStreams.h" +#include "gpu/communication/NonUniformGPUScheme.h" + +#include "lbm_generated/evaluation/PerformanceEvaluation.h" +#include "lbm_generated/field/PdfField.h" +#include "lbm_generated/field/AddToStorage.h" +#include "lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h" +#include "lbm_generated/gpu/GPUPdfField.h" +#include "lbm_generated/gpu/AddToStorage.h" +#include "lbm_generated/gpu/BasicRecursiveTimeStepGPU.h" + +#include "python_coupling/CreateConfig.h" +#include "python_coupling/DictWrapper.h" +#include "python_coupling/PythonCallback.h" + +#include "timeloop/SweepTimeloop.h" + +#include <cmath> + +#include "NonUniformGridGPUInfoHeader.h" +using namespace walberla; + +using StorageSpecification_T = 
lbm::NonUniformGridGPUStorageSpecification; +using Stencil_T = StorageSpecification_T::Stencil; +using CommunicationStencil_T = StorageSpecification_T::CommunicationStencil; + +using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >; +using GPUPdfField_T = lbm_generated::GPUPdfField< StorageSpecification_T >; +using FlagField_T = FlagField< uint8_t >; +using BoundaryCollection_T = lbm::NonUniformGridGPUBoundaryCollection< FlagField_T >; + +using SweepCollection_T = lbm::NonUniformGridGPUSweepCollection; + +using gpu::communication::NonUniformGPUScheme; +using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction; + +class LDCRefinement +{ + private: + const uint_t refinementDepth_; + + public: + LDCRefinement(const uint_t depth) : refinementDepth_(depth){}; + + void operator()(SetupBlockForest& forest) + { + std::vector< SetupBlock* > blocks; + forest.getBlocks(blocks); + + for (auto block : blocks) + { + if (forest.atDomainYMaxBorder(*block)) + { + if (block->getLevel() < refinementDepth_) { block->setMarker(true); } + } + } + } +}; + +class LDC +{ + private: + const std::string refinementProfile_; + const uint_t refinementDepth_; + + const FlagUID noSlipFlagUID_; + const FlagUID ubbFlagUID_; + + public: + LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){}; + + Vector3< real_t > acceleration() const { return Vector3< real_t >(0.0); } + RefinementSelectionFunctor refinementSelector() + { + return LDCRefinement(refinementDepth_); + } + + void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID) + { + for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt) + { + Block& b = dynamic_cast< Block& >(*bIt); + const uint_t level = b.getLevel(); + auto flagField = b.getData< FlagField_T >(flagFieldID); + const uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_); + const uint8_t ubbFlag = flagField->registerFlag(ubbFlagUID_); + for (auto cIt = 
flagField->beginWithGhostLayerXYZ(2); cIt != flagField->end(); ++cIt) + { + const Cell localCell = cIt.cell(); + Cell globalCell(localCell); + sbfs.transformBlockLocalToGlobalCell(globalCell, b); + if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, ubbFlag); } + else if (globalCell.z() < 0 || globalCell.y() < 0 || globalCell.x() < 0 || + globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)) || globalCell.z() >= cell_idx_c(sbfs.getNumberOfZCells(level))) + { + flagField->addFlag(localCell, noslipFlag); + } + } + } + } +}; + +static void createSetupBlockForest(SetupBlockForest& setupBfs, const Config::BlockHandle& domainSetup, LDC& ldcSetup, const uint_t numProcesses=uint_c(MPIManager::instance()->numProcesses())) +{ + Vector3< real_t > domainSize = domainSetup.getParameter< Vector3< real_t > >("domainSize"); + Vector3< uint_t > rootBlocks = domainSetup.getParameter< Vector3< uint_t > >("rootBlocks"); + Vector3< bool > periodic = domainSetup.getParameter< Vector3< bool > >("periodic"); + + auto refSelection = ldcSetup.refinementSelector(); + setupBfs.addRefinementSelectionFunction(std::function< void(SetupBlockForest&) >(refSelection)); + const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]); + setupBfs.addWorkloadMemorySUIDAssignmentFunction( blockforest::uniformWorkloadAndMemoryAssignment ); + setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]); + setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses); +} + +int main(int argc, char** argv) +{ + const mpi::Environment env(argc, argv); + mpi::MPIManager::instance()->useWorldComm(); + gpu::selectDeviceBasedOnMpiRank(); + + for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) + { + WALBERLA_MPI_WORLD_BARRIER() + + WALBERLA_GPU_CHECK(gpuPeekAtLastError()) + + 
////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// SETUP AND CONFIGURATION /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + auto config = *cfg; + logging::configureLogging(config); + auto domainSetup = config->getOneBlock("DomainSetup"); + Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock"); + // Reading parameters + auto parameters = config->getOneBlock("Parameters"); + const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4)); + const uint_t refinementDepth = parameters.getParameter< uint_t >("refinementDepth", uint_c(1)); + const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50)); + const bool cudaEnabledMPI = parameters.getParameter< bool >("cudaEnabledMPI", false); + const bool writeSetupForestAndReturn = parameters.getParameter< bool >("writeSetupForestAndReturn", false); + const bool benchmarkKernelOnly = parameters.getParameter< bool >("benchmarkKernelOnly", false); + const uint_t numProcesses = parameters.getParameter< uint_t >( "numProcesses"); + + auto ldc = std::make_shared< LDC >(refinementDepth ); + SetupBlockForest setupBfs; + if (writeSetupForestAndReturn) + { + WALBERLA_LOG_INFO_ON_ROOT("Creating SetupBlockForest for " << numProcesses << " processes") + WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...") + createSetupBlockForest(setupBfs, domainSetup, *ldc, numProcesses); + + WALBERLA_ROOT_SECTION() { setupBfs.writeVTKOutput("SetupBlockForest"); } + + WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks()) + uint_t totalCellUpdates( 0.0 ); + for (uint_t level = 0; level <= refinementDepth; level++) + { + const uint_t numberOfBlocks = setupBfs.getNumberOfBlocks(level); + const uint_t numberOfCells = numberOfBlocks * cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]; + 
totalCellUpdates += timesteps * math::uintPow2(level) * numberOfCells; + WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks) + } + cudaDeviceProp prop; + WALBERLA_GPU_CHECK(gpuGetDeviceProperties(&prop, 0)) + + const uint_t totalNumberCells = setupBfs.getNumberOfBlocks() * cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]; + + const uint_t PDFsPerCell = StorageSpecification_T::inplace ? Stencil_T::Q : 2 * Stencil_T::Q; + const uint_t valuesPerCell = (PDFsPerCell + VelocityField_T::F_SIZE + ScalarField_T::F_SIZE); + const uint_t sizePerValue = sizeof(PdfField_T::value_type); + const double totalGPUMem = double_c(prop.totalGlobalMem) * 1e-9; + const double expectedMemory = double_c(totalNumberCells * valuesPerCell * sizePerValue) * 1e-9; + + WALBERLA_LOG_INFO_ON_ROOT( "Total number of cells will be " << totalNumberCells << " fluid cells (in total on all levels)") + WALBERLA_LOG_INFO_ON_ROOT( "Expected total memory demand will be " << expectedMemory << " GB") + WALBERLA_LOG_INFO_ON_ROOT( "The total cell updates after " << timesteps << " timesteps (on the coarse level) will be " << totalCellUpdates) + WALBERLA_LOG_INFO_ON_ROOT( "Total GPU memory " << totalGPUMem) + + WALBERLA_LOG_INFO_ON_ROOT("Ending program") + return EXIT_SUCCESS; + } + + WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...") + createSetupBlockForest(setupBfs, domainSetup, *ldc); + + // Create structured block forest + WALBERLA_LOG_INFO_ON_ROOT("Creating structured block forest...") + auto bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs); + auto blocks = std::make_shared< StructuredBlockForest >(bfs, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]); + blocks->createCellBoundingBoxes(); + + WALBERLA_ROOT_SECTION() { vtk::writeDomainDecomposition(blocks, "domainDecomposition", "vtk_out"); } + + WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << blocks->getNumberOfBlocks()) + for (uint_t level = 0; level <= 
refinementDepth; level++) + { + WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << blocks->getNumberOfBlocks(level)) + } + + WALBERLA_LOG_INFO_ON_ROOT("Start field allocation") + // Creating fields + const StorageSpecification_T StorageSpec = StorageSpecification_T(); + auto allocator = make_shared< gpu::HostFieldAllocator<real_t> >(); + const BlockDataID pdfFieldCpuID = lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, uint_c(2), field::fzyx, allocator); + const BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx, uint_c(2), allocator); + const BlockDataID densityFieldCpuID = field::addToStorage< ScalarField_T >(blocks, "density", real_c(1.0), field::fzyx, uint_c(2), allocator); + const BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field", uint_c(3)); + + const BlockDataID pdfFieldGpuID = lbm_generated::addGPUPdfFieldToStorage< PdfField_T >(blocks, pdfFieldCpuID, StorageSpec, "pdfs on GPU", true); + const BlockDataID velFieldGpuID = + gpu::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldCpuID, "velocity on GPU", true); + const BlockDataID densityFieldGpuID = + gpu::addGPUFieldToStorage< ScalarField_T >(blocks, densityFieldCpuID, "density on GPU", true); + WALBERLA_LOG_INFO_ON_ROOT("Finished field allocation") + + const Cell innerOuterSplit = Cell(parameters.getParameter< Vector3<cell_idx_t> >("innerOuterSplit", Vector3<cell_idx_t>(1, 1, 1))); + Vector3< int32_t > gpuBlockSize = parameters.getParameter< Vector3< int32_t > >("gpuBlockSize", Vector3< int32_t >(256, 1, 1)); + SweepCollection_T sweepCollection(blocks, pdfFieldGpuID, densityFieldGpuID, velFieldGpuID, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], omega, innerOuterSplit); + for (auto& iBlock : *blocks) + { + sweepCollection.initialise(&iBlock, 2, nullptr); + } + 
////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// LB SWEEPS AND BOUNDARY HANDLING /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + const FlagUID fluidFlagUID("Fluid"); + ldc->setupBoundaryFlagField(*blocks, flagFieldID); + geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID, 2); + BoundaryCollection_T boundaryCollection(blocks, flagFieldID, pdfFieldGpuID, fluidFlagUID); + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// COMMUNICATION SCHEME /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + WALBERLA_LOG_INFO_ON_ROOT("Setting up communication...") + auto communication = std::make_shared< NonUniformGPUScheme <CommunicationStencil_T>> (blocks, cudaEnabledMPI); + auto packInfo = lbm_generated::setupNonuniformGPUPdfCommunication<GPUPdfField_T>(blocks, pdfFieldGpuID); + communication->addPackInfo(packInfo); + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// TIME STEP DEFINITIONS /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + int streamHighPriority = 0; + int streamLowPriority = 0; + WALBERLA_GPU_CHECK(gpuDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority)) + sweepCollection.setOuterPriority(streamHighPriority); + auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority); + + lbm_generated::BasicRecursiveTimeStepGPU< GPUPdfField_T, SweepCollection_T, BoundaryCollection_T > LBMMeshRefinement(blocks, pdfFieldGpuID, sweepCollection, boundaryCollection, communication, packInfo); + SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps); + + // 
LBMMeshRefinement.test(5); + // return EXIT_SUCCESS; + + if(benchmarkKernelOnly){ + timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide"); + } + else{ + LBMMeshRefinement.addRefinementToTimeLoop(timeLoop); + } + + // VTK + const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + if (vtkWriteFrequency > 0) + { + auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", + "simulation_step", false, true, true, false, 0); + auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >(velFieldCpuID, "vel"); + vtkOutput->addCellDataWriter(velWriter); + + vtkOutput->addBeforeFunction([&]() { + for (auto& block : *blocks) + sweepCollection.calculateMacroscopicParameters(&block); + gpu::fieldCpy< VelocityField_T, gpu::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID); + }); + timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); + } + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// BENCHMARK /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + auto remainingTimeLoggerFrequency = + parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(-1.0)); // in seconds + if (remainingTimeLoggerFrequency > 0) + { + auto logger = timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps(), remainingTimeLoggerFrequency); + timeLoop.addFuncAfterTimeStep(logger, "remaining time logger"); + } + + lbm_generated::PerformanceEvaluation<FlagField_T> const performance(blocks, flagFieldID, fluidFlagUID); + field::CellCounter< FlagField_T > fluidCells( blocks, flagFieldID, fluidFlagUID ); + fluidCells(); + + WALBERLA_LOG_INFO_ON_ROOT( "Non uniform Grid benchmark with " << fluidCells.numberOfCells() << " fluid cells (in total on all levels)") + + WcTimingPool timeloopTiming; + 
WcTimer simTimer; + + WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) + WALBERLA_GPU_CHECK(gpuPeekAtLastError()) + WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps") + simTimer.start(); + timeLoop.run(timeloopTiming); + WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) + simTimer.end(); + + WALBERLA_LOG_INFO_ON_ROOT("Simulation finished") + double time = simTimer.max(); + WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); } + performance.logResultOnRoot(timesteps, time); + + const auto reducedTimeloopTiming = timeloopTiming.getReduced(); + WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming) + } + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.py b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.py new file mode 100644 index 0000000000000000000000000000000000000000..d523b5c0c1b8dfcbfa1cf112c0342edfdee03c7d --- /dev/null +++ b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.py @@ -0,0 +1,79 @@ +import sympy as sp +import numpy as np + +import pystencils as ps +from pystencils.typing import TypedSymbol + +from lbmpy.advanced_streaming.utility import get_timesteps +from lbmpy.boundaries import NoSlip, UBB +from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule +from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil + +from pystencils_walberla import CodeGeneration, generate_info_header +from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator + +omega = sp.symbols("omega") +omega_free = sp.Symbol("omega_free") +compile_time_block_size = False +max_threads = 256 + +sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32), + TypedSymbol("cudaBlockSize1", np.int32), + TypedSymbol("cudaBlockSize2", np.int32)) + +gpu_indexing_params = {'block_size': sweep_block_size} + +info_header = """ +const char * infoStencil = "{stencil}"; +const char * infoStreamingPattern 
= "{streaming_pattern}"; +const char * infoCollisionSetup = "{collision_setup}"; +const bool infoCseGlobal = {cse_global}; +const bool infoCsePdfs = {cse_pdfs}; +""" + +with CodeGeneration() as ctx: + field_type = "float64" if ctx.double_accuracy else "float32" + + streaming_pattern = 'pull' + timesteps = get_timesteps(streaming_pattern) + stencil = LBStencil(Stencil.D3Q19) + + assert stencil.D == 3, "This application supports only three-dimensional stencils" + pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx') + density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx') + macroscopic_fields = {'density': density_field, 'velocity': velocity_field} + + lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega, + streaming_pattern=streaming_pattern) + lbm_opt = LBMOptimisation(cse_global=False, field_layout='fzyx') + + method = create_lb_method(lbm_config=lbm_config) + collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt) + + no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip', + boundary_object=NoSlip()) + ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB', + boundary_object=UBB([0.05, 0, 0], data_type=field_type)) + + generate_lbm_package(ctx, name="NonUniformGridGPU", + collision_rule=collision_rule, + lbm_config=lbm_config, lbm_optimisation=lbm_opt, + nonuniform=True, boundaries=[no_slip, ubb], + macroscopic_fields=macroscopic_fields, + target=ps.Target.GPU, gpu_indexing_params=gpu_indexing_params, + max_threads=max_threads) + + infoHeaderParams = { + 'stencil': stencil.name.lower(), + 'streaming_pattern': streaming_pattern, + 'collision_setup': lbm_config.method.name.lower(), + 'cse_global': int(lbm_opt.cse_global), + 'cse_pdfs': int(lbm_opt.cse_pdfs), + } + + field_typedefs = {'VelocityField_T': velocity_field, + 'ScalarField_T': density_field} + + generate_info_header(ctx, 
'NonUniformGridGPUInfoHeader', + field_typedefs=field_typedefs, + additional_code=info_header.format(**infoHeaderParams)) diff --git a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py new file mode 100644 index 0000000000000000000000000000000000000000..d05852fd1934c71ea67d6cce3a8ae3f4cc80e61a --- /dev/null +++ b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py @@ -0,0 +1,66 @@ +import waLBerla as wlb + + +class Scenario: + def __init__(self, domain_size=(64, 64, 64), root_blocks=(2, 2, 2), + cells_per_block=(32, 32, 32), refinement_depth=0): + + self.domain_size = domain_size + self.root_blocks = root_blocks + self.cells_per_block = cells_per_block + self.refinement_depth = refinement_depth + + self.periodic = (0, 0, 0) + + self.config_dict = self.config(print_dict=False) + + @wlb.member_callback + def config(self, print_dict=True): + from pprint import pformat + config_dict = { + 'DomainSetup': { + 'domainSize': self.domain_size, + 'rootBlocks': self.root_blocks, + 'cellsPerBlock': self.cells_per_block, + 'periodic': self.periodic + }, + 'Parameters': { + 'omega': 1.95, + 'timesteps': 1501, + + 'refinementDepth': self.refinement_depth, + 'writeSetupForestAndReturn': False, + 'numProcesses': 1, + + 'cudaEnabledMPI': False, + 'benchmarkKernelOnly': False, + + 'remainingTimeLoggerFrequency': 3, + + 'vtkWriteFrequency': 500, + } + } + + if print_dict and config_dict["Parameters"]["writeSetupForestAndReturn"] is False: + wlb.log_info_on_root("Scenario:\n" + pformat(config_dict)) + return config_dict + + +def validation_run(): + """Run with full periodic shear flow or boundary scenario (ldc) to check if the code works""" + wlb.log_info_on_root("Validation run") + + domain_size = (64, 64, 64) + cells_per_block = (32, 32, 32) + + root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)]) + + scenarios = wlb.ScenarioManager() + scenario = 
Scenario(domain_size=domain_size, + root_blocks=root_blocks, + cells_per_block=cells_per_block, + refinement_depth=1) + scenarios.add(scenario) + + +validation_run() diff --git a/apps/benchmarks/UniformGridCPU/CMakeLists.txt b/apps/benchmarks/UniformGridCPU/CMakeLists.txt index a2f06826e40553f9c157c5b5e5200ba8ed2b26b2..0d159bc542c6ada48999dace8e2b7dce4a085519 100644 --- a/apps/benchmarks/UniformGridCPU/CMakeLists.txt +++ b/apps/benchmarks/UniformGridCPU/CMakeLists.txt @@ -15,13 +15,11 @@ foreach(streaming_pattern pull push aa esotwist) waLBerla_generate_target_from_python(NAME UniformGridCPUGenerated_${config} FILE UniformGridCPU.py CODEGEN_CFG ${config} - OUT_FILES UniformGridCPU_LbKernel.cpp UniformGridCPU_LbKernel.h - UniformGridCPU_PackInfoEven.cpp UniformGridCPU_PackInfoEven.h - UniformGridCPU_PackInfoOdd.cpp UniformGridCPU_PackInfoOdd.h - UniformGridCPU_NoSlip.cpp UniformGridCPU_NoSlip.h - UniformGridCPU_UBB.cpp UniformGridCPU_UBB.h - UniformGridCPU_MacroSetter.cpp UniformGridCPU_MacroSetter.h - UniformGridCPU_MacroGetter.cpp UniformGridCPU_MacroGetter.h + OUT_FILES UniformGridCPUStorageSpecification.h UniformGridCPUStorageSpecification.cpp + UniformGridCPUSweepCollection.h UniformGridCPUSweepCollection.cpp + NoSlip.cpp NoSlip.h + UBB.cpp UBB.h + UniformGridCPUBoundaryCollection.h UniformGridCPU_StreamOnlyKernel.cpp UniformGridCPU_StreamOnlyKernel.h UniformGridCPU_InfoHeader.h ) diff --git a/apps/benchmarks/UniformGridCPU/InitShearVelocity.h b/apps/benchmarks/UniformGridCPU/InitShearVelocity.h index 9a6c7d1db63a5a7ad53376ed33f56851a806dafe..fd13a03b6d89ed30007969ecb2b77f27d015d8a6 100644 --- a/apps/benchmarks/UniformGridCPU/InitShearVelocity.h +++ b/apps/benchmarks/UniformGridCPU/InitShearVelocity.h @@ -16,7 +16,7 @@ inline void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velField, Cell globalCell; blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z)); - real_t 
randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude); + const real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude); velField->get(x, y, z, 1) = real_t(0); velField->get(x, y, z, 2) = randomReal; diff --git a/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp index 3b4a77a570ad86d2adc95789f0a58cda3a3dd4e9..64d94ce3d0dd843b29e693d446485bac73b84119 100644 --- a/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp +++ b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp @@ -34,7 +34,10 @@ #include "geometry/InitBoundaryHandling.h" -#include "lbm/communication/CombinedInPlaceCpuPackInfo.h" +#include "lbm_generated/field/PdfField.h" +#include "lbm_generated/field/AddToStorage.h" +#include "lbm_generated/communication/UniformGeneratedPdfPackInfo.h" +#include "lbm_generated/evaluation/PerformanceEvaluation.h" #include "python_coupling/CreateConfig.h" #include "python_coupling/DictWrapper.h" @@ -50,21 +53,20 @@ using namespace walberla; -using PackInfoEven_T = lbm::UniformGridCPU_PackInfoEven; -using PackInfoOdd_T = lbm::UniformGridCPU_PackInfoOdd; -using LbSweep = lbm::UniformGridCPU_LbKernel; +using StorageSpecification_T = lbm::UniformGridCPUStorageSpecification; +using Stencil_T = lbm::UniformGridCPUStorageSpecification::Stencil; +using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >; using FlagField_T = FlagField< uint8_t >; +using BoundaryCollection_T = lbm::UniformGridCPUBoundaryCollection< FlagField_T >; -auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage* const storage) { - return new PdfField_T(storage->getNumberOfXCells(*block), storage->getNumberOfYCells(*block), - storage->getNumberOfZCells(*block), uint_t(1), field::fzyx, - make_shared< field::AllocateAligned< real_t, 64 > >()); -}; +using SweepCollection_T = lbm::UniformGridCPUSweepCollection; + +using 
blockforest::communication::UniformBufferedScheme; int main(int argc, char** argv) { - mpi::Environment const env(argc, argv); + const mpi::Environment env(argc, argv); for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) { @@ -74,8 +76,6 @@ int main(int argc, char** argv) logging::configureLogging(config); auto blocks = blockforest::createUniformBlockGridFromConfig(config); - Vector3< uint_t > cellsPerBlock = - config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock"); // Reading parameters auto parameters = config->getOneBlock("Parameters"); const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4)); @@ -83,9 +83,12 @@ int main(int argc, char** argv) const bool initShearFlow = parameters.getParameter< bool >("initShearFlow", true); // Creating fields - BlockDataID pdfFieldId = blocks->addStructuredBlockData< PdfField_T >(pdfFieldAdder, "pdfs"); - BlockDataID velFieldId = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx); - BlockDataID const densityFieldId = field::addToStorage< ScalarField_T >(blocks, "density", real_c(1.0), field::fzyx); + const StorageSpecification_T StorageSpec = StorageSpecification_T(); + auto fieldAllocator = make_shared< field::AllocateAligned< real_t, 64 > >(); + const BlockDataID pdfFieldId = lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, field::fzyx, fieldAllocator); + const BlockDataID velFieldId = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx); + const BlockDataID densityFieldId = field::addToStorage< ScalarField_T >(blocks, "density", real_c(1.0), field::fzyx); + const BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field"); // Initialize velocity on cpu if (initShearFlow) @@ -94,157 +97,76 @@ int main(int argc, char** argv) initShearVelocity(blocks, velFieldId); } - pystencils::UniformGridCPU_MacroSetter 
setterSweep(densityFieldId, pdfFieldId, velFieldId); - pystencils::UniformGridCPU_MacroGetter getterSweep(densityFieldId, pdfFieldId, velFieldId); + const Cell innerOuterSplit = Cell(parameters.getParameter< Vector3<cell_idx_t> >("innerOuterSplit", Vector3<cell_idx_t>(1, 1, 1))); + SweepCollection_T sweepCollection(blocks, pdfFieldId, densityFieldId, velFieldId, omega, innerOuterSplit); - // Set up initial PDF values for (auto& block : *blocks) - setterSweep(&block); - - Vector3< int > innerOuterSplit = - parameters.getParameter< Vector3< int > >("innerOuterSplit", Vector3< int >(1, 1, 1)); - - for (uint_t i = 0; i < 3; ++i) { - if (int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) - { - WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock") - } + sweepCollection.initialise(&block); } - Cell const innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]); - LbSweep lbSweep(pdfFieldId, omega, innerOuterSplitCell); - pystencils::UniformGridCPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldId); + const pystencils::UniformGridCPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldId); // Boundaries const FlagUID fluidFlagUID("Fluid"); - BlockDataID const flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field"); auto boundariesConfig = config->getBlock("Boundaries"); - bool boundaries = false; if (boundariesConfig) { WALBERLA_LOG_INFO_ON_ROOT("Setting boundary conditions") - boundaries = true; geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig); - geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID); } - - lbm::UniformGridCPU_NoSlip noSlip(blocks, pdfFieldId); - noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID); - - lbm::UniformGridCPU_UBB ubb(blocks, pdfFieldId); - ubb.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID); + 
geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID); + BoundaryCollection_T boundaryCollection(blocks, flagFieldID, pdfFieldId, fluidFlagUID); ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// COMMUNICATION SCHEME /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Initial setup is the post-collision state of an even time step - auto tracker = make_shared< lbm::TimestepTracker >(0); - auto packInfo = - make_shared< lbm::CombinedInPlaceCpuPackInfo< PackInfoEven_T , PackInfoOdd_T > >(tracker, pdfFieldId); - - blockforest::communication::UniformBufferedScheme< Stencil_T > communication(blocks); + auto packInfo = std::make_shared<lbm_generated::UniformGeneratedPdfPackInfo< PdfField_T >>(pdfFieldId); + UniformBufferedScheme< Stencil_T > communication(blocks); communication.addPackInfo(packInfo); ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// TIME STEP DEFINITIONS /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps); + const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal"); - auto boundarySweep = [&](IBlock* block, uint8_t t) { - noSlip.run(block, t); - ubb.run(block, t); - }; - - auto boundaryInner = [&](IBlock* block, uint8_t t) { - noSlip.inner(block, t); - ubb.inner(block, t); - }; - - auto boundaryOuter = [&](IBlock* block, uint8_t t) { - noSlip.outer(block, t); - ubb.outer(block, t); - }; - - auto simpleOverlapTimeStep = [&]() { - // Communicate post-collision values of previous timestep... 
- communication.startCommunication(); - for (auto& block : *blocks) - { - if (boundaries) boundaryInner(&block, tracker->getCounter()); - lbSweep.inner(&block, tracker->getCounterPlusOne()); - } - communication.wait(); - for (auto& block : *blocks) - { - if (boundaries) boundaryOuter(&block, tracker->getCounter()); - lbSweep.outer(&block, tracker->getCounterPlusOne()); - } - - tracker->advance(); - }; - - auto normalTimeStep = [&]() { - communication.communicate(); - for (auto& block : *blocks) - { - if (boundaries) boundarySweep(&block, tracker->getCounter()); - lbSweep(&block, tracker->getCounterPlusOne()); - } - - tracker->advance(); - }; - - // With two-fields patterns, ghost layer cells act as constant stream-in boundaries; - // with in-place patterns, ghost layer cells act as wet-node no-slip boundaries. - auto kernelOnlyFunc = [&]() { - tracker->advance(); - for (auto& block : *blocks) - lbSweep(&block, tracker->getCounter()); - }; - - // Stream only function to test a streaming pattern without executing lbm operations inside - auto StreamOnlyFunc = [&]() { - for (auto& block : *blocks) - StreamOnlyKernel(&block); - }; + if (timeStepStrategy == "noOverlap") { + if (boundariesConfig){ + timeLoop.add() << BeforeFunction(communication, "communication") + << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL), "Boundary Conditions"); + timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide"); + }else { + timeLoop.add() << BeforeFunction(communication, "communication") + << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide");} + + } else if (timeStepStrategy == "simpleOverlap") { + if (boundariesConfig){ + timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(), "Start Communication") + << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL), "Boundary Conditions"); + timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER), "LBM 
StreamCollide Inner Frame"); + timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication") + << Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER), "LBM StreamCollide Outer Frame"); + }else{ + timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(), "Start Communication") + << Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER), "LBM StreamCollide Inner Frame"); + timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication") + << Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER), "LBM StreamCollide Outer Frame");} + + } else if (timeStepStrategy == "kernelOnly") { + timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide"); + } else if (timeStepStrategy == "StreamOnly") { + timeLoop.add() << Sweep(StreamOnlyKernel, "LBM Stream Only"); + } else { + WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'") + } ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// TIME LOOP SETUP /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps); - - const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal"); - std::function< void() > timeStep; - if (timeStepStrategy == "noOverlap") - timeStep = std::function< void() >(normalTimeStep); - else if (timeStepStrategy == "simpleOverlap") - timeStep = simpleOverlapTimeStep; - else if (timeStepStrategy == "kernelOnly") - { - WALBERLA_LOG_INFO_ON_ROOT( - "Running only compute kernel without boundary - this makes only sense for benchmarking!") - // Run initial communication once to provide any missing stream-in populations - communication.communicate(); - timeStep = kernelOnlyFunc; - } - else if (timeStepStrategy == "StreamOnly") - { - 
WALBERLA_LOG_INFO_ON_ROOT( - "Running only streaming kernel without LBM - this makes only sense for benchmarking!") - // Run initial communication once to provide any missing stream-in populations - timeStep = StreamOnlyFunc; - } - else - { - WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', " - "'simpleOverlap', 'kernelOnly'") - } - - timeLoop.add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step"); - - uint_t const vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); if (vtkWriteFrequency > 0) { auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", @@ -254,7 +176,7 @@ int main(int argc, char** argv) vtkOutput->addBeforeFunction([&]() { for (auto& block : *blocks){ - getterSweep(&block);} + sweepCollection.calculateMacroscopicParameters(&block);} }); timeLoop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); @@ -263,46 +185,50 @@ int main(int argc, char** argv) ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// BENCHMARK /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + lbm_generated::PerformanceEvaluation<FlagField_T> const performance(blocks, flagFieldID, fluidFlagUID); - int const warmupSteps = parameters.getParameter< int >("warmupSteps", 2); - int const outerIterations = parameters.getParameter< int >("outerIterations", 1); - for (int i = 0; i < warmupSteps; ++i) + const uint_t warmupSteps = parameters.getParameter< uint_t >("warmupSteps", uint_c(2)); + const uint_t outerIterations = parameters.getParameter< uint_t >("outerIterations", uint_c(1)); + for (uint_t i = 0; i < warmupSteps; ++i) timeLoop.singleStep(); - real_t const remainingTimeLoggerFrequency = + auto 
remainingTimeLoggerFrequency = parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(-1.0)); // in seconds if (remainingTimeLoggerFrequency > 0) { - auto logger = timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps() * uint_c(outerIterations), + auto logger = timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps() * outerIterations, remainingTimeLoggerFrequency); timeLoop.addFuncAfterTimeStep(logger, "remaining time logger"); } - for (int outerIteration = 0; outerIteration < outerIterations; ++outerIteration) + for (uint_t outerIteration = 0; outerIteration < outerIterations; ++outerIteration) { timeLoop.setCurrentTimeStepToZero(); + + WcTimingPool timeloopTiming; WcTimer simTimer; + + WALBERLA_MPI_WORLD_BARRIER() WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps") + simTimer.start(); - timeLoop.run(); + timeLoop.run(timeloopTiming); simTimer.end(); + WALBERLA_LOG_INFO_ON_ROOT("Simulation finished") - auto time = real_c(simTimer.last()); - WALBERLA_MPI_SECTION() - { - walberla::mpi::reduceInplace(time, walberla::mpi::MAX); - } - auto nrOfCells = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]); + double time = simTimer.max(); + WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); } + performance.logResultOnRoot(timesteps, time); + + const auto reducedTimeloopTiming = timeloopTiming.getReduced(); + WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming) - auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6; - WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process " << mlupsPerProcess) - WALBERLA_LOG_RESULT_ON_ROOT("Time per time step " << time / real_c(timesteps)) WALBERLA_ROOT_SECTION() { python_coupling::PythonCallback pythonCallbackResults("results_callback"); if (pythonCallbackResults.isCallable()) { - pythonCallbackResults.data().exposeValue("mlupsPerProcess", mlupsPerProcess); + pythonCallbackResults.data().exposeValue("mlupsPerProcess", 
performance.mlupsPerProcess(timesteps, time)); pythonCallbackResults.data().exposeValue("stencil", infoStencil); pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern); pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup); diff --git a/apps/benchmarks/UniformGridCPU/UniformGridCPU.py b/apps/benchmarks/UniformGridCPU/UniformGridCPU.py index cba55fac4675c18d8f25f10541de2138002b1208..cd1a36114788a0ad440f89d750abc8af26109eda 100644 --- a/apps/benchmarks/UniformGridCPU/UniformGridCPU.py +++ b/apps/benchmarks/UniformGridCPU/UniformGridCPU.py @@ -6,19 +6,17 @@ import pystencils as ps from pystencils.simp.subexpression_insertion import insert_zeros, insert_aliases, insert_constants,\ insert_symbol_times_minus_one -from lbmpy.advanced_streaming import Timestep, is_inplace +from lbmpy.advanced_streaming import is_inplace from lbmpy.advanced_streaming.utility import streaming_patterns from lbmpy.boundaries import NoSlip, UBB from lbmpy.creationfunctions import LBMConfig, LBMOptimisation, LBStencil, create_lb_collision_rule from lbmpy.enums import Method, Stencil from lbmpy.fieldaccess import CollideOnlyInplaceAccessor -from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter +from lbmpy.moments import get_default_moment_set_for_stencil from lbmpy.updatekernels import create_stream_only_kernel -from pystencils_walberla import CodeGeneration, generate_pack_info_from_kernel, generate_sweep,\ - generate_mpidtype_info_from_kernel, generate_info_header - -from lbmpy_walberla import generate_alternating_lbm_sweep, generate_alternating_lbm_boundary, generate_lb_pack_info +from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep +from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator omega = sp.symbols('omega') omega_free = sp.Symbol('omega_free') @@ -121,15 +119,17 @@ with CodeGeneration() as ctx: options = options_dict[collision_setup] - q = 
stencil.Q - dim = stencil.D - assert dim == 3, "This app supports only three-dimensional stencils" - pdfs, pdfs_tmp = ps.fields(f"pdfs({q}), pdfs_tmp({q}): {field_type}[3D]", layout='fzyx') + assert stencil.D == 3, "This application supports only three-dimensional stencils" + pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx') density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx') + macroscopic_fields = {'density': density_field, 'velocity': velocity_field} lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options) lbm_opt = LBMOptimisation(cse_global=True, cse_pdfs=False, symbolic_field=pdfs, field_layout='fzyx') + if lbm_config.method == Method.CENTRAL_MOMENT: + lbm_config = replace(lbm_config, nested_moments=get_default_moment_set_for_stencil(stencil)) + if not is_inplace(streaming_pattern): lbm_opt = replace(lbm_opt, symbolic_temporary_field=pdfs_tmp) field_swaps = [(pdfs, pdfs_tmp)] @@ -153,46 +153,22 @@ with CodeGeneration() as ctx: collision_rule = insert_aliases(collision_rule) collision_rule = insert_symbol_times_minus_one(collision_rule) - lb_method = collision_rule.method - - generate_alternating_lbm_sweep(ctx, 'UniformGridCPU_LbKernel', collision_rule, lbm_config=lbm_config, - lbm_optimisation=lbm_opt, target=ps.Target.CPU, - inner_outer_split=True, field_swaps=field_swaps, - cpu_openmp=openmp, cpu_vectorize_info=cpu_vec) - - # getter & setter - setter_assignments = macroscopic_values_setter(lb_method, - density=density_field.center, velocity=velocity_field.center_vector, - pdfs=pdfs, - streaming_pattern=streaming_pattern, - previous_timestep=Timestep.EVEN) - getter_assignments = macroscopic_values_getter(lb_method, - density=density_field, velocity=velocity_field, - pdfs=pdfs, - streaming_pattern=streaming_pattern, - previous_timestep=Timestep.EVEN) - - generate_sweep(ctx, 'UniformGridCPU_MacroSetter', 
setter_assignments, target=ps.Target.CPU, cpu_openmp=openmp) - generate_sweep(ctx, 'UniformGridCPU_MacroGetter', getter_assignments, target=ps.Target.CPU, cpu_openmp=openmp) + no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip', + boundary_object=NoSlip()) + ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB', + boundary_object=UBB([0.05, 0, 0], data_type=field_type)) + + generate_lbm_package(ctx, name="UniformGridCPU", + collision_rule=collision_rule, + lbm_config=lbm_config, lbm_optimisation=lbm_opt, + nonuniform=False, boundaries=[no_slip, ubb], + macroscopic_fields=macroscopic_fields, + cpu_openmp=openmp, cpu_vectorize_info=cpu_vec) # Stream only kernel generate_sweep(ctx, 'UniformGridCPU_StreamOnlyKernel', stream_only_kernel, field_swaps=field_swaps_stream_only, target=ps.Target.CPU, cpu_openmp=openmp) - # Boundaries - noslip = NoSlip() - ubb = UBB((0.05, 0, 0), data_type=field_type) - - generate_alternating_lbm_boundary(ctx, 'UniformGridCPU_NoSlip', noslip, lb_method, field_name=pdfs.name, - streaming_pattern=streaming_pattern, target=ps.Target.CPU, cpu_openmp=openmp) - generate_alternating_lbm_boundary(ctx, 'UniformGridCPU_UBB', ubb, lb_method, field_name=pdfs.name, - streaming_pattern=streaming_pattern, target=ps.Target.CPU, cpu_openmp=openmp) - - # communication - generate_lb_pack_info(ctx, 'UniformGridCPU_PackInfo', stencil, pdfs, - streaming_pattern=streaming_pattern, target=ps.Target.CPU, - always_generate_separate_classes=True) - infoHeaderParams = { 'stencil': stencil_str, 'streaming_pattern': streaming_pattern, @@ -201,13 +177,10 @@ with CodeGeneration() as ctx: 'cse_pdfs': int(lbm_opt.cse_pdfs), } - stencil_typedefs = {'Stencil_T': stencil, - 'CommunicationStencil_T': stencil} - field_typedefs = {'PdfField_T': pdfs, - 'VelocityField_T': velocity_field, + field_typedefs = {'VelocityField_T': velocity_field, 'ScalarField_T': density_field} # Info header containing correct template definitions for stencil and field 
generate_info_header(ctx, 'UniformGridCPU_InfoHeader', - stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs, + field_typedefs=field_typedefs, additional_code=info_header.format(**infoHeaderParams)) diff --git a/apps/benchmarks/UniformGridCPU/simulation_setup/PizDaintJobScript.py b/apps/benchmarks/UniformGridCPU/simulation_setup/PizDaintJobScript.py new file mode 100644 index 0000000000000000000000000000000000000000..3c4aa08ec2c2328be7d102d4f377a2cd754dc8af --- /dev/null +++ b/apps/benchmarks/UniformGridCPU/simulation_setup/PizDaintJobScript.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +import os +from waLBerla.tools.config import block_decomposition + + +job_script_header = """ +#!/bin/bash -l +#SBATCH --job-name=scaling +#SBATCH --time=01:00:00 +#SBATCH --nodes={nodes} +#SBATCH -o out_scaling_{nodes}_%j.txt +#SBATCH -e err_scaling_{nodes}_%j.txt +#SBATCH --ntasks-per-core=1 +#SBATCH --cpus-per-task=1 +#SBATCH --partition=normal +#SBATCH --constraint=gpu +#SBATCH --account=s1042 + +source ~/env.sh + +export MPICH_RDMA_ENABLED_CUDA=1 # allow GPU-GPU data transfer +export CRAY_CUDA_MPS=1 # allow GPU sharing +export MPICH_G2G_PIPELINE=256 # adapt maximum number of concurrent in-flight messages + +export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK +export CRAY_CUDA_MPS=1 + +export MPICH_RANK_REORDER_METHOD=3 +export PMI_MMAP_SYNC_WAIT_TIME=300 + +cd {folder} +# grid_order -R -H -c 1,1,8 -g 16,16,8 + +ulimit -c 0 +""" + +job_script_exe_part = """ + +export WALBERLA_SCENARIO_IDX=0 +while srun -n {nodes} ./{app} {config} +do + ((WALBERLA_SCENARIO_IDX++)) +done +""" + +streaming_patterns = ['pull', 'push', 'aa', 'esotwist'] +stencils = ['d3q27', 'd3q19'] +methods = ['srt', 'mrt', 'cumulant', 'entropic'] + +all_executables = [] + +for stencil in stencils: + for streaming_pattern in streaming_patterns: + for method in methods: + all_executables.append(f"UniformGridGPU_{stencil}_{streaming_pattern}_{method}") + +all_executables = tuple(all_executables) + + +def 
generate_jobscripts(exe_names=all_executables): + for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]: + folder_name = "scaling_{:04d}".format(node_count) + os.makedirs(folder_name, exist_ok=True) + + # run grid_order + import subprocess + decomposition = block_decomposition(node_count) + decomposition_str = ",".join(str(e) for e in decomposition) + subprocess.check_call(['grid_order', '-R', '-H', '-g', decomposition_str]) + + job_script = job_script_header.format(nodes=node_count, folder=os.path.join(os.getcwd(), folder_name)) + for exe in exe_names: + job_script += job_script_exe_part.format(app="../" + exe, nodes=node_count, + config='../communication_compare.py') + + with open(os.path.join(folder_name, 'job.sh'), 'w') as f: + f.write(job_script) + + +if __name__ == '__main__': + print("Called without waLBerla - generating job scripts for PizDaint") + generate_jobscripts() diff --git a/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py index f432e778bc8e7d5c82120db40469ed7d2f2aa7ed..9acab66da85c8f5477251e66bc7a9ea37ccc2fd7 100755 --- a/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py +++ b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py @@ -9,13 +9,15 @@ from math import prod # Number of time steps run for a workload of 128^3 per process # if double as many cells are on the process, half as many time steps are run etc. 
# increase this to get more reliable measurements -TIME_STEPS_FOR_128_BLOCK = 5 +TIME_STEPS_FOR_128_BLOCK = 10 DB_FILE = os.environ.get('DB_FILE', "cpu_benchmark.sqlite3") def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK): cells = block_size[0] * block_size[1] * block_size[2] time_steps = (128 ** 3 / cells) * time_steps_for_128_block + if time_steps < TIME_STEPS_FOR_128_BLOCK: + time_steps = 5 return int(time_steps) @@ -39,7 +41,7 @@ class Scenario: init_shear_flow = False periodic = (0, 0, 0) - self.blocks = block_decomposition(wlb.mpi.numProcesses()) + self.blocks = (2, 1, 1) # block_decomposition(wlb.mpi.numProcesses()) self.cells_per_block = cells_per_block self.periodic = periodic @@ -66,6 +68,7 @@ class Scenario: 'blocks': self.blocks, 'cellsPerBlock': self.cells_per_block, 'periodic': self.periodic, + 'oneBlockPerProcess': False }, 'Parameters': { 'omega': self.omega, @@ -176,6 +179,7 @@ def single_node_benchmark(): for block_size in block_sizes: scenario = Scenario(cells_per_block=block_size, time_step_strategy='kernelOnly', + outer_iterations=1, timesteps=num_time_steps(block_size)) scenarios.add(scenario) @@ -185,26 +189,26 @@ def validation_run(): wlb.log_info_on_root("Validation run") wlb.log_info_on_root("") - time_step_strategy = 'simpleOverlap' # 'noOverlap' + time_step_strategy = "noOverlap" # "noOverlap" scenarios = wlb.ScenarioManager() scenario = Scenario(cells_per_block=(64, 64, 64), time_step_strategy=time_step_strategy, - timesteps=101, + timesteps=201, outer_iterations=1, warmup_steps=0, - init_shear_flow=True, - boundary_setup=False, - vtk_write_frequency=100, + init_shear_flow=False, + boundary_setup=True, + vtk_write_frequency=50, remaining_time_logger_frequency=10) scenarios.add(scenario) wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}") # Select the benchmark you want to run -single_node_benchmark() # benchmarks different CUDA block sizes and domain sizes and measures 
single GPU +# single_node_benchmark() # benchmarks different CUDA block sizes and domain sizes and measures single GPU # performance of compute kernel (no communication) # overlap_benchmark() # benchmarks different communication overlap options # profiling() # run only two timesteps on a smaller domain for profiling only -# validation_run() +validation_run() # scaling_benchmark() diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt index b1f74c57130935614f1e86c71d31a003afc27b7a..66a5b0fa4f4a3588f36ba4dbd5feb732131f76d0 100644 --- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt +++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt @@ -14,13 +14,12 @@ foreach(streaming_pattern pull push aa esotwist) waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config} FILE UniformGridGPU.py CODEGEN_CFG ${config} - OUT_FILES UniformGridGPU_LbKernel.${CODEGEN_FILE_SUFFIX} UniformGridGPU_LbKernel.h - UniformGridGPU_PackInfoEven.${CODEGEN_FILE_SUFFIX} UniformGridGPU_PackInfoEven.h - UniformGridGPU_PackInfoOdd.${CODEGEN_FILE_SUFFIX} UniformGridGPU_PackInfoOdd.h - UniformGridGPU_NoSlip.${CODEGEN_FILE_SUFFIX} UniformGridGPU_NoSlip.h - UniformGridGPU_UBB.${CODEGEN_FILE_SUFFIX} UniformGridGPU_UBB.h - UniformGridGPU_MacroSetter.${CODEGEN_FILE_SUFFIX} UniformGridGPU_MacroSetter.h - UniformGridGPU_StreamOnlyKernel.${CODEGEN_FILE_SUFFIX} UniformGridGPU_StreamOnlyKernel.h + OUT_FILES UniformGridGPUStorageSpecification.h UniformGridGPUStorageSpecification.${CODEGEN_FILE_SUFFIX} + UniformGridGPUSweepCollection.h UniformGridGPUSweepCollection.${CODEGEN_FILE_SUFFIX} + NoSlip.h NoSlip.${CODEGEN_FILE_SUFFIX} + UBB.h UBB.${CODEGEN_FILE_SUFFIX} + UniformGridGPUBoundaryCollection.h + UniformGridGPU_StreamOnlyKernel.h UniformGridGPU_StreamOnlyKernel.${CODEGEN_FILE_SUFFIX} UniformGridGPU_InfoHeader.h ) diff --git a/apps/benchmarks/UniformGridGPU/InitShearVelocity.h b/apps/benchmarks/UniformGridGPU/InitShearVelocity.h index 
9a6c7d1db63a5a7ad53376ed33f56851a806dafe..fd13a03b6d89ed30007969ecb2b77f27d015d8a6 100644 --- a/apps/benchmarks/UniformGridGPU/InitShearVelocity.h +++ b/apps/benchmarks/UniformGridGPU/InitShearVelocity.h @@ -16,7 +16,7 @@ inline void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velField, Cell globalCell; blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z)); - real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude); + const real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude); velField->get(x, y, z, 1) = real_t(0); velField->get(x, y, z, 2) = randomReal; diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp index 7a3885d3b686d0967f7e7825ea109b8051393309..ee022f457738fb6f8aa71f615441e9279fd25eca 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp @@ -29,12 +29,24 @@ #include "field/AddToStorage.h" #include "field/FlagField.h" -#include "field/communication/PackInfo.h" #include "field/vtk/VTKWriter.h" #include "geometry/InitBoundaryHandling.h" -#include "lbm/inplace_streaming/TimestepTracker.h" +#include "gpu/AddGPUFieldToStorage.h" +#include "gpu/DeviceSelectMPI.h" +#include "gpu/FieldCopy.h" +#include "gpu/GPUWrapper.h" +#include "gpu/HostFieldAllocator.h" +#include "gpu/ParallelStreams.h" +#include "gpu/communication/UniformGPUScheme.h" + +#include "lbm_generated/field/PdfField.h" +#include "lbm_generated/field/AddToStorage.h" +#include "lbm_generated/gpu/UniformGeneratedGPUPdfPackInfo.h" +#include "lbm_generated/gpu/GPUPdfField.h" +#include "lbm_generated/gpu/AddToStorage.h" +#include "lbm_generated/evaluation/PerformanceEvaluation.h" #include "python_coupling/CreateConfig.h" #include "python_coupling/DictWrapper.h" @@ -46,16 +58,20 @@ #include 
"InitShearVelocity.h" #include "UniformGridGPU_InfoHeader.h" -#include "gpu/AddGPUFieldToStorage.h" -#include "gpu/DeviceSelectMPI.h" -#include "gpu/FieldCopy.h" -#include "gpu/GPUWrapper.h" -#include "gpu/ParallelStreams.h" -#include "gpu/communication/UniformGPUScheme.h" -#include "gpu/lbm/CombinedInPlaceGpuPackInfo.h" + using namespace walberla; +using StorageSpecification_T = lbm::UniformGridGPUStorageSpecification; +using Stencil_T = lbm::UniformGridGPUStorageSpecification::Stencil; + +using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >; +using GPUPdfField_T = lbm_generated::GPUPdfField< StorageSpecification_T >; using FlagField_T = FlagField< uint8_t >; +using BoundaryCollection_T = lbm::UniformGridGPUBoundaryCollection< FlagField_T >; + +using SweepCollection_T = lbm::UniformGridGPUSweepCollection; + +using gpu::communication::UniformGPUScheme; int main(int argc, char** argv) { @@ -76,18 +92,21 @@ int main(int argc, char** argv) logging::configureLogging(config); auto blocks = blockforest::createUniformBlockGridFromConfig(config); - Vector3< uint_t > cellsPerBlock = - config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock"); // Reading parameters auto parameters = config->getOneBlock("Parameters"); const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4)); const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50)); const bool initShearFlow = parameters.getParameter< bool >("initShearFlow", true); + const bool cudaEnabledMPI = parameters.getParameter< bool >("cudaEnabledMPI", false); // Creating fields - BlockDataID const pdfFieldCpuID = - field::addToStorage< PdfField_T >(blocks, "pdfs cpu", real_c(std::nan("")), field::fzyx); - BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx); + const StorageSpecification_T StorageSpec = StorageSpecification_T(); + const BlockDataID pdfFieldCpuID = 
lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, uint_c(1), field::fzyx); + + auto allocator = make_shared< gpu::HostFieldAllocator<real_t> >(); // use pinned memory allocator for faster CPU-GPU memory transfers + const BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx, uint_c(1), allocator); + const BlockDataID densityFieldCpuID = field::addToStorage< ScalarField_T >(blocks, "density", real_c(1.0), field::fzyx, uint_c(1), allocator); + const BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field"); // Initialize velocity on cpu if (initShearFlow) @@ -96,181 +115,92 @@ int main(int argc, char** argv) initShearVelocity(blocks, velFieldCpuID); } - BlockDataID const pdfFieldGpuID = gpu::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldCpuID, "pdfs on GPU", true); - // Velocity field is copied to the GPU - BlockDataID velFieldGpuID = + const BlockDataID pdfFieldGpuID = lbm_generated::addGPUPdfFieldToStorage< PdfField_T >(blocks, pdfFieldCpuID, StorageSpec, "pdfs on GPU", true); + const BlockDataID velFieldGpuID = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldCpuID, "velocity on GPU", true); + const BlockDataID densityFieldGpuID = + gpu::addGPUFieldToStorage< ScalarField_T >(blocks, densityFieldCpuID, "velocity on GPU", true); - pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldGpuID, velFieldGpuID); - - // Set up initial PDF values + const Cell innerOuterSplit = Cell(parameters.getParameter< Vector3<cell_idx_t> >("innerOuterSplit", Vector3<cell_idx_t>(1, 1, 1))); + Vector3< int32_t > gpuBlockSize = parameters.getParameter< Vector3< int32_t > >("gpuBlockSize", Vector3< int32_t >(256, 1, 1)); + SweepCollection_T sweepCollection(blocks, pdfFieldGpuID, densityFieldGpuID, velFieldGpuID, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], omega, innerOuterSplit); for (auto& block : *blocks) - setterSweep(&block); - - Vector3< int > 
innerOuterSplit = - parameters.getParameter< Vector3< int > >("innerOuterSplit", Vector3< int >(1, 1, 1)); - - for (uint_t i = 0; i < 3; ++i) { - if (int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) - { WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock") } + sweepCollection.initialise(&block); } - Cell const innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]); - bool const cudaEnabledMPI = parameters.getParameter< bool >("cudaEnabledMPI", false); - Vector3< int32_t > gpuBlockSize = - parameters.getParameter< Vector3< int32_t > >("gpuBlockSize", Vector3< int32_t >(256, 1, 1)); - int streamHighPriority = 0; int streamLowPriority = 0; WALBERLA_GPU_CHECK(gpuDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority)) - + sweepCollection.setOuterPriority(streamHighPriority); ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// LB SWEEPS AND BOUNDARY HANDLING /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - - using LbSweep = lbm::UniformGridGPU_LbKernel; - using PackInfoEven = lbm::UniformGridGPU_PackInfoEven; - using PackInfoOdd = lbm::UniformGridGPU_PackInfoOdd; - using gpu::communication::UniformGPUScheme; - - LbSweep lbSweep(pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], innerOuterSplitCell); - lbSweep.setOuterPriority(streamHighPriority); - - pystencils::UniformGridGPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldGpuID, gpuBlockSize[0], gpuBlockSize[1], - gpuBlockSize[2]); + const pystencils::UniformGridGPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldGpuID, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]); // Boundaries const FlagUID fluidFlagUID("Fluid"); - BlockDataID const flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field"); auto boundariesConfig = config->getBlock("Boundaries"); 
- bool boundaries = false; if (boundariesConfig) { - boundaries = true; + WALBERLA_LOG_INFO_ON_ROOT("Setting boundary conditions") geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig); - geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID); } - - lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID); - noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID); - - lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID); - ubb.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID); - - // Initial setup is the post-collision state of an even time step - auto tracker = make_shared< lbm::TimestepTracker >(0); + geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID); + BoundaryCollection_T boundaryCollection(blocks, flagFieldID, pdfFieldGpuID, fluidFlagUID); ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// COMMUNICATION SCHEME /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - UniformGPUScheme< Stencil_T > comm(blocks, cudaEnabledMPI); - auto packInfo = - make_shared< lbm::CombinedInPlaceGpuPackInfo< PackInfoEven, PackInfoOdd > >(tracker, pdfFieldGpuID); - comm.addPackInfo(packInfo); + UniformGPUScheme< Stencil_T > communication(blocks, cudaEnabledMPI); + auto packInfo = std::make_shared<lbm_generated::UniformGeneratedGPUPdfPackInfo< GPUPdfField_T >>(pdfFieldGpuID); + communication.addPackInfo(packInfo); ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// TIME STEP DEFINITIONS /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps); + const std::string timeStepStrategy = 
parameters.getParameter< std::string >("timeStepStrategy", "normal"); auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority); - auto boundarySweep = [&](IBlock* block, uint8_t t, gpuStream_t stream) { - noSlip.run(block, t, stream); - ubb.run(block, t, stream); - }; - - auto boundaryInner = [&](IBlock* block, uint8_t t, gpuStream_t stream) { - noSlip.inner(block, t, stream); - ubb.inner(block, t, stream); - }; - - auto boundaryOuter = [&](IBlock* block, uint8_t t, gpuStream_t stream) { - noSlip.outer(block, t, stream); - ubb.outer(block, t, stream); - }; - - auto simpleOverlapTimeStep = [&]() { - // Communicate post-collision values of previous timestep... - comm.startCommunication(defaultStream); - for (auto& block : *blocks) - { - if (boundaries) boundaryInner(&block, tracker->getCounter(), defaultStream); - lbSweep.inner(&block, tracker->getCounterPlusOne(), defaultStream); - } - comm.wait(defaultStream); - for (auto& block : *blocks) - { - if (boundaries) boundaryOuter(&block, tracker->getCounter(), defaultStream); - lbSweep.outer(&block, tracker->getCounterPlusOne(), defaultStream); - } - - tracker->advance(); - }; - - auto normalTimeStep = [&]() { - comm.communicate(defaultStream); - for (auto& block : *blocks) - { - if (boundaries) boundarySweep(&block, tracker->getCounter(), defaultStream); - lbSweep(&block, tracker->getCounterPlusOne(), defaultStream); - } - - tracker->advance(); - }; - - // With two-fields patterns, ghost layer cells act as constant stream-in boundaries; - // with in-place patterns, ghost layer cells act as wet-node no-slip boundaries. 
- auto kernelOnlyFunc = [&]() { - tracker->advance(); - for (auto& block : *blocks) - lbSweep(&block, tracker->getCounter(), defaultStream); - }; - - // Stream only function to test a streaming pattern without executing lbm operations inside - auto StreamOnlyFunc = [&]() { - for (auto& block : *blocks) - StreamOnlyKernel(&block, defaultStream); - }; + if (timeStepStrategy == "noOverlap") { + if (boundariesConfig){ + timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(defaultStream), "communication") + << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL, defaultStream), "Boundary Conditions"); + timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL, defaultStream), "LBM StreamCollide"); + }else { + timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(defaultStream), "communication") + << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL, defaultStream), "LBM StreamCollide");} + + } else if (timeStepStrategy == "simpleOverlap") { + if (boundariesConfig){ + timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(defaultStream), "Start Communication") + << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL, defaultStream), "Boundary Conditions"); + timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER, defaultStream), "LBM StreamCollide Inner Frame"); + timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication") + << Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER, defaultStream), "LBM StreamCollide Outer Frame"); + }else{ + timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(defaultStream), "Start Communication") + << Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER, defaultStream), "LBM StreamCollide Inner Frame"); + timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication") + << 
Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER,defaultStream), "LBM StreamCollide Outer Frame");} + + } else if (timeStepStrategy == "kernelOnly") { + timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL, defaultStream), "LBM StreamCollide"); + } else if (timeStepStrategy == "StreamOnly") { + timeLoop.add() << Sweep(StreamOnlyKernel, "LBM Stream Only"); + } else { + WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'") + } ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// TIME LOOP SETUP /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps); - - const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal"); - std::function< void() > timeStep; - if (timeStepStrategy == "noOverlap") - timeStep = std::function< void() >(normalTimeStep); - else if (timeStepStrategy == "simpleOverlap") - timeStep = simpleOverlapTimeStep; - else if (timeStepStrategy == "kernelOnly") - { - WALBERLA_LOG_INFO_ON_ROOT( - "Running only compute kernel without boundary - this makes only sense for benchmarking!") - // Run initial communication once to provide any missing stream-in populations - comm.communicate(); - timeStep = kernelOnlyFunc; - } - else if (timeStepStrategy == "StreamOnly") - { - WALBERLA_LOG_INFO_ON_ROOT( - "Running only streaming kernel without LBM - this makes only sense for benchmarking!") - // Run initial communication once to provide any missing stream-in populations - timeStep = StreamOnlyFunc; - } - else - { - WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. 
Allowed values are 'noOverlap', " - "'simpleOverlap', 'kernelOnly'") - } - - timeLoop.add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step"); - // VTK - uint_t const vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); if (vtkWriteFrequency > 0) { auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", @@ -278,7 +208,10 @@ int main(int argc, char** argv) auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >(velFieldCpuID, "vel"); vtkOutput->addCellDataWriter(velWriter); - vtkOutput->addBeforeFunction([&]() { gpu::fieldCpy< VelocityField_T, gpu::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID); + vtkOutput->addBeforeFunction([&]() { + for (auto& block : *blocks) + sweepCollection.calculateMacroscopicParameters(&block); + gpu::fieldCpy< VelocityField_T, gpu::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID); }); timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); } @@ -287,12 +220,13 @@ int main(int argc, char** argv) /// BENCHMARK /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - int const warmupSteps = parameters.getParameter< int >("warmupSteps", 2); - int const outerIterations = parameters.getParameter< int >("outerIterations", 1); - for (int i = 0; i < warmupSteps; ++i) + lbm_generated::PerformanceEvaluation<FlagField_T> const performance(blocks, flagFieldID, fluidFlagUID); + const uint_t warmupSteps = parameters.getParameter< uint_t >("warmupSteps", uint_c(2)); + const uint_t outerIterations = parameters.getParameter< uint_t >("outerIterations", uint_c(1)); + for (uint_t i = 0; i < warmupSteps; ++i) timeLoop.singleStep(); - real_t const remainingTimeLoggerFrequency = + auto remainingTimeLoggerFrequency = parameters.getParameter< real_t 
>("remainingTimeLoggerFrequency", real_c(-1.0)); // in seconds if (remainingTimeLoggerFrequency > 0) { @@ -301,32 +235,36 @@ int main(int argc, char** argv) timeLoop.addFuncAfterTimeStep(logger, "remaining time logger"); } - for (int outerIteration = 0; outerIteration < outerIterations; ++outerIteration) + for (uint_t outerIteration = 0; outerIteration < outerIterations; ++outerIteration) { WALBERLA_GPU_CHECK(gpuPeekAtLastError()) timeLoop.setCurrentTimeStepToZero(); + WcTimingPool const timeloopTiming; WcTimer simTimer; + WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) - WALBERLA_GPU_CHECK(gpuPeekAtLastError()) + WALBERLA_GPU_CHECK( gpuPeekAtLastError() ) WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps") simTimer.start(); timeLoop.run(); WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) simTimer.end(); + WALBERLA_LOG_INFO_ON_ROOT("Simulation finished") - auto time = real_c(simTimer.last()); - auto nrOfCells = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]); + double time = simTimer.max(); + WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); } + performance.logResultOnRoot(timesteps, time); + + const auto reducedTimeloopTiming = timeloopTiming.getReduced(); + WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming) - auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6; - WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process " << mlupsPerProcess) - WALBERLA_LOG_RESULT_ON_ROOT("Time per time step " << time / real_c(timesteps)) WALBERLA_ROOT_SECTION() { python_coupling::PythonCallback pythonCallbackResults("results_callback"); if (pythonCallbackResults.isCallable()) { - pythonCallbackResults.data().exposeValue("mlupsPerProcess", mlupsPerProcess); + pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time)); pythonCallbackResults.data().exposeValue("stencil", infoStencil); 
pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern); pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup); @@ -338,6 +276,5 @@ int main(int argc, char** argv) } } } - return EXIT_SUCCESS; } diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py index e8fa9906aa1ae005af20f0f77178fb054a528161..3d7579e5bcb3f3713f59a9afd94d7fed790c21e9 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py @@ -8,22 +8,21 @@ from pystencils.typing import TypedSymbol from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil -from lbmpy.advanced_streaming import Timestep, is_inplace +from lbmpy.advanced_streaming import is_inplace from lbmpy.advanced_streaming.utility import streaming_patterns from lbmpy.boundaries import NoSlip, UBB from lbmpy.creationfunctions import create_lb_collision_rule -from lbmpy.macroscopic_value_kernels import macroscopic_values_setter from lbmpy.moments import get_default_moment_set_for_stencil from lbmpy.updatekernels import create_stream_only_kernel from lbmpy.fieldaccess import * from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep -from lbmpy_walberla import generate_alternating_lbm_sweep, generate_lb_pack_info, generate_alternating_lbm_boundary +from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator omega = sp.symbols("omega") omega_free = sp.Symbol("omega_free") compile_time_block_size = False -max_threads = None +max_threads = 256 if compile_time_block_size: sweep_block_size = (128, 1, 1) @@ -124,11 +123,10 @@ with CodeGeneration() as ctx: options = options_dict[collision_setup] - q = stencil.Q - dim = stencil.D - assert dim == 3, "This app supports only three-dimensional stencils" - pdfs, pdfs_tmp, velocity_field = 
ps.fields(f"pdfs({q}), pdfs_tmp({q}), velocity(3) : {field_type}[3D]", - layout='fzyx') + assert stencil.D == 3, "This application supports only three-dimensional stencils" + pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx') + density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx') + macroscopic_fields = {'density': density_field, 'velocity': velocity_field} lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options) lbm_opt = LBMOptimisation(cse_global=True, cse_pdfs=False, symbolic_field=pdfs, field_layout='fzyx') @@ -142,12 +140,6 @@ with CodeGeneration() as ctx: else: field_swaps = [] - vp = [ - ('int32_t', 'cudaBlockSize0'), - ('int32_t', 'cudaBlockSize1'), - ('int32_t', 'cudaBlockSize2') - ] - # Sweep for Stream only. This is for benchmarking an empty streaming pattern without LBM. # is_inplace is set to False to ensure that the streaming is done with src and dst field. # If this is not the case the compiler might simplify the streaming in a way that benchmarking makes no sense. 
@@ -165,38 +157,25 @@ with CodeGeneration() as ctx: lb_method = collision_rule.method - generate_alternating_lbm_sweep(ctx, 'UniformGridGPU_LbKernel', collision_rule, lbm_config=lbm_config, - lbm_optimisation=lbm_opt, target=ps.Target.GPU, - gpu_indexing_params=gpu_indexing_params, - inner_outer_split=True, varying_parameters=vp, field_swaps=field_swaps, - max_threads=max_threads) + no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip', + boundary_object=NoSlip()) + ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB', + boundary_object=UBB([0.05, 0, 0], data_type=field_type)) - # getter & setter - setter_assignments = macroscopic_values_setter(lb_method, density=1.0, velocity=velocity_field.center_vector, - pdfs=pdfs, - streaming_pattern=streaming_pattern, - previous_timestep=Timestep.EVEN) - generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments, target=ps.Target.GPU, max_threads=max_threads) + generate_lbm_package(ctx, name="UniformGridGPU", + collision_rule=collision_rule, + lbm_config=lbm_config, lbm_optimisation=lbm_opt, + nonuniform=False, boundaries=[no_slip, ubb], + macroscopic_fields=macroscopic_fields, + target=ps.Target.GPU, gpu_indexing_params=gpu_indexing_params, + max_threads=max_threads) # Stream only kernel + vp = [('int32_t', 'cudaBlockSize0'), ('int32_t', 'cudaBlockSize1'), ('int32_t', 'cudaBlockSize2')] generate_sweep(ctx, 'UniformGridGPU_StreamOnlyKernel', stream_only_kernel, field_swaps=field_swaps_stream_only, gpu_indexing_params=gpu_indexing_params, varying_parameters=vp, target=ps.Target.GPU, max_threads=max_threads) - # Boundaries - noslip = NoSlip() - ubb = UBB((0.05, 0, 0), data_type=field_type) - - generate_alternating_lbm_boundary(ctx, 'UniformGridGPU_NoSlip', noslip, lb_method, field_name=pdfs.name, - streaming_pattern=streaming_pattern, target=ps.Target.GPU) - generate_alternating_lbm_boundary(ctx, 'UniformGridGPU_UBB', ubb, lb_method, field_name=pdfs.name, - 
streaming_pattern=streaming_pattern, target=ps.Target.GPU) - - # communication - generate_lb_pack_info(ctx, 'UniformGridGPU_PackInfo', stencil, pdfs, - streaming_pattern=streaming_pattern, target=ps.Target.GPU, - always_generate_separate_classes=True) - infoHeaderParams = { 'stencil': stencil_str, 'streaming_pattern': streaming_pattern, @@ -205,12 +184,10 @@ with CodeGeneration() as ctx: 'cse_pdfs': int(lbm_opt.cse_pdfs), } - stencil_typedefs = {'Stencil_T': stencil, - 'CommunicationStencil_T': stencil} - field_typedefs = {'PdfField_T': pdfs, - 'VelocityField_T': velocity_field} + field_typedefs = {'VelocityField_T': velocity_field, + 'ScalarField_T': density_field} # Info header containing correct template definitions for stencil and field generate_info_header(ctx, 'UniformGridGPU_InfoHeader', - stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs, + field_typedefs=field_typedefs, additional_code=info_header.format(**infoHeaderParams)) diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py index 8de01dacf51ed5e94ac651a5ca61f50988bd3416..531ab22d54ab261ad8f159c91e85c5bfde03360d 100755 --- a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py +++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py @@ -1,12 +1,3 @@ -#!/usr/bin/env python3 -""" -This is a waLBerla parameter file that tests (almost) all parameter combinations for GPU communication. -Build waLBerla with -DWALBERLA_BUILD_WITH_PYTHON=1 then run e.g. 
- ./UniformGridGPU_d3q27_aa_srt simulation_setup/benchmark_configs.py - -Look at the end of the file to select the benchmark to run -""" - import os import waLBerla as wlb from waLBerla.tools.config import block_decomposition @@ -34,6 +25,15 @@ BASE_CONFIG = { } } +ldc_setup = {'Border': [ + {'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'N', 'walldistance': -1, 'flag': 'UBB'}, + {'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'}, +]} + + def num_time_steps(block_size, time_steps_for_128_block=200): cells = block_size[0] * block_size[1] * block_size[2] @@ -57,10 +57,16 @@ def domain_block_size_ok(block_size, total_mem, gls=1, q=27, size_per_value=8): class Scenario: def __init__(self, cells_per_block=(256, 128, 128), periodic=(1, 1, 1), cuda_blocks=(256, 1, 1), timesteps=None, time_step_strategy="normal", omega=1.8, cuda_enabled_mpi=False, - inner_outer_split=(1, 1, 1), warmup_steps=5, outer_iterations=3, init_shear_flow=False, + inner_outer_split=(1, 1, 1), warmup_steps=5, outer_iterations=3, + init_shear_flow=False, boundary_setup=False, + vtk_write_frequency=0, remaining_time_logger_frequency=-1, additional_info=None): - self.blocks = block_decomposition(wlb.mpi.numProcesses()) + if boundary_setup: + init_shear_flow = False + periodic = (0, 0, 0) + + self.blocks = (2, 1, 1) # block_decomposition(wlb.mpi.numProcesses()) self.cells_per_block = cells_per_block self.periodic = periodic @@ -71,11 +77,13 @@ class Scenario: self.cuda_enabled_mpi = cuda_enabled_mpi self.inner_outer_split = inner_outer_split self.init_shear_flow = init_shear_flow + self.boundary_setup = boundary_setup self.warmup_steps = warmup_steps self.outer_iterations = outer_iterations self.cuda_blocks = cuda_blocks - self.vtk_write_frequency = 0 + self.vtk_write_frequency = 
vtk_write_frequency + self.remaining_time_logger_frequency = remaining_time_logger_frequency self.config_dict = self.config(print_dict=False) self.additional_info = additional_info @@ -88,6 +96,7 @@ class Scenario: 'blocks': self.blocks, 'cellsPerBlock': self.cells_per_block, 'periodic': self.periodic, + 'oneBlockPerProcess': False }, 'Parameters': { 'omega': self.omega, @@ -99,9 +108,13 @@ class Scenario: 'initShearFlow': self.init_shear_flow, 'gpuBlockSize': self.cuda_blocks, 'innerOuterSplit': self.inner_outer_split, - 'vtkWriteFrequency': self.vtk_write_frequency + 'vtkWriteFrequency': self.vtk_write_frequency, + 'remainingTimeLoggerFrequency': self.remaining_time_logger_frequency } } + if self.boundary_setup: + config_dict["Boundaries"] = ldc_setup + if print_dict: wlb.log_info_on_root("Scenario:\n" + pformat(config_dict)) if self.additional_info: @@ -219,90 +232,30 @@ def single_gpu_benchmark(): scenarios.add(scenario) -# -------------------------------------- Optional job script generation for PizDaint --------------------------------- - - -job_script_header = """ -#!/bin/bash -l -#SBATCH --job-name=scaling -#SBATCH --time=01:00:00 -#SBATCH --nodes={nodes} -#SBATCH -o out_scaling_{nodes}_%j.txt -#SBATCH -e err_scaling_{nodes}_%j.txt -#SBATCH --ntasks-per-core=1 -#SBATCH --cpus-per-task=1 -#SBATCH --partition=normal -#SBATCH --constraint=gpu -#SBATCH --account=s1042 - -source ~/env.sh - -export MPICH_RDMA_ENABLED_CUDA=1 # allow GPU-GPU data transfer -export CRAY_CUDA_MPS=1 # allow GPU sharing -export MPICH_G2G_PIPELINE=256 # adapt maximum number of concurrent in-flight messages - -export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -export CRAY_CUDA_MPS=1 - -export MPICH_RANK_REORDER_METHOD=3 -export PMI_MMAP_SYNC_WAIT_TIME=300 - -cd {folder} -# grid_order -R -H -c 1,1,8 -g 16,16,8 - -ulimit -c 0 -""" - -job_script_exe_part = """ - -export WALBERLA_SCENARIO_IDX=0 -while srun -n {nodes} ./{app} {config} -do - ((WALBERLA_SCENARIO_IDX++)) -done -""" - 
-streaming_patterns = ['pull', 'push', 'aa', 'esotwist'] -stencils = ['d3q27', 'd3q19'] -methods = ['srt', 'mrt', 'cumulant', 'entropic'] - -all_executables = [] - -for stencil in stencils: - for streaming_pattern in streaming_patterns: - for method in methods: - all_executables.append(f"UniformGridGPU_{stencil}_{streaming_pattern}_{method}") - -all_executables = tuple(all_executables) - - -def generate_jobscripts(exe_names=all_executables): - for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]: - folder_name = "scaling_{:04d}".format(node_count) - os.makedirs(folder_name, exist_ok=True) - - # run grid_order - import subprocess - decomposition = block_decomposition(node_count) - decomposition_str = ",".join(str(e) for e in decomposition) - subprocess.check_call(['grid_order', '-R', '-H', '-g', decomposition_str]) - - job_script = job_script_header.format(nodes=node_count, folder=os.path.join(os.getcwd(), folder_name)) - for exe in exe_names: - job_script += job_script_exe_part.format(app="../" + exe, nodes=node_count, - config='../communication_compare.py') - - with open(os.path.join(folder_name, 'job.sh'), 'w') as f: - f.write(job_script) +def validation_run(): + """Run with full periodic shear flow or boundary scenario (ldc) to check if the code works""" + wlb.log_info_on_root("Validation run") + wlb.log_info_on_root("") + time_step_strategy = "noOverlap" # "noOverlap" -if __name__ == '__main__': - print("Called without waLBerla - generating job scripts for PizDaint") - generate_jobscripts() -else: - wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}") - # Select the benchmark you want to run - single_gpu_benchmark() # benchmarks different CUDA block sizes and domain sizes and measures single GPU - # performance of compute kernel (no communication) - # overlap_benchmark() # benchmarks different communication overlap options - # profiling() # run only two timesteps on a smaller domain for profiling only + 
scenarios = wlb.ScenarioManager() + scenario = Scenario(cells_per_block=(64, 64, 64), + time_step_strategy=time_step_strategy, + timesteps=1000, + outer_iterations=1, + warmup_steps=0, + init_shear_flow=False, + boundary_setup=True, + vtk_write_frequency=0, + remaining_time_logger_frequency=10) + scenarios.add(scenario) + + +wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}") +# Select the benchmark you want to run +# single_gpu_benchmark() # benchmarks different CUDA block sizes and domain sizes and measures single GPU +# performance of compute kernel (no communication) +# overlap_benchmark() # benchmarks different communication overlap options +# profiling() # run only two timesteps on a smaller domain for profiling only +validation_run() diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/CMakeLists.txt b/apps/showcases/PhaseFieldAllenCahn/GPU/CMakeLists.txt index 95b852203ce277cf293011694e9f39e8063417c9..61e4464d18c4ea1a5ee056f26792c60f6af71250 100644 --- a/apps/showcases/PhaseFieldAllenCahn/GPU/CMakeLists.txt +++ b/apps/showcases/PhaseFieldAllenCahn/GPU/CMakeLists.txt @@ -4,16 +4,16 @@ waLBerla_link_files_to_builddir(*.obj) waLBerla_generate_target_from_python(NAME PhaseFieldCodeGenGPU FILE multiphase_codegen.py - OUT_FILES initialize_phase_field_distributions.cu initialize_phase_field_distributions.h - initialize_velocity_based_distributions.cu initialize_velocity_based_distributions.h - phase_field_LB_step.cu phase_field_LB_step.h - phase_field_LB_NoSlip.cu phase_field_LB_NoSlip.h - hydro_LB_step.cu hydro_LB_step.h - hydro_LB_NoSlip.cu hydro_LB_NoSlip.h - PackInfo_phase_field_distributions.cu PackInfo_phase_field_distributions.h - PackInfo_phase_field.cu PackInfo_phase_field.h - PackInfo_velocity_based_distributions.cu PackInfo_velocity_based_distributions.h - ContactAngle.cu ContactAngle.h + OUT_FILES initialize_phase_field_distributions.${CODEGEN_FILE_SUFFIX} initialize_phase_field_distributions.h + 
initialize_velocity_based_distributions.${CODEGEN_FILE_SUFFIX} initialize_velocity_based_distributions.h + phase_field_LB_step.${CODEGEN_FILE_SUFFIX} phase_field_LB_step.h + phase_field_LB_NoSlip.${CODEGEN_FILE_SUFFIX} phase_field_LB_NoSlip.h + hydro_LB_step.${CODEGEN_FILE_SUFFIX} hydro_LB_step.h + hydro_LB_NoSlip.${CODEGEN_FILE_SUFFIX} hydro_LB_NoSlip.h + PackInfo_phase_field_distributions.${CODEGEN_FILE_SUFFIX} PackInfo_phase_field_distributions.h + PackInfo_phase_field.${CODEGEN_FILE_SUFFIX} PackInfo_phase_field.h + PackInfo_velocity_based_distributions.${CODEGEN_FILE_SUFFIX} PackInfo_velocity_based_distributions.h + ContactAngle.${CODEGEN_FILE_SUFFIX} ContactAngle.h GenDefines.h) waLBerla_add_executable(NAME multiphaseGPU diff --git a/apps/tutorials/gpu/01_GameOfLife_cuda.cpp b/apps/tutorials/gpu/01_GameOfLife_cuda.cpp index e73ec19962dff4cb89f523e83f6466fb685b1c69..2cfc8b30b94e1bad57508d23f5a672de7ccc8df5 100644 --- a/apps/tutorials/gpu/01_GameOfLife_cuda.cpp +++ b/apps/tutorials/gpu/01_GameOfLife_cuda.cpp @@ -30,6 +30,7 @@ #include "gpu/communication/MemcpyPackInfo.h" #include "gpu/communication/UniformGPUScheme.h" +#include "field/AddToStorage.h" #include "field/vtk/VTKWriter.h" #include "geometry/initializer/ScalarFieldFromGrayScaleImage.h" @@ -48,21 +49,6 @@ using CommScheme = gpu::communication::UniformGPUScheme<stencil::D2Q9 > ; using Packing = gpu::communication::MemcpyPackInfo<GPUField> ; -ScalarField * createField( IBlock* const block, StructuredBlockStorage* const storage ) -{ - auto xSize = storage->getNumberOfXCells( *block ); - auto ySize = storage->getNumberOfYCells( *block ); - auto zSize = storage->getNumberOfZCells( *block ); - auto numberOfGhostLayers = uint_c(1); - auto initialValue = real_c(0); - auto fieldLayout = field::fzyx; - return new ScalarField (xSize, ySize, zSize, - numberOfGhostLayers, initialValue, fieldLayout, - make_shared< gpu::HostFieldAllocator<real_t> >() // allocator for host pinned memory - ); -} - - int main( int argc, 
char ** argv ) { walberla::Environment const env( argc, argv ); @@ -78,7 +64,8 @@ int main( int argc, char ** argv ) false, false, false ); // no periodicity - BlockDataID const cpuFieldID = blocks->addStructuredBlockData<ScalarField>( &createField, "CPU Field" ); + auto hostFieldAllocator = make_shared< gpu::HostFieldAllocator<real_t> >(); + BlockDataID const cpuFieldID =field::addToStorage< ScalarField >(blocks, "CPU Field", real_c(0.0), field::fzyx, uint_c(1), hostFieldAllocator); // Initializing the field from an image using geometry::initializer::ScalarFieldFromGrayScaleImage; diff --git a/apps/tutorials/gpu/01_GameOfLife_cuda.dox b/apps/tutorials/gpu/01_GameOfLife_cuda.dox index 77c83e5f66df3f2cf6ab612099dd62c1bdbfcb69..8794e6c520ffb31d2c3653622cb2f4b4ba4b6eda 100644 --- a/apps/tutorials/gpu/01_GameOfLife_cuda.dox +++ b/apps/tutorials/gpu/01_GameOfLife_cuda.dox @@ -33,18 +33,8 @@ Data transfer from pinned memory is faster than from normal memory. The usage of mandatory, the data transfer functions work (slightly slower) also with normally allocated fields. \code -ScalarField * createField( IBlock* const block, StructuredBlockStorage* const storage ) -{ - return new ScalarField ( - storage->getNumberOfXCells( *block ), // number of cells in x direction per block - storage->getNumberOfYCells( *block ), // number of cells in y direction per block - storage->getNumberOfZCells( *block ), // number of cells in z direction per block - 1, // one ghost layer - real_t(0), // initial value - field::fzyx, // layout - make_shared<gpu::HostFieldAllocator<double> >() // allocator for host pinned memory - ); -} +auto hostFieldAllocator = make_shared< gpu::HostFieldAllocator<real_t> >(); +BlockDataID const cpuFieldID =field::addToStorage< ScalarField >(blocks, "CPU Field", real_c(0.0), field::fzyx, uint_c(1), hostFieldAllocator); \endcode Now we initialize the CPU field just like in the previous tutorial \ref tutorial_basics03 . 
diff --git a/python/lbmpy_walberla/__init__.py b/python/lbmpy_walberla/__init__.py index 15de37c8112e16a21476ba3e388adc843af92956..deb96e02ed3e5e5acfff016b7f185676788b7a76 100644 --- a/python/lbmpy_walberla/__init__.py +++ b/python/lbmpy_walberla/__init__.py @@ -1,8 +1,16 @@ from .boundary import generate_boundary, generate_alternating_lbm_boundary +from .boundary_collection import lbm_boundary_generator, generate_boundary_collection from .walberla_lbm_generation import RefinementScaling, generate_lattice_model +from .storage_specification import generate_lbm_storage_specification +from .sweep_collection import generate_lbm_sweep_collection from .packinfo import generate_lb_pack_info +from .packing_kernels import generate_packing_kernels from .alternating_sweeps import generate_alternating_lbm_sweep +from .walberla_lbm_package import generate_lbm_package __all__ = ['generate_lattice_model', 'generate_alternating_lbm_sweep', - 'RefinementScaling', 'generate_boundary', 'generate_alternating_lbm_boundary', - 'generate_lb_pack_info'] + 'generate_lbm_storage_specification', 'generate_lbm_sweep_collection', + 'RefinementScaling', 'lbm_boundary_generator', 'generate_boundary_collection', 'generate_boundary', + 'generate_alternating_lbm_boundary', + 'generate_lb_pack_info', 'generate_packing_kernels', + 'generate_lbm_package'] diff --git a/python/lbmpy_walberla/alternating_sweeps.py b/python/lbmpy_walberla/alternating_sweeps.py index dbcc1ab54e618101658a2c2262dac946f9d99805..444a2000adb65c3ad66bfc028f7bedcab4e60896 100644 --- a/python/lbmpy_walberla/alternating_sweeps.py +++ b/python/lbmpy_walberla/alternating_sweeps.py @@ -1,14 +1,17 @@ from dataclasses import replace +from typing import Set import numpy as np -from pystencils_walberla.codegen import generate_selective_sweep, config_from_context -from pystencils_walberla.kernel_selection import ( - AbstractInterfaceArgumentMapping, AbstractConditionNode, KernelCallNode) from pystencils import Target, TypedSymbol from 
lbmpy.creationfunctions import create_lb_ast from lbmpy.advanced_streaming import Timestep, is_inplace +from pystencils_walberla.sweep import generate_selective_sweep +from pystencils_walberla.kernel_selection import ( + AbstractInterfaceArgumentMapping, AbstractConditionNode, KernelCallNode) +from pystencils_walberla.utility import config_from_context + class EvenIntegerCondition(AbstractConditionNode): def __init__(self, parameter_name: str, @@ -54,7 +57,7 @@ class TimestepTrackerMapping(AbstractInterfaceArgumentMapping): return f"{self.tracker_symbol.name}->getCounter()" @property - def headers(self): + def headers(self) -> Set: return {'"lbm/inplace_streaming/TimestepTracker.h"'} diff --git a/python/lbmpy_walberla/boundary_collection.py b/python/lbmpy_walberla/boundary_collection.py new file mode 100644 index 0000000000000000000000000000000000000000..17bfa245a3212404c35dd06c420fcb19a55c3049 --- /dev/null +++ b/python/lbmpy_walberla/boundary_collection.py @@ -0,0 +1,147 @@ +from jinja2 import Environment, PackageLoader, StrictUndefined + +import pystencils_walberla.boundary +from lbmpy.boundaries.boundaryconditions import LbBoundary +from lbmpy.boundaries.boundaryhandling import create_lattice_boltzmann_boundary_kernel +from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env +from lbmpy.advanced_streaming import Timestep, is_inplace + +from pystencils_walberla.kernel_selection import KernelCallNode +from lbmpy_walberla.alternating_sweeps import EvenIntegerCondition, OddIntegerCondition, TimestepTrackerMapping +from lbmpy_walberla.additional_data_handler import default_additional_data_handler + +from pystencils import Target + +import numpy as np + + +def lbm_boundary_generator(class_name: str, flag_uid: str, boundary_object: LbBoundary, additional_data_handler=None): + def generation_function(ctx, lb_method, field_name='pdfs', + streaming_pattern='pull', after_collision=True, + namespace='lbm', + **create_kernel_params): + context = 
__generate_alternating_lbm_boundary(generation_context=ctx, + class_name=class_name, + boundary_object=boundary_object, + lb_method=lb_method, + field_name=field_name, + streaming_pattern=streaming_pattern, + after_collision=after_collision, + additional_data_handler=additional_data_handler, + namespace=namespace, + **create_kernel_params) + + return context + + return {'flag_id': flag_uid, 'generator': generation_function} + + +def generate_boundary_collection(generation_context, + class_name, + boundary_generators, + lb_method, + field_name='pdfs', + streaming_pattern='pull', + prev_timestep=Timestep.BOTH, + namespace='lbm', + **create_kernel_params): + + kernel_list = [] + includes = [] + boundary_classes = [] + flag_uids = [] + object_names = [] + targets = [] + + for boundary_generator in boundary_generators: + boundary_functor = boundary_generator['generator'] + context = boundary_functor(generation_context, lb_method, field_name, streaming_pattern, prev_timestep, + namespace, **create_kernel_params) + + kernel_list.append(context['kernel']) + includes.append(f"\"{context['class_name']}.h\"") + boundary_classes.append(f"{context['namespace']}::{context['class_name']}") + flag_uids.append(boundary_generator['flag_id']) + object_names.append(f"{context['class_name']}Object") + targets.append(f"{context['target']}") + + assert len(set(targets)) == 1 + target = targets[0] + + jinja_context = { + 'kernel_list': kernel_list, + 'class_name': class_name, + 'target': target, + 'namespace': namespace, + 'includes': includes, + 'boundary_classes': boundary_classes, + 'flag_uids': flag_uids, + 'object_names': object_names + } + + env = Environment(loader=PackageLoader('lbmpy_walberla'), undefined=StrictUndefined) + env.globals.update(zip=zip) + add_pystencils_filters_to_jinja_env(env) + + header = env.get_template("BoundaryCollection.tmpl.h").render(**jinja_context) + + generation_context.write_file(f"{class_name}.h", header) + + +# Internal +def 
__generate_alternating_lbm_boundary(generation_context, + class_name, + boundary_object, + lb_method, + field_name='pdfs', + streaming_pattern='pull', + after_collision=True, + additional_data_handler=None, + namespace='lbm', + **create_kernel_params): + if boundary_object.additional_data and additional_data_handler is None: + target = create_kernel_params.get('target', Target.CPU) + additional_data_handler = default_additional_data_handler(boundary_object, lb_method, field_name, target=target) + + timestep_param_name = 'timestep' + timestep_param_dtype = np.uint8 + + def boundary_creation_function(field, index_field, stencil, boundary_functor, target=Target.CPU, **kwargs): + pargs = (field, index_field, lb_method, boundary_functor) + kwargs = {'target': target, **kwargs} + ast_even = create_lattice_boltzmann_boundary_kernel(*pargs, + streaming_pattern=streaming_pattern, + prev_timestep=Timestep.EVEN, + **kwargs) + ast_even.function_name = 'even' + kernel_even = KernelCallNode(ast_even) + + if is_inplace(streaming_pattern): + ast_odd = create_lattice_boltzmann_boundary_kernel(*pargs, + streaming_pattern=streaming_pattern, + prev_timestep=Timestep.ODD, + **kwargs) + ast_odd.function_name = 'odd' + kernel_odd = KernelCallNode(ast_odd) + else: + kernel_odd = kernel_even + + if after_collision: + return EvenIntegerCondition(timestep_param_name, kernel_even, kernel_odd, timestep_param_dtype) + else: + return OddIntegerCondition(timestep_param_name, kernel_even, kernel_odd, timestep_param_dtype) + + timestep_advancement = {"field_name": field_name, "function": "getTimestep"} + + context = pystencils_walberla.boundary.generate_boundary(generation_context, + class_name, + boundary_object, + field_name=field_name, + neighbor_stencil=lb_method.stencil, + index_shape=[lb_method.stencil.Q], + kernel_creation_function=boundary_creation_function, + namespace=namespace, + additional_data_handler=additional_data_handler, + field_timestep=timestep_advancement, + 
**create_kernel_params) + return context diff --git a/python/lbmpy_walberla/function_generator.py b/python/lbmpy_walberla/function_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..8e3d552c2daa42bb12223150b65a7de0a59f8b3a --- /dev/null +++ b/python/lbmpy_walberla/function_generator.py @@ -0,0 +1,26 @@ +from pystencils_walberla.kernel_selection import KernelCallNode, KernelFamily, HighLevelInterfaceSpec + + +def kernel_family_function_generator(class_name: str, kernel_family: KernelFamily, + namespace: str = 'lbm', max_threads: int = None): + + return lambda: __function_generator(class_name, kernel_family, namespace, max_threads) + + +def __function_generator(class_name: str, kernel_family: KernelFamily, + namespace: str = 'lbm', max_threads: int = None): + + representative_field = {p.field_name for p in kernel_family.parameters if p.is_field_parameter} + representative_field = sorted(representative_field)[0] + + interface_spec = HighLevelInterfaceSpec(kernel_family.kernel_selection_parameters, ()) + + jinja_context = { + 'kernel': kernel_family, + 'namespace': namespace, + 'function_name': class_name, + 'field': representative_field, + 'interface_spec': interface_spec, + 'max_threads': max_threads + } + return jinja_context diff --git a/python/lbmpy_walberla/packinfo.py b/python/lbmpy_walberla/packinfo.py index 796ccfd9832b082610d5dc8b1065ddfa8450ca36..b53ef743f03c0f4c75128f0cce3d3fbaffbff593 100644 --- a/python/lbmpy_walberla/packinfo.py +++ b/python/lbmpy_walberla/packinfo.py @@ -6,7 +6,7 @@ from lbmpy.advanced_streaming.communication import _extend_dir from pystencils import Assignment, Field, Target from pystencils.stencil import inverse_direction -from pystencils_walberla.codegen import comm_directions, generate_pack_info +from pystencils_walberla.pack_info import _comm_directions, generate_pack_info def generate_lb_pack_info(generation_context, @@ -65,7 +65,7 @@ def generate_lb_pack_info(generation_context, if all(offset == 0 
for offset in fa.offsets): continue comm_direction = inverse_direction(fa.offsets) - for comm_dir in comm_directions(comm_direction): + for comm_dir in _comm_directions(comm_direction): common_spec[(comm_dir,)].add(fa.field.center(*fa.index)) full_stencil = LBStencil(Stencil.D3Q27) if stencil.D == 3 else LBStencil(Stencil.D2Q9) diff --git a/python/lbmpy_walberla/packing_kernels.py b/python/lbmpy_walberla/packing_kernels.py new file mode 100644 index 0000000000000000000000000000000000000000..985193f1434dd43d4294067a46ea7ba2ac01dbb3 --- /dev/null +++ b/python/lbmpy_walberla/packing_kernels.py @@ -0,0 +1,462 @@ +from dataclasses import replace +from itertools import product + +import numpy as np +import sympy as sp + +from jinja2 import Environment, PackageLoader, StrictUndefined + +from pystencils import Assignment, CreateKernelConfig, create_kernel, Field, FieldType, fields, Target +from pystencils.stencil import offset_to_direction_string +from pystencils.typing import TypedSymbol +from pystencils.stencil import inverse_direction +from pystencils.bit_masks import flag_cond + +from lbmpy.advanced_streaming import get_accessor, is_inplace, get_timesteps, Timestep +from lbmpy.advanced_streaming.communication import _extend_dir +from lbmpy.enums import Stencil +from lbmpy.stencils import LBStencil + +from pystencils_walberla.kernel_selection import KernelFamily, KernelCallNode, SwitchNode +from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env +from pystencils_walberla.utility import config_from_context + +from lbmpy_walberla.alternating_sweeps import EvenIntegerCondition +from lbmpy_walberla.utility import timestep_suffix + + +def generate_packing_kernels(generation_context, class_name: str, stencil: LBStencil, streaming_pattern: str = 'pull', + namespace='lbm', nonuniform: bool = False, + target: Target = Target.CPU, data_type=None, cpu_openmp: bool = False, + **create_kernel_params): + + config = config_from_context(generation_context, 
target=target, data_type=data_type, cpu_openmp=cpu_openmp, + **create_kernel_params) + + # Packing kernels should never be vectorised + config = replace(config, cpu_vectorize_info=None) + + default_dtype = config.data_type.default_factory() + is_float = True if issubclass(default_dtype.numpy_dtype.type, np.float32) else False + + cg = PackingKernelsCodegen(stencil, streaming_pattern, class_name, config) + + kernels = cg.create_uniform_kernel_families() + + if nonuniform: + kernels = cg.create_nonuniform_kernel_families(kernels_dict=kernels) + + jinja_context = { + 'class_name': class_name, + 'namespace': namespace, + 'nonuniform': nonuniform, + 'target': target.name.lower(), + 'dtype': "float" if is_float else "double", + 'is_gpu': target == Target.GPU, + 'kernels': kernels, + 'inplace': is_inplace(streaming_pattern), + 'direction_sizes': cg.get_direction_sizes(), + 'stencil_size': stencil.Q, + 'dimension': stencil.D, + 'src_field': cg.src_field, + 'dst_field': cg.dst_field + } + + if nonuniform: + jinja_context['mask_field'] = cg.mask_field + + template_name = "NonuniformPackingKernels" if nonuniform else "PackingKernels" + + env = Environment(loader=PackageLoader('lbmpy_walberla'), undefined=StrictUndefined) + add_pystencils_filters_to_jinja_env(env) + header = env.get_template(f"{template_name}.tmpl.h").render(**jinja_context) + source = env.get_template(f"{template_name}.tmpl.cpp").render(**jinja_context) + + source_extension = "cpp" if target == Target.CPU else "cu" + generation_context.write_file(f"{class_name}.h", header) + generation_context.write_file(f"{class_name}.{source_extension}", source) + + +# ------------------------------ INTERNAL ---------------------------------------------------------------------------- + +class PackingKernelsCodegen: + + def __init__(self, stencil, streaming_pattern, class_name, config: CreateKernelConfig): + self.stencil = stencil + self.dim = stencil.D + self.values_per_cell = stencil.Q + self.full_stencil = 
LBStencil(Stencil.D3Q27) if self.dim == 3 else LBStencil(Stencil.D2Q9) + self.streaming_pattern = streaming_pattern + self.inplace = is_inplace(streaming_pattern) + self.class_name = class_name + self.config = config + self.data_type = config.data_type['pdfs'].numpy_dtype + + self.src_field, self.dst_field = fields( + f'pdfs_src({self.values_per_cell}), pdfs_dst({self.values_per_cell}) :{self.data_type}[{self.dim}D]') + self.accessors = [get_accessor(streaming_pattern, t) for t in get_timesteps(streaming_pattern)] + self.mask_field = fields(f'mask : uint32 [{self.dim}D]') + + def create_uniform_kernel_families(self, kernels_dict=None): + kernels = dict() if kernels_dict is None else kernels_dict + + kernels['packAll'] = self.get_pack_all_kernel_family() + kernels['unpackAll'] = self.get_unpack_all_kernel_family() + kernels['localCopyAll'] = self.get_local_copy_all_kernel_family() + + kernels['packDirection'] = self.get_pack_direction_kernel_family() + kernels['unpackDirection'] = self.get_unpack_direction_kernel_family() + kernels['localCopyDirection'] = self.get_local_copy_direction_kernel_family() + return kernels + + def create_nonuniform_kernel_families(self, kernels_dict=None): + kernels = dict() if kernels_dict is None else kernels_dict + kernels['unpackRedistribute'] = self.get_unpack_redistribute_kernel_family() + kernels['packPartialCoalescence'] = self.get_pack_partial_coalescence_kernel_family() + kernels['zeroCoalescenceRegion'] = self.get_zero_coalescence_region_kernel_family() + kernels['unpackCoalescence'] = self.get_unpack_coalescence_kernel_family() + + return kernels + + # --------------------------- Pack / Unpack / LocalCopy All -------------------------------------------------------- + + def get_pack_all_ast(self, timestep): + config = replace(self.config, ghost_layers=0) + + buffer = self._buffer(self.values_per_cell) + src, _ = self._stream_out_accs(timestep) + assignments = [Assignment(buffer(i), src[i]) for i in range(self.values_per_cell)] 
+ ast = create_kernel(assignments, config=config) + ast.function_name = 'pack_ALL' + timestep_suffix(timestep) + return ast + + def get_pack_all_kernel_family(self): + if not self.inplace: + tree = KernelCallNode(self.get_pack_all_ast(Timestep.BOTH)) + else: + even_call = KernelCallNode(self.get_pack_all_ast(Timestep.EVEN)) + odd_call = KernelCallNode(self.get_pack_all_ast(Timestep.ODD)) + tree = EvenIntegerCondition('timestep', even_call, odd_call, parameter_dtype=np.uint8) + return KernelFamily(tree, self.class_name) + + def get_unpack_all_ast(self, timestep): + config = replace(self.config, ghost_layers=0) + + buffer = self._buffer(self.values_per_cell) + _, dst = self._stream_out_accs(timestep) + assignments = [Assignment(dst[i], buffer(i)) for i in range(self.values_per_cell)] + ast = create_kernel(assignments, config=config) + ast.function_name = 'unpack_ALL' + timestep_suffix(timestep) + return ast + + def get_unpack_all_kernel_family(self): + if not self.inplace: + tree = KernelCallNode(self.get_unpack_all_ast(Timestep.BOTH)) + else: + even_call = KernelCallNode(self.get_unpack_all_ast(Timestep.EVEN)) + odd_call = KernelCallNode(self.get_unpack_all_ast(Timestep.ODD)) + tree = EvenIntegerCondition('timestep', even_call, odd_call, parameter_dtype=np.uint8) + return KernelFamily(tree, self.class_name) + + def get_local_copy_all_ast(self, timestep): + config = replace(self.config, ghost_layers=0) + + src, dst = self._stream_out_accs(timestep) + assignments = [Assignment(dst[i], src[i]) for i in range(self.values_per_cell)] + ast = create_kernel(assignments, config=config) + ast.function_name = 'localCopy_ALL' + timestep_suffix(timestep) + return ast + + def get_local_copy_all_kernel_family(self): + if not self.inplace: + tree = KernelCallNode(self.get_local_copy_all_ast(Timestep.BOTH)) + else: + even_call = KernelCallNode(self.get_local_copy_all_ast(Timestep.EVEN)) + odd_call = KernelCallNode(self.get_local_copy_all_ast(Timestep.ODD)) + tree = 
EvenIntegerCondition('timestep', even_call, odd_call, parameter_dtype=np.uint8) + return KernelFamily(tree, self.class_name) + + # --------------------------- Pack / Unpack / LocalCopy Direction -------------------------------------------------- + + def get_pack_direction_ast(self, comm_dir, timestep): + config = replace(self.config, ghost_layers=0) + + assert not all(d == 0 for d in comm_dir) + dir_string = offset_to_direction_string(comm_dir) + streaming_dirs = self.get_streaming_dirs(comm_dir) + buffer = self._buffer(len(streaming_dirs)) + src, _ = self._stream_out_accs(timestep) + assignments = [] + dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs) + if len(dir_indices) == 0: + return None + for i, d in enumerate(dir_indices): + assignments.append(Assignment(buffer(i), src[d])) + ast = create_kernel(assignments, config=config) + ast.function_name = f'pack_{dir_string}' + timestep_suffix(timestep) + return ast + + def get_pack_direction_kernel_family(self): + return self._construct_directionwise_kernel_family(self.get_pack_direction_ast) + + def get_unpack_direction_ast(self, comm_dir, timestep): + config = replace(self.config, ghost_layers=0) + + assert not all(d == 0 for d in comm_dir) + dir_string = offset_to_direction_string(comm_dir) + streaming_dirs = self.get_streaming_dirs(inverse_direction(comm_dir)) + buffer = self._buffer(len(streaming_dirs)) + _, dst = self._stream_out_accs(timestep) + assignments = [] + dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs) + if len(dir_indices) == 0: + return None + for i, d in enumerate(dir_indices): + assignments.append(Assignment(dst[d], buffer(i))) + ast = create_kernel(assignments, config=config) + ast.function_name = f'unpack_{dir_string}' + timestep_suffix(timestep) + return ast + + def get_unpack_direction_kernel_family(self): + return self._construct_directionwise_kernel_family(self.get_unpack_direction_ast) + + def get_local_copy_direction_ast(self, comm_dir, timestep): + 
config = replace(self.config, ghost_layers=0) + + assert not all(d == 0 for d in comm_dir) + dir_string = offset_to_direction_string(comm_dir) + streaming_dirs = self.get_streaming_dirs(comm_dir) + src, dst = self._stream_out_accs(timestep) + assignments = [] + dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs) + if len(dir_indices) == 0: + return None + for direction in dir_indices: + assignments.append(Assignment(dst[direction], src[direction])) + ast = create_kernel(assignments, config=config) + ast.function_name = f'localCopy_{dir_string}' + timestep_suffix(timestep) + return ast + + def get_local_copy_direction_kernel_family(self): + return self._construct_directionwise_kernel_family(self.get_local_copy_direction_ast) + + # --------------------------- Pack / Unpack / LocalCopy Coarse to Fine --------------------------------------------- + + def get_unpack_redistribute_ast(self, comm_dir, timestep): + assert not all(d == 0 for d in comm_dir) + dir_string = offset_to_direction_string(comm_dir) + streaming_dirs = self.get_streaming_dirs(inverse_direction(comm_dir)) + dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs) + if len(dir_indices) == 0: + return None + buffer = self._buffer(self.values_per_cell) + _, dst = self._stream_out_accs(timestep) + orthos = self.orthogonal_principals(comm_dir) + sub_dirs = self.contained_principals(comm_dir) + orthogonal_combinations = self.linear_combinations(orthos) + subdir_combinations = self.linear_combinations_nozero(sub_dirs) + second_gl_dirs = [o + s for o, s in product(orthogonal_combinations, subdir_combinations)] + negative_dir_correction = np.array([(1 if d == -1 else 0) for d in comm_dir]) + assignments = [] + for offset in orthogonal_combinations: + o = offset + negative_dir_correction + for d in range(self.values_per_cell): + field_acc = dst[d].get_shifted(*o) + assignments.append(Assignment(field_acc, buffer(d))) + + for offset in second_gl_dirs: + o = offset + 
negative_dir_correction + for d in dir_indices: + field_acc = dst[d].get_shifted(*o) + assignments.append(Assignment(field_acc, buffer(d))) + + function_name = f'unpackRedistribute_{dir_string}' + timestep_suffix(timestep) + iteration_slice = tuple(slice(None, None, 2) for _ in range(self.dim)) + config = CreateKernelConfig(function_name=function_name, iteration_slice=iteration_slice, + data_type=self.data_type, ghost_layers=0, allow_double_writes=True, + cpu_openmp=self.config.cpu_openmp, target=self.config.target) + + return create_kernel(assignments, config=config) + + def get_unpack_redistribute_kernel_family(self): + return self._construct_directionwise_kernel_family(self.get_unpack_redistribute_ast) + + def get_local_copy_redistribute_ast(self, comm_dir, timestep): + # TODO + raise NotImplementedError() + + def get_local_copy_redistribute_kernel_family(self): + # TODO + raise NotImplementedError() + + # --------------------------- Pack / Unpack / LocalCopy Fine to Coarse --------------------------------------------- + + def get_pack_partial_coalescence_ast(self, comm_dir, timestep): + assert not all(d == 0 for d in comm_dir) + dir_string = offset_to_direction_string(comm_dir) + streaming_dirs = self.get_streaming_dirs(comm_dir) + dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs) + if len(dir_indices) == 0: + return None + buffer = self._buffer(self.values_per_cell) + src, _ = self._stream_in_accs(timestep.next()) + mask = self.mask_field + + offsets = list(product(*((0, 1) for _ in comm_dir))) + assignments = [] + for i, d in enumerate(dir_indices): + acc = 0 + for o in offsets: + acc += flag_cond(d, mask[o], src[d].get_shifted(*o)) + assignments.append(Assignment(buffer(i), acc)) + + iteration_slice = tuple(slice(None, None, 2) for _ in range(self.dim)) + config = replace(self.config, iteration_slice=iteration_slice, ghost_layers=0) + + ast = create_kernel(assignments, config=config) + ast.function_name = 
f'packPartialCoalescence_{dir_string}' + timestep_suffix(timestep) + return ast + + def get_pack_partial_coalescence_kernel_family(self): + return self._construct_directionwise_kernel_family(self.get_pack_partial_coalescence_ast) + + def get_unpack_coalescence_ast(self, comm_dir, timestep): + config = replace(self.config, ghost_layers=0) + + assert not all(d == 0 for d in comm_dir) + dir_string = offset_to_direction_string(comm_dir) + streaming_dirs = self.get_streaming_dirs(inverse_direction(comm_dir)) + dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs) + if len(dir_indices) == 0: + return None + buffer = self._buffer(self.values_per_cell) + _, dst = self._stream_in_accs(timestep.next()) + + coalescence_factor = sp.Rational(1, 2 ** self.dim) + + assignments = [] + for i, d in enumerate(dir_indices): + assignments.append(Assignment(dst[d], dst[d] + coalescence_factor * buffer(i))) + + ast = create_kernel(assignments, config=config) + ast.function_name = f'unpackCoalescence_{dir_string}' + timestep_suffix(timestep) + return ast + + def get_unpack_coalescence_kernel_family(self): + return self._construct_directionwise_kernel_family(self.get_unpack_coalescence_ast) + + def get_zero_coalescence_region_ast(self, comm_dir, timestep): + config = replace(self.config, ghost_layers=0) + + dir_string = offset_to_direction_string(comm_dir) + streaming_dirs = self.get_streaming_dirs(inverse_direction(comm_dir)) + dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs) + if len(dir_indices) == 0: + return None + _, dst = self._stream_in_accs(timestep.next()) + + assignments = [] + for i, d in enumerate(dir_indices): + assignments.append(Assignment(dst[d], 0.0)) + + ast = create_kernel(assignments, config=config) + ast.function_name = f'zeroCoalescenceRegion_{dir_string}' + timestep_suffix(timestep) + return ast + + def get_zero_coalescence_region_kernel_family(self): + return 
self._construct_directionwise_kernel_family(self.get_zero_coalescence_region_ast) + + # TODO + def get_local_copy_partial_coalescence_ast(self, comm_dir, timestep): + raise NotImplementedError() + + def get_local_copy_partial_coalescence_kernel_family(self): + raise NotImplementedError() + + # ------------------------------------------ Utility --------------------------------------------------------------- + + def get_streaming_dirs(self, comm_dir): + if all(d == 0 for d in comm_dir): + return set() + else: + return set(_extend_dir(comm_dir)) & set(self.stencil) + + def get_direction_sizes(self): + return [len(self.get_streaming_dirs(d)) for d in self.full_stencil] + + def principal(self, i): + e_i = np.zeros(self.dim, dtype=int) + e_i[i] = 1 + return e_i + + def principals(self): + """Returns the principal directions for the given dimension""" + return tuple(self.principal(i) for i in range(self.dim)) + + def orthogonal_principals(self, comm_dir): + """Returns the positive principal directions orthogonal to the comm_dir""" + return tuple(p for i, p in enumerate(self.principals()) if comm_dir[i] == 0) + + def contained_principals(self, comm_dir): + """Returns the (positive or negative) principal directions contained in comm_dir""" + vecs = [] + for i, d in enumerate(comm_dir): + if d != 0: + vecs.append(d * self.principal(i)) + return vecs + + def linear_combinations(self, vectors): + if not vectors: + return [np.zeros(self.dim, dtype=int)] + else: + rest = self.linear_combinations(vectors[1:]) + return rest + [vectors[0] + r for r in rest] + + def linear_combinations_nozero(self, vectors): + if len(vectors) == 1: + return [vectors[0]] + else: + rest = self.linear_combinations_nozero(vectors[1:]) + return rest + [vectors[0]] + [vectors[0] + r for r in rest] + + # --------------------------- Private Members ---------------------------------------------------------------------- + + def _construct_directionwise_kernel_family(self, create_ast_callback): + subtrees = [] 
+ direction_symbol = TypedSymbol('dir', dtype='stencil::Direction') + for t in get_timesteps(self.streaming_pattern): + cases_dict = dict() + for comm_dir in self.full_stencil: + if all(d == 0 for d in comm_dir): + continue + dir_string = offset_to_direction_string(comm_dir) + ast = create_ast_callback(comm_dir, t) + if ast is None: + continue + kernel_call = KernelCallNode(ast) + cases_dict[f"stencil::{dir_string}"] = kernel_call + subtrees.append(SwitchNode(direction_symbol, cases_dict)) + + if not self.inplace: + tree = subtrees[0] + else: + tree = EvenIntegerCondition('timestep', subtrees[Timestep.EVEN.idx], subtrees[Timestep.ODD.idx], + parameter_dtype=np.uint8) + return KernelFamily(tree, self.class_name) + + def _stream_out_accs(self, timestep): + accessor = self.accessors[timestep.idx] + src_stream_out_accs = accessor.write(self.src_field, self.stencil) + dst_stream_out_accs = accessor.write(self.dst_field, self.stencil) + return src_stream_out_accs, dst_stream_out_accs + + def _stream_in_accs(self, timestep): + accessor = self.accessors[timestep.idx] + src_stream_in_accs = accessor.read(self.src_field, self.stencil) + dst_stream_in_accs = accessor.read(self.dst_field, self.stencil) + return src_stream_in_accs, dst_stream_in_accs + + def _buffer(self, size): + return Field.create_generic('buffer', spatial_dimensions=1, field_type=FieldType.BUFFER, + dtype=self.data_type, + index_shape=(size,)) diff --git a/python/lbmpy_walberla/storage_specification.py b/python/lbmpy_walberla/storage_specification.py new file mode 100644 index 0000000000000000000000000000000000000000..c113604381be85e3895e3744285a528d4786f84e --- /dev/null +++ b/python/lbmpy_walberla/storage_specification.py @@ -0,0 +1,88 @@ +# import warnings + +from dataclasses import replace +from jinja2 import Environment, PackageLoader, StrictUndefined +import numpy as np + +from pystencils import Target + +from lbmpy import LBMConfig +from lbmpy.advanced_streaming import is_inplace +from lbmpy.methods 
import AbstractLbMethod + +from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env +from pystencils_walberla.utility import config_from_context +from lbmpy_walberla.packing_kernels import PackingKernelsCodegen + + +def generate_lbm_storage_specification(generation_context, class_name: str, + method: AbstractLbMethod, lbm_config: LBMConfig, nonuniform: bool = False, + target: Target = Target.CPU, data_type=None, cpu_openmp: bool = False, + **create_kernel_params): + namespace = "lbm" + stencil = method.stencil + stencil_name = stencil.name + streaming_pattern = lbm_config.streaming_pattern + + config = config_from_context(generation_context, target=target, data_type=data_type, cpu_openmp=cpu_openmp, + **create_kernel_params) + + # Packing kernels should never be vectorised + config = replace(config, cpu_vectorize_info=None) + + default_dtype = config.data_type.default_factory() + is_float = True if issubclass(default_dtype.numpy_dtype.type, np.float32) else False + + cg = PackingKernelsCodegen(stencil, streaming_pattern, class_name, config) + kernels = cg.create_uniform_kernel_families() + + if nonuniform: + kernels = cg.create_nonuniform_kernel_families(kernels_dict=kernels) + + values_per_cell = len(stencil) + dimension = len(stencil[0]) + + # Pure storage specification + if not stencil_name: + raise ValueError("lb_method uses a stencil that is not supported in waLBerla") + + communication_stencil_name = stencil_name if stencil_name != "D3Q15" else "D3Q27" + + cqc = method.conserved_quantity_computation + equilibrium = method.equilibrium_distribution + + jinja_context = { + 'class_name': class_name, + 'namespace': namespace, + 'stencil_name': stencil_name, + 'communication_stencil_name': communication_stencil_name, + 'compressible': cqc.compressible, + 'equilibriumAccuracyOrder': equilibrium.order, + 'inplace': is_inplace(streaming_pattern), + 'zero_centered': cqc.zero_centered_pdfs, + 'eq_deviation_only': equilibrium.deviation_only, + + 
'nonuniform': nonuniform, + 'target': target.name.lower(), + 'dtype': "float" if is_float else "double", + 'is_gpu': target == Target.GPU, + 'kernels': kernels, + 'direction_sizes': cg.get_direction_sizes(), + 'stencil_size': stencil.Q, + 'dimension': stencil.D, + 'src_field': cg.src_field, + 'dst_field': cg.dst_field + + } + if nonuniform: + jinja_context['mask_field'] = cg.mask_field + + env = Environment(loader=PackageLoader('lbmpy_walberla'), undefined=StrictUndefined) + add_pystencils_filters_to_jinja_env(env) + + header = env.get_template('LbmStorageSpecification.tmpl.h').render(**jinja_context) + source = env.get_template('LbmStorageSpecification.tmpl.cpp').render(**jinja_context) + + source_extension = "cpp" if target == Target.CPU else "cu" + generation_context.write_file(f"{class_name}.h", header) + generation_context.write_file(f"{class_name}.{source_extension}", source) diff --git a/python/lbmpy_walberla/sweep_collection.py b/python/lbmpy_walberla/sweep_collection.py new file mode 100644 index 0000000000000000000000000000000000000000..8edd0779b328de768cba4a3acb5f04bdb6bb3acf --- /dev/null +++ b/python/lbmpy_walberla/sweep_collection.py @@ -0,0 +1,230 @@ +from dataclasses import replace +from typing import Dict + +import sympy as sp +import numpy as np + +from pystencils import Target, create_kernel +from pystencils.config import CreateKernelConfig +from pystencils.field import Field + +from lbmpy.advanced_streaming import is_inplace, get_accessor, Timestep +from lbmpy.creationfunctions import LbmCollisionRule +from lbmpy.fieldaccess import CollideOnlyInplaceAccessor +from lbmpy.macroscopic_value_kernels import macroscopic_values_setter, macroscopic_values_getter +from lbmpy.updatekernels import create_lbm_kernel, create_stream_only_kernel + +from pystencils_walberla.kernel_selection import KernelCallNode, KernelFamily +from pystencils_walberla.utility import config_from_context +from pystencils_walberla import generate_sweep_collection + +from 
.alternating_sweeps import EvenIntegerCondition +from .function_generator import kernel_family_function_generator + + +def generate_lbm_sweep_collection(ctx, class_name: str, collision_rule: LbmCollisionRule, + streaming_pattern='pull', + field_layout='fzyx', refinement_scaling=None, + macroscopic_fields: Dict[str, Field] = None, + target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None, + max_threads=None, + **create_kernel_params): + config = config_from_context(ctx, target=target, data_type=data_type, + cpu_openmp=cpu_openmp, cpu_vectorize_info=cpu_vectorize_info, **create_kernel_params) + + # usually a numpy layout is chosen by default i.e. xyzf - which is bad for waLBerla where at least the spatial + # coordinates should be ordered in reverse direction i.e. zyx + lb_method = collision_rule.method + + q = lb_method.stencil.Q + dim = lb_method.stencil.D + + if field_layout == 'fzyx': + config.cpu_vectorize_info['assume_inner_stride_one'] = True + elif field_layout == 'zyxf': + config.cpu_vectorize_info['assume_inner_stride_one'] = False + + src_field = Field.create_generic('pdfs', dim, config.data_type['pdfs'].numpy_dtype, + index_dimensions=1, layout=field_layout, index_shape=(q,)) + if is_inplace(streaming_pattern): + dst_field = src_field + else: + dst_field = Field.create_generic('pdfs_tmp', dim, config.data_type['pdfs_tmp'].numpy_dtype, + index_dimensions=1, layout=field_layout, + index_shape=(q,)) + + config = replace(config, ghost_layers=0) + + function_generators = [] + + def family(name): + return lbm_kernel_family(class_name, name, collision_rule, streaming_pattern, src_field, dst_field, config) + + def generator(name, kernel_family): + return kernel_family_function_generator(name, kernel_family, namespace='lbm', max_threads=max_threads) + + function_generators.append(generator('streamCollide', family("streamCollide"))) + function_generators.append(generator('collide', family("collide"))) + 
function_generators.append(generator('stream', family("stream"))) + function_generators.append(generator('streamOnlyNoAdvancement', family("streamOnlyNoAdvancement"))) + + setter_family = get_setter_family(class_name, lb_method, src_field, streaming_pattern, macroscopic_fields, config) + setter_generator = kernel_family_function_generator('initialise', setter_family, + namespace='lbm', max_threads=max_threads) + function_generators.append(setter_generator) + + getter_family = get_getter_family(class_name, lb_method, src_field, streaming_pattern, macroscopic_fields, config) + getter_generator = kernel_family_function_generator('calculateMacroscopicParameters', getter_family, + namespace='lbm', max_threads=max_threads) + function_generators.append(getter_generator) + + generate_sweep_collection(ctx, class_name, function_generators, refinement_scaling) + + +class RefinementScaling: + def __init__(self): + self.scaling_info = [] + + def add_standard_relaxation_rate_scaling(self, viscosity_relaxation_rate): + self.add_scaling(viscosity_relaxation_rate) + + def add_scaling(self, parameter): + if isinstance(parameter, sp.Symbol): + self.scaling_info.append(parameter.name) + else: + raise ValueError("Only pure symbols allowed") + + +def lbm_kernel_family(class_name, kernel_name, + collision_rule, streaming_pattern, src_field, dst_field, config: CreateKernelConfig): + + if kernel_name == "streamCollide": + def lbm_kernel(field_accessor, lb_stencil): + return create_lbm_kernel(collision_rule, src_field, dst_field, field_accessor) + advance_timestep = {"field_name": src_field.name, "function": "advanceTimestep"} + temporary_fields = ['pdfs_tmp'] + field_swaps = [('pdfs', 'pdfs_tmp')] + elif kernel_name == "collide": + def lbm_kernel(field_accessor, lb_stencil): + return create_lbm_kernel(collision_rule, src_field, dst_field, CollideOnlyInplaceAccessor()) + advance_timestep = {"field_name": src_field.name, "function": "advanceTimestep"} + temporary_fields = () + field_swaps = 
() + elif kernel_name == "stream": + def lbm_kernel(field_accessor, lb_stencil): + return create_stream_only_kernel(lb_stencil, src_field, dst_field, field_accessor) + advance_timestep = {"field_name": src_field.name, "function": "advanceTimestep"} + temporary_fields = ['pdfs_tmp'] + field_swaps = [('pdfs', 'pdfs_tmp')] + elif kernel_name == "streamOnlyNoAdvancement": + def lbm_kernel(field_accessor, lb_stencil): + return create_stream_only_kernel(lb_stencil, src_field, dst_field, field_accessor) + advance_timestep = {"field_name": src_field.name, "function": "getTimestepPlusOne"} + temporary_fields = ['pdfs_tmp'] + field_swaps = () + else: + raise ValueError(f"kernel name: {kernel_name} is not valid") + + lb_method = collision_rule.method + stencil = lb_method.stencil + + if is_inplace(streaming_pattern): + nodes = list() + for timestep in [Timestep.EVEN, Timestep.ODD]: + accessor = get_accessor(streaming_pattern, timestep) + timestep_suffix = str(timestep) + + update_rule = lbm_kernel(accessor, stencil) + ast = create_kernel(update_rule, config=config) + ast.function_name = 'kernel_' + kernel_name + timestep_suffix + ast.assumed_inner_stride_one = config.cpu_vectorize_info['assume_inner_stride_one'] + nodes.append(KernelCallNode(ast)) + + tree = EvenIntegerCondition('timestep', nodes[0], nodes[1], parameter_dtype=np.uint8) + family = KernelFamily(tree, class_name, field_timestep=advance_timestep) + else: + timestep = Timestep.BOTH + accessor = get_accessor(streaming_pattern, timestep) + + update_rule = lbm_kernel(accessor, stencil) + ast = create_kernel(update_rule, config=config) + ast.function_name = 'kernel_' + kernel_name + ast.assumed_inner_stride_one = config.cpu_vectorize_info['assume_inner_stride_one'] + node = KernelCallNode(ast) + family = KernelFamily(node, class_name, temporary_fields=temporary_fields, field_swaps=field_swaps) + + return family + + +def get_setter_family(class_name, lb_method, pdfs, streaming_pattern, macroscopic_fields, config: 
CreateKernelConfig): + dim = lb_method.stencil.D + density = macroscopic_fields.get('density', 1.0) + velocity = macroscopic_fields.get('velocity', [0.0] * dim) + + get_timestep = {"field_name": pdfs.name, "function": "getTimestep"} + temporary_fields = () + field_swaps = () + + if is_inplace(streaming_pattern): + nodes = list() + for timestep in [Timestep.EVEN, Timestep.ODD]: + timestep_suffix = str(timestep) + setter = macroscopic_values_setter(lb_method, + density=density, velocity=velocity, pdfs=pdfs, + streaming_pattern=streaming_pattern, previous_timestep=timestep) + + setter_ast = create_kernel(setter, config=config) + setter_ast.function_name = 'kernel_initialise' + timestep_suffix + nodes.append(KernelCallNode(setter_ast)) + tree = EvenIntegerCondition('timestep', nodes[0], nodes[1], parameter_dtype=np.uint8) + family = KernelFamily(tree, class_name, field_timestep=get_timestep) + else: + timestep = Timestep.BOTH + setter = macroscopic_values_setter(lb_method, + density=density, velocity=velocity, pdfs=pdfs, + streaming_pattern=streaming_pattern, previous_timestep=timestep) + + setter_ast = create_kernel(setter, config=config) + setter_ast.function_name = 'kernel_initialise' + node = KernelCallNode(setter_ast) + family = KernelFamily(node, class_name, temporary_fields=temporary_fields, field_swaps=field_swaps) + + return family + + +def get_getter_family(class_name, lb_method, pdfs, streaming_pattern, macroscopic_fields, config: CreateKernelConfig): + density = macroscopic_fields.get('density', None) + velocity = macroscopic_fields.get('velocity', None) + + if density is None and velocity is None: + return None + + get_timestep = {"field_name": pdfs.name, "function": "getTimestep"} + temporary_fields = () + field_swaps = () + + if is_inplace(streaming_pattern): + nodes = list() + for timestep in [Timestep.EVEN, Timestep.ODD]: + timestep_suffix = str(timestep) + getter = macroscopic_values_getter(lb_method, + density=density, velocity=velocity, pdfs=pdfs, + 
streaming_pattern=streaming_pattern, previous_timestep=timestep) + + getter_ast = create_kernel(getter, config=config) + getter_ast.function_name = 'kernel_getter' + timestep_suffix + nodes.append(KernelCallNode(getter_ast)) + tree = EvenIntegerCondition('timestep', nodes[0], nodes[1], parameter_dtype=np.uint8) + family = KernelFamily(tree, class_name, field_timestep=get_timestep) + else: + timestep = Timestep.BOTH + getter = macroscopic_values_getter(lb_method, + density=density, velocity=velocity, pdfs=pdfs, + streaming_pattern=streaming_pattern, previous_timestep=timestep) + + getter_ast = create_kernel(getter, config=config) + getter_ast.function_name = 'kernel_getter' + node = KernelCallNode(getter_ast) + family = KernelFamily(node, class_name, temporary_fields=temporary_fields, field_swaps=field_swaps) + + return family diff --git a/python/lbmpy_walberla/templates/BoundaryCollection.tmpl.h b/python/lbmpy_walberla/templates/BoundaryCollection.tmpl.h new file mode 100644 index 0000000000000000000000000000000000000000..5f49137846ba99d60888e7353ac4ff195ade2a84 --- /dev/null +++ b/python/lbmpy_walberla/templates/BoundaryCollection.tmpl.h @@ -0,0 +1,108 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. 
+// +//! \\file {{class_name}}.h +//! \\author lbmpy +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" +#include "domain_decomposition/IBlock.h" + +{% if target is equalto 'gpu' -%} +#include "gpu/GPUWrapper.h" +{%- endif %} + +{% for include in includes -%} +#include {{include}} +{% endfor %} + + +namespace walberla{ +namespace {{namespace}} { + +template <typename FlagField_T> +class {{class_name}} +{ + public: + enum Type { ALL = 0, INNER = 1, OUTER = 2 }; + + + {{class_name}}( {{- ["const shared_ptr<StructuredBlockForest> & blocks", "BlockDataID flagID_", "BlockDataID pdfsID_", "FlagUID domainUID_", [kernel_list|generate_constructor_parameters(['indexVector', 'indexVectorSize', 'pdfs'])]] | type_identifier_list -}} ) + : blocks_(blocks), flagID(flagID_), pdfsID(pdfsID_), domainUID(domainUID_) + { + {% for object_name, boundary_class, kernel in zip(object_names, boundary_classes, kernel_list) -%} + + {{object_name}} = std::make_shared< {{boundary_class}} >({{- ["blocks", "pdfsID", [kernel|generate_function_collection_call(['indexVector', 'indexVectorSize', 'pdfs', 'timestep', 'gpuStream'])]] | type_identifier_list -}}); + {% endfor %} + + {% for object_name, flag_uid in zip(object_names, flag_uids) -%} + {{object_name}}->fillFromFlagField<FlagField_T>(blocks, flagID, walberla::FlagUID("{{flag_uid}}"), domainUID); + {% endfor %} + } + + void run ({{- ["IBlock * block", ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}) + { + {% for object_name in object_names -%} + {{object_name}}->run({{- ["block", ["stream"] if target == 'gpu' else []] | type_identifier_list -}}); + {% endfor %} + } + + void inner ({{- ["IBlock * block", ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}) + { + {% for object_name in object_names -%} + {{object_name}}->inner({{- ["block", ["stream"] if 
target == 'gpu' else []] | type_identifier_list -}}); + {% endfor %} + } + + void outer ({{- ["IBlock * block", ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}) + { + {% for object_name in object_names -%} + {{object_name}}->outer({{- ["block", ["stream"] if target == 'gpu' else []] | type_identifier_list -}}); + {% endfor %} + } + + void operator() ({{- ["IBlock * block", ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}) + { + run({{- ["block", ["stream"] if target == 'gpu' else []] | type_identifier_list -}}); + } + + std::function<void (IBlock *)> getSweep({{- ["Type type = Type::ALL", ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}) + { + switch (type) + { + case Type::INNER: + return [{{- ["this", ["stream"] if target == 'gpu' else []] | type_identifier_list -}}](IBlock* block) { this->inner({{- ["block", ["stream"] if target == 'gpu' else []] | type_identifier_list -}}); }; + case Type::OUTER: + return [{{- ["this", ["stream"] if target == 'gpu' else []] | type_identifier_list -}}](IBlock* block) { this->outer({{- ["block", ["stream"] if target == 'gpu' else []] | type_identifier_list -}}); }; + default: + return [{{- ["this", ["stream"] if target == 'gpu' else []] | type_identifier_list -}}](IBlock* block) { this->run({{- ["block", ["stream"] if target == 'gpu' else []] | type_identifier_list -}}); }; + } + } + + weak_ptr< StructuredBlockStorage > blocks_; + BlockDataID flagID; + BlockDataID pdfsID; + walberla::FlagUID domainUID; + + {% for object_name, boundary_class in zip(object_names, boundary_classes) -%} + shared_ptr<{{boundary_class}}> {{object_name}}; + {% endfor %} +}; + +} +} + diff --git a/python/lbmpy_walberla/templates/LatticeModel.tmpl.cpp b/python/lbmpy_walberla/templates/LatticeModel.tmpl.cpp index dd50337e1714c8abcf49c35b0d77e2e23d4d9c29..17d5bdeb4b5e7443958f9619d08848ae817b9a89 100644 --- 
a/python/lbmpy_walberla/templates/LatticeModel.tmpl.cpp +++ b/python/lbmpy_walberla/templates/LatticeModel.tmpl.cpp @@ -13,7 +13,8 @@ // You should have received a copy of the GNU General Public License along // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // -//! \\author Martin Bauer <martin.bauer@fau.de> +//! \\file {{class_name}}.cpp +//! \\author lbmpy //====================================================================================================================== #include <cmath> diff --git a/python/lbmpy_walberla/templates/LatticeModel.tmpl.h b/python/lbmpy_walberla/templates/LatticeModel.tmpl.h index 677be50025122939d50b10eb7d1381afe519eb4e..5631eec3250d2c1e99d9a59e268e5e1794520757 100644 --- a/python/lbmpy_walberla/templates/LatticeModel.tmpl.h +++ b/python/lbmpy_walberla/templates/LatticeModel.tmpl.h @@ -13,8 +13,8 @@ // You should have received a copy of the GNU General Public License along // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // -//! \\author Martin Bauer <martin.bauer@fau.de> -// +//! \\file {{class_name}}.h +//! \\author lbmpy //====================================================================================================================== #pragma once diff --git a/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.cpp b/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..91c7d7d960a78552628d3d8568dd611f13c14a2d --- /dev/null +++ b/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.cpp @@ -0,0 +1,180 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file {{class_name}}.cpp +//! \\author lbmpy +//====================================================================================================================== + +#include "{{class_name}}.h" + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wfloat-equal" +# pragma GCC diagnostic ignored "-Wshadow" +# pragma GCC diagnostic ignored "-Wconversion" +# pragma GCC diagnostic ignored "-Wunused-variable" +#endif + +/************************************************************************************* + * Kernel Definitions +*************************************************************************************/ +{{ kernels['packAll'] | generate_definitions }} +{{ kernels['unpackAll'] | generate_definitions }} +{{ kernels['localCopyAll'] | generate_definitions }} + +{{ kernels['packDirection'] | generate_definitions }} +{{ kernels['unpackDirection'] | generate_definitions }} +{{ kernels['localCopyDirection'] | generate_definitions }} + +{% if nonuniform -%} +{{ kernels['unpackRedistribute'] | generate_definitions }} +{{ kernels['packPartialCoalescence'] | generate_definitions }} +{{ kernels['zeroCoalescenceRegion'] | generate_definitions }} +{{ kernels['unpackCoalescence'] | generate_definitions }} +{%- 
endif %} + +/************************************************************************************* + * Kernel Wrappers +*************************************************************************************/ + +namespace walberla { +namespace {{namespace}} { + + void {{class_name}}::PackKernels::packAll( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & ci", + "unsigned char * outBuffer", kernels['packAll'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} + ) const + { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(outBuffer); + {{kernels['packAll'] | generate_call(cell_interval='ci', stream='stream') | indent(6) }} + } + + + void {{class_name}}::PackKernels::unpackAll( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackAll'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} + ) const + { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(inBuffer); + {{kernels['unpackAll'] | generate_call(cell_interval='ci', stream='stream') | indent(6) }} + } + + + void {{class_name}}::PackKernels::localCopyAll( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval", + "PdfField_T * " + dst_field.name, "CellInterval & dstInterval", + kernels['localCopyAll'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} + ) const + { + WALBERLA_ASSERT_EQUAL(srcInterval.xSize(), dstInterval.xSize()) + WALBERLA_ASSERT_EQUAL(srcInterval.ySize(), dstInterval.ySize()) + WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize()) + + {{kernels['localCopyAll'] + | generate_call(cell_interval={src_field : 'srcInterval', dst_field : 'dstInterval'}, stream='stream') + | indent(6) }} + } + + void {{class_name}}::PackKernels::packDirection( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & ci", + "unsigned char * outBuffer", 
kernels['packDirection'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} + ) const + { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(outBuffer); + {{kernels['packDirection'] | generate_call(cell_interval='ci', stream='stream') | indent(6) }} + } + + void {{class_name}}::PackKernels::unpackDirection( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackDirection'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} + ) const + { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(inBuffer); + {{kernels['unpackDirection'] | generate_call(cell_interval='ci', stream='stream') | indent(6) }} + } + + void {{class_name}}::PackKernels::localCopyDirection( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval", + "PdfField_T * " + dst_field.name, "CellInterval & dstInterval", + kernels['localCopyDirection'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} + ) const + { + WALBERLA_ASSERT_EQUAL(srcInterval.xSize(), dstInterval.xSize()) + WALBERLA_ASSERT_EQUAL(srcInterval.ySize(), dstInterval.ySize()) + WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize()) + + {{kernels['localCopyDirection'] + | generate_call(cell_interval={src_field : 'srcInterval', dst_field : 'dstInterval'}, stream='stream') + | indent(6) }} + } + + {% if nonuniform -%} + void {{class_name}}::PackKernels::unpackRedistribute( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackRedistribute'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} + ) const + { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(inBuffer); + {{kernels['unpackRedistribute'] | generate_call(cell_interval='ci', stream='stream') | indent(6) }} + } + + void 
{{class_name}}::PackKernels::packPartialCoalescence( + {{- [ "PdfField_T * " + src_field.name, "MaskField_T * " + mask_field.name, "CellInterval & ci", + "unsigned char * outBuffer", kernels['packPartialCoalescence'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} + ) const + { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(outBuffer); + {{kernels['packPartialCoalescence'] | generate_call(cell_interval='ci', stream='stream') | indent(6) }} + } + + void {{class_name}}::PackKernels::zeroCoalescenceRegion( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + kernels['zeroCoalescenceRegion'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} + ) const + { + {{kernels['zeroCoalescenceRegion'] | generate_call(cell_interval='ci', stream='stream') | indent(6) }} + } + + void {{class_name}}::PackKernels::unpackCoalescence( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackCoalescence'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} + ) const + { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(inBuffer); + {{kernels['unpackCoalescence'] | generate_call(cell_interval='ci', stream='stream') | indent(6) }} + } + {%- endif %} +} // namespace {{namespace}} +} // namespace walberla \ No newline at end of file diff --git a/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.h b/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.h new file mode 100644 index 0000000000000000000000000000000000000000..866119390412a115f40283c7829cbdfc01e5baad --- /dev/null +++ b/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.h @@ -0,0 +1,255 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file {{class_name}}.h +//! \\author lbmpy +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" +#include "core/cell/CellInterval.h" +#include "core/mpi/SendBuffer.h" +#include "core/mpi/RecvBuffer.h" + +#include "domain_decomposition/IBlock.h" +#include "field/GhostLayerField.h" + +#include "stencil/{{stencil_name}}.h" +#include "stencil/Directions.h" + +{% if target is equalto 'cpu' -%} +#define FUNC_PREFIX +{%- elif target is equalto 'gpu' -%} +#define FUNC_PREFIX __global__ +#include "gpu/GPUWrapper.h" +#include "gpu/GPUField.h" +{%- endif %} + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +#if defined WALBERLA_CXX_COMPILER_IS_GNU || defined WALBERLA_CXX_COMPILER_IS_CLANG +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-parameter" +#endif + +namespace walberla +{ +namespace {{namespace}}{ + +class {{class_name}} +{ + public: + // Used lattice stencil + using Stencil = stencil::{{stencil_name}}; + // Lattice stencil used for the communication (should be used to define which block directions need to be communicated) + using 
CommunicationStencil = stencil::{{communication_stencil_name}}; + + // If false used correction: Lattice Boltzmann Model for the Incompressible Navier–Stokes Equation, He 1997 + static const bool compressible = {% if compressible %}true{% else %}false{% endif %}; + // Cut off for the lattice Boltzmann equilibrium + static const int equilibriumAccuracyOrder = {{equilibriumAccuracyOrder}}; + + // If streaming pattern is inplace (esotwist, aa, ...) or not (pull, push) + static const bool inplace = {% if inplace -%} true {%- else -%} false {%- endif -%}; + + // If true the background deviation (rho_0 = 1) is subtracted for the collision step. + static const bool zeroCenteredPDFs = {% if zero_centered -%} true {%- else -%} false {%- endif -%}; + // If true the equilibrium is computed in regard to "delta_rho" and not the actual density "rho" + static const bool deviationOnlyEquilibrium = {% if eq_deviation_only -%} true {%- else -%} false {%- endif -%}; + + // Compute kernels to pack and unpack MPI buffers + class PackKernels { + + public: + using PdfField_T = {{src_field | field_type(is_gpu=is_gpu)}}; + using value_type = typename PdfField_T::value_type; + + {% if nonuniform -%} + {% if target is equalto 'cpu' -%} + using MaskField_T = GhostLayerField< uint32_t, 1 >; + {%- elif target is equalto 'gpu' -%} + using MaskField_T = gpu::GPUField< uint32_t >; + {%- endif %} + {%- endif %} + + static const bool inplace = {% if inplace -%} true {%- else -%} false {%- endif -%}; + + /** + * Packs all pdfs from the given cell interval to the send buffer. + * */ + void packAll( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & ci", + "unsigned char * outBuffer", kernels['packAll'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Unpacks all pdfs from the send buffer to the given cell interval. 
+ * */ + void unpackAll( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackAll'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Copies data between two blocks on the same process. + * All pdfs from the sending interval are copied onto the receiving interval. + * */ + void localCopyAll( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval", + "PdfField_T * " + dst_field.name, "CellInterval & dstInterval", + kernels['localCopyAll'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Packs only those populations streaming in directions aligned with the sending direction dir from the given cell interval. + * For example, in 2D, if dir == N, the pdfs streaming in directions NW, N, NE are packed. + * */ + void packDirection( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & ci", + "unsigned char * outBuffer", kernels['packDirection'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Unpacks only those populations streaming in directions aligned with the sending direction dir to the given cell interval. + * For example, in 2D, if dir == N, the pdfs streaming in directions NW, N, NE are unpacked. + * */ + void unpackDirection( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackDirection'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** Copies data between two blocks on the same process. + * PDFs streaming aligned with the direction dir are copied from the sending interval onto the receiving interval. 
+ * */ + void localCopyDirection( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval", + "PdfField_T * " + dst_field.name, "CellInterval & dstInterval", + kernels['localCopyDirection'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Returns the number of bytes that will be packed from / unpacked to the cell interval + * when using packDirection / unpackDirection + * @param ci The cell interval + * @param dir The communication direction + * @return The required size of the buffer, in bytes + * */ + uint_t size (CellInterval & ci, stencil::Direction dir) const { + return ci.numCells() * sizes[dir] * sizeof(value_type); + } + + /** + * Returns the number of bytes that will be packed from / unpacked to the cell interval + * when using packAll / unpackAll + * @param ci The cell interval + * @return The required size of the buffer, in bytes + * */ + uint_t size (CellInterval & ci) const { + return ci.numCells() * {{stencil_size}} * sizeof(value_type); + } + + {% if nonuniform -%} + + /** + * Unpacks and uniformly redistributes populations coming from a coarse block onto the fine grid. 
+ * */ + void unpackRedistribute( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackRedistribute'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Partially coalesces and packs populations streaming from a fine block into a coarse block + * */ + void packPartialCoalescence( + {{- [ "PdfField_T * " + src_field.name, "MaskField_T * " + mask_field.name, "CellInterval & ci", + "unsigned char * outBuffer", kernels['packPartialCoalescence'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Prepares a coarse block for coalescence by setting every population that must be coalesced from fine blocks to zero. + * */ + void zeroCoalescenceRegion( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + kernels['zeroCoalescenceRegion'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Unpacks and coalesces populations coming from a fine block onto the coarse grid + * */ + void unpackCoalescence( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackCoalescence'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Returns the number of bytes that will be unpacked to the cell interval + * when using unpackRedistribute. This is 2^{-d} of the data that would be + * unpacked during same-level communication. + * @param ci The cell interval + * @return The required size of the buffer, in bytes + * */ + uint_t redistributeSize(CellInterval & ci) const { + return size(ci) >> {{dimension}}; + } + + /** + * Returns the number of bytes that will be packed from the cell interval + * when using packPartialCoalescence.
+ * @param ci The cell interval + * @param dir The communication direction + * @return The required size of the buffer, in bytes + * */ + uint_t partialCoalescenceSize(CellInterval & ci, stencil::Direction dir) const { + return size(ci, dir) >> {{dimension}}; + } + + {%- endif %} + + private: + const uint_t sizes[{{direction_sizes|length}}] { {{ direction_sizes | join(', ') }} }; + }; + +}; + +}} //{{namespace}}/walberla \ No newline at end of file diff --git a/python/lbmpy_walberla/templates/NonuniformPackingKernels.tmpl.cpp b/python/lbmpy_walberla/templates/NonuniformPackingKernels.tmpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..558dc1faa53a56c281b4cf106f7eb643d0ae9dae --- /dev/null +++ b/python/lbmpy_walberla/templates/NonuniformPackingKernels.tmpl.cpp @@ -0,0 +1,54 @@ +{% extends "PackingKernels.tmpl.cpp" %} + +{% block AdditionalKernelDefinitions %} +{{ kernels['unpackRedistribute'] | generate_definitions }} +{{ kernels['packPartialCoalescence'] | generate_definitions }} +{{ kernels['zeroCoalescenceRegion'] | generate_definitions }} +{{ kernels['unpackCoalescence'] | generate_definitions }} +{% endblock %} + +{% block AdditionalDefinitions %} + +void {{class_name}}::unpackRedistribute( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackRedistribute'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} +) const { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(inBuffer); + + {{kernels['unpackRedistribute'] | generate_call(cell_interval='ci', stream='stream') | indent(3) }} +} + +void {{class_name}}::packPartialCoalescence( + {{- [ "PdfField_T * " + src_field.name, "MaskField_T * " + mask_field.name, "CellInterval & ci", + "unsigned char * outBuffer", kernels['packPartialCoalescence'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} +) const { + {{dtype}} * buffer =
reinterpret_cast<{{dtype}}*>(outBuffer); + + {{kernels['packPartialCoalescence'] | generate_call(cell_interval='ci', stream='stream') | indent(3) }} +} + +void {{class_name}}::zeroCoalescenceRegion( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + kernels['zeroCoalescenceRegion'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} +) const { + {{kernels['zeroCoalescenceRegion'] | generate_call(cell_interval='ci', stream='stream') | indent(3) }} +} + +void {{class_name}}::unpackCoalescence( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackCoalescence'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} +) const { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(inBuffer); + + {{kernels['unpackCoalescence'] | generate_call(cell_interval='ci', stream='stream') | indent(3) }} +} + +{% endblock %} diff --git a/python/lbmpy_walberla/templates/NonuniformPackingKernels.tmpl.h b/python/lbmpy_walberla/templates/NonuniformPackingKernels.tmpl.h new file mode 100644 index 0000000000000000000000000000000000000000..01c99a69577f7928d830d743f9d0aba0b8584ea9 --- /dev/null +++ b/python/lbmpy_walberla/templates/NonuniformPackingKernels.tmpl.h @@ -0,0 +1,74 @@ +{% extends "PackingKernels.tmpl.h" %} + +{% block AdditionalPublicDeclarations %} +{% if target is equalto 'cpu' -%} + using MaskField_T = GhostLayerField< uint32_t, 1 >; +{%- elif target is equalto 'gpu' -%} + using MaskField_T = gpu::GPUField< uint32_t >; +{%- endif %} + + + /** + * Unpacks and uniformly redistributes populations coming from a coarse block onto + * the fine grid. 
+ */ + void unpackRedistribute( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackRedistribute'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Partially coalesces and packs populations streaming from a fine block into a coarse block + */ + void packPartialCoalescence( + {{- [ "PdfField_T * " + src_field.name, "MaskField_T * " + mask_field.name, "CellInterval & ci", + "unsigned char * outBuffer", kernels['packPartialCoalescence'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Prepares a coarse block for coalescence by setting every population that must be coalesced from fine blocks + * to zero. + */ + void zeroCoalescenceRegion( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + kernels['zeroCoalescenceRegion'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Unpacks and coalesces populations coming from a fine block onto the coarse grid + */ + void unpackCoalescence( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackCoalescence'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Returns the number of bytes that will be unpacked to the cell interval + * when using unpackRedistribute. This is 2^{-d} of the data that would be + * unpacked during same-level communication. + * @param ci The cell interval + * @return The required size of the buffer, in bytes + */ + uint_t redistributeSize(CellInterval & ci) const { + return size(ci) >> {{dimension}}; + } + + /** + * Returns the number of bytes that will be packed from the cell interval + * when using packPartialCoalescence.
+ * @param ci The cell interval + * @param dir The communication direction + * @return The required size of the buffer, in bytes + */ + uint_t partialCoalescenceSize(CellInterval & ci, stencil::Direction dir) const { + return size(ci, dir) >> {{dimension}}; + } +{% endblock %} diff --git a/python/lbmpy_walberla/templates/PackingKernels.tmpl.cpp b/python/lbmpy_walberla/templates/PackingKernels.tmpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4e536a9f8bbff9bf3fa3c2d907029112d9126053 --- /dev/null +++ b/python/lbmpy_walberla/templates/PackingKernels.tmpl.cpp @@ -0,0 +1,137 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file {{class_name}}.cpp +//! 
\\author lbmpy +//====================================================================================================================== + +#include "{{class_name}}.h" + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wfloat-equal" +# pragma GCC diagnostic ignored "-Wshadow" +# pragma GCC diagnostic ignored "-Wconversion" +# pragma GCC diagnostic ignored "-Wunused-variable" +#endif + + +/************************************************************************************* + * Kernel Definitions +*************************************************************************************/ +//NOLINTBEGIN(readability-non-const-parameter*) +{{ kernels['packAll'] | generate_definitions }} +{{ kernels['unpackAll'] | generate_definitions }} +{{ kernels['localCopyAll'] | generate_definitions }} + +{{ kernels['packDirection'] | generate_definitions }} +{{ kernels['unpackDirection'] | generate_definitions }} +{{ kernels['localCopyDirection'] | generate_definitions }} + +{% block AdditionalKernelDefinitions %} +{% endblock %} +//NOLINTEND(readability-non-const-parameter*) + + +/************************************************************************************* + * Kernel Wrappers +*************************************************************************************/ + +namespace walberla { +namespace {{namespace}} { + +void {{class_name}}::packAll( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & ci", + "unsigned char * outBuffer", kernels['packAll'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} +) const { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(outBuffer); + + {{kernels['packAll'] | generate_call(cell_interval='ci', stream='stream') | indent(3) }} +} + + +void {{class_name}}::unpackAll( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", 
kernels['unpackAll'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} +) const { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(inBuffer); + + {{kernels['unpackAll'] | generate_call(cell_interval='ci', stream='stream') | indent(3) }} +} + + +void {{class_name}}::localCopyAll( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval", + "PdfField_T * " + dst_field.name, "CellInterval & dstInterval", + kernels['localCopyAll'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} +) const { + WALBERLA_ASSERT_EQUAL(srcInterval.xSize(), dstInterval.xSize()) + WALBERLA_ASSERT_EQUAL(srcInterval.ySize(), dstInterval.ySize()) + WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize()) + + {{kernels['localCopyAll'] + | generate_call(cell_interval={src_field : 'srcInterval', dst_field : 'dstInterval'}, stream='stream') + | indent(3) }} +} + +void {{class_name}}::packDirection( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & ci", + "unsigned char * outBuffer", kernels['packDirection'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} +) const { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(outBuffer); + + {{kernels['packDirection'] | generate_call(cell_interval='ci', stream='stream') | indent(3) }} +} + +void {{class_name}}::unpackDirection( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackDirection'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} +) const { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(inBuffer); + + {{kernels['unpackDirection'] | generate_call(cell_interval='ci', stream='stream') | indent(3) }} +} + +void {{class_name}}::localCopyDirection( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval", + "PdfField_T * " + dst_field.name, 
"CellInterval & dstInterval", + kernels['localCopyDirection'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} +) const { + WALBERLA_ASSERT_EQUAL(srcInterval.xSize(), dstInterval.xSize()) + WALBERLA_ASSERT_EQUAL(srcInterval.ySize(), dstInterval.ySize()) + WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize()) + + {{kernels['localCopyDirection'] + | generate_call(cell_interval={src_field : 'srcInterval', dst_field : 'dstInterval'}, stream='stream') + | indent(3) }} +} + +{% block AdditionalDefinitions %} +{% endblock %} + +} // namespace {{namespace}} +} // namespace walberla \ No newline at end of file diff --git a/python/lbmpy_walberla/templates/PackingKernels.tmpl.h b/python/lbmpy_walberla/templates/PackingKernels.tmpl.h new file mode 100644 index 0000000000000000000000000000000000000000..5371e395d948e758efde91dafdb237fdb855aa9c --- /dev/null +++ b/python/lbmpy_walberla/templates/PackingKernels.tmpl.h @@ -0,0 +1,169 @@ +//====================================================================================================================== +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file {{class_name}}.h +//! 
\\author lbmpy +//====================================================================================================================== + +#pragma once + +#include "stencil/Directions.h" +#include "core/cell/CellInterval.h" +#include "core/mpi/SendBuffer.h" +#include "core/mpi/RecvBuffer.h" +#include "domain_decomposition/IBlock.h" +#include "field/GhostLayerField.h" +{% if target is equalto 'gpu' -%} +#include "gpu/GPUWrapper.h" +#include "gpu/GPUField.h" +{%- endif %} + +{% if target is equalto 'cpu' -%} +#define FUNC_PREFIX +{%- elif target is equalto 'gpu' -%} +#define FUNC_PREFIX __global__ +{%- endif %} + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +#ifdef WALBERLA_CXX_COMPILER_IS_GNU +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-parameter" +#endif + +#ifdef WALBERLA_CXX_COMPILER_IS_CLANG +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wunused-parameter" +#endif + +namespace walberla{ +namespace {{namespace}} { + +class {{class_name}} { + +public: + using PdfField_T = {{src_field | field_type(is_gpu=is_gpu)}}; + using value_type = typename PdfField_T::value_type; + + static const bool inplace = {% if inplace -%} true {%- else -%} false {%- endif -%}; + + /** + * Packs all pdfs from the given cell interval to the send buffer. + */ + void packAll( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & ci", + "unsigned char * outBuffer", kernels['packAll'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Unpacks all pdfs from the send buffer to the given cell interval. 
+ */ + void unpackAll( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackAll'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Copies data between two blocks on the same process. + * All pdfs from the sending interval are copied onto the receiving interval. + */ + void localCopyAll( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval", + "PdfField_T * " + dst_field.name, "CellInterval & dstInterval", + kernels['localCopyAll'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Packs only those populations streaming in directions aligned with the sending direction dir from the given + * cell interval. + * For example, in 2D, if dir == N, the pdfs streaming in directions NW, N, NE are packed. + */ + void packDirection( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & ci", + "unsigned char * outBuffer", kernels['packDirection'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Unpacks only those populations streaming in directions aligned with the sending direction dir to the given + * cell interval. + * For example, in 2D, if dir == N, the pdfs streaming in directions NW, N, NE are unpacked. + */ + void unpackDirection( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackDirection'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Copies data between two blocks on the same process. + * PDFs streaming aligned with the direction dir are copied from the sending interval + * onto the receiving interval. 
+ */ + void localCopyDirection( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval", + "PdfField_T * " + dst_field.name, "CellInterval & dstInterval", + kernels['localCopyDirection'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Returns the number of bytes that will be packed from / unpacked to the cell interval + * when using packDirection / unpackDirection + * @param ci The cell interval + * @param dir The communication direction + * @return The required size of the buffer, in bytes + */ + uint_t size (CellInterval & ci, stencil::Direction dir) const { + return ci.numCells() * sizes[dir] * sizeof(value_type); + } + + /** + * Returns the number of bytes that will be packed from / unpacked to the cell interval + * when using packAll / unpackAll + * @param ci The cell interval + * @return The required size of the buffer, in bytes + */ + uint_t size (CellInterval & ci) const { + return ci.numCells() * {{stencil_size}} * sizeof(value_type); + } + + {% block AdditionalPublicDeclarations %} + {% endblock %} + + private: + const uint_t sizes[{{direction_sizes|length}}] { {{ direction_sizes | join(', ') }} }; +}; + +} // namespace {{namespace}} +} // namespace walberla diff --git a/python/lbmpy_walberla/utility.py b/python/lbmpy_walberla/utility.py new file mode 100644 index 0000000000000000000000000000000000000000..1289c381e7b50ac7e83d34fca887e6d659959b92 --- /dev/null +++ b/python/lbmpy_walberla/utility.py @@ -0,0 +1,11 @@ +from lbmpy.advanced_streaming import Timestep + + +def timestep_suffix(timestep: Timestep): + """ get the suffix as string for a timestep + + :param timestep: instance of class lbmpy.advanced_streaming.Timestep + :return: either "even", "odd" or an empty string + """ + return ("_" + str(timestep)) if timestep != Timestep.BOTH else '' + diff --git a/python/lbmpy_walberla/walberla_lbm_generation.py 
b/python/lbmpy_walberla/walberla_lbm_generation.py index 8566d3915697e28600f54524d7d43e53a98c17b7..e264fb8bbbb8c67040de8c309e40e8b57c0f7053 100644 --- a/python/lbmpy_walberla/walberla_lbm_generation.py +++ b/python/lbmpy_walberla/walberla_lbm_generation.py @@ -1,4 +1,6 @@ # import warnings +from typing import Callable, List + import numpy as np import sympy as sp @@ -18,8 +20,10 @@ from pystencils.node_collection import NodeCollection from pystencils.stencil import offset_to_direction_string from pystencils.sympyextensions import get_symmetric_part from pystencils.typing.transformations import add_types -from pystencils_walberla.codegen import KernelInfo, config_from_context + +from pystencils_walberla.kernel_info import KernelInfo from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env +from pystencils_walberla.utility import config_from_context cpp_printer = CustomSympyPrinter() REFINEMENT_SCALE_FACTOR = sp.Symbol("level_scale_factor") @@ -155,7 +159,7 @@ def __lattice_model(generation_context, class_name, config, lb_method, stream_co generation_context.write_file(f"{class_name}.cpp", source) -def generate_lattice_model(generation_context, class_name, collision_rule, field_layout='zyxf', refinement_scaling=None, +def generate_lattice_model(generation_context, class_name, collision_rule, field_layout='fzyx', refinement_scaling=None, target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None, **create_kernel_params): diff --git a/python/lbmpy_walberla/walberla_lbm_package.py b/python/lbmpy_walberla/walberla_lbm_package.py new file mode 100644 index 0000000000000000000000000000000000000000..e21d6c9613a1c2be87e21cbc06a2a78212f72552 --- /dev/null +++ b/python/lbmpy_walberla/walberla_lbm_package.py @@ -0,0 +1,53 @@ +from typing import Callable, List, Dict + +from pystencils import Target, Field + +from lbmpy.creationfunctions import LbmCollisionRule, LBMConfig, LBMOptimisation +from lbmpy.relaxationrates import 
get_shear_relaxation_rate + +from pystencils_walberla.cmake_integration import CodeGenerationContext + +from lbmpy_walberla.boundary_collection import generate_boundary_collection +from lbmpy_walberla.storage_specification import generate_lbm_storage_specification +from lbmpy_walberla.sweep_collection import generate_lbm_sweep_collection, RefinementScaling + + +def generate_lbm_package(ctx: CodeGenerationContext, name: str, + collision_rule: LbmCollisionRule, + lbm_config: LBMConfig, lbm_optimisation: LBMOptimisation, + nonuniform: bool = False, boundaries: List[Callable] = None, + macroscopic_fields: Dict[str, Field] = None, + target: Target = Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None, + max_threads=None, + **kernel_parameters): + + if macroscopic_fields is None: + macroscopic_fields = {} + + method = collision_rule.method + + storage_spec_name = f'{name}StorageSpecification' + generate_lbm_storage_specification(ctx, storage_spec_name, method, lbm_config, + nonuniform=nonuniform, target=target, data_type=data_type) + + if nonuniform: + omega = get_shear_relaxation_rate(method) + refinement_scaling = RefinementScaling() + refinement_scaling.add_standard_relaxation_rate_scaling(omega) + else: + refinement_scaling = None + + streaming_pattern = lbm_config.streaming_pattern + generate_lbm_sweep_collection(ctx, f'{name}SweepCollection', collision_rule, + streaming_pattern=streaming_pattern, + field_layout=lbm_optimisation.field_layout, + refinement_scaling=refinement_scaling, + macroscopic_fields=macroscopic_fields, + target=target, data_type=data_type, + cpu_openmp=cpu_openmp, cpu_vectorize_info=cpu_vectorize_info, + max_threads=max_threads, + **kernel_parameters) + + generate_boundary_collection(ctx, f'{name}BoundaryCollection', boundary_generators=boundaries, + lb_method=method, streaming_pattern=streaming_pattern, + target=target, layout=lbm_optimisation.field_layout) diff --git a/python/pystencils_walberla/__init__.py 
b/python/pystencils_walberla/__init__.py index 0ea2d02cb4b93fc880f0addc38058e0363e39c8c..f78f7fcf244e7fd140cd2abcc93ebaebaea2f94f 100644 --- a/python/pystencils_walberla/__init__.py +++ b/python/pystencils_walberla/__init__.py @@ -1,13 +1,17 @@ from .boundary import generate_staggered_boundary, generate_staggered_flux_boundary -from .cmake_integration import CodeGeneration -from .codegen import ( - generate_pack_info, generate_pack_info_for_field, generate_pack_info_from_kernel, - generate_mpidtype_info_from_kernel, generate_sweep, get_vectorize_instruction_set, generate_selective_sweep, - config_from_context) -from .utility import generate_info_header +from .cmake_integration import CodeGeneration, ManualCodeGenerationContext -__all__ = ['CodeGeneration', - 'generate_sweep', 'generate_pack_info_from_kernel', 'generate_pack_info_for_field', 'generate_pack_info', - 'generate_mpidtype_info_from_kernel', 'generate_staggered_boundary', 'generate_staggered_flux_boundary', - 'get_vectorize_instruction_set', 'generate_selective_sweep', 'config_from_context', - 'generate_info_header'] +from .function_generator import function_generator +from .kernel_info import KernelInfo +from .sweep import generate_sweep, generate_selective_sweep, generate_sweep_collection +from .pack_info import (generate_pack_info, generate_pack_info_for_field, + generate_pack_info_from_kernel, generate_mpidtype_info_from_kernel) +from .utility import generate_info_header, get_vectorize_instruction_set, config_from_context + +__all__ = ['generate_staggered_boundary', 'generate_staggered_flux_boundary', + 'CodeGeneration', 'ManualCodeGenerationContext', + 'function_generator', + 'generate_sweep', 'generate_selective_sweep', 'generate_sweep_collection', + 'generate_pack_info', 'generate_pack_info_for_field', 'generate_pack_info_from_kernel', + 'generate_mpidtype_info_from_kernel', + 'generate_info_header', 'get_vectorize_instruction_set', 'config_from_context'] diff --git 
a/python/pystencils_walberla/boundary.py b/python/pystencils_walberla/boundary.py index 4fc9cf6e517d9b513511530eb05b5dab9eb10edd..c5a5e54c1d00d9d6e476306453eae4320b6f5aa8 100644 --- a/python/pystencils_walberla/boundary.py +++ b/python/pystencils_walberla/boundary.py @@ -2,13 +2,10 @@ import numpy as np from jinja2 import Environment, PackageLoader, StrictUndefined from pystencils import Field, FieldType, Target from pystencils.boundaries.boundaryhandling import create_boundary_kernel -from pystencils.boundaries.createindexlist import ( - boundary_index_array_coordinate_names, direction_member_name, - numpy_data_type_for_boundary_object) +from pystencils.boundaries.createindexlist import numpy_data_type_for_boundary_object from pystencils.typing import TypedSymbol, create_type -from pystencils.stencil import inverse_direction -from pystencils_walberla.codegen import config_from_context +from pystencils_walberla.utility import config_from_context, struct_from_numpy_dtype from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env from pystencils_walberla.additional_data_handler import AdditionalDataHandler from pystencils_walberla.kernel_selection import ( @@ -32,6 +29,7 @@ def generate_boundary(generation_context, interface_mappings=(), generate_functor=True, layout='fzyx', + field_timestep=None, **create_kernel_params): if boundary_object.additional_data and additional_data_handler is None: @@ -75,8 +73,9 @@ def generate_boundary(generation_context, else: raise ValueError(f"kernel_creation_function returned wrong type: {kernel.__class__}") - kernel_family = KernelFamily(selection_tree, class_name) - interface_spec = HighLevelInterfaceSpec(kernel_family.kernel_selection_parameters, interface_mappings) + kernel_family = KernelFamily(selection_tree, class_name, field_timestep=field_timestep) + selection_parameters = kernel_family.kernel_selection_parameters if field_timestep is None else [] + interface_spec = 
HighLevelInterfaceSpec(selection_parameters, interface_mappings) if additional_data_handler is None: additional_data_handler = AdditionalDataHandler(stencil=neighbor_stencil) @@ -97,8 +96,9 @@ def generate_boundary(generation_context, 'inner_or_boundary': boundary_object.inner_or_boundary, 'single_link': boundary_object.single_link, 'additional_data_handler': additional_data_handler, - 'dtype': "float" if is_float else "double", - 'layout': layout + 'dtype': "double" if is_float else "float", + 'layout': layout, + 'index_shape': index_shape } env = Environment(loader=PackageLoader('pystencils_walberla'), undefined=StrictUndefined) @@ -111,6 +111,8 @@ def generate_boundary(generation_context, generation_context.write_file(f"{class_name}.h", header) generation_context.write_file(f"{class_name}.{source_extension}", source) + return context + def generate_staggered_boundary(generation_context, class_name, boundary_object, dim, neighbor_stencil, index_shape, target=Target.CPU, **kwargs): @@ -126,28 +128,3 @@ def generate_staggered_flux_boundary(generation_context, class_name, boundary_ob FieldType.STAGGERED_FLUX, target=target, **kwargs) -def struct_from_numpy_dtype(struct_name, numpy_dtype): - result = f"struct {struct_name} {{ \n" - - equality_compare = [] - constructor_params = [] - constructor_initializer_list = [] - for name, (sub_type, offset) in numpy_dtype.fields.items(): - pystencils_type = create_type(sub_type) - result += f" {pystencils_type} {name};\n" - if name in boundary_index_array_coordinate_names or name == direction_member_name: - constructor_params.append(f"{pystencils_type} {name}_") - constructor_initializer_list.append(f"{name}({name}_)") - else: - constructor_initializer_list.append(f"{name}()") - if pystencils_type.is_float(): - equality_compare.append(f"floatIsEqual({name}, o.{name})") - else: - equality_compare.append(f"{name} == o.{name}") - - result += " %s(%s) : %s {}\n" % \ - (struct_name, ", ".join(constructor_params), ", 
".join(constructor_initializer_list)) - result += " bool operator==(const %s & o) const {\n return %s;\n }\n" % \ - (struct_name, " && ".join(equality_compare)) - result += "};\n" - return result diff --git a/python/pystencils_walberla/cmake_integration.py b/python/pystencils_walberla/cmake_integration.py index 932e5ce69dbc8309c8000b53e2fd9a34b21e2f4a..4d5654c08a1474b53852f643c2cf4249a12901db 100644 --- a/python/pystencils_walberla/cmake_integration.py +++ b/python/pystencils_walberla/cmake_integration.py @@ -105,6 +105,12 @@ class ManualCodeGenerationContext: def write_file(self, name, content): self.files[name] = content + def write_all_files(self): + for name, content in self.files.items(): + with open(name, 'w') as f: + f.write(content) + self.files = dict() + def __enter__(self): return self diff --git a/python/pystencils_walberla/codegen.py b/python/pystencils_walberla/codegen.py index 9e6ada3b86c6d57757d5ca814a481b41a571d0c5..ac475f72c9489d9e7b74ce25d9bf303413ae7834 100644 --- a/python/pystencils_walberla/codegen.py +++ b/python/pystencils_walberla/codegen.py @@ -124,7 +124,7 @@ def generate_selective_sweep(generation_context, class_name, selection_tree, int elif target != kernel_family.get_ast_attr('target'): raise ValueError('Mismatch between target parameter and AST targets.') - if not (generation_context.cuda or generation_context.hip) and target == Target.GPU: + if not generation_context.gpu and target == Target.GPU: return representative_field = {p.field_name for p in kernel_family.parameters if p.is_field_parameter} diff --git a/python/pystencils_walberla/function_generator.py b/python/pystencils_walberla/function_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..5c7b60803c59d788ce615ceeeb314b5ece0acbe1 --- /dev/null +++ b/python/pystencils_walberla/function_generator.py @@ -0,0 +1,77 @@ +from typing import Sequence, Union + + +from pystencils import Target, Assignment, AssignmentCollection +from pystencils import 
create_kernel, create_staggered_kernel + +from pystencils_walberla.cmake_integration import CodeGenerationContext +from pystencils_walberla.kernel_selection import KernelCallNode, KernelFamily, HighLevelInterfaceSpec +from pystencils_walberla.utility import config_from_context + + +def function_generator(ctx: CodeGenerationContext, class_name: str, + assignments: Union[Sequence[Assignment], AssignmentCollection], + namespace: str = 'pystencils', staggered=False, field_swaps=None, varying_parameters=(), + ghost_layers_to_include=0, + target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None, + max_threads=None, + **create_kernel_params): + return lambda: __function_generator(ctx, class_name, assignments, + namespace, staggered, field_swaps, varying_parameters, + ghost_layers_to_include, + target, data_type, cpu_openmp, cpu_vectorize_info, max_threads, + **create_kernel_params) + + +def __function_generator(ctx: CodeGenerationContext, class_name: str, + assignments: Union[Sequence[Assignment], AssignmentCollection], + namespace: str = 'pystencils', staggered=False, field_swaps=None, varying_parameters=(), + ghost_layers_to_include=0, + target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None, + max_threads=None, + **create_kernel_params): + if staggered: + assert 'omp_single_loop' not in create_kernel_params + + create_kernel_params['omp_single_loop'] = False + config = config_from_context(ctx, target=target, data_type=data_type, cpu_openmp=cpu_openmp, + cpu_vectorize_info=cpu_vectorize_info, **create_kernel_params) + + if not staggered: + ast = create_kernel(assignments, config=config) + else: + # This should not be necessary but create_staggered_kernel does not take a config at the moment ... 
+ ast = create_staggered_kernel(assignments, **config.__dict__) + + ast.function_name = class_name.lower() + + all_field_names = [f.name for f in ast.fields_accessed] + all_field_names.sort() + + temporary_fields = [f for f in all_field_names if "_tmp" in f] + + if field_swaps is None: + field_swaps = [] + for field_name in all_field_names: + if field_name + "_tmp" in temporary_fields: + field_swaps.append((field_name, field_name + "_tmp")) + + selection_tree = KernelCallNode(ast) + kernel_family = KernelFamily(selection_tree, class_name, + temporary_fields, field_swaps, varying_parameters) + + representative_field = {p.field_name for p in kernel_family.parameters if p.is_field_parameter} + representative_field = sorted(representative_field)[0] + + interface_spec = HighLevelInterfaceSpec(kernel_family.kernel_selection_parameters, ()) + + jinja_context = { + 'kernel': kernel_family, + 'namespace': namespace, + 'function_name': class_name, + 'field': representative_field, + 'ghost_layers_to_include': ghost_layers_to_include, + 'interface_spec': interface_spec, + 'max_threads': max_threads + } + return jinja_context diff --git a/python/pystencils_walberla/jinja_filters.py b/python/pystencils_walberla/jinja_filters.py index 61ca6f12ac1eabfc6c1be09aad836b424c34aaee..b2413bcefe8c4a4468e4971c1b8901116519c57d 100644 --- a/python/pystencils_walberla/jinja_filters.py +++ b/python/pystencils_walberla/jinja_filters.py @@ -4,6 +4,7 @@ try: except ImportError: from jinja2 import contextfilter as jinja2_context_decorator +from collections.abc import Iterable import sympy as sp from pystencils import Target, Backend @@ -43,6 +44,18 @@ delete_loop = """ }} """ +standard_parameter_registration = """ +for (uint_t level = 0; level < blocks->getNumberOfLevels(); level++) +{{ + const {dtype} level_scale_factor = {dtype}(uint_t(1) << level); + const {dtype} one = {dtype}(1.0); + const {dtype} half = {dtype}(0.5); + + {name}Vector.push_back( {dtype}({name} / (level_scale_factor * (-{name} 
* half + one) + {name} * half)) ); +}} +""" + + # the target will enter the jinja filters as string. The reason for that is, that is not easy to work with the # enum in the template files. def translate_target(target): @@ -61,6 +74,12 @@ def make_field_type(dtype, f_size, is_gpu): return f"field::GhostLayerField<{dtype}, {f_size}>" +def field_type(field, is_gpu=False): + dtype = get_base_type(field.dtype) + f_size = get_field_fsize(field) + return make_field_type(dtype, f_size, is_gpu) + + def get_field_fsize(field): """Determines the size of the index coordinate. Since walberla fields only support one index dimension, pystencils fields with multiple index dimensions are linearized to a single index dimension. @@ -147,35 +166,30 @@ def field_extraction_code(field, is_temporary, declaration_only=False, is_gpu: if the field is a GhostLayerField or a GpuField update_member: specify if function is used inside a constructor; add _ to members """ - # Determine size of f coordinate which is a template parameter - f_size = get_field_fsize(field) - field_name = field.name - dtype = get_base_type(field.dtype) - field_type = make_field_type(dtype, f_size, is_gpu) + wlb_field_type = field_type(field, is_gpu) if not is_temporary: - dtype = get_base_type(field.dtype) - field_type = make_field_type(dtype, f_size, is_gpu) if declaration_only: - return f"{field_type} * {field_name}_;" + return f"{wlb_field_type} * {field.name}_;" else: prefix = "" if no_declaration else "auto " if update_member: - return f"{prefix}{field_name}_ = block->getData< {field_type} >({field_name}ID);" + return f"{prefix}{field.name}_ = block->getData< {wlb_field_type} >({field.name}ID);" else: - return f"{prefix}{field_name} = block->getData< {field_type} >({field_name}ID);" + return f"{prefix}{field.name} = block->getData< {wlb_field_type} >({field.name}ID);" else: - assert field_name.endswith('_tmp') - original_field_name = field_name[:-len('_tmp')] + assert field.name.endswith('_tmp') + 
original_field_name = field.name[:-len('_tmp')] if declaration_only: - return f"{field_type} * {field_name}_;" + return f"{wlb_field_type} * {field.name}_;" else: - declaration = f"{field_type} * {field_name};" + declaration = f"{wlb_field_type} * {field.name};" tmp_field_str = temporary_fieldTemplate.format(original_field_name=original_field_name, - tmp_field_name=field_name, type=field_type) + tmp_field_name=field.name, type=wlb_field_type) return tmp_field_str if no_declaration else declaration + tmp_field_str +# TODO fields are not sorted @jinja2_context_decorator def generate_block_data_to_field_extraction(ctx, kernel_info, parameters_to_ignore=(), parameters=None, declarations_only=False, no_declarations=False, update_member=False): @@ -211,11 +225,22 @@ def generate_block_data_to_field_extraction(ctx, kernel_info, parameters_to_igno return result -def generate_refs_for_kernel_parameters(kernel_info, prefix, parameters_to_ignore=(), ignore_fields=False): +def generate_refs_for_kernel_parameters(kernel_info, prefix, parameters_to_ignore=(), ignore_fields=False, + parameter_registration=None): symbols = {p.field_name for p in kernel_info.parameters if p.is_field_pointer and not ignore_fields} symbols.update(p.symbol.name for p in kernel_info.parameters if not p.is_field_parameter) symbols.difference_update(parameters_to_ignore) - return "\n".join("auto & %s = %s%s_;" % (s, prefix, s) for s in symbols) + type_information = {p.symbol.name: p.symbol.dtype for p in kernel_info.parameters if not p.is_field_parameter} + result = [] + registered_parameters = [] if not parameter_registration else parameter_registration.scaling_info + for s in symbols: + if s in registered_parameters: + dtype = type_information[s].c_name + result.append("const uint_t level = block->getBlockStorage().getLevel(*block);") + result.append(f"{dtype} & {s} = {s}Vector[level];") + else: + result.append(f"auto & {s} = {prefix}{s}_;") + return "\n".join(result) @jinja2_context_decorator @@ 
-235,7 +260,7 @@ def generate_call(ctx, kernel, ghost_layers_to_include=0, cell_interval=None, st that defines the inner region for the kernel to loop over. Parameter has to be left to default if ghost_layers_to_include is specified. stream: optional name of gpu stream variable - spatial_shape_symbols: relevant only for gpu kernels - to determine CUDA block and grid sizes the iteration + spatial_shape_symbols: relevant only for gpu kernels - to determine GPU block and grid sizes the iteration region (i.e. field shape) has to be known. This can normally be inferred by the kernel parameters - however in special cases like boundary conditions a manual specification may be necessary. @@ -260,33 +285,38 @@ def generate_call(ctx, kernel, ghost_layers_to_include=0, cell_interval=None, st required_ghost_layers = 0 else: # ghost layer info is ((x_gl_front, x_gl_end), (y_gl_front, y_gl_end).. ) - required_ghost_layers = max(max(kernel_ghost_layers)) + if isinstance(kernel_ghost_layers, int): + required_ghost_layers = kernel_ghost_layers + else: + required_ghost_layers = max(max(kernel_ghost_layers)) kernel_call_lines = [] + def get_cell_interval(field_object): + if isinstance(cell_interval, str): + return cell_interval + elif isinstance(cell_interval, dict): + return cell_interval[field_object] + else: + return None + def get_start_coordinates(field_object): - if cell_interval is None: + ci = get_cell_interval(field_object) + if ci is None: return [-ghost_layers_to_include - required_ghost_layers] * field_object.spatial_dimensions else: assert ghost_layers_to_include == 0 - if field_object.spatial_dimensions == 3: - return [sp.Symbol("{ci}.{coord}Min()".format(coord=coord_name, ci=cell_interval)) - required_ghost_layers - for coord_name in ('x', 'y', 'z')] - elif field_object.spatial_dimensions == 2: - return [sp.Symbol("{ci}.{coord}Min()".format(coord=coord_name, ci=cell_interval)) - required_ghost_layers - for coord_name in ('x', 'y')] - else: - raise 
NotImplementedError(f"Only 2D and 3D fields are supported but a field with " - f"{field_object.spatial_dimensions} dimensions was passed") + return [sp.Symbol(f"{ci}.{coord_name}Min()") - required_ghost_layers for coord_name in ('x', 'y', 'z')] def get_end_coordinates(field_object): - if cell_interval is None: + ci = get_cell_interval(field_object) + if ci is None: shape_names = ['xSize()', 'ySize()', 'zSize()'][:field_object.spatial_dimensions] offset = 2 * ghost_layers_to_include + 2 * required_ghost_layers - return [f"cell_idx_c({field_object.name}->{e}) + {offset}" for e in shape_names] + return [f"int64_c({field_object.name}->{e}) + {offset}" for e in shape_names] else: assert ghost_layers_to_include == 0 - return [f"cell_idx_c({cell_interval}.{coord_name}Size()) + {2 * required_ghost_layers}" + return [f"int64_c({ci}.{coord_name}Size()) + {2 * required_ghost_layers}" for coord_name in ('x', 'y', 'z')] for param in ast_params: @@ -347,6 +377,38 @@ def generate_call(ctx, kernel, ghost_layers_to_include=0, cell_interval=None, st return "\n".join(kernel_call_lines) +@jinja2_context_decorator +def generate_function_collection_call(ctx, kernel_info, parameters_to_ignore=(), cell_interval=None, ghost_layers=None): + target = translate_target(ctx['target']) + is_gpu = target == Target.GPU + + parameters = [] + for param in kernel_info.parameters: + if param.is_field_pointer and param.field_name not in parameters_to_ignore: + parameters.append(param.field_name) + + for param in kernel_info.parameters: + if not param.is_field_parameter and param.symbol.name not in parameters_to_ignore: + parameters.append(param.symbol.name) + + # TODO due to backward compatibility with high level interface spec + for parameter in kernel_info.kernel_selection_tree.get_selection_parameter_list(): + if parameter.name not in parameters_to_ignore: + parameters.append(parameter.name) + + if cell_interval: + assert ghost_layers is None, "If a cell interval is specified ghost layers can not be 
specified" + parameters.append(cell_interval) + + if ghost_layers: + parameters.append(ghost_layers) + + if is_gpu and "gpuStream" not in parameters_to_ignore: + parameters.append(f"gpuStream") + + return ", ".join(parameters) + + def generate_swaps(kernel_info): """Generates code to swap main fields with temporary fields""" swaps = "" @@ -355,119 +417,229 @@ def generate_swaps(kernel_info): return swaps -# TODO: basically 3 times the same code :( -def generate_constructor_initializer_list(kernel_info, parameters_to_ignore=None): - if parameters_to_ignore is None: - parameters_to_ignore = [] +def generate_timestep_advancements(kernel_info, advance=True): + """Generates code to detect even or odd timestep""" + if kernel_info.field_timestep: + field_name = kernel_info.field_timestep["field_name"] + advancement_function = kernel_info.field_timestep["function"] + if advancement_function == "advanceTimestep" and advance is False: + advancement_function = "getTimestepPlusOne" + return f"uint8_t timestep = {field_name}->{advancement_function}();" + return "" - varying_parameter_names = [] - if hasattr(kernel_info, 'varying_parameters'): - varying_parameter_names = tuple(e[1] for e in kernel_info.varying_parameters) - parameters_to_ignore += kernel_info.temporary_fields + varying_parameter_names + +def generate_constructor_initializer_list(kernel_infos, parameters_to_ignore=None, parameter_registration=None): + if not isinstance(kernel_infos, Iterable): + kernel_infos = [kernel_infos] + + parameters_to_skip = [] + if parameters_to_ignore is not None: + parameters_to_skip = [p for p in parameters_to_ignore] + + for kernel_info in kernel_infos: + parameters_to_skip += kernel_info.temporary_fields parameter_initializer_list = [] # First field pointer - for param in kernel_info.parameters: - if param.is_field_pointer and param.field_name not in parameters_to_ignore: - parameter_initializer_list.append(f"{param.field_name}ID({param.field_name}ID_)") + for kernel_info in 
kernel_infos: + for param in kernel_info.parameters: + if param.is_field_pointer and param.field_name not in parameters_to_skip: + parameter_initializer_list.append(f"{param.field_name}ID({param.field_name}ID_)") + parameters_to_skip.append(param.field_name) # Then free parameters - for param in kernel_info.parameters: - if not param.is_field_parameter and param.symbol.name not in parameters_to_ignore: - parameter_initializer_list.append(f"{param.symbol.name}_({param.symbol.name})") + if parameter_registration is not None: + parameters_to_skip.extend(parameter_registration.scaling_info) + + for kernel_info in kernel_infos: + for param in kernel_info.parameters: + if not param.is_field_parameter and param.symbol.name not in parameters_to_skip: + parameter_initializer_list.append(f"{param.symbol.name}_({param.symbol.name})") + parameters_to_skip.append(param.symbol.name) return ", ".join(parameter_initializer_list) -def generate_constructor_parameters(kernel_info, parameters_to_ignore=None): - if parameters_to_ignore is None: - parameters_to_ignore = [] +# TODO check varying_parameters +def generate_constructor_parameters(kernel_infos, parameters_to_ignore=None): + if not isinstance(kernel_infos, Iterable): + kernel_infos = [kernel_infos] + + parameters_to_skip = [] + if parameters_to_ignore is not None: + parameters_to_skip = [p for p in parameters_to_ignore] varying_parameters = [] - if hasattr(kernel_info, 'varying_parameters'): - varying_parameters = kernel_info.varying_parameters - varying_parameter_names = tuple(e[1] for e in varying_parameters) - parameters_to_ignore += kernel_info.temporary_fields + varying_parameter_names + for kernel_info in kernel_infos: + if hasattr(kernel_info, 'varying_parameters'): + varying_parameters = kernel_info.varying_parameters + varying_parameter_names = tuple(e[1] for e in varying_parameters) + parameters_to_skip += kernel_info.temporary_fields + varying_parameter_names parameter_list = [] # First field pointer - for param in 
kernel_info.parameters: - if param.is_field_pointer and param.field_name not in parameters_to_ignore: - parameter_list.append(f"BlockDataID {param.field_name}ID_") + for kernel_info in kernel_infos: + for param in kernel_info.parameters: + if param.is_field_pointer and param.field_name not in parameters_to_skip: + parameter_list.append(f"BlockDataID {param.field_name}ID_") + parameters_to_skip.append(param.field_name) # Then free parameters - for param in kernel_info.parameters: - if not param.is_field_parameter and param.symbol.name not in parameters_to_ignore: - parameter_list.append(f"{param.symbol.dtype} {param.symbol.name}") + for kernel_info in kernel_infos: + for param in kernel_info.parameters: + if not param.is_field_parameter and param.symbol.name not in parameters_to_skip: + parameter_list.append(f"{param.symbol.dtype} {param.symbol.name}") + parameters_to_skip.append(param.symbol.name) varying_parameters = ["%s %s" % e for e in varying_parameters] return ", ".join(parameter_list + varying_parameters) def generate_constructor_call_arguments(kernel_info, parameters_to_ignore=None): - if parameters_to_ignore is None: - parameters_to_ignore = [] + parameters_to_skip = [] + if parameters_to_ignore is not None: + parameters_to_skip = [p for p in parameters_to_ignore] varying_parameters = [] if hasattr(kernel_info, 'varying_parameters'): varying_parameters = kernel_info.varying_parameters varying_parameter_names = tuple(e[1] for e in varying_parameters) - parameters_to_ignore += kernel_info.temporary_fields + varying_parameter_names + parameters_to_skip += kernel_info.temporary_fields + varying_parameter_names parameter_list = [] for param in kernel_info.parameters: - if param.is_field_pointer and param.field_name not in parameters_to_ignore: + if param.is_field_pointer and param.field_name not in parameters_to_skip: parameter_list.append(f"{param.field_name}ID") - elif not param.is_field_parameter and param.symbol.name not in parameters_to_ignore: + elif not 
param.is_field_parameter and param.symbol.name not in parameters_to_skip: parameter_list.append(f'{param.symbol.name}_') varying_parameters = [f"{e}_" for e in varying_parameter_names] return ", ".join(parameter_list + varying_parameters) @jinja2_context_decorator -def generate_members(ctx, kernel_info, parameters_to_ignore=(), only_fields=False): - fields = {f.name: f for f in kernel_info.fields_accessed} +def generate_members(ctx, kernel_infos, parameters_to_ignore=None, only_fields=False, parameter_registration=None): + if not isinstance(kernel_infos, Iterable): + kernel_infos = [kernel_infos] + + if parameters_to_ignore is None: + parameters_to_ignore = [] + + params_to_skip = [p for p in parameters_to_ignore] + + fields = dict() + for kernel_info in kernel_infos: + for field in kernel_info.fields_accessed: + fields[field.name] = field + + varying_parameters = [] + for kernel_info in kernel_infos: + if hasattr(kernel_info, 'varying_parameters'): + varying_parameters = kernel_info.varying_parameters + varying_parameter_names = tuple(e[1] for e in varying_parameters) + params_to_skip += kernel_info.temporary_fields + params_to_skip += varying_parameter_names - params_to_skip = tuple(parameters_to_ignore) + tuple(kernel_info.temporary_fields) - params_to_skip += tuple(e[1] for e in kernel_info.varying_parameters) target = translate_target(ctx['target']) is_gpu = target == Target.GPU result = [] - for param in kernel_info.parameters: - if only_fields and not param.is_field_parameter: - continue - if param.is_field_pointer and param.field_name not in params_to_skip: - result.append(f"BlockDataID {param.field_name}ID;") + for kernel_info in kernel_infos: + for param in kernel_info.parameters: + if only_fields and not param.is_field_parameter: + continue + if param.is_field_pointer and param.field_name not in params_to_skip: + result.append(f"BlockDataID {param.field_name}ID;") + params_to_skip.append(param.field_name) + + for kernel_info in kernel_infos: + for param 
in kernel_info.parameters: + if only_fields and not param.is_field_parameter: + continue + if not param.is_field_parameter and param.symbol.name not in params_to_skip: + if parameter_registration and param.symbol.name in parameter_registration.scaling_info: + result.append(f"std::vector<{param.symbol.dtype}> {param.symbol.name}Vector;") + else: + result.append(f"{param.symbol.dtype} {param.symbol.name}_;") + params_to_skip.append(param.symbol.name) + + for kernel_info in kernel_infos: + for field_name in kernel_info.temporary_fields: + f = fields[field_name] + if field_name in parameters_to_ignore: + continue + parameters_to_ignore.append(field_name) + assert field_name.endswith('_tmp') + original_field_name = field_name[:-len('_tmp')] + f_size = get_field_fsize(f) + field_type = make_field_type(get_base_type(f.dtype), f_size, is_gpu) + result.append(temporary_fieldMemberTemplate.format(type=field_type, original_field_name=original_field_name)) + + for kernel_info in kernel_infos: + if hasattr(kernel_info, 'varying_parameters'): + result.extend(["%s %s_;" % e for e in kernel_info.varying_parameters]) + return "\n".join(result) + + +@jinja2_context_decorator +def generate_plain_parameter_list(ctx, kernel_info, cell_interval=None, ghost_layers=None, stream=None): + fields = {f.name: f for f in kernel_info.fields_accessed} + target = translate_target(ctx['target']) + is_gpu = target == Target.GPU + + result = [] for param in kernel_info.parameters: - if only_fields and not param.is_field_parameter: + if not param.is_field_parameter: continue - if not param.is_field_parameter and param.symbol.name not in params_to_skip: - result.append(f"{param.symbol.dtype} {param.symbol.name}_;") + if param.is_field_pointer and param.field_name: + f = fields[param.field_name] + f_size = get_field_fsize(f) + field_type = make_field_type(get_base_type(f.dtype), f_size, is_gpu) + result.append(f"{field_type} * {param.field_name}") - for field_name in kernel_info.temporary_fields: - f = 
fields[field_name] - if field_name in parameters_to_ignore: - continue - assert field_name.endswith('_tmp') - original_field_name = field_name[:-len('_tmp')] - f_size = get_field_fsize(f) - field_type = make_field_type(get_base_type(f.dtype), f_size, is_gpu) - result.append(temporary_fieldMemberTemplate.format(type=field_type, original_field_name=original_field_name)) + for param in kernel_info.parameters: + if not param.is_field_parameter and param.symbol.name: + result.append(f"{param.symbol.dtype} {param.symbol.name}") if hasattr(kernel_info, 'varying_parameters'): result.extend(["%s %s_;" % e for e in kernel_info.varying_parameters]) - return "\n".join(result) + # TODO due to backward compatibility with high level interface spec + for parameter in kernel_info.kernel_selection_tree.get_selection_parameter_list(): + result.append(f"{parameter.dtype} {parameter.name}") + + if cell_interval: + result.append(f"const CellInterval & {cell_interval}") + + if ghost_layers is not None: + if type(ghost_layers) in (int, ): + result.append(f"const cell_idx_t ghost_layers = {ghost_layers}") + else: + result.append(f"const cell_idx_t ghost_layers") + + if is_gpu: + if stream is not None: + result.append(f"gpuStream_t stream = {stream}") + else: + result.append(f"gpuStream_t stream") + + return ", ".join(result) -def generate_destructor(kernel_info, class_name): - if not kernel_info.temporary_fields: +def generate_destructor(kernel_infos, class_name): + temporary_fields = [] + if not isinstance(kernel_infos, Iterable): + kernel_infos = [kernel_infos] + for kernel_info in kernel_infos: + for tmp_field in kernel_info.temporary_fields: + if tmp_field not in temporary_fields: + temporary_fields.append(tmp_field) + + if not temporary_fields: return "" else: contents = "" - for field_name in kernel_info.temporary_fields: + for field_name in temporary_fields: contents += delete_loop.format(original_field_name=field_name[:-len('_tmp')]) return 
temporary_constructor.format(contents=contents, class_name=class_name) @@ -502,6 +674,47 @@ def nested_class_method_definition_prefix(ctx, nested_class_name): return f"{outer_class}::{nested_class_name}" +@jinja2_context_decorator +def generate_parameter_registration(ctx, kernel_infos, parameter_registration): + if parameter_registration is None: + return "" + if not isinstance(kernel_infos, Iterable): + kernel_infos = [kernel_infos] + + params_to_skip = [] + result = [] + for kernel_info in kernel_infos: + for param in kernel_info.parameters: + if not param.is_field_parameter and param.symbol.name not in params_to_skip: + if param.symbol.name in parameter_registration.scaling_info: + result.append(standard_parameter_registration.format(dtype=param.symbol.dtype, + name=param.symbol.name)) + params_to_skip.append(param.symbol.name) + + return "\n".join(result) + + +@jinja2_context_decorator +def generate_constructor(ctx, kernel_infos, parameter_registration): + if parameter_registration is None: + return "" + if not isinstance(kernel_infos, Iterable): + kernel_infos = [kernel_infos] + + params_to_skip = [] + result = [] + for kernel_info in kernel_infos: + for param in kernel_info.parameters: + if not param.is_field_parameter and param.symbol.name not in params_to_skip: + if param.symbol.name in parameter_registration.scaling_info: + name = param.symbol.name + dtype = param.symbol.dtype + result.append(standard_parameter_registration.format(dtype=dtype, name=name)) + params_to_skip.append(name) + + return "\n".join(result) + + def generate_list_of_expressions(expressions, prepend=''): if len(expressions) == 0: return '' @@ -518,7 +731,7 @@ def type_identifier_list(nested_arg_list): def recursive_flatten(arg_list): for s in arg_list: - if isinstance(s, str): + if isinstance(s, str) and len(s) > 0: result.append(s) elif isinstance(s, TypedSymbol): result.append(f"{s.dtype} {s.name}") @@ -555,16 +768,22 @@ def add_pystencils_filters_to_jinja_env(jinja_env): 
jinja_env.filters['generate_definitions'] = generate_definitions jinja_env.filters['generate_declarations'] = generate_declarations jinja_env.filters['generate_members'] = generate_members + jinja_env.filters['generate_plain_parameter_list'] = generate_plain_parameter_list jinja_env.filters['generate_constructor_parameters'] = generate_constructor_parameters jinja_env.filters['generate_constructor_initializer_list'] = generate_constructor_initializer_list jinja_env.filters['generate_constructor_call_arguments'] = generate_constructor_call_arguments jinja_env.filters['generate_call'] = generate_call + jinja_env.filters['generate_function_collection_call'] = generate_function_collection_call jinja_env.filters['generate_block_data_to_field_extraction'] = generate_block_data_to_field_extraction + jinja_env.filters['generate_timestep_advancements'] = generate_timestep_advancements jinja_env.filters['generate_swaps'] = generate_swaps jinja_env.filters['generate_refs_for_kernel_parameters'] = generate_refs_for_kernel_parameters jinja_env.filters['generate_destructor'] = generate_destructor jinja_env.filters['generate_field_type'] = generate_field_type jinja_env.filters['nested_class_method_definition_prefix'] = nested_class_method_definition_prefix + jinja_env.filters['generate_parameter_registration'] = generate_parameter_registration + jinja_env.filters['generate_constructor'] = generate_constructor jinja_env.filters['type_identifier_list'] = type_identifier_list jinja_env.filters['identifier_list'] = identifier_list jinja_env.filters['list_of_expressions'] = generate_list_of_expressions + jinja_env.filters['field_type'] = field_type diff --git a/python/pystencils_walberla/kernel_info.py b/python/pystencils_walberla/kernel_info.py new file mode 100644 index 0000000000000000000000000000000000000000..1382d94f4220495da28bf02113636fdf8addbaf1 --- /dev/null +++ b/python/pystencils_walberla/kernel_info.py @@ -0,0 +1,67 @@ +from functools import reduce + +from pystencils 
import Target + +from pystencils.backends.cbackend import get_headers +from pystencils.backends.cuda_backend import CudaSympyPrinter +from pystencils.typing.typed_sympy import SHAPE_DTYPE +from pystencils.typing import TypedSymbol + +from pystencils_walberla.utility import merge_sorted_lists + + +# TODO KernelInfo and KernelFamily should have same interface +class KernelInfo: + def __init__(self, ast, temporary_fields=(), field_swaps=(), varying_parameters=()): + self.ast = ast + self.temporary_fields = tuple(temporary_fields) + self.field_swaps = tuple(field_swaps) + self.varying_parameters = tuple(varying_parameters) + self.parameters = ast.get_parameters() # cache parameters here + + @property + def fields_accessed(self): + return self.ast.fields_accessed + + def get_ast_attr(self, name): + """Returns the value of an attribute of the AST managed by this KernelInfo. + For compatibility with KernelFamily.""" + return self.ast.__getattribute__(name) + + def get_headers(self): + all_headers = [list(get_headers(self.ast))] + return reduce(merge_sorted_lists, all_headers) + + def generate_kernel_invocation_code(self, **kwargs): + ast = self.ast + ast_params = self.parameters + is_cpu = self.ast.target == Target.CPU + call_parameters = ", ".join([p.symbol.name for p in ast_params]) + + if not is_cpu: + stream = kwargs.get('stream', '0') + spatial_shape_symbols = kwargs.get('spatial_shape_symbols', ()) + + if not spatial_shape_symbols: + spatial_shape_symbols = [p.symbol for p in ast_params if p.is_field_shape] + spatial_shape_symbols.sort(key=lambda e: e.coordinate) + else: + spatial_shape_symbols = [TypedSymbol(s, SHAPE_DTYPE) for s in spatial_shape_symbols] + + assert spatial_shape_symbols, "No shape parameters in kernel function arguments.\n"\ + "Please only use kernels for generic field sizes!" 
+ + indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols) + sp_printer_c = CudaSympyPrinter() + kernel_call_lines = [ + "dim3 _block(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) + for e in indexing_dict['block']), + "dim3 _grid(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) + for e in indexing_dict['grid']), + "internal_%s::%s<<<_grid, _block, 0, %s>>>(%s);" % (ast.function_name, ast.function_name, + stream, call_parameters), + ] + + return "\n".join(kernel_call_lines) + else: + return f"internal_{ast.function_name}::{ast.function_name}({call_parameters});" diff --git a/python/pystencils_walberla/kernel_selection.py b/python/pystencils_walberla/kernel_selection.py index c62f441775edc763f4ad41b2e9c218a5b86930d8..c946f85105185159e317ea18d4740667cd7761c7 100644 --- a/python/pystencils_walberla/kernel_selection.py +++ b/python/pystencils_walberla/kernel_selection.py @@ -8,6 +8,8 @@ from pystencils.backends.cbackend import get_headers from pystencils.backends.cuda_backend import CudaSympyPrinter from pystencils.typing.typed_sympy import SHAPE_DTYPE +from pystencils_walberla.utility import merge_lists_of_symbols, merge_sorted_lists + """ @@ -120,6 +122,41 @@ class AbstractConditionNode(AbstractKernelSelectionNode, ABC): return code +class SwitchNode(AbstractKernelSelectionNode): + def __init__(self, parameter_symbol, cases_dict): + self.cases_dict = cases_dict + self.parameter_symbol = parameter_symbol + + @property + def selection_parameters(self): + return {self.parameter_symbol} + + def collect_kernel_calls(self): + return reduce(lambda x, y: x | y.collect_kernel_calls(), self.cases_dict.values(), set()) + + def collect_selection_parameters(self): + return reduce(lambda x, y: x | y.collect_selection_parameters(), + self.cases_dict.values(), + self.selection_parameters) + + def get_code(self, **kwargs): + def case_code(case, subtree): + code = f"case {case} : {{\n" + code += do_indent(subtree.get_code(**kwargs), width=4, 
first=True) + code += "\n break;\n}" + return code + + cases = [case_code(k, v) for k, v in self.cases_dict.items()] + switch_code = f"switch ({self.parameter_symbol.name}) {{\n" + + switch_body = '\n'.join(cases) + switch_body = do_indent(switch_body, width=4, first=True) + + switch_code += switch_body + switch_code += "default: break; \n}" + return switch_code + + class KernelCallNode(AbstractKernelSelectionNode): def __init__(self, ast): self.ast = ast @@ -192,22 +229,29 @@ class SimpleBooleanCondition(AbstractConditionNode): class KernelFamily: def __init__(self, kernel_selection_tree: AbstractKernelSelectionNode, class_name: str, - temporary_fields=(), field_swaps=(), varying_parameters=()): + temporary_fields=(), field_swaps=(), varying_parameters=(), + field_timestep=None): self.kernel_selection_tree = kernel_selection_tree self.kernel_selection_parameters = kernel_selection_tree.get_selection_parameter_list() self.temporary_fields = tuple(temporary_fields) self.field_swaps = tuple(field_swaps) + self.field_timestep = field_timestep self.varying_parameters = tuple(varying_parameters) all_kernel_calls = self.kernel_selection_tree.collect_kernel_calls() all_param_lists = [k.parameters for k in all_kernel_calls] asts_list = [k.ast for k in all_kernel_calls] self.representative_ast = asts_list[0] + self.target = self.representative_ast.target # Eliminate duplicates self.all_asts = set(asts_list) - # Check function names for uniqueness and reformat them + # TODO due to backward compatibility with high level interface spec + if self.field_timestep is not None: + self.kernel_selection_parameters = [] + + # Check function names for uniqueness and reformat them # using the class name function_names = [ast.function_name.lower() for ast in self.all_asts] unique_names = set(function_names) @@ -258,7 +302,7 @@ class AbstractInterfaceArgumentMapping: raise NotImplementedError() @property - def headers(self): + def headers(self) -> Set: return set() @@ -312,34 +356,4 @@ 
class HighLevelInterfaceSpec: # ---------------------------------- Helpers -------------------------------------------------------------------------- -def merge_sorted_lists(lx, ly, sort_key=lambda x: x, identity_check_key=None): - if identity_check_key is None: - identity_check_key = sort_key - nx = len(lx) - ny = len(ly) - - def recursive_merge(lx_intern, ly_intern, ix_intern, iy_intern): - if ix_intern == nx: - return ly_intern[iy_intern:] - if iy_intern == ny: - return lx_intern[ix_intern:] - x = lx_intern[ix_intern] - y = ly_intern[iy_intern] - skx = sort_key(x) - sky = sort_key(y) - if skx == sky: - if identity_check_key(x) == identity_check_key(y): - return [x] + recursive_merge(lx_intern, ly_intern, ix_intern + 1, iy_intern + 1) - else: - raise ValueError(f'Elements <{x}> and <{y}> with equal sort key where not identical!') - elif skx < sky: - return [x] + recursive_merge(lx_intern, ly_intern, ix_intern + 1, iy_intern) - else: - return [y] + recursive_merge(lx_intern, ly_intern, ix_intern, iy_intern + 1) - return recursive_merge(lx, ly, 0, 0) - -def merge_lists_of_symbols(lists): - def merger(lx, ly): - return merge_sorted_lists(lx, ly, sort_key=lambda x: x.symbol.name, identity_check_key=lambda x: x.symbol) - return reduce(merger, lists) diff --git a/python/pystencils_walberla/pack_info.py b/python/pystencils_walberla/pack_info.py new file mode 100644 index 0000000000000000000000000000000000000000..221a946e004143f0f02c3a2663df6726add4027f --- /dev/null +++ b/python/pystencils_walberla/pack_info.py @@ -0,0 +1,288 @@ +from collections import OrderedDict, defaultdict +from dataclasses import replace +from itertools import product +from typing import Dict, Optional, Sequence, Tuple + +from jinja2 import Environment, PackageLoader, StrictUndefined + +from pystencils import Assignment, AssignmentCollection, Field, FieldType, Target, create_kernel +from pystencils.backends.cbackend import get_headers +from pystencils.stencil import inverse_direction, 
offset_to_direction_string + +from pystencils_walberla.cmake_integration import CodeGenerationContext +from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env +from pystencils_walberla.kernel_info import KernelInfo +from pystencils_walberla.utility import config_from_context + + +def generate_pack_info_for_field(ctx: CodeGenerationContext, class_name: str, field: Field, + direction_subset: Optional[Tuple[Tuple[int, int, int]]] = None, + operator=None, gl_to_inner=False, + target=Target.CPU, data_type=None, cpu_openmp=False, + **create_kernel_params): + """Creates a pack info for a pystencils field assuming a pull-type stencil, packing all cell elements. + + Args: + ctx: see documentation of `generate_sweep` + class_name: name of the generated class + field: pystencils field for which to generate pack info + direction_subset: optional sequence of directions for which values should be packed + otherwise a D3Q27 stencil is assumed + operator: optional operator for, e.g., reduction pack infos + gl_to_inner: communicates values from ghost layers of sender to interior of receiver + target: An pystencils Target to define cpu or gpu code generation. See pystencils.Target + data_type: default datatype for the kernel creation. Default is double + cpu_openmp: if loops should use openMP or not. 
+ **create_kernel_params: remaining keyword arguments are passed to `pystencils.create_kernel` + """ + + if not direction_subset: + direction_subset = tuple((i, j, k) for i, j, k in product(*[(-1, 0, 1)] * 3)) + + all_index_accesses = [field(*ind) for ind in product(*[range(s) for s in field.index_shape])] + return generate_pack_info(ctx, class_name, {direction_subset: all_index_accesses}, operator=operator, + gl_to_inner=gl_to_inner, target=target, data_type=data_type, cpu_openmp=cpu_openmp, + **create_kernel_params) + + +def generate_pack_info_from_kernel(ctx: CodeGenerationContext, class_name: str, assignments: Sequence[Assignment], + kind='pull', operator=None, target=Target.CPU, data_type=None, cpu_openmp=False, + **create_kernel_params): + """Generates a waLBerla GPU PackInfo from a (pull) kernel. + + Args: + ctx: see documentation of `generate_sweep` + class_name: name of the generated class + assignments: list of assignments from the compute kernel - generates PackInfo for "pull" part only + i.e. the kernel is expected to only write to the center + kind: can either be pull or push + operator: optional operator for, e.g., reduction pack infos + target: An pystencils Target to define cpu or gpu code generation. See pystencils.Target + data_type: default datatype for the kernel creation. Default is double + cpu_openmp: if loops should use openMP or not. 
+ **create_kernel_params: remaining keyword arguments are passed to `pystencils.create_kernel` + """ + assert kind in ('push', 'pull') + reads = set() + writes = set() + + if isinstance(assignments, AssignmentCollection): + assignments = assignments.all_assignments + + for a in assignments: + if not isinstance(a, Assignment): + continue + reads.update(a.rhs.atoms(Field.Access)) + writes.update(a.lhs.atoms(Field.Access)) + spec = defaultdict(set) + if kind == 'pull': + for fa in reads: + assert all(abs(e) <= 1 for e in fa.offsets) + if all(offset == 0 for offset in fa.offsets): + continue + comm_direction = inverse_direction(fa.offsets) + for comm_dir in _comm_directions(comm_direction): + spec[(comm_dir,)].add(fa.field.center(*fa.index)) + elif kind == 'push': + for fa in writes: + assert all(abs(e) <= 1 for e in fa.offsets) + if all(offset == 0 for offset in fa.offsets): + continue + for comm_dir in _comm_directions(fa.offsets): + spec[(comm_dir,)].add(fa) + else: + raise ValueError("Invalid 'kind' parameter") + return generate_pack_info(ctx, class_name, spec, operator=operator, + target=target, data_type=data_type, cpu_openmp=cpu_openmp, **create_kernel_params) + + +def generate_pack_info(ctx: CodeGenerationContext, class_name: str, + directions_to_pack_terms: Dict[Tuple[Tuple], Sequence[Field.Access]], + namespace='pystencils', operator=None, gl_to_inner=False, + target=Target.CPU, data_type=None, cpu_openmp=False, + **create_kernel_params): + """Generates a waLBerla GPU PackInfo + + Args: + ctx: see documentation of `generate_sweep` + class_name: name of the generated class + directions_to_pack_terms: maps tuples of directions to read field accesses, specifying which values have to be + packed for which direction + namespace: inner namespace of the generated class + operator: optional operator for, e.g., reduction pack infos + gl_to_inner: communicates values from ghost layers of sender to interior of receiver + target: An pystencils Target to define cpu or gpu 
code generation. See pystencils.Target + data_type: default datatype for the kernel creation. Default is double + cpu_openmp: if loops should use openMP or not. + **create_kernel_params: remaining keyword arguments are passed to `pystencils.create_kernel` + """ + if cpu_openmp: + raise ValueError("The packing kernels are already called inside an OpenMP parallel region. Thus " + "additionally parallelising each kernel is not supported.") + items = [(e[0], sorted(e[1], key=lambda x: str(x))) for e in directions_to_pack_terms.items()] + items = sorted(items, key=lambda e: e[0]) + directions_to_pack_terms = OrderedDict(items) + + config = config_from_context(ctx, target=target, data_type=data_type, cpu_openmp=cpu_openmp, + **create_kernel_params) + + config_zero_gl = config_from_context(ctx, target=target, data_type=data_type, cpu_openmp=cpu_openmp, + ghost_layers=0, **create_kernel_params) + + # Vectorisation of the pack info is not implemented. + config = replace(config, cpu_vectorize_info=None) + config_zero_gl = replace(config_zero_gl, cpu_vectorize_info=None) + + config = replace(config, allow_double_writes=True) + config_zero_gl = replace(config_zero_gl, allow_double_writes=True) + + template_name = "CpuPackInfo.tmpl" if config.target == Target.CPU else 'GpuPackInfo.tmpl' + + fields_accessed = set() + for terms in directions_to_pack_terms.values(): + for term in terms: + assert isinstance(term, Field.Access) # and all(e == 0 for e in term.offsets) + fields_accessed.add(term) + + field_names = {fa.field.name for fa in fields_accessed} + + data_types = {fa.field.dtype for fa in fields_accessed} + if len(data_types) == 0: + raise ValueError("No fields to pack!") + if len(data_types) != 1: + err_detail = "\n".join(f" - {f.name} [{f.dtype}]" for f in fields_accessed) + raise NotImplementedError("Fields of different data types are used - this is not supported.\n" + err_detail) + dtype = data_types.pop() + + pack_kernels = OrderedDict() + unpack_kernels = OrderedDict() 
+ all_accesses = set() + elements_per_cell = OrderedDict() + for direction_set, terms in directions_to_pack_terms.items(): + for d in direction_set: + if not all(abs(i) <= 1 for i in d): + raise NotImplementedError("Only first neighborhood supported") + + buffer = Field.create_generic('buffer', spatial_dimensions=1, field_type=FieldType.BUFFER, + dtype=dtype.numpy_dtype, index_shape=(len(terms),)) + + direction_strings = tuple(offset_to_direction_string(d) for d in direction_set) + all_accesses.update(terms) + + pack_assignments = [Assignment(buffer(i), term) for i, term in enumerate(terms)] + pack_ast = create_kernel(pack_assignments, config=config_zero_gl) + pack_ast.function_name = 'pack_{}'.format("_".join(direction_strings)) + if operator is None: + unpack_assignments = [Assignment(term, buffer(i)) for i, term in enumerate(terms)] + else: + unpack_assignments = [Assignment(term, operator(term, buffer(i))) for i, term in enumerate(terms)] + unpack_ast = create_kernel(unpack_assignments, config=config_zero_gl) + unpack_ast.function_name = 'unpack_{}'.format("_".join(direction_strings)) + + pack_kernels[direction_strings] = KernelInfo(pack_ast) + unpack_kernels[direction_strings] = KernelInfo(unpack_ast) + elements_per_cell[direction_strings] = len(terms) + fused_kernel = create_kernel([Assignment(buffer.center, t) for t in all_accesses], config=config) + + jinja_context = { + 'class_name': class_name, + 'pack_kernels': pack_kernels, + 'unpack_kernels': unpack_kernels, + 'fused_kernel': KernelInfo(fused_kernel), + 'elements_per_cell': elements_per_cell, + 'headers': get_headers(fused_kernel), + 'target': config.target.name.lower(), + 'dtype': dtype, + 'field_name': field_names.pop(), + 'namespace': namespace, + 'gl_to_inner': gl_to_inner, + } + env = Environment(loader=PackageLoader('pystencils_walberla'), undefined=StrictUndefined) + add_pystencils_filters_to_jinja_env(env) + header = env.get_template(template_name + ".h").render(**jinja_context) + source = 
env.get_template(template_name + ".cpp").render(**jinja_context) + + source_extension = "cpp" if config.target == Target.CPU else "cu" + ctx.write_file(f"{class_name}.h", header) + ctx.write_file(f"{class_name}.{source_extension}", source) + + +def generate_mpidtype_info_from_kernel(ctx: CodeGenerationContext, class_name: str, + assignments: Sequence[Assignment], kind='pull', namespace='pystencils'): + assert kind in ('push', 'pull') + reads = set() + writes = set() + + if isinstance(assignments, AssignmentCollection): + assignments = assignments.all_assignments + + for a in assignments: + if not isinstance(a, Assignment): + continue + reads.update(a.rhs.atoms(Field.Access)) + writes.update(a.lhs.atoms(Field.Access)) + + spec = defaultdict(set) + if kind == 'pull': + read_fields = set(fa.field for fa in reads) + assert len(read_fields) == 1, "Only scenarios where one fields neighbors are accessed" + field = read_fields.pop() + for fa in reads: + assert all(abs(e) <= 1 for e in fa.offsets) + if all(offset == 0 for offset in fa.offsets): + continue + comm_direction = inverse_direction(fa.offsets) + for comm_dir in _comm_directions(comm_direction): + assert len(fa.index) == 1, "Supports only fields with a single index dimension" + spec[(offset_to_direction_string(comm_dir),)].add(fa.index[0]) + elif kind == 'push': + written_fields = set(fa.field for fa in writes) + assert len(written_fields) == 1, "Only scenarios where one fields neighbors are accessed" + field = written_fields.pop() + + for fa in writes: + assert all(abs(e) <= 1 for e in fa.offsets) + if all(offset == 0 for offset in fa.offsets): + continue + for comm_dir in _comm_directions(fa.offsets): + assert len(fa.index) == 1, "Supports only fields with a single index dimension" + spec[(offset_to_direction_string(comm_dir),)].add(fa.index[0]) + else: + raise ValueError("Invalid 'kind' parameter") + + jinja_context = { + 'class_name': class_name, + 'namespace': namespace, + 'kind': kind, + 'field_name': 
field.name, + 'f_size': field.index_shape[0], + 'spec': spec, + } + env = Environment(loader=PackageLoader('pystencils_walberla'), undefined=StrictUndefined) + header = env.get_template("MpiDtypeInfo.tmpl.h").render(**jinja_context) + ctx.write_file(f"{class_name}.h", header) + + +# ---------------------------------- Internal -------------------------------------------------------------------------- + +def _comm_directions(direction): + if all(e == 0 for e in direction): + yield direction + binary_numbers_list = binary_numbers(len(direction)) + for comm_direction in binary_numbers_list: + for i in range(len(direction)): + if direction[i] == 0: + comm_direction[i] = 0 + if direction[i] == -1 and comm_direction[i] == 1: + comm_direction[i] = -1 + if not all(e == 0 for e in comm_direction): + yield tuple(comm_direction) + + +def binary_numbers(n): + result = list() + for i in range(1 << n): + binary_number = bin(i)[2:] + binary_number = '0' * (n - len(binary_number)) + binary_number + result.append((list(map(int, binary_number)))) + return result diff --git a/python/pystencils_walberla/sweep.py b/python/pystencils_walberla/sweep.py new file mode 100644 index 0000000000000000000000000000000000000000..ddf9a2a52b0de504394becdf99127a06f866383d --- /dev/null +++ b/python/pystencils_walberla/sweep.py @@ -0,0 +1,199 @@ +from typing import Callable, Sequence + +from jinja2 import Environment, PackageLoader, StrictUndefined + +from pystencils import Target, Assignment +from pystencils import Field, create_kernel, create_staggered_kernel +from pystencils.astnodes import KernelFunction + +from pystencils_walberla.cmake_integration import CodeGenerationContext +from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env +from pystencils_walberla.kernel_selection import KernelCallNode, KernelFamily, HighLevelInterfaceSpec +from pystencils_walberla.utility import config_from_context + + +def generate_sweep(ctx: CodeGenerationContext, class_name: str, 
assignments: Sequence[Assignment], + namespace: str = 'pystencils', field_swaps=(), staggered=False, varying_parameters=(), + inner_outer_split=False, ghost_layers_to_include=0, + target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None, max_threads=None, + **create_kernel_params): + """Generates a waLBerla sweep from a pystencils representation. + + The constructor of the C++ sweep class expects all kernel parameters (fields and parameters) in alphabetical order. + Fields have to passed using BlockDataID's pointing to walberla fields + + Args: + ctx: build system context filled with information from waLBerla's CMake. The context for example + defines where to write generated files, if OpenMP is available or which SIMD instruction + set should be used. See waLBerla examples on how to get a context. + class_name: name of the generated sweep class + assignments: list of assignments defining the stencil update rule or a :class:`KernelFunction` + namespace: the generated class is accessible as walberla::<namespace>::<class_name> + field_swaps: sequence of field pairs (field, temporary_field). The generated sweep only gets the first field + as argument, creating a temporary field internally which is swapped with the first field after + each iteration. + staggered: set to True to create staggered kernels with `pystencils.create_staggered_kernel` + varying_parameters: Depending on the configuration, the generated kernels may receive different arguments for + different setups. To not have to adapt the C++ application when then parameter change, + the varying_parameters sequence can contain parameter names, which are always expected by + the C++ class constructor even if the kernel does not need them. + inner_outer_split: if True generate a sweep that supports separate iteration over inner and outer regions + to allow for communication hiding. + ghost_layers_to_include: determines how many ghost layers should be included for the Sweep. 
+ This is relevant if a setter kernel should also set correct values to the ghost layers. + target: An pystencils Target to define cpu or gpu code generation. See pystencils.Target + data_type: default datatype for the kernel creation. Default is double + cpu_openmp: if loops should use openMP or not. + cpu_vectorize_info: dictionary containing necessary information for the usage of a SIMD instruction set. + max_threads: only relevant for GPU kernels. Will be argument of `__launch_bounds__` + **create_kernel_params: remaining keyword arguments are passed to `pystencils.create_kernel` + """ + if staggered: + assert 'omp_single_loop' not in create_kernel_params + create_kernel_params['omp_single_loop'] = False + config = config_from_context(ctx, target=target, data_type=data_type, cpu_openmp=cpu_openmp, + cpu_vectorize_info=cpu_vectorize_info, **create_kernel_params) + + if isinstance(assignments, KernelFunction): + ast = assignments + target = ast.target + elif not staggered: + ast = create_kernel(assignments, config=config) + else: + # This should not be necessary but create_staggered_kernel does not take a config at the moment ... 
+ ast = create_staggered_kernel(assignments, **config.__dict__) + + ast.function_name = class_name.lower() + + selection_tree = KernelCallNode(ast) + generate_selective_sweep(ctx, class_name, selection_tree, target=target, namespace=namespace, + field_swaps=field_swaps, varying_parameters=varying_parameters, + inner_outer_split=inner_outer_split, ghost_layers_to_include=ghost_layers_to_include, + cpu_vectorize_info=config.cpu_vectorize_info, + cpu_openmp=config.cpu_openmp, max_threads=max_threads) + + +def generate_selective_sweep(ctx, class_name, selection_tree, interface_mappings=(), target=None, + namespace='pystencils', field_swaps=(), varying_parameters=(), + inner_outer_split=False, ghost_layers_to_include=0, + cpu_vectorize_info=None, cpu_openmp=False, max_threads=None): + """Generates a selective sweep from a kernel selection tree. A kernel selection tree consolidates multiple + pystencils ASTs in a tree-like structure. See also module `pystencils_walberla.kernel_selection`. + + Args: + ctx: see documentation of `generate_sweep` + class_name: name of the generated sweep class + selection_tree: Instance of `AbstractKernelSelectionNode`, root of the selection tree + interface_mappings: sequence of `AbstractInterfaceArgumentMapping` instances for selection arguments of + the selection tree + target: `None`, `Target.CPU` or `Target.GPU`; inferred from kernels if `None` is given. + namespace: see documentation of `generate_sweep` + field_swaps: see documentation of `generate_sweep` + varying_parameters: see documentation of `generate_sweep` + inner_outer_split: see documentation of `generate_sweep` + ghost_layers_to_include: see documentation of `generate_sweep` + cpu_vectorize_info: Dictionary containing information about CPU vectorization applied to the kernels + cpu_openmp: Whether or not CPU kernels use OpenMP parallelization + max_threads: only relevant for GPU kernels. 
Will be argument of `__launch_bounds__` + """ + def to_name(f): + return f.name if isinstance(f, Field) else f + + field_swaps = tuple((to_name(e[0]), to_name(e[1])) for e in field_swaps) + temporary_fields = tuple(e[1] for e in field_swaps) + + kernel_family = KernelFamily(selection_tree, class_name, + temporary_fields, field_swaps, varying_parameters) + + if target is None: + target = kernel_family.get_ast_attr('target') + elif target != kernel_family.get_ast_attr('target'): + raise ValueError('Mismatch between target parameter and AST targets.') + + if not ctx.gpu and target == Target.GPU: + return + + representative_field = {p.field_name for p in kernel_family.parameters if p.is_field_parameter} + representative_field = sorted(representative_field)[0] + + env = Environment(loader=PackageLoader('pystencils_walberla'), undefined=StrictUndefined) + add_pystencils_filters_to_jinja_env(env) + + interface_spec = HighLevelInterfaceSpec(kernel_family.kernel_selection_parameters, interface_mappings) + + jinja_context = { + 'kernel': kernel_family, + 'namespace': namespace, + 'class_name': class_name, + 'target': target.name.lower(), + 'field': representative_field, + 'ghost_layers_to_include': ghost_layers_to_include, + 'inner_outer_split': inner_outer_split, + 'interface_spec': interface_spec, + 'generate_functor': True, + 'cpu_vectorize_info': cpu_vectorize_info, + 'cpu_openmp': cpu_openmp, + 'max_threads': max_threads + } + header = env.get_template("Sweep.tmpl.h").render(**jinja_context) + source = env.get_template("Sweep.tmpl.cpp").render(**jinja_context) + + source_extension = "cpp" if target == Target.CPU else "cu" + ctx.write_file(f"{class_name}.h", header) + ctx.write_file(f"{class_name}.{source_extension}", source) + + +def generate_sweep_collection(ctx, class_name: str, function_generators: Sequence[Callable], parameter_scaling=None): + """Generates a sweep collection + """ + + contexts_function_generators = list() + for fct in function_generators: + 
contexts_function_generators.append(fct()) + + namespaces = set([context['namespace'] for context in contexts_function_generators]) + assert len(namespaces) == 1, "All function_generators must output the same namespace!" + namespace = namespaces.pop() + + headers = set() + for context in contexts_function_generators: + for header in context['interface_spec'].headers: + headers.add(header) + for header in context['kernel'].get_headers(): + headers.add(header) + + kernel_list = list() + for context in contexts_function_generators: + kernel_list.append(context['kernel']) + + kernels = list() + for context in contexts_function_generators: + kernels.append({ + 'kernel': context['kernel'], + 'function_name': context['function_name'], + 'ghost_layers_to_include': 'ghost_layers', + 'field': context['field'], + 'max_threads': context['max_threads'] + }) + + target = kernels[0]['kernel'].target + + jinja_context = { + 'kernel_list': kernel_list, + 'kernels': kernels, + 'namespace': namespace, + 'class_name': class_name, + 'headers': headers, + 'target': target.name.lower(), + 'parameter_scaling': parameter_scaling, + } + + env = Environment(loader=PackageLoader('pystencils_walberla'), undefined=StrictUndefined) + add_pystencils_filters_to_jinja_env(env) + + header = env.get_template("SweepCollection.tmpl.h").render(**jinja_context) + source = env.get_template("SweepCollection.tmpl.cpp").render(**jinja_context) + + source_extension = "cpp" if target == Target.CPU else "cu" + ctx.write_file(f"{class_name}.h", header) + ctx.write_file(f"{class_name}.{source_extension}", source) diff --git a/python/pystencils_walberla/templates/Boundary.tmpl.cpp b/python/pystencils_walberla/templates/Boundary.tmpl.cpp index b4fe6df4794d61f06c2a1a900879871e7ac9f14d..644202ba67cd574724e46ef2b42e60535dc2e5c6 100644 --- a/python/pystencils_walberla/templates/Boundary.tmpl.cpp +++ b/python/pystencils_walberla/templates/Boundary.tmpl.cpp @@ -17,8 +17,6 @@ //! 
\\author pystencils //====================================================================================================================== -#include <cmath> - #include "core/DataTypes.h" #include "core/Macros.h" #include "{{class_name}}.h" @@ -53,9 +51,9 @@ namespace {{namespace}} { #pragma diag_suppress 177 #endif #endif - +//NOLINTBEGIN(readability-non-const-parameter*) {{kernel|generate_definitions(target)}} - +//NOLINTEND(readability-non-const-parameter*) #ifdef __GNUC__ #pragma GCC diagnostic pop #endif @@ -85,6 +83,7 @@ void {{class_name}}::run_impl( uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); {{kernel|generate_block_data_to_field_extraction(['indexVector', 'indexVectorSize'])|indent(4)}} + {{kernel|generate_timestep_advancements|indent(4)}} {{kernel|generate_refs_for_kernel_parameters(prefix='', parameters_to_ignore=['indexVectorSize'], ignore_fields=True)|indent(4) }} {{kernel|generate_call(spatial_shape_symbols=['indexVectorSize'], stream='stream')|indent(4)}} } diff --git a/python/pystencils_walberla/templates/CpuPackInfo.tmpl.cpp b/python/pystencils_walberla/templates/CpuPackInfo.tmpl.cpp index d56ec573032eaddba9ba9b959883a864a3f3ce63..0191994f3f3a29ef9384b2a2270294be9df59f43 100644 --- a/python/pystencils_walberla/templates/CpuPackInfo.tmpl.cpp +++ b/python/pystencils_walberla/templates/CpuPackInfo.tmpl.cpp @@ -1,3 +1,22 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file {{class_name}}.cpp +//! \\author pystencils +//====================================================================================================================== + #include "stencil/Directions.h" #include "core/cell/CellInterval.h" #include "core/DataTypes.h" diff --git a/python/pystencils_walberla/templates/CpuPackInfo.tmpl.h b/python/pystencils_walberla/templates/CpuPackInfo.tmpl.h index d25c04b2b782fe891de361356aa046554d32f1ae..66114de6ee87d58f37d08ef2e39251a2f1060717 100644 --- a/python/pystencils_walberla/templates/CpuPackInfo.tmpl.h +++ b/python/pystencils_walberla/templates/CpuPackInfo.tmpl.h @@ -1,3 +1,22 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file {{class_name}}.h +//! 
\\author pystencils +//====================================================================================================================== + #pragma once #include "stencil/Directions.h" #include "core/cell/CellInterval.h" diff --git a/python/pystencils_walberla/templates/GpuPackInfo.tmpl.cpp b/python/pystencils_walberla/templates/GpuPackInfo.tmpl.cpp index 054d589ecbc43addfbd20a6009c65d873f56e802..19b7b11ed507f8f068a3deb5908a1ca6fe867711 100644 --- a/python/pystencils_walberla/templates/GpuPackInfo.tmpl.cpp +++ b/python/pystencils_walberla/templates/GpuPackInfo.tmpl.cpp @@ -1,10 +1,22 @@ -#include "core/DataTypes.h" -#include "core/cell/CellInterval.h" +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file {{class_name}}.cpp +//! 
\\author pystencils +//====================================================================================================================== -#include "stencil/Directions.h" - -#include "gpu/GPUField.h" -#include "gpu/GPUWrapper.h" #include "{{class_name}}.h" {% if target is equalto 'cpu' -%} diff --git a/python/pystencils_walberla/templates/GpuPackInfo.tmpl.h b/python/pystencils_walberla/templates/GpuPackInfo.tmpl.h index 2b182905cd8584794ba53108f072f1da5abb37bc..b301bced5b8bd159c028e6e75c26fd37df5a63b2 100644 --- a/python/pystencils_walberla/templates/GpuPackInfo.tmpl.h +++ b/python/pystencils_walberla/templates/GpuPackInfo.tmpl.h @@ -1,4 +1,24 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file {{class_name}}.h +//! 
\\author pystencils +//====================================================================================================================== + #pragma once + #include "core/DataTypes.h" #include "core/cell/CellInterval.h" @@ -36,9 +56,13 @@ public: {}; virtual ~{{class_name}}() {} - virtual void pack (stencil::Direction dir, unsigned char * buffer, IBlock * block, gpuStream_t stream); - virtual void unpack(stencil::Direction dir, unsigned char * buffer, IBlock * block, gpuStream_t stream); - virtual uint_t size (stencil::Direction dir, IBlock * block); + void pack (stencil::Direction dir, unsigned char * buffer, IBlock * block, gpuStream_t stream) override; + void communicateLocal ( stencil::Direction /*dir*/, const IBlock* /* sender */, IBlock* /* receiver */, gpuStream_t /* stream */ ) override + { + WALBERLA_ABORT("Local Communication not implemented yet for standard PackInfos. To run your application turn of local communication in the Communication class") + } + void unpack(stencil::Direction dir, unsigned char * buffer, IBlock * block, gpuStream_t stream) override; + uint_t size (stencil::Direction dir, IBlock * block) override; private: {{fused_kernel|generate_members(parameters_to_ignore=['buffer'])|indent(4)}} diff --git a/python/pystencils_walberla/templates/MpiDtypeInfo.tmpl.h b/python/pystencils_walberla/templates/MpiDtypeInfo.tmpl.h index 3f9cbb2e659f58eb0b6ae1ff7dcb0e5b1cf0a8e5..860ea49717b76efbe205698a1eb14ed3c0d71797 100644 --- a/python/pystencils_walberla/templates/MpiDtypeInfo.tmpl.h +++ b/python/pystencils_walberla/templates/MpiDtypeInfo.tmpl.h @@ -1,3 +1,22 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file {{class_name}}.h +//! \\author pystencils +//====================================================================================================================== + #pragma once #include "core/debug/Debug.h" diff --git a/python/pystencils_walberla/templates/Sweep.tmpl.cpp b/python/pystencils_walberla/templates/Sweep.tmpl.cpp index 10e180fb56a916f79352660fd0bbdd0c3b136c01..8f3e14e59074a2f483fe14c5f85eb3e352c0a836 100644 --- a/python/pystencils_walberla/templates/Sweep.tmpl.cpp +++ b/python/pystencils_walberla/templates/Sweep.tmpl.cpp @@ -14,8 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \\file {{class_name}}.cpp -//! \\ingroup lbm -//! \\author lbmpy +//! 
\\author pystencils //====================================================================================================================== #include <cmath> diff --git a/python/pystencils_walberla/templates/Sweep.tmpl.h b/python/pystencils_walberla/templates/Sweep.tmpl.h index 599ade337a0b9ebf08d67d247bc4e0474c2b7ead..e0b773ab1b1ab656a8db81ae10459d01b84766a9 100644 --- a/python/pystencils_walberla/templates/Sweep.tmpl.h +++ b/python/pystencils_walberla/templates/Sweep.tmpl.h @@ -19,6 +19,7 @@ #pragma once #include "core/DataTypes.h" +#include "core/logging/Logging.h" {% if target is equalto 'cpu' -%} #include "field/GhostLayerField.h" diff --git a/python/pystencils_walberla/templates/SweepCollection.tmpl.cpp b/python/pystencils_walberla/templates/SweepCollection.tmpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a9a1c28434bff3d257ca2bf9c76bd4fa20d9f1db --- /dev/null +++ b/python/pystencils_walberla/templates/SweepCollection.tmpl.cpp @@ -0,0 +1,69 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file {{class_name}}.cpp +//! 
\\author pystencils +//====================================================================================================================== +#include "{{class_name}}.h" + +{% if target is equalto 'cpu' -%} +#define FUNC_PREFIX +{%- elif target is equalto 'gpu' -%} +#define FUNC_PREFIX __global__ +{%- endif %} + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-variable" +#endif + +#if ( defined WALBERLA_CXX_COMPILER_IS_INTEL ) +#pragma warning push +#pragma warning( disable : 1599 ) +#endif + +using namespace std; + +namespace walberla { +namespace {{namespace}} { + +{% for kernel in kernels %} +{{kernel['kernel']|generate_definitions(target, kernel['max_threads'])}} +{% endfor %} + + +{% for kernel in kernels %} +void {{class_name}}::{{kernel['function_name']}}( {{kernel['kernel']|generate_plain_parameter_list(ghost_layers=True)}} ) +{ + {{kernel['kernel']|generate_call(ghost_layers_to_include=kernel['ghost_layers_to_include'], stream='stream')|indent(3)}} +} +void {{class_name}}::{{kernel['function_name']}}CellInterval( {{kernel['kernel']|generate_plain_parameter_list(cell_interval='ci')}}) +{ + {{kernel['kernel']|generate_call(stream='stream', cell_interval='ci')|indent(3)}} +} +{% endfor %} + + +} // namespace {{namespace}} +} // namespace walberla + + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic pop +#endif + +#if ( defined WALBERLA_CXX_COMPILER_IS_INTEL ) +#pragma warning pop +#endif diff --git a/python/pystencils_walberla/templates/SweepCollection.tmpl.h b/python/pystencils_walberla/templates/SweepCollection.tmpl.h new file mode 100644 index 0000000000000000000000000000000000000000..5db4ccb33457efcc2f9f9385d0f2b32db35aef5e --- /dev/null +++ b/python/pystencils_walberla/templates/SweepCollection.tmpl.h @@ -0,0 +1,298 @@ 
+//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file {{class_name}}.h +//! \\author pystencils +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" +#include "core/logging/Logging.h" +#include "core/Macros.h" + +{% if target is equalto 'gpu' -%} +#include "gpu/GPUField.h" +#include "gpu/ParallelStreams.h" +{%- endif %} + +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "domain_decomposition/StructuredBlockStorage.h" + +#include "field/SwapableCompare.h" +#include "field/GhostLayerField.h" + +#include <set> +#include <cmath> + +{% for header in headers %} +#include {{header}} +{% endfor %} + +using namespace std::placeholders; + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-parameter" +# pragma GCC diagnostic ignored "-Wreorder" +#endif + +namespace walberla { +namespace {{namespace}} { + + 
+class {{class_name}} +{ +public: + enum Type { ALL = 0, INNER = 1, OUTER = 2 }; + + {{class_name}}(const shared_ptr< StructuredBlockStorage > & blocks, {{kernel_list|generate_constructor_parameters}}, const Cell & outerWidth=Cell(1, 1, 1)) + : blocks_(blocks), {{ kernel_list|generate_constructor_initializer_list(parameter_registration=parameter_scaling) }}, outerWidth_(outerWidth) + { + {{kernel_list|generate_constructor(parameter_registration=parameter_scaling) |indent(6)}} + + for (auto& iBlock : *blocks) + { + if (int_c(blocks->getNumberOfXCells(iBlock)) <= outerWidth_[0] * 2 || + int_c(blocks->getNumberOfYCells(iBlock)) <= outerWidth_[1] * 2 || + int_c(blocks->getNumberOfZCells(iBlock)) <= outerWidth_[2] * 2) + WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock") + } + }; + + {{ kernel_list| generate_destructor(class_name) |indent(4) }} + + /************************************************************************************* + * Internal Function Definitions with raw Pointer + *************************************************************************************/ + + {%- for kernel in kernels %} + static void {{kernel['function_name']}} ({{kernel['kernel']|generate_plain_parameter_list(ghost_layers=0, stream="nullptr")}}); + static void {{kernel['function_name']}}CellInterval ({{kernel['kernel']|generate_plain_parameter_list(cell_interval='ci', stream="nullptr")}}); + {% endfor %} + + /************************************************************************************* + * Function Definitions for external Usage + *************************************************************************************/ + + {%- for kernel in kernels %} + + std::function<void (IBlock *)> {{kernel['function_name']}}() + { + return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", ] | type_identifier_list -}}); }; + } + + std::function<void (IBlock *)> 
{{kernel['function_name']}}({{- ["Type type", ] | type_identifier_list -}}) + { + switch (type) + { + case Type::INNER: + return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Inner({{- ["block", ] | type_identifier_list -}}); }; + case Type::OUTER: + return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", ] | type_identifier_list -}}); }; + default: + return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", ] | type_identifier_list -}}); }; + } + } + + std::function<void (IBlock *)> {{kernel['function_name']}}({{- ["Type type", "const cell_idx_t ghost_layers"] | type_identifier_list -}}) + { + switch (type) + { + case Type::INNER: + return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Inner({{- ["block", ] | type_identifier_list -}}); }; + case Type::OUTER: + return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", ] | type_identifier_list -}}); }; + default: + return [{{- ["this", "ghost_layers"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", "ghost_layers"] | type_identifier_list -}}); }; + } + } + + {% if target is equalto 'gpu' -%} + std::function<void (IBlock *)> {{kernel['function_name']}}({{- ["Type type", "const cell_idx_t ghost_layers", "gpuStream_t gpuStream"] | type_identifier_list -}}) + { + switch (type) + { + case Type::INNER: + return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Inner({{- ["block", "gpuStream"] | type_identifier_list -}}); }; + case Type::OUTER: + return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", "gpuStream"] | type_identifier_list -}}); }; + default: + return [{{- ["this", "ghost_layers", "gpuStream"] | 
type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", "ghost_layers", "gpuStream"] | type_identifier_list -}}); }; + } + } + + std::function<void (IBlock *)> {{kernel['function_name']}}({{- ["Type type", "gpuStream_t gpuStream"] | type_identifier_list -}}) + { + switch (type) + { + case Type::INNER: + return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Inner({{- ["block", "gpuStream"] | type_identifier_list -}}); }; + case Type::OUTER: + return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", "gpuStream"] | type_identifier_list -}}); }; + default: + return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", "cell_idx_c(0)", "gpuStream"] | type_identifier_list -}}); }; + } + } + {%- endif %} + + void {{kernel['function_name']}}({{- ["IBlock * block",] | type_identifier_list -}}) + { + const cell_idx_t ghost_layers = 0; + {% if target is equalto 'gpu' -%} + gpuStream_t gpuStream = nullptr; + {%- endif %} + + {{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}} + {{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, parameter_registration=parameter_scaling)|indent(6)}} + {{kernel['kernel']|generate_timestep_advancements|indent(6)}} + {{kernel['function_name']}}({{kernel['kernel']|generate_function_collection_call(ghost_layers='ghost_layers')}}); + {{kernel['kernel']|generate_swaps|indent(6)}} + } + + void {{kernel['function_name']}}({{- ["IBlock * block", "const cell_idx_t ghost_layers"] | type_identifier_list -}}) + { + {% if target is equalto 'gpu' -%} + gpuStream_t gpuStream = nullptr; + {%- endif %} + + {{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}} + {{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, 
parameter_registration=parameter_scaling)|indent(6)}} + {{kernel['kernel']|generate_timestep_advancements|indent(6)}} + {{kernel['function_name']}}({{kernel['kernel']|generate_function_collection_call(ghost_layers='ghost_layers')}}); + {{kernel['kernel']|generate_swaps|indent(6)}} + } + + {% if target is equalto 'gpu' -%} + void {{kernel['function_name']}}({{- ["IBlock * block", "const cell_idx_t ghost_layers", "gpuStream_t gpuStream"] | type_identifier_list -}}) + { + {{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}} + {{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, parameter_registration=parameter_scaling)|indent(6)}} + {{kernel['kernel']|generate_timestep_advancements|indent(6)}} + {{kernel['function_name']}}({{kernel['kernel']|generate_function_collection_call(ghost_layers='ghost_layers')}}); + {{kernel['kernel']|generate_swaps|indent(6)}} + } + {%- endif %} + + void {{kernel['function_name']}}CellInterval({{- ["IBlock * block", "const CellInterval & ci", ["gpuStream_t gpuStream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}) + { + {{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}} + {{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, parameter_registration=parameter_scaling)|indent(6)}} + {{kernel['kernel']|generate_timestep_advancements|indent(6)}} + {{kernel['function_name']}}CellInterval({{kernel['kernel']|generate_function_collection_call(cell_interval='ci')}}); + {{kernel['kernel']|generate_swaps|indent(6)}} + } + + void {{kernel['function_name']}}Inner({{- ["IBlock * block", ["gpuStream_t gpuStream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}) + { + {{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}} + {{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, parameter_registration=parameter_scaling)|indent(6)}} + 
{{kernel['kernel']|generate_timestep_advancements(advance=False)|indent(6)}} + + CellInterval inner = {{kernel['field']}}->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + {{kernel['function_name']}}CellInterval({{kernel['kernel']|generate_function_collection_call(cell_interval='inner')}}); + } + + void {{kernel['function_name']}}Outer({{- ["IBlock * block", ["gpuStream_t gpuStream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}) + { + + {{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}} + {{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, parameter_registration=parameter_scaling)|indent(6)}} + {{kernel['kernel']|generate_timestep_advancements|indent(6)}} + + if( layers_.empty() ) + { + CellInterval ci; + + {{kernel['field']}}->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + {{kernel['field']}}->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + {{kernel['field']}}->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + {{kernel['field']}}->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + {{kernel['field']}}->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + {{kernel['field']}}->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + {%if target is equalto 'gpu'%} + { + auto parallelSection_ = parallelStreams_.parallelSection( gpuStream ); + for( auto & ci: layers_ ) + { + parallelSection_.run([&]( auto s ) { + 
{{kernel['function_name']}}CellInterval({{kernel['kernel']|generate_function_collection_call(cell_interval='ci')}}); + }); + } + } + {% else %} + for( auto & ci: layers_ ) + { + {{kernel['function_name']}}CellInterval({{kernel['kernel']|generate_function_collection_call(cell_interval='ci')}}); + } + {% endif %} + + {{kernel['kernel']|generate_swaps|indent(9)}} + } + {% endfor %} + + {%if target is equalto 'gpu'%} + void setOuterPriority(int priority) + { + parallelStreams_.setStreamPriority(priority); + } + {%endif%} + + private: + shared_ptr< StructuredBlockStorage > blocks_; + {{kernel_list|generate_members(parameter_registration=parameter_scaling)|indent(4)}} + + Cell outerWidth_; + std::vector<CellInterval> layers_; + + {%if target is equalto 'gpu' -%} + gpu::ParallelStreams parallelStreams_; + // std::map<BlockID, gpuStream_t > streams_; + {%- endif %} +}; + + +} // namespace {{namespace}} +} // namespace walberla + + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic pop +#endif diff --git a/python/pystencils_walberla/utility.py b/python/pystencils_walberla/utility.py index c109265ef3e5b0f16ff8f9c276394422d096097f..f19a0997497e9659a8c37cb81ba0db85472e7b22 100644 --- a/python/pystencils_walberla/utility.py +++ b/python/pystencils_walberla/utility.py @@ -1,9 +1,17 @@ from os import path -from pystencils.typing import get_base_type -from pystencils_walberla.cmake_integration import CodeGenerationContext +from functools import reduce +from typing import Union, Dict, DefaultDict +import warnings + +from pystencils import CreateKernelConfig, Target +from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets +from pystencils.boundaries.createindexlist import boundary_index_array_coordinate_names, direction_member_name +from pystencils.typing import BasicType, create_type, get_base_type from lbmpy import LBStencil +from pystencils_walberla.cmake_integration import 
CodeGenerationContext + HEADER_EXTENSIONS = {'.h', '.hpp'} @@ -59,6 +67,145 @@ def generate_info_header(ctx: CodeGenerationContext, ctx.write_file(filename, lines + additional_code) +def get_vectorize_instruction_set(ctx: CodeGenerationContext): + """returns a list of supported vector instruction sets. If waLBerla is not build with + `WALBERLA_OPTIMIZE_FOR_LOCALHOST` `None` is returned. + + Args: + ctx: Code Generation Context + """ + + if ctx.optimize_for_localhost: + supported_instruction_sets = get_supported_instruction_sets() + if supported_instruction_sets: + return supported_instruction_sets[-1] + else: # if cpuinfo package is not installed + warnings.warn("Could not obtain supported vectorization instruction sets - defaulting to sse. " + "This problem can probably be fixed by installing py-cpuinfo. This package can " + "gather the needed hardware information.") + return 'sse' + else: + return None + + +def config_from_context(ctx: CodeGenerationContext, target: Target = Target.CPU, + data_type: Union[type, str, DefaultDict[str, BasicType], Dict[str, BasicType]] = None, + cpu_openmp: Union[bool, int] = None, cpu_vectorize_info: Dict = None, + **kwargs) -> CreateKernelConfig: + """Creates a :class: `pystencils.config.CreateKernelConfig` from the code generation context. By default, + all arguments are determined by the generation context. This means for example if `DWALBERLA_BUILD_WITH_GPU_SUPPORT` is + `True` the kernel will be generated for GPU using either CUDA or HIP. + + Args: + ctx: Code Generation Context + target: All targets are defined in :class:`pystencils.enums.Target` + data_type: Data type used for all untyped symbols (i.e. non-fields), can also be a dict from symbol name to + type. If specified as a dict ideally a defaultdict is used to define a default value for symbols + not listed in the dict. If a plain dict is provided it will be transformed into a defaultdict + internally. The default value will then be specified via type collation then. 
+ cpu_openmp: `True` or number of threads for OpenMP parallelization, `False` for no OpenMP. + If set to `True`, the maximum number of available threads will be chosen. + cpu_vectorize_info: A dictionary with keys, 'vector_instruction_set', 'assume_aligned' and 'nontemporal' + for documentation of these parameters see vectorize function. Example: + '{'instruction_set': 'avx512', 'assume_aligned': True, 'nontemporal':True}' + kwargs: keyword arguments that can be taken by :class: `pystencils.config.CreateKernelConfig` + """ + + if target == Target.GPU and not ctx.gpu: + raise ValueError("can not generate gpu code if waLBerla is not build with GPU support. Please use " + "-DWALBERLA_BUILD_WITH_CUDA=1 or -DWALBERLA_BUILD_WITH_HIP=1 for configuring cmake") + + default_dtype = "float64" if ctx.double_accuracy else "float32" + if data_type is None: + data_type = default_dtype + + if cpu_openmp and not ctx.openmp: + warnings.warn("Code is generated with OpenMP pragmas but waLBerla is not build with OpenMP. " + "The compilation might not work due to wrong compiler flags. 
" + "Please use -DWALBERLA_BUILD_WITH_OPENMP=1 for configuring cmake") + + if cpu_openmp is None: + cpu_openmp = ctx.openmp + + if cpu_vectorize_info is None: + cpu_vectorize_info = {} + + default_vec_is = get_vectorize_instruction_set(ctx) + + cpu_vectorize_info['instruction_set'] = cpu_vectorize_info.get('instruction_set', default_vec_is) + cpu_vectorize_info['assume_inner_stride_one'] = cpu_vectorize_info.get('assume_inner_stride_one', True) + cpu_vectorize_info['assume_aligned'] = cpu_vectorize_info.get('assume_aligned', False) + cpu_vectorize_info['nontemporal'] = cpu_vectorize_info.get('nontemporal', False) + cpu_vectorize_info['assume_sufficient_line_padding'] = cpu_vectorize_info.get('assume_sufficient_line_padding', + False) + + config = CreateKernelConfig(target=target, data_type=data_type, default_number_float=data_type, + cpu_openmp=cpu_openmp, cpu_vectorize_info=cpu_vectorize_info, + **kwargs) + + return config + + +def merge_sorted_lists(lx, ly, sort_key=lambda x: x, identity_check_key=None): + if identity_check_key is None: + identity_check_key = sort_key + nx = len(lx) + ny = len(ly) + + def recursive_merge(lx_intern, ly_intern, ix_intern, iy_intern): + if ix_intern == nx: + return ly_intern[iy_intern:] + if iy_intern == ny: + return lx_intern[ix_intern:] + x = lx_intern[ix_intern] + y = ly_intern[iy_intern] + skx = sort_key(x) + sky = sort_key(y) + if skx == sky: + if identity_check_key(x) == identity_check_key(y): + return [x] + recursive_merge(lx_intern, ly_intern, ix_intern + 1, iy_intern + 1) + else: + raise ValueError(f'Elements <{x}> and <{y}> with equal sort key where not identical!') + elif skx < sky: + return [x] + recursive_merge(lx_intern, ly_intern, ix_intern + 1, iy_intern) + else: + return [y] + recursive_merge(lx_intern, ly_intern, ix_intern, iy_intern + 1) + return recursive_merge(lx, ly, 0, 0) + + +def merge_lists_of_symbols(lists): + def merger(lx, ly): + return merge_sorted_lists(lx, ly, sort_key=lambda x: x.symbol.name, 
identity_check_key=lambda x: x.symbol) + return reduce(merger, lists) + + +def struct_from_numpy_dtype(struct_name, numpy_dtype): + result = f"struct {struct_name} {{ \n" + + equality_compare = [] + constructor_params = [] + constructor_initializer_list = [] + for name, (sub_type, offset) in numpy_dtype.fields.items(): + pystencils_type = create_type(sub_type) + result += f" {pystencils_type} {name};\n" + if name in boundary_index_array_coordinate_names or name == direction_member_name: + constructor_params.append(f"{pystencils_type} {name}_") + constructor_initializer_list.append(f"{name}({name}_)") + else: + constructor_initializer_list.append(f"{name}()") + if pystencils_type.is_float(): + equality_compare.append(f"floatIsEqual({name}, o.{name})") + else: + equality_compare.append(f"{name} == o.{name}") + + result += " %s(%s) : %s {}\n" % \ + (struct_name, ", ".join(constructor_params), ", ".join(constructor_initializer_list)) + result += " bool operator==(const %s & o) const {\n return %s;\n }\n" % \ + (struct_name, " && ".join(equality_compare)) + result += "};\n" + return result + + # ------------------------------------- INTERNAL ------------------------------------------------------------- diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 891a92988bbd994f3d69c4deacac7ff08ce46362..92b465e32b32f8ec8396f8b6fb08767daadfa146 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -37,6 +37,7 @@ add_subdirectory( gather ) add_subdirectory( geometry ) add_subdirectory( gui ) add_subdirectory( lbm ) +add_subdirectory( lbm_generated ) add_subdirectory( lbm_mesapd_coupling ) add_subdirectory( mesa_pd ) if( OPENMESH_FOUND ) diff --git a/src/blockforest/Block.h b/src/blockforest/Block.h index 64c7dafa70efecb428807de242ddf165e4417023..a61de6ac5c898c0ea1f8bb6a28f0a7b7f33fe002 100644 --- a/src/blockforest/Block.h +++ b/src/blockforest/Block.h @@ -270,21 +270,21 @@ inline bool Block::neighborhoodSectionHasSmallerBlocks( const uint_t sectionInde { 
WALBERLA_ASSERT_LESS( sectionIndex, 26 ); - return !neighborhoodSection_[sectionIndex].empty() && neighborhoodSection_[sectionIndex][0]->id_.getUsedBits() > id_.getUsedBits(); + return neighborhoodSectionHasBlocks(sectionIndex) && neighborhoodSection_[sectionIndex][0]->id_.getUsedBits() > id_.getUsedBits(); } inline bool Block::neighborhoodSectionHasEquallySizedBlock( const uint_t sectionIndex ) const { WALBERLA_ASSERT_LESS( sectionIndex, 26 ); - return !neighborhoodSection_[sectionIndex].empty() && neighborhoodSection_[sectionIndex][0]->id_.getUsedBits() == id_.getUsedBits(); + return neighborhoodSectionHasBlocks(sectionIndex) && neighborhoodSection_[sectionIndex][0]->id_.getUsedBits() == id_.getUsedBits(); } inline bool Block::neighborhoodSectionHasLargerBlock( const uint_t sectionIndex ) const { WALBERLA_ASSERT_LESS( sectionIndex, 26 ); - return !neighborhoodSection_[sectionIndex].empty() && neighborhoodSection_[sectionIndex][0]->id_.getUsedBits() < id_.getUsedBits(); + return neighborhoodSectionHasBlocks(sectionIndex) && neighborhoodSection_[sectionIndex][0]->id_.getUsedBits() < id_.getUsedBits(); } diff --git a/src/blockforest/BlockDataHandling.h b/src/blockforest/BlockDataHandling.h index 7f56467c06b9eebf033753847020843a14a264c3..71e0138be0dd8c3b5af16769c32ffc3ac6e9f386 100644 --- a/src/blockforest/BlockDataHandling.h +++ b/src/blockforest/BlockDataHandling.h @@ -122,65 +122,65 @@ public: BlockData * initialize( IBlock * const block ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) T * ptr = dataHandling_->initialize( block ); return ptr ? 
new BlockData( ptr ) : nullptr; } void serialize( IBlock * const block, const BlockDataID & id, mpi::SendBuffer & buffer ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) dataHandling_->serialize( block, id, buffer ); } void serializeCoarseToFine( Block * const block, const BlockDataID & id, mpi::SendBuffer & buffer, const uint_t child ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) dataHandling_->serializeCoarseToFine( block, id, buffer, child ); } void serializeFineToCoarse( Block * const block, const BlockDataID & id, mpi::SendBuffer & buffer ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) dataHandling_->serializeFineToCoarse( block, id, buffer ); } BlockData * deserialize( IBlock * const block ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) T * ptr = dataHandling_->deserialize( block ); return ptr ? new BlockData( ptr ) : nullptr; } BlockData * deserializeCoarseToFine( Block * const block ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) T * ptr = dataHandling_->deserializeCoarseToFine( block ); return ptr ? new BlockData( ptr ) : nullptr; } BlockData * deserializeFineToCoarse( Block * const block ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) T * ptr = dataHandling_->deserializeFineToCoarse( block ); return ptr ? 
new BlockData( ptr ) : nullptr; } void deserialize( IBlock * const block, const BlockDataID & id, mpi::RecvBuffer & buffer ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) dataHandling_->deserialize( block, id, buffer ); } void deserializeCoarseToFine( Block * const block, const BlockDataID & id, mpi::RecvBuffer & buffer ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) dataHandling_->deserializeCoarseToFine( block, id, buffer ); } void deserializeFineToCoarse( Block * const block, const BlockDataID & id, mpi::RecvBuffer & buffer, const uint_t child ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) dataHandling_->deserializeFineToCoarse( block, id, buffer, child ); } diff --git a/src/blockforest/communication/NonUniformBufferedScheme.h b/src/blockforest/communication/NonUniformBufferedScheme.h index caf91651c578ddc7da5bea5b8a67398e8cc590ee..be27a51ec805285144983d2d3a3618c502596d50 100644 --- a/src/blockforest/communication/NonUniformBufferedScheme.h +++ b/src/blockforest/communication/NonUniformBufferedScheme.h @@ -65,10 +65,10 @@ public: //**Construction & Destruction*************************************************************************************** /*! 
\name Construction & Destruction */ //@{ - explicit NonUniformBufferedScheme( weak_ptr<StructuredBlockForest> bf, + explicit NonUniformBufferedScheme( const weak_ptr<StructuredBlockForest>& bf, const int baseTag = 778 ); // waLBerla = 119+97+76+66+101+114+108+97 - NonUniformBufferedScheme( weak_ptr<StructuredBlockForest> bf, + NonUniformBufferedScheme( const weak_ptr<StructuredBlockForest>& bf, const Set<SUID> & requiredBlockSelectors, const Set<SUID> & incompatibleBlockSelectors, const int baseTag = 778 ); // waLBerla = 119+97+76+66+101+114+108+97 @@ -96,6 +96,16 @@ public: inline void communicateEqualLevel ( const uint_t level ); inline void communicateCoarseToFine( const uint_t fineLevel ); inline void communicateFineToCoarse( const uint_t fineLevel ); + + std::function<void()> communicateEqualLevelFunctor(const uint_t level) { + return [level, this](){ NonUniformBufferedScheme::communicateEqualLevel(level);}; + } + std::function<void()> communicateCoarseToFineFunctor(const uint_t fineLevel) { + return [fineLevel, this](){ NonUniformBufferedScheme::communicateCoarseToFine(fineLevel);}; + } + std::function<void()> communicateFineToCoarseFunctor(const uint_t fineLevel) { + return [fineLevel, this](){ NonUniformBufferedScheme::communicateFineToCoarse(fineLevel);}; + } //@} //******************************************************************************************************************* @@ -190,7 +200,7 @@ protected: template< typename Stencil > -NonUniformBufferedScheme<Stencil>::NonUniformBufferedScheme( weak_ptr<StructuredBlockForest> bf, const int baseTag ) +NonUniformBufferedScheme<Stencil>::NonUniformBufferedScheme( const weak_ptr<StructuredBlockForest>& bf, const int baseTag ) : blockForest_( bf ), localMode_( START ), baseTag_( baseTag ), requiredBlockSelectors_( Set<SUID>::emptySet() ), incompatibleBlockSelectors_( Set<SUID>::emptySet() ) { @@ -200,7 +210,7 @@ NonUniformBufferedScheme<Stencil>::NonUniformBufferedScheme( weak_ptr<Structured template< 
typename Stencil > -NonUniformBufferedScheme<Stencil>::NonUniformBufferedScheme( weak_ptr<StructuredBlockForest> bf, +NonUniformBufferedScheme<Stencil>::NonUniformBufferedScheme( const weak_ptr<StructuredBlockForest>& bf, const Set<SUID> & requiredBlockSelectors, const Set<SUID> & incompatibleBlockSelectors, const int baseTag /*= 778*/ ) // waLBerla = 119+97+76+66+101+114+108+97 @@ -236,10 +246,10 @@ void NonUniformBufferedScheme<Stencil>::init() template< typename Stencil > void NonUniformBufferedScheme<Stencil>::refresh() { - WALBERLA_ASSERT( !isAnyCommunicationInProgress() ); + WALBERLA_ASSERT( !isAnyCommunicationInProgress() ) auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) const uint_t levels = forest->getNumberOfLevels(); for( uint_t i = 0; i != 3; ++i ) @@ -296,7 +306,7 @@ inline void NonUniformBufferedScheme<Stencil>::addPackInfo( const PackInfo & pac { if( isAnyCommunicationInProgress() ) { - WALBERLA_ABORT( "You may not add a PackInfo to a NonUniformBufferedScheme if any communication is in progress!" ); + WALBERLA_ABORT( "You may not add a PackInfo to a NonUniformBufferedScheme if any communication is in progress!" 
) } packInfos_.push_back( packInfo ); @@ -381,7 +391,7 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::startCommunicateEqualLevel() { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) const uint_t levelIndex = forest->getNumberOfLevels(); if( forestModificationStamp_ != forest->getBlockForest().getModificationStamp() ) @@ -400,7 +410,7 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::startCommunicateCoarseToFine() { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) const uint_t levelIndex = forest->getNumberOfLevels(); if( levelIndex == 1 ) @@ -421,7 +431,7 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::startCommunicateFineToCoarse() { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) const uint_t levelIndex = forest->getNumberOfLevels(); if( levelIndex == 1 ) @@ -442,8 +452,8 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::startCommunicateEqualLevel( const uint_t level ) { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); - WALBERLA_ASSERT_LESS( level, forest->getNumberOfLevels() ); + WALBERLA_CHECK_NOT_NULLPTR( forest, 
"Trying to access communication for a block storage object that doesn't exist anymore" ) + WALBERLA_ASSERT_LESS( level, forest->getNumberOfLevels() ) if( forestModificationStamp_ != forest->getBlockForest().getModificationStamp() ) refresh(); @@ -460,9 +470,9 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::startCommunicateCoarseToFine( const uint_t fineLevel ) { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); - WALBERLA_ASSERT_GREATER( fineLevel, uint_t(0) ); - WALBERLA_ASSERT_LESS( fineLevel, forest->getNumberOfLevels() ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) + WALBERLA_ASSERT_GREATER( fineLevel, uint_t(0) ) + WALBERLA_ASSERT_LESS( fineLevel, forest->getNumberOfLevels() ) if( forestModificationStamp_ != forest->getBlockForest().getModificationStamp() ) refresh(); @@ -479,9 +489,9 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::startCommunicateFineToCoarse( const uint_t fineLevel ) { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); - WALBERLA_ASSERT_GREATER( fineLevel, uint_t(0) ); - WALBERLA_ASSERT_LESS( fineLevel, forest->getNumberOfLevels() ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) + WALBERLA_ASSERT_GREATER( fineLevel, uint_t(0) ) + WALBERLA_ASSERT_LESS( fineLevel, forest->getNumberOfLevels() ) if( forestModificationStamp_ != forest->getBlockForest().getModificationStamp() ) refresh(); @@ -498,10 +508,10 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::waitCommunicateEqualLevel() { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access 
communication for a block storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) const uint_t levelIndex = forest->getNumberOfLevels(); - WALBERLA_ASSERT_EQUAL( levelIndex, bufferSystem_[EQUAL_LEVEL].size() - uint_t(1) ); - WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ); + WALBERLA_ASSERT_EQUAL( levelIndex, bufferSystem_[EQUAL_LEVEL].size() - uint_t(1) ) + WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ) wait( EQUAL_LEVEL, levelIndex ); } @@ -512,10 +522,10 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::waitCommunicateCoarseToFine() { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) const uint_t levelIndex = forest->getNumberOfLevels(); - WALBERLA_ASSERT_EQUAL( levelIndex, bufferSystem_[COARSE_TO_FINE].size() - uint_t(1) ); - WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ); + WALBERLA_ASSERT_EQUAL( levelIndex, bufferSystem_[COARSE_TO_FINE].size() - uint_t(1) ) + WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ) if( levelIndex == 1 ) return; @@ -529,10 +539,10 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::waitCommunicateFineToCoarse() { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) const uint_t levelIndex = 
forest->getNumberOfLevels(); - WALBERLA_ASSERT_EQUAL( levelIndex, bufferSystem_[FINE_TO_COARSE].size() - uint_t(1) ); - WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ); + WALBERLA_ASSERT_EQUAL( levelIndex, bufferSystem_[FINE_TO_COARSE].size() - uint_t(1) ) + WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ) if( levelIndex == 1 ) return; @@ -546,10 +556,10 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::waitCommunicateEqualLevel ( const uint_t level ) { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); - WALBERLA_ASSERT_LESS( level, forest->getNumberOfLevels() ); - WALBERLA_ASSERT_EQUAL( forest->getNumberOfLevels(), bufferSystem_[EQUAL_LEVEL].size() - uint_t(1) ); - WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) + WALBERLA_ASSERT_LESS( level, forest->getNumberOfLevels() ) + WALBERLA_ASSERT_EQUAL( forest->getNumberOfLevels(), bufferSystem_[EQUAL_LEVEL].size() - uint_t(1) ) + WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ) wait( EQUAL_LEVEL, level ); } @@ -560,11 +570,11 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::waitCommunicateCoarseToFine( const uint_t fineLevel ) { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); - WALBERLA_ASSERT_GREATER( fineLevel, uint_t(0) ); - WALBERLA_ASSERT_LESS( fineLevel, forest->getNumberOfLevels() ); - WALBERLA_ASSERT_EQUAL( forest->getNumberOfLevels(), bufferSystem_[COARSE_TO_FINE].size() - uint_t(1) ); - 
WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) + WALBERLA_ASSERT_GREATER( fineLevel, uint_t(0) ) + WALBERLA_ASSERT_LESS( fineLevel, forest->getNumberOfLevels() ) + WALBERLA_ASSERT_EQUAL( forest->getNumberOfLevels(), bufferSystem_[COARSE_TO_FINE].size() - uint_t(1) ) + WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ) wait( COARSE_TO_FINE, fineLevel ); } @@ -575,11 +585,11 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::waitCommunicateFineToCoarse( const uint_t fineLevel ) { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); - WALBERLA_ASSERT_GREATER( fineLevel, uint_t(0) ); - WALBERLA_ASSERT_LESS( fineLevel, forest->getNumberOfLevels() ); - WALBERLA_ASSERT_EQUAL( forest->getNumberOfLevels(), bufferSystem_[FINE_TO_COARSE].size() - uint_t(1) ); - WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) + WALBERLA_ASSERT_GREATER( fineLevel, uint_t(0) ) + WALBERLA_ASSERT_LESS( fineLevel, forest->getNumberOfLevels() ) + WALBERLA_ASSERT_EQUAL( forest->getNumberOfLevels(), bufferSystem_[FINE_TO_COARSE].size() - uint_t(1) ) + WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ) wait( FINE_TO_COARSE, fineLevel ); } @@ -619,7 +629,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationEqualLevel( const uint std::map< uint_t, std::vector< SendBufferFunction > > sendFunctions; auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block 
storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) for( auto it = forest->begin(); it != forest->end(); ++it ) { @@ -638,7 +648,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationEqualLevel( const uint if( !( block->neighborhoodSectionHasEquallySizedBlock(neighborIdx) ) ) continue; - WALBERLA_ASSERT_EQUAL( block->getNeighborhoodSectionSize(neighborIdx), uint_t(1) ); + WALBERLA_ASSERT_EQUAL( block->getNeighborhoodSectionSize(neighborIdx), uint_t(1) ) const BlockID & receiverId = block->getNeighborId( neighborIdx, uint_t(0) ); @@ -648,13 +658,13 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationEqualLevel( const uint if( block->neighborExistsLocally( neighborIdx, uint_t(0) ) ) { auto neighbor = dynamic_cast< Block * >( forest->getBlock(receiverId) ); - WALBERLA_ASSERT_EQUAL( neighbor->getProcess(), block->getProcess() ); + WALBERLA_ASSERT_EQUAL( neighbor->getProcess(), block->getProcess() ) for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo ) { if( localMode_ == BUFFER ) { - SendBuffer buffer; + SendBuffer const buffer; localBuffers.push_back( buffer ); const uint_t bufferIndex = uint_c( localBuffers.size() ) - uint_t(1); @@ -745,7 +755,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationCoarseToFine( const ui std::set< uint_t > ranksToReceiveFrom; auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) for( auto it = forest->begin(); it != forest->end(); ++it ) { Block * block = dynamic_cast< Block * >( it.get() ); @@ -774,13 +784,13 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationCoarseToFine( const ui if( block->neighborExistsLocally( 
neighborIdx, n ) ) { auto neighbor = dynamic_cast< Block * >( forest->getBlock(receiverId) ); - WALBERLA_ASSERT_EQUAL( neighbor->getProcess(), block->getProcess() ); + WALBERLA_ASSERT_EQUAL( neighbor->getProcess(), block->getProcess() ) for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo ) { if( localMode_ == BUFFER ) { - SendBuffer buffer; + SendBuffer const buffer; localBuffers.push_back( buffer ); const uint_t bufferIndex = uint_c( localBuffers.size() ) - uint_t(1); @@ -829,7 +839,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationCoarseToFine( const ui const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex( *dir ); if( block->neighborhoodSectionHasLargerBlock(neighborIdx) ) { - WALBERLA_ASSERT_EQUAL( block->getNeighborhoodSectionSize(neighborIdx), uint_t(1) ); + WALBERLA_ASSERT_EQUAL( block->getNeighborhoodSectionSize(neighborIdx), uint_t(1) ) if( block->neighborExistsRemotely( neighborIdx, uint_t(0) ) && selectable::isSetSelected( block->getNeighborState( neighborIdx, 0 ), requiredBlockSelectors_, incompatibleBlockSelectors_ ) ) { @@ -890,7 +900,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationFineToCoarse( const ui std::set< uint_t > ranksToReceiveFrom; auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) for( auto it = forest->begin(); it != forest->end(); ++it ) { @@ -910,7 +920,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationFineToCoarse( const ui if( !( block->neighborhoodSectionHasLargerBlock(neighborIdx) ) ) continue; - WALBERLA_ASSERT_EQUAL( block->getNeighborhoodSectionSize(neighborIdx), uint_t(1) ); + WALBERLA_ASSERT_EQUAL( block->getNeighborhoodSectionSize(neighborIdx), uint_t(1) ) const BlockID & receiverId = block->getNeighborId( 
neighborIdx, uint_t(0) ); @@ -920,13 +930,13 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationFineToCoarse( const ui if( block->neighborExistsLocally( neighborIdx, uint_t(0) ) ) { auto neighbor = dynamic_cast< Block * >( forest->getBlock(receiverId) ); - WALBERLA_ASSERT_EQUAL( neighbor->getProcess(), block->getProcess() ); + WALBERLA_ASSERT_EQUAL( neighbor->getProcess(), block->getProcess() ) for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo ) { if( localMode_ == BUFFER ) { - SendBuffer buffer; + SendBuffer const buffer; localBuffers.push_back( buffer ); const uint_t bufferIndex = uint_c( localBuffers.size() ) - uint_t(1); @@ -1144,7 +1154,7 @@ template< typename Stencil > void NonUniformBufferedScheme<Stencil>::receive( RecvBuffer & buffer ) { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) while( !buffer.isEmpty() ) { @@ -1183,7 +1193,7 @@ template< typename Stencil > void NonUniformBufferedScheme<Stencil>::localBufferPacking( const INDEX i, const uint_t j, const uint_t bufferIndex, const PackInfo & packInfo, const Block * sender, const Block * receiver, const stencil::Direction & dir ) { - WALBERLA_ASSERT_LESS( bufferIndex, localBuffers_[i][j].size() ); + WALBERLA_ASSERT_LESS( bufferIndex, localBuffers_[i][j].size() ) SendBuffer & buffer = localBuffers_[i][j][ bufferIndex ]; buffer.clear(); @@ -1198,7 +1208,7 @@ void NonUniformBufferedScheme<Stencil>::localBufferPacking( const INDEX i, const } else { - WALBERLA_ASSERT( i == FINE_TO_COARSE ); + WALBERLA_ASSERT( i == FINE_TO_COARSE ) packInfo->packDataFineToCoarse( sender, receiver->getId(), dir, buffer ); } } @@ -1209,7 +1219,7 @@ template< typename Stencil > void NonUniformBufferedScheme<Stencil>::localBufferUnpacking( const INDEX 
i, const uint_t j, const uint_t bufferIndex, const PackInfo & packInfo, Block * receiver, const Block * sender, const stencil::Direction & dir ) { - WALBERLA_ASSERT_LESS( bufferIndex, localBuffers_[i][j].size() ); + WALBERLA_ASSERT_LESS( bufferIndex, localBuffers_[i][j].size() ) SendBuffer & sendBuffer = localBuffers_[i][j][ bufferIndex ]; RecvBuffer recvBuffer( sendBuffer ); @@ -1224,7 +1234,7 @@ void NonUniformBufferedScheme<Stencil>::localBufferUnpacking( const INDEX i, con } else { - WALBERLA_ASSERT( i == FINE_TO_COARSE ); + WALBERLA_ASSERT( i == FINE_TO_COARSE ) packInfo->unpackDataFineToCoarse( receiver, sender->getId(), stencil::inverseDir[dir], recvBuffer ); } } diff --git a/src/blockforest/communication/NonUniformPackInfo.h b/src/blockforest/communication/NonUniformPackInfo.h index 0b32369c654e4ca9642d88f5d85763f880b7e55d..73c3f760fbfb54b3af1be35fdd2d633e3495269e 100644 --- a/src/blockforest/communication/NonUniformPackInfo.h +++ b/src/blockforest/communication/NonUniformPackInfo.h @@ -106,13 +106,13 @@ protected: inline void NonUniformPackInfo::packDataEqualLevel( const Block * sender, stencil::Direction dir, mpi::SendBuffer & buffer ) const { #ifndef NDEBUG - size_t sizeBefore = buffer.size(); + size_t const sizeBefore = buffer.size(); #endif packDataEqualLevelImpl( sender, dir, buffer ); #ifndef NDEBUG - size_t sizeAfter = buffer.size(); + size_t const sizeAfter = buffer.size(); if( constantDataExchange() ) { #ifdef _OPENMP @@ -125,7 +125,7 @@ inline void NonUniformPackInfo::packDataEqualLevel( const Block * sender, stenci if( dirEntry == sizeMap.end() ) sizeMap[ uint_t(0) ] = sizeAfter - sizeBefore; else - WALBERLA_ASSERT_EQUAL( sizeMap[ uint_t(0) ], (sizeAfter - sizeBefore) ); + WALBERLA_ASSERT_EQUAL( sizeMap[ uint_t(0) ], (sizeAfter - sizeBefore) ) #ifdef _OPENMP } #endif @@ -138,13 +138,13 @@ inline void NonUniformPackInfo::packDataEqualLevel( const Block * sender, stenci inline void NonUniformPackInfo::packDataCoarseToFine( const Block * 
coarseSender, const BlockID & fineReceiver, stencil::Direction dir, mpi::SendBuffer & buffer ) const { #ifndef NDEBUG - size_t sizeBefore = buffer.size(); + size_t const sizeBefore = buffer.size(); #endif packDataCoarseToFineImpl( coarseSender, fineReceiver, dir, buffer ); #ifndef NDEBUG - size_t sizeAfter = buffer.size(); + size_t const sizeAfter = buffer.size(); if( constantDataExchange() ) { #ifdef _OPENMP @@ -157,7 +157,7 @@ inline void NonUniformPackInfo::packDataCoarseToFine( const Block * coarseSender if( dirEntry == sizeMap.end() ) sizeMap[ fineReceiver.getBranchId() ] = sizeAfter - sizeBefore; else - WALBERLA_ASSERT_EQUAL( sizeMap[ fineReceiver.getBranchId() ], (sizeAfter - sizeBefore) ); + WALBERLA_ASSERT_EQUAL( sizeMap[ fineReceiver.getBranchId() ], (sizeAfter - sizeBefore) ) #ifdef _OPENMP } #endif @@ -170,13 +170,13 @@ inline void NonUniformPackInfo::packDataCoarseToFine( const Block * coarseSender inline void NonUniformPackInfo::packDataFineToCoarse( const Block * fineSender, const BlockID & coarseReceiver, stencil::Direction dir, mpi::SendBuffer & buffer ) const { #ifndef NDEBUG - size_t sizeBefore = buffer.size(); + size_t const sizeBefore = buffer.size(); #endif packDataFineToCoarseImpl( fineSender, coarseReceiver, dir, buffer ); #ifndef NDEBUG - size_t sizeAfter = buffer.size(); + size_t const sizeAfter = buffer.size(); if( constantDataExchange() ) { #ifdef _OPENMP @@ -189,7 +189,7 @@ inline void NonUniformPackInfo::packDataFineToCoarse( const Block * fineSender, if( dirEntry == sizeMap.end() ) sizeMap[ uint_t(0) ] = sizeAfter - sizeBefore; else - WALBERLA_ASSERT_EQUAL( sizeMap[ uint_t(0) ], (sizeAfter - sizeBefore) ); + WALBERLA_ASSERT_EQUAL( sizeMap[ uint_t(0) ], (sizeAfter - sizeBefore) ) #ifdef _OPENMP } #endif diff --git a/src/blockforest/communication/UniformBufferedScheme.h b/src/blockforest/communication/UniformBufferedScheme.h index 8677b5f83afe7c4ba8bb047fbd95724bc7ce3ac3..7bc813cc5067a8d4336bc376dc643314defa1816 100644 --- 
a/src/blockforest/communication/UniformBufferedScheme.h +++ b/src/blockforest/communication/UniformBufferedScheme.h @@ -314,7 +314,7 @@ void UniformBufferedScheme<Stencil>::startCommunication() { if( localMode_ == BUFFER ) { - SendBuffer buffer; + SendBuffer const buffer; localBuffers_.push_back( buffer ); const uint_t index = uint_c( localBuffers_.size() ) - uint_t(1); diff --git a/src/communication/UniformPackInfo.h b/src/communication/UniformPackInfo.h index 5ec6db29d32dff36713ab903498048b450f748f2..aa110f9bdf5c51b37a57572cfbc800b004ab37b6 100644 --- a/src/communication/UniformPackInfo.h +++ b/src/communication/UniformPackInfo.h @@ -153,13 +153,13 @@ protected: inline void UniformPackInfo::packData( const IBlock * sender, stencil::Direction dir, mpi::SendBuffer & buffer ) const { #ifndef NDEBUG - size_t sizeBefore = buffer.size(); + size_t const sizeBefore = buffer.size(); #endif packDataImpl( sender, dir, buffer ); #ifndef NDEBUG - size_t sizeAfter = buffer.size(); + size_t const sizeAfter = buffer.size(); if( constantDataExchange() ) { #ifdef _OPENMP @@ -171,7 +171,7 @@ inline void UniformPackInfo::packData( const IBlock * sender, stencil::Direction if( dirEntry == blockMap.end() ) blockMap[ dir ] = sizeAfter - sizeBefore; else - WALBERLA_ASSERT_EQUAL( blockMap[ dir ], (sizeAfter - sizeBefore) ); + WALBERLA_ASSERT_EQUAL( blockMap[ dir ], (sizeAfter - sizeBefore) ) #ifdef _OPENMP } #endif diff --git a/src/core/cell/Cell.h b/src/core/cell/Cell.h index 8f41297b78a1ff66d4cc7a9f39f98692d11d1b49..f531430ea1733cca2ccf650a944bd92301d623cd 100644 --- a/src/core/cell/Cell.h +++ b/src/core/cell/Cell.h @@ -50,9 +50,10 @@ public: //@{ Cell() = default; inline Cell( const cell_idx_t _x, const cell_idx_t _y, const cell_idx_t _z ) { cell[0] = _x; cell[1] = _y; cell[2] = _z; } - //inline Cell( const int _x, const int _y, const int _z ); + inline Cell( const Vector3<cell_idx_t> _vec ) {cell[0] = _vec[0]; cell[1] = _vec[1]; cell[2] = _vec[2];} + inline Cell( const uint_t _x, 
const uint_t _y, const uint_t _z ); - inline Cell( const Vector3<cell_idx_t>& vec ){ cell[0] = vec[0]; cell[1] = vec[1]; cell[2] = vec[2]; }; + inline Cell( const Vector3<uint_t> _vec ); //@} /*! \name Arithmetic operators */ @@ -102,15 +103,6 @@ std::ostream & operator<<( std::ostream & os, const Cell & cell ); std::istream & operator>>( std::istream & is, Cell & cell ); //@} - - -// inline Cell::Cell( const int _x, const int _y, const int _z ) { -// -// x() = cell_idx_c( _x ); y() = cell_idx_c( _y ); z() = cell_idx_c( _z ); -// } - - - inline Cell::Cell( const uint_t _x, const uint_t _y, const uint_t _z ) { cell[0] = cell_idx_c( _x ); @@ -118,6 +110,12 @@ inline Cell::Cell( const uint_t _x, const uint_t _y, const uint_t _z ) cell[2] = cell_idx_c( _z ); } +inline Cell::Cell( const Vector3<uint_t> _vec ) +{ + cell[0] = cell_idx_c( _vec[0] ); + cell[1] = cell_idx_c( _vec[1] ); + cell[2] = cell_idx_c( _vec[2] ); +} /*******************************************************************************************************************//** diff --git a/src/core/mpi/BufferSystem.h b/src/core/mpi/BufferSystem.h index 6a531fa907e6f95ceb09d29b6ce31ecb9cbecf2e..04161810a408aab7bc0baf6ddb6159714333494a 100644 --- a/src/core/mpi/BufferSystem.h +++ b/src/core/mpi/BufferSystem.h @@ -151,7 +151,7 @@ public: void sendAll(); void send( MPIRank rank ); - iterator begin() { WALBERLA_ASSERT( communicationRunning_); return iterator( *this, true ); } + iterator begin() { WALBERLA_ASSERT( communicationRunning_) return iterator( *this, true ); } iterator end() { return iterator( *this, false); } //@} //******************************************************************************************************************* @@ -190,7 +190,7 @@ public: //@{ bool isSizeCommunicatedInNextStep() const { return (currentComm_ == &unknownSizeComm_); } bool isCommunicationRunning() const { return communicationRunning_; } - bool isReceiverInformationSet() const { return currentComm_ != NULL; } + bool 
isReceiverInformationSet() const { return currentComm_ != nullptr; } //@} //******************************************************************************************************************* diff --git a/src/core/mpi/BufferSystem.impl.h b/src/core/mpi/BufferSystem.impl.h index 4cbd884ba791bb4d07e3591dc9a6a0101ad657ff..183d29bd86b7412090916e0b2be92c3dc7a4c352 100644 --- a/src/core/mpi/BufferSystem.impl.h +++ b/src/core/mpi/BufferSystem.impl.h @@ -231,14 +231,14 @@ void GenericBufferSystem<Rb, Sb>::setReceiverInfo( const std::set<MPIRank> & ran template< typename Rb, typename Sb> void GenericBufferSystem<Rb, Sb>::setReceiverInfo( const std::map<MPIRank,MPISize> & ranksToRecvFrom ) { - WALBERLA_ASSERT( ! communicationRunning_ ); + WALBERLA_ASSERT( ! communicationRunning_ ) recvInfos_.clear(); for ( auto it = ranksToRecvFrom.begin(); it != ranksToRecvFrom.end(); ++it ) { const MPIRank sender = it->first; const MPISize senderSize = it->second; - WALBERLA_ASSERT_GREATER( senderSize, 0 ); + WALBERLA_ASSERT_GREATER( senderSize, 0 ) recvInfos_[ sender ].size = senderSize; } diff --git a/src/core/mpi/BufferSystemHelper.h b/src/core/mpi/BufferSystemHelper.h index 5603db56005c53eff6c7a5c5f2cbd369969b0c90..c505dfa0bd74f939500c1962458b4c1e2c9354fe 100644 --- a/src/core/mpi/BufferSystemHelper.h +++ b/src/core/mpi/BufferSystemHelper.h @@ -108,7 +108,7 @@ namespace internal { using typename AbstractCommunication<RecvBuffer_T, SendBuffer_T>::ReceiveInfo; KnownSizeCommunication( const MPI_Comm & communicator, int tag = 0 ) - : AbstractCommunication<RecvBuffer_T, SendBuffer_T>( communicator, tag ), sending_(false), receiving_(false) {} + : AbstractCommunication<RecvBuffer_T, SendBuffer_T>( communicator, tag ){} ~KnownSizeCommunication() override = default; @@ -121,8 +121,8 @@ namespace internal { MPIRank waitForNextReceive( std::map<MPIRank, ReceiveInfo> & recvInfos ) override; private: - bool sending_; - bool receiving_; + bool sending_{false}; + bool receiving_{false}; 
std::vector<MPI_Request> sendRequests_; std::vector<MPI_Request> recvRequests_; @@ -136,7 +136,7 @@ namespace internal { using typename AbstractCommunication<RecvBuffer_T, SendBuffer_T>::ReceiveInfo; UnknownSizeCommunication( const MPI_Comm & communicator, int tag = 0 ) - : AbstractCommunication<RecvBuffer_T, SendBuffer_T>( communicator, tag ), sending_(false), receiving_(false) {} + : AbstractCommunication<RecvBuffer_T, SendBuffer_T>( communicator, tag ){} ~UnknownSizeCommunication() override = default; @@ -149,8 +149,8 @@ namespace internal { MPIRank waitForNextReceive( std::map<MPIRank, ReceiveInfo> & recvInfos ) override; private: - bool sending_; - bool receiving_; + bool sending_{false}; + bool receiving_{false}; std::vector<MPI_Request> sendRequests_; std::list<MPISize> outgoingBufferForSizes_; @@ -168,7 +168,7 @@ namespace internal { using typename AbstractCommunication<RecvBuffer_T, SendBuffer_T>::ReceiveInfo; UnknownSizeCommunicationIProbe( const MPI_Comm & communicator, int tag = 0 ) - : AbstractCommunication<RecvBuffer_T, SendBuffer_T>( communicator, tag ), sending_(false), receiving_(false) {} + : AbstractCommunication<RecvBuffer_T, SendBuffer_T>( communicator, tag ){} ~UnknownSizeCommunicationIProbe() override = default; @@ -181,8 +181,8 @@ namespace internal { MPIRank waitForNextReceive( std::map<MPIRank, ReceiveInfo> & recvInfos ) override; private: - bool sending_; - bool receiving_; + bool sending_{false}; + bool receiving_{false}; int pendingReceives_; std::vector<MPI_Request> sendRequests_; @@ -196,7 +196,7 @@ namespace internal { using typename AbstractCommunication<RecvBuffer_T, SendBuffer_T>::ReceiveInfo; NoMPICommunication( const MPI_Comm & communicator, int tag = 0 ) - : AbstractCommunication<RecvBuffer_T, SendBuffer_T>( communicator, tag ), received_( false ) {} + : AbstractCommunication<RecvBuffer_T, SendBuffer_T>( communicator, tag ){} ~NoMPICommunication() override = default; @@ -209,7 +209,7 @@ namespace internal { MPIRank 
waitForNextReceive( std::map<MPIRank, ReceiveInfo> & recvInfos ) override; private: - bool received_; + bool received_{ false }; RecvBuffer_T tmpBuffer_; }; diff --git a/src/core/timing/Timer.h b/src/core/timing/Timer.h index 89568b4f5ca8c509168c2d24ff99984fe42babee..32c1e7f300be9455c3be537050d12f04c50aa7d1 100644 --- a/src/core/timing/Timer.h +++ b/src/core/timing/Timer.h @@ -500,7 +500,7 @@ shared_ptr<Timer<TP> > getReduced( Timer<TP>& timer, ReduceType rt, int targetRa break; default: - WALBERLA_ABORT( "Unknown reduce type" ); + WALBERLA_ABORT( "Unknown reduce type" ) break; } diff --git a/src/core/timing/TimingPool.cpp b/src/core/timing/TimingPool.cpp index dff973201aa5e3976576c60038200cba4492fd6b..7539fffe3610c4fb5e7cb0846b5192f5e7887e70 100644 --- a/src/core/timing/TimingPool.cpp +++ b/src/core/timing/TimingPool.cpp @@ -116,7 +116,7 @@ shared_ptr<TimingPool<TP> > TimingPool<TP>::getReduced( ReduceType rt, int targe break; default: - WALBERLA_ABORT( "Unknown reduce type" ); + WALBERLA_ABORT( "Unknown reduce type" ) break; } diff --git a/src/domain_decomposition/BlockDataHandling.h b/src/domain_decomposition/BlockDataHandling.h index 0720eb572ffadafc2deb78b3733e4cf5ff225029..56b18521f0c65656b3b09b4ec5ff0a430c39c312 100644 --- a/src/domain_decomposition/BlockDataHandling.h +++ b/src/domain_decomposition/BlockDataHandling.h @@ -92,21 +92,21 @@ public: void serialize( IBlock * const, const BlockDataID &, mpi::SendBuffer & ) override { - WALBERLA_ABORT( "You are trying to serialize a block data item for which only an initialization function was registered" ); + WALBERLA_ABORT( "You are trying to serialize a block data item for which only an initialization function was registered" ) #ifdef __IBMCPP__ - return NULL; // never reached, helps to suppress a warning from the IBM compiler + return nullptr; // never reached, helps to suppress a warning from the IBM compiler #endif } T * deserialize( IBlock * const ) override { - WALBERLA_ABORT( "You are trying to 
deserialize a block data item for which only an initialization function was registered" ); + WALBERLA_ABORT( "You are trying to deserialize a block data item for which only an initialization function was registered" ) #ifdef __IBMCPP__ - return NULL; // never reached, helps to suppress a warning from the IBM compiler + return nullptr; // never reached, helps to suppress a warning from the IBM compiler #endif } void deserialize( IBlock * const, const BlockDataID &, mpi::RecvBuffer & ) override { - WALBERLA_ABORT( "You are trying to deserialize a block data item for which only an initialization function was registered" ); + WALBERLA_ABORT( "You are trying to deserialize a block data item for which only an initialization function was registered" ) } private: @@ -175,27 +175,27 @@ public: BlockData * initialize( IBlock * const block ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) T * ptr = dataHandling_->initialize( block ); return ptr ? new BlockData( ptr ) : nullptr; } void serialize( IBlock * const block, const BlockDataID & id, mpi::SendBuffer & buffer ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) dataHandling_->serialize( block, id, buffer ); } BlockData * deserialize( IBlock * const block ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) T * ptr = dataHandling_->deserialize( block ); return ptr ? 
new BlockData( ptr ) : nullptr; } void deserialize( IBlock * const block, const BlockDataID & id, mpi::RecvBuffer & buffer ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) dataHandling_->deserialize( block, id, buffer ); } @@ -269,7 +269,7 @@ public: " - block state: " << block->getState() << "\n" " - global state: " << uid::globalState() << "\n" " - additional state: " << state << "\n" - " - \"selector\": " << selection ); + " - \"selector\": " << selection ) } return dataHandling; diff --git a/src/domain_decomposition/IBlock.h b/src/domain_decomposition/IBlock.h index 06e48b6905194a71ed3970550ca65ee525876080..ef563cc057b2e9157330420f09beb1ad74050168 100644 --- a/src/domain_decomposition/IBlock.h +++ b/src/domain_decomposition/IBlock.h @@ -110,7 +110,7 @@ public: WALBERLA_ABORT( "BlockData access type violation! (The block data you added is of a different type than the block data you are trying to access!)" ) #endif #ifdef __IBMCPP__ - return NULL; // never reached, helps to suppress a warning from the IBM compiler + return nullptr; // never reached, helps to suppress a warning from the IBM compiler #endif } @@ -212,8 +212,6 @@ public: friend class BlockStorage; friend class StructuredBlockStorage; -public: - virtual const IBlockID& getId() const = 0; bool operator==( const IBlock& rhs ) const; @@ -466,7 +464,7 @@ inline const T* IBlock::uncheckedFastGetData( const ConstBlockDataID & index ) c WALBERLA_ASSERT_LESS( uint_t( index ), data_.size() ); if( data_[index] == nullptr ) - return NULL; + return nullptr; return data_[index]->template uncheckedFastGet< T >(); } diff --git a/src/domain_decomposition/StructuredBlockStorage.h b/src/domain_decomposition/StructuredBlockStorage.h index 574634255f49d6f969c82a108c98f0dde7582132..146a5eadb5186fb283652c834d4579ca91482e73 100644 --- a/src/domain_decomposition/StructuredBlockStorage.h +++ b/src/domain_decomposition/StructuredBlockStorage.h @@ -284,9 +284,9 @@ public: - real_t dx( 
const uint_t level = 0 ) const { WALBERLA_ASSERT_LESS( level, dx_.size() ); return dx_[ level ]; } ///< cell size on level "level" in x direction - real_t dy( const uint_t level = 0 ) const { WALBERLA_ASSERT_LESS( level, dy_.size() ); return dy_[ level ]; } ///< cell size on level "level" in y direction - real_t dz( const uint_t level = 0 ) const { WALBERLA_ASSERT_LESS( level, dz_.size() ); return dz_[ level ]; } ///< cell size on level "level" in z direction + real_t dx( const uint_t level = 0 ) const { WALBERLA_ASSERT_LESS( level, dx_.size() ) return dx_[ level ]; } ///< cell size on level "level" in x direction + real_t dy( const uint_t level = 0 ) const { WALBERLA_ASSERT_LESS( level, dy_.size() ) return dy_[ level ]; } ///< cell size on level "level" in y direction + real_t dz( const uint_t level = 0 ) const { WALBERLA_ASSERT_LESS( level, dz_.size() ) return dz_[ level ]; } ///< cell size on level "level" in z direction void mapToPeriodicDomain( Cell& cell, const uint_t level = 0 ) const; // -> for documentation of this function see StructuredBlockStorage.cpp @@ -354,7 +354,7 @@ public: /// Returns the block data ID required for accessing the cell bounding box of blocks - fails in debug mode if no block cell bounding boxes /// have been created via "createCellBoundingBoxes()". (remember: every block resides on exactly one grid level, and all blocks managed by a // structured block storage are assigned a corresponding cell bounding box as block data once "createCellBoundingBoxes()" is called.) 
- inline ConstBlockDataID getBlockCellBBId() const { WALBERLA_ASSERT( blockCellBBCreated_ ); return blockCellBBId_; } + inline ConstBlockDataID getBlockCellBBId() const { WALBERLA_ASSERT( blockCellBBCreated_ ) return blockCellBBId_; } inline const CellInterval& getBlockCellBB( const IBlock& block ) const; @@ -488,7 +488,7 @@ inline bool StructuredBlockStorage::operator==( const StructuredBlockStorage& rh inline const CellInterval& StructuredBlockStorage::getDomainCellBB( const uint_t level ) const { - WALBERLA_ASSERT_LESS( level, domainCellBB_.size() ); + WALBERLA_ASSERT_LESS( level, domainCellBB_.size() ) return domainCellBB_[ level ]; } @@ -497,7 +497,7 @@ inline const CellInterval& StructuredBlockStorage::getDomainCellBB( const uint_t inline uint_t StructuredBlockStorage::getNumberOfXCells( const uint_t level ) const { - WALBERLA_ASSERT_LESS( level, domainCellBB_.size() ); + WALBERLA_ASSERT_LESS( level, domainCellBB_.size() ) return uint_c( domainCellBB_[ level ].xMax() + 1 ); } @@ -506,7 +506,7 @@ inline uint_t StructuredBlockStorage::getNumberOfXCells( const uint_t level ) co inline uint_t StructuredBlockStorage::getNumberOfYCells( const uint_t level ) const { - WALBERLA_ASSERT_LESS( level, domainCellBB_.size() ); + WALBERLA_ASSERT_LESS( level, domainCellBB_.size() ) return uint_c( domainCellBB_[ level ].yMax() + 1 ); } @@ -515,7 +515,7 @@ inline uint_t StructuredBlockStorage::getNumberOfYCells( const uint_t level ) co inline uint_t StructuredBlockStorage::getNumberOfZCells( const uint_t level ) const { - WALBERLA_ASSERT_LESS( level, domainCellBB_.size() ); + WALBERLA_ASSERT_LESS( level, domainCellBB_.size() ) return uint_c( domainCellBB_[ level ].zMax() + 1 ); } @@ -524,8 +524,8 @@ inline uint_t StructuredBlockStorage::getNumberOfZCells( const uint_t level ) co inline uint_t StructuredBlockStorage::getNumberOfCells( const uint_t index, const uint_t level ) const { - WALBERLA_ASSERT_LESS( index, uint_t(3) ); - WALBERLA_ASSERT_LESS( level, domainCellBB_.size() 
); + WALBERLA_ASSERT_LESS( index, uint_t(3) ) + WALBERLA_ASSERT_LESS( level, domainCellBB_.size() ) return uint_c( domainCellBB_[ level ].max()[ index ] + 1 ); } @@ -689,7 +689,9 @@ inline AABB StructuredBlockStorage::getAABBFromCellBB( const CellInterval& cellB //********************************************************************************************************************** inline const IBlock* StructuredBlockStorage::getBlock( const Cell& cell, const uint_t level ) const { - real_t x, y, z; + real_t x; + real_t y; + real_t z; getCellCenter( x, y, z, cell, level ); const IBlock* block = blockStorage_->getBlock(x,y,z); @@ -712,7 +714,9 @@ inline const IBlock* StructuredBlockStorage::getBlock( const Cell& cell, const u //********************************************************************************************************************** inline IBlock* StructuredBlockStorage::getBlock( const Cell& cell, const uint_t level ) { - real_t x, y, z; + real_t x; + real_t y; + real_t z; getCellCenter( x, y, z, cell, level ); IBlock* block = blockStorage_->getBlock(x,y,z); @@ -736,8 +740,8 @@ inline IBlock* StructuredBlockStorage::getBlock( const Cell& cell, const uint_t //********************************************************************************************************************** inline const CellInterval& StructuredBlockStorage::getBlockCellBB( const IBlock& block ) const { - WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ); - WALBERLA_ASSERT( blockCellBBCreated_ ); + WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ) + WALBERLA_ASSERT( blockCellBBCreated_ ) return *(block.uncheckedFastGetData< CellInterval >( blockCellBBId_ )); } @@ -769,12 +773,12 @@ inline Cell StructuredBlockStorage::getBlockLocalCell( const IBlock& block, cons //********************************************************************************************************************** inline void StructuredBlockStorage::getBlockLocalCell( Cell& 
localCell, const IBlock& block, const real_t x, const real_t y, const real_t z ) const { - WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ); + WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ) const AABB & aabb = block.getAABB(); const uint_t level = getLevel( block ); - WALBERLA_ASSERT_LESS( level, levels_ ); + WALBERLA_ASSERT_LESS( level, levels_ ) localCell.x() = cell_idx_c( std::floor( ( x - aabb.xMin() ) / dx( level ) ) ); localCell.y() = cell_idx_c( std::floor( ( y - aabb.yMin() ) / dy( level ) ) ); @@ -805,12 +809,12 @@ inline Vector3< real_t > StructuredBlockStorage::getBlockLocalCellCenter( const //********************************************************************************************************************** inline void StructuredBlockStorage::getBlockLocalCellCenter( const IBlock & block, const Cell & localCell, real_t & x, real_t & y, real_t & z ) const { - WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ); + WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ) const AABB & aabb = block.getAABB(); const uint_t level = getLevel( block ); - WALBERLA_ASSERT_LESS( level, levels_ ); + WALBERLA_ASSERT_LESS( level, levels_ ) x = aabb.xMin() + ( real_c( localCell.x() ) + real_c(0.5) ) * dx( level ); y = aabb.yMin() + ( real_c( localCell.y() ) + real_c(0.5) ) * dy( level ); @@ -842,12 +846,12 @@ inline AABB StructuredBlockStorage::getBlockLocalCellAABB( const IBlock & block, //********************************************************************************************************************** inline void StructuredBlockStorage::getBlockLocalCellAABB( const IBlock & block, const Cell & localCell, AABB & aabb ) const { - WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ); + WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ) const AABB& blockAABB = block.getAABB(); const uint_t level = getLevel( block ); - 
WALBERLA_ASSERT_LESS( level, levels_ ); + WALBERLA_ASSERT_LESS( level, levels_ ) const real_t x = blockAABB.xMin() + real_c( localCell.x() ) * dx( level ); const real_t y = blockAABB.yMin() + real_c( localCell.y() ) * dy( level ); @@ -866,7 +870,7 @@ inline void StructuredBlockStorage::getBlockLocalCellAABB( const IBlock & block, //********************************************************************************************************************** inline void StructuredBlockStorage::transformGlobalToBlockLocal( Vector3<real_t> & local, const IBlock& block, const Vector3<real_t> & global ) const { - WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ); + WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ) const uint_t level = getLevel( block ); @@ -886,7 +890,7 @@ inline void StructuredBlockStorage::transformGlobalToBlockLocal( Vector3<real_t> //********************************************************************************************************************** inline void StructuredBlockStorage::transformGlobalToBlockLocal( Vector3<real_t> & point, const IBlock& block ) const { - WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ); + WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ) const uint_t level = getLevel( block ); @@ -906,7 +910,7 @@ inline void StructuredBlockStorage::transformGlobalToBlockLocal( Vector3<real_t> //********************************************************************************************************************** inline void StructuredBlockStorage::transformBlockLocalToGlobal( Vector3<real_t> & global, const IBlock& block, const Vector3<real_t> & local ) const { - WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ); + WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ) const uint_t level = getLevel( block ); @@ -927,7 +931,7 @@ inline void StructuredBlockStorage::transformBlockLocalToGlobal( Vector3<real_t> 
//********************************************************************************************************************** inline void StructuredBlockStorage::transformBlockLocalToGlobal( Vector3<real_t> & point, const IBlock& block ) const { - WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ); + WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ) const uint_t level = getLevel( block ); @@ -949,7 +953,7 @@ inline void StructuredBlockStorage::transformBlockLocalToGlobal( Vector3<real_t> //********************************************************************************************************************** inline void StructuredBlockStorage::transformGlobalToBlockLocalCell( Cell& local, const IBlock& block, const Cell& global ) const { - WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ); + WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ) const CellInterval& cellBB = getBlockCellBB( block ); @@ -981,7 +985,7 @@ inline void StructuredBlockStorage::transformGlobalToBlockLocalCell( Cell& cell, //********************************************************************************************************************** inline void StructuredBlockStorage::transformBlockLocalToGlobalCell( Cell& global, const IBlock& block, const Cell& local ) const { - WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ); + WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ) const CellInterval& cellBB = getBlockCellBB( block ); diff --git a/src/field/AddToStorage.h b/src/field/AddToStorage.h index 9477c4ff7c0b76e876b6e5fc0fe7da08eb36a7bc..d1ef11d921e4e305ada51f590a0a1035be5d6ec5 100644 --- a/src/field/AddToStorage.h +++ b/src/field/AddToStorage.h @@ -163,6 +163,24 @@ BlockDataID addToStorage( const shared_ptr< BlockStorage_T > & blocks, } +template< typename GhostLayerField_T, typename BlockStorage_T > +BlockDataID addToStorage( const shared_ptr< BlockStorage_T > & 
blocks, + const std::string & identifier, + const typename GhostLayerField_T::value_type & initValue, + const Layout layout, + const uint_t nrOfGhostLayers, + const shared_ptr< field::FieldAllocator<typename GhostLayerField_T::value_type> > alloc) +{ + auto alwaysInitialize = false; + auto initFunction = std::function< void ( GhostLayerField_T * field, IBlock * const block ) >(); + auto requiredSelectors = Set<SUID>::emptySet(); + auto incompatibleSelectors = Set<SUID>::emptySet(); + auto calculateSize = internal::defaultSize; + + return internal::AddToStorage< GhostLayerField_T, BlockStorage_T >::add( blocks, identifier, initValue, layout, nrOfGhostLayers, + alwaysInitialize, initFunction, requiredSelectors, incompatibleSelectors, calculateSize, alloc ); +} + template< typename GhostLayerField_T, typename BlockStorage_T > BlockDataID addToStorage( const shared_ptr< BlockStorage_T > & blocks, @@ -173,9 +191,9 @@ BlockDataID addToStorage( const shared_ptr< BlockStorage_T > & blocks, const bool alwaysInitialize, const Set<SUID> & requiredSelectors, const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) { + auto initFunction = std::function< void ( GhostLayerField_T * field, IBlock * const block ) >(); return addToStorage< GhostLayerField_T >( blocks, identifier, initValue, layout, nrOfGhostLayers, alwaysInitialize, - std::function< void ( GhostLayerField_T * field, IBlock * const block ) >(), - requiredSelectors, incompatibleSelectors ); + initFunction, requiredSelectors, incompatibleSelectors ); } @@ -210,9 +228,9 @@ BlockDataID addToStorage( const shared_ptr< BlockStorage_T > & blocks, const bool alwaysInitialize, const Set<SUID> & requiredSelectors, const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) { + auto initFunction = std::function< void ( GhostLayerField_T * field, IBlock * const block ) >(); return addToStorage< GhostLayerField_T >( blocks, identifier, initValue, layout, nrOfGhostLayers, alwaysInitialize, - std::function< void ( 
GhostLayerField_T * field, IBlock * const block ) >(), - requiredSelectors, incompatibleSelectors, calculateSize ); + initFunction, requiredSelectors, incompatibleSelectors, calculateSize ); } diff --git a/src/field/GhostLayerField.h b/src/field/GhostLayerField.h index f5fe40fe57e18e042c61c74f126c7d6792ebe11c..345d497cdc4c04a789d46d391b909f3fb0322aa5 100644 --- a/src/field/GhostLayerField.h +++ b/src/field/GhostLayerField.h @@ -202,6 +202,20 @@ namespace field { //@} //**************************************************************************************************************** + //** TimestepInformation ***************************************************************************************** + /*! \name TimestepCounter */ + //@{ + inline uint8_t advanceTimestep() + { + timestepCounter_ = (timestepCounter_ + 1) & 1; + return timestepCounter_; + } + inline uint8_t getTimestep() const { return timestepCounter_; } + inline uint8_t getTimestepPlusOne() const { return (timestepCounter_ + 1) & 1; } + inline bool isEvenTimeStep() const {return (((timestepCounter_) &1) ^ 1); } + //@} + //**************************************************************************************************************** + protected: GhostLayerField( ); @@ -221,6 +235,8 @@ namespace field { template <typename T2, uint_t fSize2> friend class GhostLayerField; + + uint8_t timestepCounter_; }; } // namespace field diff --git a/src/field/GhostLayerField.impl.h b/src/field/GhostLayerField.impl.h index d594274e78e27034b7a3d495e02a49dcaf8da28c..1a8b758ca7a9f717ed0786c4edbb7ed24195a410 100644 --- a/src/field/GhostLayerField.impl.h +++ b/src/field/GhostLayerField.impl.h @@ -44,7 +44,7 @@ namespace field { *******************************************************************************************************************/ template<typename T, uint_t fSize_> GhostLayerField<T,fSize_>::GhostLayerField( ) - : gl_(0) + : gl_(0), timestepCounter_(0) { } @@ -128,6 +128,7 @@ namespace field { const Layout & l, 
const shared_ptr<FieldAllocator<T> > &alloc) { gl_ = gl; + timestepCounter_ = uint8_c(0); uint_t innerGhostLayerSize = ( l == fzyx ) ? gl : uint_t(0); Field<T,fSize_>::init( _xSize + 2*gl , _ySize + 2*gl, @@ -689,7 +690,7 @@ namespace field { template<typename T, uint_t fSize_> GhostLayerField<T,fSize_>::GhostLayerField(const GhostLayerField<T,fSize_> & other) : Field<T,fSize_>::Field(other), - gl_( other.gl_ ) + gl_( other.gl_ ), timestepCounter_( other.timestepCounter_ ) { } @@ -700,7 +701,7 @@ namespace field { template <typename T2, uint_t fSize2> GhostLayerField<T,fSize_>::GhostLayerField(const GhostLayerField<T2,fSize2> & other) : Field<T,fSize_>::Field(other), - gl_( other.gl_ ) + gl_( other.gl_ ), timestepCounter_( other.timestepCounter_ ) { } @@ -756,11 +757,11 @@ namespace field { // Assert that there is still space for ghost-layers after slicing - WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->xOff()), gl_ ); + WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->xOff()), gl_ ) WALBERLA_ASSERT_GREATER_EQUAL( this->xAllocSize() - uint_c(this->xOff()) - this->xSize(), gl_ ); - WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->yOff()), gl_ ); + WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->yOff()), gl_ ) WALBERLA_ASSERT_GREATER_EQUAL( this->yAllocSize() - uint_c(this->yOff()) - this->ySize(), gl_ ); - WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->zOff()), gl_ ); + WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->zOff()), gl_ ) WALBERLA_ASSERT_GREATER_EQUAL( this->zAllocSize() - uint_c(this->zOff()) - this->zSize(), gl_ ); } @@ -771,11 +772,11 @@ namespace field { Field<T,fSize_>::shiftCoordinates( cx, cy, cz ); // Assert that there is still space for ghost-layers after slicing - WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->xOff()), gl_ ); + WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->xOff()), gl_ ) WALBERLA_ASSERT_GREATER_EQUAL( this->xAllocSize() - uint_c(this->xOff()) - this->xSize(), gl_ ); - WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->yOff()), gl_ ); + 
WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->yOff()), gl_ ) WALBERLA_ASSERT_GREATER_EQUAL( this->yAllocSize() - uint_c(this->yOff()) - this->ySize(), gl_ ); - WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->zOff()), gl_ ); + WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->zOff()), gl_ ) WALBERLA_ASSERT_GREATER_EQUAL( this->zAllocSize() - uint_c(this->zOff()) - this->zSize(), gl_ ); } diff --git a/src/field/blockforest/BlockDataHandling.h b/src/field/blockforest/BlockDataHandling.h index c01306d4079bb68e2d59c5c3ae516947808b6575..5113b895a881e3c7ba29c4f541c4801a4d6f6dfe 100644 --- a/src/field/blockforest/BlockDataHandling.h +++ b/src/field/blockforest/BlockDataHandling.h @@ -92,12 +92,12 @@ protected: void sizeCheck( const uint_t xSize, const uint_t ySize, const uint_t zSize ) { - WALBERLA_CHECK( (xSize & uint_t(1)) == uint_t(0), "The x-size of your field must be divisible by 2." ); - WALBERLA_CHECK( (ySize & uint_t(1)) == uint_t(0), "The y-size of your field must be divisible by 2." ); + WALBERLA_CHECK( (xSize & uint_t(1)) == uint_t(0), "The x-size of your field must be divisible by 2." ) + WALBERLA_CHECK( (ySize & uint_t(1)) == uint_t(0), "The y-size of your field must be divisible by 2." ) if( Pseudo2D ) - { WALBERLA_CHECK( zSize == uint_t(1), "The z-size of your field must be equal to 1 (pseudo 2D mode)." ); } + { WALBERLA_CHECK( zSize == uint_t(1), "The z-size of your field must be equal to 1 (pseudo 2D mode)." ) } else - { WALBERLA_CHECK( (zSize & uint_t(1)) == uint_t(0), "The z-size of your field must be divisible by 2." ); } + { WALBERLA_CHECK( (zSize & uint_t(1)) == uint_t(0), "The z-size of your field must be divisible by 2." 
) } } InitializationFunction_T initFunction_; @@ -110,7 +110,7 @@ template< typename Field_T, bool Pseudo2D > inline void BlockDataHandling< Field_T, Pseudo2D >::serialize( IBlock * const block, const BlockDataID & id, mpi::SendBuffer & buffer ) { Field_T * field = block->template getData< Field_T >(id); - WALBERLA_ASSERT_NOT_NULLPTR( field ); + WALBERLA_ASSERT_NOT_NULLPTR( field ) #ifndef NDEBUG buffer << field->xSize() << field->ySize() << field->zSize() << field->fSize(); @@ -126,7 +126,7 @@ template< typename Field_T, bool Pseudo2D > void BlockDataHandling< Field_T, Pseudo2D >::serializeCoarseToFine( Block * const block, const BlockDataID & id, mpi::SendBuffer & buffer, const uint_t child ) { Field_T * field = block->template getData< Field_T >(id); - WALBERLA_ASSERT_NOT_NULLPTR( field ); + WALBERLA_ASSERT_NOT_NULLPTR( field ) const uint_t xSize = field->xSize(); const uint_t ySize = field->ySize(); @@ -161,7 +161,7 @@ template< typename Field_T, bool Pseudo2D > void BlockDataHandling< Field_T, Pseudo2D >::serializeFineToCoarse( Block * const block, const BlockDataID & id, mpi::SendBuffer & buffer ) { Field_T * field = block->template getData< Field_T >(id); - WALBERLA_ASSERT_NOT_NULLPTR( field ); + WALBERLA_ASSERT_NOT_NULLPTR( field ) const uint_t xSize = field->xSize(); const uint_t ySize = field->ySize(); @@ -210,10 +210,10 @@ inline void BlockDataHandling< Field_T, Pseudo2D >::deserialize( IBlock * const uint_t zSender( uint_t(0) ); uint_t fSender( uint_t(0) ); buffer >> xSender >> ySender >> zSender >> fSender; - WALBERLA_ASSERT_EQUAL( xSender, field->xSize() ); - WALBERLA_ASSERT_EQUAL( ySender, field->ySize() ); - WALBERLA_ASSERT_EQUAL( zSender, field->zSize() ); - WALBERLA_ASSERT_EQUAL( fSender, field->fSize() ); + WALBERLA_ASSERT_EQUAL( xSender, field->xSize() ) + WALBERLA_ASSERT_EQUAL( ySender, field->ySize() ) + WALBERLA_ASSERT_EQUAL( zSender, field->zSize() ) + WALBERLA_ASSERT_EQUAL( fSender, field->fSize() ) #endif for( auto it = field->begin(); it 
!= field->end(); ++it ) @@ -240,14 +240,14 @@ void BlockDataHandling< Field_T, Pseudo2D >::deserializeCoarseToFine( Block * co uint_t zSender( uint_t(0) ); uint_t fSender( uint_t(0) ); buffer >> branchId >> xSender >> ySender >> zSender >> fSender; - WALBERLA_ASSERT_EQUAL( branchId, block->getId().getBranchId() ); - WALBERLA_ASSERT_EQUAL( xSender, xSize / uint_t(2) ); - WALBERLA_ASSERT_EQUAL( ySender, ySize / uint_t(2) ); + WALBERLA_ASSERT_EQUAL( branchId, block->getId().getBranchId() ) + WALBERLA_ASSERT_EQUAL( xSender, xSize / uint_t(2) ) + WALBERLA_ASSERT_EQUAL( ySender, ySize / uint_t(2) ) if( Pseudo2D ) - { WALBERLA_ASSERT_EQUAL( zSender, zSize ); } + { WALBERLA_ASSERT_EQUAL( zSender, zSize ) } else - { WALBERLA_ASSERT_EQUAL( zSender, zSize / uint_t(2) ); } - WALBERLA_ASSERT_EQUAL( fSender, fSize ); + { WALBERLA_ASSERT_EQUAL( zSender, zSize / uint_t(2) ) } + WALBERLA_ASSERT_EQUAL( fSender, fSize ) #endif for( cell_idx_t z = cell_idx_t(0); z < cell_idx_c( zSize ); z += cell_idx_t(2) ) { @@ -295,14 +295,14 @@ void BlockDataHandling< Field_T, Pseudo2D >::deserializeFineToCoarse( Block * co uint_t zSender( uint_t(0) ); uint_t fSender( uint_t(0) ); buffer >> branchId >> xSender >> ySender >> zSender >> fSender; - WALBERLA_ASSERT_EQUAL( branchId, child ); - WALBERLA_ASSERT_EQUAL( xSender, xSize / uint_t(2) ); - WALBERLA_ASSERT_EQUAL( ySender, ySize / uint_t(2) ); + WALBERLA_ASSERT_EQUAL( branchId, child ) + WALBERLA_ASSERT_EQUAL( xSender, xSize / uint_t(2) ) + WALBERLA_ASSERT_EQUAL( ySender, ySize / uint_t(2) ) if( Pseudo2D ) - { WALBERLA_ASSERT_EQUAL( zSender, zSize ); } + { WALBERLA_ASSERT_EQUAL( zSender, zSize ) } else - { WALBERLA_ASSERT_EQUAL( zSender, zSize / uint_t(2) ); } - WALBERLA_ASSERT_EQUAL( fSender, fSize ); + { WALBERLA_ASSERT_EQUAL( zSender, zSize / uint_t(2) ) } + WALBERLA_ASSERT_EQUAL( fSender, fSize ) #endif const cell_idx_t zBegin = Pseudo2D ? cell_idx_t(0) : ( (child & uint_t(4)) ? 
( cell_idx_c( zSize ) / cell_idx_t(2) ) : cell_idx_t(0) ); @@ -437,7 +437,7 @@ protected: GhostLayerField_T * allocate( IBlock * const block ) override { auto blocks = blocks_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( blocks, "Trying to access 'DefaultBlockDataHandling' for a block storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( blocks, "Trying to access 'DefaultBlockDataHandling' for a block storage object that doesn't exist anymore" ) const Vector3< uint_t > size = calculateSize_( blocks, block ); return internal::allocate< GhostLayerField_T >( size[0], size[1], size[2], nrOfGhostLayers_, initValue_, layout_, alloc_ ); diff --git a/src/field/communication/StencilRestrictedPackInfo.h b/src/field/communication/StencilRestrictedPackInfo.h index b82050340a359f0d8c91bb96343a4f0198116c7d..3ad7f9da2fcf231d2548b4dd2bdafd060269bec8 100644 --- a/src/field/communication/StencilRestrictedPackInfo.h +++ b/src/field/communication/StencilRestrictedPackInfo.h @@ -73,10 +73,10 @@ void StencilRestrictedPackInfo<GhostLayerField_T, Stencil>::unpackData( IBlock * return; GhostLayerField_T * pdfField = receiver->getData< GhostLayerField_T >( fieldId_ ); - WALBERLA_ASSERT_NOT_NULLPTR( pdfField ); - WALBERLA_ASSERT_EQUAL( pdfField->nrOfGhostLayers(), 1 ); + WALBERLA_ASSERT_NOT_NULLPTR( pdfField ) + WALBERLA_ASSERT_EQUAL( pdfField->nrOfGhostLayers(), 1 ) - stencil::Direction packerDirection = stencil::inverseDir[dir]; + stencil::Direction const packerDirection = stencil::inverseDir[dir]; for(auto i = pdfField->beginGhostLayerOnlyXYZ(dir); i != pdfField->end(); ++i ) for(uint_t f = 0; f < Stencil::d_per_d_length[packerDirection]; ++f) @@ -94,7 +94,7 @@ void StencilRestrictedPackInfo<GhostLayerField_T, Stencil>::communicateLocal( co const GhostLayerField_T * sf = sender ->getData< GhostLayerField_T >( fieldId_ ); GhostLayerField_T * rf = receiver->getData< GhostLayerField_T >( fieldId_ ); - WALBERLA_ASSERT_EQUAL( sf->xyzSize(), rf->xyzSize() ); + 
WALBERLA_ASSERT_EQUAL( sf->xyzSize(), rf->xyzSize() ) typename GhostLayerField_T::const_iterator srcIter = sf->beginSliceBeforeGhostLayerXYZ(dir); typename GhostLayerField_T::iterator dstIter = rf->beginGhostLayerOnlyXYZ(stencil::inverseDir[dir]); @@ -107,8 +107,8 @@ void StencilRestrictedPackInfo<GhostLayerField_T, Stencil>::communicateLocal( co ++srcIter; ++dstIter; } - WALBERLA_ASSERT( srcIter == sf->end() ); - WALBERLA_ASSERT( dstIter == rf->end() ); + WALBERLA_ASSERT( srcIter == sf->end() ) + WALBERLA_ASSERT( dstIter == rf->end() ) } @@ -120,8 +120,8 @@ void StencilRestrictedPackInfo<GhostLayerField_T, Stencil>::packDataImpl( const return; const GhostLayerField_T * pdfField = sender->getData< GhostLayerField_T >( fieldId_ ); - WALBERLA_ASSERT_NOT_NULLPTR( pdfField ); - WALBERLA_ASSERT_EQUAL( pdfField->nrOfGhostLayers(), 1 ); + WALBERLA_ASSERT_NOT_NULLPTR( pdfField ) + WALBERLA_ASSERT_EQUAL( pdfField->nrOfGhostLayers(), 1 ) for( auto i = pdfField->beginSliceBeforeGhostLayerXYZ(dir); i != pdfField->end(); ++i ) for(uint_t f = 0; f < Stencil::d_per_d_length[dir]; ++f) diff --git a/src/geometry/InitBoundaryHandling.h b/src/geometry/InitBoundaryHandling.h index fe6817d3ead08c2b7bb144f9564dd8ea8d03809d..bcea3de9305e52f37a43cb11dec6f8ea73e6ff49 100644 --- a/src/geometry/InitBoundaryHandling.h +++ b/src/geometry/InitBoundaryHandling.h @@ -141,6 +141,21 @@ void setNonBoundaryCellsToDomain( StructuredBlockStorage & blocks, BlockDataID f } } +template<typename FlagField_T> +void setNonBoundaryCellsToDomain( StructuredBlockStorage & blocks, BlockDataID flagFieldID, + field::FlagUID fluidFlagID, cell_idx_t numGhostLayers) +{ + for( auto blockIt = blocks.begin(); blockIt != blocks.end(); ++blockIt ) + { + auto flagField = blockIt->template getData<FlagField_T>( flagFieldID ); + auto fluidFlag = flagField->getOrRegisterFlag(fluidFlagID); + for( auto it = flagField->beginWithGhostLayerXYZ(numGhostLayers); it != flagField->end(); ++it ) + if ( *it == 0 ) + addFlag(it, 
fluidFlag); + } +} + + } // namespace geometry } // namespace walberla diff --git a/src/gpu/AddGPUFieldToStorage.impl.h b/src/gpu/AddGPUFieldToStorage.impl.h index e016f93fb47c34f0073d87813c96506bf11f58f8..610b853265cf7b94ad5be81bb1bd9444ce2b008b 100644 --- a/src/gpu/AddGPUFieldToStorage.impl.h +++ b/src/gpu/AddGPUFieldToStorage.impl.h @@ -93,5 +93,3 @@ namespace gpu } // namespace gpu } // namespace walberla - - diff --git a/src/gpu/CMakeLists.txt b/src/gpu/CMakeLists.txt index 1790af12b470b84a9559de60931b88523956a694..a8e58ab49e46aac0914d9b5b8f482855f9b50d2a 100644 --- a/src/gpu/CMakeLists.txt +++ b/src/gpu/CMakeLists.txt @@ -35,7 +35,7 @@ target_sources( gpu FieldIndexing.impl.h Kernel.h ParallelStreams.h - CudaRAII.h + GPURAII.h DeviceSelectMPI.cpp ) diff --git a/src/gpu/ErrorChecking.h b/src/gpu/ErrorChecking.h index 0d1316eccf17130e717824559a2f0e3051c834b2..a80ef03343712257efcfc512b2fa7062bbcab21c 100644 --- a/src/gpu/ErrorChecking.h +++ b/src/gpu/ErrorChecking.h @@ -28,16 +28,18 @@ #include "gpu/GPUWrapper.h" namespace walberla { -namespace gpu -{ +namespace gpu { #define WALBERLA_GPU_CHECK(ans) { ::walberla::gpu::checkForError((ans), __FILE__, __LINE__); } +#define WALBERLA_GPU_CHECK_LAST_ERROR() {::walberla::gpu::checkForLastError(__FILE__, __LINE__);} inline void checkForError( gpuError_t code, const std::string & callerPath, const int line ) { + // GPU runtime functions often return an error code if an error has occurred. This function converts the error code into human-readable output. + // For general error checking use checkForLastError if(code != gpuSuccess) { std::stringstream ss; @@ -46,6 +48,21 @@ inline void checkForError( gpuError_t code, const std::string & callerPath, cons } } +#ifndef NDEBUG +inline void checkForLastError( const std::string & callerPath, const int line ) +{ + // Forces an immediate, synchronizing error check. This breaks the asynchrony/concurrency structure, so it is only executed in debug mode.
+ gpuError_t code = gpuGetLastError(); + if(code != gpuSuccess) + { + std::stringstream ss; + ss << "CUDA Error: " << code << " " << cudaGetErrorName(code) << ": " << cudaGetErrorString( code ); + Abort::instance()->abort( ss.str(), callerPath, line ); + } +} +#else +inline void checkForLastError( const std::string & /*callerPath*/, const int /*line*/ ){} +#endif } // namespace gpu diff --git a/src/gpu/GPUField.h b/src/gpu/GPUField.h index a286b8dca2cca9a49d2a93455e865744efe649b5..ecc9ccc5b1dfff0468c676b1262247a9df36add9 100755 --- a/src/gpu/GPUField.h +++ b/src/gpu/GPUField.h @@ -125,9 +125,9 @@ namespace gpu bool operator==( const GPUField & other ) const; void getGhostRegion( stencil::Direction d, CellInterval & ci, - cell_idx_t thickness, bool fullSlice ) const; + cell_idx_t thickness, bool fullSlice = false ) const; void getSliceBeforeGhostLayer(stencil::Direction d, CellInterval & ci, - cell_idx_t thickness, bool fullSlice ) const + cell_idx_t thickness, bool fullSlice = false ) const { getSlice( d, ci, 0, thickness, fullSlice ); } @@ -140,6 +140,20 @@ namespace gpu T * dataAt(cell_idx_t x, cell_idx_t y, cell_idx_t z, cell_idx_t f); const T * dataAt(cell_idx_t x, cell_idx_t y, cell_idx_t z, cell_idx_t f) const; + //** TimestepInformation ***************************************************************************************** + /*! 
\name TimestepCounter */ + //@{ + inline uint8_t advanceTimestep() + { + timestepCounter_ = (timestepCounter_ + 1) & 1; + return timestepCounter_; + } + inline uint8_t getTimestep() const { return timestepCounter_; } + inline uint8_t getTimestepPlusOne() const { return (timestepCounter_ + 1) & 1; } + inline bool isEvenTimeStep() const {return (((timestepCounter_) &1) ^ 1); } + //@} + //**************************************************************************************************************** + protected: gpuPitchedPtr pitchedPtr_; uint_t nrOfGhostLayers_; @@ -152,6 +166,7 @@ namespace gpu uint_t fAllocSize_; Layout layout_; bool usePitchedMem_; + uint8_t timestepCounter_; }; diff --git a/src/gpu/GPUField.impl.h b/src/gpu/GPUField.impl.h index 221440f5c953485f7bf45b30730980d36837cf3a..9c1242aa92dcecf30ff0a1520faf151723ce2fd1 100644 --- a/src/gpu/GPUField.impl.h +++ b/src/gpu/GPUField.impl.h @@ -34,7 +34,7 @@ GPUField<T>::GPUField( uint_t _xSize, uint_t _ySize, uint_t _zSize, uint_t _fSiz uint_t _nrOfGhostLayers, const Layout & _layout, bool usePitchedMem ) : nrOfGhostLayers_( _nrOfGhostLayers ), xSize_( _xSize), ySize_( _ySize ), zSize_( _zSize ), fSize_( _fSize ), - layout_( _layout ), usePitchedMem_( usePitchedMem ) + layout_( _layout ), usePitchedMem_( usePitchedMem ), timestepCounter_(0) { gpuExtent extent; if ( layout_ == zyxf ) @@ -61,12 +61,13 @@ GPUField<T>::GPUField( uint_t _xSize, uint_t _ySize, uint_t _zSize, uint_t _fSiz } else { - pitchedPtr_ = make_gpuPitchedPtr( nullptr, extent.width, extent.width, extent.height ); + pitchedPtr_ = make_gpuPitchedPtr(nullptr, extent.width, extent.width, extent.height ); WALBERLA_GPU_CHECK ( gpuMalloc( &pitchedPtr_.ptr, extent.width * extent.height * extent.depth ) ) } - // allocation size is stored in pitched pointer which stores the amount of padded region in bytes - // we keep track of the size in #elements + // allocation size is stored in pitched pointer + // pitched pointer stores the amount of padded region 
in bytes + // but we keep track of the size in #elements WALBERLA_ASSERT_EQUAL( pitchedPtr_.pitch % sizeof(T), 0 ) if ( layout_ == field::fzyx ) { diff --git a/src/gpu/CudaRAII.h b/src/gpu/GPURAII.h similarity index 100% rename from src/gpu/CudaRAII.h rename to src/gpu/GPURAII.h diff --git a/src/gpu/GPUWrapper.h b/src/gpu/GPUWrapper.h index 1abbc22895d7b9d6284c0a2661bcb14ca1421aeb..48fcc2e1064ce32c525eed2c43896195cf059784 100644 --- a/src/gpu/GPUWrapper.h +++ b/src/gpu/GPUWrapper.h @@ -31,6 +31,7 @@ #define gpuGetErrorName cudaGetErrorName #define gpuGetErrorString cudaGetErrorString #define gpuPeekAtLastError cudaPeekAtLastError + #define gpuGetLastError cudaGetLastError #define gpuMalloc cudaMalloc #define gpuMallocHost cudaMallocHost @@ -87,6 +88,7 @@ #define gpuGetErrorName hipGetErrorName #define gpuGetErrorString hipGetErrorString #define gpuPeekAtLastError hipPeekAtLastError + #define gpuGetLastError hipGetLastError #define gpuMalloc hipMalloc #define gpuMallocHost hipHostMalloc diff --git a/src/gpu/HostFieldAllocator.h b/src/gpu/HostFieldAllocator.h index 98892aebadc7fd1c4b03ffc38f16bd26631e4123..2b7311addf3a8a7dce8c5804d9bf3ecaee9a7501 100644 --- a/src/gpu/HostFieldAllocator.h +++ b/src/gpu/HostFieldAllocator.h @@ -36,7 +36,7 @@ namespace gpu * Allocator that allocates a CPU! 
field using gpuHostAlloc without padding * * Uses gpuHostAlloc for the allocation - which allocates page-locked memory that is faster to transfer to the GPU - * This allocator should be used for CPU fields that are often transfered to GPU and back + * This allocator should be used for CPU fields that are often transferred to GPU and back * * \ingroup gpu * diff --git a/src/gpu/ParallelStreams.h b/src/gpu/ParallelStreams.h index fd83932766abfe6d0d177a2a09995a4d98f3ff77..0eca060569adf0e404c510f7847d9348f535df0f 100644 --- a/src/gpu/ParallelStreams.h +++ b/src/gpu/ParallelStreams.h @@ -19,11 +19,11 @@ // //====================================================================================================================== #pragma once -#include "gpu/ErrorChecking.h" -#include "gpu/CudaRAII.h" - #include <vector> +#include "gpu/ErrorChecking.h" +#include "gpu/GPURAII.h" + namespace walberla { namespace gpu { diff --git a/src/gpu/communication/CMakeLists.txt b/src/gpu/communication/CMakeLists.txt index 98bbff2016d5b9d3dd6c26de2774888e1e0cc257..7b9c0cced315353be228779dc3e4dfc96764efc7 100644 --- a/src/gpu/communication/CMakeLists.txt +++ b/src/gpu/communication/CMakeLists.txt @@ -7,5 +7,7 @@ target_sources( gpu GPUPackInfo.h CustomMemoryBuffer.h UniformGPUScheme.h - GeneratedGPUPackInfo.h + NonUniformGPUScheme.h + GeneratedGPUPackInfo.h + GeneratedNonUniformGPUPackInfo.h ) diff --git a/src/gpu/communication/CustomMemoryBuffer.h b/src/gpu/communication/CustomMemoryBuffer.h index 26a6743f3ef4aacb436a5e08ceff69ed68241ef0..e01e873708d84788fcecfb33a83ab3616b07c752 100644 --- a/src/gpu/communication/CustomMemoryBuffer.h +++ b/src/gpu/communication/CustomMemoryBuffer.h @@ -62,7 +62,7 @@ namespace communication { class CustomMemoryBuffer { public: - typedef uint8_t ElementType; + using ElementType = uint8_t; explicit CustomMemoryBuffer(); explicit CustomMemoryBuffer( std::size_t initSize ); @@ -74,6 +74,7 @@ namespace communication { inline std::size_t allocSize() const { 
return std::size_t(end_ - begin_); } inline std::size_t size() const { return std::size_t(cur_ - begin_); } ElementType *ptr() const { return begin_; } + ElementType *cur() const { return cur_; } inline void clear() { cur_ = begin_; } diff --git a/src/gpu/communication/GPUPackInfo.h b/src/gpu/communication/GPUPackInfo.h index 3922690445c7f11fb8375596a9f9f740c076727a..c34600f29b2219088c29b0d5ff2e9fb1dc4a1142 100644 --- a/src/gpu/communication/GPUPackInfo.h +++ b/src/gpu/communication/GPUPackInfo.h @@ -43,10 +43,7 @@ #include "gpu/GPUWrapper.h" #include "gpu/communication/CustomMemoryBuffer.h" -namespace walberla { -namespace gpu -{ -namespace communication { +namespace walberla::gpu::communication { /** @@ -299,8 +296,4 @@ uint_t GPUPackInfo<GPUField_T>::numberOfGhostLayersToCommunicate( const GPUField } } - - -} // namespace communication -} // namespace gpu -} // namespace walberla +} // namespace walberla::gpu::communication diff --git a/src/gpu/communication/GeneratedGPUPackInfo.h b/src/gpu/communication/GeneratedGPUPackInfo.h index 9ca4afb9c575446ea8734d0cbf302819b5160b05..f5f6c98b60b529045a1877a435fcacacb9359a95 100644 --- a/src/gpu/communication/GeneratedGPUPackInfo.h +++ b/src/gpu/communication/GeneratedGPUPackInfo.h @@ -19,28 +19,25 @@ // //====================================================================================================================== - #pragma once #include "domain_decomposition/IBlock.h" -#include "stencil/Directions.h" - #include "gpu/GPUWrapper.h" -namespace walberla { -namespace gpu -{ +#include "stencil/Directions.h" +namespace walberla::gpu { class GeneratedGPUPackInfo { public: + GeneratedGPUPackInfo() = default; + virtual ~GeneratedGPUPackInfo() = default; + virtual void pack ( stencil::Direction dir, unsigned char *buffer, IBlock *block, gpuStream_t stream ) = 0; + virtual void communicateLocal ( stencil::Direction dir, const IBlock *sender, IBlock *receiver, gpuStream_t stream ) = 0; virtual void unpack( stencil::Direction 
dir, unsigned char *buffer, IBlock *block, gpuStream_t stream ) = 0; virtual uint_t size( stencil::Direction dir, IBlock *block ) = 0; }; - - -} //namespace gpu -} //namespace walberla \ No newline at end of file +} //namespace walberla::gpu \ No newline at end of file diff --git a/src/gpu/communication/GeneratedNonUniformGPUPackInfo.h b/src/gpu/communication/GeneratedNonUniformGPUPackInfo.h new file mode 100644 index 0000000000000000000000000000000000000000..f6b39d9b0fe0dd1b9c90c5d63eb7b8ca00bd3d0f --- /dev/null +++ b/src/gpu/communication/GeneratedNonUniformGPUPackInfo.h @@ -0,0 +1,159 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file GeneratedNonUniformGPUPackInfo.h +//! \ingroup gpu +//! 
\author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "blockforest/Block.h" +#include "blockforest/BlockID.h" + +#include "gpu/GPUWrapper.h" +#include "gpu/communication/CustomMemoryBuffer.h" + +#include "stencil/Directions.h" + +using GpuBuffer_T = walberla::gpu::communication::GPUMemoryBuffer; + + +namespace walberla::gpu { + + +class GeneratedNonUniformGPUPackInfo +{ + public: + using VoidFunction = std::function< void( gpuStream_t) >; + GeneratedNonUniformGPUPackInfo() = default; + virtual ~GeneratedNonUniformGPUPackInfo() = default; + + virtual bool constantDataExchange() const = 0; + virtual bool threadsafeReceiving() const = 0; + + inline void packDataEqualLevel( const Block * sender, stencil::Direction dir, GpuBuffer_T & buffer) const; + virtual void unpackDataEqualLevel( Block * receiver, stencil::Direction dir, GpuBuffer_T & buffer) = 0; + virtual void communicateLocalEqualLevel( const Block * sender, Block * receiver, stencil::Direction dir, gpuStream_t stream) = 0; + virtual void getLocalEqualLevelCommFunction( std::vector< VoidFunction >& commFunctions, const Block * sender, Block * receiver, stencil::Direction dir) = 0; + + inline void packDataCoarseToFine ( const Block * coarseSender, const BlockID & fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer) const; + virtual void unpackDataCoarseToFine ( Block * fineReceiver, const BlockID & coarseSender, stencil::Direction dir, GpuBuffer_T & buffer) = 0; + virtual void communicateLocalCoarseToFine( const Block * coarseSender, Block * fineReceiver, stencil::Direction dir ) = 0; + virtual void communicateLocalCoarseToFine( const Block * coarseSender, Block * fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) = 0; + virtual void getLocalCoarseToFineCommFunction( std::vector< VoidFunction >& commFunctions, const Block * 
coarseSender, Block * fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer) = 0; + + inline void packDataFineToCoarse ( const Block * fineSender, const BlockID & coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer) const; + virtual void unpackDataFineToCoarse ( Block * coarseReceiver, const BlockID & fineSender, stencil::Direction dir, GpuBuffer_T & buffer) = 0; + virtual void communicateLocalFineToCoarse( const Block * fineSender, Block * coarseReceiver, stencil::Direction dir) = 0; + virtual void communicateLocalFineToCoarse( const Block * fineSender, Block * coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) = 0; + virtual void getLocalFineToCoarseCommFunction( std::vector< VoidFunction >& commFunctions, const Block * fineSender, Block * coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer) = 0; + + virtual uint_t sizeEqualLevelSend( const Block * sender, stencil::Direction dir) = 0; + virtual uint_t sizeCoarseToFineSend ( const Block * coarseSender, const BlockID & fineReceiver, stencil::Direction dir) = 0; + virtual uint_t sizeFineToCoarseSend ( const Block * fineSender, stencil::Direction dir) = 0; + + +#ifndef NDEBUG + void clearBufferSizeCheckMap() { bufferSize_.clear(); } +#endif + + protected: + virtual void packDataEqualLevelImpl(const Block* sender, stencil::Direction dir, GpuBuffer_T & buffer) const = 0; + virtual void packDataCoarseToFineImpl(const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer) const = 0; + virtual void packDataFineToCoarseImpl(const Block* fineSender, const BlockID& coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer) const = 0; + +#ifndef NDEBUG + mutable std::map< const Block *, std::map< stencil::Direction, std::map< uint_t, size_t > > > bufferSize_; +#endif + +}; + +inline void GeneratedNonUniformGPUPackInfo::packDataEqualLevel( const Block * sender, stencil::Direction dir, GpuBuffer_T & buffer ) const +{ +#ifndef 
NDEBUG + size_t const sizeBefore = buffer.size(); +#endif + + packDataEqualLevelImpl( sender, dir, buffer ); + +#ifndef NDEBUG +size_t const sizeAfter = buffer.size(); +if( constantDataExchange() ) +{ + auto & blockMap = bufferSize_[ sender ]; + auto & sizeMap = blockMap[ dir ]; + auto dirEntry = sizeMap.find( uint_t(0) ); + if( dirEntry == sizeMap.end() ) + sizeMap[ uint_t(0) ] = sizeAfter - sizeBefore; + else + WALBERLA_ASSERT_EQUAL( sizeMap[ uint_t(0) ], (sizeAfter - sizeBefore) ) +} +#endif +} + + + +inline void GeneratedNonUniformGPUPackInfo::packDataCoarseToFine( const Block * coarseSender, const BlockID & fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer ) const +{ +#ifndef NDEBUG + size_t const sizeBefore = buffer.size(); +#endif + + packDataCoarseToFineImpl( coarseSender, fineReceiver, dir, buffer ); + +#ifndef NDEBUG +size_t const sizeAfter = buffer.size(); +if( constantDataExchange() ) +{ + auto & blockMap = bufferSize_[ coarseSender ]; + auto & sizeMap = blockMap[ dir ]; + auto dirEntry = sizeMap.find( fineReceiver.getBranchId() ); + if( dirEntry == sizeMap.end() ) + sizeMap[ fineReceiver.getBranchId() ] = sizeAfter - sizeBefore; + else + WALBERLA_ASSERT_EQUAL( sizeMap[ fineReceiver.getBranchId() ], (sizeAfter - sizeBefore) ) +} +#endif +} + + + +inline void GeneratedNonUniformGPUPackInfo::packDataFineToCoarse( const Block * fineSender, const BlockID & coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer ) const +{ +#ifndef NDEBUG + size_t const sizeBefore = buffer.size(); +#endif + + packDataFineToCoarseImpl( fineSender, coarseReceiver, dir, buffer ); + +#ifndef NDEBUG +size_t const sizeAfter = buffer.size(); +if( constantDataExchange() ) +{ + auto & blockMap = bufferSize_[ fineSender ]; + auto & sizeMap = blockMap[ dir ]; + auto dirEntry = sizeMap.find( uint_t(0) ); + if( dirEntry == sizeMap.end() ) + sizeMap[ uint_t(0) ] = sizeAfter - sizeBefore; + else + WALBERLA_ASSERT_EQUAL( sizeMap[ uint_t(0) ], (sizeAfter - sizeBefore) ) +} 
+#endif +} + + +} //namespace walberla::gpu \ No newline at end of file diff --git a/src/gpu/communication/MemcpyPackInfo.h b/src/gpu/communication/MemcpyPackInfo.h index c5e58d2a395d71337409f9e8da4b5e039a79dd6c..6c15988f4f2687275fea7f0f8be36b2e7d99fcf6 100644 --- a/src/gpu/communication/MemcpyPackInfo.h +++ b/src/gpu/communication/MemcpyPackInfo.h @@ -11,20 +11,17 @@ #include "gpu/GPUWrapper.h" #include "gpu/communication/GeneratedGPUPackInfo.h" -namespace walberla { -namespace gpu -{ -namespace communication { +namespace walberla::gpu::communication { template<typename GPUFieldType> class MemcpyPackInfo : public ::walberla::gpu::GeneratedGPUPackInfo { public: - MemcpyPackInfo( BlockDataID pdfsID_ ) - : pdfsID(pdfsID_) {}; - virtual ~MemcpyPackInfo() = default; + MemcpyPackInfo( BlockDataID pdfsID_ ) : pdfsID(pdfsID_) {}; + ~MemcpyPackInfo() override = default; void pack (stencil::Direction dir, unsigned char * buffer, IBlock * block, gpuStream_t stream) override; + void communicateLocal ( stencil::Direction dir, const IBlock *sender, IBlock *receiver, gpuStream_t stream ) override; void unpack(stencil::Direction dir, unsigned char * buffer, IBlock * block, gpuStream_t stream) override; uint_t size(stencil::Direction dir, IBlock * block) override; @@ -36,8 +33,6 @@ private: uint_t numberOfGhostLayersToCommunicate( const GPUFieldType * const field ) const; }; -} // namespace communication -} // namespace gpu -} // namespace walberla +} // namespace walberla::gpu::communication #include "MemcpyPackInfo.impl.h" diff --git a/src/gpu/communication/MemcpyPackInfo.impl.h b/src/gpu/communication/MemcpyPackInfo.impl.h index 486871d4e0e7563e8b890b91bfc5aa814775d74a..2110933cda5322828f40cc14b471be5c6a309bfe 100644 --- a/src/gpu/communication/MemcpyPackInfo.impl.h +++ b/src/gpu/communication/MemcpyPackInfo.impl.h @@ -23,7 +23,7 @@ void MemcpyPackInfo< GPUFieldType >::pack(stencil::Direction dir, unsigned char WALBERLA_ASSERT_NOT_NULLPTR( fieldPtr ) // cell_idx_t 
nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( fieldPtr ) ); - CellInterval fieldCi = field::getGhostRegion( *fieldPtr, dir, nrOfGhostLayers, false ); + CellInterval fieldCi = field::getSliceBeforeGhostLayer( *fieldPtr, dir, nrOfGhostLayers, false ); // Base offsets into the buffer and GPUField, respectively auto dstOffset = std::make_tuple( uint_c(0), uint_c(0), uint_c(0), uint_c(0) ); @@ -65,6 +65,65 @@ void MemcpyPackInfo< GPUFieldType >::pack(stencil::Direction dir, unsigned char } } +template<typename GPUFieldType> +void MemcpyPackInfo< GPUFieldType >::communicateLocal( stencil::Direction dir, const IBlock* sender, IBlock* receiver, gpuStream_t stream ) +{ + // WALBERLA_ABORT("The MemcpyPackInfo does not provide a thread-safe local communication. Thus it cannot be used in local mode. To use it, set useLocalCommunication to false in the communication scheme") + + + // Extract field data pointer from the block + const GPUFieldType * senderFieldPtr = sender->getData< GPUFieldType >( pdfsID ); + const GPUFieldType * receiverFieldPtr = receiver->getData< GPUFieldType >( pdfsID ); + WALBERLA_ASSERT_NOT_NULLPTR( senderFieldPtr ) + WALBERLA_ASSERT_NOT_NULLPTR( receiverFieldPtr ) + + // Determine the number of ghost layers to communicate and check that sender and receiver fields are compatible + cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( senderFieldPtr ) ); + WALBERLA_ASSERT_EQUAL(nrOfGhostLayers, cell_idx_c( numberOfGhostLayersToCommunicate( receiverFieldPtr ))) + WALBERLA_ASSERT_EQUAL(senderFieldPtr->layout(), receiverFieldPtr->layout() ) + WALBERLA_ASSERT_EQUAL(senderFieldPtr->fSize(), receiverFieldPtr->fSize() ) + + CellInterval senderCi = field::getSliceBeforeGhostLayer( *senderFieldPtr, dir, nrOfGhostLayers, false ); + CellInterval receiverCi = field::getGhostRegion( *receiverFieldPtr, stencil::inverseDir[dir], nrOfGhostLayers, false ); + + // Base offsets into the buffer and GPUField, respectively + auto srcOffset = std::make_tuple( uint_c(senderCi.xMin() + 
nrOfGhostLayers), + uint_c(senderCi.zMin() + nrOfGhostLayers), + uint_c(0) ); + + auto dstOffset = std::make_tuple( uint_c(receiverCi.xMin() + nrOfGhostLayers), + uint_c(receiverCi.yMin() + nrOfGhostLayers), + uint_c(receiverCi.zMin() + nrOfGhostLayers), + uint_c(0) ); + + + // Size of data to pack, in terms of elements of the field + auto intervalSize = std::make_tuple( senderCi.xSize(), senderCi.ySize(), + senderCi.zSize(), senderFieldPtr->fSize() ); + + WALBERLA_ASSERT_EQUAL(intervalSize, std::make_tuple( receiverCi.xSize(), receiverCi.ySize(), receiverCi.zSize(), receiverFieldPtr->fSize() )) + + if ( senderFieldPtr->layout() == field::fzyx ) + { + const uint_t dstAllocSizeZ = receiverFieldPtr->zAllocSize(); + const uint_t srcAllocSizeZ = senderFieldPtr->zAllocSize(); + + copyDevToDevFZYX( receiverFieldPtr->pitchedPtr(), senderFieldPtr->pitchedPtr(), dstOffset, srcOffset, + dstAllocSizeZ, srcAllocSizeZ, sizeof(typename GPUFieldType::value_type), + intervalSize, stream ); + } + else + { + const uint_t dstAllocSizeZ = receiverFieldPtr->yAllocSize(); + const uint_t srcAllocSizeZ = senderFieldPtr->yAllocSize(); + + copyDevToDevZYXF( receiverFieldPtr->pitchedPtr(), senderFieldPtr->pitchedPtr(), dstOffset, srcOffset, + dstAllocSizeZ, srcAllocSizeZ, sizeof(typename GPUFieldType::value_type), + intervalSize, stream ); + } +} + template<typename GPUFieldType> void MemcpyPackInfo< GPUFieldType >::unpack(stencil::Direction dir, unsigned char * byte_buffer, IBlock * block, gpuStream_t stream) @@ -75,7 +134,6 @@ void MemcpyPackInfo< GPUFieldType >::unpack(stencil::Direction dir, unsigned cha cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( fieldPtr ) ); CellInterval fieldCi = field::getGhostRegion( *fieldPtr, dir, nrOfGhostLayers, false ); - auto dstOffset = std::make_tuple( uint_c(fieldCi.xMin() + nrOfGhostLayers), uint_c(fieldCi.yMin() + nrOfGhostLayers), uint_c(fieldCi.zMin() + nrOfGhostLayers), @@ -208,7 +266,7 @@ uint_t MemcpyPackInfo< 
GPUFieldType >::size(stencil::Direction dir, IBlock * blo return ci.numCells() * elementsPerCell * sizeof(typename GPUFieldType::value_type); */ - uint_t totalCells = ci.xSize() * ci.ySize() * ci.zSize() * pdfs->fSize() * sizeof(typename GPUFieldType::value_type); + uint_t totalCells = ci.numCells() * pdfs->fSize() * sizeof(typename GPUFieldType::value_type); return totalCells; } diff --git a/src/gpu/communication/NonUniformGPUScheme.h b/src/gpu/communication/NonUniformGPUScheme.h new file mode 100644 index 0000000000000000000000000000000000000000..28a8fd0d0b7534e30957e938389f57d28d7ebeef --- /dev/null +++ b/src/gpu/communication/NonUniformGPUScheme.h @@ -0,0 +1,961 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file NonUniformGPUScheme.h +//! \ingroup gpu +//! 
\author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "blockforest/StructuredBlockForest.h" + +#include "core/mpi/BufferSystem.h" +#include "core/mpi/MPIWrapper.h" + +#include "domain_decomposition/IBlock.h" + +#include "stencil/Directions.h" + +#include <thread> + +#include "gpu/ErrorChecking.h" +#include "gpu/GPURAII.h" +#include "gpu/GPUWrapper.h" +#include "gpu/ParallelStreams.h" +#include "gpu/communication/CustomMemoryBuffer.h" +#include "gpu/communication/GeneratedNonUniformGPUPackInfo.h" + +namespace walberla::gpu::communication +{ + +template< typename Stencil > +class NonUniformGPUScheme +{ + public: + enum INDEX { EQUAL_LEVEL = 0, COARSE_TO_FINE = 1, FINE_TO_COARSE = 2 }; + + using CpuBuffer_T = walberla::gpu::communication::PinnedMemoryBuffer; + using GpuBuffer_T = walberla::gpu::communication::GPUMemoryBuffer; + + explicit NonUniformGPUScheme(const weak_ptr< StructuredBlockForest >& bf, bool sendDirectlyFromGPU = false, + const int tag = 5432); + + explicit NonUniformGPUScheme(const weak_ptr< StructuredBlockForest >& bf, const Set< SUID >& requiredBlockSelectors, + const Set< SUID >& incompatibleBlockSelectors, bool sendDirectlyFromGPU = false, + const int tag = 5432); + + ~NonUniformGPUScheme(); + + //** Pack Info Registration ***************************************************************************************** + /*! 
\name Pack Info Registration */ + //@{ + void addPackInfo(const shared_ptr< GeneratedNonUniformGPUPackInfo >& pi); + //@} + //******************************************************************************************************************* + + inline void communicateEqualLevel(const uint_t level); + inline void communicateCoarseToFine(const uint_t fineLevel); + inline void communicateFineToCoarse(const uint_t fineLevel); + + std::function<void()> communicateEqualLevelFunctor(const uint_t level) { + return [level, this](){ NonUniformGPUScheme::communicateEqualLevel(level);}; + } + std::function<void()> communicateCoarseToFineFunctor(const uint_t fineLevel) { + return [fineLevel, this](){ NonUniformGPUScheme::communicateCoarseToFine(fineLevel);}; + } + std::function<void()> communicateFineToCoarseFunctor(const uint_t fineLevel) { + return [fineLevel, this](){ NonUniformGPUScheme::communicateFineToCoarse(fineLevel);}; + } + + inline void startCommunicateEqualLevel(const uint_t level); + inline void startCommunicateCoarseToFine(const uint_t fineLevel); + inline void startCommunicateFineToCoarse(const uint_t fineLevel); + + inline void waitCommunicateEqualLevel(const uint_t level); + inline void waitCommunicateCoarseToFine(const uint_t fineLevel); + inline void waitCommunicateFineToCoarse(const uint_t fineLevel); + + private: + void setupCommunication(); + + void init(); + void refresh(); + + bool isAnyCommunicationInProgress() const; + + void startCommunicationEqualLevel(const uint_t index, std::set< uint_t >& participatingLevels); + void startCommunicationCoarseToFine(const uint_t index, const uint_t coarsestLevel); + void startCommunicationFineToCoarse(const uint_t index, const uint_t finestLevel); + + weak_ptr< StructuredBlockForest > blockForest_; + uint_t forestModificationStamp_; + + std::vector< std::vector< bool > > communicationInProgress_; + bool sendFromGPU_; + int baseTag_; + + std::vector< std::vector< mpi::GenericBufferSystem< CpuBuffer_T, CpuBuffer_T > 
> > bufferSystemCPU_; + std::vector< std::vector< mpi::GenericBufferSystem< GpuBuffer_T, GpuBuffer_T > > > bufferSystemGPU_; + std::vector< std::vector< GpuBuffer_T > > localBuffer_; + + std::vector< shared_ptr< GeneratedNonUniformGPUPackInfo > > packInfos_; + + ParallelStreams parallelSectionManager_; + + struct Header + { + BlockID receiverId; + BlockID senderId; + stencil::Direction dir; + }; + std::vector< std::vector< std::map< mpi::MPIRank, std::vector< Header > > > > headers_; + + Set< SUID > requiredBlockSelectors_; + Set< SUID > incompatibleBlockSelectors_; +}; + +template< typename Stencil > +NonUniformGPUScheme< Stencil >::NonUniformGPUScheme(const weak_ptr< StructuredBlockForest >& bf, bool sendDirectlyFromGPU, + const int tag) + : blockForest_(bf), sendFromGPU_(sendDirectlyFromGPU), baseTag_(tag), parallelSectionManager_(-1), + requiredBlockSelectors_(Set< SUID >::emptySet()), incompatibleBlockSelectors_(Set< SUID >::emptySet()) +{ + init(); +} + +template< typename Stencil > +NonUniformGPUScheme< Stencil >::NonUniformGPUScheme(const weak_ptr< StructuredBlockForest >& bf, + const Set< SUID >& requiredBlockSelectors, + const Set< SUID >& incompatibleBlockSelectors, + bool sendDirectlyFromGPU, const int tag) + : blockForest_(bf), requiredBlockSelectors_(requiredBlockSelectors), + incompatibleBlockSelectors_(incompatibleBlockSelectors), sendFromGPU_(sendDirectlyFromGPU), baseTag_(tag), + parallelSectionManager_(-1) +{ + init(); +} + +template< typename Stencil > +void NonUniformGPUScheme< Stencil >::init() +{ + bufferSystemCPU_.resize(3); + bufferSystemGPU_.resize(3); + localBuffer_.resize(3); + headers_.resize(3); + + communicationInProgress_.resize(3); + + refresh(); +} + +template< typename Stencil > +void NonUniformGPUScheme< Stencil >::refresh() +{ + WALBERLA_ASSERT(!isAnyCommunicationInProgress()) + + auto forest = blockForest_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(forest, + "Trying to access communication for a block storage object that doesn't 
exist anymore") + const uint_t levels = forest->getNumberOfLevels(); + + for (uint_t i = 0; i != 3; ++i) + { + bufferSystemCPU_[i].clear(); + bufferSystemGPU_[i].clear(); + localBuffer_[i].clear(); + headers_[i].clear(); + headers_[i].resize(size_t(levels + uint_t(1))); + + for (uint_t j = 0; j <= levels; ++j) + { + headers_[i][j].clear(); + bufferSystemCPU_[i].emplace_back( + mpi::MPIManager::instance()->comm(), baseTag_ + int_c(i * levels + j)); + bufferSystemGPU_[i].emplace_back( + mpi::MPIManager::instance()->comm(), baseTag_ + int_c(i * levels + j)); + localBuffer_[i].emplace_back(); + } + + communicationInProgress_[i].resize(size_t(levels + uint_t(1)), false); + } + +#ifndef NDEBUG + for (auto p = packInfos_.begin(); p != packInfos_.end(); ++p) + (*p)->clearBufferSizeCheckMap(); +#endif + + forestModificationStamp_ = forest->getBlockForest().getModificationStamp(); +} + +template< typename Stencil > +inline void NonUniformGPUScheme< Stencil >::communicateEqualLevel(const uint_t level) +{ + startCommunicateEqualLevel(level); + waitCommunicateEqualLevel(level); +} + +template< typename Stencil > +inline void NonUniformGPUScheme< Stencil >::communicateCoarseToFine(const uint_t fineLevel) +{ + startCommunicateCoarseToFine(fineLevel); + waitCommunicateCoarseToFine(fineLevel); +} + +template< typename Stencil > +inline void NonUniformGPUScheme< Stencil >::communicateFineToCoarse(const uint_t fineLevel) +{ + startCommunicateFineToCoarse(fineLevel); + waitCommunicateFineToCoarse(fineLevel); +} + +template< typename Stencil > +inline void NonUniformGPUScheme< Stencil >::startCommunicateEqualLevel(const uint_t level) +{ + auto forest = blockForest_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(forest, + "Trying to access communication for a block storage object that doesn't exist anymore") + WALBERLA_ASSERT_LESS(level, forest->getNumberOfLevels()) + + if (forestModificationStamp_ != forest->getBlockForest().getModificationStamp()) refresh(); + + std::set< uint_t > 
participatingLevels; + participatingLevels.insert(level); + + startCommunicationEqualLevel(level, participatingLevels); +} + +template< typename Stencil > +inline void NonUniformGPUScheme< Stencil >::startCommunicateCoarseToFine(const uint_t fineLevel) +{ + auto forest = blockForest_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(forest, + "Trying to access communication for a block storage object that doesn't exist anymore") + WALBERLA_ASSERT_GREATER(fineLevel, uint_t(0)) + WALBERLA_ASSERT_LESS(fineLevel, forest->getNumberOfLevels()) + + if (forestModificationStamp_ != forest->getBlockForest().getModificationStamp()) refresh(); + + const uint_t coarsestLevel = fineLevel - uint_t(1); + + startCommunicationCoarseToFine(fineLevel, coarsestLevel); +} + +template< typename Stencil > +inline void NonUniformGPUScheme< Stencil >::startCommunicateFineToCoarse(const uint_t fineLevel) +{ + auto forest = blockForest_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(forest, + "Trying to access communication for a block storage object that doesn't exist anymore") + WALBERLA_ASSERT_GREATER(fineLevel, uint_t(0)) + WALBERLA_ASSERT_LESS(fineLevel, forest->getNumberOfLevels()) + + if (forestModificationStamp_ != forest->getBlockForest().getModificationStamp()) refresh(); + + const uint_t finestLevel = fineLevel; + + startCommunicationFineToCoarse(fineLevel, finestLevel); +} + +template< typename Stencil > +void NonUniformGPUScheme< Stencil >::startCommunicationEqualLevel(const uint_t index, + std::set< uint_t >& participatingLevels) +{ + if (packInfos_.empty()) return; + + WALBERLA_ASSERT(!communicationInProgress_[EQUAL_LEVEL][index]) + communicationInProgress_[EQUAL_LEVEL][index] = true; + + auto forest = blockForest_.lock(); + + // Schedule Receives + if (sendFromGPU_) + bufferSystemGPU_[EQUAL_LEVEL][index].scheduleReceives(); + else + bufferSystemCPU_[EQUAL_LEVEL][index].scheduleReceives(); + + if (!sendFromGPU_) + for (auto it : headers_[EQUAL_LEVEL][index]) + 
bufferSystemGPU_[EQUAL_LEVEL][index].sendBuffer(it.first).clear(); + + // Start filling send buffers + { + for (auto& iBlock : *forest) + { + auto senderBlock = dynamic_cast< Block* >(&iBlock); + + if (!selectable::isSetSelected(senderBlock->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_)) + continue; + + if (participatingLevels.find(senderBlock->getLevel()) == participatingLevels.end()) + continue; + + for (auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir) + { + const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir); + + if (!(senderBlock->neighborhoodSectionHasEquallySizedBlock(neighborIdx))) + continue; + WALBERLA_ASSERT_EQUAL(senderBlock->getNeighborhoodSectionSize(neighborIdx), uint_t(1)) + if (!selectable::isSetSelected(senderBlock->getNeighborState(neighborIdx, uint_t(0)),requiredBlockSelectors_, incompatibleBlockSelectors_)) + continue; + + if( senderBlock->neighborExistsLocally( neighborIdx, uint_t(0) ) ) + { + auto receiverBlock = dynamic_cast< Block * >( forest->getBlock( senderBlock->getNeighborId( neighborIdx, uint_t(0) )) ); + for (auto& pi : packInfos_) + { + pi->communicateLocalEqualLevel(senderBlock, receiverBlock, *dir, nullptr); + } + } + else + { + auto nProcess = mpi::MPIRank(senderBlock->getNeighborProcess(neighborIdx, uint_t(0))); + GpuBuffer_T& gpuDataBuffer = bufferSystemGPU_[EQUAL_LEVEL][index].sendBuffer(nProcess); + + for (auto& pi : packInfos_) + { + WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) + WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeEqualLevelSend(senderBlock, *dir)) + + pi->packDataEqualLevel(senderBlock, *dir, gpuDataBuffer); + + if (!sendFromGPU_) + { + auto gpuDataPtr = gpuDataBuffer.cur(); + auto size = pi->sizeEqualLevelSend(senderBlock, *dir); + auto cpuDataPtr = bufferSystemCPU_[EQUAL_LEVEL][index].sendBuffer(nProcess).advanceNoResize(size); + WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) + 
WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost)) + } + } + } + } + } + } + + // wait for packing to finish + WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) + + + if (sendFromGPU_) + bufferSystemGPU_[EQUAL_LEVEL][index].sendAll(); + else + bufferSystemCPU_[EQUAL_LEVEL][index].sendAll(); + + communicationInProgress_[EQUAL_LEVEL][index] = true; +} + +template< typename Stencil > +void NonUniformGPUScheme< Stencil >::startCommunicationCoarseToFine(const uint_t index, const uint_t coarsestLevel) +{ + if (packInfos_.empty()) return; + WALBERLA_ASSERT(!communicationInProgress_[COARSE_TO_FINE][index]) + communicationInProgress_[COARSE_TO_FINE][index] = true; + + auto forest = blockForest_.lock(); + + // Schedule Receives + if (sendFromGPU_) + bufferSystemGPU_[COARSE_TO_FINE][index].scheduleReceives(); + else + bufferSystemCPU_[COARSE_TO_FINE][index].scheduleReceives(); + + if (!sendFromGPU_) + for (auto it : headers_[COARSE_TO_FINE][index]) + bufferSystemGPU_[COARSE_TO_FINE][index].sendBuffer(it.first).clear(); + + // Start filling send buffers + { + for (auto& iBlock : *forest) + { + auto coarseBlock = dynamic_cast< Block* >(&iBlock); + auto nLevel = coarseBlock->getLevel(); + + if (!selectable::isSetSelected(coarseBlock->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_)) + continue; + + if (nLevel != coarsestLevel) continue; + + for (auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir) + { + const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir); + + if (coarseBlock->getNeighborhoodSectionSize(neighborIdx) == uint_t(0)) continue; + if (!(coarseBlock->neighborhoodSectionHasSmallerBlocks(neighborIdx))) continue; + + for (uint_t n = 0; n != coarseBlock->getNeighborhoodSectionSize(neighborIdx); ++n) + { + const BlockID& fineReceiverId = coarseBlock->getNeighborId(neighborIdx, n); + if (!selectable::isSetSelected(coarseBlock->getNeighborState(neighborIdx, n), requiredBlockSelectors_, + 
incompatibleBlockSelectors_)) + continue; + + if( coarseBlock->neighborExistsLocally( neighborIdx, n ) ) + { + auto fineReceiverBlock = dynamic_cast< Block * >( forest->getBlock( fineReceiverId ) ); + // for (auto& pi : packInfos_) + // { + // pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir); + // } + + GpuBuffer_T& gpuDataBuffer = localBuffer_[COARSE_TO_FINE][index]; + WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) + + for (auto& pi : packInfos_) + { + WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir)) + pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir, gpuDataBuffer, nullptr); + } + } + else + { + auto nProcess = mpi::MPIRank(coarseBlock->getNeighborProcess(neighborIdx, n)); + GpuBuffer_T& gpuDataBuffer = bufferSystemGPU_[COARSE_TO_FINE][index].sendBuffer(nProcess); + for (auto& pi : packInfos_) + { + WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) + WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir)) + + pi->packDataCoarseToFine(coarseBlock, fineReceiverId, *dir, gpuDataBuffer); + + if (!sendFromGPU_) + { + auto gpuDataPtr = gpuDataBuffer.cur(); + auto size = pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir); + auto cpuDataPtr = + bufferSystemCPU_[COARSE_TO_FINE][index].sendBuffer(nProcess).advanceNoResize(size); + WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) + WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost)) + } + } + } + } + } + localBuffer_[COARSE_TO_FINE][index].clear(); + } + } + + // wait for packing to finish + WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) + + if (sendFromGPU_) + bufferSystemGPU_[COARSE_TO_FINE][index].sendAll(); + else + bufferSystemCPU_[COARSE_TO_FINE][index].sendAll(); + + communicationInProgress_[COARSE_TO_FINE][index] = true; +} + +template< typename Stencil > +void 
NonUniformGPUScheme< Stencil >::startCommunicationFineToCoarse(const uint_t index, const uint_t finestLevel) +{ + if (packInfos_.empty()) return; + + WALBERLA_ASSERT(!communicationInProgress_[FINE_TO_COARSE][index]) + + communicationInProgress_[FINE_TO_COARSE][index] = true; + + auto forest = blockForest_.lock(); + + // Schedule Receives + if (sendFromGPU_) + bufferSystemGPU_[FINE_TO_COARSE][index].scheduleReceives(); + else + bufferSystemCPU_[FINE_TO_COARSE][index].scheduleReceives(); + + if (!sendFromGPU_) + for (auto it : headers_[FINE_TO_COARSE][index]) + bufferSystemGPU_[FINE_TO_COARSE][index].sendBuffer(it.first).clear(); + + // Start filling send buffers + { + for (auto& iBlock : *forest) + { + auto fineBlock = dynamic_cast< Block* >(&iBlock); + auto nLevel = fineBlock->getLevel(); + + if (!selectable::isSetSelected(fineBlock->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_)) + continue; + + if (nLevel != finestLevel) continue; + + for (auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir) + { + const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir); + + if (fineBlock->getNeighborhoodSectionSize(neighborIdx) == uint_t(0)) continue; + if (!(fineBlock->neighborhoodSectionHasLargerBlock(neighborIdx))) continue; + WALBERLA_ASSERT_EQUAL(fineBlock->getNeighborhoodSectionSize(neighborIdx), uint_t(1)) + + const BlockID& coarseReceiverId = fineBlock->getNeighborId(neighborIdx, uint_t(0)); + if (!selectable::isSetSelected(fineBlock->getNeighborState(neighborIdx, uint_t(0)), requiredBlockSelectors_, + incompatibleBlockSelectors_)) + continue; + if( fineBlock->neighborExistsLocally( neighborIdx, uint_t(0) ) ) + { + auto coarseReceiverBlock = dynamic_cast< Block * >( forest->getBlock( coarseReceiverId ) ); + // for (auto& pi : packInfos_) + // { + // pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir); + // } + + GpuBuffer_T& gpuDataBuffer = localBuffer_[FINE_TO_COARSE][index]; + 
WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) + for (auto& pi : packInfos_) + { + WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeFineToCoarseSend(fineBlock, *dir)) + pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir, gpuDataBuffer, nullptr); + } + } + else + { + auto nProcess = mpi::MPIRank(fineBlock->getNeighborProcess(neighborIdx, uint_t(0))); + GpuBuffer_T& gpuDataBuffer = bufferSystemGPU_[FINE_TO_COARSE][index].sendBuffer(nProcess); + + for (auto& pi : packInfos_) + { + WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) + WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeFineToCoarseSend(fineBlock, *dir)) + + pi->packDataFineToCoarse(fineBlock, coarseReceiverId, *dir, gpuDataBuffer); + + if (!sendFromGPU_) + { + auto gpuDataPtr = gpuDataBuffer.cur(); + auto size = pi->sizeFineToCoarseSend(fineBlock, *dir); + auto cpuDataPtr = bufferSystemCPU_[FINE_TO_COARSE][index].sendBuffer(nProcess).advanceNoResize(size); + WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) + WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost)) + } + } + } + } + localBuffer_[FINE_TO_COARSE][index].clear(); + } + } + + // wait for packing to finish + WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) + + if (sendFromGPU_) + bufferSystemGPU_[FINE_TO_COARSE][index].sendAll(); + else + bufferSystemCPU_[FINE_TO_COARSE][index].sendAll(); + + communicationInProgress_[FINE_TO_COARSE][index] = true; +} + +template< typename Stencil > +void NonUniformGPUScheme< Stencil >::waitCommunicateEqualLevel(const uint_t level) +{ + if (!communicationInProgress_[EQUAL_LEVEL][level] || packInfos_.empty()) return; + + auto forest = blockForest_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(forest, + "Trying to access communication for a block storage object that doesn't exist anymore") + WALBERLA_ASSERT_LESS(level, forest->getNumberOfLevels()) + + if (sendFromGPU_) + { + // auto parallelSection = 
parallelSectionManager_.parallelSection( nullptr ); + for (auto recvInfo = bufferSystemGPU_[EQUAL_LEVEL][level].begin(); + recvInfo != bufferSystemGPU_[EQUAL_LEVEL][level].end(); ++recvInfo) + { + recvInfo.buffer().clear(); + for (auto& header : headers_[EQUAL_LEVEL][level][recvInfo.rank()]) + { + auto block = dynamic_cast< Block* >(forest->getBlock(header.receiverId)); + + for (auto& pi : packInfos_) + { + GpuBuffer_T& gpuDataBuffer = recvInfo.buffer(); + WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) + // parallelSection.run([&](auto s) { + pi->unpackDataEqualLevel(block, stencil::inverseDir[header.dir], gpuDataBuffer); + // }); + } + } + } + } + else + { + for (auto recvInfo = bufferSystemCPU_[EQUAL_LEVEL][level].begin(); + recvInfo != bufferSystemCPU_[EQUAL_LEVEL][level].end(); ++recvInfo) + { + auto& gpuBuffer = bufferSystemGPU_[EQUAL_LEVEL][level].sendBuffer(recvInfo.rank()); + + recvInfo.buffer().clear(); + gpuBuffer.clear(); + for (auto& header : headers_[EQUAL_LEVEL][level][recvInfo.rank()]) + { + auto block = dynamic_cast< Block* >(forest->getBlock(header.receiverId)); + auto senderBlock = dynamic_cast< Block* >(forest->getBlock(header.senderId)); + + for (auto& pi : packInfos_) + { + auto size = pi->sizeEqualLevelSend(senderBlock, header.dir); + auto cpuDataPtr = recvInfo.buffer().advanceNoResize(size); + auto gpuDataPtr = gpuBuffer.cur(); // advanceNoResize( size ); + WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) + WALBERLA_ASSERT_NOT_NULLPTR(gpuDataPtr) + + WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, nullptr)) + pi->unpackDataEqualLevel(block, stencil::inverseDir[header.dir], gpuBuffer); + } + } + } + WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) + } + communicationInProgress_[EQUAL_LEVEL][level] = false; +} + +template< typename Stencil > +void NonUniformGPUScheme< Stencil >::waitCommunicateCoarseToFine(const uint_t fineLevel) +{ + if (!communicationInProgress_[COARSE_TO_FINE][fineLevel] || packInfos_.empty()) 
return; + + WALBERLA_ASSERT_GREATER(fineLevel, uint_t(0)) + + auto forest = blockForest_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(forest, + "Trying to access communication for a block storage object that doesn't exist anymore") + WALBERLA_ASSERT_LESS(fineLevel, forest->getNumberOfLevels()) + + if (sendFromGPU_) + { + // auto parallelSection = parallelSectionManager_.parallelSection( nullptr ); + for (auto recvInfo = bufferSystemGPU_[COARSE_TO_FINE][fineLevel].begin(); + recvInfo != bufferSystemGPU_[COARSE_TO_FINE][fineLevel].end(); ++recvInfo) + { + recvInfo.buffer().clear(); + for (auto& header : headers_[COARSE_TO_FINE][fineLevel][recvInfo.rank()]) + { + auto block = dynamic_cast< Block* >(forest->getBlock(header.receiverId)); + auto senderBlock = dynamic_cast< Block* >(forest->getBlock(header.senderId)); + + for (auto& pi : packInfos_) + { + // auto size = pi->sizeCoarseToFineSend( senderBlock, block->getId(), header.dir ); + GpuBuffer_T& gpuDataBuffer = recvInfo.buffer(); + WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) + // parallelSection.run([&](auto s) { + pi->unpackDataCoarseToFine(block, senderBlock->getId(), stencil::inverseDir[header.dir], gpuDataBuffer); + // }); + } + } + } + } + else + { + auto parallelSection = parallelSectionManager_.parallelSection(nullptr); + for (auto recvInfo = bufferSystemCPU_[COARSE_TO_FINE][fineLevel].begin(); + recvInfo != bufferSystemCPU_[COARSE_TO_FINE][fineLevel].end(); ++recvInfo) + { + auto& gpuBuffer = bufferSystemGPU_[COARSE_TO_FINE][fineLevel].sendBuffer(recvInfo.rank()); + + recvInfo.buffer().clear(); + gpuBuffer.clear(); + for (auto& header : headers_[COARSE_TO_FINE][fineLevel][recvInfo.rank()]) + { + auto block = dynamic_cast< Block* >(forest->getBlock(header.receiverId)); + auto senderBlock = dynamic_cast< Block* >(forest->getBlock(header.senderId)); + + for (auto& pi : packInfos_) + { + auto size = pi->sizeCoarseToFineSend(senderBlock, block->getId(), header.dir); + auto cpuDataPtr = 
recvInfo.buffer().advanceNoResize(size); + auto gpuDataPtr = gpuBuffer.cur(); // advanceNoResize( size ); + WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) + WALBERLA_ASSERT_NOT_NULLPTR(gpuDataPtr) + + parallelSection.run([&](auto s) { + WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, s)) + pi->unpackDataCoarseToFine(block, senderBlock->getId(), stencil::inverseDir[header.dir], gpuBuffer); + }); + } + } + } + } + communicationInProgress_[COARSE_TO_FINE][fineLevel] = false; +} + +template< typename Stencil > +void NonUniformGPUScheme< Stencil >::waitCommunicateFineToCoarse(const uint_t fineLevel) +{ + if (!communicationInProgress_[FINE_TO_COARSE][fineLevel] || packInfos_.empty()) return; + + WALBERLA_ASSERT_GREATER(fineLevel, uint_t(0)) + + auto forest = blockForest_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(forest, + "Trying to access communication for a block storage object that doesn't exist anymore") + WALBERLA_ASSERT_LESS(fineLevel, forest->getNumberOfLevels()) + // WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ); + + if (sendFromGPU_) + { + // auto parallelSection = parallelSectionManager_.parallelSection( nullptr ); + for (auto recvInfo = bufferSystemGPU_[FINE_TO_COARSE][fineLevel].begin(); + recvInfo != bufferSystemGPU_[FINE_TO_COARSE][fineLevel].end(); ++recvInfo) + { + recvInfo.buffer().clear(); + for (auto& header : headers_[FINE_TO_COARSE][fineLevel][recvInfo.rank()]) + { + auto block = dynamic_cast< Block* >(forest->getBlock(header.receiverId)); + auto senderBlock = dynamic_cast< Block* >(forest->getBlock(header.senderId)); + + for (auto& pi : packInfos_) + { + GpuBuffer_T& gpuDataBuffer = recvInfo.buffer(); + WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) + // parallelSection.run([&](auto s) { + pi->unpackDataFineToCoarse(block, senderBlock->getId(), stencil::inverseDir[header.dir], gpuDataBuffer); + // }); + } + } + } + } + else + { + auto parallelSection = 
parallelSectionManager_.parallelSection(nullptr); + for (auto recvInfo = bufferSystemCPU_[FINE_TO_COARSE][fineLevel].begin(); + recvInfo != bufferSystemCPU_[FINE_TO_COARSE][fineLevel].end(); ++recvInfo) + { + auto& gpuBuffer = bufferSystemGPU_[FINE_TO_COARSE][fineLevel].sendBuffer(recvInfo.rank()); + + recvInfo.buffer().clear(); + gpuBuffer.clear(); + for (auto& header : headers_[FINE_TO_COARSE][fineLevel][recvInfo.rank()]) + { + auto block = dynamic_cast< Block* >(forest->getBlock(header.receiverId)); + auto senderBlock = dynamic_cast< Block* >(forest->getBlock(header.senderId)); + + for (auto& pi : packInfos_) + { + auto size = pi->sizeFineToCoarseSend(senderBlock, header.dir); + auto cpuDataPtr = recvInfo.buffer().advanceNoResize(size); + auto gpuDataPtr = gpuBuffer.cur(); // advanceNoResize( size ); + WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) + WALBERLA_ASSERT_NOT_NULLPTR(gpuDataPtr) + + parallelSection.run([&](auto s) { + WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, s)) + pi->unpackDataFineToCoarse(block, senderBlock->getId(), stencil::inverseDir[header.dir], gpuBuffer); + }); + } + } + } + } + communicationInProgress_[FINE_TO_COARSE][fineLevel] = false; +} + +template< typename Stencil > +void NonUniformGPUScheme< Stencil >::setupCommunication() +{ + WALBERLA_ASSERT_GREATER(packInfos_.size(), uint_c(0), + "You have not registered a packInfo yet, thus setupCommunication does not work yet.") + auto forest = blockForest_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(forest, + "Trying to access communication for a block storage object that doesn't exist anymore") + const uint_t levels = forest->getNumberOfLevels(); + + std::vector< std::vector< std::map< mpi::MPIRank, mpi::MPISize > > > + receiverInfo; // how many bytes to send to each neighbor + std::vector< std::vector< mpi::BufferSystem > > headerExchangeBs; + + receiverInfo.resize(3); + receiverInfo[EQUAL_LEVEL].resize(levels + uint_c(1)); + 
receiverInfo[COARSE_TO_FINE].resize(levels + uint_c(1)); + receiverInfo[FINE_TO_COARSE].resize(levels + uint_c(1)); + + std::vector< std::vector< mpi::MPISize > > localBufferSize; + + headerExchangeBs.resize(3); + localBufferSize.resize(3); + + for (uint_t j = 0; j <= levels; ++j) + { + headerExchangeBs[EQUAL_LEVEL].push_back(mpi::BufferSystem(mpi::MPIManager::instance()->comm(), 123)); + headerExchangeBs[COARSE_TO_FINE].push_back(mpi::BufferSystem(mpi::MPIManager::instance()->comm(), 123)); + headerExchangeBs[FINE_TO_COARSE].push_back(mpi::BufferSystem(mpi::MPIManager::instance()->comm(), 123)); + + localBufferSize[EQUAL_LEVEL].push_back(mpi::MPISize(0)); + localBufferSize[COARSE_TO_FINE].push_back(mpi::MPISize(0)); + localBufferSize[FINE_TO_COARSE].push_back(mpi::MPISize(0)); + } + + for (auto& iBlock : *forest) + { + auto block = dynamic_cast< Block* >(&iBlock); + if (!selectable::isSetSelected(block->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_)) continue; + + const BlockID& senderId = block->getId(); + auto nLevel = block->getLevel(); + + for (auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir) + { + // skip if block has no neighbors in this direction + const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir); + if (block->getNeighborhoodSectionSize(neighborIdx) == uint_t(0)) continue; + + if (block->neighborhoodSectionHasEquallySizedBlock(neighborIdx)) + { + WALBERLA_ASSERT_EQUAL(block->getNeighborhoodSectionSize(neighborIdx), uint_t(1)) + if (!selectable::isSetSelected(block->getNeighborState(neighborIdx, uint_t(0)), requiredBlockSelectors_, + incompatibleBlockSelectors_)) + continue; + if( block->neighborExistsLocally( neighborIdx, uint_t(0) ) ) + continue; + + const BlockID& receiverId = block->getNeighborId(neighborIdx, uint_t(0)); + auto nProcess = mpi::MPIRank(block->getNeighborProcess(neighborIdx, uint_t(0))); + + for (auto& pi : packInfos_) + { + receiverInfo[EQUAL_LEVEL][nLevel][nProcess] += 
mpi::MPISize(pi->sizeEqualLevelSend(block, *dir)); + } + + auto& headerBuffer = headerExchangeBs[EQUAL_LEVEL][nLevel].sendBuffer(nProcess); + receiverId.toBuffer(headerBuffer); + senderId.toBuffer(headerBuffer); + headerBuffer << *dir; + } + else if (block->neighborhoodSectionHasSmallerBlocks(neighborIdx)) + { + auto fineLevel = nLevel + uint_c(1); // For indexing always the fineLevel is taken to be consistent. + WALBERLA_ASSERT_LESS(fineLevel, levels) + + for (uint_t n = 0; n != block->getNeighborhoodSectionSize(neighborIdx); ++n) + { + const BlockID& receiverId = block->getNeighborId(neighborIdx, n); + if (!selectable::isSetSelected(block->getNeighborState(neighborIdx, n), requiredBlockSelectors_, + incompatibleBlockSelectors_)) + continue; + if( block->neighborExistsLocally( neighborIdx, n ) ) + { + for (auto& pi : packInfos_) + localBufferSize[COARSE_TO_FINE][fineLevel] += mpi::MPISize(pi->sizeCoarseToFineSend(block, receiverId, *dir)); + continue; + } + + auto nProcess = mpi::MPIRank(block->getNeighborProcess(neighborIdx, n)); + for (auto& pi : packInfos_) + receiverInfo[COARSE_TO_FINE][fineLevel][nProcess] += + mpi::MPISize(pi->sizeCoarseToFineSend(block, receiverId, *dir)); + auto& headerBuffer = headerExchangeBs[COARSE_TO_FINE][fineLevel].sendBuffer(nProcess); + receiverId.toBuffer(headerBuffer); + senderId.toBuffer(headerBuffer); + headerBuffer << *dir; + } + } + else if (block->neighborhoodSectionHasLargerBlock(neighborIdx)) + { + WALBERLA_ASSERT_EQUAL(block->getNeighborhoodSectionSize(neighborIdx), uint_t(1)) + + const BlockID& receiverId = block->getNeighborId(neighborIdx, uint_t(0)); + if (!selectable::isSetSelected(block->getNeighborState(neighborIdx, uint_t(0)), requiredBlockSelectors_, + incompatibleBlockSelectors_)) + continue; + + if( block->neighborExistsLocally( neighborIdx, uint_t(0) ) ) + { + for (auto& pi : packInfos_) + localBufferSize[FINE_TO_COARSE][nLevel] += mpi::MPISize(pi->sizeFineToCoarseSend(block, *dir)); + continue; + } + + auto 
nProcess = mpi::MPIRank(block->getNeighborProcess(neighborIdx, uint_t(0))); + for (auto& pi : packInfos_) + receiverInfo[FINE_TO_COARSE][nLevel][nProcess] += mpi::MPISize(pi->sizeFineToCoarseSend(block, *dir)); + + auto& headerBuffer = headerExchangeBs[FINE_TO_COARSE][nLevel].sendBuffer(nProcess); + receiverId.toBuffer(headerBuffer); + senderId.toBuffer(headerBuffer); + headerBuffer << *dir; + } + } + } + + for (uint_t i = 0; i != 3; ++i) + { + for (uint_t j = 0; j <= levels; ++j) + { + headerExchangeBs[i][j].setReceiverInfoFromSendBufferState(false, true); + headerExchangeBs[i][j].sendAll(); + for (auto recvIter = headerExchangeBs[i][j].begin(); recvIter != headerExchangeBs[i][j].end(); ++recvIter) + { + auto& headerVector = headers_[i][j][recvIter.rank()]; + auto& buffer = recvIter.buffer(); + while (buffer.size()) + { + Header header; + header.receiverId.fromBuffer(buffer); + header.senderId.fromBuffer(buffer); + buffer >> header.dir; + headerVector.push_back(header); + } + } + + bufferSystemCPU_[i][j].setReceiverInfo(receiverInfo[i][j]); + bufferSystemGPU_[i][j].setReceiverInfo(receiverInfo[i][j]); + + for (auto it : receiverInfo[i][j]) + { + bufferSystemCPU_[i][j].sendBuffer(it.first).resize(size_t(it.second)); + bufferSystemGPU_[i][j].sendBuffer(it.first).resize(size_t(it.second)); + } + if (localBufferSize[i][j] > 0) + localBuffer_[i][j].resize(size_t(localBufferSize[i][j])); + } + } + + forestModificationStamp_ = forest->getBlockForest().getModificationStamp(); +} + +template< typename Stencil > +bool NonUniformGPUScheme< Stencil >::isAnyCommunicationInProgress() const +{ + for (auto caseIt = communicationInProgress_.begin(); caseIt != communicationInProgress_.end(); ++caseIt) + for (auto levelIt = caseIt->begin(); levelIt != caseIt->end(); ++levelIt) + if (*levelIt) return true; + + return false; +} + +template< typename Stencil > +NonUniformGPUScheme< Stencil >::~NonUniformGPUScheme() +{ + for (uint_t i = 0; i != bufferSystemGPU_[EQUAL_LEVEL].size(); ++i) 
+ { + waitCommunicateEqualLevel(i); + waitCommunicateCoarseToFine(i); + waitCommunicateFineToCoarse(i); + } +} + +template< typename Stencil > +void NonUniformGPUScheme< Stencil >::addPackInfo(const shared_ptr< GeneratedNonUniformGPUPackInfo >& pi) +{ + if (isAnyCommunicationInProgress()) + { + WALBERLA_ABORT("You may not add a PackInfo to a NonUniformBufferedScheme if any communication is in progress!") + } + packInfos_.push_back(pi); + setupCommunication(); +} + +} // namespace walberla::gpu::communication diff --git a/src/gpu/communication/UniformGPUScheme.h b/src/gpu/communication/UniformGPUScheme.h index e53e6772b4ccd5a3e04da5a632cbafd8ceb206d3..5c9604ccd8cc00e5cdb2d9f9c1085ace2f2e44a5 100644 --- a/src/gpu/communication/UniformGPUScheme.h +++ b/src/gpu/communication/UniformGPUScheme.h @@ -32,7 +32,7 @@ #include <thread> -#include "gpu/CudaRAII.h" +#include "gpu/GPURAII.h" #include "gpu/GPUWrapper.h" #include "gpu/ParallelStreams.h" #include "gpu/communication/CustomMemoryBuffer.h" @@ -51,12 +51,14 @@ namespace communication { public: explicit UniformGPUScheme( weak_ptr<StructuredBlockForest> bf, bool sendDirectlyFromGPU = false, + bool useLocalCommunication = true, const int tag = 5432 ); explicit UniformGPUScheme( weak_ptr<StructuredBlockForest> bf, const Set<SUID> & requiredBlockSelectors, const Set<SUID> & incompatibleBlockSelectors, bool sendDirectlyFromGPU = false, + bool useLocalCommunication = true, const int tag = 5432 ); void addPackInfo( const shared_ptr<GeneratedGPUPackInfo> &pi ); @@ -71,7 +73,6 @@ namespace communication { std::function<void()> getStartCommunicateFunctor( gpuStream_t stream = nullptr ); std::function<void()> getWaitFunctor( gpuStream_t stream = nullptr ); - private: void setupCommunication(); @@ -81,6 +82,7 @@ namespace communication { bool setupBeforeNextCommunication_; bool communicationInProgress_; bool sendFromGPU_; + bool useLocalCommunication_; using CpuBuffer_T = gpu::communication::PinnedMemoryBuffer; using GpuBuffer_T = 
gpu::communication::GPUMemoryBuffer; diff --git a/src/gpu/communication/UniformGPUScheme.impl.h b/src/gpu/communication/UniformGPUScheme.impl.h index c8e81cb23e14e15e7fd79b5ec7fb052137b600a9..a12017cf77eca51af31bd967df2af914ae7f28a1 100644 --- a/src/gpu/communication/UniformGPUScheme.impl.h +++ b/src/gpu/communication/UniformGPUScheme.impl.h @@ -30,11 +30,13 @@ namespace communication { template<typename Stencil> UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf, bool sendDirectlyFromGPU, + bool useLocalCommunication, const int tag ) : blockForest_( bf ), setupBeforeNextCommunication_( true ), communicationInProgress_( false ), sendFromGPU_( sendDirectlyFromGPU ), + useLocalCommunication_(useLocalCommunication), bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ), bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ), parallelSectionManager_( -1 ), @@ -47,11 +49,13 @@ namespace communication { const Set<SUID> & requiredBlockSelectors, const Set<SUID> & incompatibleBlockSelectors, bool sendDirectlyFromGPU, + bool useLocalCommunication, const int tag ) : blockForest_( bf ), setupBeforeNextCommunication_( true ), communicationInProgress_( false ), sendFromGPU_( sendDirectlyFromGPU ), + useLocalCommunication_(useLocalCommunication), bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ), bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ), parallelSectionManager_( -1 ), @@ -86,28 +90,40 @@ namespace communication { auto parallelSection = parallelSectionManager_.parallelSection( stream ); for( auto &iBlock : *forest ) { - auto block = dynamic_cast< Block * >( &iBlock ); + auto senderBlock = dynamic_cast< Block * >( &iBlock ); - if( !selectable::isSetSelected( block->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_ ) ) + if( !selectable::isSetSelected( senderBlock->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_ ) ) continue; for( auto dir = Stencil::beginNoCenter(); dir != 
Stencil::end(); ++dir ) { const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex( *dir ); - if( block->getNeighborhoodSectionSize( neighborIdx ) == uint_t( 0 )) + + if( senderBlock->getNeighborhoodSectionSize( neighborIdx ) == uint_t( 0 )) continue; - auto nProcess = mpi::MPIRank( block->getNeighborProcess( neighborIdx, uint_t( 0 ))); - if( !selectable::isSetSelected( block->getNeighborState( neighborIdx, uint_t(0) ), requiredBlockSelectors_, incompatibleBlockSelectors_ ) ) + if( !selectable::isSetSelected( senderBlock->getNeighborState( neighborIdx, uint_t(0) ), requiredBlockSelectors_, incompatibleBlockSelectors_ ) ) continue; - for( auto &pi : packInfos_ ) + if( senderBlock->neighborExistsLocally( neighborIdx, uint_t(0) ) && useLocalCommunication_ ) { - parallelSection.run([&](auto s) { - auto size = pi->size( *dir, block ); + auto receiverBlock = dynamic_cast< Block * >( forest->getBlock( senderBlock->getNeighborId( neighborIdx, uint_t(0) )) ); + for (auto& pi : packInfos_) + { + pi->communicateLocal(*dir, senderBlock, receiverBlock, stream); + } + } + else + { + auto nProcess = mpi::MPIRank( senderBlock->getNeighborProcess( neighborIdx, uint_t( 0 ))); + + for( auto &pi : packInfos_ ) + { + parallelSection.run([&](auto s) { + auto size = pi->size( *dir, senderBlock ); auto gpuDataPtr = bufferSystemGPU_.sendBuffer( nProcess ).advanceNoResize( size ); WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr ) - pi->pack( *dir, gpuDataPtr, block, s ); + pi->pack( *dir, gpuDataPtr, senderBlock, s ); if( !sendFromGPU_ ) { @@ -115,12 +131,12 @@ namespace communication { WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr ) WALBERLA_GPU_CHECK( gpuMemcpyAsync( cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, s )) } - }); + }); + } } } } } - // wait for packing to finish WALBERLA_GPU_CHECK( gpuStreamSynchronize( stream ) ); @@ -181,7 +197,6 @@ namespace communication { auto gpuDataPtr = gpuBuffer.advanceNoResize( size ); WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr ) 
WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr ) - parallelSection.run([&](auto s) { WALBERLA_GPU_CHECK( gpuMemcpyAsync( gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, s )) @@ -192,6 +207,7 @@ } } + WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) communicationInProgress_ = false; } @@ -216,6 +232,7 @@ for( auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir ) { // skip if block has no neighbors in this direction const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex( *dir ); + if( block->getNeighborhoodSectionSize( neighborIdx ) == uint_t( 0 )) continue; @@ -229,6 +246,9 @@ if( !selectable::isSetSelected( block->getNeighborState( neighborIdx, uint_t(0) ), requiredBlockSelectors_, incompatibleBlockSelectors_ ) ) continue; + if( block->neighborExistsLocally( neighborIdx, uint_t(0) ) && useLocalCommunication_ ) + continue; + auto nProcess = mpi::MPIRank( block->getNeighborProcess( neighborIdx, uint_t( 0 ))); for( auto &pi : packInfos_ ) @@ -287,7 +307,7 @@ } template< typename Stencil > - std::function<void()> UniformGPUScheme<Stencil>::getWaitFunctor(gpuStream_t stream) + std::function<void()> UniformGPUScheme<Stencil>::getWaitFunctor(gpuStream_t stream) { return [this, stream]() { wait( stream ); }; } diff --git a/src/lbm_generated/CMakeLists.txt b/src/lbm_generated/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..2513a58f2e646025fa86107409058a7576a3f62f --- /dev/null +++ b/src/lbm_generated/CMakeLists.txt @@ -0,0 +1,25 @@ +add_library( lbm_generated) + +target_link_libraries( lbm_generated + PUBLIC + blockforest + boundary + communication + core + domain_decomposition + field + geometry + gui + stencil + timeloop + vtk + ) + +add_subdirectory( boundary ) +add_subdirectory( communication ) +add_subdirectory( gpu ) +add_subdirectory( evaluation ) +add_subdirectory( field ) +add_subdirectory( refinement ) 
+add_subdirectory( storage_specification ) +add_subdirectory( sweep_collection ) \ No newline at end of file diff --git a/src/lbm_generated/boundary/CMakeLists.txt b/src/lbm_generated/boundary/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..201337a88fa1547002e0266b8837f369cf893b59 --- /dev/null +++ b/src/lbm_generated/boundary/CMakeLists.txt @@ -0,0 +1,25 @@ +target_sources( lbm_generated + PRIVATE + D3Q19BoundaryCollection.h + D3Q27BoundaryCollection.h + FreeSlipD3Q19.h + FreeSlipD3Q19.cpp + FreeSlipD3Q27.h + FreeSlipD3Q27.cpp + FixedDensityD3Q19.h + FixedDensityD3Q19.cpp + FixedDensityD3Q27.h + FixedDensityD3Q27.cpp + NoSlipD3Q19.h + NoSlipD3Q19.cpp + NoSlipD3Q27.h + NoSlipD3Q27.cpp + OutflowD3Q19.h + OutflowD3Q19.cpp + OutflowD3Q27.h + OutflowD3Q27.cpp + UBBD3Q19.h + UBBD3Q19.cpp + UBBD3Q27.h + UBBD3Q27.cpp + ) diff --git a/src/lbm_generated/boundary/D3Q19BoundaryCollection.h b/src/lbm_generated/boundary/D3Q19BoundaryCollection.h new file mode 100644 index 0000000000000000000000000000000000000000..eb1a23fb52be36ec0471bf05989512724acdc477 --- /dev/null +++ b/src/lbm_generated/boundary/D3Q19BoundaryCollection.h @@ -0,0 +1,123 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). 
If not, see <http://www.gnu.org/licenses/>. +// +//! \\file D3Q19BoundaryCollection.h +//! \\author lbmpy +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" +#include "domain_decomposition/IBlock.h" + +#include "OutflowD3Q19.h" +#include "FixedDensityD3Q19.h" +#include "FreeSlipD3Q19.h" +#include "NoSlipD3Q19.h" +#include "UBBD3Q19.h" + + + +namespace walberla{ +namespace lbm { + +template <typename FlagField_T> +class D3Q19BoundaryCollection +{ + public: + enum Type { ALL = 0, INNER = 1, OUTER = 2 }; + + + D3Q19BoundaryCollection(const shared_ptr<StructuredBlockForest> & blocks, BlockDataID flagID_, BlockDataID pdfsID_, FlagUID domainUID_, double density, double u_x, double u_y, double u_z) + : blocks_(blocks), flagID(flagID_), pdfsID(pdfsID_), domainUID(domainUID_) + { + OutflowD3Q19Object = std::make_shared< lbm::OutflowD3Q19 >(blocks, pdfsID); + FixedDensityD3Q19Object = std::make_shared< lbm::FixedDensityD3Q19 >(blocks, pdfsID, density); + FreeSlipD3Q19Object = std::make_shared< lbm::FreeSlipD3Q19 >(blocks, pdfsID); + NoSlipD3Q19Object = std::make_shared< lbm::NoSlipD3Q19 >(blocks, pdfsID); + UBBD3Q19Object = std::make_shared< lbm::UBBD3Q19 >(blocks, pdfsID, u_x, u_y, u_z); + + + OutflowD3Q19Object->fillFromFlagField<FlagField_T>(blocks, flagID, walberla::FlagUID("Outflow"), domainUID); + FixedDensityD3Q19Object->fillFromFlagField<FlagField_T>(blocks, flagID, walberla::FlagUID("FixedDensity"), domainUID); + FreeSlipD3Q19Object->fillFromFlagField<FlagField_T>(blocks, flagID, walberla::FlagUID("FreeSlip"), domainUID); + NoSlipD3Q19Object->fillFromFlagField<FlagField_T>(blocks, flagID, walberla::FlagUID("NoSlip"), domainUID); + UBBD3Q19Object->fillFromFlagField<FlagField_T>(blocks, flagID, walberla::FlagUID("UBB"), domainUID); + + } + + void run (IBlock * block) + { + OutflowD3Q19Object->run(block); + FixedDensityD3Q19Object->run(block); + 
FreeSlipD3Q19Object->run(block); + NoSlipD3Q19Object->run(block); + UBBD3Q19Object->run(block); + + } + + void inner (IBlock * block) + { + OutflowD3Q19Object->inner(block); + FixedDensityD3Q19Object->inner(block); + FreeSlipD3Q19Object->inner(block); + NoSlipD3Q19Object->inner(block); + UBBD3Q19Object->inner(block); + + } + + void outer (IBlock * block) + { + OutflowD3Q19Object->outer(block); + FixedDensityD3Q19Object->outer(block); + FreeSlipD3Q19Object->outer(block); + NoSlipD3Q19Object->outer(block); + UBBD3Q19Object->outer(block); + + } + + void operator() (IBlock * block) + { + run(block); + } + + std::function<void (IBlock *)> getSweep(Type type = Type::ALL) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { this->inner(block); }; + case Type::OUTER: + return [this](IBlock* block) { this->outer(block); }; + default: + return [this](IBlock* block) { this->run(block); }; + } + } + + weak_ptr< StructuredBlockStorage > blocks_; + BlockDataID flagID; + BlockDataID pdfsID; + walberla::FlagUID domainUID; + + shared_ptr<lbm::OutflowD3Q19> OutflowD3Q19Object; + shared_ptr<lbm::FixedDensityD3Q19> FixedDensityD3Q19Object; + shared_ptr<lbm::FreeSlipD3Q19> FreeSlipD3Q19Object; + shared_ptr<lbm::NoSlipD3Q19> NoSlipD3Q19Object; + shared_ptr<lbm::UBBD3Q19> UBBD3Q19Object; + +}; + +} +} diff --git a/src/lbm_generated/boundary/D3Q27BoundaryCollection.h b/src/lbm_generated/boundary/D3Q27BoundaryCollection.h new file mode 100644 index 0000000000000000000000000000000000000000..3428689bda22764cf3552e641d4c1f2656bab37a --- /dev/null +++ b/src/lbm_generated/boundary/D3Q27BoundaryCollection.h @@ -0,0 +1,123 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file D3Q27BoundaryCollection.h +//! \\author lbmpy +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" +#include "domain_decomposition/IBlock.h" + +#include "OutflowD3Q27.h" +#include "FixedDensityD3Q27.h" +#include "FreeSlipD3Q27.h" +#include "NoSlipD3Q27.h" +#include "UBBD3Q27.h" + + + +namespace walberla{ +namespace lbm { + +template <typename FlagField_T> +class D3Q27BoundaryCollection +{ + public: + enum Type { ALL = 0, INNER = 1, OUTER = 2 }; + + + D3Q27BoundaryCollection(const shared_ptr<StructuredBlockForest> & blocks, BlockDataID flagID_, BlockDataID pdfsID_, FlagUID domainUID_, double density, double u_x, double u_y, double u_z) + : blocks_(blocks), flagID(flagID_), pdfsID(pdfsID_), domainUID(domainUID_) + { + OutflowD3Q27Object = std::make_shared< lbm::OutflowD3Q27 >(blocks, pdfsID); + FixedDensityD3Q27Object = std::make_shared< lbm::FixedDensityD3Q27 >(blocks, pdfsID, density); + FreeSlipD3Q27Object = std::make_shared< lbm::FreeSlipD3Q27 >(blocks, pdfsID); + NoSlipD3Q27Object = std::make_shared< lbm::NoSlipD3Q27 >(blocks, pdfsID); + UBBD3Q27Object = std::make_shared< lbm::UBBD3Q27 >(blocks, pdfsID, u_x, u_y, u_z); + + + OutflowD3Q27Object->fillFromFlagField<FlagField_T>(blocks, 
flagID, walberla::FlagUID("Outflow"), domainUID); + FixedDensityD3Q27Object->fillFromFlagField<FlagField_T>(blocks, flagID, walberla::FlagUID("FixedDensity"), domainUID); + FreeSlipD3Q27Object->fillFromFlagField<FlagField_T>(blocks, flagID, walberla::FlagUID("FreeSlip"), domainUID); + NoSlipD3Q27Object->fillFromFlagField<FlagField_T>(blocks, flagID, walberla::FlagUID("NoSlip"), domainUID); + UBBD3Q27Object->fillFromFlagField<FlagField_T>(blocks, flagID, walberla::FlagUID("UBB"), domainUID); + + } + + void run (IBlock * block) + { + OutflowD3Q27Object->run(block); + FixedDensityD3Q27Object->run(block); + FreeSlipD3Q27Object->run(block); + NoSlipD3Q27Object->run(block); + UBBD3Q27Object->run(block); + + } + + void inner (IBlock * block) + { + OutflowD3Q27Object->inner(block); + FixedDensityD3Q27Object->inner(block); + FreeSlipD3Q27Object->inner(block); + NoSlipD3Q27Object->inner(block); + UBBD3Q27Object->inner(block); + + } + + void outer (IBlock * block) + { + OutflowD3Q27Object->outer(block); + FixedDensityD3Q27Object->outer(block); + FreeSlipD3Q27Object->outer(block); + NoSlipD3Q27Object->outer(block); + UBBD3Q27Object->outer(block); + + } + + void operator() (IBlock * block) + { + run(block); + } + + std::function<void (IBlock *)> getSweep(Type type = Type::ALL) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { this->inner(block); }; + case Type::OUTER: + return [this](IBlock* block) { this->outer(block); }; + default: + return [this](IBlock* block) { this->run(block); }; + } + } + + weak_ptr< StructuredBlockStorage > blocks_; + BlockDataID flagID; + BlockDataID pdfsID; + walberla::FlagUID domainUID; + + shared_ptr<lbm::OutflowD3Q27> OutflowD3Q27Object; + shared_ptr<lbm::FixedDensityD3Q27> FixedDensityD3Q27Object; + shared_ptr<lbm::FreeSlipD3Q27> FreeSlipD3Q27Object; + shared_ptr<lbm::NoSlipD3Q27> NoSlipD3Q27Object; + shared_ptr<lbm::UBBD3Q27> UBBD3Q27Object; + +}; + +} +} diff --git a/src/lbm_generated/boundary/FixedDensityD3Q19.cpp 
b/src/lbm_generated/boundary/FixedDensityD3Q19.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e449704f5a0bfa4932344fef2a8cab378770592f --- /dev/null +++ b/src/lbm_generated/boundary/FixedDensityD3Q19.cpp @@ -0,0 +1,141 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file FixedDensityD3Q19.cpp +//! 
\\author pystencils +//====================================================================================================================== + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "FixedDensityD3Q19.h" + + + +#define FUNC_PREFIX + +using namespace std; + +namespace walberla { +namespace lbm { + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#ifdef __CUDACC__ +#pragma push +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diag_suppress 177 +#else +#pragma diag_suppress 177 +#endif +#endif + +namespace internal_fixeddensityd3q19_even { +static FUNC_PREFIX void fixeddensityd3q19_even(const uint8_t * RESTRICT const _data_indexVector, double * RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, double density, int32_t indexVectorSize) +{ + + const int32_t f_in_inv_dir_idx [] = { 0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13 }; + const int32_t f_in_inv_offsets_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1 }; + const int32_t f_in_inv_offsets_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0 }; + const int32_t f_in_inv_offsets_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1 }; + + const double rho = density; + const double delta_rho = rho - 1.0; + for (int64_t ctr_0 = 0; ctr_0 < indexVectorSize; ctr_0 += 1) + { + const int32_t x = *((int32_t * )(& _data_indexVector[16*ctr_0])); + const int32_t y = *((int32_t * )(& _data_indexVector[16*ctr_0 + 4])); + const int32_t z = *((int32_t * )(& _data_indexVector[16*ctr_0 + 8])); + const int32_t dir = *((int32_t * )(& _data_indexVector[16*ctr_0 + 12])); + const double vel0Term = _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 10*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 
14*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 18*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 4*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 8*_stride_pdfs_3]; + const double vel1Term = _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 11*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 15*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 7*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3]; + const double vel2Term = _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 12*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 13*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 5*_stride_pdfs_3]; + const double u_0 = vel0Term - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 13*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 17*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 3*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 7*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 9*_stride_pdfs_3]; + const double u_1 = vel1Term - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 10*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 12*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 16*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 2*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 9*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + 
_stride_pdfs_2*z + 8*_stride_pdfs_3]; + const double u_2 = vel2Term - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 15*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 16*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 17*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 18*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 6*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 11*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 14*_stride_pdfs_3]; + const double u0Mu1 = u_0 + u_1*-1.0; + const double u0Pu1 = u_0 + u_1; + const double u1Pu2 = u_1 + u_2; + const double u1Mu2 = u_1 + u_2*-1.0; + const double u0Mu2 = u_0 + u_2*-1.0; + const double u0Pu2 = u_0 + u_2; + const double f_eq_common = delta_rho - 1.0*(u_0*u_0) - 1.0*(u_1*u_1) - 1.0*(u_2*u_2); + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*f_in_inv_offsets_x[dir] + _stride_pdfs_1*y + _stride_pdfs_1*f_in_inv_offsets_y[dir] + _stride_pdfs_2*z + _stride_pdfs_2*f_in_inv_offsets_z[dir] + _stride_pdfs_3*f_in_inv_dir_idx[dir]] = -1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3*dir] + 2.0*((((dir) == (0))) ? (f_eq_common*0.33333333333333331): ((((dir) == (1)) || ((dir) == (2))) ? (delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + 0.33333333333333331*(u_1*u_1)): ((((dir) == (3)) || ((dir) == (4))) ? (delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + 0.33333333333333331*(u_0*u_0)): ((((dir) == (5)) || ((dir) == (6))) ? (delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + 0.33333333333333331*(u_2*u_2)): ((((dir) == (7))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 0.041666666666666664*(u_2*u_2) + 0.125*(u0Mu1*u0Mu1)): ((((dir) == (8)) || ((dir) == (9))) ? 
(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 0.041666666666666664*(u_2*u_2) + 0.125*(u0Pu1*u0Pu1)): ((((dir) == (10))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 0.041666666666666664*(u_2*u_2) + 0.125*(u0Mu1*u0Mu1)): ((((dir) == (11))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Pu2*u1Pu2)): ((((dir) == (12))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Mu2*u1Mu2)): ((((dir) == (13))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Mu2*u0Mu2)): ((((dir) == (14))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Pu2*u0Pu2)): ((((dir) == (15))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Mu2*u1Mu2)): ((((dir) == (16))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Pu2*u1Pu2)): ((((dir) == (17))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Pu2*u0Pu2)): ((((dir) == (18))) ? 
(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Mu2*u0Mu2)): (0.0)))))))))))))))); + } +} +} + + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +#ifdef __CUDACC__ +#pragma pop +#endif + + +void FixedDensityD3Q19::run_impl(IBlock * block, IndexVectors::Type type) +{ + auto * indexVectors = block->getData<IndexVectors>(indexVectorID); + int32_t indexVectorSize = int32_c( indexVectors->indexVector(type).size() ); + if( indexVectorSize == 0) + return; + + + auto pointer = indexVectors->pointerCpu(type); + + + uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + + uint8_t timestep = pdfs->getTimestep(); + auto & density = density_; + WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + if(((timestep & 1) ^ 1)) { + internal_fixeddensityd3q19_even::fixeddensityd3q19_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, density, indexVectorSize); + } else { + internal_fixeddensityd3q19_even::fixeddensityd3q19_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, density, indexVectorSize); + } +} + +void FixedDensityD3Q19::run(IBlock * block) +{ + run_impl(block, IndexVectors::ALL); +} + +void FixedDensityD3Q19::inner(IBlock * block) +{ + run_impl(block, IndexVectors::INNER); +} + +void FixedDensityD3Q19::outer(IBlock * block) +{ + run_impl(block, IndexVectors::OUTER); +} + +} // namespace lbm +} // namespace walberla + diff --git a/src/lbm_generated/boundary/FixedDensityD3Q19.h 
b/src/lbm_generated/boundary/FixedDensityD3Q19.h new file mode 100644 index 0000000000000000000000000000000000000000..b4575d189724633c503fc0ba94a004c5b07ef9c2 --- /dev/null +++ b/src/lbm_generated/boundary/FixedDensityD3Q19.h @@ -0,0 +1,509 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file FixedDensityD3Q19.h +//! 
\\author pystencils +//====================================================================================================================== + +#pragma once +#include "core/DataTypes.h" + +#include "field/GhostLayerField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" +#include "core/debug/Debug.h" + +#include <set> +#include <vector> + + + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class FixedDensityD3Q19 +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir; + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + IndexVectors() = default; + bool operator==(IndexVectors const &other) const { return other.cpuVectors_ == cpuVectors_; } + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return cpuVectors_[t].data(); } + + void syncGPU() + { + + } + + private: + std::vector<CpuIndexVector> cpuVectors_{NUM_TYPES}; + + + }; + + FixedDensityD3Q19( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_, double density) + : pdfsID(pdfsID_), density_(density) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_FixedDensityD3Q19"); + }; + + void run (IBlock * block); + + void operator() (IBlock * block) + { + run(block); + } + + void inner (IBlock * 
block); + + void outer (IBlock * block); + + std::function<void (IBlock *)> getSweep() + { + return [this] + (IBlock * b) + { this->run(b); }; + } + + std::function<void (IBlock *)> getInnerSweep() + { + return [this] + (IBlock * b) + { this->inner(b); }; + } + + std::function<void (IBlock *)> getOuterSweep() + { + return [this] + (IBlock * b) + { this->outer(b); }; + } + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>(&*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField(IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER); + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + + + if( !(flagField->flagExists(boundaryFlagUID) && flagField->flagExists(domainFlagUID) )) + return; + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = flagField->getFlag(domainFlagUID); + + auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 0 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 1 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 2 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 3 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 5 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 6 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 7 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 9 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 11 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 12 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 13 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 15 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 16 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 17 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + + + + indexVectors->syncGPU(); + } + +private: + void run_impl(IBlock * block, IndexVectors::Type type); + + BlockDataID indexVectorID; + +public: + BlockDataID pdfsID; + double density_; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/boundary/FixedDensityD3Q27.cpp b/src/lbm_generated/boundary/FixedDensityD3Q27.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3ff43bc5efa34a0ba88e8205440f46e5fa6db94b --- /dev/null +++ b/src/lbm_generated/boundary/FixedDensityD3Q27.cpp @@ -0,0 +1,140 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file FixedDensityD3Q27.cpp +//! 
//! \\author pystencils
//======================================================================================================================

// NOTE(review): pystencils-generated translation unit, rewritten for readability. The kernel below
// preserves the exact floating-point evaluation order of the generated code; only naming, layout
// and documentation differ.

#include <cstdint>

#if __has_include("FixedDensityD3Q27.h")
#include "core/DataTypes.h"
#include "core/Macros.h"
#include "FixedDensityD3Q27.h"
#endif

#define FUNC_PREFIX

// Fallback for standalone compilation; in the full build RESTRICT comes from FixedDensityD3Q27.h.
#ifndef RESTRICT
#ifdef __GNUC__
#define RESTRICT __restrict__
#else
#define RESTRICT
#endif
#endif

using namespace std;

namespace walberla {
namespace lbm {

#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wconversion"
#endif

#ifdef __CUDACC__
#pragma push
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diag_suppress 177
#else
#pragma diag_suppress 177
#endif
#endif

namespace internal_fixeddensityd3q27_even {

// Fixed-density (anti-bounce-back pressure) boundary kernel for a D3Q27 lattice.
//
//   _data_indexVector : packed boundary links, 16 bytes each: int32 x, y, z, dir
//   _data_pdfs        : PDF field data, addressed through the four strides
//   density           : prescribed boundary density
//   indexVectorSize   : number of packed entries
//
// For every boundary link the incoming population is reconstructed as
//   f_inv(x + e_dir) = -f_dir(x) + 2 * f_eq_dir(density, u(x)),
// where u(x) is the first-order velocity moment of the local PDFs.
static FUNC_PREFIX void fixeddensityd3q27_even(const uint8_t * RESTRICT const _data_indexVector, double * RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, double density, int32_t indexVectorSize)
{
   // Inverse directions and neighbour offsets of the D3Q27 stencil, indexed by link direction.
   const int32_t f_in_inv_dir_idx [] = { 0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13,26,25,24,23,22,21,20,19 };
   const int32_t f_in_inv_offsets_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1,1,-1,1,-1,1,-1,1,-1 };
   const int32_t f_in_inv_offsets_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0,1,1,-1,-1,1,1,-1,-1 };
   const int32_t f_in_inv_offsets_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1,1,1,1,1,-1,-1,-1,-1 };

   const double rho = density;
   const double delta_rho = rho - 1.0;

   for (int64_t i = 0; i < indexVectorSize; i += 1)
   {
      // Decode one packed boundary link.
      const uint8_t * entry = &_data_indexVector[16*i];
      const int32_t x   = *reinterpret_cast<const int32_t *>(entry);
      const int32_t y   = *reinterpret_cast<const int32_t *>(entry + 4);
      const int32_t z   = *reinterpret_cast<const int32_t *>(entry + 8);
      const int32_t dir = *reinterpret_cast<const int32_t *>(entry + 12);

      // Base pointer of the PDF vector at cell (x, y, z); population k lives at pdf[k*fs].
      double * const pdf = _data_pdfs + _stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z;
      const int64_t fs = _stride_pdfs_3;

      // First-order velocity moments (summation order kept identical to the generated code,
      // so results are bit-for-bit the same).
      const double vel0Term = pdf[10*fs] + pdf[14*fs] + pdf[18*fs] + pdf[19*fs] + pdf[21*fs] + pdf[23*fs] + pdf[25*fs] + pdf[4*fs] + pdf[8*fs];
      const double vel1Term = pdf[11*fs] + pdf[15*fs] + pdf[20*fs] + pdf[24*fs] + pdf[7*fs] + pdf[fs];
      const double vel2Term = pdf[12*fs] + pdf[13*fs] + pdf[22*fs] + pdf[5*fs];
      const double u_0 = vel0Term - pdf[13*fs] - pdf[17*fs] - pdf[20*fs] - pdf[22*fs] - pdf[24*fs] - pdf[26*fs] - pdf[3*fs] - pdf[7*fs] - pdf[9*fs];
      const double u_1 = vel1Term - pdf[10*fs] - pdf[12*fs] - pdf[16*fs] - pdf[2*fs] - pdf[21*fs] - pdf[22*fs] - pdf[25*fs] - pdf[26*fs] - pdf[9*fs] + pdf[19*fs] + pdf[23*fs] + pdf[8*fs];
      const double u_2 = vel2Term - pdf[15*fs] - pdf[16*fs] - pdf[17*fs] - pdf[18*fs] - pdf[23*fs] - pdf[24*fs] - pdf[25*fs] - pdf[26*fs] - pdf[6*fs] + pdf[11*fs] + pdf[14*fs] + pdf[19*fs] + pdf[20*fs] + pdf[21*fs];

      const double u0Mu1 = u_0 - u_1;
      const double u0Pu1 = u_0 + u_1;
      const double u1Pu2 = u_1 + u_2;
      const double u1Mu2 = u_1 - u_2;
      const double u0Mu2 = u_0 - u_2;
      const double u0Pu2 = u_0 + u_2;
      const double f_eq_common = delta_rho - 1.5*(u_0*u_0) - 1.5*(u_1*u_1) - 1.5*(u_2*u_2);

      // Equilibrium value of the outgoing population. The generated nested-ternary chain is a
      // switch here; the arithmetic per direction (including addition order) is unchanged.
      double f_eq;
      switch (dir)
      {
         case 0:           f_eq = f_eq_common*0.29629629629629628; break;
         case 1: case 2:   f_eq = f_eq_common*0.07407407407407407 + 0.33333333333333331*(u_1*u_1); break;
         case 3: case 4:   f_eq = f_eq_common*0.07407407407407407 + 0.33333333333333331*(u_0*u_0); break;
         case 5: case 6:   f_eq = f_eq_common*0.07407407407407407 + 0.33333333333333331*(u_2*u_2); break;
         case 7: case 10:  f_eq = f_eq_common*0.018518518518518517 + 0.083333333333333329*(u0Mu1*u0Mu1); break;
         case 8: case 9:   f_eq = f_eq_common*0.018518518518518517 + 0.083333333333333329*(u0Pu1*u0Pu1); break;
         case 11: case 16: f_eq = f_eq_common*0.018518518518518517 + 0.083333333333333329*(u1Pu2*u1Pu2); break;
         case 12: case 15: f_eq = f_eq_common*0.018518518518518517 + 0.083333333333333329*(u1Mu2*u1Mu2); break;
         case 13: case 18: f_eq = f_eq_common*0.018518518518518517 + 0.083333333333333329*(u0Mu2*u0Mu2); break;
         case 14: case 17: f_eq = f_eq_common*0.018518518518518517 + 0.083333333333333329*(u0Pu2*u0Pu2); break;
         case 19: case 26: f_eq = delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Pu2*u1Pu2); break;
         case 20: case 25: f_eq = delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + 0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u1Pu2*u1Pu2); break;
         case 21: case 24: f_eq = delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + 0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Mu2*u1Mu2); break;
         case 22: case 23: f_eq = delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u1Mu2*u1Mu2); break;
         default:          f_eq = 0.0; break;
      }

      // Anti-bounce-back write into the inverse population of the pulling neighbour cell.
      pdf[_stride_pdfs_0*f_in_inv_offsets_x[dir] + _stride_pdfs_1*f_in_inv_offsets_y[dir] + _stride_pdfs_2*f_in_inv_offsets_z[dir] + fs*f_in_inv_dir_idx[dir]] = -pdf[fs*dir] + 2.0*f_eq;
   }
}
} // namespace internal_fixeddensityd3q27_even

#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif

#ifdef __CUDACC__
#pragma pop
#endif

#if __has_include("FixedDensityD3Q27.h")

// Gathers field pointer/strides for one block and dispatches to the generated kernel.
void FixedDensityD3Q27::run_impl(IBlock * block, IndexVectors::Type type)
{
   auto * indexVectors = block->getData<IndexVectors>(indexVectorID);
   const int32_t indexVectorSize = int32_c( indexVectors->indexVector(type).size() );
   if( indexVectorSize == 0 )
      return;

   auto * _data_indexVector = reinterpret_cast<uint8_t*>( indexVectors->pointerCpu(type) );

   auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID);

   const uint8_t timestep = pdfs->getTimestep();
   const double density = density_;
   WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
   double * RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
   const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
   const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
   const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
   const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
   // NOTE(review): both timestep parities currently dispatch to the same "even" kernel; presumably
   // this configuration has no in-place (odd) variant — confirm against the code generator before
   // collapsing the branch.
   if( ((timestep & 1) ^ 1) ) {
      internal_fixeddensityd3q27_even::fixeddensityd3q27_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, density, indexVectorSize);
   } else {
      internal_fixeddensityd3q27_even::fixeddensityd3q27_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, density, indexVectorSize);
   }
}

// Sweep entry points: apply the boundary on all / inner / outer boundary links of a block.
void FixedDensityD3Q27::run(IBlock * block)
{
   run_impl(block, IndexVectors::ALL);
}

void FixedDensityD3Q27::inner(IBlock * block)
{
   run_impl(block, IndexVectors::INNER);
}

void FixedDensityD3Q27::outer(IBlock * block)
{
   run_impl(block, IndexVectors::OUTER);
}

#endif // __has_include("FixedDensityD3Q27.h")

} // namespace lbm
} // namespace walberla

//----------------------------------------------------------------------------------------------------------------------
// (review note: flattened diff metadata for src/lbm_generated/boundary/FixedDensityD3Q27.h removed here)
//======================================================================================================================
//
//  This file is part of waLBerla. waLBerla is free software: you can
//  redistribute it and/or modify it under the terms of the GNU General Public
//  License as published by the Free Software Foundation, either version 3 of
//  the License, or (at your option) any later version.
//
//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
//  for more details.
//
//  You should have received a copy of the GNU General Public License along
//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \\file FixedDensityD3Q27.h
//!
\\author pystencils +//====================================================================================================================== + +#pragma once +#include "core/DataTypes.h" + +#include "field/GhostLayerField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" +#include "core/debug/Debug.h" + +#include <set> +#include <vector> + + + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class FixedDensityD3Q27 +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir; + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + IndexVectors() = default; + bool operator==(IndexVectors const &other) const { return other.cpuVectors_ == cpuVectors_; } + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return cpuVectors_[t].data(); } + + void syncGPU() + { + + } + + private: + std::vector<CpuIndexVector> cpuVectors_{NUM_TYPES}; + + + }; + + FixedDensityD3Q27( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_, double density) + : pdfsID(pdfsID_), density_(density) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_FixedDensityD3Q27"); + }; + + void run (IBlock * block); + + void operator() (IBlock * block) + { + run(block); + } + + void inner (IBlock * 
block); + + void outer (IBlock * block); + + std::function<void (IBlock *)> getSweep() + { + return [this] + (IBlock * b) + { this->run(b); }; + } + + std::function<void (IBlock *)> getInnerSweep() + { + return [this] + (IBlock * b) + { this->inner(b); }; + } + + std::function<void (IBlock *)> getOuterSweep() + { + return [this] + (IBlock * b) + { this->outer(b); }; + } + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>(&*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField(IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER); + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + + + if( !(flagField->flagExists(boundaryFlagUID) && flagField->flagExists(domainFlagUID) )) + return; + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = flagField->getFlag(domainFlagUID); + + auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 0 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 1 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 2 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 3 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 5 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 6 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 7 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 9 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 11 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 12 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 13 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 15 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 16 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 17 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 19 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 20 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 21 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 22 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 23 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 24 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 25 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 26 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + + + + indexVectors->syncGPU(); + } + +private: + void run_impl(IBlock * block, IndexVectors::Type type); + + BlockDataID indexVectorID; + +public: + BlockDataID pdfsID; + double density_; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/boundary/FreeSlipD3Q19.cpp b/src/lbm_generated/boundary/FreeSlipD3Q19.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2e3dc46580b5cbd0bdf533dd33742986ab13cd7f --- /dev/null +++ b/src/lbm_generated/boundary/FreeSlipD3Q19.cpp @@ -0,0 +1,132 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file FreeSlipD3Q19.cpp +//! 
//! \\author pystencils
//======================================================================================================================

// NOTE(review): pystencils-generated translation unit, rewritten for readability with identical
// behaviour (same memory accesses, same evaluation order).

#include <cstdint>

#if __has_include("FreeSlipD3Q19.h")
#include "core/DataTypes.h"
#include "core/Macros.h"
#include "FreeSlipD3Q19.h"
#endif

#define FUNC_PREFIX

// Fallback for standalone compilation; in the full build RESTRICT comes from FreeSlipD3Q19.h.
#ifndef RESTRICT
#ifdef __GNUC__
#define RESTRICT __restrict__
#else
#define RESTRICT
#endif
#endif

using namespace std;

namespace walberla {
namespace lbm {

#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wconversion"
#endif

#ifdef __CUDACC__
#pragma push
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diag_suppress 177
#else
#pragma diag_suppress 177
#endif
#endif

namespace internal_freeslipd3q19_even {

// Free-slip boundary kernel for a D3Q19 lattice.
//
//   _data_indexVector : packed boundary links, 32 bytes each:
//                       int32 x, y, z, dir, wnx, wny, wnz, ref_dir
//   _data_pdfs        : PDF field data, addressed through the four strides
//   indexVectorSize   : number of packed entries
//
// The incoming population at the pulling neighbour is copied from the mirrored cell/direction
// that was precomputed into the index vector (the wn* offsets and ref_dir entries).
static FUNC_PREFIX void freeslipd3q19_even(const uint8_t * RESTRICT const _data_indexVector, double * RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int32_t indexVectorSize)
{
   // Inverse directions and neighbour offsets of the D3Q19 stencil, indexed by link direction.
   const int32_t f_in_inv_dir_idx [] = { 0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13 };
   const int32_t f_in_inv_offsets_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1 };
   const int32_t f_in_inv_offsets_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0 };
   const int32_t f_in_inv_offsets_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1 };

   const int32_t neighbour_offset_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1 };
   const int32_t neighbour_offset_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0 };
   const int32_t neighbour_offset_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1 };

   for (int64_t i = 0; i < indexVectorSize; i += 1)
   {
      // Decode one packed boundary link.
      const uint8_t * entry = &_data_indexVector[32*i];
      const int32_t x       = *reinterpret_cast<const int32_t *>(entry);
      const int32_t y       = *reinterpret_cast<const int32_t *>(entry + 4);
      const int32_t z       = *reinterpret_cast<const int32_t *>(entry + 8);
      const int32_t dir     = *reinterpret_cast<const int32_t *>(entry + 12);
      const int32_t wnx     = *reinterpret_cast<const int32_t *>(entry + 16);
      const int32_t wny     = *reinterpret_cast<const int32_t *>(entry + 20);
      const int32_t wnz     = *reinterpret_cast<const int32_t *>(entry + 24);
      const int32_t ref_dir = *reinterpret_cast<const int32_t *>(entry + 28);

      // Base pointer of the PDF vector at cell (x, y, z).
      double * const pdf = _data_pdfs + _stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z;

      // Destination: inverse population at the pulling neighbour.
      // Source: mirrored population at the precomputed wall-normal neighbour.
      pdf[_stride_pdfs_0*f_in_inv_offsets_x[dir] + _stride_pdfs_1*f_in_inv_offsets_y[dir] + _stride_pdfs_2*f_in_inv_offsets_z[dir] + _stride_pdfs_3*f_in_inv_dir_idx[dir]] =
         pdf[_stride_pdfs_0*(wnx + neighbour_offset_x[dir]) + _stride_pdfs_1*(wny + neighbour_offset_y[dir]) + _stride_pdfs_2*(wnz + neighbour_offset_z[dir]) + _stride_pdfs_3*ref_dir];
   }
}
} // namespace internal_freeslipd3q19_even


#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif

#ifdef __CUDACC__
#pragma pop
#endif

#if __has_include("FreeSlipD3Q19.h")

// Gathers field pointer/strides for one block and dispatches to the generated kernel.
void FreeSlipD3Q19::run_impl(IBlock * block, IndexVectors::Type type)
{
   auto * indexVectors = block->getData<IndexVectors>(indexVectorID);
   const int32_t indexVectorSize = int32_c( indexVectors->indexVector(type).size() );
   if( indexVectorSize == 0 )
      return;

   auto * _data_indexVector = reinterpret_cast<uint8_t*>( indexVectors->pointerCpu(type) );

   auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID);

   const uint8_t timestep = pdfs->getTimestep();

   WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers()))
   double * RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0);
   const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
   const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
   const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
   const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
   // NOTE(review): both timestep parities currently dispatch to the same "even" kernel; presumably
   // this configuration has no in-place (odd) variant — confirm against the code generator before
   // collapsing the branch.
   if( ((timestep & 1) ^ 1) ) {
      internal_freeslipd3q19_even::freeslipd3q19_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize);
   } else {
      internal_freeslipd3q19_even::freeslipd3q19_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize);
   }
}

// Sweep entry points: apply the boundary on all / inner / outer boundary links of a block.
void FreeSlipD3Q19::run(IBlock * block)
{
   run_impl(block, IndexVectors::ALL);
}

void FreeSlipD3Q19::inner(IBlock * block)
{
   run_impl(block, IndexVectors::INNER);
}

void FreeSlipD3Q19::outer(IBlock * block)
{
   run_impl(block, IndexVectors::OUTER);
}

#endif // __has_include("FreeSlipD3Q19.h")

} // namespace lbm
} // namespace walberla

//----------------------------------------------------------------------------------------------------------------------
// (review note: flattened diff metadata for src/lbm_generated/boundary/FreeSlipD3Q19.h removed here)
//======================================================================================================================
//
//  This file is part of waLBerla. waLBerla is free software: you can
//  redistribute it and/or modify it under the terms of the GNU General Public
//  License as published by the Free Software Foundation, either version 3 of
//  the License, or (at your option) any later version.
//
//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
//  for more details.
//
//  You should have received a copy of the GNU General Public License along
//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \\file FreeSlipD3Q19.h
//!
\\author pystencils +//====================================================================================================================== + +#pragma once +#include "core/DataTypes.h" + +#include "field/GhostLayerField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" +#include "core/debug/Debug.h" + +#include <set> +#include <vector> + + + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class FreeSlipD3Q19 +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + int32_t wnx; + int32_t wny; + int32_t wnz; + int32_t ref_dir; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_), wnx(), wny(), wnz(), ref_dir() {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir && wnx == o.wnx && wny == o.wny && wnz == o.wnz && ref_dir == o.ref_dir; + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + IndexVectors() = default; + bool operator==(IndexVectors const &other) const { return other.cpuVectors_ == cpuVectors_; } + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return cpuVectors_[t].data(); } + + void syncGPU() + { + + } + + private: + std::vector<CpuIndexVector> cpuVectors_{NUM_TYPES}; + + + }; + + FreeSlipD3Q19( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_) + : pdfsID(pdfsID_) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_FreeSlipD3Q19"); + }; 
+ + void run (IBlock * block); + + void operator() (IBlock * block) + { + run(block); + } + + void inner (IBlock * block); + + void outer (IBlock * block); + + std::function<void (IBlock *)> getSweep() + { + return [this] + (IBlock * b) + { this->run(b); }; + } + + std::function<void (IBlock *)> getInnerSweep() + { + return [this] + (IBlock * b) + { this->inner(b); }; + } + + std::function<void (IBlock *)> getOuterSweep() + { + return [this] + (IBlock * b) + { this->outer(b); }; + } + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>(&*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField(IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER); + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + + + if( !(flagField->flagExists(boundaryFlagUID) && flagField->flagExists(domainFlagUID) )) + return; + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = flagField->getFlag(domainFlagUID); + + auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 0 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(0, 0, 0); + int32_t ref_dir = 0; // dir: 0 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 0; + element.wnz = 0; + ref_dir = 0; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 1 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(0, 1, 0); + int32_t ref_dir = 2; // dir: 1 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = -1; + element.wnz = 0; + ref_dir = 1; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 2 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(0, -1, 0); + int32_t ref_dir = 1; // dir: 2 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 1; + element.wnz = 0; + ref_dir = 2; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 3 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(-1, 0, 0); + int32_t ref_dir = 4; // dir: 3 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = 0; + element.wnz = 0; + ref_dir = 3; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(1, 0, 0); + int32_t ref_dir = 3; // dir: 4 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = 0; + element.wnz = 0; + ref_dir = 4; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 5 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(0, 0, 1); + int32_t ref_dir = 6; // dir: 5 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 0; + element.wnz = -1; + ref_dir = 5; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 6 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(0, 0, -1); + int32_t ref_dir = 5; // dir: 6 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 0; + element.wnz = 1; + ref_dir = 6; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 7 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(-1, 1, 0); + int32_t ref_dir = 10; // dir: 7 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = -1; + element.wnz = 0; + ref_dir = 7; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(1, 1, 0); + int32_t ref_dir = 9; // dir: 8 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = -1; + element.wnz = 0; + ref_dir = 8; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 9 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(-1, -1, 0); + int32_t ref_dir = 8; // dir: 9 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = 1; + element.wnz = 0; + ref_dir = 9; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(1, -1, 0); + int32_t ref_dir = 7; // dir: 10 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = 1; + element.wnz = 0; + ref_dir = 10; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 11 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(0, 1, 1); + int32_t ref_dir = 16; // dir: 11 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = -1; + element.wnz = -1; + ref_dir = 11; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 12 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(0, -1, 1); + int32_t ref_dir = 15; // dir: 12 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 1; + element.wnz = -1; + ref_dir = 12; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 13 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(-1, 0, 1); + int32_t ref_dir = 18; // dir: 13 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = 0; + element.wnz = -1; + ref_dir = 13; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(1, 0, 1); + int32_t ref_dir = 17; // dir: 14 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = 0; + element.wnz = -1; + ref_dir = 14; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 15 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(0, 1, -1); + int32_t ref_dir = 12; // dir: 15 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = -1; + element.wnz = 1; + ref_dir = 15; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 16 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(0, -1, -1); + int32_t ref_dir = 11; // dir: 16 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 1; + element.wnz = 1; + ref_dir = 16; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 17 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(-1, 0, -1); + int32_t ref_dir = 14; // dir: 17 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = 0; + element.wnz = 1; + ref_dir = 17; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(1, 0, -1); + int32_t ref_dir = 13; // dir: 18 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = 0; + element.wnz = 1; + ref_dir = 18; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + + + + indexVectors->syncGPU(); + } + +private: + void run_impl(IBlock * block, IndexVectors::Type type); + + BlockDataID indexVectorID; + +public: + BlockDataID pdfsID; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/boundary/FreeSlipD3Q27.cpp b/src/lbm_generated/boundary/FreeSlipD3Q27.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3364610eec662874c88832e7ebedd144755ccf1a 
--- /dev/null +++ b/src/lbm_generated/boundary/FreeSlipD3Q27.cpp @@ -0,0 +1,132 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file FreeSlipD3Q27.cpp +//! \\author pystencils +//====================================================================================================================== + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "FreeSlipD3Q27.h" + + + +#define FUNC_PREFIX + +using namespace std; + +namespace walberla { +namespace lbm { + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#ifdef __CUDACC__ +#pragma push +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diag_suppress 177 +#else +#pragma diag_suppress 177 +#endif +#endif + +namespace internal_freeslipd3q27_even { +static FUNC_PREFIX void freeslipd3q27_even(const uint8_t * RESTRICT const _data_indexVector, double * RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int32_t indexVectorSize) +{ + + const int32_t f_in_inv_dir_idx [] = { 
0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13,26,25,24,23,22,21,20,19 }; + const int32_t f_in_inv_offsets_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1,1,-1,1,-1,1,-1,1,-1 }; + const int32_t f_in_inv_offsets_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0,1,1,-1,-1,1,1,-1,-1 }; + const int32_t f_in_inv_offsets_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1,1,1,1,1,-1,-1,-1,-1 }; + + + + const int32_t neighbour_offset_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1,1,-1,1,-1,1,-1,1,-1 }; + const int32_t neighbour_offset_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0,1,1,-1,-1,1,1,-1,-1 }; + const int32_t neighbour_offset_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1,1,1,1,1,-1,-1,-1,-1 }; + + for (int64_t ctr_0 = 0; ctr_0 < indexVectorSize; ctr_0 += 1) + { + const int32_t x = *((int32_t * )(& _data_indexVector[32*ctr_0])); + const int32_t y = *((int32_t * )(& _data_indexVector[32*ctr_0 + 4])); + const int32_t z = *((int32_t * )(& _data_indexVector[32*ctr_0 + 8])); + const int32_t dir = *((int32_t * )(& _data_indexVector[32*ctr_0 + 12])); + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*f_in_inv_offsets_x[dir] + _stride_pdfs_1*y + _stride_pdfs_1*f_in_inv_offsets_y[dir] + _stride_pdfs_2*z + _stride_pdfs_2*f_in_inv_offsets_z[dir] + _stride_pdfs_3*f_in_inv_dir_idx[dir]] = _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*(*((int32_t * )(& _data_indexVector[32*ctr_0 + 16])) + neighbour_offset_x[dir]) + _stride_pdfs_1*y + _stride_pdfs_1*(*((int32_t * )(& _data_indexVector[32*ctr_0 + 20])) + neighbour_offset_y[dir]) + _stride_pdfs_2*z + _stride_pdfs_2*(*((int32_t * )(& _data_indexVector[32*ctr_0 + 24])) + neighbour_offset_z[dir]) + _stride_pdfs_3**((int32_t * )(& _data_indexVector[32*ctr_0 + 28]))]; + } +} +} + + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +#ifdef __CUDACC__ +#pragma pop +#endif + + +void FreeSlipD3Q27::run_impl(IBlock * block, IndexVectors::Type type) +{ + auto * indexVectors = 
block->getData<IndexVectors>(indexVectorID); + int32_t indexVectorSize = int32_c( indexVectors->indexVector(type).size() ); + if( indexVectorSize == 0) + return; + + + auto pointer = indexVectors->pointerCpu(type); + + + uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + + uint8_t timestep = pdfs->getTimestep(); + + WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + if(((timestep & 1) ^ 1)) { + internal_freeslipd3q27_even::freeslipd3q27_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); + } else { + internal_freeslipd3q27_even::freeslipd3q27_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); + } +} + +void FreeSlipD3Q27::run(IBlock * block) +{ + run_impl(block, IndexVectors::ALL); +} + +void FreeSlipD3Q27::inner(IBlock * block) +{ + run_impl(block, IndexVectors::INNER); +} + +void FreeSlipD3Q27::outer(IBlock * block) +{ + run_impl(block, IndexVectors::OUTER); +} + +} // namespace lbm +} // namespace walberla + diff --git a/src/lbm_generated/boundary/FreeSlipD3Q27.h b/src/lbm_generated/boundary/FreeSlipD3Q27.h new file mode 100644 index 0000000000000000000000000000000000000000..562dfbcadd6e98f88ece133ab724080f3488b77e --- /dev/null +++ b/src/lbm_generated/boundary/FreeSlipD3Q27.h @@ -0,0 +1,1485 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file FreeSlipD3Q27.h +//! \\author pystencils +//====================================================================================================================== + +#pragma once +#include "core/DataTypes.h" + +#include "field/GhostLayerField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" +#include "core/debug/Debug.h" + +#include <set> +#include <vector> + + + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class FreeSlipD3Q27 +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + int32_t wnx; + int32_t wny; + int32_t wnz; + int32_t ref_dir; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_), wnx(), wny(), wnz(), ref_dir() {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir && wnx == o.wnx && wny == o.wny && wnz == o.wnz && ref_dir == o.ref_dir; + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + 
IndexVectors() = default; + bool operator==(IndexVectors const &other) const { return other.cpuVectors_ == cpuVectors_; } + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return cpuVectors_[t].data(); } + + void syncGPU() + { + + } + + private: + std::vector<CpuIndexVector> cpuVectors_{NUM_TYPES}; + + + }; + + FreeSlipD3Q27( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_) + : pdfsID(pdfsID_) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_FreeSlipD3Q27"); + }; + + void run (IBlock * block); + + void operator() (IBlock * block) + { + run(block); + } + + void inner (IBlock * block); + + void outer (IBlock * block); + + std::function<void (IBlock *)> getSweep() + { + return [this] + (IBlock * b) + { this->run(b); }; + } + + std::function<void (IBlock *)> getInnerSweep() + { + return [this] + (IBlock * b) + { this->inner(b); }; + } + + std::function<void (IBlock *)> getOuterSweep() + { + return [this] + (IBlock * b) + { this->outer(b); }; + } + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>(&*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField(IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = 
indexVectors->indexVector(IndexVectors::OUTER); + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + + + if( !(flagField->flagExists(boundaryFlagUID) && flagField->flagExists(domainFlagUID) )) + return; + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = flagField->getFlag(domainFlagUID); + + auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 0 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(0, 0, 0); + int32_t ref_dir = 0; // dir: 0 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 0; + element.wnz = 0; 
+ ref_dir = 0; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 1 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(0, 1, 0); + int32_t ref_dir = 2; // dir: 1 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = -1; + element.wnz = 0; + ref_dir = 1; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = 
flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 2 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(0, -1, 0); + int32_t ref_dir = 1; // dir: 2 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 1; + element.wnz = 0; + ref_dir = 2; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 3 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(-1, 0, 0); + int32_t ref_dir = 4; // dir: 3 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = 0; + element.wnz = 0; + ref_dir = 3; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(1, 0, 0); + int32_t ref_dir = 3; // dir: 4 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = 0; + element.wnz = 0; + ref_dir = 4; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 5 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(0, 0, 1); + int32_t ref_dir = 6; // dir: 5 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 0; + element.wnz = -1; + ref_dir = 5; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 6 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(0, 0, -1); + int32_t ref_dir = 5; // dir: 6 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 0; + element.wnz = 1; + ref_dir = 6; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 7 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(-1, 1, 0); + int32_t ref_dir = 10; // dir: 7 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = -1; + element.wnz = 0; + ref_dir = 7; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(1, 1, 0); + int32_t ref_dir = 9; // dir: 8 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = -1; + element.wnz = 0; + ref_dir = 8; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 9 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(-1, -1, 0); + int32_t ref_dir = 8; // dir: 9 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = 1; + element.wnz = 0; + ref_dir = 9; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(1, -1, 0); + int32_t ref_dir = 7; // dir: 10 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = 1; + element.wnz = 0; + ref_dir = 10; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 11 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(0, 1, 1); + int32_t ref_dir = 16; // dir: 11 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = -1; + element.wnz = -1; + ref_dir = 11; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 12 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(0, -1, 1); + int32_t ref_dir = 15; // dir: 12 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 1; + element.wnz = -1; + ref_dir = 12; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 13 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(-1, 0, 1); + int32_t ref_dir = 18; // dir: 13 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = 0; + element.wnz = -1; + ref_dir = 13; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(1, 0, 1); + int32_t ref_dir = 17; // dir: 14 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = 0; + element.wnz = -1; + ref_dir = 14; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 15 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(0, 1, -1); + int32_t ref_dir = 12; // dir: 15 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = -1; + element.wnz = 1; + ref_dir = 15; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 16 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(0, -1, -1); + int32_t ref_dir = 11; // dir: 16 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 1; + element.wnz = 1; + ref_dir = 16; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 17 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(-1, 0, -1); + int32_t ref_dir = 14; // dir: 17 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = 0; + element.wnz = 1; + ref_dir = 17; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(1, 0, -1); + int32_t ref_dir = 13; // dir: 18 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = 0; + element.wnz = 1; + ref_dir = 18; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 19 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(1, 1, 1); + int32_t ref_dir = 26; // dir: 19 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = -1; + element.wnz = -1; + ref_dir = 19; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 20 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(-1, 1, 1); + int32_t ref_dir = 25; // dir: 20 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = -1; + element.wnz = -1; + ref_dir = 20; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 21 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(1, -1, 1); + int32_t ref_dir = 24; // dir: 21 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = 1; + element.wnz = -1; + ref_dir = 21; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 22 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(-1, -1, 1); + int32_t ref_dir = 23; // dir: 22 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = 1; + element.wnz = -1; + ref_dir = 22; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 23 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(1, 1, -1); + int32_t ref_dir = 22; // dir: 23 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = -1; + element.wnz = 1; + ref_dir = 23; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 24 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(-1, 1, -1); + int32_t ref_dir = 21; // dir: 24 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = -1; + element.wnz = 1; + ref_dir = 24; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 25 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(1, -1, -1); + int32_t ref_dir = 20; // dir: 25 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = 1; + element.wnz = 1; + ref_dir = 25; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 26 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(-1, -1, -1); + int32_t ref_dir = 19; // dir: 26 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = 1; + element.wnz = 1; + ref_dir = 26; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + + + + indexVectors->syncGPU(); + } + +private: + void run_impl(IBlock * block, IndexVectors::Type type); + + BlockDataID indexVectorID; + +public: + BlockDataID pdfsID; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/boundary/NoSlipD3Q19.cpp b/src/lbm_generated/boundary/NoSlipD3Q19.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..268cbf43361645c8e7886f6abd86a56089a75fff --- /dev/null +++ b/src/lbm_generated/boundary/NoSlipD3Q19.cpp @@ -0,0 +1,125 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file NoSlipD3Q19.cpp +//! 
\\author pystencils +//====================================================================================================================== + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "NoSlipD3Q19.h" + + + +#define FUNC_PREFIX + +using namespace std; + +namespace walberla { +namespace lbm { + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#ifdef __CUDACC__ +#pragma push +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diag_suppress 177 +#else +#pragma diag_suppress 177 +#endif +#endif + +namespace internal_noslipd3q19_even { +static FUNC_PREFIX void noslipd3q19_even(const uint8_t * RESTRICT const _data_indexVector, double * RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int32_t indexVectorSize) +{ + + const int32_t f_in_inv_dir_idx [] = { 0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13 }; + const int32_t f_in_inv_offsets_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1 }; + const int32_t f_in_inv_offsets_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0 }; + const int32_t f_in_inv_offsets_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1 }; + + for (int64_t ctr_0 = 0; ctr_0 < indexVectorSize; ctr_0 += 1) + { + const int32_t x = *((int32_t * )(& _data_indexVector[16*ctr_0])); + const int32_t y = *((int32_t * )(& _data_indexVector[16*ctr_0 + 4])); + const int32_t z = *((int32_t * )(& _data_indexVector[16*ctr_0 + 8])); + const int32_t dir = *((int32_t * )(& _data_indexVector[16*ctr_0 + 12])); + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*f_in_inv_offsets_x[dir] + _stride_pdfs_1*y + _stride_pdfs_1*f_in_inv_offsets_y[dir] + _stride_pdfs_2*z + _stride_pdfs_2*f_in_inv_offsets_z[dir] + _stride_pdfs_3*f_in_inv_dir_idx[dir]] = _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 
_stride_pdfs_3*dir]; + } +} +} + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +#ifdef __CUDACC__ +#pragma pop +#endif + + +void NoSlipD3Q19::run_impl(IBlock * block, IndexVectors::Type type) +{ + auto * indexVectors = block->getData<IndexVectors>(indexVectorID); + int32_t indexVectorSize = int32_c( indexVectors->indexVector(type).size() ); + if( indexVectorSize == 0) + return; + + + auto pointer = indexVectors->pointerCpu(type); + + + uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + + uint8_t timestep = pdfs->getTimestep(); + + WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + if(((timestep & 1) ^ 1)) { + internal_noslipd3q19_even::noslipd3q19_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); + } else { + internal_noslipd3q19_even::noslipd3q19_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); + } +} + +void NoSlipD3Q19::run(IBlock * block) +{ + run_impl(block, IndexVectors::ALL); +} + +void NoSlipD3Q19::inner(IBlock * block) +{ + run_impl(block, IndexVectors::INNER); +} + +void NoSlipD3Q19::outer(IBlock * block) +{ + run_impl(block, IndexVectors::OUTER); +} + +} // namespace lbm +} // namespace walberla + diff --git a/src/lbm_generated/boundary/NoSlipD3Q19.h b/src/lbm_generated/boundary/NoSlipD3Q19.h new file mode 100644 index 0000000000000000000000000000000000000000..933108eec5fdcdeee8e0af6abb90617fc149307e --- /dev/null +++ b/src/lbm_generated/boundary/NoSlipD3Q19.h @@ -0,0 +1,508 @@ 
+//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file NoSlipD3Q19.h +//! \\author pystencils +//====================================================================================================================== + +#pragma once +#include "core/DataTypes.h" + +#include "field/GhostLayerField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" +#include "core/debug/Debug.h" + +#include <set> +#include <vector> + + + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class NoSlipD3Q19 +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir; + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + IndexVectors() = 
default; + bool operator==(IndexVectors const &other) const { return other.cpuVectors_ == cpuVectors_; } + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return cpuVectors_[t].data(); } + + void syncGPU() + { + + } + + private: + std::vector<CpuIndexVector> cpuVectors_{NUM_TYPES}; + + + }; + + NoSlipD3Q19( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_) + : pdfsID(pdfsID_) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_NoSlipD3Q19"); + }; + + void run (IBlock * block); + + void operator() (IBlock * block) + { + run(block); + } + + void inner (IBlock * block); + + void outer (IBlock * block); + + std::function<void (IBlock *)> getSweep() + { + return [this] + (IBlock * b) + { this->run(b); }; + } + + std::function<void (IBlock *)> getInnerSweep() + { + return [this] + (IBlock * b) + { this->inner(b); }; + } + + std::function<void (IBlock *)> getOuterSweep() + { + return [this] + (IBlock * b) + { this->outer(b); }; + } + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>(&*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField(IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = 
indexVectors->indexVector(IndexVectors::OUTER); + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + + + if( !(flagField->flagExists(boundaryFlagUID) && flagField->flagExists(domainFlagUID) )) + return; + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = flagField->getFlag(domainFlagUID); + + auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 0 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 1 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 2 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 3 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 5 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 6 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 7 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 9 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 11 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 12 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 13 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 15 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 16 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 17 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + + + + indexVectors->syncGPU(); + } + +private: + void run_impl(IBlock * block, IndexVectors::Type type); + + BlockDataID indexVectorID; + +public: + BlockDataID pdfsID; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/boundary/NoSlipD3Q27.cpp b/src/lbm_generated/boundary/NoSlipD3Q27.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c38bee8122daa4ee1d09b1b861e5729d232bf310 --- /dev/null +++ b/src/lbm_generated/boundary/NoSlipD3Q27.cpp @@ -0,0 +1,126 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file NoSlipD3Q27.cpp +//! \\author pystencils +//====================================================================================================================== + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "NoSlipD3Q27.h" + + + +#define FUNC_PREFIX + +using namespace std; + +namespace walberla { +namespace lbm { + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#ifdef __CUDACC__ +#pragma push +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diag_suppress 177 +#else +#pragma diag_suppress 177 +#endif +#endif + +namespace internal_noslipd3q27_even { +static FUNC_PREFIX void noslipd3q27_even(const uint8_t * RESTRICT const _data_indexVector, double * RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int32_t indexVectorSize) +{ + + const int32_t f_in_inv_dir_idx [] = { 0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13,26,25,24,23,22,21,20,19 }; + const int32_t f_in_inv_offsets_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1,1,-1,1,-1,1,-1,1,-1 }; + const int32_t f_in_inv_offsets_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0,1,1,-1,-1,1,1,-1,-1 }; + const int32_t f_in_inv_offsets_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1,1,1,1,1,-1,-1,-1,-1 }; + + for (int64_t ctr_0 = 0; ctr_0 < indexVectorSize; ctr_0 += 1) + { + const int32_t x = *((int32_t * )(& 
_data_indexVector[16*ctr_0])); + const int32_t y = *((int32_t * )(& _data_indexVector[16*ctr_0 + 4])); + const int32_t z = *((int32_t * )(& _data_indexVector[16*ctr_0 + 8])); + const int32_t dir = *((int32_t * )(& _data_indexVector[16*ctr_0 + 12])); + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*f_in_inv_offsets_x[dir] + _stride_pdfs_1*y + _stride_pdfs_1*f_in_inv_offsets_y[dir] + _stride_pdfs_2*z + _stride_pdfs_2*f_in_inv_offsets_z[dir] + _stride_pdfs_3*f_in_inv_dir_idx[dir]] = _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3*dir]; + } +} +} + + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +#ifdef __CUDACC__ +#pragma pop +#endif + + +void NoSlipD3Q27::run_impl(IBlock * block, IndexVectors::Type type) +{ + auto * indexVectors = block->getData<IndexVectors>(indexVectorID); + int32_t indexVectorSize = int32_c( indexVectors->indexVector(type).size() ); + if( indexVectorSize == 0) + return; + + + auto pointer = indexVectors->pointerCpu(type); + + + uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + + uint8_t timestep = pdfs->getTimestep(); + + WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + if(((timestep & 1) ^ 1)) { + internal_noslipd3q27_even::noslipd3q27_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); + } else { + internal_noslipd3q27_even::noslipd3q27_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); + } +} + +void NoSlipD3Q27::run(IBlock * block) +{ + run_impl(block, 
IndexVectors::ALL); +} + +void NoSlipD3Q27::inner(IBlock * block) +{ + run_impl(block, IndexVectors::INNER); +} + +void NoSlipD3Q27::outer(IBlock * block) +{ + run_impl(block, IndexVectors::OUTER); +} + +} // namespace lbm +} // namespace walberla + diff --git a/src/lbm_generated/boundary/NoSlipD3Q27.h b/src/lbm_generated/boundary/NoSlipD3Q27.h new file mode 100644 index 0000000000000000000000000000000000000000..56bbfb0611d6a506b3ed4558c388b3d9ed65d443 --- /dev/null +++ b/src/lbm_generated/boundary/NoSlipD3Q27.h @@ -0,0 +1,644 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file NoSlipD3Q27.h +//! 
\\author pystencils +//====================================================================================================================== + +#pragma once +#include "core/DataTypes.h" + +#include "field/GhostLayerField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" +#include "core/debug/Debug.h" + +#include <set> +#include <vector> + + + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class NoSlipD3Q27 +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir; + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + IndexVectors() = default; + bool operator==(IndexVectors const &other) const { return other.cpuVectors_ == cpuVectors_; } + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return cpuVectors_[t].data(); } + + void syncGPU() + { + + } + + private: + std::vector<CpuIndexVector> cpuVectors_{NUM_TYPES}; + + + }; + + NoSlipD3Q27( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_) + : pdfsID(pdfsID_) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_NoSlipD3Q27"); + }; + + void run (IBlock * block); + + void operator() (IBlock * block) + { + run(block); + } + + void inner (IBlock * block); + + void outer (IBlock * block); + + 
std::function<void (IBlock *)> getSweep() + { + return [this] + (IBlock * b) + { this->run(b); }; + } + + std::function<void (IBlock *)> getInnerSweep() + { + return [this] + (IBlock * b) + { this->inner(b); }; + } + + std::function<void (IBlock *)> getOuterSweep() + { + return [this] + (IBlock * b) + { this->outer(b); }; + } + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>(&*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField(IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER); + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + + + if( !(flagField->flagExists(boundaryFlagUID) && flagField->flagExists(domainFlagUID) )) + return; + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = flagField->getFlag(domainFlagUID); + + auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 0 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 1 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 2 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 3 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 5 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 6 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 7 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 9 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 11 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 12 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 13 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 15 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 16 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 17 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 19 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 20 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 21 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 22 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 23 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 24 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 25 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 26 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + + + + indexVectors->syncGPU(); + } + +private: + void run_impl(IBlock * block, IndexVectors::Type type); + + BlockDataID indexVectorID; + +public: + BlockDataID pdfsID; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/boundary/OutflowD3Q19.cpp b/src/lbm_generated/boundary/OutflowD3Q19.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d42cf90429601d5ed4809c30b8926548d8bf6618 --- /dev/null +++ b/src/lbm_generated/boundary/OutflowD3Q19.cpp @@ -0,0 +1,136 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file OutflowD3Q19.cpp +//! 
\\author pystencils +//====================================================================================================================== + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "OutflowD3Q19.h" + + + +#define FUNC_PREFIX + +using namespace std; + +namespace walberla { +namespace lbm { + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#ifdef __CUDACC__ +#pragma push +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diag_suppress 177 +#else +#pragma diag_suppress 177 +#endif +#endif + +namespace internal_outflowd3q19_even { +static FUNC_PREFIX void outflowd3q19_even(const uint8_t * RESTRICT _data_indexVector, double * RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int32_t indexVectorSize) +{ + + const int32_t f_out_inv_dir_idx [] = { 0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13 }; + const int32_t f_in_inv_dir_idx [] = { 0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13 }; + const int32_t f_in_inv_offsets_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1 }; + const int32_t f_in_inv_offsets_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0 }; + const int32_t f_in_inv_offsets_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1 }; + + + + const int32_t neighbour_offset_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1 }; + const int32_t neighbour_offset_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0 }; + const int32_t neighbour_offset_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1 }; + + for (int64_t ctr_0 = 0; ctr_0 < indexVectorSize; ctr_0 += 1) + { + const int32_t x = *((int32_t * )(& _data_indexVector[32*ctr_0])); + const int32_t y = *((int32_t * )(& _data_indexVector[32*ctr_0 + 4])); + const int32_t z = *((int32_t * )(& _data_indexVector[32*ctr_0 + 8])); + const int32_t 
dir = *((int32_t * )(& _data_indexVector[32*ctr_0 + 12])); + const double pdf_inter = 0.42264973081037427**((double * )(& _data_indexVector[32*ctr_0 + 24])) + 0.57735026918962573**((double * )(& _data_indexVector[32*ctr_0 + 16])); + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*f_in_inv_offsets_x[dir] + _stride_pdfs_1*y + _stride_pdfs_1*f_in_inv_offsets_y[dir] + _stride_pdfs_2*z + _stride_pdfs_2*f_in_inv_offsets_z[dir] + _stride_pdfs_3*f_in_inv_dir_idx[dir]] = pdf_inter; + *((double * )(& _data_indexVector[32*ctr_0 + 16])) = _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*(neighbour_offset_x[dir] - 1) + _stride_pdfs_1*y + _stride_pdfs_1*neighbour_offset_y[dir] + _stride_pdfs_2*z + _stride_pdfs_2*neighbour_offset_z[dir] + _stride_pdfs_3*f_out_inv_dir_idx[dir]]; + *((double * )(& _data_indexVector[32*ctr_0 + 24])) = pdf_inter; + } +} +} + + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +#ifdef __CUDACC__ +#pragma pop +#endif + + +void OutflowD3Q19::run_impl(IBlock * block, IndexVectors::Type type) +{ + auto * indexVectors = block->getData<IndexVectors>(indexVectorID); + int32_t indexVectorSize = int32_c( indexVectors->indexVector(type).size() ); + if( indexVectorSize == 0) + return; + + + auto pointer = indexVectors->pointerCpu(type); + + + uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + + uint8_t timestep = pdfs->getTimestep(); + + WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + if(((timestep & 1) ^ 1)) { + internal_outflowd3q19_even::outflowd3q19_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, 
_stride_pdfs_3, indexVectorSize); + } else { + internal_outflowd3q19_even::outflowd3q19_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); + } +} + +void OutflowD3Q19::run(IBlock * block) +{ + run_impl(block, IndexVectors::ALL); +} + +void OutflowD3Q19::inner(IBlock * block) +{ + run_impl(block, IndexVectors::INNER); +} + +void OutflowD3Q19::outer(IBlock * block) +{ + run_impl(block, IndexVectors::OUTER); +} + +} // namespace lbm +} // namespace walberla + diff --git a/src/lbm_generated/boundary/OutflowD3Q19.h b/src/lbm_generated/boundary/OutflowD3Q19.h new file mode 100644 index 0000000000000000000000000000000000000000..bb2999966556997e70c9f469e65062951276a601 --- /dev/null +++ b/src/lbm_generated/boundary/OutflowD3Q19.h @@ -0,0 +1,277 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file OutflowD3Q19.h +//! 
\\author pystencils +//====================================================================================================================== + +#pragma once +#include "core/DataTypes.h" + +#include "field/GhostLayerField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" +#include "core/debug/Debug.h" + +#include <set> +#include <vector> + + + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class OutflowD3Q19 +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + double pdf; + double pdf_nd; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_), pdf(), pdf_nd() {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir && floatIsEqual(pdf, o.pdf) && floatIsEqual(pdf_nd, o.pdf_nd); + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + IndexVectors() = default; + bool operator==(IndexVectors const &other) const { return other.cpuVectors_ == cpuVectors_; } + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return cpuVectors_[t].data(); } + + void syncGPU() + { + + } + + private: + std::vector<CpuIndexVector> cpuVectors_{NUM_TYPES}; + + + }; + + OutflowD3Q19( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_) + : pdfsID(pdfsID_) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_OutflowD3Q19"); + }; + + void run (IBlock * block); + + void operator() (IBlock * 
block) + { + run(block); + } + + void inner (IBlock * block); + + void outer (IBlock * block); + + std::function<void (IBlock *)> getSweep() + { + return [this] + (IBlock * b) + { this->run(b); }; + } + + std::function<void (IBlock *)> getInnerSweep() + { + return [this] + (IBlock * b) + { this->inner(b); }; + } + + std::function<void (IBlock *)> getOuterSweep() + { + return [this] + (IBlock * b) + { this->outer(b); }; + } + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>(&*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField(IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER); + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + auto pdfs = block->getData< field::GhostLayerField<real_t, 19> >(pdfsID); + + if( !(flagField->flagExists(boundaryFlagUID) && flagField->flagExists(domainFlagUID) )) + return; + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = flagField->getFlag(domainFlagUID); + + auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(0), 3); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(0), 3); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(1), it.z() + cell_idx_c(0), 9); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(1), it.z() + cell_idx_c(0), 9); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(0), 7); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(0), 7); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(1), 17); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(1), 17); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(-1), 13); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(-1), 13); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + + + + indexVectors->syncGPU(); + } + +private: + void run_impl(IBlock * block, IndexVectors::Type type); + + BlockDataID indexVectorID; + BlockDataID pdfsCPUID; +public: + BlockDataID pdfsID; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/boundary/OutflowD3Q27.cpp b/src/lbm_generated/boundary/OutflowD3Q27.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8ec9a490b443740ff1ae24adfa1a1739261311a2 --- /dev/null +++ b/src/lbm_generated/boundary/OutflowD3Q27.cpp @@ -0,0 +1,136 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file OutflowD3Q27.cpp +//! 
\\author pystencils +//====================================================================================================================== + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "OutflowD3Q27.h" + + + +#define FUNC_PREFIX + +using namespace std; + +namespace walberla { +namespace lbm { + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#ifdef __CUDACC__ +#pragma push +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diag_suppress 177 +#else +#pragma diag_suppress 177 +#endif +#endif + +namespace internal_outflowd3q27_even { +static FUNC_PREFIX void outflowd3q27_even(const uint8_t * RESTRICT _data_indexVector, double * RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int32_t indexVectorSize) +{ + + const int32_t f_out_inv_dir_idx [] = { 0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13,26,25,24,23,22,21,20,19 }; + const int32_t f_in_inv_dir_idx [] = { 0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13,26,25,24,23,22,21,20,19 }; + const int32_t f_in_inv_offsets_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1,1,-1,1,-1,1,-1,1,-1 }; + const int32_t f_in_inv_offsets_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0,1,1,-1,-1,1,1,-1,-1 }; + const int32_t f_in_inv_offsets_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1,1,1,1,1,-1,-1,-1,-1 }; + + + + const int32_t neighbour_offset_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1,1,-1,1,-1,1,-1,1,-1 }; + const int32_t neighbour_offset_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0,1,1,-1,-1,1,1,-1,-1 }; + const int32_t neighbour_offset_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1,1,1,1,1,-1,-1,-1,-1 }; + + for (int64_t ctr_0 = 0; ctr_0 < indexVectorSize; ctr_0 += 1) + { + const int32_t x = *((int32_t * )(& 
_data_indexVector[32*ctr_0])); + const int32_t y = *((int32_t * )(& _data_indexVector[32*ctr_0 + 4])); + const int32_t z = *((int32_t * )(& _data_indexVector[32*ctr_0 + 8])); + const int32_t dir = *((int32_t * )(& _data_indexVector[32*ctr_0 + 12])); + const double pdf_inter = 0.42264973081037427**((double * )(& _data_indexVector[32*ctr_0 + 24])) + 0.57735026918962573**((double * )(& _data_indexVector[32*ctr_0 + 16])); + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*f_in_inv_offsets_x[dir] + _stride_pdfs_1*y + _stride_pdfs_1*f_in_inv_offsets_y[dir] + _stride_pdfs_2*z + _stride_pdfs_2*f_in_inv_offsets_z[dir] + _stride_pdfs_3*f_in_inv_dir_idx[dir]] = pdf_inter; + *((double * )(& _data_indexVector[32*ctr_0 + 16])) = _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*(neighbour_offset_x[dir] - 1) + _stride_pdfs_1*y + _stride_pdfs_1*neighbour_offset_y[dir] + _stride_pdfs_2*z + _stride_pdfs_2*neighbour_offset_z[dir] + _stride_pdfs_3*f_out_inv_dir_idx[dir]]; + *((double * )(& _data_indexVector[32*ctr_0 + 24])) = pdf_inter; + } +} +} + + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +#ifdef __CUDACC__ +#pragma pop +#endif + + +void OutflowD3Q27::run_impl(IBlock * block, IndexVectors::Type type) +{ + auto * indexVectors = block->getData<IndexVectors>(indexVectorID); + int32_t indexVectorSize = int32_c( indexVectors->indexVector(type).size() ); + if( indexVectorSize == 0) + return; + + + auto pointer = indexVectors->pointerCpu(type); + + + uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + + uint8_t timestep = pdfs->getTimestep(); + + WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * 
int64_t(pdfs->fStride())); + if(((timestep & 1) ^ 1)) { + internal_outflowd3q27_even::outflowd3q27_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); + } else { + internal_outflowd3q27_even::outflowd3q27_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); + } +} + +void OutflowD3Q27::run(IBlock * block) +{ + run_impl(block, IndexVectors::ALL); +} + +void OutflowD3Q27::inner(IBlock * block) +{ + run_impl(block, IndexVectors::INNER); +} + +void OutflowD3Q27::outer(IBlock * block) +{ + run_impl(block, IndexVectors::OUTER); +} + +} // namespace lbm +} // namespace walberla + diff --git a/src/lbm_generated/boundary/OutflowD3Q27.h b/src/lbm_generated/boundary/OutflowD3Q27.h new file mode 100644 index 0000000000000000000000000000000000000000..53b4e4bae5e6c6da6b4b108751120bf90a5ab25b --- /dev/null +++ b/src/lbm_generated/boundary/OutflowD3Q27.h @@ -0,0 +1,349 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file OutflowD3Q27.h +//! 
\\author pystencils +//====================================================================================================================== + +#pragma once +#include "core/DataTypes.h" + +#include "field/GhostLayerField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" +#include "core/debug/Debug.h" + +#include <set> +#include <vector> + + + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class OutflowD3Q27 +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + double pdf; + double pdf_nd; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_), pdf(), pdf_nd() {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir && floatIsEqual(pdf, o.pdf) && floatIsEqual(pdf_nd, o.pdf_nd); + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + IndexVectors() = default; + bool operator==(IndexVectors const &other) const { return other.cpuVectors_ == cpuVectors_; } + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return cpuVectors_[t].data(); } + + void syncGPU() + { + + } + + private: + std::vector<CpuIndexVector> cpuVectors_{NUM_TYPES}; + + + }; + + OutflowD3Q27( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_) + : pdfsID(pdfsID_) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_OutflowD3Q27"); + }; + + void run (IBlock * block); + + void operator() (IBlock * 
block) + { + run(block); + } + + void inner (IBlock * block); + + void outer (IBlock * block); + + std::function<void (IBlock *)> getSweep() + { + return [this] + (IBlock * b) + { this->run(b); }; + } + + std::function<void (IBlock *)> getInnerSweep() + { + return [this] + (IBlock * b) + { this->inner(b); }; + } + + std::function<void (IBlock *)> getOuterSweep() + { + return [this] + (IBlock * b) + { this->outer(b); }; + } + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>(&*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField(IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER); + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + auto pdfs = block->getData< field::GhostLayerField<real_t, 27> >(pdfsID); + + if( !(flagField->flagExists(boundaryFlagUID) && flagField->flagExists(domainFlagUID) )) + return; + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = flagField->getFlag(domainFlagUID); + + auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(0), 3); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(0), 3); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(1), it.z() + cell_idx_c(0), 9); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(1), it.z() + cell_idx_c(0), 9); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(0), 7); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(0), 7); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(1), 17); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(1), 17); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(-1), 13); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(-1), 13); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 19 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(1), it.z() + cell_idx_c(1), 26); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(1), it.z() + cell_idx_c(1), 26); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 21 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(1), 24); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(1), 24); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 23 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(1), it.z() + cell_idx_c(-1), 22); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(1), it.z() + cell_idx_c(-1), 22); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 25 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(-1), 20); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(-1), 20); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + + + + indexVectors->syncGPU(); + } + +private: + void run_impl(IBlock * block, IndexVectors::Type type); + + BlockDataID indexVectorID; + BlockDataID pdfsCPUID; +public: + BlockDataID pdfsID; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/boundary/UBBD3Q19.cpp b/src/lbm_generated/boundary/UBBD3Q19.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0a88d2feeff0237881df80f6494a4f58f8936e02 --- /dev/null +++ b/src/lbm_generated/boundary/UBBD3Q19.cpp @@ -0,0 +1,136 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file UBBD3Q19.cpp +//! 
\\author pystencils +//====================================================================================================================== + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "UBBD3Q19.h" + + + +#define FUNC_PREFIX + +using namespace std; + +namespace walberla { +namespace lbm { + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#ifdef __CUDACC__ +#pragma push +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diag_suppress 177 +#else +#pragma diag_suppress 177 +#endif +#endif + +namespace internal_ubbd3q19_even { +static FUNC_PREFIX void ubbd3q19_even(const uint8_t * RESTRICT const _data_indexVector, double * RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int32_t indexVectorSize, double u_x, double u_y, double u_z) +{ + + const int32_t f_in_inv_dir_idx [] = { 0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13 }; + const int32_t f_in_inv_offsets_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1 }; + const int32_t f_in_inv_offsets_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0 }; + const int32_t f_in_inv_offsets_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1 }; + + + const double weights [] = {0.33333333333333333, 0.055555555555555556, 0.055555555555555556, 0.055555555555555556, 0.055555555555555556, 0.055555555555555556, 0.055555555555555556, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778}; + + + + const int32_t neighbour_offset_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1 }; + const int32_t neighbour_offset_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0 
}; + const int32_t neighbour_offset_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1 }; + + for (int64_t ctr_0 = 0; ctr_0 < indexVectorSize; ctr_0 += 1) + { + const int32_t x = *((int32_t * )(& _data_indexVector[16*ctr_0])); + const int32_t y = *((int32_t * )(& _data_indexVector[16*ctr_0 + 4])); + const int32_t z = *((int32_t * )(& _data_indexVector[16*ctr_0 + 8])); + const int32_t dir = *((int32_t * )(& _data_indexVector[16*ctr_0 + 12])); + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*f_in_inv_offsets_x[dir] + _stride_pdfs_1*y + _stride_pdfs_1*f_in_inv_offsets_y[dir] + _stride_pdfs_2*z + _stride_pdfs_2*f_in_inv_offsets_z[dir] + _stride_pdfs_3*f_in_inv_dir_idx[dir]] = (u_x*6.0*((double)(neighbour_offset_x[dir])) + u_y*6.0*((double)(neighbour_offset_y[dir])) + u_z*6.0*((double)(neighbour_offset_z[dir])))*-1.0*weights[dir] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3*dir]; + } +} +} + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +#ifdef __CUDACC__ +#pragma pop +#endif + + +void UBBD3Q19::run_impl(IBlock * block, IndexVectors::Type type) +{ + auto * indexVectors = block->getData<IndexVectors>(indexVectorID); + int32_t indexVectorSize = int32_c( indexVectors->indexVector(type).size() ); + if( indexVectorSize == 0) + return; + + + auto pointer = indexVectors->pointerCpu(type); + + + uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + + uint8_t timestep = pdfs->getTimestep(); + auto & u_y = u_y_; + auto & u_x = u_x_; + auto & u_z = u_z_; + WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + if(((timestep & 
1) ^ 1)) { + internal_ubbd3q19_even::ubbd3q19_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize, u_x, u_y, u_z); + } else { + internal_ubbd3q19_even::ubbd3q19_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize, u_x, u_y, u_z); + } +} + +void UBBD3Q19::run(IBlock * block) +{ + run_impl(block, IndexVectors::ALL); +} + +void UBBD3Q19::inner(IBlock * block) +{ + run_impl(block, IndexVectors::INNER); +} + +void UBBD3Q19::outer(IBlock * block) +{ + run_impl(block, IndexVectors::OUTER); +} + +} // namespace lbm +} // namespace walberla + diff --git a/src/lbm_generated/boundary/UBBD3Q19.h b/src/lbm_generated/boundary/UBBD3Q19.h new file mode 100644 index 0000000000000000000000000000000000000000..f57bac12d404b9b3d8819d7955dc65c3cdbcab61 --- /dev/null +++ b/src/lbm_generated/boundary/UBBD3Q19.h @@ -0,0 +1,511 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file UBBD3Q19.h +//! 
\\author pystencils +//====================================================================================================================== + +#pragma once +#include "core/DataTypes.h" + +#include "field/GhostLayerField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" +#include "core/debug/Debug.h" + +#include <set> +#include <vector> + + + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class UBBD3Q19 +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir; + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + IndexVectors() = default; + bool operator==(IndexVectors const &other) const { return other.cpuVectors_ == cpuVectors_; } + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return cpuVectors_[t].data(); } + + void syncGPU() + { + + } + + private: + std::vector<CpuIndexVector> cpuVectors_{NUM_TYPES}; + + + }; + + UBBD3Q19( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_, double u_x, double u_y, double u_z) + : pdfsID(pdfsID_), u_x_(u_x), u_y_(u_y), u_z_(u_z) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_UBBD3Q19"); + }; + + void run (IBlock * block); + + void operator() (IBlock * block) + { + run(block); + } + + void inner (IBlock 
* block); + + void outer (IBlock * block); + + std::function<void (IBlock *)> getSweep() + { + return [this] + (IBlock * b) + { this->run(b); }; + } + + std::function<void (IBlock *)> getInnerSweep() + { + return [this] + (IBlock * b) + { this->inner(b); }; + } + + std::function<void (IBlock *)> getOuterSweep() + { + return [this] + (IBlock * b) + { this->outer(b); }; + } + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>(&*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField(IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER); + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + + + if( !(flagField->flagExists(boundaryFlagUID) && flagField->flagExists(domainFlagUID) )) + return; + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = flagField->getFlag(domainFlagUID); + + auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 0 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 1 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 2 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 3 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 5 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 6 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 7 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 9 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 11 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 12 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 13 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 15 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 16 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 17 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + + + + indexVectors->syncGPU(); + } + +private: + void run_impl(IBlock * block, IndexVectors::Type type); + + BlockDataID indexVectorID; + +public: + BlockDataID pdfsID; + double u_x_; + double u_y_; + double u_z_; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/boundary/UBBD3Q27.cpp b/src/lbm_generated/boundary/UBBD3Q27.cpp new file mode 100644 index 0000000000000000000000000000000000000000..08ee3ef38ef4460b590216b789caea5457da8b97 --- /dev/null +++ b/src/lbm_generated/boundary/UBBD3Q27.cpp @@ -0,0 +1,137 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file UBBD3Q27.cpp +//! 
\\author pystencils +//====================================================================================================================== + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "UBBD3Q27.h" + + + +#define FUNC_PREFIX + +using namespace std; + +namespace walberla { +namespace lbm { + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#ifdef __CUDACC__ +#pragma push +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diag_suppress 177 +#else +#pragma diag_suppress 177 +#endif +#endif + +namespace internal_ubbd3q27_even { +static FUNC_PREFIX void ubbd3q27_even(const uint8_t * RESTRICT const _data_indexVector, double * RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int32_t indexVectorSize, double u_x, double u_y, double u_z) +{ + + const int32_t f_in_inv_dir_idx [] = { 0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13,26,25,24,23,22,21,20,19 }; + const int32_t f_in_inv_offsets_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1,1,-1,1,-1,1,-1,1,-1 }; + const int32_t f_in_inv_offsets_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0,1,1,-1,-1,1,1,-1,-1 }; + const int32_t f_in_inv_offsets_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1,1,1,1,1,-1,-1,-1,-1 }; + + + const double weights [] = {0.29629629629629630, 0.074074074074074074, 0.074074074074074074, 0.074074074074074074, 0.074074074074074074, 0.074074074074074074, 0.074074074074074074, 0.018518518518518519, 0.018518518518518519, 0.018518518518518519, 0.018518518518518519, 0.018518518518518519, 0.018518518518518519, 0.018518518518518519, 0.018518518518518519, 0.018518518518518519, 0.018518518518518519, 0.018518518518518519, 0.018518518518518519, 0.0046296296296296296, 0.0046296296296296296, 0.0046296296296296296, 0.0046296296296296296, 
0.0046296296296296296, 0.0046296296296296296, 0.0046296296296296296, 0.0046296296296296296}; + + + + const int32_t neighbour_offset_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1,1,-1,1,-1,1,-1,1,-1 }; + const int32_t neighbour_offset_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0,1,1,-1,-1,1,1,-1,-1 }; + const int32_t neighbour_offset_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1,1,1,1,1,-1,-1,-1,-1 }; + + for (int64_t ctr_0 = 0; ctr_0 < indexVectorSize; ctr_0 += 1) + { + const int32_t x = *((int32_t * )(& _data_indexVector[16*ctr_0])); + const int32_t y = *((int32_t * )(& _data_indexVector[16*ctr_0 + 4])); + const int32_t z = *((int32_t * )(& _data_indexVector[16*ctr_0 + 8])); + const int32_t dir = *((int32_t * )(& _data_indexVector[16*ctr_0 + 12])); + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*f_in_inv_offsets_x[dir] + _stride_pdfs_1*y + _stride_pdfs_1*f_in_inv_offsets_y[dir] + _stride_pdfs_2*z + _stride_pdfs_2*f_in_inv_offsets_z[dir] + _stride_pdfs_3*f_in_inv_dir_idx[dir]] = (u_x*6.0*((double)(neighbour_offset_x[dir])) + u_y*6.0*((double)(neighbour_offset_y[dir])) + u_z*6.0*((double)(neighbour_offset_z[dir])))*-1.0*weights[dir] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3*dir]; + } +} +} + + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +#ifdef __CUDACC__ +#pragma pop +#endif + + +void UBBD3Q27::run_impl(IBlock * block, IndexVectors::Type type) +{ + auto * indexVectors = block->getData<IndexVectors>(indexVectorID); + int32_t indexVectorSize = int32_c( indexVectors->indexVector(type).size() ); + if( indexVectorSize == 0) + return; + + + auto pointer = indexVectors->pointerCpu(type); + + + uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + + uint8_t timestep = pdfs->getTimestep(); + auto & u_y = u_y_; + auto & u_x = u_x_; + auto & u_z = u_z_; + WALBERLA_ASSERT_GREATER_EQUAL(0, 
-int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + if(((timestep & 1) ^ 1)) { + internal_ubbd3q27_even::ubbd3q27_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize, u_x, u_y, u_z); + } else { + internal_ubbd3q27_even::ubbd3q27_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize, u_x, u_y, u_z); + } +} + +void UBBD3Q27::run(IBlock * block) +{ + run_impl(block, IndexVectors::ALL); +} + +void UBBD3Q27::inner(IBlock * block) +{ + run_impl(block, IndexVectors::INNER); +} + +void UBBD3Q27::outer(IBlock * block) +{ + run_impl(block, IndexVectors::OUTER); +} + +} // namespace lbm +} // namespace walberla + diff --git a/src/lbm_generated/boundary/UBBD3Q27.h b/src/lbm_generated/boundary/UBBD3Q27.h new file mode 100644 index 0000000000000000000000000000000000000000..b7836d6958677e9b221f74f37b014b3de35019c7 --- /dev/null +++ b/src/lbm_generated/boundary/UBBD3Q27.h @@ -0,0 +1,647 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. 
+// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file UBBD3Q27.h +//! \\author pystencils +//====================================================================================================================== + +#pragma once +#include "core/DataTypes.h" + +#include "field/GhostLayerField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" +#include "core/debug/Debug.h" + +#include <set> +#include <vector> + + + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class UBBD3Q27 +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir; + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + IndexVectors() = default; + bool operator==(IndexVectors const &other) const { return other.cpuVectors_ == cpuVectors_; } + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return cpuVectors_[t].data(); } + + void syncGPU() + { + + } + + private: + std::vector<CpuIndexVector> cpuVectors_{NUM_TYPES}; + + + }; + + UBBD3Q27( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_, double u_x, double u_y, double u_z) + : pdfsID(pdfsID_), u_x_(u_x), u_y_(u_y), u_z_(u_z) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = 
blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_UBBD3Q27"); + }; + + void run (IBlock * block); + + void operator() (IBlock * block) + { + run(block); + } + + void inner (IBlock * block); + + void outer (IBlock * block); + + std::function<void (IBlock *)> getSweep() + { + return [this] + (IBlock * b) + { this->run(b); }; + } + + std::function<void (IBlock *)> getInnerSweep() + { + return [this] + (IBlock * b) + { this->inner(b); }; + } + + std::function<void (IBlock *)> getOuterSweep() + { + return [this] + (IBlock * b) + { this->outer(b); }; + } + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>(&*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField(IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER); + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + + + if( !(flagField->flagExists(boundaryFlagUID) && flagField->flagExists(domainFlagUID) )) + return; + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = flagField->getFlag(domainFlagUID); + + auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + 
if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 0 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 1 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 2 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 3 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 5 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 6 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 7 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 9 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 11 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 12 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 13 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 15 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 16 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 17 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 19 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 20 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 21 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 22 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 23 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 24 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 25 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 26 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + + + + indexVectors->syncGPU(); + } + +private: + void run_impl(IBlock * block, IndexVectors::Type type); + + BlockDataID indexVectorID; + +public: + BlockDataID pdfsID; + double u_x_; + double u_y_; + double u_z_; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/boundary/boundary_generation_script.py b/src/lbm_generated/boundary/boundary_generation_script.py new file mode 100644 index 0000000000000000000000000000000000000000..970daedc56e562b2aba05fa6e9147ad7c889cda0 --- /dev/null +++ b/src/lbm_generated/boundary/boundary_generation_script.py @@ -0,0 +1,55 @@ +import sympy as sp + +from pystencils import Target + +from lbmpy.creationfunctions import create_lb_method +from lbmpy import LBMConfig, Stencil, Method, LBStencil +from lbmpy.boundaries import ExtrapolationOutflow, FixedDensity, FreeSlip, NoSlip, UBB + +from pystencils_walberla import ManualCodeGenerationContext, generate_info_header +from lbmpy_walberla.boundary_collection import generate_boundary_collection +from lbmpy_walberla import lbm_boundary_generator + +with ManualCodeGenerationContext(openmp=False, optimize_for_localhost=False, + mpi=True, double_accuracy=True, cuda=False) as ctx: + + for stencil in [LBStencil(Stencil.D3Q19), LBStencil(Stencil.D3Q27)]: + target = Target.GPU if ctx.cuda else Target.CPU + data_type = "float64" if ctx.double_accuracy else "float32" + + method = Method.SRT + relaxation_rate = sp.symbols("omega") + streaming_pattern = 'pull' + + lbm_config = LBMConfig(stencil=stencil, method=method, relaxation_rate=relaxation_rate, + streaming_pattern=streaming_pattern) 
+ + lb_method = create_lb_method(lbm_config=lbm_config) + + outflow_west_boundary = ExtrapolationOutflow(normal_direction=(1, 0, 0), lb_method=lb_method) + fixed_density_boundary = FixedDensity(density=sp.Symbol("density")) + free_slip_boundary = FreeSlip(stencil) + no_slip_boundary = NoSlip() + ubb_boundary = UBB(sp.symbols("u_x, u_y, u_z"), data_type=data_type) + + outflow = lbm_boundary_generator(class_name=f'Outflow{stencil.name}', flag_uid='Outflow', + boundary_object=outflow_west_boundary) + + fixed_density = lbm_boundary_generator(class_name=f'FixedDensity{stencil.name}', flag_uid='FixedDensity', + boundary_object=fixed_density_boundary) + + free_slip = lbm_boundary_generator(class_name=f'FreeSlip{stencil.name}', flag_uid='FreeSlip', + boundary_object=free_slip_boundary) + + no_slip = lbm_boundary_generator(class_name=f'NoSlip{stencil.name}', flag_uid='NoSlip', + boundary_object=no_slip_boundary) + + ubb = lbm_boundary_generator(class_name=f'UBB{stencil.name}', flag_uid='UBB', + boundary_object=ubb_boundary) + + boundaries = [outflow, fixed_density, free_slip, no_slip, ubb] + generate_boundary_collection(ctx, f'{stencil.name}BoundaryCollection', boundary_generators=boundaries, + lb_method=lb_method, streaming_pattern=streaming_pattern, + target=target) + + ctx.write_all_files() diff --git a/src/lbm_generated/communication/CMakeLists.txt b/src/lbm_generated/communication/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd5516b9e96a757f9d8911269d7f703b13e92105 --- /dev/null +++ b/src/lbm_generated/communication/CMakeLists.txt @@ -0,0 +1,9 @@ +target_sources( lbm_generated + PRIVATE + CombinedInPlacePackInfo.h + NonuniformCommData.h + NonuniformCommData.impl.h + NonuniformGeneratedPdfPackInfo.h + NonuniformGeneratedPdfPackInfo.impl.h + UniformGeneratedPdfPackInfo.h + ) diff --git a/src/lbm_generated/communication/CombinedInPlacePackInfo.h b/src/lbm_generated/communication/CombinedInPlacePackInfo.h new file mode 100644 index 
0000000000000000000000000000000000000000..b5a4c0ba2fd0fc6a9b2816ecbe38cff2af1dd150 --- /dev/null +++ b/src/lbm_generated/communication/CombinedInPlacePackInfo.h @@ -0,0 +1,117 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file CombinedInPlacePackInfo.h +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once +#include "communication/UniformPackInfo.h" + +namespace walberla::lbm_generated { + +template< typename LatticeStorageSpecification_T, typename EvenPackInfo, typename OddPackInfo > +class CombinedInPlaceCpuPackInfo : public ::walberla::communication::UniformPackInfo +{ + public: + template< typename... Args > + CombinedInPlaceCpuPackInfo(std::shared_ptr< LatticeStorageSpecification_T >& storageSecification, Args&&... args) + : storageSecification_(storageSecification), evenPackInfo_(std::forward< Args >(args)...), oddPackInfo_(std::forward< Args >(args)...) 
+ {} + + ~CombinedInPlaceCpuPackInfo() override = default; + bool constantDataExchange() const override { return true; } + bool threadsafeReceiving() const override { return true; } + + void unpackData(IBlock* receiver, stencil::Direction dir, mpi::RecvBuffer& buffer) override + { + if (storageSecification_->isEvenTimeStep()) + { + return evenPackInfo_.unpackData(receiver, dir, buffer); + } + else + { + return oddPackInfo_.unpackData(receiver, dir, buffer); + } + } + + void communicateLocal(const IBlock* sender, IBlock* receiver, stencil::Direction dir) override + { + if (storageSecification_->isEvenTimeStep()) + { + return evenPackInfo_.communicateLocal(sender, receiver, dir); + } + else + { + return oddPackInfo_.communicateLocal(sender, receiver, dir); + } + } + + void packDataImpl(const IBlock* sender, stencil::Direction dir, mpi::SendBuffer& outBuffer) const override + { + if (storageSecification_->isEvenTimeStep()) + { + return evenPackInfo_.packDataImpl(sender, dir, outBuffer); + } + else + { + return oddPackInfo_.packDataImpl(sender, dir, outBuffer); + } + } + + void pack(stencil::Direction dir, unsigned char* buffer, IBlock* block) const + { + if (storageSecification_->isEvenTimeStep()) + { + evenPackInfo_.pack(dir, buffer, block); + } + else + { + oddPackInfo_.pack(dir, buffer, block); + } + } + + void unpack(stencil::Direction dir, unsigned char* buffer, IBlock* block) const + { + if (storageSecification_->isEvenTimeStep()) + { + evenPackInfo_.unpack(dir, buffer, block); + } + else + { + oddPackInfo_.unpack(dir, buffer, block); + } + } + + uint_t size(stencil::Direction dir, IBlock* block) const + { + if (storageSecification_->isEvenTimeStep()) + { + return evenPackInfo_.size(dir, block); + } + else + { + return oddPackInfo_.size(dir, block); + } + } + + private: + const std::shared_ptr< LatticeStorageSpecification_T >& storageSecification_; + EvenPackInfo evenPackInfo_; + OddPackInfo oddPackInfo_; +}; + +} // namespace walberla::lbm_generated diff --git 
a/src/lbm_generated/communication/NonuniformCommData.h b/src/lbm_generated/communication/NonuniformCommData.h new file mode 100644 index 0000000000000000000000000000000000000000..762dde86c5cf2a8336e3791dd3b56274b5f26df3 --- /dev/null +++ b/src/lbm_generated/communication/NonuniformCommData.h @@ -0,0 +1,136 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file NonuniformCommData.h +//! \author Frederik Hennig <frederik.hennig@fau.de> +//! 
\author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "blockforest/StructuredBlockForest.h" +#include "blockforest/BlockDataHandling.h" + +#include "domain_decomposition/IBlock.h" + +#include "field/FlagField.h" + +#include "lbm_generated/field/PdfField.h" + +#include "stencil/Directions.h" + +#define USE_CELL_INTERVALS + +namespace walberla::lbm_generated { + +using PartialCoalescenceMaskField = FlagField< uint32_t >; + +namespace util { + void forEachSubdirection(const Vector3< cell_idx_t > mainDirection, const std::function< void(Vector3< cell_idx_t >) >& func); + bool forEachSubdirectionCancel(const Vector3< cell_idx_t > mainDirection, + const std::function< bool(Vector3< cell_idx_t >) >& func); + void getSubdirections(const Vector3< cell_idx_t > mainDirection, std::vector< Vector3< cell_idx_t > > subdirs); + + template< typename Stencil_T > + void forEachOrthogonalDirection(Vector3<cell_idx_t> d, std::function< void(Vector3< cell_idx_t >) > func); +} // namespace util + +template< typename LatticeStorageSpecification_T > +class NonuniformCommData +{ + private: + void registerFlags(); + void computeBitMask(); + + public: + using Stencil = typename LatticeStorageSpecification_T::Stencil; + using CommunicationStencil = typename LatticeStorageSpecification_T::CommunicationStencil; + +#if defined(USE_CELL_INTERVALS) + NonuniformCommData(IBlock* const block, uint_t xSize, uint_t ySize, uint_t zSize) + : block_(block), maskField_(xSize, ySize, zSize, 2), + interiorInterval(0, 0, 0, cell_idx_c(xSize) - 1, cell_idx_c(ySize) - 1, cell_idx_c(zSize) - 1) + { + registerFlags(); + computeBitMask(); + }; +#else + NonuniformCommData(IBlock* const block, const BlockDataID pdfFieldID, uint_t xSize, uint_t ySize, uint_t zSize) + : block_(block), pdfFieldID_(pdfFieldID), maskField_(xSize, ySize, zSize, 2) + { + registerFlags(); + 
computeBitMask(); + }; +#endif + + bool operator==(const NonuniformCommData& other) { return this == &other; } + bool operator!=(const NonuniformCommData& other) { return this != &other; } + + PartialCoalescenceMaskField& getMaskField() { return maskField_; } + const PartialCoalescenceMaskField& getMaskField() const { return maskField_; } + + private: +#if defined(USE_CELL_INTERVALS) + void prepareIntervals(); + void setFlagOnInterval(const CellInterval & ci, const uint_t fIdx); +#else + void prepareFlags(); + void resetCornerSkippingOriginFlags(); +#endif + + void setupCornerSkippingOrigins(stencil::Direction commDir); + void setupBitMaskSlice(stencil::Direction commDir, stencil::Direction streamDir); + + bool haveSmallestIdInIntersection(Vector3<cell_idx_t> cornerDir); + + const IBlock* const block_; + PartialCoalescenceMaskField maskField_; + +#if defined(USE_CELL_INTERVALS) + const CellInterval interiorInterval; + std::vector< CellInterval > passThroughIntervals_; + std::vector< CellInterval > cornerSkippingOriginIntervals_; +#endif +}; + + +template< typename LatticeStorageSpecification_T > +class NonuniformCommDataHandling + : public blockforest::AlwaysInitializeBlockDataHandling< NonuniformCommData< LatticeStorageSpecification_T > > +{ + public: + using CommmData_T = NonuniformCommData< LatticeStorageSpecification_T >; + + NonuniformCommDataHandling(const weak_ptr< StructuredBlockForest >& blocks) + : blocks_(blocks){}; + + CommmData_T* initialize(IBlock* const block) override + { + WALBERLA_ASSERT_NOT_NULLPTR(block) + auto blocks = blocks_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(blocks) + + return new CommmData_T(block, blocks->getNumberOfXCells(*block), blocks->getNumberOfYCells(*block), + blocks->getNumberOfZCells(*block)); + } + + private: + const weak_ptr< StructuredBlockStorage > blocks_; +}; + +} // walberla::lbm_generated + +#include "lbm_generated/communication/NonuniformCommData.impl.h" diff --git 
a/src/lbm_generated/communication/NonuniformCommData.impl.h b/src/lbm_generated/communication/NonuniformCommData.impl.h new file mode 100644 index 0000000000000000000000000000000000000000..5a4bc3293087a5ed1e0c1aef261381511d908371 --- /dev/null +++ b/src/lbm_generated/communication/NonuniformCommData.impl.h @@ -0,0 +1,400 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file NonuniformCommData.impl.h +//! \author Frederik Hennig <frederik.hennig@fau.de> +//! 
\author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "blockforest/all.h" + +#include "lbm_generated/communication/NonuniformCommData.h" + +#include "stencil/Directions.h" + +#define IDX_FLAG(d) (1 << d) + +#if !defined(USE_CELL_INTERVALS) +#define INTERIOR_FLAG_BIT 29 +#define INTERIOR_FLAG (1 << INTERIOR_FLAG_BIT) + +#define PASS_THROUGH_FLAG_BIT 30 +#define PASS_THROUGH_FLAG (1 << PASS_THROUGH_FLAG_BIT) + +#define CORNER_SKIPPING_ORIGIN_FLAG_BIT 31 +#define CORNER_SKIPPING_ORIGIN_FLAG (1 << CORNER_SKIPPING_ORIGIN_FLAG_BIT) +#endif + +using namespace walberla::lbm_generated::util; + +namespace walberla::lbm_generated { +namespace util { + +/*********************************************************************************************************************** + * Utility Functions for handling directions * + **********************************************************************************************************************/ + +/** + * Iterates all sub-directions of a given direction vector and runs a callback on each of them. + * Subdirections are any nonzero directions obtained by truncating zero or more components of a direction + * vector to zero. The direction vector itself is contained in this set. 
+ * @param mainDirection The direction whose subdirections will be iterated + * @param func The callback that should be run for each subdirection + */ +inline void forEachSubdirection(const Vector3< cell_idx_t > mainDirection, + const std::function< void(Vector3< cell_idx_t >) >& func) +{ + for (cell_idx_t z = std::min(0, mainDirection[2]); z <= std::max(0, mainDirection[2]); z++) + { + for (cell_idx_t y = std::min(0, mainDirection[1]); y <= std::max(0, mainDirection[1]); y++) + { + for (cell_idx_t x = std::min(0, mainDirection[0]); x <= std::max(0, mainDirection[0]); x++) + { + if (x == 0 && y == 0 && z == 0) continue; + func(Vector3< cell_idx_t >(x, y, z)); + } + } + } +} + +/** + * Iterates all sub-directions of a given direction vector and runs a callback on each of them. + * Subdirections are any nonzero directions obtained by truncating zero or more components of a direction + * vector to zero. The direction vector itself is contained in this set. + * @param mainDirection The direction whose subdirections will be iterated + * @param func The callback that should be run for each subdirection. If the callback returns false, the + * iteration will be stopped. 
+ * @return true if the iteration completed, false if it was canceled + */ +inline bool forEachSubdirectionCancel(const Vector3< cell_idx_t > mainDirection, + const std::function< bool(Vector3< cell_idx_t >) >& func) +{ + for (cell_idx_t z = std::min(0, mainDirection[2]); z <= std::max(0, mainDirection[2]); z++) + { + for (cell_idx_t y = std::min(0, mainDirection[1]); y <= std::max(0, mainDirection[1]); y++) + { + for (cell_idx_t x = std::min(0, mainDirection[0]); x <= std::max(0, mainDirection[0]); x++) + { + if (x == 0 && y == 0 && z == 0) continue; + if (!func(Vector3< cell_idx_t >(x, y, z))) return false; + } + } + } + + return true; +} + +inline void getSubdirections(const Vector3< cell_idx_t > mainDirection, + std::vector< Vector3< cell_idx_t > > subdirections) +{ + forEachSubdirection(mainDirection, [&](Vector3< cell_idx_t > v) { subdirections.push_back(v); }); +} + +/** + * Iterates all directions orthogonal to d that are part of the given stencil, and executes a function on + * each of them. + * @tparam Stencil_T The underlying stencil + * @param d + * @param func + */ +template< typename Stencil_T > +inline void forEachOrthogonalDirection(Vector3< cell_idx_t > d, std::function< void(Vector3< cell_idx_t >) > func) +{ + for (cell_idx_t x = (d[0] == 0 ? -1 : 0); x <= (d[0] == 0 ? 1 : 0); x++) + for (cell_idx_t y = (d[1] == 0 ? -1 : 0); y <= (d[1] == 0 ? 1 : 0); y++) + for (cell_idx_t z = (d[2] == 0 ? -1 : 0); z <= (d[2] == 0 ? 
1 : 0); z++) + { + if (x == 0 && y == 0 && z == 0) continue; + if (Stencil_T::containsDir(stencil::vectorToDirection(x, y, z))) { func(Vector3(x, y, z)); } + } +} + +} // namespace util + +/*********************************************************************************************************************** + * Bit Mask Computation * + **********************************************************************************************************************/ + +template< typename LatticeStorageSpecification_T > +void NonuniformCommData< LatticeStorageSpecification_T >::registerFlags() +{ +#if !defined(USE_CELL_INTERVALS) + maskField_.registerFlag(FlagUID(true), INTERIOR_FLAG_BIT); + maskField_.registerFlag(FlagUID(true), PASS_THROUGH_FLAG_BIT); + maskField_.registerFlag(FlagUID(true), CORNER_SKIPPING_ORIGIN_FLAG_BIT); +#endif + + for(auto it = Stencil::beginNoCenter(); it != Stencil::end(); ++it){ + maskField_.registerFlag(FlagUID(true), Stencil::idx[*it]); + } +} + +#if defined(USE_CELL_INTERVALS) + +template< typename LatticeStorageSpecification_T > +inline void NonuniformCommData< LatticeStorageSpecification_T >::prepareIntervals() +{ + passThroughIntervals_.clear(); + const Block * b = dynamic_cast< const Block * >(block_); + + for(auto commDir = CommunicationStencil::beginNoCenter(); commDir != CommunicationStencil::end(); ++commDir){ + uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*commDir); + if(!b->neighborhoodSectionHasEquallySizedBlock(nSecIdx)){ + CellInterval ci; + maskField_.getGhostRegion(*commDir, ci, 2); + passThroughIntervals_.push_back(ci); + } + } +} + +template< typename LatticeStorageSpecification_T > +inline void NonuniformCommData< LatticeStorageSpecification_T >::setFlagOnInterval(const CellInterval & ci, + const uint_t fIdx) +{ + for(auto c : ci){ + maskField_.addFlag(c, IDX_FLAG(fIdx)); + } +} + +#else + +/** + * Prepares the INTERIOR and PASS_THROUGH flags. + * Sets the domain interior to INTERIOR. 
Sets any ghost layers corresponding to a coarse block + * or no block to PASS_THROUGH. + */ +template< typename LatticeStorageSpecification_T > +void NonuniformCommData< LatticeStorageSpecification_T >::prepareFlags() +{ + const Block * b = dynamic_cast< const Block * >(block_); + + // Set interior to origin + for (auto it = maskField_.beginXYZ(); it != maskField_.end(); ++it) + { + maskField_.addFlag(it.cell(), INTERIOR_FLAG); + } + + // Set GLs to pass-through + for(auto commDir = CommunicationStencil::beginNoCenter(); commDir != CommunicationStencil::end(); ++commDir){ + uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*commDir); + if(!b->neighborhoodSectionHasEquallySizedBlock(nSecIdx)){ + for(auto it = maskField_.beginGhostLayerOnlyXYZ(2, *commDir); it != maskField_.end(); ++it){ + maskField_.addFlag(it.cell(), PASS_THROUGH_FLAG); + } + } + } +} + +/** + * Resets the origin flag on any ghost layers. + */ +template< typename LatticeStorageSpecification_T > +inline void NonuniformCommData< LatticeStorageSpecification_T >::resetCornerSkippingOriginFlags() +{ + const Block * b = dynamic_cast< const Block * >(block_); + + // Remove origin flag from any ghost layers + for(auto commDir = CommunicationStencil::beginNoCenter(); commDir != CommunicationStencil::end(); ++commDir){ + uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*commDir); + if(!b->neighborhoodSectionHasEquallySizedBlock(nSecIdx)){ + for(auto it = maskField_.beginGhostLayerOnlyXYZ(2, *commDir); it != maskField_.end(); ++it){ + maskField_.removeFlag(it.cell(), CORNER_SKIPPING_ORIGIN_FLAG); + } + } + } +} + +#endif + + +/** + * Determines whether the current block has the smallest BlockID among all fine blocks of a + * given intersection volume. 
+ * @tparam LatticeStorageSpecification_T + * @param cornerDir + * @return + */ +template< typename LatticeStorageSpecification_T > +inline bool NonuniformCommData< LatticeStorageSpecification_T >::haveSmallestIdInIntersection(Vector3<cell_idx_t> cornerDir) +{ + const IBlockID& myId = block_->getId(); + const Block* b = dynamic_cast< const Block* >(block_); + return forEachSubdirectionCancel(cornerDir, [&](Vector3< cell_idx_t > dirVec) { + const uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(dirVec[0], dirVec[1], dirVec[2]); + if (b->neighborhoodSectionHasEquallySizedBlock(nSecIdx)) + { + if (b->getNeighbor(nSecIdx, 0).getId() < myId) return false; + } + return true; + }); +} + + +/** + * Sets up the feasible space for the given communication direction. + * Additionally to the field interior, marks every ghost layer slice corresponding to an adjacent coarse block, + * and the corresponding corner as feasible, if that corner also belongs to a coarse block and the current block + * has the smallest BlockID participating in the intersection. 
 * @param commDir A communication direction pointing toward an adjacent coarse block
 */
template< typename LatticeStorageSpecification_T >
inline void NonuniformCommData< LatticeStorageSpecification_T >::setupCornerSkippingOrigins(stencil::Direction commDir)
{
#if defined(USE_CELL_INTERVALS)
   cornerSkippingOriginIntervals_.clear();
#else
   resetCornerSkippingOriginFlags();
#endif

   const Block* b = dynamic_cast< const Block* >(block_);
   Vector3<cell_idx_t> commDirVec(stencil::cx[commDir], stencil::cy[commDir], stencil::cz[commDir]);

   // Iterate all orthogonal comm directions
   forEachOrthogonalDirection< CommunicationStencil >(commDirVec, [&](Vector3< cell_idx_t > toSourceVec) {
      const uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(toSourceVec[0], toSourceVec[1], toSourceVec[2]);
      // Find if there is a coarse block or no block at all in this neighborhood
      // There are three possibilities: Coarse block, Same-level block or no block
      // Finer block is not possible because of 2:1 balance
      if (!b->neighborhoodSectionHasEquallySizedBlock(nSecIdx))
      {
         // From this adjacent coarse block (or not-block, for boundary handling), corner skipping must be handled.
         // Also, if there is no block, boundary handling in that region must be done on only
         // one of the participating fine blocks.
         Vector3< cell_idx_t > cornerDirVec = toSourceVec + commDirVec;

         // If the current block has the smallest participating ID...
         if (haveSmallestIdInIntersection(cornerDirVec))
         {
            const stencil::Direction toSourceDir = stencil::vectorToDirection(toSourceVec);

            // ... Mark source GL region as corner skipping origin.
#if defined(USE_CELL_INTERVALS)
            CellInterval ci;
            maskField_.getGhostRegion(toSourceDir, ci, 2);
            cornerSkippingOriginIntervals_.push_back(ci);
#else
            for (auto it = maskField_.beginGhostLayerOnlyXYZ(toSourceDir); it != maskField_.end(); ++it)
            {
               maskField_.addFlag(it.cell(), CORNER_SKIPPING_ORIGIN_FLAG);
            }
#endif
         }
      }
   });
}


/**
 * Sets the bit for streamDir on every cell of the commDir ghost-layer slice whose upstream
 * neighbor (one cell back against streamDir) is INTERIOR, or whose upstream neighbor is
 * PASS_THROUGH and is in turn preceded by an INTERIOR or CORNER_SKIPPING_ORIGIN cell.
 * The interval branch mirrors the flag-based branch using interval intersections.
 * @param commDir   Communication direction of the ghost-layer slice being masked
 * @param streamDir Stencil direction whose mask bit is set
 */
template< typename LatticeStorageSpecification_T >
inline void NonuniformCommData< LatticeStorageSpecification_T >::setupBitMaskSlice(stencil::Direction commDir, stencil::Direction streamDir)
{
   uint_t fIdx = Stencil::idx[streamDir];
   Cell streamVec(stencil::cx[streamDir], stencil::cy[streamDir], stencil::cz[streamDir]);

#if defined(USE_CELL_INTERVALS)
   CellInterval commSliceInterval;
   maskField_.getGhostRegion(commDir, commSliceInterval, 2);

   // Shift back once
   commSliceInterval.shift(-streamVec);

   // Intersect with interior and set flag on intersection volume
   CellInterval interiorIntersection(interiorInterval);
   interiorIntersection.intersect(commSliceInterval);
   if(!interiorIntersection.empty()){
      interiorIntersection.shift(streamVec);
      setFlagOnInterval(interiorIntersection, fIdx);
   }

   // Intersect with pass-through regions...
   for(auto passThroughIntersection : std::as_const(passThroughIntervals_)){
      passThroughIntersection.intersect(commSliceInterval);
      if(passThroughIntersection.empty()) continue;

      // ... shift back once more ...
      passThroughIntersection.shift(-streamVec);

      // ... intersect with interior ...
      interiorIntersection = interiorInterval;
      interiorIntersection.intersect(passThroughIntersection);
      if(!interiorIntersection.empty()){
         // Shift forward twice: back to the ghost-layer cells the mask bit belongs to.
         interiorIntersection.shift(2*streamVec.x(), 2* streamVec.y(), 2*streamVec.z());
         setFlagOnInterval(interiorIntersection, fIdx);
      }

      // ... and with corner-skipping origin regions
      for(auto originIntersection : std::as_const(cornerSkippingOriginIntervals_)){
         originIntersection.intersect(passThroughIntersection);
         if(!originIntersection.empty()){
            originIntersection.shift(2*streamVec.x(), 2* streamVec.y(), 2*streamVec.z());
            setFlagOnInterval(originIntersection, fIdx);
         }
      }
   }
#else
   for(auto it = maskField_.beginGhostLayerOnlyXYZ(2, commDir); it != maskField_.end(); ++it){
      Cell currentCell = it.cell();

      // Shift back once
      Cell shiftedCell = currentCell - streamVec;

      if (maskField_.isFlagSet(shiftedCell, INTERIOR_FLAG)){
         maskField_.addFlag(currentCell, IDX_FLAG(fIdx));
      }
      else if (maskField_.isFlagSet(shiftedCell, PASS_THROUGH_FLAG)){
         // Shift back twice
         shiftedCell -= streamVec;
         if (maskField_.isPartOfMaskSet(shiftedCell, INTERIOR_FLAG | CORNER_SKIPPING_ORIGIN_FLAG)){
            maskField_.addFlag(currentCell, IDX_FLAG(fIdx));
         }

      }
      // else continue;
   }
#endif
}

/**
 * Computes the partial coalescence bit mask on the mask field.
 * Assumes that all flags are already registered at the field, and that the field
 * has been initialized to zero.
 */
template< typename LatticeStorageSpecification_T >
void NonuniformCommData< LatticeStorageSpecification_T >::computeBitMask()
{
#if defined(USE_CELL_INTERVALS)
   prepareIntervals();
#else
   prepareFlags();
#endif

   const Block* b = dynamic_cast< const Block* >(block_);
   for(auto commIt = CommunicationStencil::beginNoCenter(); commIt != CommunicationStencil::end(); ++commIt){
      stencil::Direction commDir = *commIt;
      const uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(commDir);
      // Masks are only needed where a larger (coarser) block neighbors this one.
      if(b->neighborhoodSectionHasLargerBlock(nSecIdx)){
         setupCornerSkippingOrigins(commDir);

         // Process every stencil direction that has a component in commDir.
         for(uint_t streamDirIdx = 0; streamDirIdx < Stencil::d_per_d_length[commDir]; streamDirIdx++){
            stencil::Direction streamDir = Stencil::d_per_d[commDir][streamDirIdx];
            setupBitMaskSlice(commDir, streamDir);
         }
      }
   }
}

} // walberla::lbm_generated
diff --git a/src/lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h b/src/lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h
new file mode 100644
index 0000000000000000000000000000000000000000..1b3e43a51dd7e7e8965e2152c58e493f73d8af84
--- /dev/null
+++ b/src/lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h
@@ -0,0 +1,317 @@
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt).
If not, see <http://www.gnu.org/licenses/>. +// +//! \file NonuniformGeneratedPdfPackInfo.h +//! \author Frederik Hennig <frederik.hennig@fau.de> +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "blockforest/communication/NonUniformPackInfo.h" + +#include "core/DataTypes.h" +#include "core/mpi/RecvBuffer.h" +#include "core/mpi/SendBuffer.h" + +#include "lbm_generated/communication/NonuniformCommData.h" +#include "lbm_generated/field/PdfField.h" + +namespace walberla::lbm_generated { +using stencil::Direction; + +namespace internal +{ +/* + * Base Template for Packing Kernels Wrapper. This wrapper is required for passing the time step to + * kernels generated for in-place streaming patterns. The generated code should not be templated. + */ +template< typename PdfField_T, bool inplace > +class NonuniformPackingKernelsWrapper +{ + public: + + void packAll(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer) const = 0; + void unpackAll(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer) const = 0; + void localCopyAll(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, + CellInterval& dstInterval) const = 0; + + void packDirection(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, Direction dir) const = 0; + void unpackDirection(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, Direction dir) const = 0; + void localCopyDirection(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, + CellInterval& dstInterval, Direction dir) const = 0; + + void unpackRedistribute(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, + stencil::Direction dir) const = 0; + + void packPartialCoalescence(PdfField_T* srcField, PartialCoalescenceMaskField* maskField, CellInterval& ci, + unsigned char* outBuffer, Direction dir) const = 0; + 
void zeroCoalescenceRegion(PdfField_T* dstField, CellInterval& ci, Direction dir) const = 0; + void unpackCoalescence(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, Direction dir) const = 0; + + uint_t size(CellInterval& ci, Direction dir) const = 0; + uint_t size(CellInterval& ci) const = 0; + uint_t redistributeSize(CellInterval& ci) const = 0; + uint_t partialCoalescenceSize(CellInterval& ci, Direction dir) const = 0; +}; + +/* + * Template Specialization for two-fields patterns, with trivial method wrappers. + */ +template< typename PdfField_T > +class NonuniformPackingKernelsWrapper< PdfField_T, false > +{ + public: + using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; + using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels; + + void packAll(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer) const + { + kernels_.packAll(srcField, ci, outBuffer); + } + + void unpackAll(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer) const + { + kernels_.unpackAll(dstField, ci, inBuffer); + } + + void localCopyAll(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, + CellInterval& dstInterval) const + { + kernels_.localCopyAll(srcField, srcInterval, dstField, dstInterval); + } + + void packDirection(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, Direction dir) const + { + kernels_.packDirection(srcField, ci, outBuffer, dir); + } + + void unpackDirection(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, Direction dir) const + { + kernels_.unpackDirection(dstField, ci, inBuffer, dir); + } + + void localCopyDirection(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, + CellInterval& dstInterval, Direction dir) const + { + kernels_.localCopyDirection(srcField, srcInterval, dstField, dstInterval, dir); + } + + void unpackRedistribute(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, + 
stencil::Direction dir) const
   {
      kernels_.unpackRedistribute(dstField, ci, inBuffer, dir);
   }

   void packPartialCoalescence(PdfField_T* srcField, PartialCoalescenceMaskField* maskField, CellInterval& ci,
                               unsigned char* outBuffer, Direction dir) const
   {
      kernels_.packPartialCoalescence(srcField, maskField, ci, outBuffer, dir);
   }

   void unpackCoalescence(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, Direction dir) const
   {
      kernels_.unpackCoalescence(dstField, ci, inBuffer, dir);
   }

   void zeroCoalescenceRegion(PdfField_T* dstField, CellInterval& ci, Direction dir) const
   {
      kernels_.zeroCoalescenceRegion(dstField, ci, dir);
   }

   uint_t size(CellInterval& ci, Direction dir) const { return kernels_.size(ci, dir); }
   uint_t size(CellInterval& ci) const { return kernels_.size(ci); }
   uint_t redistributeSize(CellInterval& ci) const { return kernels_.redistributeSize(ci); }
   uint_t partialCoalescenceSize(CellInterval& ci, Direction dir) const
   {
      return kernels_.partialCoalescenceSize(ci, dir);
   }

 private:
   PackingKernels_T kernels_;
};

/*
 * Template Specialization for in-place patterns, extracting the timestep from the lattice model.
 */
template< typename PdfField_T >
class NonuniformPackingKernelsWrapper< PdfField_T, true >
{
 public:
   using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification;
   using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels;

   // Every wrapper below queries the field's current timestep and forwards it to the generated
   // kernel, so that in-place streaming kernels address the correct PDF set.
   void packAll(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer) const
   {
      uint8_t timestep = srcField->getTimestep();
      kernels_.packAll(srcField, ci, outBuffer, timestep);
   }

   void unpackAll(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer) const
   {
      uint8_t timestep = dstField->getTimestep();
      kernels_.unpackAll(dstField, ci, inBuffer, timestep);
   }

   void localCopyAll(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField,
                     CellInterval& dstInterval) const
   {
      // Local copies require both fields to be in the same timestep.
      uint8_t timestep = srcField->getTimestep();
      WALBERLA_ASSERT_EQUAL(timestep, dstField->getTimestep())
      kernels_.localCopyAll(srcField, srcInterval, dstField, dstInterval, timestep);
   }

   void packDirection(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, Direction dir) const
   {
      uint8_t timestep = srcField->getTimestep();
      kernels_.packDirection(srcField, ci, outBuffer, dir, timestep);
   }

   void unpackDirection(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, Direction dir) const
   {
      uint8_t timestep = dstField->getTimestep();
      kernels_.unpackDirection(dstField, ci, inBuffer, dir, timestep);
   }

   void localCopyDirection(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField,
                           CellInterval& dstInterval, Direction dir) const
   {
      // Local copies require both fields to be in the same timestep.
      uint8_t timestep = srcField->getTimestep();
      WALBERLA_ASSERT_EQUAL(timestep, dstField->getTimestep())
      kernels_.localCopyDirection(srcField, srcInterval, dstField, dstInterval, dir, timestep);
   }

   void unpackRedistribute(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer,
                           stencil::Direction dir) const
   {
      uint8_t timestep = dstField->getTimestep();
kernels_.unpackRedistribute(dstField, ci, inBuffer, dir, timestep);
   }

   void packPartialCoalescence(PdfField_T* srcField, PartialCoalescenceMaskField* maskField, CellInterval& ci,
                               unsigned char* outBuffer, Direction dir) const
   {
      uint8_t timestep = srcField->getTimestep();
      kernels_.packPartialCoalescence(srcField, maskField, ci, outBuffer, dir, timestep);
   }

   void zeroCoalescenceRegion(PdfField_T* dstField, CellInterval& ci, Direction dir) const
   {
      uint8_t timestep = dstField->getTimestep();
      kernels_.zeroCoalescenceRegion(dstField, ci, dir, timestep);
   }

   void unpackCoalescence(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, Direction dir) const
   {
      uint8_t timestep = dstField->getTimestep();
      kernels_.unpackCoalescence(dstField, ci, inBuffer, dir, timestep);
   }

   uint_t size(CellInterval& ci, Direction dir) const { return kernels_.size(ci, dir); }
   uint_t size(CellInterval& ci) const { return kernels_.size(ci); }
   uint_t redistributeSize(CellInterval& ci) const { return kernels_.redistributeSize(ci); }
   uint_t partialCoalescenceSize(CellInterval& ci, Direction dir) const
   {
      return kernels_.partialCoalescenceSize(ci, dir);
   }

 private:
   PackingKernels_T kernels_;
};
} // namespace internal

/***********************************************************************************************************************
 *                                               Class Declaration                                                     *
 **********************************************************************************************************************/

/**
 * Nonuniform pack info for code-generated LBM kernels. Handles equal-level, coarse-to-fine
 * (redistribution) and fine-to-coarse (coalescence) communication on a refined block forest.
 */
template< typename PdfField_T >
class NonuniformGeneratedPdfPackInfo : public blockforest::communication::NonUniformPackInfo
{
 public:
   using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification;
   using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels;
   using Stencil = typename LatticeStorageSpecification_T::Stencil;
   using CommunicationStencil = typename LatticeStorageSpecification_T::CommunicationStencil;
   using CommData_T = NonuniformCommData< LatticeStorageSpecification_T >;


   NonuniformGeneratedPdfPackInfo(const BlockDataID pdfFieldID, const BlockDataID commDataID)
      : pdfFieldID_(pdfFieldID), commDataID_(commDataID){};

   bool constantDataExchange() const override { return true; };
   bool threadsafeReceiving() const override { return false; };

   /// Equal Level
   void unpackDataEqualLevel(Block* receiver, Direction dir, mpi::RecvBuffer& buffer) override;
   void communicateLocalEqualLevel(const Block* sender, Block* receiver, stencil::Direction dir) override;

   /// Coarse to Fine
   void unpackDataCoarseToFine(Block* fineReceiver, const BlockID& coarseSender, stencil::Direction dir,
                               mpi::RecvBuffer& buffer) override;
   void communicateLocalCoarseToFine(const Block* coarseSender, Block* fineReceiver, stencil::Direction dir) override;

   /// Fine to Coarse
   void prepareCoalescence(Block* coarseReceiver);
   void unpackDataFineToCoarse(Block* coarseReceiver, const BlockID& fineSender, stencil::Direction dir,
                               mpi::RecvBuffer& buffer) override;

   void communicateLocalFineToCoarse(const Block* fineSender, Block* coarseReceiver, stencil::Direction dir) override;

 protected:
   void packDataEqualLevelImpl(const Block* sender, stencil::Direction dir, mpi::SendBuffer& buffer) const override;

   void packDataCoarseToFineImpl(const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir,
                                 mpi::SendBuffer& buffer) const override;
   void packDataFineToCoarseImpl(const Block* fineSender, const BlockID& coarseReceiver, stencil::Direction dir,
                                 mpi::SendBuffer& buffer) const override;

 private:
   /// Helper Functions
   /// As in PdfFieldPackInfo.h
   Vector3< cell_idx_t > getNeighborShift(const BlockID& fineBlock, stencil::Direction dir) const;
   bool areNeighborsInDirection(const Block * block, const BlockID & neighborID, const Vector3< cell_idx_t> dirVec) const;

   CellInterval
intervalHullInDirection(const CellInterval& ci, const Vector3< cell_idx_t > tangentialDir,
                           cell_idx_t width) const;
   bool skipsThroughCoarseBlock(const Block* block, const Direction dir) const;

   void getCoarseBlockCommIntervals(const BlockID& fineBlockID, const Direction dir, const PdfField_T* field,
                                    std::vector< std::pair< Direction, CellInterval > >& intervals) const;
   void getFineBlockCommIntervals(const BlockID& fineBlockID, const Direction dir, const PdfField_T* field,
                                  std::vector< std::pair< Direction, CellInterval > >& intervals) const;

   CellInterval getCoarseBlockCoalescenceInterval(const Block * coarseBlock, const BlockID & fineBlockID,
                                                  Direction dir, const PdfField_T * field) const;

   const BlockDataID pdfFieldID_;
   internal::NonuniformPackingKernelsWrapper< PdfField_T, LatticeStorageSpecification_T::inplace > kernels_;

 public:
   // NOTE(review): deliberately public, presumably so external code can access the
   // communication data block directly — confirm before restricting access.
   const BlockDataID commDataID_;
};

/***********************************************************************************************************************
 *                                                 Factory Functions                                                   *
 **********************************************************************************************************************/

template< typename PdfField_T>
std::shared_ptr< NonuniformGeneratedPdfPackInfo< PdfField_T > >
   setupNonuniformPdfCommunication(const std::weak_ptr< StructuredBlockForest >& blocks, const BlockDataID pdfFieldID,
                                   const std::string& dataIdentifier = "NonuniformCommData");

} // walberla::lbm_generated

#include "lbm_generated/communication/NonuniformGeneratedPdfPackInfo.impl.h"
diff --git a/src/lbm_generated/communication/NonuniformGeneratedPdfPackInfo.impl.h b/src/lbm_generated/communication/NonuniformGeneratedPdfPackInfo.impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..cf36a61f9813989b5e975e6782f5c3ea138a3e96
--- /dev/null
+++ b/src/lbm_generated/communication/NonuniformGeneratedPdfPackInfo.impl.h
@@ -0,0 +1,490 @@
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file NonuniformGeneratedPdfPackInfo.impl.h
//! \author Frederik Hennig <frederik.hennig@fau.de>
//! \author Markus Holzer <markus.holzer@fau.de>
//
//======================================================================================================================

#pragma once

#include "NonuniformGeneratedPdfPackInfo.h"

using namespace walberla::lbm_generated::util;

namespace walberla::lbm_generated {

/***********************************************************************************************************************
 *                                                 Factory Functions                                                   *
 **********************************************************************************************************************/


/**
 * Sets up a NonuniformGeneratedPdfPackInfo.
+ * + * @tparam LatticeStorageSpecification_T + * @tparam PackingKernels_T + * @param blocks + * @param pdfFieldID + * @param dataIdentifier + * @return + */ +template< typename PdfField_T> +std::shared_ptr< NonuniformGeneratedPdfPackInfo< PdfField_T > > +setupNonuniformPdfCommunication( const std::weak_ptr< StructuredBlockForest > & blocks, + const BlockDataID pdfFieldID, + const std::string & dataIdentifier) +{ + using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; + + auto sbf = blocks.lock(); + WALBERLA_CHECK_NOT_NULLPTR(sbf) + + auto handling = std::make_shared<NonuniformCommDataHandling< LatticeStorageSpecification_T > >(blocks); + BlockDataID commDataID = sbf->addBlockData(handling, dataIdentifier); + + return std::make_shared<NonuniformGeneratedPdfPackInfo< PdfField_T > >(pdfFieldID, commDataID); +} + + +/*********************************************************************************************************************** + * Equal Level Communication * + **********************************************************************************************************************/ + +template< typename PdfField_T> +void NonuniformGeneratedPdfPackInfo< PdfField_T >::unpackDataEqualLevel(Block* receiver, + Direction dir, + mpi::RecvBuffer& buffer) +{ + auto field = receiver->getData< PdfField_T >(pdfFieldID_); + CellInterval ci; + cell_idx_t gls = skipsThroughCoarseBlock(receiver, dir) ? 
2 : 1; + field->getGhostRegion(dir, ci, gls, false); + uint_t size = kernels_.size(ci, dir); + unsigned char* bufferPtr = buffer.skip(size); + kernels_.unpackDirection(field, ci, bufferPtr, dir); +} + +template< typename PdfField_T> +void NonuniformGeneratedPdfPackInfo< PdfField_T >::communicateLocalEqualLevel( + const Block* sender, Block* receiver, stencil::Direction dir) +{ + auto srcField = const_cast< Block* >(sender)->getData< PdfField_T >(pdfFieldID_); + auto dstField = receiver->getData< PdfField_T >(pdfFieldID_); + + CellInterval srcRegion; + CellInterval dstRegion; + cell_idx_t gls = skipsThroughCoarseBlock(sender, dir) ? 2 : 1; + srcField->getSliceBeforeGhostLayer(dir, srcRegion, gls, false); + dstField->getGhostRegion(stencil::inverseDir[dir], dstRegion, gls, false); + kernels_.localCopyDirection(srcField, srcRegion, dstField, dstRegion, dir); +} + +template< typename PdfField_T> +void NonuniformGeneratedPdfPackInfo< PdfField_T >::packDataEqualLevelImpl( + const Block* sender, stencil::Direction dir, mpi::SendBuffer& buffer) const +{ + auto field = const_cast< Block* >(sender)->getData< PdfField_T >(pdfFieldID_); + CellInterval ci; + cell_idx_t gls = skipsThroughCoarseBlock(sender, dir) ? 
2 : 1; + field->getSliceBeforeGhostLayer(dir, ci, gls, false); + unsigned char* bufferPtr = buffer.forward(kernels_.size(ci, dir)); + kernels_.packDirection(field, ci, bufferPtr, dir); +} + +/*********************************************************************************************************************** + * Coarse to Fine Communication * + **********************************************************************************************************************/ + +template< typename PdfField_T> +void NonuniformGeneratedPdfPackInfo< PdfField_T >::packDataCoarseToFineImpl( + const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir, mpi::SendBuffer& buffer) const +{ + auto field = const_cast< Block* >(coarseSender)->getData< PdfField_T >(pdfFieldID_); + + std::vector< std::pair< Direction, CellInterval > > intervals; + getCoarseBlockCommIntervals(fineReceiver, dir, field, intervals); + + for (auto t : intervals) + { + CellInterval ci = t.second; + unsigned char* bufferPtr = buffer.forward(kernels_.size(ci)); + kernels_.packAll(field, ci, bufferPtr); + } +} + +template< typename PdfField_T> +void NonuniformGeneratedPdfPackInfo< PdfField_T >::unpackDataCoarseToFine( + Block* fineReceiver, const BlockID& /*coarseSender*/, stencil::Direction dir, mpi::RecvBuffer& buffer) +{ + auto field = fineReceiver->getData< PdfField_T >(pdfFieldID_); + + std::vector< std::pair< Direction, CellInterval > > intervals; + getFineBlockCommIntervals(fineReceiver->getId(), dir, field, intervals); + + for (auto t : intervals) + { + Direction d = t.first; + CellInterval ci = t.second; + uint_t size = kernels_.redistributeSize(ci); + unsigned char* bufferPtr = buffer.skip(size); + kernels_.unpackRedistribute(field, ci, bufferPtr, d); + } +} + +template< typename PdfField_T> +void NonuniformGeneratedPdfPackInfo< PdfField_T >::communicateLocalCoarseToFine( + const Block* coarseSender, Block* fineReceiver, stencil::Direction dir) +{ + auto srcField = const_cast< Block* 
>(coarseSender)->getData< PdfField_T >(pdfFieldID_); + auto dstField = fineReceiver->getData< PdfField_T >(pdfFieldID_); + + std::vector< std::pair< Direction, CellInterval > > srcIntervals; + getCoarseBlockCommIntervals(fineReceiver->getId(), dir, srcField, srcIntervals); + + std::vector< std::pair< Direction, CellInterval > > dstIntervals; + getFineBlockCommIntervals(fineReceiver->getId(), stencil::inverseDir[dir], dstField, dstIntervals); + + WALBERLA_ASSERT_EQUAL(srcIntervals.size(), dstIntervals.size()) + + for(size_t index = 0; index < srcIntervals.size(); index++) + { + CellInterval srcInterval = srcIntervals[index].second; + + Direction unpackDir = dstIntervals[index].first; + CellInterval dstInterval = dstIntervals[index].second; + + uint_t packSize = kernels_.size(srcInterval); + +#ifndef NDEBUG + Direction const packDir = srcIntervals[index].first; + WALBERLA_ASSERT_EQUAL(packDir, stencil::inverseDir[unpackDir]) + uint_t unpackSize = kernels_.redistributeSize(dstInterval); + WALBERLA_ASSERT_EQUAL(packSize, unpackSize) +#endif + + // TODO: This is a dirty workaround. Code-generate direct redistribution! 
+ std::vector< unsigned char > buffer(packSize); + kernels_.packAll(srcField, srcInterval, &buffer[0]); + kernels_.unpackRedistribute(dstField, dstInterval, &buffer[0], unpackDir); + } +} + +/*********************************************************************************************************************** + * Fine to Coarse Communication * + **********************************************************************************************************************/ + +template< typename PdfField_T> +void NonuniformGeneratedPdfPackInfo< PdfField_T >::prepareCoalescence(Block* coarseReceiver) +{ + auto dstField = coarseReceiver->getData<PdfField_T>(pdfFieldID_); + + for(auto it = CommunicationStencil::beginNoCenter(); it != CommunicationStencil::end(); ++it){ + uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*it); + if(coarseReceiver->neighborhoodSectionHasSmallerBlocks(nSecIdx)){ + CellInterval ci; + dstField->getSliceBeforeGhostLayer(*it, ci, 1); + kernels_.zeroCoalescenceRegion(dstField, ci, *it); + } + } +} + +template< typename PdfField_T> +void walberla::lbm_generated::NonuniformGeneratedPdfPackInfo< PdfField_T >::unpackDataFineToCoarse( + walberla::Block* coarseReceiver, const walberla::BlockID& fineSender, walberla::stencil::Direction dir, + walberla::mpi::RecvBuffer& buffer) +{ + auto dstField = coarseReceiver->getData<PdfField_T>(pdfFieldID_); + + CellInterval ci = getCoarseBlockCoalescenceInterval(coarseReceiver, fineSender, dir, dstField); + uint_t size = kernels_.size(ci, dir); + unsigned char* bufferPtr = buffer.skip(size); + kernels_.unpackCoalescence(dstField, ci, bufferPtr, dir); +} + +template< typename PdfField_T> +void walberla::lbm_generated::NonuniformGeneratedPdfPackInfo< PdfField_T >::communicateLocalFineToCoarse( + const walberla::Block* fineSender, walberla::Block* coarseReceiver, walberla::stencil::Direction dir) +{ + Block * varFineSender = const_cast< Block * >(fineSender); + auto srcField = varFineSender->getData< 
PdfField_T >(pdfFieldID_); + auto srcCommData = varFineSender->getData< CommData_T >(commDataID_); + PartialCoalescenceMaskField * maskField = &(srcCommData->getMaskField()); + auto dstField = coarseReceiver->getData<PdfField_T>(pdfFieldID_); + Direction invDir = stencil::inverseDir[dir]; + + CellInterval srcInterval; + srcField->getGhostRegion(dir, srcInterval, 2); + uint_t packSize = kernels_.partialCoalescenceSize(srcInterval, dir); + + CellInterval dstInterval = getCoarseBlockCoalescenceInterval(coarseReceiver, fineSender->getId(), + invDir, dstField); + +#ifndef NDEBUG + uint_t unpackSize = kernels_.size(dstInterval, invDir); + WALBERLA_ASSERT_EQUAL(packSize, unpackSize) +#endif + + // TODO: This is a dirty workaround. Code-generate direct redistribution! + std::vector< unsigned char > buffer(packSize); + kernels_.packPartialCoalescence(srcField, maskField, srcInterval, &buffer[0], dir); + kernels_.unpackCoalescence(dstField, dstInterval, &buffer[0], invDir); +} + +template< typename PdfField_T> +void walberla::lbm_generated::NonuniformGeneratedPdfPackInfo< PdfField_T >::packDataFineToCoarseImpl( + const walberla::Block* fineSender, const walberla::BlockID& /*coarseReceiver*/, walberla::stencil::Direction dir, + walberla::mpi::SendBuffer& buffer) const +{ + Block* varBlock = const_cast< Block* >(fineSender); + auto srcField = varBlock->getData< PdfField_T >(pdfFieldID_); + auto commData = varBlock->getData< CommData_T >(commDataID_); + PartialCoalescenceMaskField * maskField = &(commData->getMaskField()); + + CellInterval ci; + srcField->getGhostRegion(dir, ci, 2); + uint_t size = kernels_.partialCoalescenceSize(ci, dir); + unsigned char* bufferPtr = buffer.forward(size); + kernels_.packPartialCoalescence(srcField, maskField, ci, bufferPtr, dir); +} + +/*********************************************************************************************************************** + * Helper Functions * + 
**********************************************************************************************************************/ + +template< typename PdfField_T> +inline Vector3< cell_idx_t > +NonuniformGeneratedPdfPackInfo< PdfField_T >::getNeighborShift(const BlockID& fineBlock, + stencil::Direction dir) const +{ + // dir: direction from coarse to fine block, or vice versa + Vector3< cell_idx_t > shift; + + uint_t const branchId = fineBlock.getBranchId(); + + shift[0] = (stencil::cx[dir] == 0) ? (((branchId & uint_t(1)) == uint_t(0)) ? cell_idx_t(-1) : cell_idx_t(1)) : + cell_idx_t(0); + shift[1] = (stencil::cy[dir] == 0) ? (((branchId & uint_t(2)) == uint_t(0)) ? cell_idx_t(-1) : cell_idx_t(1)) : + cell_idx_t(0); + shift[2] = (Stencil::D == uint_t(3)) ? + ((stencil::cz[dir] == 0) ? (((branchId & uint_t(4)) == uint_t(0)) ? cell_idx_t(-1) : cell_idx_t(1)) : + cell_idx_t(0)) : + cell_idx_t(0); + + return shift; +} + +/** + * Returns the part of a cell interval's hull of given width in direction dirVec. + * @param ci The original cell interval + * @param dirVec Direction Vector + * @param width Width of the hull + * @return Interval forming the part of the hull + */ +template< typename PdfField_T> +inline CellInterval NonuniformGeneratedPdfPackInfo< PdfField_T >::intervalHullInDirection( + const CellInterval& ci, const Vector3< cell_idx_t > dirVec, cell_idx_t width) const +{ + CellInterval result(ci); + for (uint_t i = 0; i < Stencil::D; i++) + { + if (dirVec[i] == 1) + { + result.min()[i] = result.max()[i] + cell_idx_t(1); + result.max()[i] += width; + } + if (dirVec[i] == -1) + { + result.max()[i] = result.min()[i] - cell_idx_t(1); + result.min()[i] -= width; + } + } + + return result; +} + +/** + * For edge or corner directions, checks if a coarser block is part of the respective edge or corner intersection. + * @param block The local block + * @param dir The direction to check + * @return `true` if dir is an edge or corner direction skipping through a coarser block. 
+ */ +template< typename PdfField_T> +inline bool NonuniformGeneratedPdfPackInfo< PdfField_T >::skipsThroughCoarseBlock( + const Block* block, const Direction dir) const +{ + Vector3< cell_idx_t > dirVec(stencil::cx[dir], stencil::cy[dir], stencil::cz[dir]); + bool coarseBlockFound = false; + forEachSubdirectionCancel(dirVec, [&](Vector3< cell_idx_t > subdir) { + coarseBlockFound = + coarseBlockFound || block->neighborhoodSectionHasLargerBlock( + blockforest::getBlockNeighborhoodSectionIndex(subdir[0], subdir[1], subdir[2])); + return !coarseBlockFound; + }); + + return coarseBlockFound; +} + +/** + * For coarse-to-fine and fine-to-coarse communication, returns a list of pairs (Direction, CellInterval) + * mapping sub-directions of the communication direction to cell intervals on the coarse block interior + * whose data must be communicated <i>as if</i> communicating in those sub-directions. + * @param fineBlockID ID of the fine block + * @param dir Direction from the coarse to the fine block + * @param field Pointer to the PDF field on the coarse block + * @param intervals Vector that will be filled with the computed intervals + */ +template< typename PdfField_T> +inline void NonuniformGeneratedPdfPackInfo< PdfField_T >::getCoarseBlockCommIntervals( + const BlockID& fineBlockID, const Direction dir, const PdfField_T* field, + std::vector< std::pair< Direction, CellInterval > >& intervals) const +{ + Vector3< cell_idx_t > shift = getNeighborShift(fineBlockID, dir); + + CellInterval mainSlice; + field->getSliceBeforeGhostLayer(dir, mainSlice, 1, false); + + // In all directions, restrict the slice to the lower or upper half, depending on neighbor shift + for (uint_t i = 0; i != Stencil::D; ++i) + { + if (shift[i] == cell_idx_t(-1)) + { + WALBERLA_ASSERT_EQUAL(mainSlice.size(i) & 1, 0) + mainSlice.max()[i] = mainSlice.min()[i] + cell_idx_c(mainSlice.size(i) / uint_t(2)) - cell_idx_t(1); + } + if (shift[i] == cell_idx_t(1)) + { + 
WALBERLA_ASSERT_EQUAL(mainSlice.size(i) & 1, 0) + mainSlice.min()[i] = mainSlice.min()[i] + cell_idx_c(mainSlice.size(i) / uint_t(2)); + } + } + + intervals.emplace_back(dir, mainSlice); + + Vector3< cell_idx_t > const commDirVec{ stencil::cx[dir], stencil::cy[dir], stencil::cz[dir] }; + + // Get extended slices in all tangential directions for the diagonal part of communication + forEachSubdirection(-shift, [&](Vector3< cell_idx_t > t) { + CellInterval hullInterval = intervalHullInDirection(mainSlice, t, cell_idx_t(1)); + Direction subCommDir = stencil::vectorToDirection(commDirVec - t); + if(CommunicationStencil::containsDir(subCommDir)){ + intervals.emplace_back(subCommDir, hullInterval); + } + }); +} + +/** + * For coarse-to-fine and fine-to-coarse communication, returns a list of pairs (Direction, CellInterval) + * mapping sub-directions of the communication direction to cell intervals on the fine block whose data must + * be communicated <i>as if</i> communicating in those sub-directions. 
+ * @param fineBlockID ID of the fine block + * @param dir Direction from the fine to the coarse block + * @param field Pointer to the PDF Field on the fine block + * @param intervals Vector that will be filled with the computed intervals + */ +template< typename PdfField_T> +inline void NonuniformGeneratedPdfPackInfo< PdfField_T >::getFineBlockCommIntervals( + const BlockID& fineBlockID, const Direction dir, const PdfField_T* field, + std::vector< std::pair< Direction, CellInterval > >& intervals) const +{ + Vector3< cell_idx_t > shift = getNeighborShift(fineBlockID, dir); + + CellInterval mainSlice; + field->getGhostRegion(dir, mainSlice, 2, false); + intervals.emplace_back(dir, mainSlice); + + Vector3< cell_idx_t > const commDirVec{ stencil::cx[dir], stencil::cy[dir], stencil::cz[dir] }; + + forEachSubdirection(-shift, [&](Vector3< cell_idx_t > t) { + CellInterval hullInterval = intervalHullInDirection(mainSlice, t, cell_idx_t(2)); + Direction subCommDir = stencil::vectorToDirection(commDirVec + t); + if(CommunicationStencil::containsDir(subCommDir)){ + intervals.emplace_back(subCommDir, hullInterval); + } + }); +} +/** + * Checks whether or not the block with ID `neighborID` is a neighbor of `block` in direction `dir`. 
+ */ +template< typename PdfField_T> +bool NonuniformGeneratedPdfPackInfo< PdfField_T >::areNeighborsInDirection( + const Block* block, const BlockID& neighborID, const Vector3< cell_idx_t> dirVec) const +{ + uint_t const nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(dirVec[0], dirVec[1], dirVec[2]); + uint_t const nSecSize = block->getNeighborhoodSectionSize(nSecIdx); + + for(uint_t i = 0; i < nSecSize; i++){ + if(block->getNeighborId(nSecIdx, i) == neighborID){ + return true; + } + } + return false; +} + +template< typename PdfField_T> +CellInterval NonuniformGeneratedPdfPackInfo< PdfField_T >::getCoarseBlockCoalescenceInterval( + const Block* coarseBlock, const BlockID& fineBlockID, Direction dir, const PdfField_T* field) const +{ + Direction mainDir(dir); + Vector3< cell_idx_t > commDirVec(stencil::cx[dir], stencil::cy[dir], stencil::cz[dir]); + Vector3< cell_idx_t > mainDirVec(commDirVec); + bool isAsymmetric = !areNeighborsInDirection(coarseBlock, fineBlockID, commDirVec); + + // If asymmetric, find the main subdirection + if(isAsymmetric){ + mainDirVec = Vector3< cell_idx_t >(0); + forEachSubdirection(commDirVec, [&](Vector3< cell_idx_t > subdirVec){ + if(areNeighborsInDirection(coarseBlock, fineBlockID, subdirVec)){ + // -dir is one main communication direction from F to C, but, due to periodicity, + // it might not be the only one. Find the main comm direction from the subdirections + // that is largest in the 1-norm. 
+ if(subdirVec.sqrLength() > mainDirVec.sqrLength()) mainDirVec = subdirVec; + } + }); + mainDir = stencil::vectorToDirection(mainDirVec); + } + + Vector3< cell_idx_t > shift = getNeighborShift(fineBlockID, mainDir); + + CellInterval mainSlice; + field->getSliceBeforeGhostLayer(mainDir, mainSlice, 1, false); + + // In all directions, restrict the slice to the lower or upper half, depending on neighbor shift + for (uint_t i = 0; i != Stencil::D; ++i) + { + if (shift[i] == cell_idx_t(-1)) + { + WALBERLA_ASSERT_EQUAL(mainSlice.size(i) & 1, 0) + mainSlice.max()[i] = mainSlice.min()[i] + cell_idx_c(mainSlice.size(i) / uint_t(2)) - cell_idx_t(1); + } + if (shift[i] == cell_idx_t(1)) + { + WALBERLA_ASSERT_EQUAL(mainSlice.size(i) & 1, 0) + mainSlice.min()[i] = mainSlice.min()[i] + cell_idx_c(mainSlice.size(i) / uint_t(2)); + } + } + + CellInterval commSlice(mainSlice); + + // If asymmetric, find coalescence slice as hull of main slice + if(isAsymmetric){ + commSlice = intervalHullInDirection(mainSlice, mainDirVec - commDirVec, 1); + } + + return commSlice; +} + +} // walberla::lbm_generated diff --git a/src/lbm_generated/communication/UniformGeneratedPdfPackInfo.h b/src/lbm_generated/communication/UniformGeneratedPdfPackInfo.h new file mode 100644 index 0000000000000000000000000000000000000000..76d28617a2b7f7be888eb1ed84ecb945a23bc229 --- /dev/null +++ b/src/lbm_generated/communication/UniformGeneratedPdfPackInfo.h @@ -0,0 +1,291 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file UniformGeneratedPdfPackInfo.h +//! \ingroup lbm +//! \author Frederik Hennig <frederik.hennig@fau.de> +//! \brief Class Template for Lattice Boltzmann PDF Pack Infos using code-generated kernels +// +//====================================================================================================================== + +#pragma once + +#include "communication/UniformPackInfo.h" + +#include "core/DataTypes.h" +#include "core/cell/CellInterval.h" + +#include "lbm/field/PdfField.h" + +#include "stencil/Directions.h" + +namespace walberla +{ +using communication::UniformPackInfo; + +namespace lbm_generated +{ +using stencil::Direction; + +namespace internal +{ +/* + * Base Template for Packing Kernels Wrapper. This wrapper is required for passing the time step to + * kernels generated for in-place streaming patterns. The generated code should not be templated. 
+ */ +template< typename PdfField_T, bool inplace > +class UniformPackingKernelsWrapper +{ + public: + + void packAll(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer) const = 0; + void unpackAll(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer) const = 0; + void localCopyAll(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, + CellInterval& dstInterval) const = 0; + + void packDirection(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, const Direction dir) const = 0; + void unpackDirection(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, const Direction dir) const = 0; + void localCopyDirection(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, + CellInterval& dstInterval, const Direction dir) const = 0; + + uint_t size(CellInterval& ci, const Direction dir) const = 0; + uint_t size(CellInterval& ci) const = 0; +}; + +/* + * Template Specialization for two-fields patterns, with trivial method wrappers. 
+ */ +template< typename PdfField_T > +class UniformPackingKernelsWrapper< PdfField_T, false > +{ + public: + using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; + using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels; + + void packAll(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer) const + { + kernels_.packAll(srcField, ci, outBuffer); + } + + void unpackAll(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer) const + { + kernels_.unpackAll(dstField, ci, inBuffer); + } + + void localCopyAll(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, + CellInterval& dstInterval) const + { + kernels_.localCopyAll(srcField, srcInterval, dstField, dstInterval); + } + + void packDirection(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, const Direction dir) const + { + kernels_.packDirection(srcField, ci, outBuffer, dir); + } + + void unpackDirection(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, const Direction dir) const + { + kernels_.unpackDirection(dstField, ci, inBuffer, dir); + } + + void localCopyDirection(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, + CellInterval& dstInterval, const Direction dir) const + { + kernels_.localCopyDirection(srcField, srcInterval, dstField, dstInterval, dir); + } + + uint_t size(CellInterval& ci, const Direction dir) const { return kernels_.size(ci, dir); } + uint_t size(CellInterval& ci) const { return kernels_.size(ci); } + + private: + PackingKernels_T kernels_; +}; + +/* + * Template Specialization for in-place patterns, extracting the timestep from the lattice model. 
+ */ +template< typename PdfField_T > +class UniformPackingKernelsWrapper< PdfField_T, true > +{ + public: + using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; + using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels; + + void packAll(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer) const + { + uint8_t timestep = srcField->getTimestep(); + kernels_.packAll(srcField, ci, outBuffer, timestep); + } + + void unpackAll(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer) const + { + uint8_t timestep = dstField->getTimestep(); + kernels_.unpackAll(dstField, ci, inBuffer, timestep); + } + + void localCopyAll(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, + CellInterval& dstInterval) const + { + uint8_t timestep = srcField->getTimestep(); + WALBERLA_ASSERT_EQUAL(timestep, dstField->getTimestep()) + kernels_.localCopyAll(srcField, srcInterval, dstField, dstInterval, timestep); + } + + void packDirection(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, const Direction dir) const + { + uint8_t timestep = srcField->getTimestep(); + kernels_.packDirection(srcField, ci, outBuffer, dir, timestep); + } + + void unpackDirection(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, const Direction dir) const + { + uint8_t timestep = dstField->getTimestep(); + kernels_.unpackDirection(dstField, ci, inBuffer, dir, timestep); + } + + void localCopyDirection(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, + CellInterval& dstInterval, const Direction dir) const + { + uint8_t timestep = srcField->getTimestep(); + WALBERLA_ASSERT_EQUAL(timestep, dstField->getTimestep()) + kernels_.localCopyDirection(srcField, srcInterval, dstField, dstInterval, dir, timestep); + } + + uint_t size(CellInterval& ci, const Direction dir) const { return kernels_.size(ci, dir); } + uint_t size(CellInterval& ci) const { return kernels_.size(ci); } + + 
private: + PackingKernels_T kernels_; +}; +} // namespace internal + +/** + * Pack Info class template for lattice Boltzmann PDF fields. Relies on a code-generated + * class providing kernel implementations for packing, unpacking and local copying of data. + * + * This template relies on a PackingKernels implementation generated by lbmpy_walberla.packing_kernels. + * The code generated part provides the kernels for transferring data between communication buffers + * and fields. The iteration slices are constructed by this class. + * + * The code-generated substructure enables the usage of arbitrary, in particular in-place streaming + * patterns. + * + * @tparam PdfField_T Type of the PdfField to be communicated; its LatticeStorageSpecification + * provides the code-generated PackingKernels implementation used internally. + * + * \ingroup lbm + */ +template< typename PdfField_T > +class UniformGeneratedPdfPackInfo : public UniformPackInfo +{ + public: + using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; + using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels; + using Stencil = typename LatticeStorageSpecification_T::Stencil; + + /** + * Constructor. + * + * @param pdfFieldID ID of the associated walberla::lbm::PdfField + * @param cellLayersToSend The amount of cell layers that should be communicated + * @param sendAll If true, instead of only those populations streaming in subdirections of the communication + * direction, all populations will always be communicated. + * \warning Be careful when using this option with any streaming pattern other than + * the pull pattern. Other patterns store at least some of their post-collision + * populations in neighbouring cells. This might lead to out-of-bounds errors when + * copying to the outermost ghost layer! Solve this by adding an additional ghost layer + * as a safety margin.
+ */ + UniformGeneratedPdfPackInfo(const BlockDataID pdfFieldID, cell_idx_t cellLayersToSend = 1, bool sendAll = false) + : pdfFieldID_(pdfFieldID), ghostLayersToSend_(cellLayersToSend), sendAll_(sendAll) + {} + + bool constantDataExchange() const override { return true; } + bool threadsafeReceiving() const override { return true; } + + void unpackData(IBlock * receiver, Direction dir, mpi::RecvBuffer & buffer) override; + void communicateLocal(const IBlock * sender, IBlock * receiver, Direction dir) override; + + protected: + void packDataImpl(const IBlock * sender, Direction dir, mpi::SendBuffer & buffer) const override; + + private: + const BlockDataID pdfFieldID_; + internal::UniformPackingKernelsWrapper< PdfField_T, LatticeStorageSpecification_T::inplace > kernels_; + cell_idx_t ghostLayersToSend_; + bool sendAll_; +}; + +template< typename PdfField_T > +void UniformGeneratedPdfPackInfo< PdfField_T >::unpackData( IBlock * receiver, Direction dir, mpi::RecvBuffer& buffer) +{ + auto field = receiver->getData< PdfField_T >(pdfFieldID_); + CellInterval ci; + field->getGhostRegion(dir, ci, ghostLayersToSend_, false); + + if (sendAll_) + { + unsigned char* bufferPtr = buffer.skip(kernels_.size(ci)); + kernels_.unpackAll(field, ci, bufferPtr); + } + else + { + uint_t size = kernels_.size(ci, dir); + unsigned char* bufferPtr = buffer.skip(size); + kernels_.unpackDirection(field, ci, bufferPtr, dir); + } +} + +template< typename PdfField_T > +void UniformGeneratedPdfPackInfo< PdfField_T >::communicateLocal(const IBlock* sender, IBlock* receiver, Direction dir) +{ + auto srcField = const_cast< IBlock* >(sender)->getData< PdfField_T >(pdfFieldID_); + auto dstField = receiver->getData< PdfField_T >(pdfFieldID_); + + CellInterval srcRegion; + CellInterval dstRegion; + srcField->getSliceBeforeGhostLayer(dir, srcRegion, ghostLayersToSend_, false); + dstField->getGhostRegion(stencil::inverseDir[dir], dstRegion, ghostLayersToSend_, false); + + if (sendAll_) { + 
kernels_.localCopyAll(srcField, srcRegion, dstField, dstRegion); + } + else + { + kernels_.localCopyDirection(srcField, srcRegion, dstField, dstRegion, dir); + } +} + +template< typename PdfField_T> +void UniformGeneratedPdfPackInfo< PdfField_T >:: packDataImpl(const IBlock* sender, Direction dir, mpi::SendBuffer& buffer) const +{ + auto field = const_cast< IBlock* >(sender)->getData< PdfField_T >(pdfFieldID_); + CellInterval ci; + field->getSliceBeforeGhostLayer(dir, ci, ghostLayersToSend_, false); + + if (sendAll_) + { + unsigned char* bufferPtr = buffer.forward(kernels_.size(ci)); + kernels_.packAll(field, ci, bufferPtr); + } + else + { + unsigned char* bufferPtr = buffer.forward(kernels_.size(ci, dir)); + kernels_.packDirection(field, ci, bufferPtr, dir); + } +} + +} // namespace lbm_generated +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/evaluation/CMakeLists.txt b/src/lbm_generated/evaluation/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..922cf93c3cb989af797b913baa227a2cf1735b23 --- /dev/null +++ b/src/lbm_generated/evaluation/CMakeLists.txt @@ -0,0 +1,4 @@ +target_sources( lbm_generated + PRIVATE + PerformanceEvaluation.h + ) diff --git a/src/lbm_generated/evaluation/PerformanceEvaluation.h b/src/lbm_generated/evaluation/PerformanceEvaluation.h new file mode 100644 index 0000000000000000000000000000000000000000..9fb7e934a2506ca360af12882a0775bcf8281eb6 --- /dev/null +++ b/src/lbm_generated/evaluation/PerformanceEvaluation.h @@ -0,0 +1,415 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version.
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file PerformanceEvaluation.h +//! \ingroup lbm_generated +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" +#include "core/Hostname.h" +#include "core/Set.h" +#include "core/waLBerlaBuildInfo.h" +#include "core/debug/CheckFunctions.h" +#include "core/logging/Logging.h" +#include "core/mpi/MPIManager.h" +#include "core/uid/SUID.h" + +#include "domain_decomposition/StructuredBlockStorage.h" + +#include "field/CellCounter.h" +#include "field/FlagUID.h" + +#include <cstdlib> +#include <map> +#include <string> +#include <sstream> + + +namespace walberla::lbm_generated { + + +//********************************************************************************************************************** +/*! 
+* \brief Class for evaluating the performance of LBM simulations +*/ +//********************************************************************************************************************** +template< typename CellCounter_T, typename FluidCellCounter_T > +class PerformanceEvaluationBase +{ +public: + + PerformanceEvaluationBase( const weak_ptr< StructuredBlockStorage > & blocks, + const CellCounter_T & cellCounter, const FluidCellCounter_T & fluidCellCounter, + const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), + const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ); + + void refresh(); + + void logResultOnRoot( const uint_t timeSteps, const double time ) const + { + WALBERLA_LOG_RESULT_ON_ROOT( "Simulation performance:\n" << loggingString( timeSteps, time ) ) + } + + void logInfoOnRoot( const uint_t timeSteps, const double time ) const + { + WALBERLA_LOG_INFO_ON_ROOT( "Simulation performance:\n" << loggingString( timeSteps, time ) ) + } + + std::string loggingString( const uint_t timeSteps, const double time ) const; + + void getResultsForSQLOnRoot( std::map< std::string, int > & integerProperties, + std::map< std::string, double > & realProperties, + std::map< std::string, std::string > & stringProperties, + const uint_t timeSteps, const double time ); + + static int processes() { return mpi::MPIManager::instance()->numProcesses(); } + + int threads() const { return processes() * threadsPerProcess_; } + int cores() const { return ( threadsPerCore_ == 0 ) ? 
0 : ( threads() / threadsPerCore_ ); } + + uint64_t allFineCells() const + { + uint64_t c( uint64_t(0) ); + for( uint_t i = uint_t(0); i < levels_; ++i ) + c += cells_.numberOfCells(i) * uint64_c( math::uintPow8( levels_ - uint_t(1) - i ) ); + return c; + } + + double mlups( const uint_t timeSteps, const double time ) const + { + double m( 0.0 ); + for( uint_t i = uint_t(0); i < levels_; ++i ) + m += double_c( timeSteps * math::uintPow2(i) ) * double_c( cells_.numberOfCells(i) ); + return m / ( time * 1000000.0 ); + } + + double mlupsPerProcess( const uint_t timeSteps, const double time ) const + { + return mlups( timeSteps, time ) / processes(); + } + + double mlupsPerCore( const uint_t timeSteps, const double time ) const + { + return ( cores() == 0 ) ? 0.0 : ( mlups( timeSteps, time ) / cores() ); + } + + double vMlups( const uint_t timeSteps, const double time ) const + { + double m( 0.0 ); + for( uint_t i = uint_t(0); i < levels_; ++i ) + m += double_c( timeSteps * math::uintPow2( levels_ - uint_t(1) ) ) * + double_c( uint64_c( math::uintPow8( levels_ - uint_t(1) - i ) ) * cells_.numberOfCells(i) ); + return m / ( time * 1000000.0 ); + } + + double vMlupsPerProcess( const uint_t timeSteps, const double time ) const + { + return vMlups( timeSteps, time ) / processes(); + } + + double vMlupsPerCore( const uint_t timeSteps, const double time ) const + { + return ( cores() == 0 ) ? 0.0 : ( vMlups( timeSteps, time ) / cores() ); + } + + double mflups( const uint_t timeSteps, const double time ) const + { + double m( 0.0 ); + for( uint_t i = uint_t(0); i < levels_; ++i ) + m += double_c( timeSteps * math::uintPow2(i) ) * double_c( fluidCells_.numberOfCells(i) ); + return m / ( time * 1000000.0 ); + } + + double mflupsPerProcess( const uint_t timeSteps, const double time ) const + { + return mflups( timeSteps, time ) / processes(); + } + + double mflupsPerCore( const uint_t timeSteps, const double time ) const + { + return ( cores() == 0 ) ? 
0.0 : ( mflups( timeSteps, time ) / cores() ); + } + + double vMflups( const uint_t timeSteps, const double time ) const + { + double m( 0.0 ); + for( uint_t i = uint_t(0); i < levels_; ++i ) + m += double_c( timeSteps * math::uintPow2( levels_ - uint_t(1) ) ) * + double_c( uint64_c( math::uintPow8( levels_ - uint_t(1) - i ) ) * fluidCells_.numberOfCells(i) ); + return m / ( time * 1000000.0 ); + } + + double vMflupsPerProcess( const uint_t timeSteps, const double time ) const + { + return vMflups( timeSteps, time ) / processes(); + } + + double vMflupsPerCore( const uint_t timeSteps, const double time ) const + { + return ( cores() == 0 ) ? 0.0 : ( vMflups( timeSteps, time ) / cores() ); + } + + static double timeStepsPerSecond( const uint_t timeSteps, const double time ) { return double_c( timeSteps ) / time; } + + double fineTimeStepsPerSecond( const uint_t timeSteps, const double time ) const + { + return double_c( timeSteps * math::uintPow2( levels_ - uint_t(1) ) ) / time; + } + +private: + + int threadsPerProcess_{ 1 }; + int threadsPerCore_{ 0 }; + + weak_ptr< StructuredBlockStorage > blocks_; + uint_t levels_; + + CellCounter_T cells_; + FluidCellCounter_T fluidCells_; + +}; // class PerformanceEvaluationBase + + + +//********************************************************************************************************************** +/*! +* \brief Class for evaluating the performance of LBM simulations using fields +* +* Assumes that in-between creating an object of this class and calling any of the member functions the number of cells +* and the number of fluid cells do not change! For simulations with static geometry, this is always the case. 
+*/ +//********************************************************************************************************************** +template< typename FlagField_T > +class PerformanceEvaluation : public PerformanceEvaluationBase< field::CellCounter< FlagField_T >, field::CellCounter< FlagField_T > > +{ +public: + PerformanceEvaluation( const weak_ptr< StructuredBlockStorage > & blocks, + const ConstBlockDataID & flagFieldId, const Set< FlagUID > & fluid, + const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), + const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) + : PerformanceEvaluationBase< field::CellCounter< FlagField_T >, field::CellCounter< FlagField_T > >( + blocks, + field::CellCounter< FlagField_T >( blocks, flagFieldId, Set< FlagUID >::emptySet(), requiredSelectors, incompatibleSelectors ), + field::CellCounter< FlagField_T >( blocks, flagFieldId, fluid, requiredSelectors, incompatibleSelectors ), + requiredSelectors, incompatibleSelectors ) + { + } +}; + + +template< typename CellCounter_T, typename FluidCellCounter_T > +PerformanceEvaluationBase< CellCounter_T, FluidCellCounter_T >::PerformanceEvaluationBase( + const weak_ptr< StructuredBlockStorage > & blocks, + const CellCounter_T & cellCounter, const FluidCellCounter_T & fluidCellCounter, + const Set<SUID> & /*requiredSelectors*/, const Set<SUID> & /*incompatibleSelectors*/ ) + : blocks_( blocks ), + cells_( cellCounter ), + fluidCells_( fluidCellCounter ) +{ +#ifdef _OPENMP + if( std::getenv( "OMP_NUM_THREADS" ) == NULL ) + WALBERLA_ABORT( "If you are using a version of the program that was compiled with OpenMP you have to " + "specify the environment variable \'OMP_NUM_THREADS\' accordingly!" 
); + threadsPerProcess_ = std::atoi( std::getenv( "OMP_NUM_THREADS" ) ); +#endif + + if( std::getenv( "THREADS_PER_CORE" ) ) + threadsPerCore_ = std::atoi( std::getenv( "THREADS_PER_CORE" ) ); + + refresh(); +} + + + +template< typename CellCounter_T, typename FluidCellCounter_T > +void PerformanceEvaluationBase< CellCounter_T, FluidCellCounter_T >::refresh() +{ + auto blocks = blocks_.lock(); + WALBERLA_CHECK_NOT_NULLPTR( blocks, "Trying to access 'PerformanceEvaluation' for a block storage object that doesn't exist anymore" ) + + levels_ = blocks->getNumberOfLevels(); + + cells_(); + fluidCells_(); +} + + + +template< typename CellCounter_T, typename FluidCellCounter_T > +std::string PerformanceEvaluationBase< CellCounter_T, FluidCellCounter_T >::loggingString( const uint_t timeSteps, const double time ) const +{ + std::ostringstream oss; + + std::string na( "n/a *)" ); + + std::ostringstream threadsPerCoreString; + threadsPerCoreString << threadsPerCore_; + + std::ostringstream coresString; + coresString << cores(); + + oss << "- processes: " << processes() + << "\n- threads: " << threads() << " (threads per process = " << threadsPerProcess_ + << ", threads per core = " << ( ( threadsPerCore_ == 0 ) ? na : threadsPerCoreString.str() ) << ")" + << "\n- cores: " << ( ( threadsPerCore_ == 0 ) ? 
na : coresString.str() ) + << "\n- time steps: " << timeSteps; + + if( levels_ > uint_t(1) ) + { + oss << " (on the coarsest grid, " << ( timeSteps * math::uintPow2( levels_ - uint_t(1) ) ) << " on the finest grid)"; + } + + oss << "\n- time: " << time << " sec" + << "\n- cells: " << cells_.numberOfCells(); + + if( levels_ > uint_t(1) ) + { + oss << " (" << allFineCells() << " if everything were fine -> data reduction by factor of " + << ( real_c( allFineCells() ) / real_c( cells_.numberOfCells() ) ) << ")"; + } + + oss << "\n- fluid cells: " << fluidCells_.numberOfCells() << " (" + << ( real_c(100) * real_c( fluidCells_.numberOfCells() ) / real_c( cells_.numberOfCells() ) ) << " % of all cells)"; + + if( levels_ > uint_t(1) ) + { + oss << "\n- distribution of cells to different grid levels:"; + for( uint_t i = uint_t(0); i < levels_; ++i ) + oss << "\n + level " << i <<": " << cells_.numberOfCells(i) << " cells (" << fluidCells_.numberOfCells(i) << " fluid cells = " + << ( real_c(100) * real_c( fluidCells_.numberOfCells(i) ) / real_c( cells_.numberOfCells(i) ) ) + << " % of all cells on this level)"; + } + + std::ostringstream mlupsPerCoreString; + mlupsPerCoreString << mlupsPerCore( timeSteps, time ); + + std::ostringstream mflupsPerCoreString; + mflupsPerCoreString << mflupsPerCore( timeSteps, time ); + + oss << "\n- performance: " << mlups( timeSteps, time ) << " MLUPS (million lattice cell updates per second)" + << "\n " << mlupsPerProcess( timeSteps, time ) << " MLUPS / process" + << "\n " << ( ( threadsPerCore_ == 0 ) ? na : mlupsPerCoreString.str() ) << " MLUPS / core" + << "\n " << mflups( timeSteps, time ) << " MFLUPS (million fluid lattice cell updates per second)" + << "\n " << mflupsPerProcess( timeSteps, time ) << " MFLUPS / process" + << "\n " << ( ( threadsPerCore_ == 0 ) ? 
na : mflupsPerCoreString.str() ) << " MFLUPS / core" + << "\n " << timeStepsPerSecond( timeSteps, time ) << " time steps / second"; + + if( levels_ > uint_t(1) ) + { + std::ostringstream vMlupsPerCoreString; + vMlupsPerCoreString << vMlupsPerCore( timeSteps, time ); + + std::ostringstream vMflupsPerCoreString; + vMflupsPerCoreString << vMflupsPerCore( timeSteps, time ); + + oss << "\n- 'virtual' performance (if everything were fine): " << vMlups( timeSteps, time ) << " MLUPS (million lattice cell updates per second)" + << "\n " << vMlupsPerProcess( timeSteps, time ) << " MLUPS / process" + << "\n " << ( ( threadsPerCore_ == 0 ) ? na : vMlupsPerCoreString.str() ) << " MLUPS / core" + << "\n " << vMflups( timeSteps, time ) << " MFLUPS (million fluid lattice cell updates per second)" + << "\n " << vMflupsPerProcess( timeSteps, time ) << " MFLUPS / process" + << "\n " << ( ( threadsPerCore_ == 0 ) ? na : vMflupsPerCoreString.str() ) << " MFLUPS / core" + << "\n " << fineTimeStepsPerSecond( timeSteps, time ) << " fine time steps / second"; + } + + oss << "\n- build / run information:" + << "\n + host machine: " << getHostName() + << "\n + build machine: " << WALBERLA_BUILD_MACHINE + << "\n + git SHA1: " << WALBERLA_GIT_SHA1 + << "\n + build type: " << WALBERLA_BUILD_TYPE + << "\n + compiler flags: " << WALBERLA_COMPILER_FLAGS; + + if( threadsPerCore_ == 0 ) + oss << "\n\n *) only available if environment variable 'THREADS_PER_CORE' is set"; + + return oss.str(); +} + + + +template< typename CellCounter_T, typename FluidCellCounter_T > +void PerformanceEvaluationBase< CellCounter_T, FluidCellCounter_T >::getResultsForSQLOnRoot( std::map< std::string, int > & integerProperties, + std::map< std::string, double > & realProperties, + std::map< std::string, std::string > & stringProperties, + const uint_t timeSteps, const double time ) +{ + WALBERLA_NON_ROOT_SECTION() + { + return; + } + + integerProperties[ "levels" ] = int_c( levels_ ); + integerProperties[ "processes" ] = 
processes(); + integerProperties[ "threads" ] = threads(); + integerProperties[ "cores" ] = cores(); + integerProperties[ "threadsPerProcess" ] = threadsPerProcess_; + integerProperties[ "threadsPerCore" ] = threadsPerCore_; + + integerProperties[ "timeSteps" ] = int_c( timeSteps ); + if( levels_ > uint_t(1) ) + integerProperties[ "fineTimeSteps" ] = int_c( timeSteps * math::uintPow2( levels_ - uint_t(1) ) ); + + realProperties[ "time" ] = real_c( time ); + + realProperties[ "cells" ] = real_c( cells_.numberOfCells() ); + if( levels_ > uint_t(1) ) + realProperties[ "refinementCellsReduction" ] = real_c( allFineCells() ) / real_c( cells_.numberOfCells() ); + realProperties[ "fluidCells" ] = real_c( fluidCells_.numberOfCells() ); + + if( levels_ > uint_t(1) ) + { + for( uint_t i = uint_t(0); i < levels_; ++i ) + { + std::ostringstream cells_i; + std::ostringstream fluidCells_i; + + cells_i << "cells_" << i; + fluidCells_i << "fluidCells_" << i; + + realProperties[ cells_i.str() ] = real_c( cells_.numberOfCells(i) ); + realProperties[ fluidCells_i.str() ] = real_c( fluidCells_.numberOfCells(i) ); + } + } + + realProperties[ "MLUPS" ] = double_c( mlups( timeSteps, time ) ); + realProperties[ "MLUPS_process" ] = double_c( mlupsPerProcess( timeSteps, time ) ); + realProperties[ "MLUPS_core" ] = double_c( mlupsPerCore( timeSteps, time ) ); + realProperties[ "MFLUPS" ] = double_c( mflups( timeSteps, time ) ); + realProperties[ "MFLUPS_process" ] = double_c( mflupsPerProcess( timeSteps, time ) ); + realProperties[ "MFLUPS_core" ] = double_c( mflupsPerCore( timeSteps, time ) ); + realProperties[ "timeStepsPerSecond" ] = double_c( timeStepsPerSecond( timeSteps, time ) ); + + if( levels_ > uint_t(1) ) + { + realProperties[ "vMLUPS" ] = double_c( vMlups( timeSteps, time ) ); + realProperties[ "vMLUPS_process" ] = double_c( vMlupsPerProcess( timeSteps, time ) ); + realProperties[ "vMLUPS_core" ] = double_c( vMlupsPerCore( timeSteps, time ) ); + realProperties[ "vMFLUPS" ] = 
double_c( vMflups( timeSteps, time ) ); + realProperties[ "vMFLUPS_process" ] = double_c( vMflupsPerProcess( timeSteps, time ) ); + realProperties[ "vMFLUPS_core" ] = double_c( vMflupsPerCore( timeSteps, time ) ); + realProperties[ "fineTimeStepsPerSecond" ] = double_c( fineTimeStepsPerSecond( timeSteps, time ) ); + } + + stringProperties[ "hostMachine" ] = std::string( getHostName() ); + stringProperties[ "buildMachine" ] = std::string( WALBERLA_BUILD_MACHINE ); + stringProperties[ "gitVersion" ] = std::string( WALBERLA_GIT_SHA1 ); + stringProperties[ "buildType" ] = std::string( WALBERLA_BUILD_TYPE ); + stringProperties[ "compilerFlags" ] = std::string( WALBERLA_COMPILER_FLAGS ); +} + +} // namespace walberla::lbm_generated diff --git a/src/lbm_generated/field/AddToStorage.h b/src/lbm_generated/field/AddToStorage.h new file mode 100644 index 0000000000000000000000000000000000000000..afb86819931238443443f3095f73880aec401d36 --- /dev/null +++ b/src/lbm_generated/field/AddToStorage.h @@ -0,0 +1,207 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file AddToStorage.h +//! \ingroup lbm_generated +//! 
\author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "PdfField.h" +#include "core/debug/CheckFunctions.h" +#include "core/debug/Debug.h" +#include "core/uid/SUID.h" +#include "field/blockforest/BlockDataHandling.h" + +namespace walberla::lbm_generated { + +namespace internal { + +template< typename LatticeStorageSpecification_T > +class PdfFieldHandling : public field::BlockDataHandling< PdfField<LatticeStorageSpecification_T>, + LatticeStorageSpecification_T::Stencil::D == 2 > +{ +public: + + using PdfField_T = PdfField<LatticeStorageSpecification_T>; + using Base_T = field::BlockDataHandling<PdfField_T, LatticeStorageSpecification_T::Stencil::D == 2>; + + PdfFieldHandling( const weak_ptr< StructuredBlockStorage > & blocks, const LatticeStorageSpecification_T & storageSpecification, + const uint_t nrOfGhostLayers, const field::Layout & layout, const shared_ptr< field::FieldAllocator<real_t> > alloc = nullptr ) : + blocks_( blocks ), storageSpecification_( storageSpecification ), + nrOfGhostLayers_( nrOfGhostLayers ), layout_( layout ), alloc_( alloc ){} + + inline void serialize( IBlock * const block, const BlockDataID & id, mpi::SendBuffer & buffer ) override + { + Base_T::serialize( block, id, buffer ); + } + + void serializeCoarseToFine( Block * const block, const BlockDataID & id, mpi::SendBuffer & buffer, const uint_t child ) override + { + Base_T::serializeCoarseToFine( block, id, buffer, child ); + } + + void serializeFineToCoarse( Block * const block, const BlockDataID & id, mpi::SendBuffer & buffer ) override + { + Base_T::serializeFineToCoarse( block, id, buffer ); + } + + void deserialize( IBlock * const block, const BlockDataID & id, mpi::RecvBuffer & buffer ) override + { + Base_T::deserialize( block, id, buffer ); + } + + void deserializeCoarseToFine( Block * const block, const BlockDataID & id, 
mpi::RecvBuffer & buffer ) override + { + Base_T::deserializeCoarseToFine( block, id, buffer ); + } + + void deserializeFineToCoarse( Block * const block, const BlockDataID & id, mpi::RecvBuffer & buffer, const uint_t child ) override + { + Base_T::deserializeFineToCoarse( block, id, buffer, child ); + } + +protected: + + PdfField<LatticeStorageSpecification_T> * allocate( IBlock * const block ) override + { + return allocateDispatch( block ); + } + + PdfField<LatticeStorageSpecification_T> * reallocate( IBlock * const block ) override + { + return allocateDispatch( block ); + } + +private: + + + PdfField<LatticeStorageSpecification_T> * allocateDispatch( IBlock * const block ) + { + WALBERLA_ASSERT_NOT_NULLPTR( block ) + + auto blocks = blocks_.lock(); + WALBERLA_CHECK_NOT_NULLPTR( blocks ) + + return new PdfField_T( blocks->getNumberOfXCells( *block ), blocks->getNumberOfYCells( *block ), blocks->getNumberOfZCells( *block ), + storageSpecification_, nrOfGhostLayers_, layout_, alloc_ ); + } + + weak_ptr< StructuredBlockStorage > blocks_; + LatticeStorageSpecification_T storageSpecification_; + + uint_t nrOfGhostLayers_; + field::Layout layout_; + shared_ptr< field::FieldAllocator<real_t> > alloc_; + +}; // class PdfFieldHandling + +} // namespace internal + + + +template< typename LatticeStorageSpecification_T, typename BlockStorage_T > +BlockDataID addPdfFieldToStorage( const shared_ptr< BlockStorage_T > & blocks, const std::string & identifier, + const LatticeStorageSpecification_T & storageSpecification, + const uint_t ghostLayers, + const field::Layout & layout = field::fzyx, + const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), + const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet(), + const shared_ptr< field::FieldAllocator<real_t> > alloc = nullptr) +{ + return blocks->addBlockData( make_shared< internal::PdfFieldHandling< LatticeStorageSpecification_T > >( + blocks, storageSpecification, ghostLayers, layout, alloc ), + identifier, 
requiredSelectors, incompatibleSelectors ); +} + +template< typename LatticeStorageSpecification_T, typename BlockStorage_T > +BlockDataID addPdfFieldToStorage( const shared_ptr< BlockStorage_T > & blocks, const std::string & identifier, + const LatticeStorageSpecification_T & storageSpecification, + const field::Layout & layout = field::fzyx, + const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), + const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet(), + const shared_ptr< field::FieldAllocator<real_t> > alloc = nullptr) +{ + auto ghostLayers = uint_c(1); + + return blocks->addBlockData( make_shared< internal::PdfFieldHandling< LatticeStorageSpecification_T > >( + blocks, storageSpecification, ghostLayers, layout, alloc ), + identifier, requiredSelectors, incompatibleSelectors ); +} + +template< typename LatticeStorageSpecification_T, typename BlockStorage_T > +BlockDataID addPdfFieldToStorage( const shared_ptr< BlockStorage_T > & blocks, const std::string & identifier, + const LatticeStorageSpecification_T & storageSpecification, + const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), + const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet(), + const shared_ptr< field::FieldAllocator<real_t> > alloc = nullptr) +{ + auto ghostLayers = uint_c(1); + auto layout = field::fzyx; + + return blocks->addBlockData( make_shared< internal::PdfFieldHandling< LatticeStorageSpecification_T > >( + blocks, storageSpecification, ghostLayers, layout, alloc ), + identifier, requiredSelectors, incompatibleSelectors ); +} + +template< typename LatticeStorageSpecification_T, typename BlockStorage_T > +BlockDataID addPdfFieldToStorage( const shared_ptr< BlockStorage_T > & blocks, const std::string & identifier, + const LatticeStorageSpecification_T & storageSpecification, + const shared_ptr< field::FieldAllocator<real_t> > alloc = nullptr) +{ + auto ghostLayers = uint_c(1); + auto layout = field::fzyx; + auto requiredSelectors = Set<SUID>::emptySet(); + 
auto incompatibleSelectors = Set<SUID>::emptySet(); + + return blocks->addBlockData( make_shared< internal::PdfFieldHandling< LatticeStorageSpecification_T > >( + blocks, storageSpecification, ghostLayers, layout, alloc ), + identifier, requiredSelectors, incompatibleSelectors ); +} + +template< typename LatticeStorageSpecification_T, typename BlockStorage_T > +BlockDataID addPdfFieldToStorage( const shared_ptr< BlockStorage_T > & blocks, const std::string & identifier, + const LatticeStorageSpecification_T & storageSpecification, + const field::Layout & layout = field::fzyx, + const shared_ptr< field::FieldAllocator<real_t> > alloc = nullptr) +{ + auto ghostLayers = uint_c(1); + auto requiredSelectors = Set<SUID>::emptySet(); + auto incompatibleSelectors = Set<SUID>::emptySet(); + + return blocks->addBlockData( make_shared< internal::PdfFieldHandling< LatticeStorageSpecification_T > >( + blocks, storageSpecification, ghostLayers, layout, alloc ), + identifier, requiredSelectors, incompatibleSelectors ); +} + +template< typename LatticeStorageSpecification_T, typename BlockStorage_T > +BlockDataID addPdfFieldToStorage( const shared_ptr< BlockStorage_T > & blocks, const std::string & identifier, + const LatticeStorageSpecification_T & storageSpecification, + const uint_t ghostLayers, + const field::Layout & layout, + const shared_ptr< field::FieldAllocator<real_t> > alloc) +{ + auto requiredSelectors = Set<SUID>::emptySet(); + auto incompatibleSelectors = Set<SUID>::emptySet(); + + return blocks->addBlockData( make_shared< internal::PdfFieldHandling< LatticeStorageSpecification_T > >( + blocks, storageSpecification, ghostLayers, layout, alloc ), + identifier, requiredSelectors, incompatibleSelectors ); +} + + +} // namespace walberla::lbm_generated diff --git a/src/lbm_generated/field/CMakeLists.txt b/src/lbm_generated/field/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..63bc11c8f920acc3e4c244488d72899fd7a24245 --- /dev/null +++ 
b/src/lbm_generated/field/CMakeLists.txt @@ -0,0 +1,5 @@ +target_sources( lbm_generated + PRIVATE + AddToStorage.h + PdfField.h + ) \ No newline at end of file diff --git a/src/lbm_generated/field/PdfField.h b/src/lbm_generated/field/PdfField.h new file mode 100644 index 0000000000000000000000000000000000000000..6e6b7ee88fd5e9ee0be1dbfb46da6d6e524d5536 --- /dev/null +++ b/src/lbm_generated/field/PdfField.h @@ -0,0 +1,136 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file PdfField.h +//! \ingroup lbm_generated +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "field/GhostLayerField.h" +#include "field/SwapableCompare.h" + + +namespace walberla::lbm_generated { + +template< typename LatticeStorageSpecification_T > +class PdfField : public GhostLayerField< real_t, LatticeStorageSpecification_T::Stencil::Size > +{ +public: + + //** Type Definitions ********************************************************************************************** + /*! 
\name Type Definitions */ + //@{ + using LatticeStorageSpecification = LatticeStorageSpecification_T; + using Stencil = typename LatticeStorageSpecification_T::Stencil; + + using value_type = typename GhostLayerField<real_t, Stencil::Size>::value_type; + + using Ptr = typename GhostLayerField<real_t, Stencil::Size>::Ptr; + using ConstPtr = typename GhostLayerField<real_t, Stencil::Size>::ConstPtr; + //@} + //******************************************************************************************************************* + + PdfField( const uint_t _xSize, const uint_t _ySize, const uint_t _zSize, + const LatticeStorageSpecification_T & storageSpecification, + const uint_t ghostLayers = uint_t(1), const field::Layout & _layout = field::zyxf, + const shared_ptr< field::FieldAllocator<real_t> > & alloc = shared_ptr< field::FieldAllocator<real_t> >() ); + + ~PdfField() override = default; + + inline PdfField * clone() const; + inline PdfField * cloneUninitialized() const; + inline PdfField * cloneShallowCopy() const; + + + ///////////////////////////////////////////////// + // Access functions (with stencil::Direction!) 
// + ///////////////////////////////////////////////// + + using GhostLayerField< real_t, Stencil::Size >::get; + + real_t & get( cell_idx_t x, cell_idx_t y, cell_idx_t z, stencil::Direction d ) { return get( x, y, z, Stencil::idx[d] ); } + const real_t & get( cell_idx_t x, cell_idx_t y, cell_idx_t z, stencil::Direction d ) const { return get( x, y, z, Stencil::idx[d] ); } + real_t & get( const Cell & c, stencil::Direction d ) { return get( c.x(), c.y(), c.z(), Stencil::idx[d] ); } + const real_t & get( const Cell & c, stencil::Direction d ) const { return get( c.x(), c.y(), c.z(), Stencil::idx[d] ); } + + using GhostLayerField< real_t, Stencil::Size >::operator(); + + real_t & operator()( cell_idx_t x, cell_idx_t y, cell_idx_t z, stencil::Direction d ) { return get( x, y, z, Stencil::idx[d] ); } + const real_t & operator()( cell_idx_t x, cell_idx_t y, cell_idx_t z, stencil::Direction d ) const { return get( x, y, z, Stencil::idx[d] ); } + real_t & operator()( const Cell & c, stencil::Direction d ) { return get( c.x(), c.y(), c.z(), Stencil::idx[d] ); } + const real_t & operator()( const Cell & c, stencil::Direction d ) const { return get( c.x(), c.y(), c.z(), Stencil::idx[d] ); } + + +protected: + //** Shallow Copy *************************************************************************************************** + /*! 
\name Shallow Copy */ + //@{ + inline PdfField( const PdfField< LatticeStorageSpecification_T > & other ); + Field< real_t, Stencil::Size > * cloneShallowCopyInternal() const override { return new PdfField< LatticeStorageSpecification_T >( *this ); } + //@} + //******************************************************************************************************************* + + LatticeStorageSpecification_T storageSpecification_; +}; + + + +template< typename LatticeStorageSpecification_T > +PdfField< LatticeStorageSpecification_T >::PdfField( const uint_t _xSize, const uint_t _ySize, const uint_t _zSize, + const LatticeStorageSpecification_T & storageSpecification, + const uint_t ghostLayers, const field::Layout & _layout, + const shared_ptr< field::FieldAllocator<real_t> > & alloc ) : + + GhostLayerField< real_t, Stencil::Size >( _xSize, _ySize, _zSize, ghostLayers, _layout, alloc ), + storageSpecification_( storageSpecification ) + +{ +#ifdef _OPENMP + // take care of proper thread<->memory assignment (first-touch allocation policy !) 
+ this->setWithGhostLayer( real_t(0) ); +#endif + this->setWithGhostLayer( real_t(0) ); +} + + + +template< typename LatticeStorageSpecification_T > +inline PdfField< LatticeStorageSpecification_T > * PdfField< LatticeStorageSpecification_T >::clone() const +{ + return dynamic_cast< PdfField * >( GhostLayerField< real_t, Stencil::Size >::clone() ); +} + +template< typename LatticeStorageSpecification_T > +inline PdfField< LatticeStorageSpecification_T > * PdfField< LatticeStorageSpecification_T >::cloneUninitialized() const +{ + return dynamic_cast< PdfField * >( GhostLayerField< real_t, Stencil::Size >::cloneUninitialized() ); +} + +template< typename LatticeStorageSpecification_T > +inline PdfField< LatticeStorageSpecification_T > * PdfField< LatticeStorageSpecification_T >::cloneShallowCopy() const +{ + return dynamic_cast< PdfField * >( GhostLayerField< real_t, Stencil::Size >::cloneShallowCopy() ); +} + +template< typename LatticeStorageSpecification_T > +inline PdfField< LatticeStorageSpecification_T >::PdfField( const PdfField< LatticeStorageSpecification_T > & other ) + : GhostLayerField< real_t, Stencil::Size >::GhostLayerField( other ) +{ +} + +} // namespace lbm diff --git a/src/lbm_generated/gpu/AddToStorage.h b/src/lbm_generated/gpu/AddToStorage.h new file mode 100644 index 0000000000000000000000000000000000000000..ef8f28409709ad37244276e3b68269d0edcf19da --- /dev/null +++ b/src/lbm_generated/gpu/AddToStorage.h @@ -0,0 +1,105 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file AddToStorage.h +//! \ingroup lbm_generated +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "core/debug/CheckFunctions.h" +#include "core/debug/Debug.h" +#include "core/uid/SUID.h" + +#include "gpu/GPUWrapper.h" +#include "gpu/FieldCopy.h" + +#include "field/blockforest/BlockDataHandling.h" + +#include "GPUPdfField.h" + +namespace walberla::lbm_generated +{ + +namespace internal +{ + +template< typename LatticeStorageSpecification_T> +GPUPdfField< LatticeStorageSpecification_T > * createGPUPdfField( const IBlock * const block, + const StructuredBlockStorage * const bs, + const LatticeStorageSpecification_T& storageSpecification, + const uint_t ghostLayers, + const field::Layout & layout, + const bool usePitchedMem ) +{ + using GPUField_T = GPUPdfField< LatticeStorageSpecification_T >; + + auto gpuField = new GPUField_T(bs->getNumberOfXCells( *block ), + bs->getNumberOfYCells( *block ), + bs->getNumberOfZCells( *block ), + storageSpecification, ghostLayers, + layout, usePitchedMem); + + return gpuField; +} + +template< typename Field_T, typename LatticeStorageSpecification_T > +GPUPdfField< LatticeStorageSpecification_T >* + createGPUPdfFieldFromCPUPdfField(const IBlock* const block, const StructuredBlockStorage* const, + const LatticeStorageSpecification_T& storageSpecification, + ConstBlockDataID cpuFieldID, const bool usePitchedMem, const bool copyCPUField = true) +{ + using GPUField_T = 
GPUPdfField< LatticeStorageSpecification_T >; + + const Field_T* f = block->getData< Field_T >(cpuFieldID); + + auto gpuField = new GPUField_T(f->xSize(), f->ySize(), f->zSize(), storageSpecification, f->nrOfGhostLayers(), + f->layout(), usePitchedMem); + + if (copyCPUField) + gpu::fieldCpy(*gpuField, *f); + + return gpuField; +} + +} // namespace internal + +template< typename GPUField_T, typename LatticeStorageSpecification_T > +BlockDataID addGPUPdfFieldToStorage(const shared_ptr< StructuredBlockStorage >& bs, + const std::string & identifier, + const LatticeStorageSpecification_T& storageSpecification, + const Layout layout = fzyx, + const uint_t nrOfGhostLayers = 1, + const bool usePitchedMem = true ) +{ + + auto func = std::bind(internal::createGPUPdfField< LatticeStorageSpecification_T >, + std::placeholders::_1, std::placeholders::_2, storageSpecification, nrOfGhostLayers, layout, usePitchedMem); + return bs->addStructuredBlockData< GPUPdfField< LatticeStorageSpecification_T > >(func, identifier); +} + +template< typename Field_T, typename LatticeStorageSpecification_T > +BlockDataID addGPUPdfFieldToStorage(const shared_ptr< StructuredBlockStorage >& bs, ConstBlockDataID cpuFieldID, + const LatticeStorageSpecification_T& storageSpecification, + const std::string& identifier, const bool usePitchedMem = true, const bool copyCPUField = true) +{ + auto func = std::bind(internal::createGPUPdfFieldFromCPUPdfField< Field_T, LatticeStorageSpecification_T >, + std::placeholders::_1, std::placeholders::_2, storageSpecification, cpuFieldID, usePitchedMem, copyCPUField); + return bs->addStructuredBlockData< GPUPdfField< LatticeStorageSpecification_T > >(func, identifier); +} + +} // namespace walberla::lbm_generated \ No newline at end of file diff --git a/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.h b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.h new file mode 100644 index 0000000000000000000000000000000000000000..4a082d34196c1b7a473956f6f805a2a09b535eb3 --- 
/dev/null +++ b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.h @@ -0,0 +1,108 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file BasicRecursiveTimeStepGPU.h +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "gpu/GPUWrapper.h" +#include "gpu/communication/NonUniformGPUScheme.h" + +#include "timeloop/SweepTimeloop.h" + +#include <utility> + +#include "lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h" + +namespace walberla +{ + +using gpu::communication::NonUniformGPUScheme; + +namespace lbm_generated +{ + +/** + * + * @tparam LatticeStorageSpecification_T Generated storage specification + * @tparam SweepCollection_T LBM SweepCollection (must be able to call stream, collide, streamCollide and + * streamOnlyNoAdvancement) + * @tparam BoundaryCollection_T LBM Boundary collection (Functor that runs all boundary kernels at call) + */ +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > +class BasicRecursiveTimeStepGPU +{ + public: + using LatticeStorageSpecification_T = typename 
PdfField_T::LatticeStorageSpecification; + using Stencil = typename LatticeStorageSpecification_T::Stencil; + using CommunicationStencil = typename LatticeStorageSpecification_T::CommunicationStencil; + + using CommScheme = gpu::communication::NonUniformGPUScheme< CommunicationStencil >; + using PackInfo = lbm_generated::NonuniformGeneratedGPUPdfPackInfo< PdfField_T >; + + BasicRecursiveTimeStepGPU(std::shared_ptr< StructuredBlockForest >& sbfs, const BlockDataID& pdfFieldId, + SweepCollection_T& sweepCollection, BoundaryCollection_T& boundaryCollection, + std::shared_ptr< CommScheme >& commScheme, std::shared_ptr< PackInfo >& pdfFieldPackInfo) + : sbfs_(sbfs), pdfFieldId_(pdfFieldId), pdfFieldPackInfo_(pdfFieldPackInfo), commScheme_(commScheme), + sweepCollection_(sweepCollection), boundaryCollection_(boundaryCollection) + { +#ifndef NDEBUG + for (auto& block : *sbfs) + WALBERLA_ASSERT(block.isDataOfType< PdfField_T >(pdfFieldId_), + "Template parameter PdfField_T is of different type than BlockDataID pdfFieldId that is " + "provided as constructor argument") +#endif + maxLevel_ = sbfs->getDepth(); + + for (uint_t level = 0; level <= maxLevel_; level++) + { + std::vector< Block* > blocks; + sbfs->getBlocks(blocks, level); + blocks_.push_back(blocks); + } + }; + + ~BasicRecursiveTimeStepGPU() = default; + + void operator()() { timestep(0); }; + void addRefinementToTimeLoop(timeloop::SweepTimeloop& timeloop, uint_t level = 0); + void test(uint_t maxLevel, uint_t level = 0); + + private: + void timestep(uint_t level); + void ghostLayerPropagation(Block* block, gpuStream_t gpuStream); + std::function< void() > executeStreamCollideOnLevel(uint_t level, bool withGhostLayerPropagation = false); + + std::function< void() > executeBoundaryHandlingOnLevel(uint_t level); + + std::shared_ptr< StructuredBlockForest > sbfs_; + uint_t maxLevel_; + std::vector< std::vector< Block* > > blocks_; + + const BlockDataID pdfFieldId_; + std::shared_ptr< PackInfo > pdfFieldPackInfo_; + 
std::shared_ptr< CommScheme > commScheme_; + + SweepCollection_T& sweepCollection_; + BoundaryCollection_T& boundaryCollection_; +}; + +} // namespace lbm_generated +} // namespace walberla + +#include "lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h" diff --git a/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h new file mode 100644 index 0000000000000000000000000000000000000000..f7c5b28789d0976061190fb5367d101579cf8ded --- /dev/null +++ b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h @@ -0,0 +1,255 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file BasicRecursiveTimeStep.impl.h +//! 
\author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "BasicRecursiveTimeStepGPU.h" + +namespace walberla { +namespace lbm_generated { + +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > +void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::timestep(uint_t level) +{ + std::vector<Block *> blocks; + sbfs_->getBlocks(blocks, level); + + uint_t maxLevel = sbfs_->getDepth(); + + // 1.1 Collision + for(auto b: blocks){ + sweepCollection_.streamCollide(b); + } + + // 1.2 Recursive Descent + if(level < maxLevel){ + timestep(level + 1); + } + + // 1.3 Coarse to Fine Communication, receiving end + if(level != 0){ + commScheme_->communicateCoarseToFine(level); + } + + // 1.4 Equal-Level Communication + commScheme_->communicateEqualLevel(level); + + // 1.5 Boundary Handling and Coalescence Preparation + for(auto b : blocks){ + boundaryCollection_(b, nullptr); + if(level != maxLevel) pdfFieldPackInfo_->prepareCoalescence(b); + } + + // 1.6 Fine to Coarse Communication, receiving end + if(level < maxLevel){ + commScheme_->communicateFineToCoarse(level + 1); + } + + // Stop here if on coarsest level. + // Otherwise, continue to second subcycle. + if(level == 0) return; + + // 2.1 Collision and Ghost-Layer Propagation + for(auto b: blocks){ + ghostLayerPropagation(b); // GL-Propagation first without swapping arrays... 
+ sweepCollection_.streamCollide(b); // then Stream-Collide on interior, and swap arrays + } + + // 2.2 Recursive Descent + if(level < maxLevel){ + timestep(level + 1); + } + + // 2.4 Equal-Level Communication + commScheme_->communicateEqualLevel(level); + + // 2.5 Boundary Handling and Coalescence Preparation + for(auto b : blocks){ + boundaryCollection_(b, nullptr); + if(level != maxLevel) pdfFieldPackInfo_->prepareCoalescence(b); + } + + // 2.6 Fine to Coarse Communication, receiving end + if(level < maxLevel){ + commScheme_->communicateFineToCoarse(level + 1); + } +} + + +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > +void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::addRefinementToTimeLoop(timeloop::SweepTimeloop & timeloop, uint_t level) +{ + // 1.1 Collision + timeloop.addFuncBeforeTimeStep(executeStreamCollideOnLevel(level), "Refinement Cycle: streamCollide on level " + std::to_string(level)); + + // 1.2 Recursive Descent + if(level < maxLevel_){ + addRefinementToTimeLoop(timeloop, level + 1); + } + + // 1.3 Coarse to Fine Communication, receiving end + if(level != 0){ + timeloop.addFuncBeforeTimeStep(commScheme_->communicateCoarseToFineFunctor(level), "Refinement Cycle: communicate coarse to fine on level " + std::to_string(level)); + } + + // 1.4 Equal-Level Communication + timeloop.addFuncBeforeTimeStep(commScheme_->communicateEqualLevelFunctor(level), "Refinement Cycle: communicate equal level on level " + std::to_string(level)); + + + // 1.5 Boundary Handling and Coalescence Preparation + timeloop.addFuncBeforeTimeStep(executeBoundaryHandlingOnLevel(level), "Refinement Cycle: boundary handling on level " + std::to_string(level)); + + // 1.6 Fine to Coarse Communication, receiving end + if(level < maxLevel_){ + timeloop.addFuncBeforeTimeStep(commScheme_->communicateFineToCoarseFunctor(level + 1), "Refinement Cycle: communicate fine to coarse on level " + 
std::to_string(level + 1)); + } + + // Stop here if on coarsest level. + // Otherwise, continue to second subcycle. + if(level == 0) return; + + // 2.1 Collision and Ghost-Layer Propagation + timeloop.addFuncBeforeTimeStep(executeStreamCollideOnLevel(level, true), "Refinement Cycle: streamCollide with ghost layer propagation on level " + std::to_string(level)); + + // 2.2 Recursive Descent + if(level < maxLevel_) + addRefinementToTimeLoop(timeloop, level + 1); + + + // 2.4 Equal-Level Communication + timeloop.addFuncBeforeTimeStep(commScheme_->communicateEqualLevelFunctor(level), "Refinement Cycle: communicate equal level on level " + std::to_string(level)); + + // 2.5 Boundary Handling and Coalescence Preparation + timeloop.addFuncBeforeTimeStep(executeBoundaryHandlingOnLevel(level), "Refinement Cycle: boundary handling on level " + std::to_string(level)); + + // 2.6 Fine to Coarse Communication, receiving end + if(level < maxLevel_) + timeloop.addFuncBeforeTimeStep(commScheme_->communicateFineToCoarseFunctor(level + 1), "Refinement Cycle: communicate fine to coarse on level " + std::to_string(level + 1)); + +} + +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > +void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::test(uint_t maxLevel, uint_t level) +{ + // 1.1 Collision + WALBERLA_LOG_INFO_ON_ROOT("Refinement Cycle: streamCollide on level " + std::to_string(level)); + + // 1.2 Recursive Descent + if(level < maxLevel){ + test(maxLevel, level + 1); + } + + // 1.3 Coarse to Fine Communication, receiving end + if(level != 0){ + WALBERLA_LOG_INFO_ON_ROOT("Refinement Cycle: communicate coarse to fine on level " + std::to_string(level)); + } + + // 1.4 Equal-Level Communication + WALBERLA_LOG_INFO_ON_ROOT("Refinement Cycle: communicate equal level on level " + std::to_string(level)); + + + // 1.5 Boundary Handling and Coalescence Preparation + WALBERLA_LOG_INFO_ON_ROOT("Refinement Cycle: 
boundary handling on level " + std::to_string(level)); + + // 1.6 Fine to Coarse Communication, receiving end + if(level < maxLevel){ + WALBERLA_LOG_INFO_ON_ROOT("Refinement Cycle: communicate fine to coarse on level " + std::to_string(level + 1)); + } + + // Stop here if on coarsest level. + // Otherwise, continue to second subcycle. + if(level == 0) return; + + // 2.1 Collision and Ghost-Layer Propagation + WALBERLA_LOG_INFO_ON_ROOT("Refinement Cycle: streamCollide with ghost layer propagation on level " + std::to_string(level)); + + // 2.2 Recursive Descent + if(level < maxLevel) + test(maxLevel, level + 1); + + + // 2.4 Equal-Level Communication + WALBERLA_LOG_INFO_ON_ROOT("Refinement Cycle: communicate equal level on level " + std::to_string(level)); + + // 2.5 Boundary Handling and Coalescence Preparation + WALBERLA_LOG_INFO_ON_ROOT("Refinement Cycle: boundary handling on level " + std::to_string(level)); + + // 2.6 Fine to Coarse Communication, receiving end + if(level < maxLevel) + WALBERLA_LOG_INFO_ON_ROOT("Refinement Cycle: communicate fine to coarse on level " + std::to_string(level + 1)); + +} + + +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > +std::function<void()> BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::executeStreamCollideOnLevel(uint_t level, bool withGhostLayerPropagation) +{ + return [level, withGhostLayerPropagation, this]() + { + if (withGhostLayerPropagation) + { + for(auto b: blocks_[level]){ + ghostLayerPropagation(b, nullptr); + sweepCollection_.streamCollide(b, 0, nullptr); + } + } + else + { + for(auto b: blocks_[level]){ + sweepCollection_.streamCollide(b, 0, nullptr); + } + } + WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) + }; +} + +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > +std::function<void()> BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T 
>::executeBoundaryHandlingOnLevel(uint_t level) +{ + return [this, level]() { + for (auto b : blocks_[level]) + { + boundaryCollection_(b, nullptr); + if (level != maxLevel_) pdfFieldPackInfo_->prepareCoalescence(b, nullptr); + } + WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) + }; +} + + +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > +void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::ghostLayerPropagation( + Block * block, gpuStream_t gpuStream) +{ + auto pdfField = block->getData<PdfField_T>(pdfFieldId_); + + for(auto it = CommunicationStencil::beginNoCenter(); it != CommunicationStencil::end(); ++it){ + uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*it); + // Propagate on ghost layers shadowing coarse or no blocks + if(!block->neighborhoodSectionHasSmallerBlocks(nSecIdx)){ + CellInterval ci; + pdfField->getGhostRegion(*it, ci, 1); + sweepCollection_.streamOnlyNoAdvancementCellInterval(block, ci, gpuStream); + } + } +} + +} // namespace lbm_generated +} // namespace walberla diff --git a/src/lbm_generated/gpu/CMakeLists.txt b/src/lbm_generated/gpu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f81e5f2b370d478473f4d02d3853c469c905799f --- /dev/null +++ b/src/lbm_generated/gpu/CMakeLists.txt @@ -0,0 +1,12 @@ +target_sources( lbm_generated + PRIVATE + AddToStorage.h + BasicRecursiveTimeStepGPU.h + BasicRecursiveTimeStepGPU.impl.h + GPUPdfField.h + NonuniformGPUCommData.h + NonuniformGPUCommData.impl.h + NonuniformGeneratedGPUPdfPackInfo.h + NonuniformGeneratedGPUPdfPackInfo.impl.h + UniformGeneratedGPUPdfPackInfo.h + ) \ No newline at end of file diff --git a/src/lbm_generated/gpu/GPUPdfField.h b/src/lbm_generated/gpu/GPUPdfField.h new file mode 100644 index 0000000000000000000000000000000000000000..1a9f59a116b8c4e7c5fcb4ebd817dcb5cad0a908 --- /dev/null +++ b/src/lbm_generated/gpu/GPUPdfField.h @@ -0,0 +1,66 @@ 
+//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file GPUPdfField.h +//! \ingroup lbm_generated +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "gpu/GPUField.h" + +using namespace walberla::gpu; + +namespace walberla::lbm_generated { + +template< typename LatticeStorageSpecification_T > +class GPUPdfField : public GPUField< real_t > +{ + public: + + //** Type Definitions ********************************************************************************************** + /*! 
\name Type Definitions */ + //@{ + using LatticeStorageSpecification = LatticeStorageSpecification_T; + using Stencil = typename LatticeStorageSpecification_T::Stencil; + + using value_type = typename GPUField<real_t>::value_type; + //@} + //******************************************************************************************************************* + + GPUPdfField( uint_t _xSize, uint_t _ySize, uint_t _zSize, + const LatticeStorageSpecification_T & storageSpecification, + uint_t _nrOfGhostLayers, const Layout & _layout = zyxf, bool usePitchedMem = true ); + + + ~GPUPdfField() = default; + + protected: + LatticeStorageSpecification_T storageSpecification_; +}; + + + +template< typename LatticeStorageSpecification_T > +GPUPdfField< LatticeStorageSpecification_T >::GPUPdfField( uint_t _xSize, uint_t _ySize, uint_t _zSize, + const LatticeStorageSpecification_T & storageSpecification, + uint_t ghostLayers, const Layout & layout, bool usePitchedMem) : + GPUField< real_t>( _xSize, _ySize, _zSize, LatticeStorageSpecification_T::Stencil::Size, ghostLayers, layout, usePitchedMem ), storageSpecification_( storageSpecification ) +{ +} + +} // namespace lbm \ No newline at end of file diff --git a/src/lbm_generated/gpu/NonuniformGPUCommData.h b/src/lbm_generated/gpu/NonuniformGPUCommData.h new file mode 100644 index 0000000000000000000000000000000000000000..795a9bcb5868c156f8c42dd94057f36361ca1e3d --- /dev/null +++ b/src/lbm_generated/gpu/NonuniformGPUCommData.h @@ -0,0 +1,137 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file NonuniformGPUCommData.h +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "blockforest/StructuredBlockForest.h" +#include "blockforest/BlockDataHandling.h" + +#include "gpu/GPUWrapper.h" +#include "gpu/GPUField.h" +#include "gpu/FieldCopy.h" + +#include "domain_decomposition/IBlock.h" + +#include "field/FlagField.h" + +#include "lbm_generated/communication/NonuniformCommData.h" + +#include "stencil/Directions.h" + +#define USE_CELL_INTERVALS + +namespace walberla::lbm_generated { + +using PartialCoalescenceMaskFieldGPU = gpu::GPUField< uint32_t >; + +template< typename LatticeStorageSpecification_T > +class NonuniformGPUCommData +{ + private: + void registerFlags(); + void computeBitMask(); + void syncDataGPU(); + + public: + using Stencil = typename LatticeStorageSpecification_T::Stencil; + using CommunicationStencil = typename LatticeStorageSpecification_T::CommunicationStencil; + +#if defined(USE_CELL_INTERVALS) + NonuniformGPUCommData(IBlock* const block, uint_t xSize, uint_t ySize, uint_t zSize) + : block_(block), maskField_(xSize, ySize, zSize, 2), + maskFieldGPU_(xSize, ySize, zSize, 1, 2, field::fzyx), + interiorInterval(0, 0, 0, cell_idx_c(xSize) - 1, cell_idx_c(ySize) - 1, cell_idx_c(zSize) - 1) + { + registerFlags(); + computeBitMask(); + syncDataGPU(); + }; +#else + NonuniformCommData(IBlock* const block, const BlockDataID pdfFieldID, uint_t xSize, uint_t ySize, uint_t zSize) + : 
block_(block), pdfFieldID_(pdfFieldID), maskField_(xSize, ySize, zSize, 2) + { + registerFlags(); + computeBitMask(); + syncDataGPU(); + }; +#endif + + bool operator==(const NonuniformGPUCommData& other) { return this == &other; } + bool operator!=(const NonuniformGPUCommData& other) { return this != &other; } + + PartialCoalescenceMaskField& getMaskField() { return maskField_; } + const PartialCoalescenceMaskField& getMaskField() const { return maskField_; } + + PartialCoalescenceMaskFieldGPU& getMaskFieldGPU() { return maskFieldGPU_; } + const PartialCoalescenceMaskFieldGPU& getMaskFieldGPU() const { return maskFieldGPU_; } + + private: +#if defined(USE_CELL_INTERVALS) + void prepareIntervals(); + void setFlagOnInterval(const CellInterval & ci, const uint_t fIdx); +#else + void prepareFlags(); + void resetCornerSkippingOriginFlags(); +#endif + + void setupCornerSkippingOrigins(stencil::Direction commDir); + void setupBitMaskSlice(stencil::Direction commDir, stencil::Direction streamDir); + + bool haveSmallestIdInIntersection(Vector3<cell_idx_t> cornerDir); + + const IBlock* const block_; + PartialCoalescenceMaskField maskField_; + PartialCoalescenceMaskFieldGPU maskFieldGPU_; + +#if defined(USE_CELL_INTERVALS) + const CellInterval interiorInterval; + std::vector< CellInterval > passThroughIntervals_; + std::vector< CellInterval > cornerSkippingOriginIntervals_; +#endif +}; + + +template< typename LatticeStorageSpecification_T > +class NonuniformGPUCommDataHandling + : public blockforest::AlwaysInitializeBlockDataHandling< NonuniformGPUCommData< LatticeStorageSpecification_T > > +{ + public: + using CommmData_T = NonuniformGPUCommData< LatticeStorageSpecification_T >; + + NonuniformGPUCommDataHandling(const weak_ptr< StructuredBlockForest >& blocks) + : blocks_(blocks){}; + + CommmData_T* initialize(IBlock* const block) override + { + WALBERLA_ASSERT_NOT_NULLPTR(block) + auto blocks = blocks_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(blocks) + + return new 
CommmData_T(block, blocks->getNumberOfXCells(*block), blocks->getNumberOfYCells(*block), + blocks->getNumberOfZCells(*block)); + } + + private: + const weak_ptr< StructuredBlockStorage > blocks_; +}; + +} // walberla::lbm_generated + +#include "lbm_generated/gpu/NonuniformGPUCommData.impl.h" diff --git a/src/lbm_generated/gpu/NonuniformGPUCommData.impl.h b/src/lbm_generated/gpu/NonuniformGPUCommData.impl.h new file mode 100644 index 0000000000000000000000000000000000000000..47d6f033046b46d9d6156b6c91c0ffff6e82cf91 --- /dev/null +++ b/src/lbm_generated/gpu/NonuniformGPUCommData.impl.h @@ -0,0 +1,322 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file NonuniformGPUCommData.impl.h +//! 
\author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "blockforest/all.h" + +#include "lbm_generated/gpu/NonuniformGPUCommData.h" + +#include "stencil/Directions.h" + +#define IDX_FLAG(d) (1 << d) + +#if !defined(USE_CELL_INTERVALS) +#define INTERIOR_FLAG_BIT 29 +#define INTERIOR_FLAG (1 << INTERIOR_FLAG_BIT) + +#define PASS_THROUGH_FLAG_BIT 30 +#define PASS_THROUGH_FLAG (1 << PASS_THROUGH_FLAG_BIT) + +#define CORNER_SKIPPING_ORIGIN_FLAG_BIT 31 +#define CORNER_SKIPPING_ORIGIN_FLAG (1 << CORNER_SKIPPING_ORIGIN_FLAG_BIT) +#endif + +using namespace walberla::lbm_generated::util; + +namespace walberla::lbm_generated { + +/*********************************************************************************************************************** + * Bit Mask Computation * + **********************************************************************************************************************/ + +template< typename LatticeStorageSpecification_T > +void NonuniformGPUCommData< LatticeStorageSpecification_T >::registerFlags() +{ +#if !defined(USE_CELL_INTERVALS) + maskField_.registerFlag(FlagUID(true), INTERIOR_FLAG_BIT); + maskField_.registerFlag(FlagUID(true), PASS_THROUGH_FLAG_BIT); + maskField_.registerFlag(FlagUID(true), CORNER_SKIPPING_ORIGIN_FLAG_BIT); +#endif + + for(auto it = Stencil::beginNoCenter(); it != Stencil::end(); ++it){ + maskField_.registerFlag(FlagUID(true), Stencil::idx[*it]); + } +} + +#if defined(USE_CELL_INTERVALS) + +template< typename LatticeStorageSpecification_T > +inline void NonuniformGPUCommData< LatticeStorageSpecification_T >::prepareIntervals() +{ + passThroughIntervals_.clear(); + const Block * b = dynamic_cast< const Block * >(block_); + + for(auto commDir = CommunicationStencil::beginNoCenter(); commDir != CommunicationStencil::end(); ++commDir){ + uint_t nSecIdx = 
blockforest::getBlockNeighborhoodSectionIndex(*commDir); + if(!b->neighborhoodSectionHasEquallySizedBlock(nSecIdx)){ + CellInterval ci; + maskField_.getGhostRegion(*commDir, ci, 2); + passThroughIntervals_.push_back(ci); + } + } +} + +template< typename LatticeStorageSpecification_T > +inline void NonuniformGPUCommData< LatticeStorageSpecification_T >::setFlagOnInterval(const CellInterval & ci, + const uint_t fIdx) +{ + for(auto c : ci){ + maskField_.addFlag(c, IDX_FLAG(fIdx)); + } +} + +#else + +/** + * Prepares the INTERIOR and PASS_THROUGH flags. + * Sets the domain interior to INTERIOR. Sets any ghost layers corresponding to a coarse block + * or no block to PASS_THROUGH. + */ +template< typename LatticeStorageSpecification_T > +void NonuniformGPUCommData< LatticeStorageSpecification_T >::prepareFlags() +{ + const Block * b = dynamic_cast< const Block * >(block_); + + // Set interior to origin + for (auto it = maskField_.beginXYZ(); it != maskField_.end(); ++it) + { + maskField_.addFlag(it.cell(), INTERIOR_FLAG); + } + + // Set GLs to pass-through + for(auto commDir = CommunicationStencil::beginNoCenter(); commDir != CommunicationStencil::end(); ++commDir){ + uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*commDir); + if(!b->neighborhoodSectionHasEquallySizedBlock(nSecIdx)){ + for(auto it = maskField_.beginGhostLayerOnlyXYZ(2, *commDir); it != maskField_.end(); ++it){ + maskField_.addFlag(it.cell(), PASS_THROUGH_FLAG); + } + } + } +} + +/** + * Resets the origin flag on any ghost layers. 
+ */ +template< typename LatticeStorageSpecification_T > +inline void NonuniformGPUCommData< LatticeStorageSpecification_T >::resetCornerSkippingOriginFlags() +{ + const Block * b = dynamic_cast< const Block * >(block_); + + // Remove origin flag from any ghost layers + for(auto commDir = CommunicationStencil::beginNoCenter(); commDir != CommunicationStencil::end(); ++commDir){ + uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*commDir); + if(!b->neighborhoodSectionHasEquallySizedBlock(nSecIdx)){ + for(auto it = maskField_.beginGhostLayerOnlyXYZ(2, *commDir); it != maskField_.end(); ++it){ + maskField_.removeFlag(it.cell(), CORNER_SKIPPING_ORIGIN_FLAG); + } + } + } +} + +#endif + + +/** + * Determines whether the current block has the smallest BlockID among all fine blocks of a + * given intersection volume. + * @tparam LatticeStorageSpecification_T + * @param cornerDir + * @return + */ +template< typename LatticeStorageSpecification_T > +inline bool NonuniformGPUCommData< LatticeStorageSpecification_T >::haveSmallestIdInIntersection(Vector3<cell_idx_t> cornerDir) +{ + const IBlockID& myId = block_->getId(); + const Block* b = dynamic_cast< const Block* >(block_); + return forEachSubdirectionCancel(cornerDir, [&](Vector3< cell_idx_t > dirVec) { + const uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(dirVec[0], dirVec[1], dirVec[2]); + if (b->neighborhoodSectionHasEquallySizedBlock(nSecIdx)) + { + if (b->getNeighbor(nSecIdx, 0).getId() < myId) return false; + } + return true; + }); +} + + +/** + * Sets up the feasible space for the given communication direction. + * Additionally to the field interior, marks every ghost layer slice corresponding to an adjacent coarse block, + * and the corresponding corner as feasible, if that corner also belongs to a coarse block and the current block + * has the smallest BlockID participating in the intersection. 
+ * @param commDir A communication direction pointing toward an adjacent coarse block + */ +template< typename LatticeStorageSpecification_T > +inline void NonuniformGPUCommData< LatticeStorageSpecification_T >::setupCornerSkippingOrigins(stencil::Direction commDir) +{ +#if defined(USE_CELL_INTERVALS) + cornerSkippingOriginIntervals_.clear(); +#else + resetCornerSkippingOriginFlags(); +#endif + + const Block* b = dynamic_cast< const Block* >(block_); + Vector3<cell_idx_t> commDirVec(stencil::cx[commDir], stencil::cy[commDir], stencil::cz[commDir]); + + // Iterate all orthogonal comm directions + forEachOrthogonalDirection< CommunicationStencil >(commDirVec, [&](Vector3< cell_idx_t > toSourceVec) { + const uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(toSourceVec[0], toSourceVec[1], toSourceVec[2]); + // Find if there is a coarse block or no block at all in this neighborhood + // There are three possibilities: Coarse block, Same-level block or no block + // Finer block is not possible because of 2:1 balance + if (!b->neighborhoodSectionHasEquallySizedBlock(nSecIdx)) + { + // From this adjacent coarse block (or not-block, for boundary handling), corner skipping must be handled. + // Also, if there is no block, boundary handling in that region must be done on only + // one of the participating fine blocks. + Vector3< cell_idx_t > cornerDirVec = toSourceVec + commDirVec; + + // If the current block has the smallest participating ID... + if (haveSmallestIdInIntersection(cornerDirVec)) + { + const stencil::Direction toSourceDir = stencil::vectorToDirection(toSourceVec); + + // ... Mark source GL region as corner skipping origin. 
+#if defined(USE_CELL_INTERVALS) + CellInterval ci; + maskField_.getGhostRegion(toSourceDir, ci, 2); + cornerSkippingOriginIntervals_.push_back(ci); +#else + for (auto it = maskField_.beginGhostLayerOnlyXYZ(toSourceDir); it != maskField_.end(); ++it) + { + maskField_.addFlag(it.cell(), CORNER_SKIPPING_ORIGIN_FLAG); + } +#endif + } + } + }); +} + + +template< typename LatticeStorageSpecification_T > +inline void NonuniformGPUCommData< LatticeStorageSpecification_T >::setupBitMaskSlice(stencil::Direction commDir, stencil::Direction streamDir) +{ + uint_t fIdx = Stencil::idx[streamDir]; + Cell streamVec(stencil::cx[streamDir], stencil::cy[streamDir], stencil::cz[streamDir]); + +#if defined(USE_CELL_INTERVALS) + CellInterval commSliceInterval; + maskField_.getGhostRegion(commDir, commSliceInterval, 2); + + // Shift back once + commSliceInterval.shift(-streamVec); + + // Intersect with interior and set flag on intersection volume + CellInterval interiorIntersection(interiorInterval); + interiorIntersection.intersect(commSliceInterval); + if(!interiorIntersection.empty()){ + interiorIntersection.shift(streamVec); + setFlagOnInterval(interiorIntersection, fIdx); + } + + // Intersect with pass-through regions... + for(auto passThroughIntersection : std::as_const(passThroughIntervals_)){ + passThroughIntersection.intersect(commSliceInterval); + if(passThroughIntersection.empty()) continue; + + // ... shift back once more ... + passThroughIntersection.shift(-streamVec); + + // ... intersect with interior ... + interiorIntersection = interiorInterval; + interiorIntersection.intersect(passThroughIntersection); + if(!interiorIntersection.empty()){ + interiorIntersection.shift(2*streamVec.x(), 2* streamVec.y(), 2*streamVec.z()); + setFlagOnInterval(interiorIntersection, fIdx); + } + + // ... 
and with corner-skipping origin regions + for(auto originIntersection : std::as_const(cornerSkippingOriginIntervals_)){ + originIntersection.intersect(passThroughIntersection); + if(!originIntersection.empty()){ + originIntersection.shift(2*streamVec.x(), 2* streamVec.y(), 2*streamVec.z()); + setFlagOnInterval(originIntersection, fIdx); + } + } + } +#else + for(auto it = maskField_.beginGhostLayerOnlyXYZ(2, commDir); it != maskField_.end(); ++it){ + Cell currentCell = it.cell(); + + // Shift back once + Cell shiftedCell = currentCell - streamVec; + + if (maskField_.isFlagSet(shiftedCell, INTERIOR_FLAG)){ + maskField_.addFlag(currentCell, IDX_FLAG(fIdx)); + } + else if (maskField_.isFlagSet(shiftedCell, PASS_THROUGH_FLAG)){ + // Shift back twice + shiftedCell -= streamVec; + if (maskField_.isPartOfMaskSet(shiftedCell, INTERIOR_FLAG | CORNER_SKIPPING_ORIGIN_FLAG)){ + maskField_.addFlag(currentCell, IDX_FLAG(fIdx)); + } + + } + // else continue; + } +#endif +} + +/** + * Computes the partial coalescence bit mask on the mask field. + * Assumes that all flags are already registered at the field, and that the field + * has been initialized to zero. 
+ */ +template< typename LatticeStorageSpecification_T > +void NonuniformGPUCommData< LatticeStorageSpecification_T >::computeBitMask() +{ +#if defined(USE_CELL_INTERVALS) + prepareIntervals(); +#else + prepareFlags(); +#endif + + const Block* b = dynamic_cast< const Block* >(block_); + for(auto commIt = CommunicationStencil::beginNoCenter(); commIt != CommunicationStencil::end(); ++commIt){ + stencil::Direction commDir = *commIt; + const uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(commDir); + if(b->neighborhoodSectionHasLargerBlock(nSecIdx)){ + setupCornerSkippingOrigins(commDir); + + for(uint_t streamDirIdx = 0; streamDirIdx < Stencil::d_per_d_length[commDir]; streamDirIdx++){ + stencil::Direction streamDir = Stencil::d_per_d[commDir][streamDirIdx]; + setupBitMaskSlice(commDir, streamDir); + } + } + } +} + +template< typename LatticeStorageSpecification_T > +void NonuniformGPUCommData< LatticeStorageSpecification_T >::syncDataGPU() +{ + gpu::fieldCpy(maskFieldGPU_, maskField_); +} +} // walberla::lbm_generated diff --git a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h new file mode 100644 index 0000000000000000000000000000000000000000..d6ac87010a6889b899380514ec51d717159bd6f8 --- /dev/null +++ b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h @@ -0,0 +1,332 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file NonuniformGeneratedGPUPdfPackInfo.h +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" + +#include "gpu/GPUWrapper.h" +#include "gpu/communication/GeneratedNonUniformGPUPackInfo.h" + +#include "lbm_generated/gpu/NonuniformGPUCommData.h" +#include "lbm_generated/field/PdfField.h" + +namespace walberla::lbm_generated +{ +using stencil::Direction; + +namespace internal +{ +/* + * Base Template for Packing Kernels Wrapper. This wrapper is required for passing the time step to + * kernels generated for in-place streaming patterns. The generated code should not be templated. + */ +template< typename PdfField_T, bool inplace > +class NonuniformGPUPackingKernelsWrapper +{ + public: + void packAll(PdfField_T* srcField, CellInterval ci, unsigned char* outBuffer, gpuStream_t stream = nullptr) const = 0; + void unpackAll(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, gpuStream_t stream = nullptr) const = 0; + void localCopyAll(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField, + CellInterval dstInterval, gpuStream_t stream = nullptr) const = 0; + + void packDirection(PdfField_T* srcField, CellInterval ci, unsigned char* outBuffer, Direction dir, gpuStream_t stream = nullptr) const = 0; + void unpackDirection(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, Direction dir, gpuStream_t stream = nullptr) const = 0; + void localCopyDirection(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField, + CellInterval dstInterval, Direction dir, gpuStream_t stream) const = 0; + + void unpackRedistribute(PdfField_T* dstField, 
CellInterval ci, unsigned char* inBuffer, + stencil::Direction dir, gpuStream_t stream = nullptr) const = 0; + + void packPartialCoalescence(PdfField_T* srcField, PartialCoalescenceMaskFieldGPU* maskField, CellInterval ci, + unsigned char* outBuffer, Direction dir, gpuStream_t stream = nullptr) const = 0; + void zeroCoalescenceRegion(PdfField_T* dstField, CellInterval ci, Direction dir) const = 0; + void unpackCoalescence(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, Direction dir, gpuStream_t stream = nullptr) const = 0; + + uint_t size(CellInterval ci, Direction dir) const = 0; + uint_t size(CellInterval ci) const = 0; + uint_t redistributeSize(CellInterval ci) const = 0; + uint_t partialCoalescenceSize(CellInterval ci, Direction dir) const = 0; +}; + +/* + * Template Specialization for two-fields patterns, with trivial method wrappers. + */ +template< typename PdfField_T > +class NonuniformGPUPackingKernelsWrapper< PdfField_T, false > +{ + public: + using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; + using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels; + + void packAll(PdfField_T* srcField, CellInterval ci, unsigned char* outBuffer, gpuStream_t stream = nullptr) const + { + kernels_.packAll(srcField, ci, outBuffer, stream); + } + + void unpackAll(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, gpuStream_t stream = nullptr) const + { + kernels_.unpackAll(dstField, ci, inBuffer, stream); + } + + void localCopyAll(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField, + CellInterval dstInterval, gpuStream_t stream = nullptr) const + { + kernels_.localCopyAll(srcField, srcInterval, dstField, dstInterval, stream); + } + + void packDirection(PdfField_T* srcField, CellInterval ci, unsigned char* outBuffer, Direction dir, gpuStream_t stream = nullptr) const + { + kernels_.packDirection(srcField, ci, outBuffer, dir, stream); + } + + void 
unpackDirection(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, Direction dir, gpuStream_t stream = nullptr) const + { + kernels_.unpackDirection(dstField, ci, inBuffer, dir, stream); + } + + void localCopyDirection(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField, + CellInterval dstInterval, Direction dir, gpuStream_t stream) const + { + kernels_.localCopyDirection(srcField, srcInterval, dstField, dstInterval, dir, stream); + } + + void unpackRedistribute(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, + stencil::Direction dir, gpuStream_t stream = nullptr) const + { + kernels_.unpackRedistribute(dstField, ci, inBuffer, dir, stream); + } + + void packPartialCoalescence(PdfField_T* srcField, PartialCoalescenceMaskFieldGPU* maskField, CellInterval ci, + unsigned char* outBuffer, Direction dir, gpuStream_t stream = nullptr) const + { + kernels_.packPartialCoalescence(srcField, maskField, ci, outBuffer, dir, stream); + } + + void unpackCoalescence(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, Direction dir, gpuStream_t stream = nullptr) const + { + kernels_.unpackCoalescence(dstField, ci, inBuffer, dir, stream); + } + + void zeroCoalescenceRegion(PdfField_T* dstField, CellInterval ci, Direction dir, gpuStream_t stream = nullptr) const + { + kernels_.zeroCoalescenceRegion(dstField, ci, dir, stream); + } + + uint_t size(CellInterval ci, Direction dir) const { return kernels_.size(ci, dir); } + uint_t size(CellInterval ci) const { return kernels_.size(ci); } + uint_t redistributeSize(CellInterval ci) const { return kernels_.redistributeSize(ci); } + uint_t partialCoalescenceSize(CellInterval ci, Direction dir) const + { + return kernels_.partialCoalescenceSize(ci, dir); + } + + private: + PackingKernels_T kernels_; +}; + +/* + * Template Specialization for in-place patterns, extracting the timestep from the lattice model. 
 */
template< typename PdfField_T >
class NonuniformGPUPackingKernelsWrapper< PdfField_T, true >
{
 public:
   using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification;
   using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels;

   // Each wrapper reads the current timestep from the field and forwards it to the
   // generated kernel; for local copies both fields must agree on the timestep.
   void packAll(PdfField_T* srcField, CellInterval ci, unsigned char* outBuffer, gpuStream_t stream = nullptr) const
   {
      uint8_t timestep = srcField->getTimestep();
      kernels_.packAll(srcField, ci, outBuffer, timestep, stream);
   }

   void unpackAll(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, gpuStream_t stream = nullptr) const
   {
      uint8_t timestep = dstField->getTimestep();
      kernels_.unpackAll(dstField, ci, inBuffer, timestep, stream);
   }

   void localCopyAll(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField,
                     CellInterval dstInterval, gpuStream_t stream = nullptr) const
   {
      uint8_t timestep = srcField->getTimestep();
      WALBERLA_ASSERT_EQUAL(timestep, dstField->getTimestep())
      kernels_.localCopyAll(srcField, srcInterval, dstField, dstInterval, timestep, stream);
   }

   void packDirection(PdfField_T* srcField, CellInterval ci, unsigned char* outBuffer, Direction dir, gpuStream_t stream = nullptr) const
   {
      uint8_t timestep = srcField->getTimestep();
      kernels_.packDirection(srcField, ci, outBuffer, dir, timestep, stream);
   }

   void unpackDirection(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, Direction dir, gpuStream_t stream = nullptr) const
   {
      uint8_t timestep = dstField->getTimestep();
      kernels_.unpackDirection(dstField, ci, inBuffer, dir, timestep, stream);
   }

   void localCopyDirection(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField,
                           CellInterval dstInterval, Direction dir, gpuStream_t stream) const
   {
      uint8_t timestep = srcField->getTimestep();
      WALBERLA_ASSERT_EQUAL(timestep, dstField->getTimestep())
      kernels_.localCopyDirection(srcField, srcInterval, dstField, dstInterval, dir, timestep, stream);
   }

   void unpackRedistribute(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer,
                           stencil::Direction dir, gpuStream_t stream = nullptr) const
   {
      uint8_t timestep = dstField->getTimestep();
      kernels_.unpackRedistribute(dstField, ci, inBuffer, dir, timestep, stream);
   }

   void packPartialCoalescence(PdfField_T* srcField, PartialCoalescenceMaskFieldGPU* maskField, CellInterval ci,
                               unsigned char* outBuffer, Direction dir, gpuStream_t stream = nullptr) const
   {
      uint8_t timestep = srcField->getTimestep();
      kernels_.packPartialCoalescence(srcField, maskField, ci, outBuffer, dir, timestep, stream);
   }

   void zeroCoalescenceRegion(PdfField_T* dstField, CellInterval ci, Direction dir, gpuStream_t stream = nullptr) const
   {
      uint8_t timestep = dstField->getTimestep();
      kernels_.zeroCoalescenceRegion(dstField, ci, dir, timestep, stream);
   }

   void unpackCoalescence(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, Direction dir, gpuStream_t stream = nullptr) const
   {
      uint8_t timestep = dstField->getTimestep();
      kernels_.unpackCoalescence(dstField, ci, inBuffer, dir, timestep, stream);
   }

   uint_t size(CellInterval ci, Direction dir) const { return kernels_.size(ci, dir); }
   uint_t size(CellInterval ci) const { return kernels_.size(ci); }
   uint_t redistributeSize(CellInterval ci) const { return kernels_.redistributeSize(ci); }
   uint_t partialCoalescenceSize(CellInterval ci, Direction dir) const
   {
      return kernels_.partialCoalescenceSize(ci, dir);
   }

 private:
   PackingKernels_T kernels_;
};
} // namespace internal

/***********************************************************************************************************************
 *                                                  Class Declaration                                                  *
 **********************************************************************************************************************/

// Nonuniform (grid-refined) GPU pack info for generated LBM kernels.
// Handles equal-level, coarse-to-fine and fine-to-coarse communication of PDF data.
template< typename PdfField_T >
class NonuniformGeneratedGPUPdfPackInfo : public walberla::gpu::GeneratedNonUniformGPUPackInfo
{
 public:
   using VoidFunction = std::function< void(gpuStream_t) >;
   using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification;
   using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels;
   using Stencil = typename LatticeStorageSpecification_T::Stencil;
   using CommunicationStencil = typename LatticeStorageSpecification_T::CommunicationStencil;
   using CommData_T = NonuniformGPUCommData< LatticeStorageSpecification_T >;

   NonuniformGeneratedGPUPdfPackInfo(const BlockDataID pdfFieldID, const BlockDataID commDataID)
      : pdfFieldID_(pdfFieldID), commDataID_(commDataID){};

   bool constantDataExchange() const override { return true; };
   bool threadsafeReceiving() const override { return false; };

   /// Equal Level
   void unpackDataEqualLevel(Block* receiver, Direction dir, GpuBuffer_T& buffer) override;
   void communicateLocalEqualLevel(const Block* sender, Block* receiver, stencil::Direction dir,
                                   gpuStream_t stream) override;
   void getLocalEqualLevelCommFunction(std::vector< VoidFunction >& commFunctions, const Block* sender, Block* receiver,
                                       stencil::Direction dir) override;

   /// Coarse to Fine
   void unpackDataCoarseToFine(Block* fineReceiver, const BlockID& coarseSender, stencil::Direction dir,
                               GpuBuffer_T& buffer) override;
   void communicateLocalCoarseToFine(const Block* coarseSender, Block* fineReceiver, stencil::Direction dir) override;
   void communicateLocalCoarseToFine(const Block* coarseSender, Block* fineReceiver, stencil::Direction dir,
                                     GpuBuffer_T& buffer, gpuStream_t stream) override;
   void getLocalCoarseToFineCommFunction(std::vector< VoidFunction >& commFunctions, const Block* coarseSender,
                                         Block* fineReceiver, stencil::Direction dir, GpuBuffer_T& buffer) override;

   /// Fine to Coarse
   // Zeroes the coalescence regions on a coarse block before fine contributions are accumulated.
   void prepareCoalescence(Block* coarseReceiver, gpuStream_t gpuStream = nullptr);
   void unpackDataFineToCoarse(Block* coarseReceiver, const BlockID& fineSender, stencil::Direction dir,
                               GpuBuffer_T& buffer) override;

   void communicateLocalFineToCoarse(const Block* fineSender, Block* coarseReceiver, stencil::Direction dir) override;
   void communicateLocalFineToCoarse(const Block* fineSender, Block* coarseReceiver, stencil::Direction dir,
                                     GpuBuffer_T& buffer, gpuStream_t stream) override;
   void getLocalFineToCoarseCommFunction(std::vector< VoidFunction >& commFunctions, const Block* fineSender,
                                         Block* coarseReceiver, stencil::Direction dir, GpuBuffer_T& buffer) override;

   uint_t sizeEqualLevelSend(const Block* sender, stencil::Direction dir) override;
   uint_t sizeCoarseToFineSend(const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir) override;
   uint_t sizeFineToCoarseSend(const Block* fineSender, stencil::Direction dir) override;

 protected:
   void packDataEqualLevelImpl(const Block* sender, stencil::Direction dir, GpuBuffer_T& buffer) const override;

   void packDataCoarseToFineImpl(const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir,
                                 GpuBuffer_T& buffer) const override;
   void packDataFineToCoarseImpl(const Block* fineSender, const BlockID& coarseReceiver, stencil::Direction dir,
                                 GpuBuffer_T& buffer) const override;

 private:
   /// Helper Functions
   /// As in PdfFieldPackInfo.h
   Vector3< cell_idx_t > getNeighborShift(const BlockID& fineBlock, stencil::Direction dir) const;
   bool areNeighborsInDirection(const Block* block, const BlockID& neighborID,
                                const Vector3< cell_idx_t > dirVec) const;

   CellInterval intervalHullInDirection(const CellInterval& ci, const Vector3< cell_idx_t > tangentialDir,
                                        cell_idx_t width) const;
   bool skipsThroughCoarseBlock(const Block* block, const Direction dir) const;

   void getCoarseBlockCommIntervals(const BlockID& fineBlockID, const Direction dir, const PdfField_T* field,
                                    std::vector< std::pair< Direction, CellInterval > >& intervals) const;
   void getFineBlockCommIntervals(const BlockID& fineBlockID, const Direction dir, const PdfField_T* field,
                                  std::vector< std::pair< Direction, CellInterval > >& intervals) const;

   CellInterval getCoarseBlockCoalescenceInterval(const Block* coarseBlock, const BlockID& fineBlockID, Direction dir,
                                                  const PdfField_T* field) const;

   const BlockDataID pdfFieldID_;
   internal::NonuniformGPUPackingKernelsWrapper< PdfField_T, LatticeStorageSpecification_T::inplace > kernels_;

 public:
   // NOTE(review): intentionally public so setup code can reach the communication data id.
   const BlockDataID commDataID_;
};

/***********************************************************************************************************************
 *                                                  Factory Functions                                                  *
 **********************************************************************************************************************/

template< typename PdfField_T >
std::shared_ptr< NonuniformGeneratedGPUPdfPackInfo< PdfField_T > >
   setupNonuniformGPUPdfCommunication(const std::weak_ptr< StructuredBlockForest >& blocks,
                                      const BlockDataID pdfFieldID,
                                      const std::string& dataIdentifier = "NonuniformGPUCommData");

} // namespace walberla::lbm_generated

#include "lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h"
diff --git a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..adfbb419a8d3a3c82217fecf974977b28bb2a19b
--- /dev/null
+++ b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h
@@ -0,0 +1,713 @@
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file NonuniformGeneratedGPUPdfPackInfo.impl.h +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "NonuniformGeneratedGPUPdfPackInfo.h" + +using namespace walberla::lbm_generated::util; + +namespace walberla::lbm_generated { + +/*********************************************************************************************************************** + * Factory Functions * + **********************************************************************************************************************/ + + +/** + * Sets up a NonuniformGeneratedPdfPackInfo. 
+ * + * @tparam LatticeStorageSpecification_T + * @tparam PackingKernels_T + * @param blocks + * @param pdfFieldID + * @param dataIdentifier + * @return + */ +template< typename PdfField_T> +std::shared_ptr< NonuniformGeneratedGPUPdfPackInfo< PdfField_T > > + setupNonuniformGPUPdfCommunication( const std::weak_ptr< StructuredBlockForest > & blocks, + const BlockDataID pdfFieldID, + const std::string & dataIdentifier) +{ + using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; + + auto sbf = blocks.lock(); + WALBERLA_CHECK_NOT_NULLPTR(sbf, "Trying to create Nonuniform GPU Packinfo for a block storage object that doesn't exist anymore" ); + + auto handling = std::make_shared<NonuniformGPUCommDataHandling< LatticeStorageSpecification_T > >(blocks); + BlockDataID commDataID = sbf->addBlockData(handling, dataIdentifier); + + return std::make_shared<NonuniformGeneratedGPUPdfPackInfo< PdfField_T > >(pdfFieldID, commDataID); +} + + +/*********************************************************************************************************************** + * Equal Level Communication * + **********************************************************************************************************************/ + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::unpackDataEqualLevel(Block* receiver, + Direction dir, + GpuBuffer_T & buffer) +{ + auto field = receiver->getData< PdfField_T >(pdfFieldID_); + CellInterval ci; + cell_idx_t gls = skipsThroughCoarseBlock(receiver, dir) ? 
2 : 1; + field->getGhostRegion(dir, ci, gls, false); + uint_t size = kernels_.size(ci, dir); + auto bufferPtr = buffer.advanceNoResize(size); + kernels_.unpackDirection(field, ci, bufferPtr, dir); +} + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalEqualLevel( + const Block* sender, Block* receiver, stencil::Direction dir, gpuStream_t stream) +{ + auto srcField = const_cast< Block* >(sender)->getData< PdfField_T >(pdfFieldID_); + auto dstField = receiver->getData< PdfField_T >(pdfFieldID_); + + CellInterval srcRegion; + CellInterval dstRegion; + cell_idx_t gls = skipsThroughCoarseBlock(sender, dir) ? 2 : 1; + srcField->getSliceBeforeGhostLayer(dir, srcRegion, gls, false); + dstField->getGhostRegion(stencil::inverseDir[dir], dstRegion, gls, false); + kernels_.localCopyDirection(srcField, srcRegion, dstField, dstRegion, dir, stream); +} + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::getLocalEqualLevelCommFunction( + std::vector< VoidFunction >& commFunctions, const Block* sender, Block* receiver, + stencil::Direction dir) +{ + auto srcField = const_cast< Block* >(sender)->getData< PdfField_T >(pdfFieldID_); + auto dstField = receiver->getData< PdfField_T >(pdfFieldID_); + + CellInterval srcRegion; + CellInterval dstRegion; + cell_idx_t gls = skipsThroughCoarseBlock(sender, dir) ? 
2 : 1; + srcField->getSliceBeforeGhostLayer(dir, srcRegion, gls, false); + dstField->getGhostRegion(stencil::inverseDir[dir], dstRegion, gls, false); + +// VoidFunction t = std::bind(kernels_.localCopyDirection, +// srcField, srcRegion, dstField, dstRegion, dir, std::placeholders::_1 ); + +// CellInterval test(srcRegion.min(), srcRegion.max()); +// CellInterval test2(dstRegion.min(), dstRegion.max()); + + + auto commFunction = [this, srcField, srcRegion, dstField, dstRegion, dir](gpuStream_t gpuStream) + { + kernels_.localCopyDirection(srcField, srcRegion, dstField, dstRegion, dir, gpuStream); + }; + commFunctions.emplace_back(commFunction); +} + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::packDataEqualLevelImpl( + const Block* sender, stencil::Direction dir, GpuBuffer_T & buffer) const +{ + auto field = const_cast< Block* >(sender)->getData< PdfField_T >(pdfFieldID_); + CellInterval ci; + cell_idx_t gls = skipsThroughCoarseBlock(sender, dir) ? 
2 : 1; + field->getSliceBeforeGhostLayer(dir, ci, gls, false); + uint_t size = kernels_.size(ci, dir); + auto bufferPtr = buffer.advanceNoResize(size); + kernels_.packDirection(field, ci, bufferPtr, dir); +} + +/*********************************************************************************************************************** + * Coarse to Fine Communication * + **********************************************************************************************************************/ + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::packDataCoarseToFineImpl( + const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer) const +{ + auto field = const_cast< Block* >(coarseSender)->getData< PdfField_T >(pdfFieldID_); + + std::vector< std::pair< Direction, CellInterval > > intervals; + getCoarseBlockCommIntervals(fineReceiver, dir, field, intervals); + + for (auto t : intervals) + { + CellInterval ci = t.second; + uint_t size = kernels_.size(ci); + auto bufferPtr = buffer.advanceNoResize(size); + kernels_.packAll(field, ci, bufferPtr); + } +} + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::unpackDataCoarseToFine( + Block* fineReceiver, const BlockID& /*coarseSender*/, stencil::Direction dir, GpuBuffer_T & buffer) +{ + auto field = fineReceiver->getData< PdfField_T >(pdfFieldID_); + + std::vector< std::pair< Direction, CellInterval > > intervals; + getFineBlockCommIntervals(fineReceiver->getId(), dir, field, intervals); + + for (auto t : intervals) + { + Direction d = t.first; + CellInterval ci = t.second; + uint_t size = kernels_.redistributeSize(ci); + auto bufferPtr = buffer.advanceNoResize(size); + kernels_.unpackRedistribute(field, ci, bufferPtr, d); + } +} + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalCoarseToFine( + const Block* coarseSender, Block* fineReceiver, stencil::Direction dir) 
+{ + auto srcField = const_cast< Block* >(coarseSender)->getData< PdfField_T >(pdfFieldID_); + auto dstField = fineReceiver->getData< PdfField_T >(pdfFieldID_); + + std::vector< std::pair< Direction, CellInterval > > srcIntervals; + getCoarseBlockCommIntervals(fineReceiver->getId(), dir, srcField, srcIntervals); + + std::vector< std::pair< Direction, CellInterval > > dstIntervals; + getFineBlockCommIntervals(fineReceiver->getId(), stencil::inverseDir[dir], dstField, dstIntervals); + + WALBERLA_ASSERT_EQUAL(srcIntervals.size(), dstIntervals.size()) + + for(size_t index = 0; index < srcIntervals.size(); index++) + { + CellInterval srcInterval = srcIntervals[index].second; + + Direction const unpackDir = dstIntervals[index].first; + CellInterval dstInterval = dstIntervals[index].second; + + uint_t packSize = kernels_.size(srcInterval); + +#ifndef NDEBUG + Direction const packDir = srcIntervals[index].first; + WALBERLA_ASSERT_EQUAL(packDir, stencil::inverseDir[unpackDir]) + uint_t unpackSize = kernels_.redistributeSize(dstInterval); + WALBERLA_ASSERT_EQUAL(packSize, unpackSize) +#endif + + // TODO: This is a dirty workaround. Code-generate direct redistribution! 
+ unsigned char *buffer; + WALBERLA_GPU_CHECK( gpuMalloc( &buffer, packSize)) + kernels_.packAll(srcField, srcInterval, buffer); + kernels_.unpackRedistribute(dstField, dstInterval, buffer, unpackDir); + WALBERLA_GPU_CHECK(gpuFree(buffer)) + } +} + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalCoarseToFine( + const Block* coarseSender, Block* fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) +{ + auto srcField = const_cast< Block* >(coarseSender)->getData< PdfField_T >(pdfFieldID_); + auto dstField = fineReceiver->getData< PdfField_T >(pdfFieldID_); + + std::vector< std::pair< Direction, CellInterval > > srcIntervals; + getCoarseBlockCommIntervals(fineReceiver->getId(), dir, srcField, srcIntervals); + + std::vector< std::pair< Direction, CellInterval > > dstIntervals; + getFineBlockCommIntervals(fineReceiver->getId(), stencil::inverseDir[dir], dstField, dstIntervals); + + WALBERLA_ASSERT_EQUAL(srcIntervals.size(), dstIntervals.size()) + + for(size_t index = 0; index < srcIntervals.size(); index++) + { + CellInterval srcInterval = srcIntervals[index].second; + + Direction const unpackDir = dstIntervals[index].first; + CellInterval dstInterval = dstIntervals[index].second; + + uint_t packSize = kernels_.size(srcInterval); + +#ifndef NDEBUG + Direction const packDir = srcIntervals[index].first; + WALBERLA_ASSERT_EQUAL(packDir, stencil::inverseDir[unpackDir]) + uint_t unpackSize = kernels_.redistributeSize(dstInterval); + WALBERLA_ASSERT_EQUAL(packSize, unpackSize) +#endif + + auto bufferPtr = buffer.advanceNoResize(packSize); + kernels_.packAll(srcField, srcInterval, bufferPtr, stream); + kernels_.unpackRedistribute(dstField, dstInterval, bufferPtr, unpackDir, stream); + } +} + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::getLocalCoarseToFineCommFunction( + std::vector< VoidFunction >& commFunctions, + const Block* coarseSender, Block* 
fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer) +{ + auto srcField = const_cast< Block* >(coarseSender)->getData< PdfField_T >(pdfFieldID_); + auto dstField = fineReceiver->getData< PdfField_T >(pdfFieldID_); + + std::vector< std::pair< Direction, CellInterval > > srcIntervals; + getCoarseBlockCommIntervals(fineReceiver->getId(), dir, srcField, srcIntervals); + + std::vector< std::pair< Direction, CellInterval > > dstIntervals; + getFineBlockCommIntervals(fineReceiver->getId(), stencil::inverseDir[dir], dstField, dstIntervals); + + WALBERLA_ASSERT_EQUAL(srcIntervals.size(), dstIntervals.size()) + + for(size_t index = 0; index < srcIntervals.size(); index++) + { + CellInterval srcInterval = srcIntervals[index].second; + + Direction const unpackDir = dstIntervals[index].first; + CellInterval dstInterval = dstIntervals[index].second; + + uint_t packSize = kernels_.size(srcInterval); + +#ifndef NDEBUG + Direction const packDir = srcIntervals[index].first; + WALBERLA_ASSERT_EQUAL(packDir, stencil::inverseDir[unpackDir]) + uint_t unpackSize = kernels_.redistributeSize(dstInterval); + WALBERLA_ASSERT_EQUAL(packSize, unpackSize) +#endif + + auto bufferPtr = buffer.advanceNoResize(packSize); + + auto commFunction = [this, srcField, srcInterval, bufferPtr, dstField, dstInterval, unpackDir](gpuStream_t gpuStream) + { + kernels_.packAll(srcField, srcInterval, bufferPtr, gpuStream); + kernels_.unpackRedistribute(dstField, dstInterval, bufferPtr, unpackDir, gpuStream); + }; + commFunctions.emplace_back(commFunction); + } +} + + + +/*********************************************************************************************************************** + * Fine to Coarse Communication * + **********************************************************************************************************************/ + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::prepareCoalescence(Block* coarseReceiver, gpuStream_t gpuStream) +{ + auto 
dstField = coarseReceiver->getData<PdfField_T>(pdfFieldID_); + + for(auto it = CommunicationStencil::beginNoCenter(); it != CommunicationStencil::end(); ++it){ + uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*it); + if(coarseReceiver->neighborhoodSectionHasSmallerBlocks(nSecIdx)){ + CellInterval ci; + dstField->getSliceBeforeGhostLayer(*it, ci, 1); + kernels_.zeroCoalescenceRegion(dstField, ci, *it, gpuStream); + } + } +} + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::unpackDataFineToCoarse( + Block* coarseReceiver, const walberla::BlockID& fineSender, walberla::stencil::Direction dir, + GpuBuffer_T & buffer) +{ + auto dstField = coarseReceiver->getData<PdfField_T>(pdfFieldID_); + + CellInterval ci = getCoarseBlockCoalescenceInterval(coarseReceiver, fineSender, dir, dstField); + uint_t size = kernels_.size(ci, dir); + unsigned char* bufferPtr = buffer.advanceNoResize(size); + kernels_.unpackCoalescence(dstField, ci, bufferPtr, dir); +} + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalFineToCoarse( + const Block* fineSender, Block* coarseReceiver, walberla::stencil::Direction dir) +{ + auto varFineSender = const_cast< Block * >(fineSender); + auto srcField = varFineSender->getData< PdfField_T >(pdfFieldID_); + auto srcCommData = varFineSender->getData< CommData_T >(commDataID_); + PartialCoalescenceMaskFieldGPU * maskField = &(srcCommData->getMaskFieldGPU()); + auto dstField = coarseReceiver->getData<PdfField_T>(pdfFieldID_); + Direction invDir = stencil::inverseDir[dir]; + + CellInterval srcInterval; + srcField->getGhostRegion(dir, srcInterval, 2); + uint_t packSize = kernels_.partialCoalescenceSize(srcInterval, dir); + + CellInterval dstInterval = getCoarseBlockCoalescenceInterval(coarseReceiver, fineSender->getId(), + invDir, dstField); + +#ifndef NDEBUG + uint_t unpackSize = kernels_.size(dstInterval, invDir); + WALBERLA_ASSERT_EQUAL(packSize, 
unpackSize) +#endif + + // TODO: This is a dirty workaround. Code-generate direct redistribution! + unsigned char *buffer; + WALBERLA_GPU_CHECK( gpuMalloc( &buffer, packSize)) + kernels_.packPartialCoalescence(srcField, maskField, srcInterval, buffer, dir); + kernels_.unpackCoalescence(dstField, dstInterval, buffer, invDir); + WALBERLA_GPU_CHECK(gpuFree(buffer)) +} + + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalFineToCoarse( + const Block* fineSender, Block* coarseReceiver, walberla::stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) +{ + auto varFineSender = const_cast< Block * >(fineSender); + auto srcField = varFineSender->getData< PdfField_T >(pdfFieldID_); + auto srcCommData = varFineSender->getData< CommData_T >(commDataID_); + PartialCoalescenceMaskFieldGPU * maskField = &(srcCommData->getMaskFieldGPU()); + auto dstField = coarseReceiver->getData<PdfField_T>(pdfFieldID_); + Direction invDir = stencil::inverseDir[dir]; + + CellInterval srcInterval; + srcField->getGhostRegion(dir, srcInterval, 2); + uint_t packSize = kernels_.partialCoalescenceSize(srcInterval, dir); + + CellInterval dstInterval = getCoarseBlockCoalescenceInterval(coarseReceiver, fineSender->getId(), + invDir, dstField); + +#ifndef NDEBUG + uint_t unpackSize = kernels_.size(dstInterval, invDir); + WALBERLA_ASSERT_EQUAL(packSize, unpackSize) +#endif + + auto bufferPtr = buffer.advanceNoResize(packSize); + kernels_.packPartialCoalescence(srcField, maskField, srcInterval, bufferPtr, dir, stream); + kernels_.unpackCoalescence(dstField, dstInterval, bufferPtr, invDir, stream); +} + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::getLocalFineToCoarseCommFunction( + std::vector< VoidFunction >& commFunctions, + const Block* fineSender, Block* coarseReceiver, walberla::stencil::Direction dir, GpuBuffer_T & buffer) +{ + auto varFineSender = const_cast< Block * >(fineSender); + auto srcField 
= varFineSender->getData< PdfField_T >(pdfFieldID_); + auto srcCommData = varFineSender->getData< CommData_T >(commDataID_); + PartialCoalescenceMaskFieldGPU * maskField = &(srcCommData->getMaskFieldGPU()); + auto dstField = coarseReceiver->getData<PdfField_T>(pdfFieldID_); + Direction invDir = stencil::inverseDir[dir]; + + CellInterval srcInterval; + srcField->getGhostRegion(dir, srcInterval, 2); + uint_t packSize = kernels_.partialCoalescenceSize(srcInterval, dir); + + CellInterval dstInterval = getCoarseBlockCoalescenceInterval(coarseReceiver, fineSender->getId(), + invDir, dstField); + +#ifndef NDEBUG + uint_t unpackSize = kernels_.size(dstInterval, invDir); + WALBERLA_ASSERT_EQUAL(packSize, unpackSize) +#endif + + auto bufferPtr = buffer.advanceNoResize(packSize); + auto commFunction = [this, srcField, maskField, srcInterval, bufferPtr, dir, dstField, dstInterval, invDir](gpuStream_t gpuStream) + { + kernels_.packPartialCoalescence(srcField, maskField, srcInterval, bufferPtr, dir, gpuStream); + kernels_.unpackCoalescence(dstField, dstInterval, bufferPtr, invDir, gpuStream); + }; + commFunctions.emplace_back(commFunction); +} + + +template< typename PdfField_T> +uint_t NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::sizeEqualLevelSend( const Block * sender, stencil::Direction dir) +{ + auto field = const_cast< Block* >(sender)->getData< PdfField_T >(pdfFieldID_); + CellInterval ci; + cell_idx_t gls = skipsThroughCoarseBlock(sender, dir) ? 
2 : 1; + field->getSliceBeforeGhostLayer(dir, ci, gls, false); + return kernels_.size(ci, dir); +} + + + +template< typename PdfField_T> +uint_t NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::sizeCoarseToFineSend ( const Block * coarseSender, const BlockID & fineReceiver, stencil::Direction dir) +{ + auto field = const_cast< Block* >(coarseSender)->getData< PdfField_T >(pdfFieldID_); + + std::vector< std::pair< Direction, CellInterval > > intervals; + getCoarseBlockCommIntervals(fineReceiver, dir, field, intervals); + + uint_t size = 0; + + for (auto t : intervals) + { + CellInterval ci = t.second; + size += kernels_.size(ci); + } + WALBERLA_ASSERT_GREATER(size, 0) + return size; +} + + + +template< typename PdfField_T> +uint_t NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::sizeFineToCoarseSend ( const Block * sender, stencil::Direction dir) +{ + auto field = const_cast< Block* >(sender)->getData< PdfField_T >(pdfFieldID_); + + CellInterval ci; + field->getGhostRegion(dir, ci, 2); + return kernels_.partialCoalescenceSize(ci, dir); +} + + + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::packDataFineToCoarseImpl( + const Block* fineSender, const walberla::BlockID& /*coarseReceiver*/, walberla::stencil::Direction dir, + GpuBuffer_T & buffer) const +{ + auto varBlock = const_cast< Block* >(fineSender); + auto srcField = varBlock->getData< PdfField_T >(pdfFieldID_); + auto commData = varBlock->getData< CommData_T >(commDataID_); + PartialCoalescenceMaskFieldGPU * maskField = &(commData->getMaskFieldGPU()); + + CellInterval ci; + srcField->getGhostRegion(dir, ci, 2); + uint_t size = kernels_.partialCoalescenceSize(ci, dir); + auto bufferPtr = buffer.advanceNoResize(size); + kernels_.packPartialCoalescence(srcField, maskField, ci, bufferPtr, dir); +} + +/*********************************************************************************************************************** + * Helper Functions * + 
**********************************************************************************************************************/ + +template< typename PdfField_T> +inline Vector3< cell_idx_t > + NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::getNeighborShift(const BlockID& fineBlock, + stencil::Direction dir) const +{ + // dir: direction from coarse to fine block, or vice versa + Vector3< cell_idx_t > shift; + + uint_t const branchId = fineBlock.getBranchId(); + + shift[0] = (stencil::cx[dir] == 0) ? (((branchId & uint_t(1)) == uint_t(0)) ? cell_idx_t(-1) : cell_idx_t(1)) : + cell_idx_t(0); + shift[1] = (stencil::cy[dir] == 0) ? (((branchId & uint_t(2)) == uint_t(0)) ? cell_idx_t(-1) : cell_idx_t(1)) : + cell_idx_t(0); + shift[2] = (Stencil::D == uint_t(3)) ? + ((stencil::cz[dir] == 0) ? (((branchId & uint_t(4)) == uint_t(0)) ? cell_idx_t(-1) : cell_idx_t(1)) : + cell_idx_t(0)) : + cell_idx_t(0); + + return shift; +} + +/** + * Returns the part of a cell interval's hull of given width in direction dirVec. + * @param ci The original cell interval + * @param dirVec Direction Vector + * @param width Width of the hull + * @return Interval forming the part of the hull + */ +template< typename PdfField_T> +inline CellInterval NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::intervalHullInDirection( + const CellInterval& ci, const Vector3< cell_idx_t > dirVec, cell_idx_t width) const +{ + CellInterval result(ci); + for (uint_t i = 0; i < Stencil::D; i++) + { + if (dirVec[i] == 1) + { + result.min()[i] = result.max()[i] + cell_idx_t(1); + result.max()[i] += width; + } + if (dirVec[i] == -1) + { + result.max()[i] = result.min()[i] - cell_idx_t(1); + result.min()[i] -= width; + } + } + + return result; +} + +/** + * For edge or corner directions, checks if a coarser block is part of the respective edge or corner intersection. 
+ * @param block The local block + * @param dir The direction to check + * @return `true` if dir is an edge or corner direction skipping through a coarser block. + */ +template< typename PdfField_T> +inline bool NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::skipsThroughCoarseBlock( + const Block* block, const Direction dir) const +{ + Vector3< cell_idx_t > dirVec(stencil::cx[dir], stencil::cy[dir], stencil::cz[dir]); + bool coarseBlockFound = false; + forEachSubdirectionCancel(dirVec, [&](Vector3< cell_idx_t > subdir) { + coarseBlockFound = + coarseBlockFound || block->neighborhoodSectionHasLargerBlock( + blockforest::getBlockNeighborhoodSectionIndex(subdir[0], subdir[1], subdir[2])); + return !coarseBlockFound; + }); + + return coarseBlockFound; +} + +/** + * For coarse-to-fine and fine-to-coarse communication, returns a list of pairs (Direction, CellInterval) + * mapping sub-directions of the communication direction to cell intervals on the coarse block interior + * whose data must be communicated <i>as if</i> communicating in those sub-directions. 
+ * @param fineBlockID ID of the fine block + * @param dir Direction from the coarse to the fine block + * @param field Pointer to the PDF field on the coarse block + * @param intervals Vector that will be filled with the computed intervals + */ +template< typename PdfField_T> +inline void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::getCoarseBlockCommIntervals( + const BlockID& fineBlockID, const Direction dir, const PdfField_T* field, + std::vector< std::pair< Direction, CellInterval > >& intervals) const +{ + Vector3< cell_idx_t > shift = getNeighborShift(fineBlockID, dir); + + CellInterval mainSlice; + field->getSliceBeforeGhostLayer(dir, mainSlice, 1, false); + + // In all directions, restrict the slice to the lower or upper half, depending on neighbor shift + for (uint_t i = 0; i != Stencil::D; ++i) + { + if (shift[i] == cell_idx_t(-1)) + { + WALBERLA_ASSERT_EQUAL(mainSlice.size(i) & 1, 0) + mainSlice.max()[i] = mainSlice.min()[i] + cell_idx_c(mainSlice.size(i) / uint_t(2)) - cell_idx_t(1); + } + if (shift[i] == cell_idx_t(1)) + { + WALBERLA_ASSERT_EQUAL(mainSlice.size(i) & 1, 0) + mainSlice.min()[i] = mainSlice.min()[i] + cell_idx_c(mainSlice.size(i) / uint_t(2)); + } + } + + intervals.emplace_back(dir, mainSlice); + + Vector3< cell_idx_t > const commDirVec{ stencil::cx[dir], stencil::cy[dir], stencil::cz[dir] }; + + // Get extended slices in all tangential directions for the diagonal part of communication + forEachSubdirection(-shift, [&](Vector3< cell_idx_t > t) { + CellInterval hullInterval = intervalHullInDirection(mainSlice, t, cell_idx_t(1)); + Direction subCommDir = stencil::vectorToDirection(commDirVec - t); + if(CommunicationStencil::containsDir(subCommDir)){ + intervals.emplace_back(subCommDir, hullInterval); + } + }); +} + +/** + * For coarse-to-fine and fine-to-coarse communication, returns a list of pairs (Direction, CellInterval) + * mapping sub-directions of the communication direction to cell intervals on the fine block whose data must + 
* be communicated <i>as if</i> communicating in those sub-directions. + * @param fineBlockID ID of the fine block + * @param dir Direction from the fine to the coarse block + * @param field Pointer to the PDF Field on the fine block + * @param intervals Vector that will be filled with the computed intervals + */ +template< typename PdfField_T> +inline void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::getFineBlockCommIntervals( + const BlockID& fineBlockID, const Direction dir, const PdfField_T* field, + std::vector< std::pair< Direction, CellInterval > >& intervals) const +{ + Vector3< cell_idx_t > shift = getNeighborShift(fineBlockID, dir); + + CellInterval mainSlice; + field->getGhostRegion(dir, mainSlice, 2, false); + intervals.emplace_back(dir, mainSlice); + + Vector3< cell_idx_t > const commDirVec{ stencil::cx[dir], stencil::cy[dir], stencil::cz[dir] }; + + forEachSubdirection(-shift, [&](Vector3< cell_idx_t > t) { + CellInterval hullInterval = intervalHullInDirection(mainSlice, t, cell_idx_t(2)); + Direction subCommDir = stencil::vectorToDirection(commDirVec + t); + if(CommunicationStencil::containsDir(subCommDir)){ + intervals.emplace_back(subCommDir, hullInterval); + } + }); +} +/** + * Checks whether or not the block with ID `neighborID` is a neighbor of `block` in direction `dir`. 
+ */ +template< typename PdfField_T> +bool NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::areNeighborsInDirection( + const Block* block, const BlockID& neighborID, const Vector3< cell_idx_t> dirVec) const +{ + uint_t const nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(dirVec[0], dirVec[1], dirVec[2]); + uint_t const nSecSize = block->getNeighborhoodSectionSize(nSecIdx); + + for(uint_t i = 0; i < nSecSize; i++){ + if(block->getNeighborId(nSecIdx, i) == neighborID){ + return true; + } + } + return false; +} + +template< typename PdfField_T> +CellInterval NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::getCoarseBlockCoalescenceInterval( + const Block* coarseBlock, const BlockID& fineBlockID, Direction dir, const PdfField_T* field) const +{ + Direction mainDir(dir); + Vector3< cell_idx_t > commDirVec(stencil::cx[dir], stencil::cy[dir], stencil::cz[dir]); + Vector3< cell_idx_t > mainDirVec(commDirVec); + bool isAsymmetric = !areNeighborsInDirection(coarseBlock, fineBlockID, commDirVec); + + // If asymmetric, find the main subdirection + if(isAsymmetric){ + mainDirVec = Vector3< cell_idx_t >(0); + forEachSubdirection(commDirVec, [&](Vector3< cell_idx_t > subdirVec){ + if(areNeighborsInDirection(coarseBlock, fineBlockID, subdirVec)){ + // -dir is one main communication direction from F to C, but, due to periodicity, + // it might not be the only one. Find the main comm direction from the subdirections + // that is largest in the 1-norm. 
+ if(subdirVec.sqrLength() > mainDirVec.sqrLength()) mainDirVec = subdirVec; + } + }); + mainDir = stencil::vectorToDirection(mainDirVec); + } + + Vector3< cell_idx_t > shift = getNeighborShift(fineBlockID, mainDir); + + CellInterval mainSlice; + field->getSliceBeforeGhostLayer(mainDir, mainSlice, 1, false); + + // In all directions, restrict the slice to the lower or upper half, depending on neighbor shift + for (uint_t i = 0; i != Stencil::D; ++i) + { + if (shift[i] == cell_idx_t(-1)) + { + WALBERLA_ASSERT_EQUAL(mainSlice.size(i) & 1, 0) + mainSlice.max()[i] = mainSlice.min()[i] + cell_idx_c(mainSlice.size(i) / uint_t(2)) - cell_idx_t(1); + } + if (shift[i] == cell_idx_t(1)) + { + WALBERLA_ASSERT_EQUAL(mainSlice.size(i) & 1, 0) + mainSlice.min()[i] = mainSlice.min()[i] + cell_idx_c(mainSlice.size(i) / uint_t(2)); + } + } + + CellInterval commSlice(mainSlice); + + // If asymmetric, find coalescence slice as hull of main slice + if(isAsymmetric){ + commSlice = intervalHullInDirection(mainSlice, mainDirVec - commDirVec, 1); + } + + return commSlice; +} + +} // walberla::lbm_generated diff --git a/src/lbm_generated/gpu/UniformGeneratedGPUPdfPackInfo.h b/src/lbm_generated/gpu/UniformGeneratedGPUPdfPackInfo.h new file mode 100644 index 0000000000000000000000000000000000000000..894eb38034881feeda40c1a3d051455cbe98e173 --- /dev/null +++ b/src/lbm_generated/gpu/UniformGeneratedGPUPdfPackInfo.h @@ -0,0 +1,272 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file UniformGeneratedGPUPdfPackInfo.h +//! \ingroup lbm +//! \author Markus Holzer <markus.holzer@fau.de> +//! \brief Class Template for Lattice Boltzmann PDF Pack Infos using code-generated kernels +// +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" +#include "core/cell/CellInterval.h" + +#include "gpu/GPUWrapper.h" +#include "gpu/communication/GeneratedGPUPackInfo.h" + +#include "lbm/field/PdfField.h" + +#include "stencil/Directions.h" + +namespace walberla +{ +using gpu::GeneratedGPUPackInfo; + +namespace lbm_generated +{ +using stencil::Direction; + +namespace internal +{ +/* + * Base Template for Packing Kernels Wrapper. This wrapper is required for passing the time step to + * kernels generated for in-place streaming patterns. The generated code should not be templated. 
+ */ +template< typename PdfField_T, bool inplace > +class UniformPackingGPUKernelsWrapper +{ + public: + void packAll(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, gpuStream_t stream) const = 0; + void unpackAll(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, gpuStream_t stream) const = 0; + void localCopyAll(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, CellInterval& dstInterval, + gpuStream_t stream) const = 0; + + void packDirection(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, Direction dir, + gpuStream_t stream) const = 0; + void unpackDirection(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, Direction dir, + gpuStream_t stream) const = 0; + void localCopyDirection(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, + CellInterval& dstInterval, Direction dir, gpuStream_t stream) const = 0; + + uint_t size(CellInterval& ci, Direction dir) const = 0; + uint_t size(CellInterval& ci) const = 0; +}; + +/* + * Template Specialization for two-fields patterns, with trivial method wrappers. 
+ */ +template< typename PdfField_T > +class UniformPackingGPUKernelsWrapper< PdfField_T, false > +{ + public: + using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; + using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels; + + void packAll(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, gpuStream_t stream) const + { + kernels_.packAll(srcField, ci, outBuffer, stream); + } + + void unpackAll(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, gpuStream_t stream) const + { + kernels_.unpackAll(dstField, ci, inBuffer, stream); + } + + void localCopyAll(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, CellInterval& dstInterval, + gpuStream_t stream) const + { + kernels_.localCopyAll(srcField, srcInterval, dstField, dstInterval, stream); + } + + void packDirection(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, Direction dir, + gpuStream_t stream) const + { + kernels_.packDirection(srcField, ci, outBuffer, dir, stream); + } + + void unpackDirection(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, Direction dir, + gpuStream_t stream) const + { + kernels_.unpackDirection(dstField, ci, inBuffer, dir, stream); + } + + void localCopyDirection(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, + CellInterval& dstInterval, Direction dir, gpuStream_t stream) const + { + kernels_.localCopyDirection(srcField, srcInterval, dstField, dstInterval, dir, stream); + } + + uint_t size(CellInterval& ci, Direction dir) const { return kernels_.size(ci, dir); } + uint_t size(CellInterval& ci) const { return kernels_.size(ci); } + + private: + PackingKernels_T kernels_; +}; + +/* + * Template Specialization for in-place patterns, extracting the timestep from the lattice model. 
+ */ +template< typename PdfField_T > +class UniformPackingGPUKernelsWrapper< PdfField_T, true > +{ + public: + using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; + using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels; + + void packAll(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, gpuStream_t stream) const + { + uint8_t timestep = srcField->getTimestep(); + kernels_.packAll(srcField, ci, outBuffer, timestep, stream); + } + + void unpackAll(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, gpuStream_t stream) const + { + uint8_t timestep = dstField->getTimestep(); + kernels_.unpackAll(dstField, ci, inBuffer, timestep, stream); + } + + void localCopyAll(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, CellInterval& dstInterval, + gpuStream_t stream) const + { + uint8_t timestep = srcField->getTimestep(); + WALBERLA_ASSERT_EQUAL(timestep, dstField->getTimestep()) + kernels_.localCopyAll(srcField, srcInterval, dstField, dstInterval, timestep, stream); + } + + void packDirection(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, Direction dir, + gpuStream_t stream) const + { + uint8_t timestep = srcField->getTimestep(); + kernels_.packDirection(srcField, ci, outBuffer, dir, timestep, stream); + } + + void unpackDirection(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, Direction dir, + gpuStream_t stream) const + { + uint8_t timestep = dstField->getTimestep(); + kernels_.unpackDirection(dstField, ci, inBuffer, dir, timestep, stream); + } + + void localCopyDirection(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, + CellInterval& dstInterval, Direction dir, gpuStream_t stream) const + { + uint8_t timestep = srcField->getTimestep(); + WALBERLA_ASSERT_EQUAL(timestep, dstField->getTimestep()) + kernels_.localCopyDirection(srcField, srcInterval, dstField, dstInterval, dir, timestep, stream); + } + + uint_t 
size(CellInterval& ci, Direction dir) const { return kernels_.size(ci, dir); } + uint_t size(CellInterval& ci) const { return kernels_.size(ci); } + + private: + PackingKernels_T kernels_; +}; +} // namespace internal + +/** + * Pack Info class template for lattice Boltzmann PDF fields. Relies on a code-generated + * class providing kernel implementations for packing, unpacking and local copying of data. + * + * This template relies on a PackingKernels implementation generated by lbmpy_walberla.packing_kernels. + * The code generated part provides the kernels for transferring data between communication buffers + * and fields. The iteration slices are constructed by this class. + * + * The code-generated substructure enables the usage of arbitrary, in particular in-place streaming + * patterns. + * + * @tparam PackingKernels_T Type of a PackingKernels implementation generated using + * `lbmpy_walberla.generate_packing_kernels`. + * + * \ingroup lbm + */ +template< typename PdfField_T > +class UniformGeneratedGPUPdfPackInfo : public GeneratedGPUPackInfo +{ + public: + using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; + using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels; + using Stencil = typename LatticeStorageSpecification_T::Stencil; + + UniformGeneratedGPUPdfPackInfo(const BlockDataID pdfFieldID, cell_idx_t cellLayersToSend = 1, bool sendAll = false) + : pdfFieldID_(pdfFieldID), ghostLayersToSend_(cellLayersToSend), sendAll_(sendAll) + {} + + void pack(stencil::Direction dir, unsigned char* buffer, IBlock* block, gpuStream_t stream) override; + void communicateLocal(stencil::Direction dir, const IBlock* sender, IBlock* receiver, gpuStream_t stream) override; + void unpack(stencil::Direction dir, unsigned char* buffer, IBlock* block, gpuStream_t stream) override; + uint_t size(stencil::Direction dir, IBlock* block) override; + + private: + const BlockDataID pdfFieldID_; + 
internal::UniformPackingGPUKernelsWrapper< PdfField_T, LatticeStorageSpecification_T::inplace > kernels_; + cell_idx_t ghostLayersToSend_; + bool sendAll_; +}; + +template< typename PdfField_T > +void UniformGeneratedGPUPdfPackInfo< PdfField_T >::unpack(stencil::Direction dir, unsigned char* buffer, IBlock* block, + gpuStream_t stream) +{ + auto field = block->getData< PdfField_T >(pdfFieldID_); + CellInterval ci; + field->getGhostRegion(dir, ci, ghostLayersToSend_, false); + + if (sendAll_) { kernels_.unpackAll(field, ci, buffer, stream); } + else { kernels_.unpackDirection(field, ci, buffer, dir, stream); } +} + +template< typename PdfField_T > +void UniformGeneratedGPUPdfPackInfo< PdfField_T >::pack(stencil::Direction dir, unsigned char* buffer, IBlock* block, + gpuStream_t stream) +{ + auto field = const_cast< IBlock* >(block)->getData< PdfField_T >(pdfFieldID_); + CellInterval ci; + field->getSliceBeforeGhostLayer(dir, ci, ghostLayersToSend_, false); + + if (sendAll_) { kernels_.packAll(field, ci, buffer, stream); } + else { kernels_.packDirection(field, ci, buffer, dir, stream); } +} + +template< typename PdfField_T > +void UniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocal(stencil::Direction dir, const IBlock* sender, + IBlock* receiver, gpuStream_t stream) +{ + auto srcField = const_cast< IBlock* >(sender)->getData< PdfField_T >(pdfFieldID_); + auto dstField = receiver->getData< PdfField_T >(pdfFieldID_); + + CellInterval srcRegion; + CellInterval dstRegion; + srcField->getSliceBeforeGhostLayer(dir, srcRegion, ghostLayersToSend_, false); + dstField->getGhostRegion(stencil::inverseDir[dir], dstRegion, ghostLayersToSend_, false); + + if (sendAll_) { kernels_.localCopyAll(srcField, srcRegion, dstField, dstRegion, stream); } + else { kernels_.localCopyDirection(srcField, srcRegion, dstField, dstRegion, dir, stream); } +} + +template< typename PdfField_T > +uint_t UniformGeneratedGPUPdfPackInfo< PdfField_T >::size(stencil::Direction dir, IBlock* 
block) +{ + auto field = block->getData< PdfField_T >(pdfFieldID_); + CellInterval ci; + field->getGhostRegion(dir, ci, 1, false); + + uint_t elementsPerCell = kernels_.size(ci, dir); + return elementsPerCell; +} + +} // namespace lbm_generated +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/refinement/BasicRecursiveTimeStep.h b/src/lbm_generated/refinement/BasicRecursiveTimeStep.h new file mode 100644 index 0000000000000000000000000000000000000000..6b0a2a7ece5768fb071776e6aa9d0ea05dc9b797 --- /dev/null +++ b/src/lbm_generated/refinement/BasicRecursiveTimeStep.h @@ -0,0 +1,97 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file BasicRecursiveTimeStep.h +//! \author Frederik Hennig <frederik.hennig@fau.de> +//! 
\author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "blockforest/communication/NonUniformBufferedScheme.h" + +#include "lbm/field/PdfField.h" +#include "lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h" + +#include "timeloop/SweepTimeloop.h" + +namespace walberla { + +using blockforest::communication::NonUniformBufferedScheme; + +namespace lbm_generated { + +/** + * + * @tparam LatticeStorageSpecification_T Generated storage specification + * @tparam SweepCollection_T LBM SweepCollection (must be able to call stream, collide, streamCollide and streamOnlyNoAdvancement) + * @tparam BoundaryCollection_T LBM Boundary collection (Functor that runs all boundary kernels at call) + */ +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T> +class BasicRecursiveTimeStep +{ + public: + using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; + using Stencil = typename LatticeStorageSpecification_T::Stencil; + using CommunicationStencil = typename LatticeStorageSpecification_T::CommunicationStencil; + using CommScheme = NonUniformBufferedScheme< CommunicationStencil >; + using PackInfo = lbm_generated::NonuniformGeneratedPdfPackInfo< PdfField_T >; + + BasicRecursiveTimeStep(std::shared_ptr< StructuredBlockForest > & sbfs, + const BlockDataID & pdfFieldId, SweepCollection_T & sweepCollection, BoundaryCollection_T & boundaryCollection, + std::shared_ptr< CommScheme > & commScheme, std::shared_ptr< PackInfo > & pdfFieldPackInfo): + sbfs_(sbfs), pdfFieldId_(pdfFieldId), pdfFieldPackInfo_(pdfFieldPackInfo), commScheme_(commScheme), + sweepCollection_(sweepCollection), boundaryCollection_(boundaryCollection) + { +#ifndef NDEBUG + for (auto& block : *sbfs) + WALBERLA_ASSERT(block.isDataOfType<PdfField_T>(pdfFieldId_), "Template parameter PdfField_T is of 
different type than BlockDataID pdfFieldId that is provided as constructor argument") +#endif + maxLevel_ = sbfs->getDepth(); + + for (uint_t level = 0; level <= maxLevel_; level++) + { + std::vector<Block *> blocks; + sbfs->getBlocks(blocks, level); + blocks_.push_back(blocks); + } + }; + + void operator() () { timestep(0); }; + void addRefinementToTimeLoop(timeloop::SweepTimeloop & timeloop, uint_t level=0); + + private: + void timestep(uint_t level); + void ghostLayerPropagation(Block * block); + std::function<void()> executeStreamCollideOnLevel(uint_t level, bool withGhostLayerPropagation=false); + std::function<void()> executeBoundaryHandlingOnLevel(uint_t level); + + std::shared_ptr< StructuredBlockForest > sbfs_; + uint_t maxLevel_; + std::vector<std::vector<Block *>> blocks_; + + const BlockDataID pdfFieldId_; + std::shared_ptr< PackInfo > pdfFieldPackInfo_; + std::shared_ptr< CommScheme > commScheme_; + + SweepCollection_T & sweepCollection_; + BoundaryCollection_T & boundaryCollection_; +}; + +} // namespace lbm_generated +} // namespace walberla + +#include "lbm_generated/refinement/BasicRecursiveTimeStep.impl.h" diff --git a/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h b/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h new file mode 100644 index 0000000000000000000000000000000000000000..7e6d9b5944e0e526287fba475c42e07f70695e7d --- /dev/null +++ b/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h @@ -0,0 +1,266 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file BasicRecursiveTimeStep.impl.h +//! \author Frederik Hennig <frederik.hennig@fau.de> +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "BasicRecursiveTimeStep.h" + +namespace walberla { +namespace lbm_generated { + +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > +void BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T >::timestep(uint_t level) +{ + // 1.1 Collision + for(auto b: blocks_[level]){ + sweepCollection_.streamCollide(b); + } + + // 1.2 Recursive Descent + if(level < maxLevel_){ + timestep(level + 1); + } + + // 1.3 Coarse to Fine Communication, receiving end + if(level != 0){ + commScheme_->communicateCoarseToFine(level); + } + + // 1.4 Equal-Level Communication + commScheme_->communicateEqualLevel(level); + + // 1.5 Boundary Handling and Coalescence Preparation + for(auto b : blocks_[level]){ + boundaryCollection_(b); + if(level != maxLevel_) pdfFieldPackInfo_->prepareCoalescence(b); + } + + // 1.6 Fine to Coarse Communication, receiving end + if(level < maxLevel_){ + commScheme_->communicateFineToCoarse(level + 1); + } + + // Stop here if on coarsest level. + // Otherwise, continue to second subcycle. + if(level == 0) return; + + // 2.1 Collision and Ghost-Layer Propagation + for(auto b: blocks_[level]){ + ghostLayerPropagation(b); // GL-Propagation first without swapping arrays... 
+ sweepCollection_.streamCollide(b); // then Stream-Collide on interior, and swap arrays + } + + // 2.2 Recursive Descent + if(level < maxLevel_){ + timestep(level + 1); + } + + // 2.4 Equal-Level Communication + commScheme_->communicateEqualLevel(level); + + // 2.5 Boundary Handling and Coalescence Preparation + for(auto b : blocks_[level]){ + boundaryCollection_(b); + if(level != maxLevel_) pdfFieldPackInfo_->prepareCoalescence(b); + } + + // 2.6 Fine to Coarse Communication, receiving end + if(level < maxLevel_){ + commScheme_->communicateFineToCoarse(level + 1); + } +} + + +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > +void BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T >::addRefinementToTimeLoop(timeloop::SweepTimeloop & timeloop, uint_t level) +{ + // 1.1 Collision + timeloop.addFuncBeforeTimeStep(executeStreamCollideOnLevel(level), "Refinement Cycle: streamCollide on level " + std::to_string(level)); + + // 1.2 Recursive Descent + if(level < maxLevel_){ + addRefinementToTimeLoop(timeloop, level + 1); + } + + // 1.3 Coarse to Fine Communication, receiving end + if(level != 0){ + timeloop.addFuncBeforeTimeStep(commScheme_->communicateCoarseToFineFunctor(level), "Refinement Cycle: communicate coarse to fine on level " + std::to_string(level)); + } + + // 1.4 Equal-Level Communication + timeloop.addFuncBeforeTimeStep(commScheme_->communicateEqualLevelFunctor(level), "Refinement Cycle: communicate equal level on level " + std::to_string(level)); + + + // 1.5 Boundary Handling and Coalescence Preparation + timeloop.addFuncBeforeTimeStep(executeBoundaryHandlingOnLevel(level), "Refinement Cycle: boundary handling on level " + std::to_string(level)); + + // 1.6 Fine to Coarse Communication, receiving end + if(level < maxLevel_){ + timeloop.addFuncBeforeTimeStep(commScheme_->communicateFineToCoarseFunctor(level + 1), "Refinement Cycle: communicate fine to coarse on level " + 
std::to_string(level + 1)); + } + + // Stop here if on coarsest level. + // Otherwise, continue to second subcycle. + if(level == 0) return; + + // 2.1 Collision and Ghost-Layer Propagation + timeloop.addFuncBeforeTimeStep(executeStreamCollideOnLevel(level, true), "Refinement Cycle: streamCollide with ghost layer propagation on level " + std::to_string(level)); + + // 2.2 Recursive Descent + if(level < maxLevel_) + addRefinementToTimeLoop(timeloop, level + 1); + + + // 2.4 Equal-Level Communication + timeloop.addFuncBeforeTimeStep(commScheme_->communicateEqualLevelFunctor(level), "Refinement Cycle: communicate equal level on level " + std::to_string(level)); + + // 2.5 Boundary Handling and Coalescence Preparation + timeloop.addFuncBeforeTimeStep(executeBoundaryHandlingOnLevel(level), "Refinement Cycle: boundary handling on level " + std::to_string(level)); + + // 2.6 Fine to Coarse Communication, receiving end + if(level < maxLevel_) + timeloop.addFuncBeforeTimeStep(commScheme_->communicateFineToCoarseFunctor(level + 1), "Refinement Cycle: communicate fine to coarse on level " + std::to_string(level + 1)); + +} + + +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > +std::function<void()> BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T >::executeStreamCollideOnLevel(uint_t level, bool withGhostLayerPropagation) +{ + return [level, withGhostLayerPropagation, this]() + { + if (withGhostLayerPropagation) + { + for(auto b: blocks_[level]){ + ghostLayerPropagation(b); + sweepCollection_.streamCollide(b); + } + } + else + { + for(auto b: blocks_[level]){ + sweepCollection_.streamCollide(b); + } + } + }; +} + + +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > +std::function<void()> BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T >::executeBoundaryHandlingOnLevel(uint_t level) +{ + return [level, this]() { + for (auto b : 
blocks_[level]) + { + boundaryCollection_(b); + if (level != maxLevel_) pdfFieldPackInfo_->prepareCoalescence(b); + } + }; +} + + +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > +void BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T >::ghostLayerPropagation( + Block * block) +{ + auto pdfField = block->getData<PdfField_T>(pdfFieldId_); + + for(auto it = CommunicationStencil::beginNoCenter(); it != CommunicationStencil::end(); ++it){ + uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*it); + // Propagate on ghost layers shadowing coarse or no blocks + if(!block->neighborhoodSectionHasSmallerBlocks(nSecIdx)){ + CellInterval ci; + pdfField->getGhostRegion(*it, ci, 1); + sweepCollection_.streamOnlyNoAdvancementCellInterval(block, ci); + } + } +} + +// Refinement Timestep from post collision state: +//template< typename PdfField_T, typename LbSweep_T > +//void BasicRecursiveTimeStep< PdfField_T, LbSweep_T >::timestep(uint_t level) +//{ +// std::vector<Block *> blocks; +// sbfs_->getBlocks(blocks, level); +// +// uint_t maxLevel = sbfs_->getDepth(); +// +// // 1.1 Equal-Level Communication +// commScheme_->communicateEqualLevel(level); +// +// // 1.2 Coarse to Fine Communication +// if(level < maxLevel){ +// commScheme_->communicateCoarseToFine(level + 1); +// } +// +// // 1.3 Boundary Handling and +// // 1.4 Prepare Coalescence (which happens during the recursive descent) +// for(auto b : blocks){ +// boundaryFunctor_(b); +// if(level != maxLevel) pdfFieldPackInfo_->prepareCoalescence(b); +// } +// +// // 1.5 Recursive Descent +// if(level < maxLevel){ +// timestep(level + 1); +// } +// +// // 1.6 First Collision and ghost-layer propagation +// for(auto b: blocks){ +// if(level != 0) ghostLayerPropagation(b); // GL-Propagation first without swapping arrays... 
+// sweepCollection_.streamCollide(b); // then Stream-Collide on interior, and swap arrays +// } +// +// // Stop here if on coarsest level. +// // Otherwise, continue to second subcycle. +// if(level == 0) return; +// +// // 2.1 Equal-Level Communication +// commScheme_->communicateEqualLevel(level); +// +// // 2.2 Coarse to Fine Communication +// if(level < maxLevel){ +// commScheme_->communicateCoarseToFine(level + 1); +// } +// +// // 2.3 Boundary Handling and +// // 2.4 Prepare Coalescence (which happens during the recursive descent) +// for(auto b : blocks){ +// boundaryFunctor_(b); +// if(level != maxLevel) pdfFieldPackInfo_->prepareCoalescence(b); +// } +// +// // 2.5 Recursive Descent +// if(level < maxLevel){ +// timestep(level + 1); +// } +// +// // 2.6 Fine to Coarse Communication +// commScheme_->communicateFineToCoarse(level); +// +// // 2.7 Second Collision +// for(auto b: blocks){ +// sweepCollection_.streamCollide(b); +// } +//} + +} // namespace lbm_generated +} // namespace walberla diff --git a/src/lbm_generated/refinement/CMakeLists.txt b/src/lbm_generated/refinement/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..216b4a2683ebc426c8f30a5135d5c07db3409640 --- /dev/null +++ b/src/lbm_generated/refinement/CMakeLists.txt @@ -0,0 +1,6 @@ +target_sources( lbm_generated + PRIVATE + BasicRecursiveTimeStep.h + BasicRecursiveTimeStep.impl.h + RefinementScaling.h + ) diff --git a/src/lbm_generated/refinement/RefinementScaling.h b/src/lbm_generated/refinement/RefinementScaling.h new file mode 100644 index 0000000000000000000000000000000000000000..f8015946a4816e4c0e7c54ea43d2f310755aaec3 --- /dev/null +++ b/src/lbm_generated/refinement/RefinementScaling.h @@ -0,0 +1,63 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file RefinementScaling.h +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "blockforest/BlockDataHandling.h" + +#include "domain_decomposition/IBlock.h" +#include "domain_decomposition/StructuredBlockStorage.h" + +namespace walberla +{ +namespace lbm_generated +{ + +class DefaultRefinementScaling : public blockforest::AlwaysInitializeBlockDataHandling< real_t > +{ + public: + DefaultRefinementScaling(const weak_ptr< StructuredBlockStorage >& blocks, const real_t parameter) + : blocks_(blocks), parameter_(parameter){}; + + real_t* initialize(IBlock* const block) override + { + WALBERLA_ASSERT_NOT_NULLPTR(block) + auto blocks = blocks_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(blocks) + + level_ = block->getBlockStorage().getLevel(*block); + + const real_t level_scale_factor = real_c(uint_t(1) << level_); + const real_t one = real_c(1.0); + const real_t half = real_c(0.5); + + return new real_t(parameter_ / (level_scale_factor * (-parameter_ * half + one) + parameter_ * half)); + } + bool operator==(const DefaultRefinementScaling& other) const { return level_ == other.level_; } + + private: + const weak_ptr< StructuredBlockStorage > blocks_; + const real_t 
parameter_; + + uint_t level_; +}; + +} // namespace lbm_generated +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/storage_specification/CMakeLists.txt b/src/lbm_generated/storage_specification/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..83d211632ca9366f1ac5f719a22d217f7c176061 --- /dev/null +++ b/src/lbm_generated/storage_specification/CMakeLists.txt @@ -0,0 +1,7 @@ +target_sources( lbm_generated + PRIVATE + D3Q19StorageSpecification.h + D3Q19StorageSpecification.cpp + D3Q27StorageSpecification.h + D3Q27StorageSpecification.cpp + ) \ No newline at end of file diff --git a/src/lbm_generated/storage_specification/D3Q19StorageSpecification.cpp b/src/lbm_generated/storage_specification/D3Q19StorageSpecification.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f36797eecca7282cf1f615492ac54cee38be871f --- /dev/null +++ b/src/lbm_generated/storage_specification/D3Q19StorageSpecification.cpp @@ -0,0 +1,1939 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file D3Q19StorageSpecification.cpp +//! 
\\author lbmpy +//====================================================================================================================== + +#include "D3Q19StorageSpecification.h" + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wfloat-equal" +# pragma GCC diagnostic ignored "-Wshadow" +# pragma GCC diagnostic ignored "-Wconversion" +# pragma GCC diagnostic ignored "-Wunused-variable" +#endif + +/************************************************************************************* + * Kernel Definitions +*************************************************************************************/ +namespace internal_d3q19storagespecification_pack_ALL { +static FUNC_PREFIX void d3q19storagespecification_pack_ALL(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_30 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0; + double * RESTRICT _data_pdfs_src_00_31 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + _stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_32 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 2*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_33 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 3*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_34 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 4*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_35 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 5*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_36 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 6*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + 
_stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_30_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_30; + double * RESTRICT _data_pdfs_src_00_31_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_31; + double * RESTRICT _data_pdfs_src_00_32_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_32; + double * RESTRICT _data_pdfs_src_00_33_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_33; + double * RESTRICT _data_pdfs_src_00_34_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_34; + double * RESTRICT _data_pdfs_src_00_35_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_35; + double * RESTRICT _data_pdfs_src_00_36_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_36; 
+ double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2] = _data_pdfs_src_00_30_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 1] = _data_pdfs_src_00_31_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 2] = _data_pdfs_src_00_32_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 3] = _data_pdfs_src_00_33_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 4] = _data_pdfs_src_00_34_10[_stride_pdfs_src_2*ctr_2]; + 
_data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 5] = _data_pdfs_src_00_35_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 6] = _data_pdfs_src_00_36_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 7] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 8] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 9] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 10] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 11] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 12] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 13] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 14] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 15] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 16] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 17] = 
_data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 18] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_ALL { +static FUNC_PREFIX void d3q19storagespecification_unpack_ALL(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_30 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0; + double * RESTRICT _data_pdfs_dst_00_31 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + _stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_32 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 2*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_33 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 3*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_34 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 4*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_35 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 5*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_36 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 6*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 
11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_30_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_30; + double * RESTRICT _data_pdfs_dst_00_31_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_31; + double * RESTRICT _data_pdfs_dst_00_32_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_32; + double * RESTRICT _data_pdfs_dst_00_33_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_33; + double * RESTRICT _data_pdfs_dst_00_34_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_34; + double * RESTRICT _data_pdfs_dst_00_35_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_35; + double * RESTRICT _data_pdfs_dst_00_36_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_36; + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + 
_data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_30_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2]; + _data_pdfs_dst_00_31_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 1]; + _data_pdfs_dst_00_32_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 2]; + _data_pdfs_dst_00_33_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 3]; + _data_pdfs_dst_00_34_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 4]; + _data_pdfs_dst_00_35_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 5]; + _data_pdfs_dst_00_36_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 6]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 
+ 7]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 8]; + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 9]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 10]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 11]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 12]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 13]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 14]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 15]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 16]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 17]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 18]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_ALL { +static FUNC_PREFIX void d3q19storagespecification_localCopy_ALL(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const 
_size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_30 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0; + double * RESTRICT _data_pdfs_src_00_30 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0; + double * RESTRICT _data_pdfs_dst_00_31 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + _stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_31 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + _stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_32 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 2*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_32 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 2*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_33 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 3*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_33 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 3*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_34 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 4*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_34 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 4*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_35 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 5*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_35 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 5*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_36 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 6*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_36 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 6*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT 
_data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT 
_data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_30_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_30; + double * RESTRICT _data_pdfs_src_00_30_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_30; + double * RESTRICT _data_pdfs_dst_00_31_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_31; + double * RESTRICT _data_pdfs_src_00_31_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_31; + double * RESTRICT _data_pdfs_dst_00_32_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_32; + double * RESTRICT _data_pdfs_src_00_32_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_32; + double * RESTRICT _data_pdfs_dst_00_33_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_33; + double * RESTRICT _data_pdfs_src_00_33_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_33; + double * RESTRICT _data_pdfs_dst_00_34_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_34; + double * RESTRICT _data_pdfs_src_00_34_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_34; + double * RESTRICT _data_pdfs_dst_00_35_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_35; + double * RESTRICT _data_pdfs_src_00_35_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_35; + double * RESTRICT _data_pdfs_dst_00_36_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_36; + double * RESTRICT _data_pdfs_src_00_36_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_36; + double * RESTRICT _data_pdfs_dst_00_37_10 = 
_stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT 
_data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_30_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_30_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_31_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_31_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_32_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_32_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_33_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_33_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_34_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_34_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_35_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_35_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_36_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_36_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = 
_data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + + +namespace internal_d3q19storagespecification_pack_TE { +static FUNC_PREFIX void d3q19storagespecification_pack_TE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_SW { +static FUNC_PREFIX void d3q19storagespecification_pack_SW(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + for 
(int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_T { +static FUNC_PREFIX void d3q19storagespecification_pack_T(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_35 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 5*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_35_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_35; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + for (int64_t ctr_2 
= 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2] = _data_pdfs_src_00_35_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 1] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 2] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 3] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 4] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_BS { +static FUNC_PREFIX void d3q19storagespecification_pack_BS(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_TN { +static FUNC_PREFIX void d3q19storagespecification_pack_TN(double * RESTRICT _data_buffer, double * RESTRICT const 
_data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_BW { +static FUNC_PREFIX void d3q19storagespecification_pack_BW(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_N { +static FUNC_PREFIX void d3q19storagespecification_pack_N(double * RESTRICT _data_buffer, double * RESTRICT const 
_data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_31 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + _stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_31_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_31; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2] = _data_pdfs_src_00_31_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 1] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 2] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 
5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 3] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 4] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_E { +static FUNC_PREFIX void d3q19storagespecification_pack_E(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_34 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 4*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_34_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_34; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 
5*_size_pdfs_src_2*ctr_1 + 5*ctr_2] = _data_pdfs_src_00_34_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 1] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 3] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 4] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_NW { +static FUNC_PREFIX void d3q19storagespecification_pack_NW(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_NE { +static FUNC_PREFIX void d3q19storagespecification_pack_NE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, 
int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_TW { +static FUNC_PREFIX void d3q19storagespecification_pack_TW(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_BE { +static FUNC_PREFIX void d3q19storagespecification_pack_BE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t 
const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_W { +static FUNC_PREFIX void d3q19storagespecification_pack_W(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_33 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 3*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_33_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_33; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + 
_data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2] = _data_pdfs_src_00_33_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 1] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 2] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 3] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 4] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_S { +static FUNC_PREFIX void d3q19storagespecification_pack_S(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_32 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 2*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT 
_data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_32_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_32; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2] = _data_pdfs_src_00_32_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 1] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 3] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 4] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_SE { +static FUNC_PREFIX void d3q19storagespecification_pack_SE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const 
_stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_TS { +static FUNC_PREFIX void d3q19storagespecification_pack_TS(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_BN { +static FUNC_PREFIX void d3q19storagespecification_pack_BN(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const 
_stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_B { +static FUNC_PREFIX void d3q19storagespecification_pack_B(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_36 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 6*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_36_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_36; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + 
double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2] = _data_pdfs_src_00_36_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 1] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 2] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 3] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 4] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_BW { +static FUNC_PREFIX void d3q19storagespecification_unpack_BW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 
_size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_N { +static FUNC_PREFIX void d3q19storagespecification_unpack_N(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_32 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 2*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_32_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_32; + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_32_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2]; + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 
5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 1]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 2]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 3]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 4]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_SE { +static FUNC_PREFIX void d3q19storagespecification_unpack_SE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_TE { +static FUNC_PREFIX void d3q19storagespecification_unpack_TE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < 
_size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_T { +static FUNC_PREFIX void d3q19storagespecification_unpack_T(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_36 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 6*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_36_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_36; + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_dst_00_317_10 = 
_stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_36_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 1]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 2]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 3]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 4]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_TS { +static FUNC_PREFIX void d3q19storagespecification_unpack_TS(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} 
+ +namespace internal_d3q19storagespecification_unpack_BE { +static FUNC_PREFIX void d3q19storagespecification_unpack_BE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_NW { +static FUNC_PREFIX void d3q19storagespecification_unpack_NW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } 
+ } + } +} +} + +namespace internal_d3q19storagespecification_unpack_NE { +static FUNC_PREFIX void d3q19storagespecification_unpack_NE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_BS { +static FUNC_PREFIX void d3q19storagespecification_unpack_BS(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + 
ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_E { +static FUNC_PREFIX void d3q19storagespecification_unpack_E(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_33 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 3*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_33_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_33; + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_33_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 1]; + 
_data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 2]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 3]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 4]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_S { +static FUNC_PREFIX void d3q19storagespecification_unpack_S(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_31 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + _stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_31_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_31; + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT 
_data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_31_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 1]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 2]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 3]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 4]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_W { +static FUNC_PREFIX void d3q19storagespecification_unpack_W(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_34 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 4*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; 
ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_34_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_34; + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_34_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 1]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 2]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 3]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 4]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_SW { +static FUNC_PREFIX void d3q19storagespecification_unpack_SW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + for (int64_t ctr_1 
= 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_B { +static FUNC_PREFIX void d3q19storagespecification_unpack_B(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_35 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 5*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_35_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_35; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + for (int64_t ctr_2 = 0; 
ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_35_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 1]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 2]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 3]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 4]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_TN { +static FUNC_PREFIX void d3q19storagespecification_unpack_TN(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_BN { +static FUNC_PREFIX void d3q19storagespecification_unpack_BN(const double * RESTRICT const _data_buffer, double 
* RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_TW { +static FUNC_PREFIX void d3q19storagespecification_unpack_TW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_NE { +static FUNC_PREFIX void d3q19storagespecification_localCopy_NE(double * RESTRICT 
_data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_TS { +static FUNC_PREFIX void d3q19storagespecification_localCopy_TS(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 
+ 12*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_BE { +static FUNC_PREFIX void d3q19storagespecification_localCopy_BE(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_BS { +static FUNC_PREFIX void d3q19storagespecification_localCopy_BS(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, 
int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_BW { +static FUNC_PREFIX void d3q19storagespecification_localCopy_BW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + for (int64_t ctr_1 
= 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_T { +static FUNC_PREFIX void d3q19storagespecification_localCopy_T(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_35 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 5*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_35 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 5*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_314 = 
_data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_35_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_35; + double * RESTRICT _data_pdfs_src_00_35_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_35; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_35_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_35_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_TN { +static FUNC_PREFIX void d3q19storagespecification_localCopy_TN(double * RESTRICT _data_pdfs_dst, double 
* RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_W { +static FUNC_PREFIX void d3q19storagespecification_localCopy_W(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_33 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 3*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_33 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 
3*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_33_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_33; + double * RESTRICT _data_pdfs_src_00_33_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_33; + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + 
_data_pdfs_dst_00_33_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_33_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_E { +static FUNC_PREFIX void d3q19storagespecification_localCopy_E(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_34 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 4*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_34 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 4*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_314 = 
_data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_34_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_34; + double * RESTRICT _data_pdfs_src_00_34_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_34; + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_34_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_34_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_TW { 
+static FUNC_PREFIX void d3q19storagespecification_localCopy_TW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_SW { +static FUNC_PREFIX void d3q19storagespecification_localCopy_SW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 
9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_NW { +static FUNC_PREFIX void d3q19storagespecification_localCopy_NW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_BN { +static FUNC_PREFIX void 
d3q19storagespecification_localCopy_BN(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_TE { +static FUNC_PREFIX void d3q19storagespecification_localCopy_TE(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT 
_data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_B { +static FUNC_PREFIX void d3q19storagespecification_localCopy_B(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_36 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 6*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_36 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 6*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_317 
= _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_36_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_36; + double * RESTRICT _data_pdfs_src_00_36_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_36; + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_36_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_36_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace 
internal_d3q19storagespecification_localCopy_N { +static FUNC_PREFIX void d3q19storagespecification_localCopy_N(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_31 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + _stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_31 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + _stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_31_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_31; + double * RESTRICT _data_pdfs_src_00_31_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_31; + double * RESTRICT 
_data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_31_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_31_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_S { +static FUNC_PREFIX void d3q19storagespecification_localCopy_S(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + 
{ + double * RESTRICT _data_pdfs_dst_00_32 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 2*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_32 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 2*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_32_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_32; + double * RESTRICT _data_pdfs_src_00_32_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_32; + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_dst_00_316_10 = 
_stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_32_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_32_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_SE { +static FUNC_PREFIX void d3q19storagespecification_localCopy_SE(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + 
_data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2];
         }
      }
   }
}
}




/*************************************************************************************
 * Kernel Wrappers
*************************************************************************************/

namespace walberla {
namespace lbm {

   // NOTE(review): these wrappers are auto-generated (the matching header is tagged
   // "\author lbmpy"); prefer changing the code-generation script over hand-editing.
   // Each wrapper validates the cell interval against the field's ghost layers,
   // derives sizes/strides, and forwards to the generated low-level kernel.

   // Packs the pdf values of every cell in 'ci' into 'outBuffer'.
   // 'outBuffer' is interpreted as a dense array of doubles; 'ci' must lie inside
   // 'pdfs_src' including its ghost layers (checked by the asserts below).
   void D3Q19StorageSpecification::PackKernels::packAll(PdfField_T * pdfs_src, CellInterval & ci, unsigned char * outBuffer) const
   {
      double * buffer = reinterpret_cast<double*>(outBuffer);
      double * RESTRICT _data_buffer = buffer;
      WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_src->nrOfGhostLayers()))
      WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_src->nrOfGhostLayers()))
      WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_src->nrOfGhostLayers()))
      double * RESTRICT const _data_pdfs_src = pdfs_src->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
      WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
      const int64_t _size_pdfs_src_0 = int64_t(int64_c(ci.xSize()) + 0);
      WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
      const int64_t _size_pdfs_src_1 = int64_t(int64_c(ci.ySize()) + 0);
      WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
      const int64_t _size_pdfs_src_2 = int64_t(int64_c(ci.zSize()) + 0);
      const int64_t _stride_pdfs_src_0 = int64_t(pdfs_src->xStride());
      const int64_t _stride_pdfs_src_1 = int64_t(pdfs_src->yStride());
      const int64_t _stride_pdfs_src_2 = int64_t(pdfs_src->zStride());
      const int64_t _stride_pdfs_src_3 = int64_t(1 * int64_t(pdfs_src->fStride()));
      internal_d3q19storagespecification_pack_ALL::d3q19storagespecification_pack_ALL(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
   }


   // Unpacks pdf values from 'inBuffer' into every cell of 'ci' (inverse of packAll).
   void D3Q19StorageSpecification::PackKernels::unpackAll(PdfField_T * pdfs_dst, CellInterval & ci, unsigned char * inBuffer) const
   {
      double * buffer = reinterpret_cast<double*>(inBuffer);
      double * RESTRICT const _data_buffer = buffer;
      WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_dst->nrOfGhostLayers()))
      WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_dst->nrOfGhostLayers()))
      WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_dst->nrOfGhostLayers()))
      double * RESTRICT _data_pdfs_dst = pdfs_dst->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
      WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
      const int64_t _size_pdfs_dst_0 = int64_t(int64_c(ci.xSize()) + 0);
      WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
      const int64_t _size_pdfs_dst_1 = int64_t(int64_c(ci.ySize()) + 0);
      WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
      const int64_t _size_pdfs_dst_2 = int64_t(int64_c(ci.zSize()) + 0);
      const int64_t _stride_pdfs_dst_0 = int64_t(pdfs_dst->xStride());
      const int64_t _stride_pdfs_dst_1 = int64_t(pdfs_dst->yStride());
      const int64_t _stride_pdfs_dst_2 = int64_t(pdfs_dst->zStride());
      const int64_t _stride_pdfs_dst_3 = int64_t(1 * int64_t(pdfs_dst->fStride()));
      internal_d3q19storagespecification_unpack_ALL::d3q19storagespecification_unpack_ALL(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3);
   }


   // Copies all pdfs of 'srcInterval' in 'pdfs_src' onto 'dstInterval' in 'pdfs_dst'
   // (same-process exchange, no intermediate buffer). Interval sizes must match.
   void D3Q19StorageSpecification::PackKernels::localCopyAll(PdfField_T * pdfs_src, CellInterval & srcInterval, PdfField_T * pdfs_dst, CellInterval & dstInterval) const
   {
      WALBERLA_ASSERT_EQUAL(srcInterval.xSize(), dstInterval.xSize())
      WALBERLA_ASSERT_EQUAL(srcInterval.ySize(), dstInterval.ySize())
      WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize())

      WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.xMin(), -int_c(pdfs_dst->nrOfGhostLayers()))
      WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.yMin(), -int_c(pdfs_dst->nrOfGhostLayers()))
      WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.zMin(), -int_c(pdfs_dst->nrOfGhostLayers()))
      double * RESTRICT _data_pdfs_dst = pdfs_dst->dataAt(dstInterval.xMin(), dstInterval.yMin(), dstInterval.zMin(), 0);
      WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.xMin(), -int_c(pdfs_src->nrOfGhostLayers()))
      WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.yMin(), -int_c(pdfs_src->nrOfGhostLayers()))
      WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.zMin(), -int_c(pdfs_src->nrOfGhostLayers()))
      double * RESTRICT const _data_pdfs_src = pdfs_src->dataAt(srcInterval.xMin(), srcInterval.yMin(), srcInterval.zMin(), 0);
      WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->xSizeWithGhostLayer(), int64_t(int64_c(dstInterval.xSize()) + 0))
      const int64_t _size_pdfs_dst_0 = int64_t(int64_c(dstInterval.xSize()) + 0);
      WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->ySizeWithGhostLayer(), int64_t(int64_c(dstInterval.ySize()) + 0))
      const int64_t _size_pdfs_dst_1 = int64_t(int64_c(dstInterval.ySize()) + 0);
      WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->zSizeWithGhostLayer(), int64_t(int64_c(dstInterval.zSize()) + 0))
      const int64_t _size_pdfs_dst_2 = int64_t(int64_c(dstInterval.zSize()) + 0);
      const int64_t _stride_pdfs_dst_0 = int64_t(pdfs_dst->xStride());
      const int64_t _stride_pdfs_dst_1 = int64_t(pdfs_dst->yStride());
      const int64_t _stride_pdfs_dst_2 = int64_t(pdfs_dst->zStride());
      const int64_t _stride_pdfs_dst_3 = int64_t(1 * int64_t(pdfs_dst->fStride()));
      const int64_t _stride_pdfs_src_0 = int64_t(pdfs_src->xStride());
      const int64_t _stride_pdfs_src_1 = int64_t(pdfs_src->yStride());
      const int64_t _stride_pdfs_src_2 = int64_t(pdfs_src->zStride());
      const int64_t _stride_pdfs_src_3 = int64_t(1 * int64_t(pdfs_src->fStride()));

      internal_d3q19storagespecification_localCopy_ALL::d3q19storagespecification_localCopy_ALL(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
   }

   // Packs only the pdfs streaming in directions aligned with 'dir' (per-direction
   // kernel chosen via the switch); unknown directions are a silent no-op (default).
   void D3Q19StorageSpecification::PackKernels::packDirection(PdfField_T * pdfs_src, CellInterval & ci, unsigned char * outBuffer, stencil::Direction dir) const
   {
      double * buffer = reinterpret_cast<double*>(outBuffer);
      double * RESTRICT _data_buffer = buffer;
      WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_src->nrOfGhostLayers()))
      WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_src->nrOfGhostLayers()))
      WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_src->nrOfGhostLayers()))
      double * RESTRICT const _data_pdfs_src = pdfs_src->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
      WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
      const int64_t _size_pdfs_src_0 = int64_t(int64_c(ci.xSize()) + 0);
      WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
      const int64_t _size_pdfs_src_1 = int64_t(int64_c(ci.ySize()) + 0);
      WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
      const int64_t _size_pdfs_src_2 = int64_t(int64_c(ci.zSize()) + 0);
      const int64_t _stride_pdfs_src_0 = int64_t(pdfs_src->xStride());
      const int64_t _stride_pdfs_src_1 = int64_t(pdfs_src->yStride());
      const int64_t _stride_pdfs_src_2 = int64_t(pdfs_src->zStride());
      const int64_t _stride_pdfs_src_3 = int64_t(1 * int64_t(pdfs_src->fStride()));
      switch (dir) {
         case stencil::N : {
            internal_d3q19storagespecification_pack_N::d3q19storagespecification_pack_N(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::S : {
            internal_d3q19storagespecification_pack_S::d3q19storagespecification_pack_S(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::W : {
            internal_d3q19storagespecification_pack_W::d3q19storagespecification_pack_W(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::E : {
            internal_d3q19storagespecification_pack_E::d3q19storagespecification_pack_E(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::T : {
            internal_d3q19storagespecification_pack_T::d3q19storagespecification_pack_T(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::B : {
            internal_d3q19storagespecification_pack_B::d3q19storagespecification_pack_B(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::NW : {
            internal_d3q19storagespecification_pack_NW::d3q19storagespecification_pack_NW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::NE : {
            internal_d3q19storagespecification_pack_NE::d3q19storagespecification_pack_NE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::SW : {
            internal_d3q19storagespecification_pack_SW::d3q19storagespecification_pack_SW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::SE : {
            internal_d3q19storagespecification_pack_SE::d3q19storagespecification_pack_SE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::TN : {
            internal_d3q19storagespecification_pack_TN::d3q19storagespecification_pack_TN(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::TS : {
            internal_d3q19storagespecification_pack_TS::d3q19storagespecification_pack_TS(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::TW : {
            internal_d3q19storagespecification_pack_TW::d3q19storagespecification_pack_TW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::TE : {
            internal_d3q19storagespecification_pack_TE::d3q19storagespecification_pack_TE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::BN : {
            internal_d3q19storagespecification_pack_BN::d3q19storagespecification_pack_BN(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::BS : {
            internal_d3q19storagespecification_pack_BS::d3q19storagespecification_pack_BS(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::BW : {
            internal_d3q19storagespecification_pack_BW::d3q19storagespecification_pack_BW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::BE : {
            internal_d3q19storagespecification_pack_BE::d3q19storagespecification_pack_BE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }default: break;
      }
   }

   // Unpacks only the pdfs streaming in directions aligned with 'dir' (inverse of
   // packDirection); unknown directions are a silent no-op (default).
   void D3Q19StorageSpecification::PackKernels::unpackDirection(PdfField_T * pdfs_dst, CellInterval & ci, unsigned char * inBuffer, stencil::Direction dir) const
   {
      double * buffer = reinterpret_cast<double*>(inBuffer);
      double * RESTRICT const _data_buffer = buffer;
      WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_dst->nrOfGhostLayers()))
      WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_dst->nrOfGhostLayers()))
      WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_dst->nrOfGhostLayers()))
      double * RESTRICT _data_pdfs_dst = pdfs_dst->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
      WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
      const int64_t _size_pdfs_dst_0 = int64_t(int64_c(ci.xSize()) + 0);
      WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
      const int64_t _size_pdfs_dst_1 = int64_t(int64_c(ci.ySize()) + 0);
      WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
      const int64_t _size_pdfs_dst_2 = int64_t(int64_c(ci.zSize()) + 0);
      const int64_t _stride_pdfs_dst_0 = int64_t(pdfs_dst->xStride());
      const int64_t _stride_pdfs_dst_1 = int64_t(pdfs_dst->yStride());
      const int64_t _stride_pdfs_dst_2 = int64_t(pdfs_dst->zStride());
      const int64_t _stride_pdfs_dst_3 = int64_t(1 * int64_t(pdfs_dst->fStride()));
      switch (dir) {
         case stencil::N : {
            internal_d3q19storagespecification_unpack_N::d3q19storagespecification_unpack_N(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3);
            break;
         }
         case stencil::S : {
            internal_d3q19storagespecification_unpack_S::d3q19storagespecification_unpack_S(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3);
            break;
         }
         case stencil::W : {
            internal_d3q19storagespecification_unpack_W::d3q19storagespecification_unpack_W(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3);
            break;
         }
         case stencil::E : {
            internal_d3q19storagespecification_unpack_E::d3q19storagespecification_unpack_E(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3);
            break;
         }
         case stencil::T : {
            internal_d3q19storagespecification_unpack_T::d3q19storagespecification_unpack_T(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3);
            break;
         }
         case stencil::B : {
            internal_d3q19storagespecification_unpack_B::d3q19storagespecification_unpack_B(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3);
            break;
         }
         case stencil::NW : {
            internal_d3q19storagespecification_unpack_NW::d3q19storagespecification_unpack_NW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3);
            break;
         }
         case stencil::NE : {
            internal_d3q19storagespecification_unpack_NE::d3q19storagespecification_unpack_NE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3);
            break;
         }
         case stencil::SW : {
            internal_d3q19storagespecification_unpack_SW::d3q19storagespecification_unpack_SW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3);
            break;
         }
         case stencil::SE : {
            internal_d3q19storagespecification_unpack_SE::d3q19storagespecification_unpack_SE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3);
            break;
         }
         case stencil::TN : {
            internal_d3q19storagespecification_unpack_TN::d3q19storagespecification_unpack_TN(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3);
            break;
         }
         case stencil::TS : {
            internal_d3q19storagespecification_unpack_TS::d3q19storagespecification_unpack_TS(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3);
            break;
         }
         case stencil::TW : {
            internal_d3q19storagespecification_unpack_TW::d3q19storagespecification_unpack_TW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3);
            break;
         }
         case stencil::TE : {
            internal_d3q19storagespecification_unpack_TE::d3q19storagespecification_unpack_TE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3);
            break;
         }
         case stencil::BN : {
            internal_d3q19storagespecification_unpack_BN::d3q19storagespecification_unpack_BN(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3);
            break;
         }
         case stencil::BS : {
            internal_d3q19storagespecification_unpack_BS::d3q19storagespecification_unpack_BS(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3);
            break;
         }
         case stencil::BW : {
            internal_d3q19storagespecification_unpack_BW::d3q19storagespecification_unpack_BW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3);
            break;
         }
         case stencil::BE : {
            internal_d3q19storagespecification_unpack_BE::d3q19storagespecification_unpack_BE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3);
            break;
         }default: break;
      }
   }

   // Same-process exchange of the direction-aligned pdfs: copies from 'srcInterval'
   // in 'pdfs_src' onto 'dstInterval' in 'pdfs_dst'. Interval sizes must match;
   // unknown directions are a silent no-op (default).
   void D3Q19StorageSpecification::PackKernels::localCopyDirection(PdfField_T * pdfs_src, CellInterval & srcInterval, PdfField_T * pdfs_dst, CellInterval & dstInterval, stencil::Direction dir) const
   {
      WALBERLA_ASSERT_EQUAL(srcInterval.xSize(), dstInterval.xSize())
      WALBERLA_ASSERT_EQUAL(srcInterval.ySize(), dstInterval.ySize())
      WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize())

      WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.xMin(), -int_c(pdfs_dst->nrOfGhostLayers()))
      WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.yMin(), -int_c(pdfs_dst->nrOfGhostLayers()))
      WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.zMin(), -int_c(pdfs_dst->nrOfGhostLayers()))
      double * RESTRICT _data_pdfs_dst = pdfs_dst->dataAt(dstInterval.xMin(), dstInterval.yMin(), dstInterval.zMin(), 0);
      WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.xMin(), -int_c(pdfs_src->nrOfGhostLayers()))
      WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.yMin(), -int_c(pdfs_src->nrOfGhostLayers()))
      WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.zMin(), -int_c(pdfs_src->nrOfGhostLayers()))
      double * RESTRICT const _data_pdfs_src = pdfs_src->dataAt(srcInterval.xMin(), srcInterval.yMin(), srcInterval.zMin(), 0);
      WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->xSizeWithGhostLayer(), int64_t(int64_c(dstInterval.xSize()) + 0))
      const int64_t _size_pdfs_dst_0 = int64_t(int64_c(dstInterval.xSize()) + 0);
      WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->ySizeWithGhostLayer(), int64_t(int64_c(dstInterval.ySize()) + 0))
      const int64_t _size_pdfs_dst_1 = int64_t(int64_c(dstInterval.ySize()) + 0);
      WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->zSizeWithGhostLayer(), int64_t(int64_c(dstInterval.zSize()) + 0))
      const int64_t _size_pdfs_dst_2 = int64_t(int64_c(dstInterval.zSize()) + 0);
      const int64_t _stride_pdfs_dst_0 = int64_t(pdfs_dst->xStride());
      const int64_t _stride_pdfs_dst_1 = int64_t(pdfs_dst->yStride());
      const int64_t _stride_pdfs_dst_2 = int64_t(pdfs_dst->zStride());
      const int64_t _stride_pdfs_dst_3 = int64_t(1 * int64_t(pdfs_dst->fStride()));
      const int64_t _stride_pdfs_src_0 = int64_t(pdfs_src->xStride());
      const int64_t _stride_pdfs_src_1 = int64_t(pdfs_src->yStride());
      const int64_t _stride_pdfs_src_2 = int64_t(pdfs_src->zStride());
      const int64_t _stride_pdfs_src_3 = int64_t(1 * int64_t(pdfs_src->fStride()));
      switch (dir) {
         case stencil::N : {
            internal_d3q19storagespecification_localCopy_N::d3q19storagespecification_localCopy_N(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::S : {
            internal_d3q19storagespecification_localCopy_S::d3q19storagespecification_localCopy_S(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::W : {
            internal_d3q19storagespecification_localCopy_W::d3q19storagespecification_localCopy_W(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::E : {
            internal_d3q19storagespecification_localCopy_E::d3q19storagespecification_localCopy_E(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::T : {
            internal_d3q19storagespecification_localCopy_T::d3q19storagespecification_localCopy_T(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::B : {
            internal_d3q19storagespecification_localCopy_B::d3q19storagespecification_localCopy_B(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::NW : {
            internal_d3q19storagespecification_localCopy_NW::d3q19storagespecification_localCopy_NW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::NE : {
            internal_d3q19storagespecification_localCopy_NE::d3q19storagespecification_localCopy_NE(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::SW : {
            internal_d3q19storagespecification_localCopy_SW::d3q19storagespecification_localCopy_SW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::SE : {
            internal_d3q19storagespecification_localCopy_SE::d3q19storagespecification_localCopy_SE(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::TN : {
            internal_d3q19storagespecification_localCopy_TN::d3q19storagespecification_localCopy_TN(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::TS : {
            internal_d3q19storagespecification_localCopy_TS::d3q19storagespecification_localCopy_TS(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::TW : {
            internal_d3q19storagespecification_localCopy_TW::d3q19storagespecification_localCopy_TW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::TE : {
            internal_d3q19storagespecification_localCopy_TE::d3q19storagespecification_localCopy_TE(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::BN : {
            internal_d3q19storagespecification_localCopy_BN::d3q19storagespecification_localCopy_BN(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::BS : {
            internal_d3q19storagespecification_localCopy_BS::d3q19storagespecification_localCopy_BS(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::BW : {
            internal_d3q19storagespecification_localCopy_BW::d3q19storagespecification_localCopy_BW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }
         case stencil::BE : {
            internal_d3q19storagespecification_localCopy_BE::d3q19storagespecification_localCopy_BE(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3);
            break;
         }default: break;
      }
   }


} // namespace lbm
} // namespace walberla
\ No newline at end of file
diff --git a/src/lbm_generated/storage_specification/D3Q19StorageSpecification.h b/src/lbm_generated/storage_specification/D3Q19StorageSpecification.h new file mode 100644 index 0000000000000000000000000000000000000000..5f0342741639be847c78026bacd7763e10f07463 --- /dev/null +++ b/src/lbm_generated/storage_specification/D3Q19StorageSpecification.h @@ -0,0 +1,147 @@
//======================================================================================================================
//
//  This file is part of waLBerla. waLBerla is free software: you can
//  redistribute it and/or modify it under the terms of the GNU General Public
//  License as published by the Free Software Foundation, either version 3 of
//  the License, or (at your option) any later version.
//
//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
//  for more details.
//
//  You should have received a copy of the GNU General Public License along
//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \\file D3Q19StorageSpecification.h
//!
\\author lbmpy
//======================================================================================================================

#pragma once

#include "core/DataTypes.h"
#include "core/cell/CellInterval.h"
#include "core/mpi/SendBuffer.h"
#include "core/mpi/RecvBuffer.h"

#include "domain_decomposition/IBlock.h"
#include "field/GhostLayerField.h"

#include "stencil/D3Q19.h"
#include "stencil/Directions.h"

#define FUNC_PREFIX

#ifdef __GNUC__
#define RESTRICT __restrict__
#elif _MSC_VER
#define RESTRICT __restrict
#else
#define RESTRICT
#endif

#if defined WALBERLA_CXX_COMPILER_IS_GNU || defined WALBERLA_CXX_COMPILER_IS_CLANG
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif

namespace walberla
{
namespace lbm{

// Generated (lbmpy) storage specification for the D3Q19 lattice: compile-time
// model properties plus the pack/unpack/local-copy kernels used for ghost-layer
// communication of the pdf field.
class D3Q19StorageSpecification
{
 public:
   // Used lattice stencil
   using Stencil = stencil::D3Q19;
   // Lattice stencil used for the communication (should be used to define which block directions need to be communicated)
   using CommunicationStencil = stencil::D3Q19;

   // If false used correction: Lattice Boltzmann Model for the Incompressible Navier–Stokes Equation, He 1997
   static const bool compressible = false;
   // Cut off for the lattice Boltzmann equilibrium
   static const int equilibriumAccuracyOrder = 2;

   // If streaming pattern is inplace (esotwist, aa, ...) or not (pull, push)
   static const bool inplace = false;

   // If true the background deviation (rho_0 = 1) is subtracted for the collision step.
   static const bool zeroCenteredPDFs = true;
   // If true the equilibrium is computed in regard to "delta_rho" and not the actual density "rho"
   static const bool deviationOnlyEquilibrium = true;

   // Compute kernels to pack and unpack MPI buffers
   class PackKernels {

    public:
      // Pdf field: 19 double values per cell, with ghost layers.
      using PdfField_T = field::GhostLayerField<double, 19>;
      using value_type = typename PdfField_T::value_type;

      static const bool inplace = false;

      /**
       * Packs all pdfs from the given cell interval to the send buffer.
       * */
      void packAll(PdfField_T * pdfs_src, CellInterval & ci, unsigned char * outBuffer) const;

      /**
       * Unpacks all pdfs from the send buffer to the given cell interval.
       * */
      void unpackAll(PdfField_T * pdfs_dst, CellInterval & ci, unsigned char * inBuffer) const;

      /**
       * Copies data between two blocks on the same process.
       * All pdfs from the sending interval are copied onto the receiving interval.
       * */
      void localCopyAll(PdfField_T * pdfs_src, CellInterval & srcInterval, PdfField_T * pdfs_dst, CellInterval & dstInterval) const;

      /**
       * Packs only those populations streaming in directions aligned with the sending direction dir from the given cell interval.
       * For example, in 2D, if dir == N, the pdfs streaming in directions NW, N, NE are packed.
       * */
      void packDirection(PdfField_T * pdfs_src, CellInterval & ci, unsigned char * outBuffer, stencil::Direction dir) const;

      /**
       * Unpacks only those populations streaming in directions aligned with the sending direction dir to the given cell interval.
       * For example, in 2D, if dir == N, the pdfs streaming in directions NW, N, NE are unpacked.
       * */
      void unpackDirection(PdfField_T * pdfs_dst, CellInterval & ci, unsigned char * inBuffer, stencil::Direction dir) const;

      /** Copies data between two blocks on the same process.
       * PDFs streaming aligned with the direction dir are copied from the sending interval onto the receiving interval.
       * */
      void localCopyDirection(PdfField_T * pdfs_src, CellInterval & srcInterval, PdfField_T * pdfs_dst, CellInterval & dstInterval, stencil::Direction dir) const;

      /**
       * Returns the number of bytes that will be packed from / unpacked to the cell interval
       * when using packDirection / unpackDirection
       * @param ci The cell interval
       * @param dir The communication direction
       * @return The required size of the buffer, in bytes
       * */
      uint_t size (CellInterval & ci, stencil::Direction dir) const {
         return ci.numCells() * sizes[dir] * sizeof(value_type);
      }

      /**
       * Returns the number of bytes that will be packed from / unpacked to the cell interval
       * when using packAll / unpackAll
       * @param ci The cell interval
       * @return The required size of the buffer, in bytes
       * */
      uint_t size (CellInterval & ci) const {
         return ci.numCells() * 19 * sizeof(value_type);
      }

    private:
      // Number of pdf values communicated per stencil direction, indexed by
      // stencil::Direction (0 for the center and for directions not in D3Q19).
      const uint_t sizes[27] { 0, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
   };

};

}} //lbm/walberla
\ No newline at end of file
diff --git a/src/lbm_generated/storage_specification/D3Q27StorageSpecification.cpp b/src/lbm_generated/storage_specification/D3Q27StorageSpecification.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3ecdf88928bf8292254465e0f4ec19d4a1106373 --- /dev/null +++ b/src/lbm_generated/storage_specification/D3Q27StorageSpecification.cpp @@ -0,0 +1,3099 @@
//======================================================================================================================
//
//  This file is part of waLBerla. waLBerla is free software: you can
//  redistribute it and/or modify it under the terms of the GNU General Public
//  License as published by the Free Software Foundation, either version 3 of
//  the License, or (at your option) any later version.
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file D3Q27StorageSpecification.cpp +//! \\author lbmpy +//====================================================================================================================== + +#include "D3Q27StorageSpecification.h" + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wfloat-equal" +# pragma GCC diagnostic ignored "-Wshadow" +# pragma GCC diagnostic ignored "-Wconversion" +# pragma GCC diagnostic ignored "-Wunused-variable" +#endif + +/************************************************************************************* + * Kernel Definitions +*************************************************************************************/ +namespace internal_d3q27storagespecification_pack_ALL { +static FUNC_PREFIX void d3q27storagespecification_pack_ALL(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_30 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0; + double * RESTRICT _data_pdfs_src_00_31 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + _stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_32 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 2*_stride_pdfs_src_3; + double * RESTRICT 
_data_pdfs_src_00_33 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 3*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_34 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 4*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_35 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 5*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_36 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 6*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT 
_data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_30_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_30; + double * RESTRICT _data_pdfs_src_00_31_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_31; + double * RESTRICT _data_pdfs_src_00_32_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_32; + double * RESTRICT _data_pdfs_src_00_33_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_33; + double * RESTRICT _data_pdfs_src_00_34_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_34; + double * RESTRICT _data_pdfs_src_00_35_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_35; + double * RESTRICT _data_pdfs_src_00_36_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_36; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_src_00_313_10 
= _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2] = _data_pdfs_src_00_30_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 1] = _data_pdfs_src_00_31_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 2] = _data_pdfs_src_00_32_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 3] = _data_pdfs_src_00_33_10[_stride_pdfs_src_2*ctr_2]; + 
_data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 4] = _data_pdfs_src_00_34_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 5] = _data_pdfs_src_00_35_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 6] = _data_pdfs_src_00_36_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 7] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 8] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 9] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 10] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 11] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 12] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 13] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 14] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 15] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 16] = 
_data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 17] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 18] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 19] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 20] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 21] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 22] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 23] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 24] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 25] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 26] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_ALL { +static FUNC_PREFIX void d3q27storagespecification_unpack_ALL(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const 
_stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_30 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0; + double * RESTRICT _data_pdfs_dst_00_31 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + _stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_32 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 2*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_33 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 3*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_34 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 4*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_35 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 5*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_36 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 6*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 
16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_30_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_30; + double * RESTRICT _data_pdfs_dst_00_31_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_31; + double * RESTRICT _data_pdfs_dst_00_32_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_32; + double * RESTRICT _data_pdfs_dst_00_33_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_33; + double * RESTRICT _data_pdfs_dst_00_34_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_34; + double * RESTRICT _data_pdfs_dst_00_35_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_35; + double * RESTRICT _data_pdfs_dst_00_36_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_36; + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT 
_data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_30_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 
27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2]; + _data_pdfs_dst_00_31_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 1]; + _data_pdfs_dst_00_32_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 2]; + _data_pdfs_dst_00_33_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 3]; + _data_pdfs_dst_00_34_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 4]; + _data_pdfs_dst_00_35_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 5]; + _data_pdfs_dst_00_36_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 6]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 7]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 8]; + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 9]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 10]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 11]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 12]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = 
_data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 13]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 14]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 15]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 16]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 17]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 18]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 19]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 20]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 21]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 22]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 23]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 24]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 25]; + 
_data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 26]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_ALL { +static FUNC_PREFIX void d3q27storagespecification_localCopy_ALL(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_30 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0; + double * RESTRICT _data_pdfs_src_00_30 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0; + double * RESTRICT _data_pdfs_dst_00_31 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + _stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_31 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + _stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_32 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 2*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_32 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 2*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_33 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 3*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_33 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 3*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_34 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 4*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_34 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 4*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_35 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 5*_stride_pdfs_dst_3; + double * RESTRICT 
_data_pdfs_src_00_35 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 5*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_36 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 6*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_36 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 6*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT 
_data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT 
_data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_30_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_30; + double * RESTRICT _data_pdfs_src_00_30_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_30; + double * RESTRICT _data_pdfs_dst_00_31_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_31; + double * RESTRICT _data_pdfs_src_00_31_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_31; + double * RESTRICT _data_pdfs_dst_00_32_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_32; + double * RESTRICT _data_pdfs_src_00_32_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_32; + double * RESTRICT _data_pdfs_dst_00_33_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_33; + double * RESTRICT _data_pdfs_src_00_33_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_33; + double * RESTRICT _data_pdfs_dst_00_34_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_34; + double * RESTRICT _data_pdfs_src_00_34_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_34; + double * RESTRICT _data_pdfs_dst_00_35_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_35; + double * RESTRICT _data_pdfs_src_00_35_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_35; + double * RESTRICT 
_data_pdfs_dst_00_36_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_36; + double * RESTRICT _data_pdfs_src_00_36_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_36; + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + 
_data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + double * RESTRICT 
_data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_30_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_30_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_31_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_31_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_32_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_32_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_33_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_33_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_34_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_34_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_35_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_35_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_36_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_36_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = 
_data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + + +namespace internal_d3q27storagespecification_pack_T { +static FUNC_PREFIX void d3q27storagespecification_pack_T(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_35 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 5*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT 
_data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_35_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_35; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2] = _data_pdfs_src_00_35_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 1] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 2] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + 
_data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 3] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 4] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 5] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 6] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 7] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 8] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_BN { +static FUNC_PREFIX void d3q27storagespecification_pack_BN(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + 
_data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_NE { +static FUNC_PREFIX void d3q27storagespecification_pack_NE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 
3*ctr_2] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_BNE { +static FUNC_PREFIX void d3q27storagespecification_pack_BNE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_SE { +static FUNC_PREFIX void d3q27storagespecification_pack_SE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * 
RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_TNW { +static FUNC_PREFIX void d3q27storagespecification_pack_TNW(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + 
_data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_W { +static FUNC_PREFIX void d3q27storagespecification_pack_W(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_33 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 3*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_33_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_33; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + 
double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2] = _data_pdfs_src_00_33_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 1] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 2] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 3] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 4] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 5] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 6] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 7] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 8] = 
_data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_TE { +static FUNC_PREFIX void d3q27storagespecification_pack_TE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_N { +static FUNC_PREFIX void d3q27storagespecification_pack_N(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const 
_size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_31 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + _stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_31_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_31; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + 
_data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2] = _data_pdfs_src_00_31_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 1] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 2] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 3] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 4] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 5] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 6] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 7] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 8] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_BSW { +static FUNC_PREFIX void d3q27storagespecification_pack_BSW(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, 
int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_TSW { +static FUNC_PREFIX void d3q27storagespecification_pack_TSW(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_BE { +static FUNC_PREFIX void d3q27storagespecification_pack_BE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, 
int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_B { +static FUNC_PREFIX void d3q27storagespecification_pack_B(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_36 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 
6*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_36_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_36; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 
9*_size_pdfs_src_2*ctr_1 + 9*ctr_2] = _data_pdfs_src_00_36_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 1] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 2] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 3] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 4] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 5] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 6] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 7] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 8] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_TNE { +static FUNC_PREFIX void d3q27storagespecification_pack_TNE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) 
+ { + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_TS { +static FUNC_PREFIX void d3q27storagespecification_pack_TS(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = 
_data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_TN { +static FUNC_PREFIX void d3q27storagespecification_pack_TN(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_BNW { +static FUNC_PREFIX void d3q27storagespecification_pack_BNW(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const 
_size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_TW { +static FUNC_PREFIX void d3q27storagespecification_pack_TW(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + for (int64_t ctr_2 
= 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_BSE { +static FUNC_PREFIX void d3q27storagespecification_pack_BSE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_NW { +static FUNC_PREFIX void d3q27storagespecification_pack_NW(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; 
ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_S { +static FUNC_PREFIX void d3q27storagespecification_pack_S(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_32 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 2*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_310 = 
_data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_32_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_32; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2] = _data_pdfs_src_00_32_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 1] = 
_data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 3] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 4] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 5] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 6] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 7] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 8] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_BS { +static FUNC_PREFIX void d3q27storagespecification_pack_BS(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < 
_size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_TSE { +static FUNC_PREFIX void d3q27storagespecification_pack_TSE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_SW { +static FUNC_PREFIX void d3q27storagespecification_pack_SW(double * RESTRICT 
_data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_BW { +static FUNC_PREFIX void d3q27storagespecification_pack_BW(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const 
_stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_E { +static FUNC_PREFIX void d3q27storagespecification_pack_E(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_34 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 4*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + 
_stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_34_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_34; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2] = _data_pdfs_src_00_34_10[_stride_pdfs_src_2*ctr_2]; + 
_data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 1] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 3] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 4] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 5] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 6] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 7] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 8] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_TSE { +static FUNC_PREFIX void d3q27storagespecification_unpack_TSE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + 
_data_pdfs_dst_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_T { +static FUNC_PREFIX void d3q27storagespecification_unpack_T(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_36 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 6*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_36_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_36; + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + 
_data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_36_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 1]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 2]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 3]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 4]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 5]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 6]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 7]; + 
_data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 8]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_TN { +static FUNC_PREFIX void d3q27storagespecification_unpack_TN(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_SW { +static FUNC_PREFIX void 
d3q27storagespecification_unpack_SW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_TNE { +static FUNC_PREFIX void d3q27storagespecification_unpack_TNE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const 
_stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_BN { +static FUNC_PREFIX void d3q27storagespecification_unpack_BN(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = 
_data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_W { +static FUNC_PREFIX void d3q27storagespecification_unpack_W(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_34 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 4*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * 
RESTRICT _data_pdfs_dst_00_34_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_34; + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_34_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 1]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 2]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 3]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 4]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 5]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 6]; + 
_data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 7]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 8]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_E { +static FUNC_PREFIX void d3q27storagespecification_unpack_E(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_33 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 3*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_33_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_33; + double * RESTRICT 
_data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_33_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 1]; + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 2]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 3]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 4]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 5]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 6]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 
9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 7]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 8]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_BNE { +static FUNC_PREFIX void d3q27storagespecification_unpack_BNE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_TNW { +static FUNC_PREFIX void d3q27storagespecification_unpack_TNW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + 
_data_pdfs_dst_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_BSE { +static FUNC_PREFIX void d3q27storagespecification_unpack_BSE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_BSW { +static FUNC_PREFIX void d3q27storagespecification_unpack_BSW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_319_10 = 
_stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_SE { +static FUNC_PREFIX void d3q27storagespecification_unpack_SE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } 
+ } +} +} + +namespace internal_d3q27storagespecification_unpack_N { +static FUNC_PREFIX void d3q27storagespecification_unpack_N(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_32 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 2*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_32_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_32; + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT 
_data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_32_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2]; + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 1]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 2]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 3]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 4]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 5]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 6]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 7]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 8]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_NE { +static 
FUNC_PREFIX void d3q27storagespecification_unpack_NE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_TE { +static FUNC_PREFIX void d3q27storagespecification_unpack_TE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t 
const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_B { +static FUNC_PREFIX void d3q27storagespecification_unpack_B(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_35 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 
5*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_35_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_35; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_35_10[_stride_pdfs_dst_2*ctr_2] = 
_data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 1]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 2]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 3]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 4]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 5]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 6]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 7]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 8]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_NW { +static FUNC_PREFIX void d3q27storagespecification_unpack_NW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_321 = 
_data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_S { +static FUNC_PREFIX void d3q27storagespecification_unpack_S(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_31 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + _stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 
11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_31_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_31; + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_31_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 1]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 
9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 2]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 3]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 4]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 5]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 6]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 7]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 8]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_TSW { +static FUNC_PREFIX void d3q27storagespecification_unpack_TSW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace 
internal_d3q27storagespecification_unpack_BE { +static FUNC_PREFIX void d3q27storagespecification_unpack_BE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_BS { +static FUNC_PREFIX void d3q27storagespecification_unpack_BS(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const 
_size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_BW { +static FUNC_PREFIX void d3q27storagespecification_unpack_BW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT 
_data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_TS { +static FUNC_PREFIX void d3q27storagespecification_unpack_TS(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + 
_stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_BNW { +static FUNC_PREFIX void d3q27storagespecification_unpack_BNW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace 
internal_d3q27storagespecification_unpack_TW { +static FUNC_PREFIX void d3q27storagespecification_unpack_TW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_SE { +static FUNC_PREFIX void d3q27storagespecification_localCopy_SE(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t 
const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + 
_data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_TS { +static FUNC_PREFIX void d3q27storagespecification_localCopy_TS(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * 
RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_BNW { +static FUNC_PREFIX void d3q27storagespecification_localCopy_BNW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_TSW { +static FUNC_PREFIX void d3q27storagespecification_localCopy_TSW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const 
_data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_TE { +static FUNC_PREFIX void d3q27storagespecification_localCopy_TE(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + 
double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_TNE { +static FUNC_PREFIX void d3q27storagespecification_localCopy_TNE(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const 
_stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_BS { +static FUNC_PREFIX void d3q27storagespecification_localCopy_BS(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + 
_stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_W { +static FUNC_PREFIX void d3q27storagespecification_localCopy_W(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_33 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 3*_stride_pdfs_dst_3; + double * RESTRICT 
_data_pdfs_src_00_33 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 3*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_33_10 = _stride_pdfs_dst_1*ctr_1 + 
_data_pdfs_dst_00_33; + double * RESTRICT _data_pdfs_src_00_33_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_33; + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_33_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_33_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = 
_data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_TSE { +static FUNC_PREFIX void d3q27storagespecification_localCopy_TSE(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = 
_data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_NE { +static FUNC_PREFIX void d3q27storagespecification_localCopy_NE(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + 
_data_pdfs_src_00_323; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_B { +static FUNC_PREFIX void d3q27storagespecification_localCopy_B(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_36 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 6*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_36 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 6*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_318 = 
_data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_36_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_36; + double * RESTRICT _data_pdfs_src_00_36_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_36; + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + 
_data_pdfs_dst_00_318; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_36_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_36_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace 
internal_d3q27storagespecification_localCopy_TNW { +static FUNC_PREFIX void d3q27storagespecification_localCopy_TNW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_NW { +static FUNC_PREFIX void d3q27storagespecification_localCopy_NW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_37 = 
_data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_BN { +static FUNC_PREFIX void d3q27storagespecification_localCopy_BN(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const 
_stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = 
_data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_TW { +static FUNC_PREFIX void d3q27storagespecification_localCopy_TW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * RESTRICT _data_pdfs_src_00_322_10 = 
_stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_BW { +static FUNC_PREFIX void d3q27storagespecification_localCopy_BW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 
+ _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_SW { +static FUNC_PREFIX void d3q27storagespecification_localCopy_SW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 
26*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_T { +static FUNC_PREFIX void d3q27storagespecification_localCopy_T(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_35 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 5*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_35 = _data_pdfs_src + 
_stride_pdfs_src_0*ctr_0 + 5*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_35_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_35; + double * 
RESTRICT _data_pdfs_src_00_35_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_35; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_35_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_35_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = 
_data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_BSW { +static FUNC_PREFIX void d3q27storagespecification_localCopy_BSW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = 
_data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_S { +static FUNC_PREFIX void d3q27storagespecification_localCopy_S(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_32 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 2*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_32 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 2*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + 
_stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_32_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_32; + double * RESTRICT _data_pdfs_src_00_32_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_32; + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_dst_00_322_10 = 
_stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_32_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_32_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_TN { +static FUNC_PREFIX void d3q27storagespecification_localCopy_TN(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const 
_stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace 
internal_d3q27storagespecification_localCopy_E { +static FUNC_PREFIX void d3q27storagespecification_localCopy_E(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_34 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 4*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_34 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 4*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_321 = 
_data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_34_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_34; + double * RESTRICT _data_pdfs_src_00_34_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_34; + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_src_00_321_10 = 
_stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_34_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_34_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_N { +static FUNC_PREFIX void d3q27storagespecification_localCopy_N(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const 
_stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_31 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + _stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_31 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + _stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + 
double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_31_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_31; + double * RESTRICT _data_pdfs_src_00_31_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_31; + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + 
_data_pdfs_dst_00_31_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_31_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_BSE { +static FUNC_PREFIX void d3q27storagespecification_localCopy_BSE(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + 
_data_pdfs_dst_00_325; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_BE { +static FUNC_PREFIX void d3q27storagespecification_localCopy_BE(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT 
_data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_BNE { +static FUNC_PREFIX void d3q27storagespecification_localCopy_BNE(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + 
+ + + +/************************************************************************************* + * Kernel Wrappers +*************************************************************************************/ + +namespace walberla { +namespace lbm { + + void D3Q27StorageSpecification::PackKernels::packAll(PdfField_T * pdfs_src, CellInterval & ci, unsigned char * outBuffer) const + { + double * buffer = reinterpret_cast<double*>(outBuffer); + double * RESTRICT _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_src->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs_src = pdfs_src->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_src_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_src_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_src_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_pdfs_src_0 = int64_t(pdfs_src->xStride()); + const int64_t _stride_pdfs_src_1 = int64_t(pdfs_src->yStride()); + const int64_t _stride_pdfs_src_2 = int64_t(pdfs_src->zStride()); + const int64_t _stride_pdfs_src_3 = int64_t(1 * int64_t(pdfs_src->fStride())); + internal_d3q27storagespecification_pack_ALL::d3q27storagespecification_pack_ALL(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + } + + + void D3Q27StorageSpecification::PackKernels::unpackAll(PdfField_T * pdfs_dst, CellInterval & ci, unsigned char * inBuffer) const + { + 
double * buffer = reinterpret_cast<double*>(inBuffer); + double * RESTRICT const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_dst = pdfs_dst->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_dst_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_dst_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_dst_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_pdfs_dst_0 = int64_t(pdfs_dst->xStride()); + const int64_t _stride_pdfs_dst_1 = int64_t(pdfs_dst->yStride()); + const int64_t _stride_pdfs_dst_2 = int64_t(pdfs_dst->zStride()); + const int64_t _stride_pdfs_dst_3 = int64_t(1 * int64_t(pdfs_dst->fStride())); + internal_d3q27storagespecification_unpack_ALL::d3q27storagespecification_unpack_ALL(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + } + + + void D3Q27StorageSpecification::PackKernels::localCopyAll(PdfField_T * pdfs_src, CellInterval & srcInterval, PdfField_T * pdfs_dst, CellInterval & dstInterval) const + { + WALBERLA_ASSERT_EQUAL(srcInterval.xSize(), dstInterval.xSize()) + WALBERLA_ASSERT_EQUAL(srcInterval.ySize(), dstInterval.ySize()) + WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize()) + + WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.xMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.yMin(), 
-int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.zMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_dst = pdfs_dst->dataAt(dstInterval.xMin(), dstInterval.yMin(), dstInterval.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.xMin(), -int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.yMin(), -int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.zMin(), -int_c(pdfs_src->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs_src = pdfs_src->dataAt(srcInterval.xMin(), srcInterval.yMin(), srcInterval.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->xSizeWithGhostLayer(), int64_t(int64_c(dstInterval.xSize()) + 0)) + const int64_t _size_pdfs_dst_0 = int64_t(int64_c(dstInterval.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->ySizeWithGhostLayer(), int64_t(int64_c(dstInterval.ySize()) + 0)) + const int64_t _size_pdfs_dst_1 = int64_t(int64_c(dstInterval.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->zSizeWithGhostLayer(), int64_t(int64_c(dstInterval.zSize()) + 0)) + const int64_t _size_pdfs_dst_2 = int64_t(int64_c(dstInterval.zSize()) + 0); + const int64_t _stride_pdfs_dst_0 = int64_t(pdfs_dst->xStride()); + const int64_t _stride_pdfs_dst_1 = int64_t(pdfs_dst->yStride()); + const int64_t _stride_pdfs_dst_2 = int64_t(pdfs_dst->zStride()); + const int64_t _stride_pdfs_dst_3 = int64_t(1 * int64_t(pdfs_dst->fStride())); + const int64_t _stride_pdfs_src_0 = int64_t(pdfs_src->xStride()); + const int64_t _stride_pdfs_src_1 = int64_t(pdfs_src->yStride()); + const int64_t _stride_pdfs_src_2 = int64_t(pdfs_src->zStride()); + const int64_t _stride_pdfs_src_3 = int64_t(1 * int64_t(pdfs_src->fStride())); + internal_d3q27storagespecification_localCopy_ALL::d3q27storagespecification_localCopy_ALL(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, 
_stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + } + + void D3Q27StorageSpecification::PackKernels::packDirection(PdfField_T * pdfs_src, CellInterval & ci, unsigned char * outBuffer, stencil::Direction dir) const + { + double * buffer = reinterpret_cast<double*>(outBuffer); + double * RESTRICT _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_src->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs_src = pdfs_src->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_src_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_src_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_src_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_pdfs_src_0 = int64_t(pdfs_src->xStride()); + const int64_t _stride_pdfs_src_1 = int64_t(pdfs_src->yStride()); + const int64_t _stride_pdfs_src_2 = int64_t(pdfs_src->zStride()); + const int64_t _stride_pdfs_src_3 = int64_t(1 * int64_t(pdfs_src->fStride())); + switch (dir) { + case stencil::N : { + internal_d3q27storagespecification_pack_N::d3q27storagespecification_pack_N(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::S : { + internal_d3q27storagespecification_pack_S::d3q27storagespecification_pack_S(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, 
_stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::W : { + internal_d3q27storagespecification_pack_W::d3q27storagespecification_pack_W(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::E : { + internal_d3q27storagespecification_pack_E::d3q27storagespecification_pack_E(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::T : { + internal_d3q27storagespecification_pack_T::d3q27storagespecification_pack_T(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::B : { + internal_d3q27storagespecification_pack_B::d3q27storagespecification_pack_B(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::NW : { + internal_d3q27storagespecification_pack_NW::d3q27storagespecification_pack_NW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::NE : { + internal_d3q27storagespecification_pack_NE::d3q27storagespecification_pack_NE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::SW : { + internal_d3q27storagespecification_pack_SW::d3q27storagespecification_pack_SW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, 
_stride_pdfs_src_3); + break; + } + case stencil::SE : { + internal_d3q27storagespecification_pack_SE::d3q27storagespecification_pack_SE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TN : { + internal_d3q27storagespecification_pack_TN::d3q27storagespecification_pack_TN(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TS : { + internal_d3q27storagespecification_pack_TS::d3q27storagespecification_pack_TS(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TW : { + internal_d3q27storagespecification_pack_TW::d3q27storagespecification_pack_TW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TE : { + internal_d3q27storagespecification_pack_TE::d3q27storagespecification_pack_TE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BN : { + internal_d3q27storagespecification_pack_BN::d3q27storagespecification_pack_BN(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BS : { + internal_d3q27storagespecification_pack_BS::d3q27storagespecification_pack_BS(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + 
} + case stencil::BW : { + internal_d3q27storagespecification_pack_BW::d3q27storagespecification_pack_BW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BE : { + internal_d3q27storagespecification_pack_BE::d3q27storagespecification_pack_BE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TNE : { + internal_d3q27storagespecification_pack_TNE::d3q27storagespecification_pack_TNE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TNW : { + internal_d3q27storagespecification_pack_TNW::d3q27storagespecification_pack_TNW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TSE : { + internal_d3q27storagespecification_pack_TSE::d3q27storagespecification_pack_TSE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TSW : { + internal_d3q27storagespecification_pack_TSW::d3q27storagespecification_pack_TSW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BNE : { + internal_d3q27storagespecification_pack_BNE::d3q27storagespecification_pack_BNE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case 
stencil::BNW : { + internal_d3q27storagespecification_pack_BNW::d3q27storagespecification_pack_BNW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BSE : { + internal_d3q27storagespecification_pack_BSE::d3q27storagespecification_pack_BSE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BSW : { + internal_d3q27storagespecification_pack_BSW::d3q27storagespecification_pack_BSW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + }default: break; + } + } + + void D3Q27StorageSpecification::PackKernels::unpackDirection(PdfField_T * pdfs_dst, CellInterval & ci, unsigned char * inBuffer, stencil::Direction dir) const + { + double * buffer = reinterpret_cast<double*>(inBuffer); + double * RESTRICT const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_dst = pdfs_dst->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_dst_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_dst_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_dst_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t 
_stride_pdfs_dst_0 = int64_t(pdfs_dst->xStride()); + const int64_t _stride_pdfs_dst_1 = int64_t(pdfs_dst->yStride()); + const int64_t _stride_pdfs_dst_2 = int64_t(pdfs_dst->zStride()); + const int64_t _stride_pdfs_dst_3 = int64_t(1 * int64_t(pdfs_dst->fStride())); + switch (dir) { + case stencil::N : { + internal_d3q27storagespecification_unpack_N::d3q27storagespecification_unpack_N(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::S : { + internal_d3q27storagespecification_unpack_S::d3q27storagespecification_unpack_S(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::W : { + internal_d3q27storagespecification_unpack_W::d3q27storagespecification_unpack_W(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::E : { + internal_d3q27storagespecification_unpack_E::d3q27storagespecification_unpack_E(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::T : { + internal_d3q27storagespecification_unpack_T::d3q27storagespecification_unpack_T(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::B : { + internal_d3q27storagespecification_unpack_B::d3q27storagespecification_unpack_B(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::NW : { + 
internal_d3q27storagespecification_unpack_NW::d3q27storagespecification_unpack_NW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::NE : { + internal_d3q27storagespecification_unpack_NE::d3q27storagespecification_unpack_NE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::SW : { + internal_d3q27storagespecification_unpack_SW::d3q27storagespecification_unpack_SW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::SE : { + internal_d3q27storagespecification_unpack_SE::d3q27storagespecification_unpack_SE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::TN : { + internal_d3q27storagespecification_unpack_TN::d3q27storagespecification_unpack_TN(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::TS : { + internal_d3q27storagespecification_unpack_TS::d3q27storagespecification_unpack_TS(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::TW : { + internal_d3q27storagespecification_unpack_TW::d3q27storagespecification_unpack_TW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::TE : { + 
internal_d3q27storagespecification_unpack_TE::d3q27storagespecification_unpack_TE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::BN : { + internal_d3q27storagespecification_unpack_BN::d3q27storagespecification_unpack_BN(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::BS : { + internal_d3q27storagespecification_unpack_BS::d3q27storagespecification_unpack_BS(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::BW : { + internal_d3q27storagespecification_unpack_BW::d3q27storagespecification_unpack_BW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::BE : { + internal_d3q27storagespecification_unpack_BE::d3q27storagespecification_unpack_BE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::TNE : { + internal_d3q27storagespecification_unpack_TNE::d3q27storagespecification_unpack_TNE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::TNW : { + internal_d3q27storagespecification_unpack_TNW::d3q27storagespecification_unpack_TNW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::TSE : { 
+ internal_d3q27storagespecification_unpack_TSE::d3q27storagespecification_unpack_TSE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::TSW : { + internal_d3q27storagespecification_unpack_TSW::d3q27storagespecification_unpack_TSW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::BNE : { + internal_d3q27storagespecification_unpack_BNE::d3q27storagespecification_unpack_BNE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::BNW : { + internal_d3q27storagespecification_unpack_BNW::d3q27storagespecification_unpack_BNW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::BSE : { + internal_d3q27storagespecification_unpack_BSE::d3q27storagespecification_unpack_BSE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::BSW : { + internal_d3q27storagespecification_unpack_BSW::d3q27storagespecification_unpack_BSW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + }default: break; + } + } + + void D3Q27StorageSpecification::PackKernels::localCopyDirection(PdfField_T * pdfs_src, CellInterval & srcInterval, PdfField_T * pdfs_dst, CellInterval & dstInterval, stencil::Direction dir) const + { + WALBERLA_ASSERT_EQUAL(srcInterval.xSize(), dstInterval.xSize()) + 
WALBERLA_ASSERT_EQUAL(srcInterval.ySize(), dstInterval.ySize()) + WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize()) + + WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.xMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.yMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.zMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_dst = pdfs_dst->dataAt(dstInterval.xMin(), dstInterval.yMin(), dstInterval.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.xMin(), -int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.yMin(), -int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.zMin(), -int_c(pdfs_src->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs_src = pdfs_src->dataAt(srcInterval.xMin(), srcInterval.yMin(), srcInterval.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->xSizeWithGhostLayer(), int64_t(int64_c(dstInterval.xSize()) + 0)) + const int64_t _size_pdfs_dst_0 = int64_t(int64_c(dstInterval.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->ySizeWithGhostLayer(), int64_t(int64_c(dstInterval.ySize()) + 0)) + const int64_t _size_pdfs_dst_1 = int64_t(int64_c(dstInterval.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->zSizeWithGhostLayer(), int64_t(int64_c(dstInterval.zSize()) + 0)) + const int64_t _size_pdfs_dst_2 = int64_t(int64_c(dstInterval.zSize()) + 0); + const int64_t _stride_pdfs_dst_0 = int64_t(pdfs_dst->xStride()); + const int64_t _stride_pdfs_dst_1 = int64_t(pdfs_dst->yStride()); + const int64_t _stride_pdfs_dst_2 = int64_t(pdfs_dst->zStride()); + const int64_t _stride_pdfs_dst_3 = int64_t(1 * int64_t(pdfs_dst->fStride())); + const int64_t _stride_pdfs_src_0 = int64_t(pdfs_src->xStride()); + const int64_t _stride_pdfs_src_1 = int64_t(pdfs_src->yStride()); + const int64_t _stride_pdfs_src_2 = int64_t(pdfs_src->zStride()); + const int64_t _stride_pdfs_src_3 = int64_t(1 * 
int64_t(pdfs_src->fStride())); + switch (dir) { + case stencil::N : { + internal_d3q27storagespecification_localCopy_N::d3q27storagespecification_localCopy_N(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::S : { + internal_d3q27storagespecification_localCopy_S::d3q27storagespecification_localCopy_S(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::W : { + internal_d3q27storagespecification_localCopy_W::d3q27storagespecification_localCopy_W(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::E : { + internal_d3q27storagespecification_localCopy_E::d3q27storagespecification_localCopy_E(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::T : { + internal_d3q27storagespecification_localCopy_T::d3q27storagespecification_localCopy_T(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::B : { + 
internal_d3q27storagespecification_localCopy_B::d3q27storagespecification_localCopy_B(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::NW : { + internal_d3q27storagespecification_localCopy_NW::d3q27storagespecification_localCopy_NW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::NE : { + internal_d3q27storagespecification_localCopy_NE::d3q27storagespecification_localCopy_NE(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::SW : { + internal_d3q27storagespecification_localCopy_SW::d3q27storagespecification_localCopy_SW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::SE : { + internal_d3q27storagespecification_localCopy_SE::d3q27storagespecification_localCopy_SE(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TN : { + internal_d3q27storagespecification_localCopy_TN::d3q27storagespecification_localCopy_TN(_data_pdfs_dst, _data_pdfs_src, 
_size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TS : { + internal_d3q27storagespecification_localCopy_TS::d3q27storagespecification_localCopy_TS(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TW : { + internal_d3q27storagespecification_localCopy_TW::d3q27storagespecification_localCopy_TW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TE : { + internal_d3q27storagespecification_localCopy_TE::d3q27storagespecification_localCopy_TE(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BN : { + internal_d3q27storagespecification_localCopy_BN::d3q27storagespecification_localCopy_BN(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BS : { + internal_d3q27storagespecification_localCopy_BS::d3q27storagespecification_localCopy_BS(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, 
_stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BW : { + internal_d3q27storagespecification_localCopy_BW::d3q27storagespecification_localCopy_BW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BE : { + internal_d3q27storagespecification_localCopy_BE::d3q27storagespecification_localCopy_BE(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TNE : { + internal_d3q27storagespecification_localCopy_TNE::d3q27storagespecification_localCopy_TNE(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TNW : { + internal_d3q27storagespecification_localCopy_TNW::d3q27storagespecification_localCopy_TNW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TSE : { + internal_d3q27storagespecification_localCopy_TSE::d3q27storagespecification_localCopy_TSE(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TSW : { + 
internal_d3q27storagespecification_localCopy_TSW::d3q27storagespecification_localCopy_TSW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BNE : { + internal_d3q27storagespecification_localCopy_BNE::d3q27storagespecification_localCopy_BNE(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BNW : { + internal_d3q27storagespecification_localCopy_BNW::d3q27storagespecification_localCopy_BNW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BSE : { + internal_d3q27storagespecification_localCopy_BSE::d3q27storagespecification_localCopy_BSE(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BSW : { + internal_d3q27storagespecification_localCopy_BSW::d3q27storagespecification_localCopy_BSW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + }default: break; + } + } + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git 
a/src/lbm_generated/storage_specification/D3Q27StorageSpecification.h b/src/lbm_generated/storage_specification/D3Q27StorageSpecification.h new file mode 100644 index 0000000000000000000000000000000000000000..49aa692873b51ba5f25590faf9faffd9a018afdc --- /dev/null +++ b/src/lbm_generated/storage_specification/D3Q27StorageSpecification.h @@ -0,0 +1,147 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file D3Q27StorageSpecification.h +//! 
\\author lbmpy +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" +#include "core/cell/CellInterval.h" +#include "core/mpi/SendBuffer.h" +#include "core/mpi/RecvBuffer.h" + +#include "domain_decomposition/IBlock.h" +#include "field/GhostLayerField.h" + +#include "stencil/D3Q27.h" +#include "stencil/Directions.h" + +#define FUNC_PREFIX + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +#if defined WALBERLA_CXX_COMPILER_IS_GNU || defined WALBERLA_CXX_COMPILER_IS_CLANG +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-parameter" +#endif + +namespace walberla +{ +namespace lbm{ + +class D3Q27StorageSpecification +{ + public: + // Used lattice stencil + using Stencil = stencil::D3Q27; + // Lattice stencil used for the communication (should be used to define which block directions need to be communicated) + using CommunicationStencil = stencil::D3Q27; + + // If false used correction: Lattice Boltzmann Model for the Incompressible Navier–Stokes Equation, He 1997 + static const bool compressible = false; + // Cut off for the lattice Boltzmann equilibrium + static const int equilibriumAccuracyOrder = 2; + + // If streaming pattern is inplace (esotwist, aa, ...) or not (pull, push) + static const bool inplace = false; + + // If true the background deviation (rho_0 = 1) is subtracted for the collision step. 
+ static const bool zeroCenteredPDFs = true; + // If true the equilibrium is computed in regard to "delta_rho" and not the actual density "rho" + static const bool deviationOnlyEquilibrium = true; + + // Compute kernels to pack and unpack MPI buffers + class PackKernels { + + public: + using PdfField_T = field::GhostLayerField<double, 27>; + using value_type = typename PdfField_T::value_type; + + + + static const bool inplace = false; + + /** + * Packs all pdfs from the given cell interval to the send buffer. + * */ + void packAll(PdfField_T * pdfs_src, CellInterval & ci, unsigned char * outBuffer) const; + + /** + * Unpacks all pdfs from the send buffer to the given cell interval. + * */ + void unpackAll(PdfField_T * pdfs_dst, CellInterval & ci, unsigned char * inBuffer) const; + + /** + * Copies data between two blocks on the same process. + * All pdfs from the sending interval are copied onto the receiving interval. + * */ + void localCopyAll(PdfField_T * pdfs_src, CellInterval & srcInterval, PdfField_T * pdfs_dst, CellInterval & dstInterval) const; + + /** + * Packs only those populations streaming in directions aligned with the sending direction dir from the given cell interval. + * For example, in 2D, if dir == N, the pdfs streaming in directions NW, N, NE are packed. + * */ + void packDirection(PdfField_T * pdfs_src, CellInterval & ci, unsigned char * outBuffer, stencil::Direction dir) const; + + /** + * Unpacks only those populations streaming in directions aligned with the sending direction dir to the given cell interval. + * For example, in 2D, if dir == N, the pdfs streaming in directions NW, N, NE are unpacked. + * */ + void unpackDirection(PdfField_T * pdfs_dst, CellInterval & ci, unsigned char * inBuffer, stencil::Direction dir) const; + + /** Copies data between two blocks on the same process. + * PDFs streaming aligned with the direction dir are copied from the sending interval onto the receiving interval. 
+ * */ + void localCopyDirection(PdfField_T * pdfs_src, CellInterval & srcInterval, PdfField_T * pdfs_dst, CellInterval & dstInterval, stencil::Direction dir) const; + + /** + * Returns the number of bytes that will be packed from / unpacked to the cell interval + * when using packDirection / unpackDirection + * @param ci The cell interval + * @param dir The communication direction + * @return The required size of the buffer, in bytes + * */ + uint_t size (CellInterval & ci, stencil::Direction dir) const { + return ci.numCells() * sizes[dir] * sizeof(value_type); + } + + /** + * Returns the number of bytes that will be packed from / unpacked to the cell interval + * when using packAll / unpackAll + * @param ci The cell interval + * @return The required size of the buffer, in bytes + * */ + uint_t size (CellInterval & ci) const { + return ci.numCells() * 27 * sizeof(value_type); + } + + + + private: + const uint_t sizes[27] { 0, 9, 9, 9, 9, 9, 9, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1 }; + }; + +}; + +}} //lbm/walberla \ No newline at end of file diff --git a/src/lbm_generated/storage_specification/storage_specification_generation_script.py b/src/lbm_generated/storage_specification/storage_specification_generation_script.py new file mode 100644 index 0000000000000000000000000000000000000000..d7432ee70d6233edbd4c408199f1d89ae4fe1e6d --- /dev/null +++ b/src/lbm_generated/storage_specification/storage_specification_generation_script.py @@ -0,0 +1,32 @@ +import sympy as sp + +from pystencils import Target + +from lbmpy.creationfunctions import create_lb_method +from lbmpy import LBMConfig, Stencil, Method, LBStencil +from pystencils_walberla import ManualCodeGenerationContext, generate_info_header +from lbmpy_walberla.storage_specification import generate_lbm_storage_specification + + +with ManualCodeGenerationContext(openmp=False, optimize_for_localhost=False, + mpi=True, double_accuracy=True, cuda=False) as ctx: + + for stencil in 
[LBStencil(Stencil.D3Q19), LBStencil(Stencil.D3Q27)]: + target = Target.GPU if ctx.cuda else Target.CPU + data_type = "float64" if ctx.double_accuracy else "float32" + + method = Method.SRT + relaxation_rate = sp.symbols("omega") + streaming_pattern = 'pull' + nonuniform = False + + lbm_config = LBMConfig(stencil=stencil, method=method, relaxation_rate=relaxation_rate, + streaming_pattern=streaming_pattern) + + lb_method = create_lb_method(lbm_config=lbm_config) + + storage_spec_name = f'{stencil.name}StorageSpecification' + generate_lbm_storage_specification(ctx, storage_spec_name, lb_method, lbm_config, + nonuniform=nonuniform, target=target, data_type=data_type) + + ctx.write_all_files() diff --git a/src/lbm_generated/sweep_collection/CMakeLists.txt b/src/lbm_generated/sweep_collection/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..91fbfb9d64fa55c4f870875be3c58a3f65a06c98 --- /dev/null +++ b/src/lbm_generated/sweep_collection/CMakeLists.txt @@ -0,0 +1,7 @@ +target_sources( lbm_generated + PRIVATE + D3Q19SRT.h + D3Q19SRT.cpp + D3Q27SRT.h + D3Q27SRT.cpp + ) \ No newline at end of file diff --git a/src/lbm_generated/sweep_collection/D3Q19SRT.cpp b/src/lbm_generated/sweep_collection/D3Q19SRT.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b2ed08360d0699e51e3e47e1906727ef739a2e17 --- /dev/null +++ b/src/lbm_generated/sweep_collection/D3Q19SRT.cpp @@ -0,0 +1,1012 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file D3Q19SRT.cpp +//! \\author pystencils +//====================================================================================================================== +#include "D3Q19SRT.h" + +#define FUNC_PREFIX + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-variable" +#endif + +#if ( defined WALBERLA_CXX_COMPILER_IS_INTEL ) +#pragma warning push +#pragma warning( disable : 1599 ) +#endif + +using namespace std; + +namespace walberla { +namespace lbm { + + +namespace internal_d3q19srt_kernel_streamCollide { +static FUNC_PREFIX void d3q19srt_kernel_streamCollide(double * RESTRICT const _data_pdfs, double * RESTRICT _data_pdfs_tmp, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_pdfs_tmp_0, int64_t const _stride_pdfs_tmp_1, int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3, double omega) +{ + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_2; ctr_2 += 1) + { + double * RESTRICT _data_pdfs_2m1_314 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 14*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 
18*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_311 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 11*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * RESTRICT _data_pdfs_21_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 15*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_313 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 13*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_312 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 12*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_35 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 5*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; + double * RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 17*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 16*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 6*_stride_pdfs_3; + double * RESTRICT _data_pdfs_tmp_20_30 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2; + double * RESTRICT _data_pdfs_tmp_20_31 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + _stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_32 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 2*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_33 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 3*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_34 = 
_data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 4*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_35 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 5*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_36 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 6*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_37 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 7*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_38 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 8*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_39 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 9*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_310 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 10*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_311 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 11*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_312 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 12*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_313 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 13*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_314 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 14*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_315 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 15*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_316 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 16*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_317 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 17*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_318 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 18*_stride_pdfs_tmp_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_2m1_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_314; + double * RESTRICT _data_pdfs_20_310_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_310; + double * RESTRICT _data_pdfs_20_38_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_38; + double * RESTRICT 
_data_pdfs_21_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_318; + double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double * RESTRICT _data_pdfs_2m1_311_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_311; + double * RESTRICT _data_pdfs_20_37_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_37; + double * RESTRICT _data_pdfs_20_31_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_31; + double * RESTRICT _data_pdfs_21_315_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_315; + double * RESTRICT _data_pdfs_2m1_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_313; + double * RESTRICT _data_pdfs_2m1_312_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_312; + double * RESTRICT _data_pdfs_2m1_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_35; + double * RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + double * RESTRICT _data_pdfs_20_39_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_39; + double * RESTRICT _data_pdfs_20_32_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_32; + double * RESTRICT _data_pdfs_21_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_317; + double * RESTRICT _data_pdfs_21_316_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_316; + double * RESTRICT _data_pdfs_21_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_36; + double * RESTRICT _data_pdfs_tmp_20_30_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_30; + double * RESTRICT _data_pdfs_tmp_20_31_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_31; + double * RESTRICT _data_pdfs_tmp_20_32_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_32; + double * RESTRICT _data_pdfs_tmp_20_33_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_33; + double * RESTRICT _data_pdfs_tmp_20_34_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_34; + double * RESTRICT _data_pdfs_tmp_20_35_10 = 
_stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_35; + double * RESTRICT _data_pdfs_tmp_20_36_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_36; + double * RESTRICT _data_pdfs_tmp_20_37_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_37; + double * RESTRICT _data_pdfs_tmp_20_38_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_38; + double * RESTRICT _data_pdfs_tmp_20_39_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_39; + double * RESTRICT _data_pdfs_tmp_20_310_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_310; + double * RESTRICT _data_pdfs_tmp_20_311_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_311; + double * RESTRICT _data_pdfs_tmp_20_312_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_312; + double * RESTRICT _data_pdfs_tmp_20_313_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_313; + double * RESTRICT _data_pdfs_tmp_20_314_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_314; + double * RESTRICT _data_pdfs_tmp_20_315_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_315; + double * RESTRICT _data_pdfs_tmp_20_316_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_316; + double * RESTRICT _data_pdfs_tmp_20_317_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_317; + double * RESTRICT _data_pdfs_tmp_20_318_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_318; + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_0; ctr_0 += 1) + { + const double vel0Term = _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double vel1Term = _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0] + _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]; + const double vel2Term = _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0] + 
_data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; + const double delta_rho = vel0Term + vel1Term + vel2Term + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0] + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0] + _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; + const double u_0 = vel0Term - 1.0*_data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double u_1 = vel1Term - 1.0*_data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - 1.0*_data_pdfs_20_32_11[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_21_316_11[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0] + _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double u_2 = vel2Term - 1.0*_data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_21_316_11[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - 1.0*_data_pdfs_21_36_10[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double u0Mu1 = u_0 + u_1*-1.0; + const double u0Pu1 = u_0 + u_1; + const double u1Pu2 = u_1 + u_2; + const double u1Mu2 = u_1 + u_2*-1.0; + const double u0Mu2 = u_0 + u_2*-1.0; + const double u0Pu2 = u_0 + u_2; + const double f_eq_common = delta_rho - 1.0*(u_0*u_0) - 1.0*(u_1*u_1) - 1.0*(u_2*u_2); + 
_data_pdfs_tmp_20_30_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.33333333333333331 - 1.0*_data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_31_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_1*0.16666666666666666 - 1.0*_data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0] + 0.33333333333333331*(u_1*u_1)) + _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_32_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_1*-0.16666666666666666 - 1.0*_data_pdfs_20_32_11[_stride_pdfs_0*ctr_0] + 0.33333333333333331*(u_1*u_1)) + _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_33_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_0*-0.16666666666666666 - 1.0*_data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.33333333333333331*(u_0*u_0)) + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_34_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_0*0.16666666666666666 - 1.0*_data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.33333333333333331*(u_0*u_0)) + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_35_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_2*0.16666666666666666 - 1.0*_data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0] + 0.33333333333333331*(u_2*u_2)) + _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_36_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_2*-0.16666666666666666 - 1.0*_data_pdfs_21_36_10[_stride_pdfs_0*ctr_0] + 0.33333333333333331*(u_2*u_2)) + _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_37_10[_stride_pdfs_tmp_0*ctr_0] = 
omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Mu1*-0.083333333333333329 - 1.0*_data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.041666666666666664*(u_2*u_2) + 0.125*(u0Mu1*u0Mu1)) + _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_38_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Pu1*0.083333333333333329 - 1.0*_data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.041666666666666664*(u_2*u_2) + 0.125*(u0Pu1*u0Pu1)) + _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_39_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Pu1*-0.083333333333333329 - 1.0*_data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.041666666666666664*(u_2*u_2) + 0.125*(u0Pu1*u0Pu1)) + _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_310_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Mu1*0.083333333333333329 - 1.0*_data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.041666666666666664*(u_2*u_2) + 0.125*(u0Mu1*u0Mu1)) + _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_311_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u1Pu2*0.083333333333333329 - 1.0*_data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0] + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Pu2*u1Pu2)) + _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_312_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u1Mu2*-0.083333333333333329 - 1.0*_data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0] + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Mu2*u1Mu2)) + _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_313_10[_stride_pdfs_tmp_0*ctr_0] = 
omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Mu2*-0.083333333333333329 - 1.0*_data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Mu2*u0Mu2)) + _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_314_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Pu2*0.083333333333333329 - 1.0*_data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Pu2*u0Pu2)) + _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_315_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u1Mu2*0.083333333333333329 - 1.0*_data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0] + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Mu2*u1Mu2)) + _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_316_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u1Pu2*-0.083333333333333329 - 1.0*_data_pdfs_21_316_11[_stride_pdfs_0*ctr_0] + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Pu2*u1Pu2)) + _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_317_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Pu2*-0.083333333333333329 - 1.0*_data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Pu2*u0Pu2)) + _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_318_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Mu2*0.083333333333333329 - 1.0*_data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Mu2*u0Mu2)) + _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + } + } + } +} +} + + +namespace 
internal_d3q19srt_kernel_collide { +static FUNC_PREFIX void d3q19srt_kernel_collide(double * RESTRICT _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, double omega) +{ + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_2; ctr_2 += 1) + { + double * RESTRICT _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; + double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + double * RESTRICT 
_data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_20_315_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_315; + double * RESTRICT _data_pdfs_20_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_318; + double * RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * RESTRICT _data_pdfs_20_31_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_31; + double * RESTRICT _data_pdfs_20_37_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_37; + double * RESTRICT _data_pdfs_20_311_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_311; + double * RESTRICT _data_pdfs_20_32_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_32; + double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double * RESTRICT _data_pdfs_20_38_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_38; + double * RESTRICT _data_pdfs_20_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_36; + double * RESTRICT _data_pdfs_20_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_313; + double * RESTRICT _data_pdfs_20_39_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_39; + double * RESTRICT _data_pdfs_20_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_35; + double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + double * RESTRICT _data_pdfs_20_310_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_310; + double * RESTRICT _data_pdfs_20_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_317; + double * RESTRICT _data_pdfs_20_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_314; + double * RESTRICT _data_pdfs_20_316_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_316; + double * RESTRICT _data_pdfs_20_312_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_312; + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_0; ctr_0 += 1) + { + const double xi_1 = _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0]; + const double xi_2 = 
_data_pdfs_20_318_10[_stride_pdfs_0*ctr_0]; + const double xi_3 = _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; + const double xi_4 = _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0]; + const double xi_5 = _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0]; + const double xi_6 = _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0]; + const double xi_7 = _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0]; + const double xi_8 = _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0]; + const double xi_9 = _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0]; + const double xi_10 = _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0]; + const double xi_11 = _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0]; + const double xi_12 = _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0]; + const double xi_13 = _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0]; + const double xi_14 = _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0]; + const double xi_15 = _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0]; + const double xi_16 = _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0]; + const double xi_17 = _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0]; + const double xi_18 = _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0]; + const double xi_19 = _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0]; + const double vel0Term = xi_15 + xi_17 + xi_2 + xi_8 + xi_9; + const double vel1Term = xi_1 + xi_4 + xi_5 + xi_6; + const double vel2Term = xi_11 + xi_13 + xi_19; + const double delta_rho = vel0Term + vel1Term + vel2Term + xi_10 + xi_12 + xi_14 + xi_16 + xi_18 + xi_3 + xi_7; + const double u_0 = vel0Term + xi_11*-1.0 + xi_12*-1.0 + xi_14*-1.0 + xi_16*-1.0 + xi_5*-1.0; + const double u_1 = vel1Term + xi_12*-1.0 + xi_15*-1.0 + xi_18*-1.0 + xi_19*-1.0 + xi_7*-1.0 + xi_9; + const double u_2 = vel2Term + xi_1*-1.0 + xi_10*-1.0 + xi_16*-1.0 + xi_17 + xi_18*-1.0 + xi_2*-1.0 + xi_6; + const double u0Mu1 = u_0 + u_1*-1.0; + const double u0Pu1 = u_0 + u_1; + const double u1Pu2 = u_1 + u_2; + const double u1Mu2 = u_1 + u_2*-1.0; + const double u0Mu2 = u_0 + u_2*-1.0; + const double u0Pu2 = u_0 + u_2; + const double f_eq_common = 
delta_rho - 1.0*(u_0*u_0) - 1.0*(u_1*u_1) - 1.0*(u_2*u_2); + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.33333333333333331 + xi_3*-1.0) + xi_3; + _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_1*0.16666666666666666 + xi_4*-1.0 + 0.33333333333333331*(u_1*u_1)) + xi_4; + _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_1*-0.16666666666666666 + xi_7*-1.0 + 0.33333333333333331*(u_1*u_1)) + xi_7; + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_0*-0.16666666666666666 + xi_14*-1.0 + 0.33333333333333331*(u_0*u_0)) + xi_14; + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_0*0.16666666666666666 + xi_8*-1.0 + 0.33333333333333331*(u_0*u_0)) + xi_8; + _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_2*0.16666666666666666 + xi_13*-1.0 + 0.33333333333333331*(u_2*u_2)) + xi_13; + _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_2*-0.16666666666666666 + xi_10*-1.0 + 0.33333333333333331*(u_2*u_2)) + xi_10; + _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Mu1*-0.083333333333333329 + xi_5*-1.0 + 0.041666666666666664*(u_2*u_2) + 0.125*(u0Mu1*u0Mu1)) + xi_5; + _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Pu1*0.083333333333333329 + xi_9*-1.0 + 0.041666666666666664*(u_2*u_2) + 0.125*(u0Pu1*u0Pu1)) + xi_9; + _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Pu1*-0.083333333333333329 + xi_12*-1.0 + 0.041666666666666664*(u_2*u_2) + 
0.125*(u0Pu1*u0Pu1)) + xi_12; + _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Mu1*0.083333333333333329 + xi_15*-1.0 + 0.041666666666666664*(u_2*u_2) + 0.125*(u0Mu1*u0Mu1)) + xi_15; + _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u1Pu2*0.083333333333333329 + xi_6*-1.0 + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Pu2*u1Pu2)) + xi_6; + _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u1Mu2*-0.083333333333333329 + xi_19*-1.0 + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Mu2*u1Mu2)) + xi_19; + _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Mu2*-0.083333333333333329 + xi_11*-1.0 + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Mu2*u0Mu2)) + xi_11; + _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Pu2*0.083333333333333329 + xi_17*-1.0 + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Pu2*u0Pu2)) + xi_17; + _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u1Mu2*0.083333333333333329 + xi_1*-1.0 + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Mu2*u1Mu2)) + xi_1; + _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u1Pu2*-0.083333333333333329 + xi_18*-1.0 + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Pu2*u1Pu2)) + xi_18; + _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Pu2*-0.083333333333333329 + xi_16*-1.0 + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Pu2*u0Pu2)) + xi_16; + _data_pdfs_20_318_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 
u0Mu2*0.083333333333333329 + xi_2*-1.0 + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Mu2*u0Mu2)) + xi_2; + } + } + } +} +} + + +namespace internal_d3q19srt_kernel_stream { +static FUNC_PREFIX void d3q19srt_kernel_stream(double * RESTRICT const _data_pdfs, double * RESTRICT _data_pdfs_tmp, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_pdfs_tmp_0, int64_t const _stride_pdfs_tmp_1, int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3) +{ + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_2; ctr_2 += 1) + { + double * RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; + double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_35 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 5*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 6*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_311 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 11*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_312 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 12*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_313 
= _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 13*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_314 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 14*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 15*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 16*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 17*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 18*_stride_pdfs_3; + double * RESTRICT _data_pdfs_tmp_20_30 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2; + double * RESTRICT _data_pdfs_tmp_20_31 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + _stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_32 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 2*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_33 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 3*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_34 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 4*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_35 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 5*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_36 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 6*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_37 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 7*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_38 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 8*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_39 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 9*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_310 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 10*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_311 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 11*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_312 = _data_pdfs_tmp + 
_stride_pdfs_tmp_2*ctr_2 + 12*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_313 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 13*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_314 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 14*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_315 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 15*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_316 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 16*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_317 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 17*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_318 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 18*_stride_pdfs_tmp_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * RESTRICT _data_pdfs_20_31_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_31; + double * RESTRICT _data_pdfs_20_32_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_32; + double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double * RESTRICT _data_pdfs_2m1_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_35; + double * RESTRICT _data_pdfs_21_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_36; + double * RESTRICT _data_pdfs_20_37_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_37; + double * RESTRICT _data_pdfs_20_38_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_38; + double * RESTRICT _data_pdfs_20_39_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_39; + double * RESTRICT _data_pdfs_20_310_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_310; + double * RESTRICT _data_pdfs_2m1_311_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_311; + double * RESTRICT _data_pdfs_2m1_312_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + 
_data_pdfs_2m1_312; + double * RESTRICT _data_pdfs_2m1_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_313; + double * RESTRICT _data_pdfs_2m1_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_314; + double * RESTRICT _data_pdfs_21_315_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_315; + double * RESTRICT _data_pdfs_21_316_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_316; + double * RESTRICT _data_pdfs_21_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_317; + double * RESTRICT _data_pdfs_21_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_318; + double * RESTRICT _data_pdfs_tmp_20_30_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_30; + double * RESTRICT _data_pdfs_tmp_20_31_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_31; + double * RESTRICT _data_pdfs_tmp_20_32_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_32; + double * RESTRICT _data_pdfs_tmp_20_33_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_33; + double * RESTRICT _data_pdfs_tmp_20_34_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_34; + double * RESTRICT _data_pdfs_tmp_20_35_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_35; + double * RESTRICT _data_pdfs_tmp_20_36_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_36; + double * RESTRICT _data_pdfs_tmp_20_37_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_37; + double * RESTRICT _data_pdfs_tmp_20_38_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_38; + double * RESTRICT _data_pdfs_tmp_20_39_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_39; + double * RESTRICT _data_pdfs_tmp_20_310_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_310; + double * RESTRICT _data_pdfs_tmp_20_311_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_311; + double * RESTRICT _data_pdfs_tmp_20_312_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_312; + double * RESTRICT _data_pdfs_tmp_20_313_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_313; + double * RESTRICT _data_pdfs_tmp_20_314_10 = _stride_pdfs_tmp_1*ctr_1 + 
_data_pdfs_tmp_20_314; + double * RESTRICT _data_pdfs_tmp_20_315_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_315; + double * RESTRICT _data_pdfs_tmp_20_316_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_316; + double * RESTRICT _data_pdfs_tmp_20_317_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_317; + double * RESTRICT _data_pdfs_tmp_20_318_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_318; + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_0; ctr_0 += 1) + { + const double streamed_0 = _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; + const double streamed_1 = _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0]; + const double streamed_2 = _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0]; + const double streamed_3 = _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_4 = _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_5 = _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; + const double streamed_6 = _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; + const double streamed_7 = _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_8 = _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_9 = _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_10 = _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_11 = _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]; + const double streamed_12 = _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0]; + const double streamed_13 = _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_14 = _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_15 = _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0]; + const double streamed_16 = _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0]; + const double streamed_17 = _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_18 = 
_data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_30_10[_stride_pdfs_tmp_0*ctr_0] = streamed_0; + _data_pdfs_tmp_20_31_10[_stride_pdfs_tmp_0*ctr_0] = streamed_1; + _data_pdfs_tmp_20_32_10[_stride_pdfs_tmp_0*ctr_0] = streamed_2; + _data_pdfs_tmp_20_33_10[_stride_pdfs_tmp_0*ctr_0] = streamed_3; + _data_pdfs_tmp_20_34_10[_stride_pdfs_tmp_0*ctr_0] = streamed_4; + _data_pdfs_tmp_20_35_10[_stride_pdfs_tmp_0*ctr_0] = streamed_5; + _data_pdfs_tmp_20_36_10[_stride_pdfs_tmp_0*ctr_0] = streamed_6; + _data_pdfs_tmp_20_37_10[_stride_pdfs_tmp_0*ctr_0] = streamed_7; + _data_pdfs_tmp_20_38_10[_stride_pdfs_tmp_0*ctr_0] = streamed_8; + _data_pdfs_tmp_20_39_10[_stride_pdfs_tmp_0*ctr_0] = streamed_9; + _data_pdfs_tmp_20_310_10[_stride_pdfs_tmp_0*ctr_0] = streamed_10; + _data_pdfs_tmp_20_311_10[_stride_pdfs_tmp_0*ctr_0] = streamed_11; + _data_pdfs_tmp_20_312_10[_stride_pdfs_tmp_0*ctr_0] = streamed_12; + _data_pdfs_tmp_20_313_10[_stride_pdfs_tmp_0*ctr_0] = streamed_13; + _data_pdfs_tmp_20_314_10[_stride_pdfs_tmp_0*ctr_0] = streamed_14; + _data_pdfs_tmp_20_315_10[_stride_pdfs_tmp_0*ctr_0] = streamed_15; + _data_pdfs_tmp_20_316_10[_stride_pdfs_tmp_0*ctr_0] = streamed_16; + _data_pdfs_tmp_20_317_10[_stride_pdfs_tmp_0*ctr_0] = streamed_17; + _data_pdfs_tmp_20_318_10[_stride_pdfs_tmp_0*ctr_0] = streamed_18; + } + } + } +} +} + + +namespace internal_d3q19srt_kernel_streamOnlyNoAdvancement { +static FUNC_PREFIX void d3q19srt_kernel_streamOnlyNoAdvancement(double * RESTRICT const _data_pdfs, double * RESTRICT _data_pdfs_tmp, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_pdfs_tmp_0, int64_t const _stride_pdfs_tmp_1, int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3) +{ + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_2; ctr_2 += 1) + { + double * RESTRICT _data_pdfs_20_30 = 
_data_pdfs + _stride_pdfs_2*ctr_2; + double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_35 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 5*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 6*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_311 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 11*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_312 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 12*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_313 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 13*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_314 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 14*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 15*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 16*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 17*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 18*_stride_pdfs_3; + double * RESTRICT _data_pdfs_tmp_20_30 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2; + double * 
RESTRICT _data_pdfs_tmp_20_31 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + _stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_32 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 2*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_33 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 3*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_34 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 4*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_35 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 5*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_36 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 6*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_37 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 7*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_38 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 8*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_39 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 9*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_310 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 10*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_311 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 11*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_312 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 12*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_313 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 13*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_314 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 14*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_315 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 15*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_316 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 16*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_317 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 17*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_318 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 18*_stride_pdfs_tmp_3; + for (int64_t ctr_1 = 0; ctr_1 
< _size_pdfs_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * RESTRICT _data_pdfs_20_31_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_31; + double * RESTRICT _data_pdfs_20_32_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_32; + double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double * RESTRICT _data_pdfs_2m1_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_35; + double * RESTRICT _data_pdfs_21_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_36; + double * RESTRICT _data_pdfs_20_37_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_37; + double * RESTRICT _data_pdfs_20_38_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_38; + double * RESTRICT _data_pdfs_20_39_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_39; + double * RESTRICT _data_pdfs_20_310_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_310; + double * RESTRICT _data_pdfs_2m1_311_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_311; + double * RESTRICT _data_pdfs_2m1_312_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_312; + double * RESTRICT _data_pdfs_2m1_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_313; + double * RESTRICT _data_pdfs_2m1_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_314; + double * RESTRICT _data_pdfs_21_315_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_315; + double * RESTRICT _data_pdfs_21_316_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_316; + double * RESTRICT _data_pdfs_21_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_317; + double * RESTRICT _data_pdfs_21_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_318; + double * RESTRICT _data_pdfs_tmp_20_30_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_30; + double * RESTRICT _data_pdfs_tmp_20_31_10 = _stride_pdfs_tmp_1*ctr_1 + 
_data_pdfs_tmp_20_31; + double * RESTRICT _data_pdfs_tmp_20_32_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_32; + double * RESTRICT _data_pdfs_tmp_20_33_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_33; + double * RESTRICT _data_pdfs_tmp_20_34_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_34; + double * RESTRICT _data_pdfs_tmp_20_35_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_35; + double * RESTRICT _data_pdfs_tmp_20_36_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_36; + double * RESTRICT _data_pdfs_tmp_20_37_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_37; + double * RESTRICT _data_pdfs_tmp_20_38_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_38; + double * RESTRICT _data_pdfs_tmp_20_39_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_39; + double * RESTRICT _data_pdfs_tmp_20_310_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_310; + double * RESTRICT _data_pdfs_tmp_20_311_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_311; + double * RESTRICT _data_pdfs_tmp_20_312_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_312; + double * RESTRICT _data_pdfs_tmp_20_313_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_313; + double * RESTRICT _data_pdfs_tmp_20_314_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_314; + double * RESTRICT _data_pdfs_tmp_20_315_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_315; + double * RESTRICT _data_pdfs_tmp_20_316_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_316; + double * RESTRICT _data_pdfs_tmp_20_317_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_317; + double * RESTRICT _data_pdfs_tmp_20_318_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_318; + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_0; ctr_0 += 1) + { + const double streamed_0 = _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; + const double streamed_1 = _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0]; + const double streamed_2 = _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0]; + const double streamed_3 = 
_data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_4 = _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_5 = _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; + const double streamed_6 = _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; + const double streamed_7 = _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_8 = _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_9 = _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_10 = _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_11 = _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]; + const double streamed_12 = _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0]; + const double streamed_13 = _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_14 = _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_15 = _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0]; + const double streamed_16 = _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0]; + const double streamed_17 = _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_18 = _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_30_10[_stride_pdfs_tmp_0*ctr_0] = streamed_0; + _data_pdfs_tmp_20_31_10[_stride_pdfs_tmp_0*ctr_0] = streamed_1; + _data_pdfs_tmp_20_32_10[_stride_pdfs_tmp_0*ctr_0] = streamed_2; + _data_pdfs_tmp_20_33_10[_stride_pdfs_tmp_0*ctr_0] = streamed_3; + _data_pdfs_tmp_20_34_10[_stride_pdfs_tmp_0*ctr_0] = streamed_4; + _data_pdfs_tmp_20_35_10[_stride_pdfs_tmp_0*ctr_0] = streamed_5; + _data_pdfs_tmp_20_36_10[_stride_pdfs_tmp_0*ctr_0] = streamed_6; + _data_pdfs_tmp_20_37_10[_stride_pdfs_tmp_0*ctr_0] = streamed_7; + _data_pdfs_tmp_20_38_10[_stride_pdfs_tmp_0*ctr_0] = streamed_8; + _data_pdfs_tmp_20_39_10[_stride_pdfs_tmp_0*ctr_0] = streamed_9; + 
_data_pdfs_tmp_20_310_10[_stride_pdfs_tmp_0*ctr_0] = streamed_10; + _data_pdfs_tmp_20_311_10[_stride_pdfs_tmp_0*ctr_0] = streamed_11; + _data_pdfs_tmp_20_312_10[_stride_pdfs_tmp_0*ctr_0] = streamed_12; + _data_pdfs_tmp_20_313_10[_stride_pdfs_tmp_0*ctr_0] = streamed_13; + _data_pdfs_tmp_20_314_10[_stride_pdfs_tmp_0*ctr_0] = streamed_14; + _data_pdfs_tmp_20_315_10[_stride_pdfs_tmp_0*ctr_0] = streamed_15; + _data_pdfs_tmp_20_316_10[_stride_pdfs_tmp_0*ctr_0] = streamed_16; + _data_pdfs_tmp_20_317_10[_stride_pdfs_tmp_0*ctr_0] = streamed_17; + _data_pdfs_tmp_20_318_10[_stride_pdfs_tmp_0*ctr_0] = streamed_18; + } + } + } +} +} + + +namespace internal_d3q19srt_kernel_initialise { +static FUNC_PREFIX void d3q19srt_kernel_initialise(double * RESTRICT const _data_density, double * RESTRICT _data_pdfs, double * RESTRICT const _data_velocity, int64_t const _size_density_0, int64_t const _size_density_1, int64_t const _size_density_2, int64_t const _stride_density_0, int64_t const _stride_density_1, int64_t const _stride_density_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_velocity_0, int64_t const _stride_velocity_1, int64_t const _stride_velocity_2, int64_t const _stride_velocity_3) +{ + for (int64_t ctr_2 = 0; ctr_2 < _size_density_2; ctr_2 += 1) + { + double * RESTRICT _data_density_20_30 = _data_density + _stride_density_2*ctr_2; + double * RESTRICT _data_velocity_20_30 = _data_velocity + _stride_velocity_2*ctr_2; + double * RESTRICT _data_velocity_20_31 = _data_velocity + _stride_velocity_2*ctr_2 + _stride_velocity_3; + double * RESTRICT _data_velocity_20_32 = _data_velocity + _stride_velocity_2*ctr_2 + 2*_stride_velocity_3; + double * RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; + double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 
2*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_density_1; ctr_1 += 1) + { + double * RESTRICT _data_density_20_30_10 = _stride_density_1*ctr_1 + _data_density_20_30; + double * RESTRICT _data_velocity_20_30_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_30; + double * RESTRICT _data_velocity_20_31_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_31; + double * RESTRICT _data_velocity_20_32_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_32; + double * RESTRICT 
_data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * RESTRICT _data_pdfs_20_31_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_31; + double * RESTRICT _data_pdfs_20_32_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_32; + double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double * RESTRICT _data_pdfs_20_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_35; + double * RESTRICT _data_pdfs_20_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_36; + double * RESTRICT _data_pdfs_20_37_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_37; + double * RESTRICT _data_pdfs_20_38_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_38; + double * RESTRICT _data_pdfs_20_39_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_39; + double * RESTRICT _data_pdfs_20_310_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_310; + double * RESTRICT _data_pdfs_20_311_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_311; + double * RESTRICT _data_pdfs_20_312_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_312; + double * RESTRICT _data_pdfs_20_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_313; + double * RESTRICT _data_pdfs_20_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_314; + double * RESTRICT _data_pdfs_20_315_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_315; + double * RESTRICT _data_pdfs_20_316_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_316; + double * RESTRICT _data_pdfs_20_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_317; + double * RESTRICT _data_pdfs_20_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_318; + for (int64_t ctr_0 = 0; ctr_0 < _size_density_0; ctr_0 += 1) + { + const double rho = _data_density_20_30_10[_stride_density_0*ctr_0]; + const double delta_rho = rho - 1.0; + const double u_0 = _data_velocity_20_30_10[_stride_velocity_0*ctr_0]; + const double u_1 = _data_velocity_20_31_10[_stride_velocity_0*ctr_0]; + const double u_2 = _data_velocity_20_32_10[_stride_velocity_0*ctr_0]; + 
_data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] = delta_rho*0.33333333333333331 - 0.33333333333333331*(u_0*u_0) - 0.33333333333333331*(u_1*u_1) - 0.33333333333333331*(u_2*u_2); + _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0] = delta_rho*0.055555555555555552 + u_1*0.16666666666666666 - 0.16666666666666666*(u_0*u_0) - 0.16666666666666666*(u_2*u_2) + 0.16666666666666666*(u_1*u_1); + _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0] = delta_rho*0.055555555555555552 + u_1*-0.16666666666666666 - 0.16666666666666666*(u_0*u_0) - 0.16666666666666666*(u_2*u_2) + 0.16666666666666666*(u_1*u_1); + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0] = delta_rho*0.055555555555555552 + u_0*-0.16666666666666666 - 0.16666666666666666*(u_1*u_1) - 0.16666666666666666*(u_2*u_2) + 0.16666666666666666*(u_0*u_0); + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0] = delta_rho*0.055555555555555552 + u_0*0.16666666666666666 - 0.16666666666666666*(u_1*u_1) - 0.16666666666666666*(u_2*u_2) + 0.16666666666666666*(u_0*u_0); + _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0] = delta_rho*0.055555555555555552 + u_2*0.16666666666666666 - 0.16666666666666666*(u_0*u_0) - 0.16666666666666666*(u_1*u_1) + 0.16666666666666666*(u_2*u_2); + _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0] = delta_rho*0.055555555555555552 + u_2*-0.16666666666666666 - 0.16666666666666666*(u_0*u_0) - 0.16666666666666666*(u_1*u_1) + 0.16666666666666666*(u_2*u_2); + _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_0*u_1*-0.25 + u_0*-0.083333333333333329 + u_1*0.083333333333333329 + 0.083333333333333329*(u_0*u_0) + 0.083333333333333329*(u_1*u_1); + _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_0*u_1*0.25 + u_0*0.083333333333333329 + u_1*0.083333333333333329 + 0.083333333333333329*(u_0*u_0) + 0.083333333333333329*(u_1*u_1); + _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_0*u_1*0.25 + u_0*-0.083333333333333329 + u_1*-0.083333333333333329 + 0.083333333333333329*(u_0*u_0) + 
0.083333333333333329*(u_1*u_1); + _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_0*u_1*-0.25 + u_0*0.083333333333333329 + u_1*-0.083333333333333329 + 0.083333333333333329*(u_0*u_0) + 0.083333333333333329*(u_1*u_1); + _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_1*u_2*0.25 + u_1*0.083333333333333329 + u_2*0.083333333333333329 + 0.083333333333333329*(u_1*u_1) + 0.083333333333333329*(u_2*u_2); + _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_1*u_2*-0.25 + u_1*-0.083333333333333329 + u_2*0.083333333333333329 + 0.083333333333333329*(u_1*u_1) + 0.083333333333333329*(u_2*u_2); + _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_0*u_2*-0.25 + u_0*-0.083333333333333329 + u_2*0.083333333333333329 + 0.083333333333333329*(u_0*u_0) + 0.083333333333333329*(u_2*u_2); + _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_0*u_2*0.25 + u_0*0.083333333333333329 + u_2*0.083333333333333329 + 0.083333333333333329*(u_0*u_0) + 0.083333333333333329*(u_2*u_2); + _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_1*u_2*-0.25 + u_1*0.083333333333333329 + u_2*-0.083333333333333329 + 0.083333333333333329*(u_1*u_1) + 0.083333333333333329*(u_2*u_2); + _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_1*u_2*0.25 + u_1*-0.083333333333333329 + u_2*-0.083333333333333329 + 0.083333333333333329*(u_1*u_1) + 0.083333333333333329*(u_2*u_2); + _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_0*u_2*0.25 + u_0*-0.083333333333333329 + u_2*-0.083333333333333329 + 0.083333333333333329*(u_0*u_0) + 0.083333333333333329*(u_2*u_2); + _data_pdfs_20_318_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_0*u_2*-0.25 + u_0*0.083333333333333329 + u_2*-0.083333333333333329 + 0.083333333333333329*(u_0*u_0) + 0.083333333333333329*(u_2*u_2); + } + } + } +} +} + + 
+namespace internal_d3q19srt_kernel_getter { +static FUNC_PREFIX void d3q19srt_kernel_getter(double * RESTRICT _data_density, double * RESTRICT const _data_pdfs, double * RESTRICT _data_velocity, int64_t const _size_density_0, int64_t const _size_density_1, int64_t const _size_density_2, int64_t const _stride_density_0, int64_t const _stride_density_1, int64_t const _stride_density_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_velocity_0, int64_t const _stride_velocity_1, int64_t const _stride_velocity_2, int64_t const _stride_velocity_3) +{ + for (int64_t ctr_2 = 0; ctr_2 < _size_density_2; ctr_2 += 1) + { + double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * RESTRICT _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_312 = _data_pdfs + 
_stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; + double * RESTRICT _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3; + double * RESTRICT _data_density_20_30 = _data_density + _stride_density_2*ctr_2; + double * RESTRICT _data_velocity_20_30 = _data_velocity + _stride_velocity_2*ctr_2; + double * RESTRICT _data_velocity_20_31 = _data_velocity + _stride_velocity_2*ctr_2 + _stride_velocity_3; + double * RESTRICT _data_velocity_20_32 = _data_velocity + _stride_velocity_2*ctr_2 + 2*_stride_velocity_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_density_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_20_310_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_310; + double * RESTRICT _data_pdfs_20_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_314; + double * RESTRICT _data_pdfs_20_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_318; + double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double * RESTRICT _data_pdfs_20_38_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_38; + double * RESTRICT _data_pdfs_20_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_313; + double * RESTRICT _data_pdfs_20_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_317; + double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + double * RESTRICT _data_pdfs_20_37_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_37; + double * RESTRICT _data_pdfs_20_39_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_39; + double * RESTRICT _data_pdfs_20_31_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_31; + double * RESTRICT _data_pdfs_20_311_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_311; + double * RESTRICT _data_pdfs_20_315_10 = 
_stride_pdfs_1*ctr_1 + _data_pdfs_20_315; + double * RESTRICT _data_pdfs_20_312_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_312; + double * RESTRICT _data_pdfs_20_316_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_316; + double * RESTRICT _data_pdfs_20_32_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_32; + double * RESTRICT _data_pdfs_20_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_35; + double * RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * RESTRICT _data_pdfs_20_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_36; + double * RESTRICT _data_density_20_30_10 = _stride_density_1*ctr_1 + _data_density_20_30; + double * RESTRICT _data_velocity_20_30_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_30; + double * RESTRICT _data_velocity_20_31_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_31; + double * RESTRICT _data_velocity_20_32_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_32; + for (int64_t ctr_0 = 0; ctr_0 < _size_density_0; ctr_0 += 1) + { + const double vel0Term = _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_318_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0]; + const double momdensity_0 = vel0Term - 1.0*_data_pdfs_20_313_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_33_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_37_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_39_10[_stride_pdfs_0*ctr_0]; + const double vel1Term = _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0]; + const double momdensity_1 = vel1Term - 1.0*_data_pdfs_20_310_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_312_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_32_10[_stride_pdfs_0*ctr_0] - 
1.0*_data_pdfs_20_39_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0]; + const double vel2Term = _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0]; + const double delta_rho = vel0Term + vel1Term + vel2Term + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0]; + const double momdensity_2 = vel2Term - 1.0*_data_pdfs_20_315_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_318_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_36_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0]; + const double rho = delta_rho + 1.0; + const double u_0 = momdensity_0; + const double u_1 = momdensity_1; + const double u_2 = momdensity_2; + _data_density_20_30_10[_stride_density_0*ctr_0] = rho; + _data_velocity_20_30_10[_stride_velocity_0*ctr_0] = u_0; + _data_velocity_20_31_10[_stride_velocity_0*ctr_0] = u_1; + _data_velocity_20_32_10[_stride_velocity_0*ctr_0] = u_2; + } + } + } +} +} + + + + + +void D3Q19SRT::streamCollide( field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, double omega, const cell_idx_t ghost_layers ) +{ + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs_tmp->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), 
int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_0 = int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers)) + const int64_t _size_pdfs_1 = int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_2 = int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride()); + const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride()); + const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride()); + const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride())); + internal_d3q19srt_kernel_streamCollide::d3q19srt_kernel_streamCollide(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3, omega); +} +void D3Q19SRT::streamCollideCellInterval( field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, double omega, const CellInterval & ci) +{ + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + 
WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride()); + const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride()); + const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride()); + const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride())); + internal_d3q19srt_kernel_streamCollide::d3q19srt_kernel_streamCollide(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3, omega); +} + +void D3Q19SRT::collide( field::GhostLayerField<double, 19> * pdfs, double omega, const cell_idx_t ghost_layers ) +{ + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_0 = int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers); + 
WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers)) + const int64_t _size_pdfs_1 = int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_2 = int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + internal_d3q19srt_kernel_collide::d3q19srt_kernel_collide(_data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, omega); +} +void D3Q19SRT::collideCellInterval( field::GhostLayerField<double, 19> * pdfs, double omega, const CellInterval & ci) +{ + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + 
internal_d3q19srt_kernel_collide::d3q19srt_kernel_collide(_data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, omega); +} + +void D3Q19SRT::stream( field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, const cell_idx_t ghost_layers ) +{ + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs_tmp->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_0 = int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers)) + const int64_t _size_pdfs_1 = int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_2 = int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride()); + const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride()); + const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride()); + const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride())); + internal_d3q19srt_kernel_stream::d3q19srt_kernel_stream(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, 
_stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3); +} +void D3Q19SRT::streamCellInterval( field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, const CellInterval & ci) +{ + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride()); + const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride()); + const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride()); + const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride())); + internal_d3q19srt_kernel_stream::d3q19srt_kernel_stream(_data_pdfs, _data_pdfs_tmp, 
_size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3); +} + +void D3Q19SRT::streamOnlyNoAdvancement( field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, const cell_idx_t ghost_layers ) +{ + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs_tmp->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_0 = int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers)) + const int64_t _size_pdfs_1 = int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_2 = int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride()); + const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride()); + const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride()); + const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride())); + internal_d3q19srt_kernel_streamOnlyNoAdvancement::d3q19srt_kernel_streamOnlyNoAdvancement(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, 
_stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3); +} +void D3Q19SRT::streamOnlyNoAdvancementCellInterval( field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, const CellInterval & ci) +{ + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride()); + const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride()); + const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride()); + const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride())); + 
internal_d3q19srt_kernel_streamOnlyNoAdvancement::d3q19srt_kernel_streamOnlyNoAdvancement(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3); +} + +void D3Q19SRT::initialise( field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 3> * velocity, const cell_idx_t ghost_layers ) +{ + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(density->nrOfGhostLayers())) + double * RESTRICT const _data_density = density->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(velocity->nrOfGhostLayers())) + double * RESTRICT const _data_velocity = velocity->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->xSizeWithGhostLayer(), int64_t(int64_c(density->xSize()) + 2*ghost_layers)) + const int64_t _size_density_0 = int64_t(int64_c(density->xSize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(density->ySizeWithGhostLayer(), int64_t(int64_c(density->ySize()) + 2*ghost_layers)) + const int64_t _size_density_1 = int64_t(int64_c(density->ySize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(density->zSizeWithGhostLayer(), int64_t(int64_c(density->zSize()) + 2*ghost_layers)) + const int64_t _size_density_2 = int64_t(int64_c(density->zSize()) + 2*ghost_layers); + const int64_t _stride_density_0 = int64_t(density->xStride()); + const int64_t _stride_density_1 = int64_t(density->yStride()); + const int64_t _stride_density_2 = int64_t(density->zStride()); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const 
int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_velocity_0 = int64_t(velocity->xStride()); + const int64_t _stride_velocity_1 = int64_t(velocity->yStride()); + const int64_t _stride_velocity_2 = int64_t(velocity->zStride()); + const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride())); + internal_d3q19srt_kernel_initialise::d3q19srt_kernel_initialise(_data_density, _data_pdfs, _data_velocity, _size_density_0, _size_density_1, _size_density_2, _stride_density_0, _stride_density_1, _stride_density_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3); +} +void D3Q19SRT::initialiseCellInterval( field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 3> * velocity, const CellInterval & ci) +{ + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(density->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(density->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(density->nrOfGhostLayers())) + double * RESTRICT const _data_density = density->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(velocity->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(velocity->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(velocity->nrOfGhostLayers())) + double * RESTRICT const _data_velocity = velocity->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + 
WALBERLA_ASSERT_GREATER_EQUAL(density->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_density_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_density_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_density_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_density_0 = int64_t(density->xStride()); + const int64_t _stride_density_1 = int64_t(density->yStride()); + const int64_t _stride_density_2 = int64_t(density->zStride()); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_velocity_0 = int64_t(velocity->xStride()); + const int64_t _stride_velocity_1 = int64_t(velocity->yStride()); + const int64_t _stride_velocity_2 = int64_t(velocity->zStride()); + const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride())); + internal_d3q19srt_kernel_initialise::d3q19srt_kernel_initialise(_data_density, _data_pdfs, _data_velocity, _size_density_0, _size_density_1, _size_density_2, _stride_density_0, _stride_density_1, _stride_density_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3); +} + +void D3Q19SRT::calculateMacroscopicParameters( field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 3> * velocity, const cell_idx_t ghost_layers ) +{ + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(density->nrOfGhostLayers())) + double * RESTRICT _data_density = density->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + 
WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(velocity->nrOfGhostLayers())) + double * RESTRICT _data_velocity = velocity->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->xSizeWithGhostLayer(), int64_t(int64_c(density->xSize()) + 2*ghost_layers)) + const int64_t _size_density_0 = int64_t(int64_c(density->xSize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(density->ySizeWithGhostLayer(), int64_t(int64_c(density->ySize()) + 2*ghost_layers)) + const int64_t _size_density_1 = int64_t(int64_c(density->ySize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(density->zSizeWithGhostLayer(), int64_t(int64_c(density->zSize()) + 2*ghost_layers)) + const int64_t _size_density_2 = int64_t(int64_c(density->zSize()) + 2*ghost_layers); + const int64_t _stride_density_0 = int64_t(density->xStride()); + const int64_t _stride_density_1 = int64_t(density->yStride()); + const int64_t _stride_density_2 = int64_t(density->zStride()); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_velocity_0 = int64_t(velocity->xStride()); + const int64_t _stride_velocity_1 = int64_t(velocity->yStride()); + const int64_t _stride_velocity_2 = int64_t(velocity->zStride()); + const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride())); + internal_d3q19srt_kernel_getter::d3q19srt_kernel_getter(_data_density, _data_pdfs, _data_velocity, _size_density_0, _size_density_1, _size_density_2, _stride_density_0, _stride_density_1, _stride_density_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_velocity_0, 
_stride_velocity_1, _stride_velocity_2, _stride_velocity_3); +} +void D3Q19SRT::calculateMacroscopicParametersCellInterval( field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 3> * velocity, const CellInterval & ci) +{ + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(density->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(density->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(density->nrOfGhostLayers())) + double * RESTRICT _data_density = density->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(velocity->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(velocity->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(velocity->nrOfGhostLayers())) + double * RESTRICT _data_velocity = velocity->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_density_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_density_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_density_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_density_0 = int64_t(density->xStride()); + const int64_t _stride_density_1 = int64_t(density->yStride()); + const int64_t _stride_density_2 = int64_t(density->zStride()); + const int64_t _stride_pdfs_0 = 
int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_velocity_0 = int64_t(velocity->xStride()); + const int64_t _stride_velocity_1 = int64_t(velocity->yStride()); + const int64_t _stride_velocity_2 = int64_t(velocity->zStride()); + const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride())); + internal_d3q19srt_kernel_getter::d3q19srt_kernel_getter(_data_density, _data_pdfs, _data_velocity, _size_density_0, _size_density_1, _size_density_2, _stride_density_0, _stride_density_1, _stride_density_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3); +} + + + +} // namespace lbm +} // namespace walberla + + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic pop +#endif + +#if ( defined WALBERLA_CXX_COMPILER_IS_INTEL ) +#pragma warning pop +#endif \ No newline at end of file diff --git a/src/lbm_generated/sweep_collection/D3Q19SRT.h b/src/lbm_generated/sweep_collection/D3Q19SRT.h new file mode 100644 index 0000000000000000000000000000000000000000..2fdb3850cb000daf544b265fa4ae3808253ddc00 --- /dev/null +++ b/src/lbm_generated/sweep_collection/D3Q19SRT.h @@ -0,0 +1,1131 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file D3Q19SRT.h +//! \\author pystencils +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" +#include "core/logging/Logging.h" +#include "core/Macros.h" + + + +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "domain_decomposition/StructuredBlockStorage.h" + +#include "field/SwapableCompare.h" +#include "field/GhostLayerField.h" + +#include <set> +#include <cmath> + + + +using namespace std::placeholders; + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-parameter" +# pragma GCC diagnostic ignored "-Wreorder" +#endif + +namespace walberla { +namespace lbm { + + +class D3Q19SRT +{ +public: + enum Type { ALL = 0, INNER = 1, OUTER = 2 }; + + D3Q19SRT(const shared_ptr< StructuredBlockStorage > & blocks, BlockDataID pdfsID_, BlockDataID densityID_, BlockDataID velocityID_, double omega, const Cell & outerWidth=Cell(1, 1, 1)) + : blocks_(blocks), pdfsID(pdfsID_), densityID(densityID_), velocityID(velocityID_), omega_(omega), outerWidth_(outerWidth) + { + + + for (auto& iBlock : *blocks) + { + if (int_c(blocks->getNumberOfXCells(iBlock)) <= outerWidth_[0] * 2 || + int_c(blocks->getNumberOfYCells(iBlock)) <= outerWidth_[1] * 2 || + 
int_c(blocks->getNumberOfZCells(iBlock)) <= outerWidth_[2] * 2) + WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock") + } + }; + + + ~D3Q19SRT() { + for(auto p: cache_pdfs_) { + delete p; + } + } + + + /************************************************************************************* + * Internal Function Definitions with raw Pointer + *************************************************************************************/ + static void streamCollide (field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, double omega, const cell_idx_t ghost_layers = 0); + static void streamCollideCellInterval (field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, double omega, const CellInterval & ci); + + static void collide (field::GhostLayerField<double, 19> * pdfs, double omega, const cell_idx_t ghost_layers = 0); + static void collideCellInterval (field::GhostLayerField<double, 19> * pdfs, double omega, const CellInterval & ci); + + static void stream (field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, const cell_idx_t ghost_layers = 0); + static void streamCellInterval (field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, const CellInterval & ci); + + static void streamOnlyNoAdvancement (field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, const cell_idx_t ghost_layers = 0); + static void streamOnlyNoAdvancementCellInterval (field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, const CellInterval & ci); + + static void initialise (field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 3> * velocity, const cell_idx_t ghost_layers = 0); + static void initialiseCellInterval (field::GhostLayerField<double, 1> * density, 
field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 3> * velocity, const CellInterval & ci); + + static void calculateMacroscopicParameters (field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 3> * velocity, const cell_idx_t ghost_layers = 0); + static void calculateMacroscopicParametersCellInterval (field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 3> * velocity, const CellInterval & ci); + + + /************************************************************************************* + * Function Definitions for external Usage + *************************************************************************************/ + + std::function<void (IBlock *)> streamCollide() + { + return [this](IBlock* block) { streamCollide(block); }; + } + + std::function<void (IBlock *)> streamCollide(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamCollideInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamCollideOuter(block); }; + default: + return [this](IBlock* block) { streamCollide(block); }; + } + } + + std::function<void (IBlock *)> streamCollide(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamCollideInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamCollideOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { streamCollide(block, ghost_layers); }; + } + } + + + + void streamCollide(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = 
pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + auto & omega = this->omega_; + + streamCollide(pdfs, pdfs_tmp, omega, ghost_layers); + pdfs->swapDataPointers(pdfs_tmp); + + } + + void streamCollide(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + auto & omega = this->omega_; + + streamCollide(pdfs, pdfs_tmp, omega, ghost_layers); + pdfs->swapDataPointers(pdfs_tmp); + + } + + + + void streamCollideCellInterval(IBlock * block, const CellInterval & ci) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + auto & omega = this->omega_; + + streamCollideCellInterval(pdfs, pdfs_tmp, omega, ci); + pdfs->swapDataPointers(pdfs_tmp); + + } + + void streamCollideInner(IBlock * block) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + auto & omega = this->omega_; + + + CellInterval inner = pdfs->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + streamCollideCellInterval(pdfs, pdfs_tmp, omega, inner); + } + + void streamCollideOuter(IBlock * block) + 
{ + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + auto & omega = this->omega_; + + + if( layers_.empty() ) + { + CellInterval ci; + + pdfs->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + streamCollideCellInterval(pdfs, pdfs_tmp, omega, ci); + } + + + pdfs->swapDataPointers(pdfs_tmp); + + } + + + std::function<void (IBlock *)> collide() + { + return [this](IBlock* block) { collide(block); }; + } + + std::function<void (IBlock *)> collide(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { collideInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { collideOuter(block); }; + default: + return [this](IBlock* block) { collide(block); }; + } + } + + std::function<void (IBlock *)> collide(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { collideInner(block); }; + case Type::OUTER: + 
return [this](IBlock* block) { collideOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { collide(block, ghost_layers); }; + } + } + + + + void collide(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + + auto & omega = this->omega_; + + collide(pdfs, omega, ghost_layers); + + } + + void collide(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + + auto & omega = this->omega_; + + collide(pdfs, omega, ghost_layers); + + } + + + + void collideCellInterval(IBlock * block, const CellInterval & ci) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + + auto & omega = this->omega_; + + collideCellInterval(pdfs, omega, ci); + + } + + void collideInner(IBlock * block) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + + auto & omega = this->omega_; + + + CellInterval inner = pdfs->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + collideCellInterval(pdfs, omega, inner); + } + + void collideOuter(IBlock * block) + { + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + + auto & omega = this->omega_; + + + if( layers_.empty() ) + { + CellInterval ci; + + pdfs->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + 
layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + collideCellInterval(pdfs, omega, ci); + } + + + + } + + + std::function<void (IBlock *)> stream() + { + return [this](IBlock* block) { stream(block); }; + } + + std::function<void (IBlock *)> stream(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamOuter(block); }; + default: + return [this](IBlock* block) { stream(block); }; + } + } + + std::function<void (IBlock *)> stream(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { stream(block, ghost_layers); }; + } + } + + + + void stream(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + stream(pdfs, pdfs_tmp, ghost_layers); + pdfs->swapDataPointers(pdfs_tmp); + + } + + void stream(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + 
stream(pdfs, pdfs_tmp, ghost_layers); + pdfs->swapDataPointers(pdfs_tmp); + + } + + + + void streamCellInterval(IBlock * block, const CellInterval & ci) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + streamCellInterval(pdfs, pdfs_tmp, ci); + pdfs->swapDataPointers(pdfs_tmp); + + } + + void streamInner(IBlock * block) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + + CellInterval inner = pdfs->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + streamCellInterval(pdfs, pdfs_tmp, inner); + } + + void streamOuter(IBlock * block) + { + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + + if( layers_.empty() ) + { + CellInterval ci; + + pdfs->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + 
pdfs->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + streamCellInterval(pdfs, pdfs_tmp, ci); + } + + + pdfs->swapDataPointers(pdfs_tmp); + + } + + + std::function<void (IBlock *)> streamOnlyNoAdvancement() + { + return [this](IBlock* block) { streamOnlyNoAdvancement(block); }; + } + + std::function<void (IBlock *)> streamOnlyNoAdvancement(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamOnlyNoAdvancementInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamOnlyNoAdvancementOuter(block); }; + default: + return [this](IBlock* block) { streamOnlyNoAdvancement(block); }; + } + } + + std::function<void (IBlock *)> streamOnlyNoAdvancement(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamOnlyNoAdvancementInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamOnlyNoAdvancementOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { streamOnlyNoAdvancement(block, ghost_layers); }; + } + } + + + + void streamOnlyNoAdvancement(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + 
streamOnlyNoAdvancement(pdfs, pdfs_tmp, ghost_layers); + + } + + void streamOnlyNoAdvancement(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + streamOnlyNoAdvancement(pdfs, pdfs_tmp, ghost_layers); + + } + + + + void streamOnlyNoAdvancementCellInterval(IBlock * block, const CellInterval & ci) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + streamOnlyNoAdvancementCellInterval(pdfs, pdfs_tmp, ci); + + } + + void streamOnlyNoAdvancementInner(IBlock * block) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + + CellInterval inner = pdfs->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + streamOnlyNoAdvancementCellInterval(pdfs, pdfs_tmp, inner); + } + + void streamOnlyNoAdvancementOuter(IBlock * block) + { + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + 
pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + + if( layers_.empty() ) + { + CellInterval ci; + + pdfs->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + streamOnlyNoAdvancementCellInterval(pdfs, pdfs_tmp, ci); + } + + + + } + + + std::function<void (IBlock *)> initialise() + { + return [this](IBlock* block) { initialise(block); }; + } + + std::function<void (IBlock *)> initialise(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { initialiseInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { initialiseOuter(block); }; + default: + return [this](IBlock* block) { initialise(block); }; + } + } + + std::function<void (IBlock *)> initialise(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { initialiseInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { initialiseOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { initialise(block, ghost_layers); }; + } + } + + + + void initialise(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto pdfs = block->getData< 
field::GhostLayerField<double, 19> >(pdfsID); + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + initialise(density, pdfs, velocity, ghost_layers); + + } + + void initialise(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + initialise(density, pdfs, velocity, ghost_layers); + + } + + + + void initialiseCellInterval(IBlock * block, const CellInterval & ci) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + initialiseCellInterval(density, pdfs, velocity, ci); + + } + + void initialiseInner(IBlock * block) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + + CellInterval inner = density->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + initialiseCellInterval(density, pdfs, velocity, inner); + } + + void initialiseOuter(IBlock * block) + { + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + + if( layers_.empty() ) + { + CellInterval ci; + + density->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + 
density->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + density->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + density->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + density->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + density->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + initialiseCellInterval(density, pdfs, velocity, ci); + } + + + + } + + + std::function<void (IBlock *)> calculateMacroscopicParameters() + { + return [this](IBlock* block) { calculateMacroscopicParameters(block); }; + } + + std::function<void (IBlock *)> calculateMacroscopicParameters(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { calculateMacroscopicParametersInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { calculateMacroscopicParametersOuter(block); }; + default: + return [this](IBlock* block) { calculateMacroscopicParameters(block); }; + } + } + + std::function<void (IBlock *)> calculateMacroscopicParameters(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { calculateMacroscopicParametersInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { calculateMacroscopicParametersOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { calculateMacroscopicParameters(block, ghost_layers); }; + } + } + + + + void calculateMacroscopicParameters(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + 
auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + calculateMacroscopicParameters(density, pdfs, velocity, ghost_layers); + + } + + void calculateMacroscopicParameters(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + calculateMacroscopicParameters(density, pdfs, velocity, ghost_layers); + + } + + + + void calculateMacroscopicParametersCellInterval(IBlock * block, const CellInterval & ci) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + calculateMacroscopicParametersCellInterval(density, pdfs, velocity, ci); + + } + + void calculateMacroscopicParametersInner(IBlock * block) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + + CellInterval inner = density->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + calculateMacroscopicParametersCellInterval(density, pdfs, velocity, inner); + } + + void calculateMacroscopicParametersOuter(IBlock * block) + { + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + + if( layers_.empty() ) + { + CellInterval ci; + + 
density->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + density->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + density->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + density->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + density->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + density->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + calculateMacroscopicParametersCellInterval(density, pdfs, velocity, ci); + } + + + + } + + + + + private: + shared_ptr< StructuredBlockStorage > blocks_; + BlockDataID pdfsID; + BlockDataID densityID; + BlockDataID velocityID; + double omega_; + + private: std::set< field::GhostLayerField<double, 19> *, field::SwapableCompare< field::GhostLayerField<double, 19> * > > cache_pdfs_; + + Cell outerWidth_; + std::vector<CellInterval> layers_; + + +}; + + +} // namespace lbm +} // namespace walberla + + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic pop +#endif \ No newline at end of file diff --git a/src/lbm_generated/sweep_collection/D3Q27SRT.cpp b/src/lbm_generated/sweep_collection/D3Q27SRT.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ce89749fc60ab603f3172992cb46c65242e57d16 --- /dev/null +++ b/src/lbm_generated/sweep_collection/D3Q27SRT.cpp @@ -0,0 +1,1220 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file D3Q27SRT.cpp +//! \\author pystencils +//====================================================================================================================== +#include "D3Q27SRT.h" + +#define FUNC_PREFIX + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-variable" +#endif + +#if ( defined WALBERLA_CXX_COMPILER_IS_INTEL ) +#pragma warning push +#pragma warning( disable : 1599 ) +#endif + +using namespace std; + +namespace walberla { +namespace lbm { + + +namespace internal_d3q27srt_kernel_streamCollide { +static FUNC_PREFIX void d3q27srt_kernel_streamCollide(double * RESTRICT const _data_pdfs, double * RESTRICT _data_pdfs_tmp, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_pdfs_tmp_0, int64_t const _stride_pdfs_tmp_1, int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3, double omega) +{ + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_2; ctr_2 += 1) + { + double * RESTRICT _data_pdfs_2m1_321 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 21*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_319 
= _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 19*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_314 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 14*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_325 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 25*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_323 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 23*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 18*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_320 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 20*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_311 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 11*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * RESTRICT _data_pdfs_21_324 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 24*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 15*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_313 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 13*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_322 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 22*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_312 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 12*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_35 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 5*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; + double * RESTRICT _data_pdfs_20_33 
= _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 17*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_326 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 26*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 16*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 6*_stride_pdfs_3; + double * RESTRICT _data_pdfs_tmp_20_30 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2; + double * RESTRICT _data_pdfs_tmp_20_31 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + _stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_32 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 2*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_33 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 3*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_34 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 4*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_35 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 5*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_36 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 6*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_37 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 7*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_38 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 8*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_39 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 9*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_310 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 10*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_311 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 11*_stride_pdfs_tmp_3; + double * RESTRICT 
_data_pdfs_tmp_20_312 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 12*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_313 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 13*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_314 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 14*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_315 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 15*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_316 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 16*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_317 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 17*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_318 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 18*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_319 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 19*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_320 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 20*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_321 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 21*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_322 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 22*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_323 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 23*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_324 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 24*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_325 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 25*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_326 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 26*_stride_pdfs_tmp_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_2m1_321_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_321; + double * RESTRICT _data_pdfs_2m1_319_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_319; + double * RESTRICT _data_pdfs_2m1_314_10 = _stride_pdfs_1*ctr_1 + 
_data_pdfs_2m1_314; + double * RESTRICT _data_pdfs_20_310_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_310; + double * RESTRICT _data_pdfs_20_38_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_38; + double * RESTRICT _data_pdfs_21_325_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_325; + double * RESTRICT _data_pdfs_21_323_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_323; + double * RESTRICT _data_pdfs_21_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_318; + double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double * RESTRICT _data_pdfs_2m1_320_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_320; + double * RESTRICT _data_pdfs_2m1_311_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_311; + double * RESTRICT _data_pdfs_20_37_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_37; + double * RESTRICT _data_pdfs_20_31_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_31; + double * RESTRICT _data_pdfs_21_324_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_324; + double * RESTRICT _data_pdfs_21_315_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_315; + double * RESTRICT _data_pdfs_2m1_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_313; + double * RESTRICT _data_pdfs_2m1_322_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_322; + double * RESTRICT _data_pdfs_2m1_312_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_312; + double * RESTRICT _data_pdfs_2m1_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_35; + double * RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + double * RESTRICT _data_pdfs_20_39_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_39; + double * RESTRICT _data_pdfs_20_32_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_32; + double * RESTRICT 
_data_pdfs_21_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_317; + double * RESTRICT _data_pdfs_21_326_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_326; + double * RESTRICT _data_pdfs_21_316_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_316; + double * RESTRICT _data_pdfs_21_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_36; + double * RESTRICT _data_pdfs_tmp_20_30_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_30; + double * RESTRICT _data_pdfs_tmp_20_31_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_31; + double * RESTRICT _data_pdfs_tmp_20_32_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_32; + double * RESTRICT _data_pdfs_tmp_20_33_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_33; + double * RESTRICT _data_pdfs_tmp_20_34_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_34; + double * RESTRICT _data_pdfs_tmp_20_35_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_35; + double * RESTRICT _data_pdfs_tmp_20_36_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_36; + double * RESTRICT _data_pdfs_tmp_20_37_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_37; + double * RESTRICT _data_pdfs_tmp_20_38_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_38; + double * RESTRICT _data_pdfs_tmp_20_39_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_39; + double * RESTRICT _data_pdfs_tmp_20_310_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_310; + double * RESTRICT _data_pdfs_tmp_20_311_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_311; + double * RESTRICT _data_pdfs_tmp_20_312_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_312; + double * RESTRICT _data_pdfs_tmp_20_313_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_313; + double * RESTRICT _data_pdfs_tmp_20_314_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_314; + double * RESTRICT _data_pdfs_tmp_20_315_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_315; + double * RESTRICT _data_pdfs_tmp_20_316_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_316; + double * 
RESTRICT _data_pdfs_tmp_20_317_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_317; + double * RESTRICT _data_pdfs_tmp_20_318_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_318; + double * RESTRICT _data_pdfs_tmp_20_319_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_319; + double * RESTRICT _data_pdfs_tmp_20_320_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_320; + double * RESTRICT _data_pdfs_tmp_20_321_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_321; + double * RESTRICT _data_pdfs_tmp_20_322_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_322; + double * RESTRICT _data_pdfs_tmp_20_323_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_323; + double * RESTRICT _data_pdfs_tmp_20_324_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_324; + double * RESTRICT _data_pdfs_tmp_20_325_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_325; + double * RESTRICT _data_pdfs_tmp_20_326_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_326; + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_0; ctr_0 += 1) + { + const double vel0Term = _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_21_323_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_21_325_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_2m1_319_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_2m1_321_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double vel1Term = _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0] + _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0] + _data_pdfs_21_324_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_320_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double vel2Term = 
_data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_2m1_322_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; + const double delta_rho = vel0Term + vel1Term + vel2Term + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0] + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0] + _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_21_326_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; + const double u_0 = vel0Term - 1.0*_data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_21_324_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_21_326_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_2m1_320_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_2m1_322_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double u_1 = vel1Term - 1.0*_data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - 1.0*_data_pdfs_20_32_11[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_21_316_11[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_21_325_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - 1.0*_data_pdfs_21_326_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_2m1_321_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - 1.0*_data_pdfs_2m1_322_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 
_data_pdfs_21_323_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_2m1_319_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double u_2 = vel2Term - 1.0*_data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_21_316_11[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - 1.0*_data_pdfs_21_323_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - 1.0*_data_pdfs_21_324_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_21_325_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - 1.0*_data_pdfs_21_326_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_21_36_10[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_2m1_319_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_2m1_320_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_2m1_321_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double u0Mu1 = u_0 + u_1*-1.0; + const double u0Pu1 = u_0 + u_1; + const double u1Pu2 = u_1 + u_2; + const double u1Mu2 = u_1 + u_2*-1.0; + const double u0Mu2 = u_0 + u_2*-1.0; + const double u0Pu2 = u_0 + u_2; + const double f_eq_common = delta_rho - 1.5*(u_0*u_0) - 1.5*(u_1*u_1) - 1.5*(u_2*u_2); + _data_pdfs_tmp_20_30_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.29629629629629628 - 1.0*_data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_31_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_1*0.22222222222222221 - 1.0*_data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0] + 0.33333333333333331*(u_1*u_1)) + _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_32_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_1*-0.22222222222222221 - 1.0*_data_pdfs_20_32_11[_stride_pdfs_0*ctr_0] + 0.33333333333333331*(u_1*u_1)) + _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0]; + 
_data_pdfs_tmp_20_33_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_0*-0.22222222222222221 - 1.0*_data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.33333333333333331*(u_0*u_0)) + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_34_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_0*0.22222222222222221 - 1.0*_data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.33333333333333331*(u_0*u_0)) + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_35_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_2*0.22222222222222221 - 1.0*_data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0] + 0.33333333333333331*(u_2*u_2)) + _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_36_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_2*-0.22222222222222221 - 1.0*_data_pdfs_21_36_10[_stride_pdfs_0*ctr_0] + 0.33333333333333331*(u_2*u_2)) + _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_37_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Mu1*-0.055555555555555552 - 1.0*_data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.083333333333333329*(u0Mu1*u0Mu1)) + _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_38_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Pu1*0.055555555555555552 - 1.0*_data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.083333333333333329*(u0Pu1*u0Pu1)) + _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_39_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Pu1*-0.055555555555555552 - 1.0*_data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.083333333333333329*(u0Pu1*u0Pu1)) + _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_310_10[_stride_pdfs_tmp_0*ctr_0] = 
omega*(f_eq_common*0.018518518518518517 + u0Mu1*0.055555555555555552 - 1.0*_data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.083333333333333329*(u0Mu1*u0Mu1)) + _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_311_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u1Pu2*0.055555555555555552 - 1.0*_data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0] + 0.083333333333333329*(u1Pu2*u1Pu2)) + _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_312_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u1Mu2*-0.055555555555555552 - 1.0*_data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0] + 0.083333333333333329*(u1Mu2*u1Mu2)) + _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_313_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Mu2*-0.055555555555555552 - 1.0*_data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.083333333333333329*(u0Mu2*u0Mu2)) + _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_314_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Pu2*0.055555555555555552 - 1.0*_data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.083333333333333329*(u0Pu2*u0Pu2)) + _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_315_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u1Mu2*0.055555555555555552 - 1.0*_data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0] + 0.083333333333333329*(u1Mu2*u1Mu2)) + _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_316_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u1Pu2*-0.055555555555555552 - 1.0*_data_pdfs_21_316_11[_stride_pdfs_0*ctr_0] + 0.083333333333333329*(u1Pu2*u1Pu2)) + _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_317_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Pu2*-0.055555555555555552 - 
1.0*_data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.083333333333333329*(u0Pu2*u0Pu2)) + _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_318_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Mu2*0.055555555555555552 - 1.0*_data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.083333333333333329*(u0Mu2*u0Mu2)) + _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_319_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Pu1*0.013888888888888888 + u_2*0.013888888888888888 - 1.0*_data_pdfs_2m1_319_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Pu2*u1Pu2)) + _data_pdfs_2m1_319_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_320_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Mu1*-0.013888888888888888 + u_2*0.013888888888888888 - 1.0*_data_pdfs_2m1_320_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u1Pu2*u1Pu2)) + _data_pdfs_2m1_320_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_321_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Mu1*0.013888888888888888 + u_2*0.013888888888888888 - 1.0*_data_pdfs_2m1_321_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Mu2*u1Mu2)) + _data_pdfs_2m1_321_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_322_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Pu1*-0.013888888888888888 + u_2*0.013888888888888888 - 1.0*_data_pdfs_2m1_322_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 
0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u1Mu2*u1Mu2)) + _data_pdfs_2m1_322_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_323_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Pu1*0.013888888888888888 + u_2*-0.013888888888888888 - 1.0*_data_pdfs_21_323_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u1Mu2*u1Mu2)) + _data_pdfs_21_323_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_324_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Mu1*-0.013888888888888888 + u_2*-0.013888888888888888 - 1.0*_data_pdfs_21_324_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Mu2*u1Mu2)) + _data_pdfs_21_324_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_325_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Mu1*0.013888888888888888 + u_2*-0.013888888888888888 - 1.0*_data_pdfs_21_325_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u1Pu2*u1Pu2)) + _data_pdfs_21_325_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_326_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Pu1*-0.013888888888888888 + u_2*-0.013888888888888888 - 1.0*_data_pdfs_21_326_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Pu2*u1Pu2)) + _data_pdfs_21_326_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + } + } + } +} +} + + +namespace internal_d3q27srt_kernel_collide { +static FUNC_PREFIX void 
d3q27srt_kernel_collide(double * RESTRICT _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, double omega) +{ + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_2; ctr_2 += 1) + { + double * RESTRICT _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_326 = _data_pdfs + _stride_pdfs_2*ctr_2 + 26*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_323 = _data_pdfs + _stride_pdfs_2*ctr_2 + 23*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_321 = _data_pdfs + _stride_pdfs_2*ctr_2 + 21*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_324 = _data_pdfs + _stride_pdfs_2*ctr_2 + 24*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_319 = _data_pdfs + _stride_pdfs_2*ctr_2 + 19*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_325 = _data_pdfs + _stride_pdfs_2*ctr_2 + 25*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_320 = _data_pdfs + _stride_pdfs_2*ctr_2 + 20*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_39 = _data_pdfs + 
_stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; + double * RESTRICT _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_322 = _data_pdfs + _stride_pdfs_2*ctr_2 + 22*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_20_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_313; + double * RESTRICT _data_pdfs_20_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_36; + double * RESTRICT _data_pdfs_20_326_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_326; + double * RESTRICT _data_pdfs_20_323_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_323; + double * RESTRICT _data_pdfs_20_310_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_310; + double * RESTRICT _data_pdfs_20_316_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_316; + double * RESTRICT _data_pdfs_20_312_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_312; + double * RESTRICT _data_pdfs_20_321_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_321; + double * RESTRICT _data_pdfs_20_311_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_311; + double * RESTRICT _data_pdfs_20_324_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_324; + double * RESTRICT _data_pdfs_20_37_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_37; + double * RESTRICT _data_pdfs_20_319_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_319; + double * RESTRICT _data_pdfs_20_31_10 = 
_stride_pdfs_1*ctr_1 + _data_pdfs_20_31; + double * RESTRICT _data_pdfs_20_38_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_38; + double * RESTRICT _data_pdfs_20_325_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_325; + double * RESTRICT _data_pdfs_20_32_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_32; + double * RESTRICT _data_pdfs_20_320_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_320; + double * RESTRICT _data_pdfs_20_39_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_39; + double * RESTRICT _data_pdfs_20_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_35; + double * RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * RESTRICT _data_pdfs_20_315_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_315; + double * RESTRICT _data_pdfs_20_322_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_322; + double * RESTRICT _data_pdfs_20_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_317; + double * RESTRICT _data_pdfs_20_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_314; + double * RESTRICT _data_pdfs_20_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_318; + double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_0; ctr_0 += 1) + { + const double xi_1 = _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0]; + const double xi_2 = _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0]; + const double xi_3 = _data_pdfs_20_326_10[_stride_pdfs_0*ctr_0]; + const double xi_4 = _data_pdfs_20_323_10[_stride_pdfs_0*ctr_0]; + const double xi_5 = _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0]; + const double xi_6 = _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0]; + const double xi_7 = _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0]; + const double xi_8 = _data_pdfs_20_321_10[_stride_pdfs_0*ctr_0]; + const double xi_9 = _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0]; + const double xi_10 = _data_pdfs_20_324_10[_stride_pdfs_0*ctr_0]; + const double xi_11 = 
_data_pdfs_20_37_10[_stride_pdfs_0*ctr_0]; + const double xi_12 = _data_pdfs_20_319_10[_stride_pdfs_0*ctr_0]; + const double xi_13 = _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0]; + const double xi_14 = _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0]; + const double xi_15 = _data_pdfs_20_325_10[_stride_pdfs_0*ctr_0]; + const double xi_16 = _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0]; + const double xi_17 = _data_pdfs_20_320_10[_stride_pdfs_0*ctr_0]; + const double xi_18 = _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0]; + const double xi_19 = _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0]; + const double xi_20 = _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; + const double xi_21 = _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0]; + const double xi_22 = _data_pdfs_20_322_10[_stride_pdfs_0*ctr_0]; + const double xi_23 = _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0]; + const double xi_24 = _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0]; + const double xi_25 = _data_pdfs_20_318_10[_stride_pdfs_0*ctr_0]; + const double xi_26 = _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0]; + const double xi_27 = _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0]; + const double vel0Term = xi_12 + xi_14 + xi_15 + xi_24 + xi_25 + xi_26 + xi_4 + xi_5 + xi_8; + const double vel1Term = xi_10 + xi_11 + xi_13 + xi_17 + xi_21 + xi_9; + const double vel2Term = xi_1 + xi_19 + xi_22 + xi_7; + const double delta_rho = vel0Term + vel1Term + vel2Term + xi_16 + xi_18 + xi_2 + xi_20 + xi_23 + xi_27 + xi_3 + xi_6; + const double u_0 = vel0Term + xi_1*-1.0 + xi_10*-1.0 + xi_11*-1.0 + xi_17*-1.0 + xi_18*-1.0 + xi_22*-1.0 + xi_23*-1.0 + xi_27*-1.0 + xi_3*-1.0; + const double u_1 = vel1Term + xi_12 + xi_14 + xi_15*-1.0 + xi_16*-1.0 + xi_18*-1.0 + xi_22*-1.0 + xi_3*-1.0 + xi_4 + xi_5*-1.0 + xi_6*-1.0 + xi_7*-1.0 + xi_8*-1.0; + const double u_2 = vel2Term + xi_10*-1.0 + xi_12 + xi_15*-1.0 + xi_17 + xi_2*-1.0 + xi_21*-1.0 + xi_23*-1.0 + xi_24 + xi_25*-1.0 + xi_3*-1.0 + xi_4*-1.0 + xi_6*-1.0 + xi_8 + xi_9; + const double u0Mu1 = u_0 + u_1*-1.0; + const double 
u0Pu1 = u_0 + u_1; + const double u1Pu2 = u_1 + u_2; + const double u1Mu2 = u_1 + u_2*-1.0; + const double u0Mu2 = u_0 + u_2*-1.0; + const double u0Pu2 = u_0 + u_2; + const double f_eq_common = delta_rho - 1.5*(u_0*u_0) - 1.5*(u_1*u_1) - 1.5*(u_2*u_2); + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.29629629629629628 + xi_20*-1.0) + xi_20; + _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_1*0.22222222222222221 + xi_13*-1.0 + 0.33333333333333331*(u_1*u_1)) + xi_13; + _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_1*-0.22222222222222221 + xi_16*-1.0 + 0.33333333333333331*(u_1*u_1)) + xi_16; + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_0*-0.22222222222222221 + xi_27*-1.0 + 0.33333333333333331*(u_0*u_0)) + xi_27; + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_0*0.22222222222222221 + xi_26*-1.0 + 0.33333333333333331*(u_0*u_0)) + xi_26; + _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_2*0.22222222222222221 + xi_19*-1.0 + 0.33333333333333331*(u_2*u_2)) + xi_19; + _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_2*-0.22222222222222221 + xi_2*-1.0 + 0.33333333333333331*(u_2*u_2)) + xi_2; + _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Mu1*-0.055555555555555552 + xi_11*-1.0 + 0.083333333333333329*(u0Mu1*u0Mu1)) + xi_11; + _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Pu1*0.055555555555555552 + xi_14*-1.0 + 0.083333333333333329*(u0Pu1*u0Pu1)) + xi_14; + _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Pu1*-0.055555555555555552 + xi_18*-1.0 + 0.083333333333333329*(u0Pu1*u0Pu1)) + xi_18; + _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + 
u0Mu1*0.055555555555555552 + xi_5*-1.0 + 0.083333333333333329*(u0Mu1*u0Mu1)) + xi_5; + _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u1Pu2*0.055555555555555552 + xi_9*-1.0 + 0.083333333333333329*(u1Pu2*u1Pu2)) + xi_9; + _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u1Mu2*-0.055555555555555552 + xi_7*-1.0 + 0.083333333333333329*(u1Mu2*u1Mu2)) + xi_7; + _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Mu2*-0.055555555555555552 + xi_1*-1.0 + 0.083333333333333329*(u0Mu2*u0Mu2)) + xi_1; + _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Pu2*0.055555555555555552 + xi_24*-1.0 + 0.083333333333333329*(u0Pu2*u0Pu2)) + xi_24; + _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u1Mu2*0.055555555555555552 + xi_21*-1.0 + 0.083333333333333329*(u1Mu2*u1Mu2)) + xi_21; + _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u1Pu2*-0.055555555555555552 + xi_6*-1.0 + 0.083333333333333329*(u1Pu2*u1Pu2)) + xi_6; + _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Pu2*-0.055555555555555552 + xi_23*-1.0 + 0.083333333333333329*(u0Pu2*u0Pu2)) + xi_23; + _data_pdfs_20_318_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Mu2*0.055555555555555552 + xi_25*-1.0 + 0.083333333333333329*(u0Mu2*u0Mu2)) + xi_25; + _data_pdfs_20_319_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Pu1*0.013888888888888888 + u_2*0.013888888888888888 + xi_12*-1.0 + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Pu2*u1Pu2)) + xi_12; + _data_pdfs_20_320_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Mu1*-0.013888888888888888 + u_2*0.013888888888888888 + xi_17*-1.0 + 
0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u1Pu2*u1Pu2)) + xi_17; + _data_pdfs_20_321_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Mu1*0.013888888888888888 + u_2*0.013888888888888888 + xi_8*-1.0 + 0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Mu2*u1Mu2)) + xi_8; + _data_pdfs_20_322_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Pu1*-0.013888888888888888 + u_2*0.013888888888888888 + xi_22*-1.0 + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u1Mu2*u1Mu2)) + xi_22; + _data_pdfs_20_323_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Pu1*0.013888888888888888 + u_2*-0.013888888888888888 + xi_4*-1.0 + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u1Mu2*u1Mu2)) + xi_4; + _data_pdfs_20_324_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Mu1*-0.013888888888888888 + u_2*-0.013888888888888888 + xi_10*-1.0 + 0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Mu2*u1Mu2)) + xi_10; + _data_pdfs_20_325_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Mu1*0.013888888888888888 + u_2*-0.013888888888888888 + xi_15*-1.0 + 0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u1Pu2*u1Pu2)) + xi_15; + _data_pdfs_20_326_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Pu1*-0.013888888888888888 + u_2*-0.013888888888888888 + xi_3*-1.0 + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Pu2*u1Pu2)) + xi_3; + } + } + } 
+} +} + + +namespace internal_d3q27srt_kernel_stream { +static FUNC_PREFIX void d3q27srt_kernel_stream(double * RESTRICT const _data_pdfs, double * RESTRICT _data_pdfs_tmp, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_pdfs_tmp_0, int64_t const _stride_pdfs_tmp_1, int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3) +{ + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_2; ctr_2 += 1) + { + double * RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; + double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_35 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 5*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 6*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_311 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 11*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_312 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 12*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_313 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 13*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_314 = 
_data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 14*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 15*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 16*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 17*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 18*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_319 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 19*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_320 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 20*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_321 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 21*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_322 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 22*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_323 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 23*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_324 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 24*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_325 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 25*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_326 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 26*_stride_pdfs_3; + double * RESTRICT _data_pdfs_tmp_20_30 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2; + double * RESTRICT _data_pdfs_tmp_20_31 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + _stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_32 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 2*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_33 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 3*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_34 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 4*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_35 = _data_pdfs_tmp + 
_stride_pdfs_tmp_2*ctr_2 + 5*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_36 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 6*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_37 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 7*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_38 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 8*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_39 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 9*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_310 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 10*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_311 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 11*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_312 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 12*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_313 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 13*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_314 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 14*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_315 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 15*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_316 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 16*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_317 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 17*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_318 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 18*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_319 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 19*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_320 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 20*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_321 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 21*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_322 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 22*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_323 = _data_pdfs_tmp + 
_stride_pdfs_tmp_2*ctr_2 + 23*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_324 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 24*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_325 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 25*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_326 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 26*_stride_pdfs_tmp_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * RESTRICT _data_pdfs_20_31_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_31; + double * RESTRICT _data_pdfs_20_32_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_32; + double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double * RESTRICT _data_pdfs_2m1_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_35; + double * RESTRICT _data_pdfs_21_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_36; + double * RESTRICT _data_pdfs_20_37_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_37; + double * RESTRICT _data_pdfs_20_38_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_38; + double * RESTRICT _data_pdfs_20_39_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_39; + double * RESTRICT _data_pdfs_20_310_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_310; + double * RESTRICT _data_pdfs_2m1_311_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_311; + double * RESTRICT _data_pdfs_2m1_312_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_312; + double * RESTRICT _data_pdfs_2m1_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_313; + double * RESTRICT _data_pdfs_2m1_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_314; + double * RESTRICT _data_pdfs_21_315_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_315; + double * RESTRICT _data_pdfs_21_316_11 = 
_stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_316; + double * RESTRICT _data_pdfs_21_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_317; + double * RESTRICT _data_pdfs_21_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_318; + double * RESTRICT _data_pdfs_2m1_319_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_319; + double * RESTRICT _data_pdfs_2m1_320_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_320; + double * RESTRICT _data_pdfs_2m1_321_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_321; + double * RESTRICT _data_pdfs_2m1_322_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_322; + double * RESTRICT _data_pdfs_21_323_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_323; + double * RESTRICT _data_pdfs_21_324_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_324; + double * RESTRICT _data_pdfs_21_325_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_325; + double * RESTRICT _data_pdfs_21_326_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_326; + double * RESTRICT _data_pdfs_tmp_20_30_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_30; + double * RESTRICT _data_pdfs_tmp_20_31_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_31; + double * RESTRICT _data_pdfs_tmp_20_32_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_32; + double * RESTRICT _data_pdfs_tmp_20_33_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_33; + double * RESTRICT _data_pdfs_tmp_20_34_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_34; + double * RESTRICT _data_pdfs_tmp_20_35_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_35; + double * RESTRICT _data_pdfs_tmp_20_36_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_36; + double * RESTRICT _data_pdfs_tmp_20_37_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_37; + double * RESTRICT _data_pdfs_tmp_20_38_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_38; + double * RESTRICT _data_pdfs_tmp_20_39_10 = _stride_pdfs_tmp_1*ctr_1 + 
_data_pdfs_tmp_20_39; + double * RESTRICT _data_pdfs_tmp_20_310_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_310; + double * RESTRICT _data_pdfs_tmp_20_311_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_311; + double * RESTRICT _data_pdfs_tmp_20_312_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_312; + double * RESTRICT _data_pdfs_tmp_20_313_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_313; + double * RESTRICT _data_pdfs_tmp_20_314_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_314; + double * RESTRICT _data_pdfs_tmp_20_315_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_315; + double * RESTRICT _data_pdfs_tmp_20_316_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_316; + double * RESTRICT _data_pdfs_tmp_20_317_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_317; + double * RESTRICT _data_pdfs_tmp_20_318_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_318; + double * RESTRICT _data_pdfs_tmp_20_319_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_319; + double * RESTRICT _data_pdfs_tmp_20_320_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_320; + double * RESTRICT _data_pdfs_tmp_20_321_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_321; + double * RESTRICT _data_pdfs_tmp_20_322_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_322; + double * RESTRICT _data_pdfs_tmp_20_323_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_323; + double * RESTRICT _data_pdfs_tmp_20_324_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_324; + double * RESTRICT _data_pdfs_tmp_20_325_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_325; + double * RESTRICT _data_pdfs_tmp_20_326_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_326; + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_0; ctr_0 += 1) + { + const double streamed_0 = _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; + const double streamed_1 = _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0]; + const double streamed_2 = _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0]; + const double streamed_3 = 
_data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_4 = _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_5 = _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; + const double streamed_6 = _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; + const double streamed_7 = _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_8 = _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_9 = _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_10 = _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_11 = _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]; + const double streamed_12 = _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0]; + const double streamed_13 = _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_14 = _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_15 = _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0]; + const double streamed_16 = _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0]; + const double streamed_17 = _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_18 = _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_19 = _data_pdfs_2m1_319_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_20 = _data_pdfs_2m1_320_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_21 = _data_pdfs_2m1_321_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_22 = _data_pdfs_2m1_322_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_23 = _data_pdfs_21_323_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_24 = _data_pdfs_21_324_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_25 = _data_pdfs_21_325_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_26 = 
_data_pdfs_21_326_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_30_10[_stride_pdfs_tmp_0*ctr_0] = streamed_0; + _data_pdfs_tmp_20_31_10[_stride_pdfs_tmp_0*ctr_0] = streamed_1; + _data_pdfs_tmp_20_32_10[_stride_pdfs_tmp_0*ctr_0] = streamed_2; + _data_pdfs_tmp_20_33_10[_stride_pdfs_tmp_0*ctr_0] = streamed_3; + _data_pdfs_tmp_20_34_10[_stride_pdfs_tmp_0*ctr_0] = streamed_4; + _data_pdfs_tmp_20_35_10[_stride_pdfs_tmp_0*ctr_0] = streamed_5; + _data_pdfs_tmp_20_36_10[_stride_pdfs_tmp_0*ctr_0] = streamed_6; + _data_pdfs_tmp_20_37_10[_stride_pdfs_tmp_0*ctr_0] = streamed_7; + _data_pdfs_tmp_20_38_10[_stride_pdfs_tmp_0*ctr_0] = streamed_8; + _data_pdfs_tmp_20_39_10[_stride_pdfs_tmp_0*ctr_0] = streamed_9; + _data_pdfs_tmp_20_310_10[_stride_pdfs_tmp_0*ctr_0] = streamed_10; + _data_pdfs_tmp_20_311_10[_stride_pdfs_tmp_0*ctr_0] = streamed_11; + _data_pdfs_tmp_20_312_10[_stride_pdfs_tmp_0*ctr_0] = streamed_12; + _data_pdfs_tmp_20_313_10[_stride_pdfs_tmp_0*ctr_0] = streamed_13; + _data_pdfs_tmp_20_314_10[_stride_pdfs_tmp_0*ctr_0] = streamed_14; + _data_pdfs_tmp_20_315_10[_stride_pdfs_tmp_0*ctr_0] = streamed_15; + _data_pdfs_tmp_20_316_10[_stride_pdfs_tmp_0*ctr_0] = streamed_16; + _data_pdfs_tmp_20_317_10[_stride_pdfs_tmp_0*ctr_0] = streamed_17; + _data_pdfs_tmp_20_318_10[_stride_pdfs_tmp_0*ctr_0] = streamed_18; + _data_pdfs_tmp_20_319_10[_stride_pdfs_tmp_0*ctr_0] = streamed_19; + _data_pdfs_tmp_20_320_10[_stride_pdfs_tmp_0*ctr_0] = streamed_20; + _data_pdfs_tmp_20_321_10[_stride_pdfs_tmp_0*ctr_0] = streamed_21; + _data_pdfs_tmp_20_322_10[_stride_pdfs_tmp_0*ctr_0] = streamed_22; + _data_pdfs_tmp_20_323_10[_stride_pdfs_tmp_0*ctr_0] = streamed_23; + _data_pdfs_tmp_20_324_10[_stride_pdfs_tmp_0*ctr_0] = streamed_24; + _data_pdfs_tmp_20_325_10[_stride_pdfs_tmp_0*ctr_0] = streamed_25; + _data_pdfs_tmp_20_326_10[_stride_pdfs_tmp_0*ctr_0] = streamed_26; + } + } + } +} +} + + +namespace internal_d3q27srt_kernel_streamOnlyNoAdvancement { +static FUNC_PREFIX void 
d3q27srt_kernel_streamOnlyNoAdvancement(double * RESTRICT const _data_pdfs, double * RESTRICT _data_pdfs_tmp, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_pdfs_tmp_0, int64_t const _stride_pdfs_tmp_1, int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3) +{ + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_2; ctr_2 += 1) + { + double * RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; + double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_35 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 5*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 6*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_311 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 11*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_312 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 12*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_313 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 13*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_314 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 
14*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 15*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 16*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 17*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 18*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_319 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 19*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_320 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 20*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_321 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 21*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_322 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 22*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_323 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 23*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_324 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 24*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_325 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 25*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_326 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 26*_stride_pdfs_3; + double * RESTRICT _data_pdfs_tmp_20_30 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2; + double * RESTRICT _data_pdfs_tmp_20_31 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + _stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_32 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 2*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_33 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 3*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_34 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 4*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_35 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 5*_stride_pdfs_tmp_3; + 
double * RESTRICT _data_pdfs_tmp_20_36 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 6*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_37 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 7*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_38 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 8*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_39 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 9*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_310 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 10*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_311 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 11*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_312 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 12*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_313 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 13*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_314 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 14*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_315 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 15*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_316 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 16*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_317 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 17*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_318 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 18*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_319 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 19*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_320 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 20*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_321 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 21*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_322 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 22*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_323 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 23*_stride_pdfs_tmp_3; + double * 
RESTRICT _data_pdfs_tmp_20_324 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 24*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_325 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 25*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_326 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 26*_stride_pdfs_tmp_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * RESTRICT _data_pdfs_20_31_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_31; + double * RESTRICT _data_pdfs_20_32_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_32; + double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double * RESTRICT _data_pdfs_2m1_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_35; + double * RESTRICT _data_pdfs_21_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_36; + double * RESTRICT _data_pdfs_20_37_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_37; + double * RESTRICT _data_pdfs_20_38_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_38; + double * RESTRICT _data_pdfs_20_39_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_39; + double * RESTRICT _data_pdfs_20_310_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_310; + double * RESTRICT _data_pdfs_2m1_311_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_311; + double * RESTRICT _data_pdfs_2m1_312_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_312; + double * RESTRICT _data_pdfs_2m1_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_313; + double * RESTRICT _data_pdfs_2m1_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_314; + double * RESTRICT _data_pdfs_21_315_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_315; + double * RESTRICT _data_pdfs_21_316_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_316; + 
double * RESTRICT _data_pdfs_21_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_317; + double * RESTRICT _data_pdfs_21_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_318; + double * RESTRICT _data_pdfs_2m1_319_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_319; + double * RESTRICT _data_pdfs_2m1_320_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_320; + double * RESTRICT _data_pdfs_2m1_321_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_321; + double * RESTRICT _data_pdfs_2m1_322_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_322; + double * RESTRICT _data_pdfs_21_323_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_323; + double * RESTRICT _data_pdfs_21_324_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_324; + double * RESTRICT _data_pdfs_21_325_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_325; + double * RESTRICT _data_pdfs_21_326_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_326; + double * RESTRICT _data_pdfs_tmp_20_30_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_30; + double * RESTRICT _data_pdfs_tmp_20_31_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_31; + double * RESTRICT _data_pdfs_tmp_20_32_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_32; + double * RESTRICT _data_pdfs_tmp_20_33_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_33; + double * RESTRICT _data_pdfs_tmp_20_34_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_34; + double * RESTRICT _data_pdfs_tmp_20_35_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_35; + double * RESTRICT _data_pdfs_tmp_20_36_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_36; + double * RESTRICT _data_pdfs_tmp_20_37_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_37; + double * RESTRICT _data_pdfs_tmp_20_38_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_38; + double * RESTRICT _data_pdfs_tmp_20_39_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_39; + double * RESTRICT _data_pdfs_tmp_20_310_10 = 
_stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_310; + double * RESTRICT _data_pdfs_tmp_20_311_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_311; + double * RESTRICT _data_pdfs_tmp_20_312_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_312; + double * RESTRICT _data_pdfs_tmp_20_313_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_313; + double * RESTRICT _data_pdfs_tmp_20_314_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_314; + double * RESTRICT _data_pdfs_tmp_20_315_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_315; + double * RESTRICT _data_pdfs_tmp_20_316_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_316; + double * RESTRICT _data_pdfs_tmp_20_317_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_317; + double * RESTRICT _data_pdfs_tmp_20_318_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_318; + double * RESTRICT _data_pdfs_tmp_20_319_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_319; + double * RESTRICT _data_pdfs_tmp_20_320_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_320; + double * RESTRICT _data_pdfs_tmp_20_321_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_321; + double * RESTRICT _data_pdfs_tmp_20_322_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_322; + double * RESTRICT _data_pdfs_tmp_20_323_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_323; + double * RESTRICT _data_pdfs_tmp_20_324_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_324; + double * RESTRICT _data_pdfs_tmp_20_325_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_325; + double * RESTRICT _data_pdfs_tmp_20_326_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_326; + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_0; ctr_0 += 1) + { + const double streamed_0 = _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; + const double streamed_1 = _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0]; + const double streamed_2 = _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0]; + const double streamed_3 = _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_4 = 
_data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_5 = _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; + const double streamed_6 = _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; + const double streamed_7 = _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_8 = _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_9 = _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_10 = _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_11 = _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]; + const double streamed_12 = _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0]; + const double streamed_13 = _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_14 = _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_15 = _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0]; + const double streamed_16 = _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0]; + const double streamed_17 = _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_18 = _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_19 = _data_pdfs_2m1_319_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_20 = _data_pdfs_2m1_320_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_21 = _data_pdfs_2m1_321_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_22 = _data_pdfs_2m1_322_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_23 = _data_pdfs_21_323_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_24 = _data_pdfs_21_324_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_25 = _data_pdfs_21_325_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_26 = _data_pdfs_21_326_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_30_10[_stride_pdfs_tmp_0*ctr_0] = 
streamed_0;
            // Tail of d3q27srt_kernel_streamOnlyNoAdvancement: write the pulled
            // PDF values for directions 1..26 into the temporary (destination) field.
            _data_pdfs_tmp_20_31_10[_stride_pdfs_tmp_0*ctr_0] = streamed_1;
            _data_pdfs_tmp_20_32_10[_stride_pdfs_tmp_0*ctr_0] = streamed_2;
            _data_pdfs_tmp_20_33_10[_stride_pdfs_tmp_0*ctr_0] = streamed_3;
            _data_pdfs_tmp_20_34_10[_stride_pdfs_tmp_0*ctr_0] = streamed_4;
            _data_pdfs_tmp_20_35_10[_stride_pdfs_tmp_0*ctr_0] = streamed_5;
            _data_pdfs_tmp_20_36_10[_stride_pdfs_tmp_0*ctr_0] = streamed_6;
            _data_pdfs_tmp_20_37_10[_stride_pdfs_tmp_0*ctr_0] = streamed_7;
            _data_pdfs_tmp_20_38_10[_stride_pdfs_tmp_0*ctr_0] = streamed_8;
            _data_pdfs_tmp_20_39_10[_stride_pdfs_tmp_0*ctr_0] = streamed_9;
            _data_pdfs_tmp_20_310_10[_stride_pdfs_tmp_0*ctr_0] = streamed_10;
            _data_pdfs_tmp_20_311_10[_stride_pdfs_tmp_0*ctr_0] = streamed_11;
            _data_pdfs_tmp_20_312_10[_stride_pdfs_tmp_0*ctr_0] = streamed_12;
            _data_pdfs_tmp_20_313_10[_stride_pdfs_tmp_0*ctr_0] = streamed_13;
            _data_pdfs_tmp_20_314_10[_stride_pdfs_tmp_0*ctr_0] = streamed_14;
            _data_pdfs_tmp_20_315_10[_stride_pdfs_tmp_0*ctr_0] = streamed_15;
            _data_pdfs_tmp_20_316_10[_stride_pdfs_tmp_0*ctr_0] = streamed_16;
            _data_pdfs_tmp_20_317_10[_stride_pdfs_tmp_0*ctr_0] = streamed_17;
            _data_pdfs_tmp_20_318_10[_stride_pdfs_tmp_0*ctr_0] = streamed_18;
            _data_pdfs_tmp_20_319_10[_stride_pdfs_tmp_0*ctr_0] = streamed_19;
            _data_pdfs_tmp_20_320_10[_stride_pdfs_tmp_0*ctr_0] = streamed_20;
            _data_pdfs_tmp_20_321_10[_stride_pdfs_tmp_0*ctr_0] = streamed_21;
            _data_pdfs_tmp_20_322_10[_stride_pdfs_tmp_0*ctr_0] = streamed_22;
            _data_pdfs_tmp_20_323_10[_stride_pdfs_tmp_0*ctr_0] = streamed_23;
            _data_pdfs_tmp_20_324_10[_stride_pdfs_tmp_0*ctr_0] = streamed_24;
            _data_pdfs_tmp_20_325_10[_stride_pdfs_tmp_0*ctr_0] = streamed_25;
            _data_pdfs_tmp_20_326_10[_stride_pdfs_tmp_0*ctr_0] = streamed_26;
         }
      }
   }
}
}


namespace internal_d3q27srt_kernel_initialise {
// Auto-generated (pystencils/lbmpy) D3Q27 initialisation kernel.
//
// Fills all 27 PDF components of `_data_pdfs` with the second-order polynomial
// equilibrium distribution computed from the cell-local density and velocity
// fields. Storage is zero-centered: `delta_rho = rho - 1.0` is used in every
// expression, so the PDFs hold deviations from the unit background density
// (the matching getter reconstructs `rho = delta_rho + 1.0`).
//
// The leading coefficients are the D3Q27 lattice weights:
//   0.29629...  = 8/27  (rest direction),
//   0.07407...  = 2/27  (axis-aligned),
//   0.018518... = 1/54  (face diagonals),
//   0.0046296...= 1/216 (space diagonals).
//
// Parameters: raw field pointers, iteration-space sizes (_size_density_*) and
// per-dimension strides for each field; _stride_*_3 is the component (f/u) stride.
// NOTE(review): do not reorder the floating-point expressions below — this file
// is generated and results are expected to be bit-identical to the generator's.
static FUNC_PREFIX void d3q27srt_kernel_initialise(double * RESTRICT const _data_density, double * RESTRICT _data_pdfs, double * RESTRICT const _data_velocity, int64_t const _size_density_0, int64_t const _size_density_1, int64_t const _size_density_2, int64_t const _stride_density_0, int64_t const _stride_density_1, int64_t const _stride_density_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_velocity_0, int64_t const _stride_velocity_1, int64_t const _stride_velocity_2, int64_t const _stride_velocity_3)
{
   for (int64_t ctr_2 = 0; ctr_2 < _size_density_2; ctr_2 += 1)
   {
      // Base pointers for the current z-slab: one per field component
      // (density, 3 velocity components, 27 PDF directions).
      double * RESTRICT _data_density_20_30 = _data_density + _stride_density_2*ctr_2;
      double * RESTRICT _data_velocity_20_30 = _data_velocity + _stride_velocity_2*ctr_2;
      double * RESTRICT _data_velocity_20_31 = _data_velocity + _stride_velocity_2*ctr_2 + _stride_velocity_3;
      double * RESTRICT _data_velocity_20_32 = _data_velocity + _stride_velocity_2*ctr_2 + 2*_stride_velocity_3;
      double * RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2;
      double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_319 = _data_pdfs + _stride_pdfs_2*ctr_2 + 19*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_320 = _data_pdfs + _stride_pdfs_2*ctr_2 + 20*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_321 = _data_pdfs + _stride_pdfs_2*ctr_2 + 21*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_322 = _data_pdfs + _stride_pdfs_2*ctr_2 + 22*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_323 = _data_pdfs + _stride_pdfs_2*ctr_2 + 23*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_324 = _data_pdfs + _stride_pdfs_2*ctr_2 + 24*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_325 = _data_pdfs + _stride_pdfs_2*ctr_2 + 25*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_326 = _data_pdfs + _stride_pdfs_2*ctr_2 + 26*_stride_pdfs_3;
      for (int64_t ctr_1 = 0; ctr_1 < _size_density_1; ctr_1 += 1)
      {
         // Row base pointers for the current y-line within the slab.
         double * RESTRICT _data_density_20_30_10 = _stride_density_1*ctr_1 + _data_density_20_30;
         double * RESTRICT _data_velocity_20_30_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_30;
         double * RESTRICT _data_velocity_20_31_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_31;
         double * RESTRICT _data_velocity_20_32_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_32;
         double * RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30;
         double * RESTRICT _data_pdfs_20_31_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_31;
         double * RESTRICT _data_pdfs_20_32_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_32;
         double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33;
         double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34;
         double * RESTRICT _data_pdfs_20_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_35;
         double * RESTRICT _data_pdfs_20_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_36;
         double * RESTRICT _data_pdfs_20_37_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_37;
         double * RESTRICT _data_pdfs_20_38_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_38;
         double * RESTRICT _data_pdfs_20_39_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_39;
         double * RESTRICT _data_pdfs_20_310_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_310;
         double * RESTRICT _data_pdfs_20_311_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_311;
         double * RESTRICT _data_pdfs_20_312_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_312;
         double * RESTRICT _data_pdfs_20_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_313;
         double * RESTRICT _data_pdfs_20_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_314;
         double * RESTRICT _data_pdfs_20_315_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_315;
         double * RESTRICT _data_pdfs_20_316_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_316;
         double * RESTRICT _data_pdfs_20_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_317;
         double * RESTRICT _data_pdfs_20_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_318;
         double * RESTRICT _data_pdfs_20_319_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_319;
         double * RESTRICT _data_pdfs_20_320_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_320;
         double * RESTRICT _data_pdfs_20_321_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_321;
         double * RESTRICT _data_pdfs_20_322_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_322;
         double * RESTRICT _data_pdfs_20_323_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_323;
         double * RESTRICT _data_pdfs_20_324_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_324;
         double * RESTRICT _data_pdfs_20_325_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_325;
         double * RESTRICT _data_pdfs_20_326_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_326;
         for (int64_t ctr_0 = 0; ctr_0 < _size_density_0; ctr_0 += 1)
         {
            // Read macroscopic quantities for this cell; delta_rho is the
            // deviation from the unit background density (zero-centered PDFs).
            const double rho = _data_density_20_30_10[_stride_density_0*ctr_0];
            const double delta_rho = rho - 1.0;
            const double u_0 = _data_velocity_20_30_10[_stride_velocity_0*ctr_0];
            const double u_1 = _data_velocity_20_31_10[_stride_velocity_0*ctr_0];
            const double u_2 = _data_velocity_20_32_10[_stride_velocity_0*ctr_0];
            // f0: rest direction (weight 8/27).
            _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] = delta_rho*0.29629629629629628 - 0.44444444444444442*(u_0*u_0) - 0.44444444444444442*(u_1*u_1) - 0.44444444444444442*(u_2*u_2);
            // f1..f6: axis-aligned directions (weight 2/27).
            _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0] = delta_rho*0.07407407407407407 + u_1*0.22222222222222221 - 0.1111111111111111*(u_0*u_0) - 0.1111111111111111*(u_2*u_2) + 0.22222222222222221*(u_1*u_1);
            _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0] = delta_rho*0.07407407407407407 + u_1*-0.22222222222222221 - 0.1111111111111111*(u_0*u_0) - 0.1111111111111111*(u_2*u_2) + 0.22222222222222221*(u_1*u_1);
            _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0] = delta_rho*0.07407407407407407 + u_0*-0.22222222222222221 - 0.1111111111111111*(u_1*u_1) - 0.1111111111111111*(u_2*u_2) + 0.22222222222222221*(u_0*u_0);
            _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0] = delta_rho*0.07407407407407407 + u_0*0.22222222222222221 - 0.1111111111111111*(u_1*u_1) - 0.1111111111111111*(u_2*u_2) + 0.22222222222222221*(u_0*u_0);
            _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0] = delta_rho*0.07407407407407407 + u_2*0.22222222222222221 - 0.1111111111111111*(u_0*u_0) - 0.1111111111111111*(u_1*u_1) + 0.22222222222222221*(u_2*u_2);
            _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0] = delta_rho*0.07407407407407407 + u_2*-0.22222222222222221 - 0.1111111111111111*(u_0*u_0) - 0.1111111111111111*(u_1*u_1) + 0.22222222222222221*(u_2*u_2);
            // f7..f18: face-diagonal directions (weight 1/54).
            _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0] = delta_rho*0.018518518518518517 + u_0*u_1*-0.16666666666666666 + u_0*-0.055555555555555552 + u_1*0.055555555555555552 - 0.027777777777777776*(u_2*u_2) + 0.055555555555555552*(u_0*u_0) + 0.055555555555555552*(u_1*u_1);
            _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0] = delta_rho*0.018518518518518517 + u_0*u_1*0.16666666666666666 + u_0*0.055555555555555552 + u_1*0.055555555555555552 - 0.027777777777777776*(u_2*u_2) + 0.055555555555555552*(u_0*u_0) + 0.055555555555555552*(u_1*u_1);
            _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0] = delta_rho*0.018518518518518517 + u_0*u_1*0.16666666666666666 + u_0*-0.055555555555555552 + u_1*-0.055555555555555552 - 0.027777777777777776*(u_2*u_2) + 0.055555555555555552*(u_0*u_0) + 0.055555555555555552*(u_1*u_1);
            _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0] = delta_rho*0.018518518518518517 + u_0*u_1*-0.16666666666666666 + u_0*0.055555555555555552 + u_1*-0.055555555555555552 - 0.027777777777777776*(u_2*u_2) + 0.055555555555555552*(u_0*u_0) + 0.055555555555555552*(u_1*u_1);
            _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0] = delta_rho*0.018518518518518517 + u_1*u_2*0.16666666666666666 + u_1*0.055555555555555552 + u_2*0.055555555555555552 - 0.027777777777777776*(u_0*u_0) + 0.055555555555555552*(u_1*u_1) + 0.055555555555555552*(u_2*u_2);
            _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0] = delta_rho*0.018518518518518517 + u_1*u_2*-0.16666666666666666 + u_1*-0.055555555555555552 + u_2*0.055555555555555552 - 0.027777777777777776*(u_0*u_0) + 0.055555555555555552*(u_1*u_1) + 0.055555555555555552*(u_2*u_2);
            _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0] = delta_rho*0.018518518518518517 + u_0*u_2*-0.16666666666666666 + u_0*-0.055555555555555552 + u_2*0.055555555555555552 - 0.027777777777777776*(u_1*u_1) + 0.055555555555555552*(u_0*u_0) + 0.055555555555555552*(u_2*u_2);
            _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0] = delta_rho*0.018518518518518517 + u_0*u_2*0.16666666666666666 + u_0*0.055555555555555552 + u_2*0.055555555555555552 - 0.027777777777777776*(u_1*u_1) + 0.055555555555555552*(u_0*u_0) + 0.055555555555555552*(u_2*u_2);
            _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0] = delta_rho*0.018518518518518517 + u_1*u_2*-0.16666666666666666 + u_1*0.055555555555555552 + u_2*-0.055555555555555552 - 0.027777777777777776*(u_0*u_0) + 0.055555555555555552*(u_1*u_1) + 0.055555555555555552*(u_2*u_2);
            _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] = delta_rho*0.018518518518518517 + u_1*u_2*0.16666666666666666 + u_1*-0.055555555555555552 + u_2*-0.055555555555555552 - 0.027777777777777776*(u_0*u_0) + 0.055555555555555552*(u_1*u_1) + 0.055555555555555552*(u_2*u_2);
            _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] = delta_rho*0.018518518518518517 + u_0*u_2*0.16666666666666666 + u_0*-0.055555555555555552 + u_2*-0.055555555555555552 - 0.027777777777777776*(u_1*u_1) + 0.055555555555555552*(u_0*u_0) + 0.055555555555555552*(u_2*u_2);
            _data_pdfs_20_318_10[_stride_pdfs_0*ctr_0] = delta_rho*0.018518518518518517 + u_0*u_2*-0.16666666666666666 + u_0*0.055555555555555552 + u_2*-0.055555555555555552 - 0.027777777777777776*(u_1*u_1) + 0.055555555555555552*(u_0*u_0) + 0.055555555555555552*(u_2*u_2);
            // f19..f26: space-diagonal directions (weight 1/216).
            _data_pdfs_20_319_10[_stride_pdfs_0*ctr_0] = delta_rho*0.0046296296296296294 + u_0*u_1*0.041666666666666664 + u_0*u_2*0.041666666666666664 + u_0*0.013888888888888888 + u_1*u_2*0.041666666666666664 + u_1*0.013888888888888888 + u_2*0.013888888888888888 + 0.013888888888888888*(u_0*u_0) + 0.013888888888888888*(u_1*u_1) + 0.013888888888888888*(u_2*u_2);
            _data_pdfs_20_320_10[_stride_pdfs_0*ctr_0] = delta_rho*0.0046296296296296294 + u_0*u_1*-0.041666666666666664 + u_0*u_2*-0.041666666666666664 + u_0*-0.013888888888888888 + u_1*u_2*0.041666666666666664 + u_1*0.013888888888888888 + u_2*0.013888888888888888 + 0.013888888888888888*(u_0*u_0) + 0.013888888888888888*(u_1*u_1) + 0.013888888888888888*(u_2*u_2);
            _data_pdfs_20_321_10[_stride_pdfs_0*ctr_0] = delta_rho*0.0046296296296296294 + u_0*u_1*-0.041666666666666664 + u_0*u_2*0.041666666666666664 + u_0*0.013888888888888888 + u_1*u_2*-0.041666666666666664 + u_1*-0.013888888888888888 + u_2*0.013888888888888888 + 0.013888888888888888*(u_0*u_0) + 0.013888888888888888*(u_1*u_1) + 0.013888888888888888*(u_2*u_2);
            _data_pdfs_20_322_10[_stride_pdfs_0*ctr_0] = delta_rho*0.0046296296296296294 + u_0*u_1*0.041666666666666664 + u_0*u_2*-0.041666666666666664 + u_0*-0.013888888888888888 + u_1*u_2*-0.041666666666666664 + u_1*-0.013888888888888888 + u_2*0.013888888888888888 + 0.013888888888888888*(u_0*u_0) + 0.013888888888888888*(u_1*u_1) + 0.013888888888888888*(u_2*u_2);
            _data_pdfs_20_323_10[_stride_pdfs_0*ctr_0] = delta_rho*0.0046296296296296294 + u_0*u_1*0.041666666666666664 + u_0*u_2*-0.041666666666666664 + u_0*0.013888888888888888 + u_1*u_2*-0.041666666666666664 + u_1*0.013888888888888888 + u_2*-0.013888888888888888 + 0.013888888888888888*(u_0*u_0) + 0.013888888888888888*(u_1*u_1) + 0.013888888888888888*(u_2*u_2);
            _data_pdfs_20_324_10[_stride_pdfs_0*ctr_0] = delta_rho*0.0046296296296296294 + u_0*u_1*-0.041666666666666664 + u_0*u_2*0.041666666666666664 + u_0*-0.013888888888888888 + u_1*u_2*-0.041666666666666664 + u_1*0.013888888888888888 + u_2*-0.013888888888888888 + 0.013888888888888888*(u_0*u_0) + 0.013888888888888888*(u_1*u_1) + 0.013888888888888888*(u_2*u_2);
            _data_pdfs_20_325_10[_stride_pdfs_0*ctr_0] = delta_rho*0.0046296296296296294 + u_0*u_1*-0.041666666666666664 + u_0*u_2*-0.041666666666666664 + u_0*0.013888888888888888 + u_1*u_2*0.041666666666666664 + u_1*-0.013888888888888888 + u_2*-0.013888888888888888 + 0.013888888888888888*(u_0*u_0) + 0.013888888888888888*(u_1*u_1) + 0.013888888888888888*(u_2*u_2);
            _data_pdfs_20_326_10[_stride_pdfs_0*ctr_0] = delta_rho*0.0046296296296296294 + u_0*u_1*0.041666666666666664 + u_0*u_2*0.041666666666666664 + u_0*-0.013888888888888888 + u_1*u_2*0.041666666666666664 + u_1*-0.013888888888888888 + u_2*-0.013888888888888888 + 0.013888888888888888*(u_0*u_0) + 0.013888888888888888*(u_1*u_1) + 0.013888888888888888*(u_2*u_2);
         }
      }
   }
}
}


namespace internal_d3q27srt_kernel_getter {
// Auto-generated (pystencils/lbmpy) D3Q27 macroscopic-quantity getter.
//
// Reconstructs density and velocity from the PDF field:
//   - delta_rho is the sum of all 27 (zero-centered) PDF values, and the stored
//     density is rho = delta_rho + 1.0 (matches the initialise kernel's
//     delta_rho = rho - 1.0 convention).
//   - momdensity_i sums PDFs with positive lattice velocity in direction i and
//     subtracts those with negative component; partial sums (vel0Term..vel2Term)
//     are shared between the density and momentum reductions.
//   - The momentum density is written directly as the velocity components
//     (no division by rho in this generated model).
// NOTE(review): generated code — keep the floating-point summation order intact.
static FUNC_PREFIX void d3q27srt_kernel_getter(double * RESTRICT _data_density, double * RESTRICT const _data_pdfs, double * RESTRICT _data_velocity, int64_t const _size_density_0, int64_t const _size_density_1, int64_t const _size_density_2, int64_t const _stride_density_0, int64_t const _stride_density_1, int64_t const _stride_density_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_velocity_0, int64_t const _stride_velocity_1, int64_t const _stride_velocity_2, int64_t const _stride_velocity_3)
{
   for (int64_t ctr_2 = 0; ctr_2 < _size_density_2; ctr_2 += 1)
   {
      // Base pointers for the current z-slab, one per PDF direction
      // (ordered by first use in the reduction below) plus the output fields.
      double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_319 = _data_pdfs + _stride_pdfs_2*ctr_2 + 19*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_321 = _data_pdfs + _stride_pdfs_2*ctr_2 + 21*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_323 = _data_pdfs + _stride_pdfs_2*ctr_2 + 23*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_325 = _data_pdfs + _stride_pdfs_2*ctr_2 + 25*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_320 = _data_pdfs + _stride_pdfs_2*ctr_2 + 20*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_322 = _data_pdfs + _stride_pdfs_2*ctr_2 + 22*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_324 = _data_pdfs + _stride_pdfs_2*ctr_2 + 24*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_326 = _data_pdfs + _stride_pdfs_2*ctr_2 + 26*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2;
      double * RESTRICT _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3;
      double * RESTRICT _data_density_20_30 = _data_density + _stride_density_2*ctr_2;
      double * RESTRICT _data_velocity_20_30 = _data_velocity + _stride_velocity_2*ctr_2;
      double * RESTRICT _data_velocity_20_31 = _data_velocity + _stride_velocity_2*ctr_2 + _stride_velocity_3;
      double * RESTRICT _data_velocity_20_32 = _data_velocity + _stride_velocity_2*ctr_2 + 2*_stride_velocity_3;
      for (int64_t ctr_1 = 0; ctr_1 < _size_density_1; ctr_1 += 1)
      {
         // Row base pointers for the current y-line within the slab.
         double * RESTRICT _data_pdfs_20_310_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_310;
         double * RESTRICT _data_pdfs_20_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_314;
         double * RESTRICT _data_pdfs_20_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_318;
         double * RESTRICT _data_pdfs_20_319_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_319;
         double * RESTRICT _data_pdfs_20_321_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_321;
         double * RESTRICT _data_pdfs_20_323_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_323;
         double * RESTRICT _data_pdfs_20_325_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_325;
         double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34;
         double * RESTRICT _data_pdfs_20_38_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_38;
         double * RESTRICT _data_pdfs_20_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_313;
         double * RESTRICT _data_pdfs_20_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_317;
         double * RESTRICT _data_pdfs_20_320_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_320;
         double * RESTRICT _data_pdfs_20_322_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_322;
         double * RESTRICT _data_pdfs_20_324_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_324;
         double * RESTRICT _data_pdfs_20_326_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_326;
         double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33;
         double * RESTRICT _data_pdfs_20_37_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_37;
         double * RESTRICT _data_pdfs_20_39_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_39;
         double * RESTRICT _data_pdfs_20_31_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_31;
         double * RESTRICT _data_pdfs_20_311_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_311;
         double * RESTRICT _data_pdfs_20_315_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_315;
         double * RESTRICT _data_pdfs_20_312_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_312;
         double * RESTRICT _data_pdfs_20_316_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_316;
         double * RESTRICT _data_pdfs_20_32_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_32;
         double * RESTRICT _data_pdfs_20_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_35;
         double * RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30;
         double * RESTRICT _data_pdfs_20_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_36;
         double * RESTRICT _data_density_20_30_10 = _stride_density_1*ctr_1 + _data_density_20_30;
         double * RESTRICT _data_velocity_20_30_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_30;
         double * RESTRICT _data_velocity_20_31_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_31;
         double * RESTRICT _data_velocity_20_32_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_32;
         for (int64_t ctr_0 = 0; ctr_0 < _size_density_0; ctr_0 += 1)
         {
            // Partial sum over directions with positive x-component; reused for
            // both the x-momentum and the density reduction.
            const double vel0Term = _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_318_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_319_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_321_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_323_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_325_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0];
            const double momdensity_0 = vel0Term - 1.0*_data_pdfs_20_313_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_320_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_322_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_324_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_326_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_33_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_37_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_39_10[_stride_pdfs_0*ctr_0];
            // Directions with positive y-component not already counted above.
            const double vel1Term = _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_320_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_324_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0];
            const double momdensity_1 = vel1Term - 1.0*_data_pdfs_20_310_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_312_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_321_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_322_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_325_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_326_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_32_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_39_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_319_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_323_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0];
            // Directions with positive z-component not already counted above.
            const double vel2Term = _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_322_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0];
            // delta_rho = sum of all 27 PDFs (zero-centered storage).
            const double delta_rho = vel0Term + vel1Term + vel2Term + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_326_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0];
            const double momdensity_2 = vel2Term - 1.0*_data_pdfs_20_315_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_318_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_323_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_324_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_325_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_326_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_36_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_319_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_320_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_321_10[_stride_pdfs_0*ctr_0];
            // Undo the zero-centering and store the outputs.
            const double rho = delta_rho + 1.0;
            const double u_0 = momdensity_0;
            const double u_1 = momdensity_1;
            const double u_2 = momdensity_2;
            _data_density_20_30_10[_stride_density_0*ctr_0] = rho;
            _data_velocity_20_30_10[_stride_velocity_0*ctr_0] = u_0;
            _data_velocity_20_31_10[_stride_velocity_0*ctr_0] = u_1;
            _data_velocity_20_32_10[_stride_velocity_0*ctr_0] = u_2;
         }
      }
   }
}
}


/* NOTE(review): pystencils-generated D3Q27-SRT sweep wrappers, embedded in a git patch (the '+' tokens are diff add-markers, not code). Each wrapper derives raw data pointers via dataAt(), sizes (field size + 2*ghost_layers, or the CellInterval extent), and x/y/z/f strides from the GhostLayerField arguments, then forwards them to the matching internal_* generated kernel. Do not hand-edit the logic — regenerate from the code-generation script; comments below only annotate. */
+ + + /* Fused stream+collide sweep over the whole field widened by 'ghost_layers' ghost cells; relaxation rate 'omega' is forwarded to the kernel. */ +void D3Q27SRT::streamCollide( field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, double omega, const cell_idx_t ghost_layers ) +{ + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs_tmp->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_0 = int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers)) + const int64_t _size_pdfs_1 = int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_2 = int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride()); + const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride()); + const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride()); + const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride())); + internal_d3q27srt_kernel_streamCollide::d3q27srt_kernel_streamCollide(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3, omega); +} + /* Same as streamCollide, restricted to the cell interval 'ci'. */ +void D3Q27SRT::streamCollideCellInterval( 
field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, double omega, const CellInterval & ci) +{ + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride()); + const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride()); + const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride()); + const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride())); + internal_d3q27srt_kernel_streamCollide::d3q27srt_kernel_streamCollide(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, 
_stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3, omega); +} + /* Collide-only step, applied in place on 'pdfs'. */ +void D3Q27SRT::collide( field::GhostLayerField<double, 27> * pdfs, double omega, const cell_idx_t ghost_layers ) +{ + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_0 = int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers)) + const int64_t _size_pdfs_1 = int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_2 = int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + internal_d3q27srt_kernel_collide::d3q27srt_kernel_collide(_data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, omega); +} +void D3Q27SRT::collideCellInterval( field::GhostLayerField<double, 27> * pdfs, double omega, const CellInterval & ci) +{ + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_0 = int64_t(int64_c(ci.xSize()) + 0); + 
WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + internal_d3q27srt_kernel_collide::d3q27srt_kernel_collide(_data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, omega); +} + /* Stream-only step: reads 'pdfs', writes 'pdfs_tmp'. */ +void D3Q27SRT::stream( field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, const cell_idx_t ghost_layers ) +{ + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs_tmp->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_0 = int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers)) + const int64_t _size_pdfs_1 = int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_2 = int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 
= int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride()); + const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride()); + const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride()); + const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride())); + internal_d3q27srt_kernel_stream::d3q27srt_kernel_stream(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3); +} +void D3Q27SRT::streamCellInterval( field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, const CellInterval & ci) +{ + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + 
const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride()); + const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride()); + const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride()); + const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride())); + internal_d3q27srt_kernel_stream::d3q27srt_kernel_stream(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3); +} + /* Stream into 'pdfs_tmp' without any field-pointer swap in this wrapper — presumably the caller manages advancement; TODO confirm against the generated streamOnlyNoAdvancement kernel. */ +void D3Q27SRT::streamOnlyNoAdvancement( field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, const cell_idx_t ghost_layers ) +{ + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs_tmp->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_0 = int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers)) + const int64_t _size_pdfs_1 = int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_2 = int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t 
_stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride()); + const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride()); + const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride()); + const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride())); + internal_d3q27srt_kernel_streamOnlyNoAdvancement::d3q27srt_kernel_streamOnlyNoAdvancement(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3); +} +void D3Q27SRT::streamOnlyNoAdvancementCellInterval( field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, const CellInterval & ci) +{ + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_2 = int64_t(int64_c(ci.zSize()) 
+ 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride()); + const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride()); + const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride()); + const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride())); + internal_d3q27srt_kernel_streamOnlyNoAdvancement::d3q27srt_kernel_streamOnlyNoAdvancement(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3); +} + /* Initialise 'pdfs' from the given density and velocity fields via the generated init kernel. */ +void D3Q27SRT::initialise( field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 3> * velocity, const cell_idx_t ghost_layers ) +{ + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(density->nrOfGhostLayers())) + double * RESTRICT const _data_density = density->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(velocity->nrOfGhostLayers())) + double * RESTRICT const _data_velocity = velocity->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->xSizeWithGhostLayer(), int64_t(int64_c(density->xSize()) + 2*ghost_layers)) + const int64_t _size_density_0 = int64_t(int64_c(density->xSize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(density->ySizeWithGhostLayer(), int64_t(int64_c(density->ySize()) + 2*ghost_layers)) + const int64_t _size_density_1 = int64_t(int64_c(density->ySize()) + 
2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(density->zSizeWithGhostLayer(), int64_t(int64_c(density->zSize()) + 2*ghost_layers)) + const int64_t _size_density_2 = int64_t(int64_c(density->zSize()) + 2*ghost_layers); + const int64_t _stride_density_0 = int64_t(density->xStride()); + const int64_t _stride_density_1 = int64_t(density->yStride()); + const int64_t _stride_density_2 = int64_t(density->zStride()); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_velocity_0 = int64_t(velocity->xStride()); + const int64_t _stride_velocity_1 = int64_t(velocity->yStride()); + const int64_t _stride_velocity_2 = int64_t(velocity->zStride()); + const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride())); + internal_d3q27srt_kernel_initialise::d3q27srt_kernel_initialise(_data_density, _data_pdfs, _data_velocity, _size_density_0, _size_density_1, _size_density_2, _stride_density_0, _stride_density_1, _stride_density_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3); +} +void D3Q27SRT::initialiseCellInterval( field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 3> * velocity, const CellInterval & ci) +{ + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(density->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(density->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(density->nrOfGhostLayers())) + double * RESTRICT const _data_density = density->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())) + 
WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(velocity->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(velocity->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(velocity->nrOfGhostLayers())) + double * RESTRICT const _data_velocity = velocity->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_density_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_density_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_density_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_density_0 = int64_t(density->xStride()); + const int64_t _stride_density_1 = int64_t(density->yStride()); + const int64_t _stride_density_2 = int64_t(density->zStride()); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_velocity_0 = int64_t(velocity->xStride()); + const int64_t _stride_velocity_1 = int64_t(velocity->yStride()); + const int64_t _stride_velocity_2 = int64_t(velocity->zStride()); + const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride())); + internal_d3q27srt_kernel_initialise::d3q27srt_kernel_initialise(_data_density, _data_pdfs, _data_velocity, _size_density_0, _size_density_1, _size_density_2, _stride_density_0, _stride_density_1, _stride_density_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, 
_stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3); +} + /* Compute macroscopic density and velocity from 'pdfs' (the generated getter kernel sums the 27 pdf entries per cell). */ +void D3Q27SRT::calculateMacroscopicParameters( field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 3> * velocity, const cell_idx_t ghost_layers ) +{ + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(density->nrOfGhostLayers())) + double * RESTRICT _data_density = density->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(velocity->nrOfGhostLayers())) + double * RESTRICT _data_velocity = velocity->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->xSizeWithGhostLayer(), int64_t(int64_c(density->xSize()) + 2*ghost_layers)) + const int64_t _size_density_0 = int64_t(int64_c(density->xSize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(density->ySizeWithGhostLayer(), int64_t(int64_c(density->ySize()) + 2*ghost_layers)) + const int64_t _size_density_1 = int64_t(int64_c(density->ySize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(density->zSizeWithGhostLayer(), int64_t(int64_c(density->zSize()) + 2*ghost_layers)) + const int64_t _size_density_2 = int64_t(int64_c(density->zSize()) + 2*ghost_layers); + const int64_t _stride_density_0 = int64_t(density->xStride()); + const int64_t _stride_density_1 = int64_t(density->yStride()); + const int64_t _stride_density_2 = int64_t(density->zStride()); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_velocity_0 = int64_t(velocity->xStride()); + const int64_t 
_stride_velocity_1 = int64_t(velocity->yStride()); + const int64_t _stride_velocity_2 = int64_t(velocity->zStride()); + const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride())); + internal_d3q27srt_kernel_getter::d3q27srt_kernel_getter(_data_density, _data_pdfs, _data_velocity, _size_density_0, _size_density_1, _size_density_2, _stride_density_0, _stride_density_1, _stride_density_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3); +} +void D3Q27SRT::calculateMacroscopicParametersCellInterval( field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 3> * velocity, const CellInterval & ci) +{ + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(density->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(density->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(density->nrOfGhostLayers())) + double * RESTRICT _data_density = density->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(velocity->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(velocity->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(velocity->nrOfGhostLayers())) + double * RESTRICT _data_velocity = velocity->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_density_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->ySizeWithGhostLayer(), 
int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_density_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_density_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_density_0 = int64_t(density->xStride()); + const int64_t _stride_density_1 = int64_t(density->yStride()); + const int64_t _stride_density_2 = int64_t(density->zStride()); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_velocity_0 = int64_t(velocity->xStride()); + const int64_t _stride_velocity_1 = int64_t(velocity->yStride()); + const int64_t _stride_velocity_2 = int64_t(velocity->zStride()); + const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride())); + internal_d3q27srt_kernel_getter::d3q27srt_kernel_getter(_data_density, _data_pdfs, _data_velocity, _size_density_0, _size_density_1, _size_density_2, _stride_density_0, _stride_density_1, _stride_density_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3); +} + + + +} // namespace lbm +} // namespace walberla + + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic pop +#endif + +#if ( defined WALBERLA_CXX_COMPILER_IS_INTEL ) +#pragma warning pop +#endif \ No newline at end of file diff --git a/src/lbm_generated/sweep_collection/D3Q27SRT.h b/src/lbm_generated/sweep_collection/D3Q27SRT.h new file mode 100644 index 0000000000000000000000000000000000000000..eb45b71660fbf902d16cd064e2f09dadf24548d7 --- /dev/null +++ b/src/lbm_generated/sweep_collection/D3Q27SRT.h @@ -0,0 +1,1131 @@ 
+//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file D3Q27SRT.h +//! \\author pystencils +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" +#include "core/logging/Logging.h" +#include "core/Macros.h" + + + +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "domain_decomposition/StructuredBlockStorage.h" + +#include "field/SwapableCompare.h" +#include "field/GhostLayerField.h" + +#include <set> +#include <cmath> + + + +using namespace std::placeholders; + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-parameter" +# pragma GCC diagnostic ignored "-Wreorder" +#endif + +namespace walberla { +namespace lbm { + + +class D3Q27SRT +{ +public: + enum Type { ALL = 0, INNER = 1, OUTER = 2 }; + + D3Q27SRT(const shared_ptr< StructuredBlockStorage > & blocks, BlockDataID pdfsID_, BlockDataID densityID_, 
BlockDataID velocityID_, double omega, const Cell & outerWidth=Cell(1, 1, 1)) + : blocks_(blocks), pdfsID(pdfsID_), densityID(densityID_), velocityID(velocityID_), omega_(omega), outerWidth_(outerWidth) + { + + + for (auto& iBlock : *blocks) + { + if (int_c(blocks->getNumberOfXCells(iBlock)) <= outerWidth_[0] * 2 || + int_c(blocks->getNumberOfYCells(iBlock)) <= outerWidth_[1] * 2 || + int_c(blocks->getNumberOfZCells(iBlock)) <= outerWidth_[2] * 2) + WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock") + } + }; + + + ~D3Q27SRT() { + for(auto p: cache_pdfs_) { + delete p; + } + } + + + /************************************************************************************* + * Internal Function Definitions with raw Pointer + *************************************************************************************/ + static void streamCollide (field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, double omega, const cell_idx_t ghost_layers = 0); + static void streamCollideCellInterval (field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, double omega, const CellInterval & ci); + + static void collide (field::GhostLayerField<double, 27> * pdfs, double omega, const cell_idx_t ghost_layers = 0); + static void collideCellInterval (field::GhostLayerField<double, 27> * pdfs, double omega, const CellInterval & ci); + + static void stream (field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, const cell_idx_t ghost_layers = 0); + static void streamCellInterval (field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, const CellInterval & ci); + + static void streamOnlyNoAdvancement (field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, const cell_idx_t ghost_layers = 0); + static void streamOnlyNoAdvancementCellInterval (field::GhostLayerField<double, 27> * 
pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, const CellInterval & ci); + + static void initialise (field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 3> * velocity, const cell_idx_t ghost_layers = 0); + static void initialiseCellInterval (field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 3> * velocity, const CellInterval & ci); + + static void calculateMacroscopicParameters (field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 3> * velocity, const cell_idx_t ghost_layers = 0); + static void calculateMacroscopicParametersCellInterval (field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 3> * velocity, const CellInterval & ci); + + + /************************************************************************************* + * Function Definitions for external Usage + *************************************************************************************/ + + std::function<void (IBlock *)> streamCollide() + { + return [this](IBlock* block) { streamCollide(block); }; + } + + std::function<void (IBlock *)> streamCollide(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamCollideInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamCollideOuter(block); }; + default: + return [this](IBlock* block) { streamCollide(block); }; + } + } + + std::function<void (IBlock *)> streamCollide(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamCollideInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamCollideOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { streamCollide(block, ghost_layers); }; + } + } + + + + void 
streamCollide(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + auto & omega = this->omega_; + + streamCollide(pdfs, pdfs_tmp, omega, ghost_layers); + pdfs->swapDataPointers(pdfs_tmp); + + } + + void streamCollide(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + auto & omega = this->omega_; + + streamCollide(pdfs, pdfs_tmp, omega, ghost_layers); + pdfs->swapDataPointers(pdfs_tmp); + + } + + + + void streamCollideCellInterval(IBlock * block, const CellInterval & ci) + { + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + auto & omega = this->omega_; + + streamCollideCellInterval(pdfs, pdfs_tmp, omega, ci); + pdfs->swapDataPointers(pdfs_tmp); + + } + + void streamCollideInner(IBlock * block) + { + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = 
*it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + auto & omega = this->omega_; + + + CellInterval inner = pdfs->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + streamCollideCellInterval(pdfs, pdfs_tmp, omega, inner); + } + + void streamCollideOuter(IBlock * block) + { + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + auto & omega = this->omega_; + + + if( layers_.empty() ) + { + CellInterval ci; + + pdfs->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + streamCollideCellInterval(pdfs, pdfs_tmp, omega, ci); + } + + + pdfs->swapDataPointers(pdfs_tmp); + + } + + + std::function<void (IBlock *)> collide() + { + return [this](IBlock* block) { collide(block); }; + } + + std::function<void (IBlock *)> collide(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { collideInner(block); 
}; + case Type::OUTER: + return [this](IBlock* block) { collideOuter(block); }; + default: + return [this](IBlock* block) { collide(block); }; + } + } + + std::function<void (IBlock *)> collide(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { collideInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { collideOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { collide(block, ghost_layers); }; + } + } + + + + void collide(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + + auto & omega = this->omega_; + + collide(pdfs, omega, ghost_layers); + + } + + void collide(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + + auto & omega = this->omega_; + + collide(pdfs, omega, ghost_layers); + + } + + + + void collideCellInterval(IBlock * block, const CellInterval & ci) + { + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + + auto & omega = this->omega_; + + collideCellInterval(pdfs, omega, ci); + + } + + void collideInner(IBlock * block) + { + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + + auto & omega = this->omega_; + + + CellInterval inner = pdfs->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + collideCellInterval(pdfs, omega, inner); + } + + void collideOuter(IBlock * block) + { + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + + auto & omega = this->omega_; + + + if( layers_.empty() ) + { + CellInterval ci; + + pdfs->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], 
false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + collideCellInterval(pdfs, omega, ci); + } + + + + } + + + std::function<void (IBlock *)> stream() + { + return [this](IBlock* block) { stream(block); }; + } + + std::function<void (IBlock *)> stream(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamOuter(block); }; + default: + return [this](IBlock* block) { stream(block); }; + } + } + + std::function<void (IBlock *)> stream(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { stream(block, ghost_layers); }; + } + } + + + + void stream(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + stream(pdfs, pdfs_tmp, ghost_layers); + pdfs->swapDataPointers(pdfs_tmp); + + } + + void stream(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto pdfs = 
block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + stream(pdfs, pdfs_tmp, ghost_layers); + pdfs->swapDataPointers(pdfs_tmp); + + } + + + + void streamCellInterval(IBlock * block, const CellInterval & ci) + { + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + streamCellInterval(pdfs, pdfs_tmp, ci); + pdfs->swapDataPointers(pdfs_tmp); + + } + + void streamInner(IBlock * block) + { + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + + CellInterval inner = pdfs->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + streamCellInterval(pdfs, pdfs_tmp, inner); + } + + void streamOuter(IBlock * block) + { + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + + if( layers_.empty() ) + { + CellInterval ci; + + 
pdfs->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + streamCellInterval(pdfs, pdfs_tmp, ci); + } + + + pdfs->swapDataPointers(pdfs_tmp); + + } + + + std::function<void (IBlock *)> streamOnlyNoAdvancement() + { + return [this](IBlock* block) { streamOnlyNoAdvancement(block); }; + } + + std::function<void (IBlock *)> streamOnlyNoAdvancement(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamOnlyNoAdvancementInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamOnlyNoAdvancementOuter(block); }; + default: + return [this](IBlock* block) { streamOnlyNoAdvancement(block); }; + } + } + + std::function<void (IBlock *)> streamOnlyNoAdvancement(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamOnlyNoAdvancementInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamOnlyNoAdvancementOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { streamOnlyNoAdvancement(block, ghost_layers); }; + } + } + + + + void streamOnlyNoAdvancement(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto pdfs = block->getData< 
field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + streamOnlyNoAdvancement(pdfs, pdfs_tmp, ghost_layers); + + } + + void streamOnlyNoAdvancement(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + streamOnlyNoAdvancement(pdfs, pdfs_tmp, ghost_layers); + + } + + + + void streamOnlyNoAdvancementCellInterval(IBlock * block, const CellInterval & ci) + { + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + streamOnlyNoAdvancementCellInterval(pdfs, pdfs_tmp, ci); + + } + + void streamOnlyNoAdvancementInner(IBlock * block) + { + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + + CellInterval inner = pdfs->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + 
streamOnlyNoAdvancementCellInterval(pdfs, pdfs_tmp, inner); + } + + void streamOnlyNoAdvancementOuter(IBlock * block) + { + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + + if( layers_.empty() ) + { + CellInterval ci; + + pdfs->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + streamOnlyNoAdvancementCellInterval(pdfs, pdfs_tmp, ci); + } + + + + } + + + std::function<void (IBlock *)> initialise() + { + return [this](IBlock* block) { initialise(block); }; + } + + std::function<void (IBlock *)> initialise(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { initialiseInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { initialiseOuter(block); }; + default: + return [this](IBlock* block) { initialise(block); }; + } + } + + std::function<void (IBlock *)> initialise(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + 
return [this](IBlock* block) { initialiseInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { initialiseOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { initialise(block, ghost_layers); }; + } + } + + + + void initialise(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + initialise(density, pdfs, velocity, ghost_layers); + + } + + void initialise(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + initialise(density, pdfs, velocity, ghost_layers); + + } + + + + void initialiseCellInterval(IBlock * block, const CellInterval & ci) + { + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + initialiseCellInterval(density, pdfs, velocity, ci); + + } + + void initialiseInner(IBlock * block) + { + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + + CellInterval inner = density->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + initialiseCellInterval(density, pdfs, velocity, inner); + } + + void initialiseOuter(IBlock * block) + { + + auto velocity = block->getData< field::GhostLayerField<double, 3> 
>(velocityID); + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + + if( layers_.empty() ) + { + CellInterval ci; + + density->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + density->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + density->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + density->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + density->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + density->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + initialiseCellInterval(density, pdfs, velocity, ci); + } + + + + } + + + std::function<void (IBlock *)> calculateMacroscopicParameters() + { + return [this](IBlock* block) { calculateMacroscopicParameters(block); }; + } + + std::function<void (IBlock *)> calculateMacroscopicParameters(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { calculateMacroscopicParametersInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { calculateMacroscopicParametersOuter(block); }; + default: + return [this](IBlock* block) { calculateMacroscopicParameters(block); }; + } + } + + std::function<void (IBlock *)> calculateMacroscopicParameters(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { calculateMacroscopicParametersInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { 
calculateMacroscopicParametersOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { calculateMacroscopicParameters(block, ghost_layers); }; + } + } + + + + void calculateMacroscopicParameters(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + calculateMacroscopicParameters(density, pdfs, velocity, ghost_layers); + + } + + void calculateMacroscopicParameters(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + calculateMacroscopicParameters(density, pdfs, velocity, ghost_layers); + + } + + + + void calculateMacroscopicParametersCellInterval(IBlock * block, const CellInterval & ci) + { + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + calculateMacroscopicParametersCellInterval(density, pdfs, velocity, ci); + + } + + void calculateMacroscopicParametersInner(IBlock * block) + { + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + + CellInterval inner = density->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + calculateMacroscopicParametersCellInterval(density, pdfs, velocity, inner); + } + + void 
calculateMacroscopicParametersOuter(IBlock * block) + { + + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + + if( layers_.empty() ) + { + CellInterval ci; + + density->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + density->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + density->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + density->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + density->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + density->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + calculateMacroscopicParametersCellInterval(density, pdfs, velocity, ci); + } + + + + } + + + + + private: + shared_ptr< StructuredBlockStorage > blocks_; + BlockDataID pdfsID; + BlockDataID densityID; + BlockDataID velocityID; + double omega_; + + private: std::set< field::GhostLayerField<double, 27> *, field::SwapableCompare< field::GhostLayerField<double, 27> * > > cache_pdfs_; + + Cell outerWidth_; + std::vector<CellInterval> layers_; + + +}; + + +} // namespace lbm +} // namespace walberla + + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic pop +#endif \ No newline at end of file diff --git a/src/lbm_generated/sweep_collection/sweep_collection_generation_script.py 
b/src/lbm_generated/sweep_collection/sweep_collection_generation_script.py new file mode 100644 index 0000000000000000000000000000000000000000..bdc208608f08bf202d361b5c369d48199c5c5ed4 --- /dev/null +++ b/src/lbm_generated/sweep_collection/sweep_collection_generation_script.py @@ -0,0 +1,48 @@ +import sympy as sp + +from pystencils import Target +from pystencils import fields + +from lbmpy.creationfunctions import create_lb_collision_rule +from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil +from pystencils_walberla import ManualCodeGenerationContext, generate_info_header +from lbmpy_walberla import generate_lbm_sweep_collection + + +with ManualCodeGenerationContext(openmp=False, optimize_for_localhost=False, + mpi=True, double_accuracy=True, cuda=False) as ctx: + + for stencil in [LBStencil(Stencil.D3Q19), LBStencil(Stencil.D3Q27)]: + target = Target.GPU if ctx.cuda else Target.CPU + data_type = "float64" if ctx.double_accuracy else "float32" + openmp = True if ctx.openmp else False + if ctx.optimize_for_localhost: + cpu_vec = {"nontemporal": False, "assume_aligned": True} + else: + cpu_vec = None + + method = Method.SRT + relaxation_rate = sp.symbols("omega") + streaming_pattern = 'pull' + + pdfs = fields(f"pdfs({stencil.Q}): {data_type}[{stencil.D}D]", layout='fzyx') + density_field, velocity_field = fields(f"density(1), velocity({stencil.D}): {data_type}[{stencil.D}D]", + layout='fzyx') + + macroscopic_fields = {'density': density_field, 'velocity': velocity_field} + + lbm_config = LBMConfig(stencil=stencil, method=method, relaxation_rate=relaxation_rate, + streaming_pattern=streaming_pattern) + lbm_opt = LBMOptimisation(cse_global=False) + + collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt) + + generate_lbm_sweep_collection(ctx, f'{stencil.name}{method.name}', collision_rule, + streaming_pattern='pull', + field_layout='zyxf', + refinement_scaling=None, + macroscopic_fields=macroscopic_fields, + 
target=target, data_type=data_type, + cpu_openmp=openmp, cpu_vectorize_info=cpu_vec) + + ctx.write_all_files() diff --git a/src/stencil/Directions.h b/src/stencil/Directions.h index d3a75b812131878c4222d4cf4fa5ec5953e1f087..5be0d72223712c7cc8f4aa2b991bb9456f09aadb 100644 --- a/src/stencil/Directions.h +++ b/src/stencil/Directions.h @@ -9,9 +9,10 @@ #pragma once // core includes -#include "core/DataTypes.h" #include "core/cell/Cell.h" +#include "core/DataTypes.h" #include "core/debug/Debug.h" +#include "core/math/Vector3.h" // STL includes #include <string> @@ -135,6 +136,39 @@ namespace stencil { } }; + + /// Maps a (x,y,z) direction vector to its direction \ingroup stencil + inline Direction vectorToDirection(cell_idx_t x, cell_idx_t y, cell_idx_t z){ + static const Direction vecToDirArr[3][3][3] = { + { // x = -1 + {BSW, SW, TSW}, // y = -1 + {BW, W, TW}, // y = 0 + {BNW, NW, TNW} // y = 1 + }, + { // x = 0 + {BS, S, TS}, // y = -1 + {B, C, T}, // y = 0 + {BN, N, TN} // y = 1 + }, + { // x = 1 + {BSE, SE, TSE}, // y = -1 + {BE, E, TE}, // y = 0 + {BNE, NE, TNE} // y = 1 + } + }; + + return vecToDirArr[x + 1][y + 1][z + 1]; + } + + inline Direction vectorToDirection(Vector3< cell_idx_t > vec){ + return vectorToDirection(vec[0], vec[1], vec[2]); + } + + inline bool isFaceDirection(Direction dir) { return 1 <= dir && dir <= 6; } + inline bool isEdgeDirection(Direction dir) { return 7 <= dir && dir <= 18; } + inline bool isCornerDirection(Direction dir) { return 19 <= dir; } + + /// The x,y,z component for each normalized direction \ingroup stencil const real_t cNorm[3][NR_OF_DIRECTIONS] = { { diff --git a/src/timeloop/SweepTimeloop.cpp b/src/timeloop/SweepTimeloop.cpp index 6064efa27af1dce8a8b435132f961325759aa8a1..5721c51c79a57aa19b684776f8f70545a5a6d0bc 100644 --- a/src/timeloop/SweepTimeloop.cpp +++ b/src/timeloop/SweepTimeloop.cpp @@ -52,11 +52,11 @@ void SweepTimeloop::doTimeStep(const Set<SUID> &selectors) if( s.sweep.empty() ) { WALBERLA_ABORT("Selecting 
Sweep " << sweepIt->first << ": " << - "No sweep has been registered! Did you only register a BeforeFunction or AfterFunction?" ); + "No sweep has been registered! Did you only register a BeforeFunction or AfterFunction?" ) } // ensure that exactly one sweep has been registered that matches the specified selectors - size_t numSweeps = s.sweep.getNumberOfMatching(selectors + bi->getState()); + size_t const numSweeps = s.sweep.getNumberOfMatching(selectors + bi->getState()); if (numSweeps == size_t(0)) { continue; @@ -73,7 +73,7 @@ void SweepTimeloop::doTimeStep(const Set<SUID> &selectors) { std::string sweepName; s.sweep.getUnique( selectors + bi->getState(), sweepName ); - WALBERLA_LOG_PROGRESS("Running sweep \"" << sweepName << "\" on block " << bi->getId() ); + WALBERLA_LOG_PROGRESS("Running sweep \"" << sweepName << "\" on block " << bi->getId() ) } (selectedSweep->function_)( bi.get() ); @@ -121,11 +121,11 @@ void SweepTimeloop::doTimeStep(const Set<SUID> &selectors, WcTimingPool &timing) if( s.sweep.empty() ) { WALBERLA_ABORT("Selecting Sweep " << sweepIt->first << ": " << - "No sweep has been registered! Did you only register a BeforeFunction or AfterFunction?" ); + "No sweep has been registered! Did you only register a BeforeFunction or AfterFunction?" 
) } // ensure that exactly one sweep has been registered that matches the specified selectors - size_t numSweeps = s.sweep.getNumberOfMatching(selectors + bi->getState()); + size_t const numSweeps = s.sweep.getNumberOfMatching(selectors + bi->getState()); if (numSweeps == size_t(0)) { continue; @@ -139,7 +139,7 @@ void SweepTimeloop::doTimeStep(const Set<SUID> &selectors, WcTimingPool &timing) std::string sweepName; Sweep * selectedSweep = s.sweep.getUnique( selectors + bi->getState(), sweepName ); - WALBERLA_LOG_PROGRESS("Running sweep \"" << sweepName << "\" on block " << bi->getId() ); + WALBERLA_LOG_PROGRESS("Running sweep \"" << sweepName << "\" on block " << bi->getId() ) // loop over all blocks timing[sweepName].start(); diff --git a/src/timeloop/Timeloop.cpp b/src/timeloop/Timeloop.cpp index 6b2f548d54ec9922200488243eec2355e3a9f676..fd46e16c1a6e6cb646761e56f45231244c2083e9 100644 --- a/src/timeloop/Timeloop.cpp +++ b/src/timeloop/Timeloop.cpp @@ -40,7 +40,7 @@ Timeloop::Timeloop( uint_t nrOfTimeSteps) void Timeloop::run( const bool logTimeStep ) { - WALBERLA_LOG_PROGRESS( "Running timeloop for " << nrOfTimeSteps_ << " time steps" ); + WALBERLA_LOG_PROGRESS( "Running timeloop for " << nrOfTimeSteps_ << " time steps" ) while(curTimeStep_ < nrOfTimeSteps_) { singleStep( logTimeStep ); if ( stop_ ) { @@ -48,12 +48,12 @@ void Timeloop::run( const bool logTimeStep ) break; } } - WALBERLA_LOG_PROGRESS( "Timeloop finished" ); + WALBERLA_LOG_PROGRESS( "Timeloop finished" ) } void Timeloop::run( WcTimingPool & tp, const bool logTimeStep ) { - WALBERLA_LOG_PROGRESS( "Running timeloop for " << nrOfTimeSteps_ << " time steps" ); + WALBERLA_LOG_PROGRESS( "Running timeloop for " << nrOfTimeSteps_ << " time steps" ) while(curTimeStep_ < nrOfTimeSteps_) { singleStep( tp, logTimeStep ); @@ -63,7 +63,7 @@ void Timeloop::run( WcTimingPool & tp, const bool logTimeStep ) } } - WALBERLA_LOG_PROGRESS( "Timeloop finished" ); + WALBERLA_LOG_PROGRESS( "Timeloop finished" ) } 
//******************************************************************************************************************* @@ -97,9 +97,9 @@ void Timeloop::synchronizedStop( bool stopVal ) void Timeloop::singleStep( const bool logTimeStep ) { - LoggingStampManager raii( make_shared<LoggingStamp>( *this ), logTimeStep ); + LoggingStampManager const raii( make_shared<LoggingStamp>( *this ), logTimeStep ); - WALBERLA_LOG_PROGRESS( "Running time step " << curTimeStep_ ); + WALBERLA_LOG_PROGRESS( "Running time step " << curTimeStep_ ) for(size_t i=0; i<beforeFunctions_.size(); ++i ) executeSelectable( beforeFunctions_[i], uid::globalState(), "Pre-Timestep Function" ); @@ -114,9 +114,9 @@ void Timeloop::singleStep( const bool logTimeStep ) void Timeloop::singleStep( WcTimingPool & tp, const bool logTimeStep ) { - LoggingStampManager raii( make_shared<LoggingStamp>( *this ), logTimeStep ); + LoggingStampManager const raii( make_shared<LoggingStamp>( *this ), logTimeStep ); - WALBERLA_LOG_PROGRESS( "Running time step " << curTimeStep_ ); + WALBERLA_LOG_PROGRESS( "Running time step " << curTimeStep_ ) for(size_t i=0; i<beforeFunctions_.size(); ++i ) executeSelectable( beforeFunctions_[i], uid::globalState(), "Pre-Timestep Function", tp ); @@ -147,7 +147,7 @@ void Timeloop::addFuncBeforeTimeStep(const Timeloop::FctHandle & h, const VoidFctNoArguments& f, const std::string & id, const Set<SUID>&r, const Set<SUID> & e ) { - WALBERLA_ASSERT_LESS( h, beforeFunctions_.size() ); //invalid FctHandle + WALBERLA_ASSERT_LESS( h, beforeFunctions_.size() ) //invalid FctHandle beforeFunctions_[h].add(f,r,e,id); } @@ -166,7 +166,7 @@ void Timeloop::addFuncAfterTimeStep(const Timeloop::FctHandle & h, const VoidFctNoArguments& f, const std::string & id, const Set<SUID>&r, const Set<SUID> & e ) { - WALBERLA_ASSERT_LESS( h, afterFunctions_.size() ); //invalid FctHandle + WALBERLA_ASSERT_LESS( h, afterFunctions_.size() ) //invalid FctHandle afterFunctions_[h].add(f,r,e,id); } @@ -182,10 +182,10 @@ 
void Timeloop::executeSelectable( const selectable::SetSelectableObject<VoidFctN if( exe == nullptr ) WALBERLA_ABORT( "Trying to selecting " << what << ": " << "Multiple Matches found! Check your selector " << selector << std::endl - << "All registered objects: " << std::endl << selectable << std::endl ); + << "All registered objects: " << std::endl << selectable << std::endl ) - WALBERLA_LOG_PROGRESS("Running " << what << " \"" << objectName << "\"" ); + WALBERLA_LOG_PROGRESS("Running " << what << " \"" << objectName << "\"" ) LIKWID_MARKER_START( objectName.c_str() ); (*exe)(); @@ -203,9 +203,9 @@ void Timeloop::executeSelectable( const selectable::SetSelectableObject<VoidFctN if( !exe) WALBERLA_ABORT( "Trying to select " << what << ": " << "Multiple or no matches found! Check your selector " << selector << std::endl - << "All registered objects: " << std::endl << selectable << std::endl ); + << "All registered objects: " << std::endl << selectable << std::endl ) - WALBERLA_LOG_PROGRESS("Running " << what << " \"" << objectName << "\"" ); + WALBERLA_LOG_PROGRESS("Running " << what << " \"" << objectName << "\"" ) timing[objectName].start(); LIKWID_MARKER_START( objectName.c_str() ); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 94efcd3ae7e6607d6528c0bf2bd50a884cda4810..b16438de039b01b03a062b79d3589642491416b5 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -14,6 +14,7 @@ add_subdirectory( gather ) add_subdirectory( geometry ) add_subdirectory( gui ) add_subdirectory( lbm ) +add_subdirectory( lbm_generated ) add_subdirectory( lbm_mesapd_coupling ) add_subdirectory( mesa_pd ) add_subdirectory( mesh ) diff --git a/tests/core/FunctionTraitsTest.cpp b/tests/core/FunctionTraitsTest.cpp index 8c378eceaa7f16bf08a8400f31a35ffa028f6a9d..dc503f2db3df10f911fe2df14d7accbe0a800524 100644 --- a/tests/core/FunctionTraitsTest.cpp +++ b/tests/core/FunctionTraitsTest.cpp @@ -25,7 +25,7 @@ using namespace walberla; -// FunctionTraits are used in a 
similar way in cuda/Kernel.h. As explained below, special attention is required. +// FunctionTraits are used in a similar way in gpu/Kernel.h. As explained below, special attention is required. template< typename F> struct SomeClass { diff --git a/tests/field/CMakeLists.txt b/tests/field/CMakeLists.txt index 7251f35e4886df752f61e94f12b5a38e7c327bf8..b48f4ac79d1a778ccd1cadad910b6fab00a99b36 100644 --- a/tests/field/CMakeLists.txt +++ b/tests/field/CMakeLists.txt @@ -71,6 +71,11 @@ waLBerla_generate_target_from_python(NAME CodegenJacobiCPUGeneratedJacobiKernel waLBerla_compile_test( FILES codegen/CodegenJacobiCPU.cpp DEPENDS gui timeloop CodegenJacobiCPUGeneratedJacobiKernel) waLBerla_execute_test( NAME CodegenJacobiCPU ) +waLBerla_generate_target_from_python(NAME SweepCollectionKernel FILE codegen/SweepCollection.py + OUT_FILES SweepCollection.h SweepCollection.cpp) +waLBerla_compile_test( FILES codegen/SweepCollection.cpp DEPENDS timeloop SweepCollectionKernel) +waLBerla_execute_test( NAME SweepCollection ) + waLBerla_generate_target_from_python(NAME CodegenPoissonCPUGeneratedKernel FILE codegen/Poisson.py OUT_FILES Poisson.cpp Poisson.h ) waLBerla_compile_test( FILES codegen/CodegenPoissonCPU.cpp DEPENDS gui timeloop CodegenPoissonCPUGeneratedKernel) diff --git a/tests/field/codegen/CodegenJacobiCPU.cpp b/tests/field/codegen/CodegenJacobiCPU.cpp index 3bba9623ed02f18521431ac492f3b2c4d2a584d3..6755c687a9ff0496e02c0b776dcc90595b151090 100644 --- a/tests/field/codegen/CodegenJacobiCPU.cpp +++ b/tests/field/codegen/CodegenJacobiCPU.cpp @@ -84,7 +84,7 @@ void testJacobi2D() auto firstBlock = blocks->begin(); auto f = firstBlock->getData<ScalarField>( fieldID ); - WALBERLA_CHECK_FLOAT_EQUAL(f->get(0,0,0), real_c(1.0 / 4.0)); + WALBERLA_CHECK_FLOAT_EQUAL(f->get(0,0,0), real_c(1.0 / 4.0)) } @@ -132,7 +132,7 @@ void testJacobi3D() auto firstBlock = blocks->begin(); auto f = firstBlock->getData<ScalarField>( fieldID ); - WALBERLA_CHECK_FLOAT_EQUAL(f->get(0,0,0), real_c(1.0 
/ 8.0)); + WALBERLA_CHECK_FLOAT_EQUAL(f->get(0,0,0), real_c(1.0 / 8.0)) } diff --git a/tests/field/codegen/SweepCollection.cpp b/tests/field/codegen/SweepCollection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..33c8d2be099b9146d6738a5d7809023dbb7fa3b4 --- /dev/null +++ b/tests/field/codegen/SweepCollection.cpp @@ -0,0 +1,89 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file SweepCollection.cpp +//! 
\author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== +#include "blockforest/Initialization.h" + +#include "core/Environment.h" +#include "core/debug/TestSubsystem.h" + +#include "field/AddToStorage.h" +#include "field/communication/PackInfo.h" + +#include "timeloop/SweepTimeloop.h" +#include "SweepCollection.h" + +using namespace walberla; + +typedef GhostLayerField<real_t, 1> ScalarField; +using SweepCollection_T = pystencils::SweepCollection; + +void testSweepCollection() +{ + uint_t xSize = 20; + uint_t ySize = 20; + uint_t zSize = 20; + // Create blocks + shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid ( + uint_t(1) , uint_t(1), uint_t(1), // number of blocks in x,y,z direction + xSize, ySize, zSize, // how many cells per block (x,y,z) + real_c(1.0), // dx: length of one cell in physical coordinates + false, // one block per process - "false" means all blocks to one process + true, true, true ); // full periodicity + + + const real_t initField1 = real_c(1.0); + const real_t initField2 = real_c(0.0); + const real_t initField3 = real_c(0.0); + const real_t a = real_c(2.0); + + const BlockDataID field1ID = field::addToStorage<ScalarField>(blocks, "Field1", initField1); + const BlockDataID field2ID = field::addToStorage<ScalarField>(blocks, "Field2", initField2); + const BlockDataID field3ID = field::addToStorage<ScalarField>(blocks, "Field3", initField3); + + SweepCollection_T sweepCollection(blocks, field1ID, field2ID, field3ID, a); + + // Create Timeloop + const uint_t numberOfTimesteps = uint_t(100); + SweepTimeloop timeloop ( blocks, numberOfTimesteps ); + + // Registering the sweep + timeloop.add() << Sweep( sweepCollection.fct1(SweepCollection_T::ALL), "fc1" ); + timeloop.add() << Sweep( sweepCollection.fct2(SweepCollection_T::ALL), "fc2" ); + + timeloop.run(); + + auto firstBlock = blocks->begin(); + auto 
field1 = firstBlock->getData<ScalarField>( field1ID ); + auto field2 = firstBlock->getData<ScalarField>( field2ID ); + auto field3 = firstBlock->getData<ScalarField>( field3ID ); + + WALBERLA_CHECK_FLOAT_EQUAL(field1->get(0,0,0), initField1) + WALBERLA_CHECK_FLOAT_EQUAL(field2->get(0,0,0), initField1 * real_c(2.0) * a) + WALBERLA_CHECK_FLOAT_EQUAL(field3->get(0,0,0), initField1 * real_c(2.0) * a * real_c(2.0) * a) +} + + +int main( int argc, char ** argv ) +{ + mpi::Environment env( argc, argv ); + debug::enterTestMode(); + + testSweepCollection(); + return EXIT_SUCCESS; +} diff --git a/tests/field/codegen/SweepCollection.py b/tests/field/codegen/SweepCollection.py new file mode 100644 index 0000000000000000000000000000000000000000..1229a2e2ec4594e7f10596b08dd7801f3a0d465a --- /dev/null +++ b/tests/field/codegen/SweepCollection.py @@ -0,0 +1,19 @@ +import sympy as sp + +import pystencils as ps +from pystencils import Assignment +from pystencils_walberla import CodeGeneration, function_generator, generate_sweep_collection + + +with CodeGeneration() as ctx: + field_type = "float64" if ctx.double_accuracy else "float32" + + a = sp.Symbol('a') + f1, f2, f3 = ps.fields(f"f1, f2, f3: {field_type}[3D]", layout='fzyx') + up1 = Assignment(f2.center, 2 * a * f1.center) + up2 = Assignment(f3.center, 2 * a * f2.center) + + fct1 = function_generator(ctx, 'fct1', up1) + fct2 = function_generator(ctx, 'fct2', up2) + + generate_sweep_collection(ctx, "SweepCollection", [fct1, fct2]) diff --git a/tests/gpu/codegen/GeneratedFieldPackInfoTestGPU.cpp b/tests/gpu/codegen/GeneratedFieldPackInfoTestGPU.cpp index 4360b66e97cc65176b79bc215f3b73f099f2160d..55bf49b1b1fb158164d5b9a764fd35ae02defaf5 100644 --- a/tests/gpu/codegen/GeneratedFieldPackInfoTestGPU.cpp +++ b/tests/gpu/codegen/GeneratedFieldPackInfoTestGPU.cpp @@ -28,6 +28,7 @@ #include "core/debug/TestSubsystem.h" #include "core/Environment.h" +#include "gpu/GPUWrapper.h" #include "gpu/FieldCopy.h" #include 
"gpu/communication/UniformGPUScheme.h" @@ -60,7 +61,7 @@ gpu::GPUField<int> * createSmallGPUField( IBlock * const , StructuredBlockStorag void testScalarField( std::shared_ptr<blockforest::StructuredBlockForest> & sbf, BlockDataID gpuFieldId ) { - gpu::communication::UniformGPUScheme< Stencil_T > us{ sbf }; + gpu::communication::UniformGPUScheme< Stencil_T > us{ sbf, false, false }; us.addPackInfo(std::make_shared< pystencils::ScalarFieldCommunicationGPU >(gpuFieldId)); for( auto & block : *sbf ) { @@ -97,10 +98,10 @@ void testScalarField( std::shared_ptr<blockforest::StructuredBlockForest> & sbf, } void testScalarFieldPullReduction( std::shared_ptr<blockforest::StructuredBlockForest> & sbf, BlockDataID gpuFieldId ) { - gpu::communication::UniformGPUScheme< Stencil_T > us1{ sbf }; + gpu::communication::UniformGPUScheme< Stencil_T > us1{ sbf, false, false }; us1.addPackInfo(std::make_shared< pystencils::ScalarFieldPullReductionGPU >(gpuFieldId)); - gpu::communication::UniformGPUScheme< Stencil_T > us2{ sbf }; + gpu::communication::UniformGPUScheme< Stencil_T > us2{ sbf, false, false }; us2.addPackInfo(std::make_shared< pystencils::ScalarFieldCommunicationGPU >(gpuFieldId)); for( auto & block : *sbf ) { diff --git a/tests/gpu/communication/GPUBlockSelectorCommunicationTest.cpp b/tests/gpu/communication/GPUBlockSelectorCommunicationTest.cpp index d70ecf5a35e5b0773ae99f0f7a6a520f8c01b9bd..3e79d6263b5feb28a11cc15d6206ba109cd3df5e 100644 --- a/tests/gpu/communication/GPUBlockSelectorCommunicationTest.cpp +++ b/tests/gpu/communication/GPUBlockSelectorCommunicationTest.cpp @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file GPUBlockSelectorCommunicationTest.cpp -//! \ingroup cuda +//! \ingroup gpu //! \author Helen Schottenhamml <helen.schottenhamml@fau.de> //! \brief Short communication test for the usage of block selectors in UniformGPUScheme. 
// @@ -23,24 +23,26 @@ #include <blockforest/GlobalLoadBalancing.h> #include <blockforest/Initialization.h> #include <blockforest/SetupBlockForest.h> + #include <core/DataTypes.h> #include <core/Environment.h> #include <core/debug/TestSubsystem.h> #include <core/math/Random.h> + #include <domain_decomposition/BlockDataID.h> + #include <field/AddToStorage.h> #include <field/GhostLayerField.h> + +#include "gpu/GPUWrapper.h" #include <gpu/AddGPUFieldToStorage.h> #include <gpu/FieldCopy.h> #include <gpu/GPUField.h> #include <gpu/communication/MemcpyPackInfo.h> #include <gpu/communication/UniformGPUScheme.h> + #include <stencil/D3Q27.h> #include <stencil/Directions.h> -#include <stencil/Iterator.h> -#include <vector> - -#include "gpu/GPUWrapper.h" namespace walberla { @@ -53,15 +55,13 @@ using GPUScalarField_T = gpu::GPUField< Type_T >; const Set< SUID > requiredBlockSelector("communication"); const Set< SUID > incompatibleBlockSelector("no communication"); -void suidAssignmentFunction( blockforest::SetupBlockForest & forest ) { - - for( auto & sblock : forest ) { - if( forest.atDomainXMinBorder( sblock ) ) { - sblock.addState(incompatibleBlockSelector); - } else { - sblock.addState(requiredBlockSelector); - } - sblock.setWorkload(walberla::numeric_cast<walberla::workload_t>(1)); +void suidAssignmentFunction(blockforest::SetupBlockForest& forest) +{ + for (auto& sblock : forest) + { + if (forest.atDomainXMinBorder(sblock)) { sblock.addState(incompatibleBlockSelector); } + else { sblock.addState(requiredBlockSelector); } + sblock.setWorkload(walberla::numeric_cast< walberla::workload_t >(1)); } } @@ -70,13 +70,9 @@ void initScalarField(std::shared_ptr< StructuredBlockForest >& blocks, const Blo for (auto& block : *blocks) { Type_T val; - if (blocks->atDomainXMinBorder(block)) { - val = Type_T(-1); - } else if (blocks->atDomainXMaxBorder(block)) { - val = Type_T(1); - } else { - val = Type_T(0); - } + if (blocks->atDomainXMinBorder(block)) { val = Type_T(-1); } + else if 
(blocks->atDomainXMaxBorder(block)) { val = Type_T(1); } + else { val = Type_T(0); } auto* field = block.getData< ScalarField_T >(fieldID); WALBERLA_ASSERT_NOT_NULLPTR(field) @@ -90,12 +86,11 @@ void initScalarField(std::shared_ptr< StructuredBlockForest >& blocks, const Blo } } -std::shared_ptr< StructuredBlockForest > createSelectorBlockGrid ( - const uint_t numberOfXBlocks, const uint_t numberOfYBlocks, const uint_t numberOfZBlocks, - const uint_t numberOfXCellsPerBlock, const uint_t numberOfYCellsPerBlock, const uint_t numberOfZCellsPerBlock, - const real_t dx, - const bool xPeriodic, const bool yPeriodic, const bool zPeriodic, - const bool keepGlobalBlockInformation ) +std::shared_ptr< StructuredBlockForest > + createSelectorBlockGrid(const uint_t numberOfXBlocks, const uint_t numberOfYBlocks, const uint_t numberOfZBlocks, + const uint_t numberOfXCellsPerBlock, const uint_t numberOfYCellsPerBlock, + const uint_t numberOfZCellsPerBlock, const real_t dx, const bool xPeriodic, + const bool yPeriodic, const bool zPeriodic, const bool keepGlobalBlockInformation) { // initialize SetupBlockForest = determine domain decomposition @@ -103,10 +98,12 @@ std::shared_ptr< StructuredBlockForest > createSelectorBlockGrid ( sforest.addWorkloadMemorySUIDAssignmentFunction(suidAssignmentFunction); - AABB const domainAABB{ real_c(0), real_c(0), real_c(0), - dx * real_c( numberOfXBlocks * numberOfXCellsPerBlock ), - dx * real_c( numberOfYBlocks * numberOfYCellsPerBlock ), - dx * real_c( numberOfZBlocks * numberOfZCellsPerBlock ) }; + AABB const domainAABB{ real_c(0), + real_c(0), + real_c(0), + dx * real_c(numberOfXBlocks * numberOfXCellsPerBlock), + dx * real_c(numberOfYBlocks * numberOfYCellsPerBlock), + dx * real_c(numberOfZBlocks * numberOfZCellsPerBlock) }; sforest.init(domainAABB, numberOfXBlocks, numberOfYBlocks, numberOfZBlocks, xPeriodic, yPeriodic, zPeriodic); // calculate process distribution @@ -115,8 +112,8 @@ std::shared_ptr< StructuredBlockForest > 
createSelectorBlockGrid ( blockforest::GlobalLoadBalancing::MetisConfiguration< SetupBlock > const metisConfig( true, false, - std::bind(blockforest::cellWeightedCommunicationCost, std::placeholders::_1, std::placeholders::_2, numberOfXCellsPerBlock, - numberOfYCellsPerBlock, numberOfZCellsPerBlock)); + std::bind(blockforest::cellWeightedCommunicationCost, std::placeholders::_1, std::placeholders::_2, + numberOfXCellsPerBlock, numberOfYCellsPerBlock, numberOfZCellsPerBlock)); sforest.calculateProcessDistribution_Default(uint_c(MPIManager::instance()->numProcesses()), memoryLimit, "hilbert", 10, false, metisConfig); @@ -140,15 +137,16 @@ int main(int argc, char** argv) debug::enterTestMode(); walberla::Environment const walberlaEnv(argc, argv); - const Vector3<uint_t> nBlocks { 3, 1, 1 }; - const Vector3<uint_t> cells { 2, 2, 1 }; - Vector3<real_t> domainSize; - for( uint_t d = 0; d < 3; ++d ) { + const Vector3< uint_t > nBlocks{ 3, 1, 1 }; + const Vector3< uint_t > cells{ 2, 2, 1 }; + Vector3< real_t > domainSize; + for (uint_t d = 0; d < 3; ++d) + { domainSize[d] = real_c(cells[d] * nBlocks[d]); } - auto blocks = createSelectorBlockGrid(nBlocks[0], nBlocks[1], nBlocks[2], - cells[0], cells[1], cells[2], 1, false, true, true, true); + auto blocks = createSelectorBlockGrid(nBlocks[0], nBlocks[1], nBlocks[2], cells[0], cells[1], cells[2], 1, false, + true, true, true); BlockDataID const fieldID = field::addToStorage< ScalarField_T >(blocks, "scalar", Type_T(0), field::fzyx, uint_t(1)); initScalarField(blocks, fieldID); @@ -161,17 +159,20 @@ int main(int argc, char** argv) // Perform one communication step communication(); + WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) + // Copy to CPU gpu::fieldCpy< ScalarField_T, GPUScalarField_T >( blocks, fieldID, gpuFieldID ); // Check for correct data in ghost layers of middle block - auto middleBlock = blocks->getBlock( domainSize[0] / real_c(2), domainSize[1] / real_c(2), domainSize[2] / real_c(2) ); - auto cpuField = 
middleBlock->getData<ScalarField_T>(fieldID); + auto middleBlock = blocks->getBlock(domainSize[0] / real_c(2), domainSize[1] / real_c(2), domainSize[2] / real_c(2)); + auto cpuField = middleBlock->getData< ScalarField_T >(fieldID); WALBERLA_ASSERT_NOT_NULLPTR(cpuField) - + // avoid unused variable warning in release mode (void) cpuField; + // WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(cpuField, WALBERLA_LOG_DEVEL_VAR(cpuField->get(x, y, z))) // check for missing communication with left neighbour (first block, incompatible selector) WALBERLA_ASSERT_EQUAL(cpuField->get(-1, 0, 0), 0, "Communication with left neighbor detected.") diff --git a/tests/gpu/communication/GPUPackInfoCommunicationTest.cpp b/tests/gpu/communication/GPUPackInfoCommunicationTest.cpp index 66a4d3c74da29b7779783132a3d8f3cce5a08287..f0e41c1081e306cad53a5d0e3f04187acbb18b95 100644 --- a/tests/gpu/communication/GPUPackInfoCommunicationTest.cpp +++ b/tests/gpu/communication/GPUPackInfoCommunicationTest.cpp @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file GPUFieldPackInfoTest.cpp -//! \ingroup cuda +//! \ingroup gpu //! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br> //! \brief Short communication test to verify the equivalence of GPUPackInfo using a default stream and multiple //! streams. 
@@ -37,7 +37,6 @@ #include "stencil/Directions.h" #include "stencil/Iterator.h" -#include <cuda_runtime.h> #include <vector> #include "gpu/ErrorChecking.h" @@ -134,7 +133,7 @@ int main(int argc, char** argv) CommSchemeType syncCommScheme(blocks); syncCommScheme.addPackInfo(make_shared< GPUPackInfoType >(syncGPUFieldId)); - // Setup communication scheme for asynchronous GPUPackInfo, which uses CUDA streams + // Setup communication scheme for asynchronous GPUPackInfo, which uses GPU streams CommSchemeType asyncCommScheme(blocks); asyncCommScheme.addPackInfo(make_shared< GPUPackInfoType >(asyncGPUFieldId)); diff --git a/tests/gpu/communication/GPUPackInfoTest.cpp b/tests/gpu/communication/GPUPackInfoTest.cpp index fec15a605a230c59f96abc3e31e8160992000338..e0a9d87fd06f7d261b09942c7d69bed189e60177 100644 --- a/tests/gpu/communication/GPUPackInfoTest.cpp +++ b/tests/gpu/communication/GPUPackInfoTest.cpp @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file GPUFieldPackInfoTest.cpp -//! \ingroup cuda +//! \ingroup gpu //! \author Paulo Carvalho <prcjunior@inf.ufpr.br> //! 
\brief Tests if a GPUField is correctly packed into buffers // diff --git a/tests/lbm/diff_packinfos.sh b/tests/lbm/diff_packinfos.sh index bfa89c5ef63477c61fefac60b7767fe22aaf4233..074d31492dbc1dd2cc0f47bc059ab5d181117f22 100755 --- a/tests/lbm/diff_packinfos.sh +++ b/tests/lbm/diff_packinfos.sh @@ -2,5 +2,5 @@ REGEX='^((#include)|(void)|(uint_t))' cd default_codegen -diff -u -B <(grep -vP "$REGEX" FromKernelPackInfoPull.cpp) <(grep -vP "$REGEX" AccessorBasedPackInfoEven.cpp) || exit 1 -diff -u -B <(grep -vP "$REGEX" FromKernelPackInfoPush.cpp) <(grep -vP "$REGEX" AccessorBasedPackInfoOdd.cpp) || exit 1 +diff -u -B <(tail -n +20 FromKernelPackInfoPull.cpp | grep -vP "$REGEX") <(tail -n +20 AccessorBasedPackInfoEven.cpp | grep -vP "$REGEX") || exit 1 +diff -u -B <(tail -n +20 FromKernelPackInfoPush.cpp | grep -vP "$REGEX") <(tail -n +20 AccessorBasedPackInfoOdd.cpp | grep -vP "$REGEX") || exit 1 diff --git a/tests/lbm_generated/CMakeLists.txt b/tests/lbm_generated/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7a7ef76bd6b9a605d19cd7e2bbdf4bedddbed7b --- /dev/null +++ b/tests/lbm_generated/CMakeLists.txt @@ -0,0 +1,21 @@ +############################################################################################################################# +# +# Tests for generated lbm module +# +############################################################################################################################# +waLBerla_link_files_to_builddir( "*.prm" ) +waLBerla_link_files_to_builddir( "*.py" ) + +waLBerla_generate_target_from_python(NAME ExampleGenerated + FILE Example.py + OUT_FILES LBMStorageSpecification.h LBMStorageSpecification.cpp + LBMSweepCollection.h LBMSweepCollection.cpp + NoSlip.h NoSlip.cpp + UBB.h UBB.cpp + LBMBoundaryCollection.h + Example_InfoHeader.h) +waLBerla_compile_test( FILES Example.cpp DEPENDS ExampleGenerated blockforest field lbm_generated timeloop ) + +if( WALBERLA_DOUBLE_ACCURACY ) 
+waLBerla_compile_test( FILES LDC.cpp DEPENDS blockforest field lbm_generated timeloop ) +endif() diff --git a/tests/lbm_generated/Example.cpp b/tests/lbm_generated/Example.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4dfd69b553d88d268efb0c49c857eb391f6277ea --- /dev/null +++ b/tests/lbm_generated/Example.cpp @@ -0,0 +1,233 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file Example.cpp +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#include "blockforest/all.h" + +#include "core/all.h" + +#include "domain_decomposition/all.h" + +#include "field/all.h" + +#include "geometry/all.h" + +#include "timeloop/all.h" + +#include "lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h" +#include "lbm_generated/communication/UniformGeneratedPdfPackInfo.h" +#include "lbm_generated/field/AddToStorage.h" +#include "lbm_generated/field/PdfField.h" +#include "lbm_generated/refinement/BasicRecursiveTimeStep.h" + +// include the generated header file. 
It includes all generated classes +#include "Example_InfoHeader.h" + +using namespace walberla; +using namespace std::placeholders; + +using StorageSpecification_T = lbm::LBMStorageSpecification; +using Stencil_T = StorageSpecification_T::Stencil; +using CommunicationStencil_T = StorageSpecification_T::CommunicationStencil; +using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >; +using PackInfo_T = lbm_generated::UniformGeneratedPdfPackInfo< PdfField_T >; + +using SweepCollection_T = lbm::LBMSweepCollection; + +using VectorField_T = GhostLayerField< real_t, StorageSpecification_T::Stencil::D >; +using ScalarField_T = GhostLayerField< real_t, 1 >; + +using flag_t = walberla::uint8_t; +using FlagField_T = FlagField< flag_t >; +using BoundaryCollection_T = lbm::LBMBoundaryCollection< FlagField_T >; + +using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction; + +class LDCRefinement +{ + public: + LDCRefinement(const uint_t depth) : refinementDepth_(depth){}; + + void operator()(SetupBlockForest& forest) + { + std::vector< SetupBlock* > blocks; + forest.getBlocks(blocks); + + for (auto b : blocks) + { + if (forest.atDomainZMaxBorder(*b)) + { + if (b->getLevel() < refinementDepth_) { b->setMarker(true); } + } + } + } + + private: + const uint_t refinementDepth_; +}; + +class LDC +{ + public: + LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){}; + + Vector3< real_t > acceleration() const { return Vector3< real_t >(0.0); } + + RefinementSelectionFunctor refinementSelector() { return LDCRefinement(refinementDepth_); } + + void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID) + { + for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt) + { + Block& b = dynamic_cast< Block& >(*bIt); + uint_t level = b.getLevel(); + auto flagField = b.getData< FlagField_T >(flagFieldID); + uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_); + uint8_t ubbFlag = 
flagField->registerFlag(ubbFlagUID_); + for (auto cIt = flagField->beginWithGhostLayerXYZ(2); cIt != flagField->end(); ++cIt) + { + Cell localCell = cIt.cell(); + Cell globalCell(localCell); + sbfs.transformBlockLocalToGlobalCell(globalCell, b); + if (globalCell.z() >= cell_idx_c(sbfs.getNumberOfZCells(level))) { flagField->addFlag(localCell, ubbFlag); } + else if (globalCell.z() < 0 || globalCell.x() < 0 || + globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level))) + { + flagField->addFlag(localCell, noslipFlag); + } + } + } + } + + private: + const std::string refinementProfile_; + const uint_t refinementDepth_; + + const FlagUID noSlipFlagUID_; + const FlagUID ubbFlagUID_; +}; + +static void createSetupBlockForest(SetupBlockForest& setupBfs, const Config::BlockHandle& domainSetup, LDC& setup) +{ + Vector3< real_t > domainSize = domainSetup.getParameter< Vector3< real_t > >("domainSize"); + Vector3< uint_t > rootBlocks = domainSetup.getParameter< Vector3< uint_t > >("rootBlocks"); + Vector3< bool > periodic = domainSetup.getParameter< Vector3< bool > >("periodic"); + + auto refSelection = setup.refinementSelector(); + setupBfs.addRefinementSelectionFunction(std::function< void(SetupBlockForest&) >(refSelection)); + AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]); + setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]); + setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalance(true), uint_c(MPIManager::instance()->numProcesses())); +} + +int main(int argc, char** argv) +{ + walberla::Environment walberlaEnv(argc, argv); + mpi::MPIManager::instance()->useWorldComm(); + + // read parameters + auto domainSetup = walberlaEnv.config()->getOneBlock("DomainSetup"); + auto parameters = walberlaEnv.config()->getOneBlock("Parameters"); + + auto omega = parameters.getParameter< real_t >("omega", real_c(1.4)); + auto timesteps = parameters.getParameter< uint_t 
>("timesteps", uint_c(10)) + uint_c(1); + auto refinementDepth = parameters.getParameter< uint_t >("refinementDepth", uint_c(1)); + + auto remainingTimeLoggerFrequency = + parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds + + auto flowSetup = std::make_shared< LDC >(refinementDepth); + + SetupBlockForest setupBfs; + WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...") + createSetupBlockForest(setupBfs, domainSetup, *flowSetup); + // domainSetup + + // Create structured block forest + Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock"); + + WALBERLA_LOG_INFO_ON_ROOT("Creating structured block forest...") + auto bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs); + auto blocks = std::make_shared< StructuredBlockForest >(bfs, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]); + blocks->createCellBoundingBoxes(); + + WALBERLA_ROOT_SECTION() { vtk::writeDomainDecomposition(blocks, "domainDecomposition", "vtk_out"); } + + WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks()) + for (uint_t level = 0; level <= refinementDepth; level++) + { + WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << setupBfs.getNumberOfBlocks(level)) + } + + StorageSpecification_T StorageSpec = StorageSpecification_T(); + BlockDataID pdfFieldId = lbm_generated::addPdfFieldToStorage(blocks, "pdf field", StorageSpec, uint_c(2)); + BlockDataID velFieldId = field::addToStorage< VectorField_T >(blocks, "Velocity", real_c(0.0), field::fzyx); + + BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field", uint_c(3)); + + SweepCollection_T sweepCollection(blocks, pdfFieldId, velFieldId, omega); + for (auto& block : *blocks) + { + sweepCollection.initialise(&block); + } + + const FlagUID fluidFlagUID("Fluid"); + flowSetup->setupBoundaryFlagField(*blocks, flagFieldId); + 
geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldId, fluidFlagUID, 2); + BoundaryCollection_T boundaryCollection(blocks, flagFieldId, pdfFieldId, fluidFlagUID); + + WALBERLA_LOG_INFO_ON_ROOT("Setting up communication...") + auto comm = + std::make_shared< blockforest::communication::NonUniformBufferedScheme< CommunicationStencil_T > >(blocks); + auto packInfo = lbm_generated::setupNonuniformPdfCommunication< PdfField_T >(blocks, pdfFieldId); + comm->addPackInfo(packInfo); + + lbm_generated::BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T > timestep( + blocks, pdfFieldId, sweepCollection, boundaryCollection, comm, packInfo); + + SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps); + uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + if (vtkWriteFrequency > 0) + { + auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "ExampleVTK", vtkWriteFrequency, 0, false, "vtk_out", + "simulation_step", false, true, true, false, 0); + + auto velWriter = make_shared< field::VTKWriter< VectorField_T > >(velFieldId, "velocity"); + vtkOutput->addBeforeFunction([&]() { + for (auto& block : *blocks) + { + sweepCollection.calculateMacroscopicParameters(&block); + } + }); + + vtkOutput->addCellDataWriter(velWriter); + timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); + } + timeloop.addFuncAfterTimeStep(timestep); + + // log remaining time + timeloop.addFuncAfterTimeStep(timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency), + "remaining time logger"); + + WALBERLA_LOG_INFO_ON_ROOT("Starting Simulation with " << timesteps << " timesteps") + + timeloop.run(); + + return EXIT_SUCCESS; +} diff --git a/tests/lbm_generated/Example.prm b/tests/lbm_generated/Example.prm new file mode 100644 index 0000000000000000000000000000000000000000..1957b362e4b77a94fb3d3c68b6e9d33b0efb3e6f --- /dev/null +++ b/tests/lbm_generated/Example.prm @@ -0,0 
+1,30 @@ + +Parameters +{ + omega 1.95; + timesteps 3000; + refinementDepth 1; + + remainingTimeLoggerFrequency 3; // in seconds + vtkWriteFrequency 500; +} + +DomainSetup +{ + domainSize <64, 64, 64>; + rootBlocks <4, 4, 4>; + + cellsPerBlock < 16, 16, 16 >; + periodic < 0, 1, 0 >; +} + +Boundaries +{ + + Border { direction W; walldistance -1; flag NoSlip; } + Border { direction E; walldistance -1; flag NoSlip; } + Border { direction S; walldistance -1; flag NoSlip; } + Border { direction N; walldistance -1; flag UBB; } + Border { direction T; walldistance -1; flag NoSlip; } + Border { direction B; walldistance -1; flag NoSlip; } +} diff --git a/tests/lbm_generated/Example.py b/tests/lbm_generated/Example.py new file mode 100644 index 0000000000000000000000000000000000000000..5233639be24c6574cee6440300bfe73e22e5e2ae --- /dev/null +++ b/tests/lbm_generated/Example.py @@ -0,0 +1,48 @@ +import sympy as sp + +from pystencils import Target +from pystencils import fields + +from lbmpy.advanced_streaming.utility import get_timesteps +from lbmpy.boundaries import NoSlip, UBB +from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule +from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil +from pystencils_walberla import CodeGeneration, generate_info_header +from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator + +import warnings + +warnings.filterwarnings("ignore") +with CodeGeneration() as ctx: + target = Target.CPU # Target.GPU if ctx.cuda else Target.CPU + data_type = "float64" if ctx.double_accuracy else "float32" + + streaming_pattern = 'esotwist' + timesteps = get_timesteps(streaming_pattern) + + omega = sp.symbols("omega") + + stencil = LBStencil(Stencil.D3Q19) + pdfs, vel_field = fields(f"pdfs({stencil.Q}), velocity({stencil.D}): {data_type}[{stencil.D}D]", layout='fzyx') + + macroscopic_fields = {'velocity': vel_field} + + lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega, + 
streaming_pattern=streaming_pattern) + lbm_opt = LBMOptimisation(cse_global=False, field_layout='fzyx') + + method = create_lb_method(lbm_config=lbm_config) + collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt) + + no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip', + boundary_object=NoSlip()) + ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB', + boundary_object=UBB([0.05, 0, 0], data_type=data_type)) + + generate_lbm_package(ctx, name="LBM", + collision_rule=collision_rule, + lbm_config=lbm_config, lbm_optimisation=lbm_opt, + nonuniform=True, boundaries=[no_slip, ubb], + macroscopic_fields=macroscopic_fields) + + generate_info_header(ctx, 'Example_InfoHeader') diff --git a/tests/lbm_generated/LDC.cpp b/tests/lbm_generated/LDC.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6df6d45a3e1cf2915e3b077a83ee77c40668fff7 --- /dev/null +++ b/tests/lbm_generated/LDC.cpp @@ -0,0 +1,136 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file LDC.cpp +//! 
\author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#include "blockforest/all.h" +#include "blockforest/communication/UniformBufferedScheme.h" + +#include "core/all.h" + +#include "domain_decomposition/all.h" + +#include "field/all.h" +#include "geometry/all.h" +#include "timeloop/all.h" + +#include "lbm_generated/communication/UniformGeneratedPdfPackInfo.h" +#include "lbm_generated/field/AddToStorage.h" +#include "lbm_generated/field/PdfField.h" + +#include "lbm_generated/storage_specification/D3Q19StorageSpecification.h" +#include "lbm_generated/sweep_collection/D3Q19SRT.h" +#include "lbm_generated/boundary/D3Q19BoundaryCollection.h" + + +using namespace walberla; +using namespace std::placeholders; + +using StorageSpecification_T = lbm::D3Q19StorageSpecification; +using Stencil_T = StorageSpecification_T::Stencil; +using CommunicationStencil_T = StorageSpecification_T::CommunicationStencil; +using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >; + +using SweepCollection_T = lbm::D3Q19SRT; + +using VectorField_T = GhostLayerField< real_t, StorageSpecification_T::Stencil::D >; +using ScalarField_T = GhostLayerField< real_t, 1 >; + +using flag_t = walberla::uint8_t; +using FlagField_T = FlagField< flag_t >; +using BoundaryCollection_T = lbm::D3Q19BoundaryCollection< FlagField_T >; + +using blockforest::communication::UniformBufferedScheme; + +int main(int argc, char** argv) +{ + walberla::Environment walberlaEnv(argc, argv); + mpi::MPIManager::instance()->useWorldComm(); + + // read parameters + auto parameters = walberlaEnv.config()->getOneBlock("Parameters"); + + const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4)); + const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(10)) + uint_c(1); + + const double remainingTimeLoggerFrequency = + parameters.getParameter< double 
>("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds + + auto blocks = blockforest::createUniformBlockGridFromConfig(walberlaEnv.config()); + + StorageSpecification_T const StorageSpec = StorageSpecification_T(); + BlockDataID const pdfFieldId = lbm_generated::addPdfFieldToStorage(blocks, "pdf field", StorageSpec, uint_c(1), field::fzyx); + BlockDataID const velFieldId = field::addToStorage< VectorField_T >(blocks, "Velocity", real_c(0.0), field::fzyx); + BlockDataID const densityFieldId = field::addToStorage< ScalarField_T >(blocks, "density", real_c(0.0), field::fzyx); + BlockDataID const flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field", uint_c(1)); + + const FlagUID fluidFlagUID("Fluid"); + + auto boundariesConfig = walberlaEnv.config()->getBlock("Boundaries"); + geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldId, boundariesConfig); + geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldId, fluidFlagUID); + + BoundaryCollection_T boundaryCollection(blocks, flagFieldId, pdfFieldId, fluidFlagUID, real_c(1.0), real_c(0.05), real_c(0.0), real_c(0.0)); + SweepCollection_T sweepCollection(blocks, pdfFieldId, densityFieldId, velFieldId, omega); + + for (auto& block : *blocks) + { + sweepCollection.initialise(&block); + } + + auto packInfo = std::make_shared<lbm_generated::UniformGeneratedPdfPackInfo< PdfField_T >>(pdfFieldId); + UniformBufferedScheme< Stencil_T > communication(blocks); + communication.addPackInfo(packInfo); + + SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps); + + timeLoop.add() << BeforeFunction(communication, "communication") + << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL), "Boundary Conditions"); + timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide"); + // + auto vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + if (vtkWriteFrequency > 0) + { + auto vtkOutput = 
vtk::createVTKOutput_BlockData(*blocks, "ExampleVTK", vtkWriteFrequency, 0, false, "vtk_out", + "simulation_step", false, true, true, false, 0); + + auto velWriter = make_shared< field::VTKWriter< VectorField_T > >(velFieldId, "velocity"); + auto densWriter = make_shared< field::VTKWriter< ScalarField_T > >(densityFieldId, "density"); + vtkOutput->addBeforeFunction([&](){ + for (auto& block : *blocks) + { + sweepCollection.calculateMacroscopicParameters(&block); + } + }); + + vtkOutput->addCellDataWriter(velWriter); + vtkOutput->addCellDataWriter(densWriter); + + timeLoop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); + } + + // log remaining time + timeLoop.addFuncAfterTimeStep(timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps(), remainingTimeLoggerFrequency), + "remaining time logger"); + + WALBERLA_LOG_INFO_ON_ROOT("Starting Simulation with " << timesteps << " timesteps") + + timeLoop.run(); + + return EXIT_SUCCESS; +} diff --git a/tests/lbm_generated/LDC.prm b/tests/lbm_generated/LDC.prm new file mode 100644 index 0000000000000000000000000000000000000000..4ba435d1b027eee3f9a066e9a9e39aa5e1ec831f --- /dev/null +++ b/tests/lbm_generated/LDC.prm @@ -0,0 +1,28 @@ + +Parameters +{ + omega 1.95; + timesteps 3000; + + remainingTimeLoggerFrequency 3; // in seconds + vtkWriteFrequency 500; +} + +DomainSetup +{ + Blocks <4, 4, 4>; + cellsPerBlock < 32, 32, 32 >; + + periodic < 0, 1, 0 >; +} + +Boundaries +{ + + Border { direction W; walldistance -1; flag NoSlip; } + Border { direction E; walldistance -1; flag NoSlip; } + Border { direction S; walldistance -1; flag NoSlip; } + Border { direction N; walldistance -1; flag UBB; } + Border { direction T; walldistance -1; flag NoSlip; } + Border { direction B; walldistance -1; flag NoSlip; } +}