From 1b2194cd569b7acdddd7679af1d8b8be3989c59c Mon Sep 17 00:00:00 2001
From: Christoph Rettinger <christoph.rettinger@fau.de>
Date: Fri, 19 Jun 2020 11:11:25 +0200
Subject: [PATCH] Fixed vectorization depening on field layout, added test case
 to verify it

---
 .../lbmpy_walberla/walberla_lbm_generation.py |  10 +-
 python/pystencils_walberla/__init__.py        |   5 +-
 python/pystencils_walberla/codegen.py         |  19 +--
 tests/lbm/CMakeLists.txt                      |  10 ++
 .../FieldLayoutAndVectorizationTest.cpp       | 127 ++++++++++++++++++
 .../FieldLayoutAndVectorizationTest.py        |  24 ++++
 6 files changed, 181 insertions(+), 14 deletions(-)
 create mode 100644 tests/lbm/codegen/FieldLayoutAndVectorizationTest.cpp
 create mode 100644 tests/lbm/codegen/FieldLayoutAndVectorizationTest.py

diff --git a/python/lbmpy_walberla/walberla_lbm_generation.py b/python/lbmpy_walberla/walberla_lbm_generation.py
index e3523dc1f..28a73ea73 100644
--- a/python/lbmpy_walberla/walberla_lbm_generation.py
+++ b/python/lbmpy_walberla/walberla_lbm_generation.py
@@ -122,7 +122,7 @@ def __lattice_model(generation_context, class_name, lb_method, stream_collide_as
     generation_context.write_file("{}.cpp".format(class_name), source)
 
 
-def generate_lattice_model(generation_context, class_name, collision_rule, field_layout='fzyx', refinement_scaling=None,
+def generate_lattice_model(generation_context, class_name, collision_rule, field_layout='zyxf', refinement_scaling=None,
                            **create_kernel_params):
 
     # usually a numpy layout is chosen by default i.e. xyzf - which is bad for waLBerla where at least the spatial
@@ -134,13 +134,15 @@ def generate_lattice_model(generation_context, class_name, collision_rule, field
     q = len(lb_method.stencil)
     dim = lb_method.dim
 
+    if field_layout == 'fzyx':
+        create_kernel_params['cpu_vectorize_info']['assume_inner_stride_one'] = True
+    elif field_layout == 'zyxf':
+        create_kernel_params['cpu_vectorize_info']['assume_inner_stride_one'] = False
+
     create_kernel_params = default_create_kernel_parameters(generation_context, create_kernel_params)
     if create_kernel_params['target'] == 'gpu':
         raise ValueError("Lattice Models can only be generated for CPUs. To generate LBM on GPUs use sweeps directly")
 
-    if field_layout != 'fzyx' and create_kernel_params['cpu_vectorize_info']['instruction_set'] != None:
-        raise ValueError("Vectorization is not available for given field layout, use fzyx instead")
-
     src_field = ps.Field.create_generic('pdfs', dim, dtype, index_dimensions=1, layout=field_layout, index_shape=(q,))
     dst_field = ps.Field.create_generic('pdfs_tmp', dim, dtype, index_dimensions=1, layout=field_layout, index_shape=(q,))
 
diff --git a/python/pystencils_walberla/__init__.py b/python/pystencils_walberla/__init__.py
index 88eb440a6..b6c310169 100644
--- a/python/pystencils_walberla/__init__.py
+++ b/python/pystencils_walberla/__init__.py
@@ -2,8 +2,9 @@ from .boundary import generate_staggered_boundary, generate_staggered_flux_bound
 from .cmake_integration import CodeGeneration
 from .codegen import (
     generate_pack_info, generate_pack_info_for_field, generate_pack_info_from_kernel,
-    generate_mpidtype_info_from_kernel, generate_sweep)
+    generate_mpidtype_info_from_kernel, generate_sweep, get_vectorize_instruction_set)
 
 __all__ = ['CodeGeneration',
            'generate_sweep', 'generate_pack_info_from_kernel', 'generate_pack_info_for_field', 'generate_pack_info',
-           'generate_mpidtype_info_from_kernel', 'generate_staggered_boundary', 'generate_staggered_flux_boundary']
+           'generate_mpidtype_info_from_kernel', 'generate_staggered_boundary', 'generate_staggered_flux_boundary',
+           'get_vectorize_instruction_set']
diff --git a/python/pystencils_walberla/codegen.py b/python/pystencils_walberla/codegen.py
index fb665277b..692377ffb 100644
--- a/python/pystencils_walberla/codegen.py
+++ b/python/pystencils_walberla/codegen.py
@@ -13,7 +13,8 @@ from pystencils.stencil import inverse_direction, offset_to_direction_string
 from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env
 
 __all__ = ['generate_sweep', 'generate_pack_info', 'generate_pack_info_for_field', 'generate_pack_info_from_kernel',
-           'generate_mpidtype_info_from_kernel', 'default_create_kernel_parameters', 'KernelInfo']
+           'generate_mpidtype_info_from_kernel', 'default_create_kernel_parameters', 'KernelInfo',
+           'get_vectorize_instruction_set']
 
 
 def generate_sweep(generation_context, class_name, assignments,
@@ -322,26 +323,28 @@ class KernelInfo:
         self.parameters = ast.get_parameters()  # cache parameters here
 
 
-def default_create_kernel_parameters(generation_context, params):
-    default_dtype = "float64" if generation_context.double_accuracy else 'float32'
-
+def get_vectorize_instruction_set(generation_context):
     if generation_context.optimize_for_localhost:
         supported_instruction_sets = get_supported_instruction_sets()
         if supported_instruction_sets:
-            default_vec_is = get_supported_instruction_sets()[-1]
+            return get_supported_instruction_sets()[-1]
         else:  # if cpuinfo package is not installed
-            default_vec_is = 'sse'
+            return 'sse'
     else:
-        default_vec_is = None
+        return None
+
+def default_create_kernel_parameters(generation_context, params):
+    default_dtype = "float64" if generation_context.double_accuracy else 'float32'
 
     params['target'] = params.get('target', 'cpu')
     params['data_type'] = params.get('data_type', default_dtype)
     params['cpu_openmp'] = params.get('cpu_openmp', generation_context.openmp)
     params['cpu_vectorize_info'] = params.get('cpu_vectorize_info', {})
 
+    default_vec_is = get_vectorize_instruction_set(generation_context)
     vec = params['cpu_vectorize_info']
     vec['instruction_set'] = vec.get('instruction_set', default_vec_is)
-    vec['assume_inner_stride_one'] = True
+    vec['assume_inner_stride_one'] = vec.get('assume_inner_stride_one', True)
     vec['assume_aligned'] = vec.get('assume_aligned', False)
     vec['nontemporal'] = vec.get('nontemporal', False)
     return params
diff --git a/tests/lbm/CMakeLists.txt b/tests/lbm/CMakeLists.txt
index 4d732d1cf..221d6ff73 100644
--- a/tests/lbm/CMakeLists.txt
+++ b/tests/lbm/CMakeLists.txt
@@ -72,13 +72,23 @@ waLBerla_execute_test( NAME  SuViscoelasticityTest COMMAND $<TARGET_FILE:SuVisco
 
 # Code Generation
 if( WALBERLA_BUILD_WITH_CODEGEN )
+
 waLBerla_generate_target_from_python(NAME LbCodeGenerationExampleGenerated
       FILE codegen/LbCodeGenerationExample.py
       OUT_FILES LbCodeGenerationExample_LatticeModel.cpp LbCodeGenerationExample_LatticeModel.h
       LbCodeGenerationExample_NoSlip.cpp LbCodeGenerationExample_NoSlip.h
       LbCodeGenerationExample_UBB.cpp LbCodeGenerationExample_UBB.h )
 waLBerla_compile_test( FILES codegen/LbCodeGenerationExample.cpp DEPENDS LbCodeGenerationExampleGenerated)
+
 waLBerla_generate_target_from_python(NAME FluctuatingMRTGenerated FILE codegen/FluctuatingMRT.py
                               OUT_FILES FluctuatingMRT_LatticeModel.cpp FluctuatingMRT_LatticeModel.h )
 waLBerla_compile_test( FILES codegen/FluctuatingMRT.cpp DEPENDS FluctuatingMRTGenerated)
+
+waLBerla_generate_target_from_python(NAME FieldLayoutAndVectorizationTestGenerated FILE codegen/FieldLayoutAndVectorizationTest.py
+                                     OUT_FILES FieldLayoutAndVectorizationTest_FZYX_Vec_LatticeModel.cpp FieldLayoutAndVectorizationTest_FZYX_Vec_LatticeModel.h
+                                               FieldLayoutAndVectorizationTest_FZYX_NoVec_LatticeModel.cpp FieldLayoutAndVectorizationTest_FZYX_NoVec_LatticeModel.h
+                                               FieldLayoutAndVectorizationTest_ZYXF_Vec_LatticeModel.cpp FieldLayoutAndVectorizationTest_ZYXF_Vec_LatticeModel.h
+                                               FieldLayoutAndVectorizationTest_ZYXF_NoVec_LatticeModel.cpp FieldLayoutAndVectorizationTest_ZYXF_NoVec_LatticeModel.h )
+waLBerla_compile_test( FILES codegen/FieldLayoutAndVectorizationTest.cpp DEPENDS FieldLayoutAndVectorizationTestGenerated)
+
 endif()
\ No newline at end of file
diff --git a/tests/lbm/codegen/FieldLayoutAndVectorizationTest.cpp b/tests/lbm/codegen/FieldLayoutAndVectorizationTest.cpp
new file mode 100644
index 000000000..09ff983b2
--- /dev/null
+++ b/tests/lbm/codegen/FieldLayoutAndVectorizationTest.cpp
@@ -0,0 +1,127 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file FieldLayoutAndVectorizationTest.cpp
+//! \author Christoph Rettinger <christoph.rettinger@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+
+#include "core/DataTypes.h"
+#include "core/Environment.h"
+#include "core/debug/Debug.h"
+#include "core/debug/TestSubsystem.h"
+#include "core/math/all.h"
+#include "core/logging/all.h"
+
+#include "lbm/field/AddToStorage.h"
+#include "lbm/field/PdfField.h"
+
+#include "FieldLayoutAndVectorizationTest_FZYX_Vec_LatticeModel.h"
+#include "FieldLayoutAndVectorizationTest_FZYX_NoVec_LatticeModel.h"
+#include "FieldLayoutAndVectorizationTest_ZYXF_Vec_LatticeModel.h"
+#include "FieldLayoutAndVectorizationTest_ZYXF_NoVec_LatticeModel.h"
+
+namespace field_layout_vectorization_test {
+
+using namespace walberla;
+
+typedef walberla::uint8_t flag_t;
+typedef FlagField<flag_t> FlagField_T;
+
+template<typename LM1_T, typename LM2_T>
+void checkEquivalence(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID pdfField1ID, BlockDataID pdfField2ID, std::string identifier)
+{
+   for(auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt) {
+      auto pdfField1 = blockIt->getData<lbm::PdfField<LM1_T> >(pdfField1ID);
+      auto pdfField2 = blockIt->getData<lbm::PdfField<LM2_T> >(pdfField2ID);
+
+      WALBERLA_FOR_ALL_CELLS_XYZ(pdfField1,
+            for( uint_t f = 0; f < pdfField1->fSize(); ++f)
+            {
+               WALBERLA_CHECK_FLOAT_EQUAL(pdfField1->get(x,y,z,f), pdfField2->get(x,y,z,f),
+                                          identifier << " - mismatch in " << x << " " << y << " " << z << " " << f);
+            }
+            )
+
+   }
+
+}
+
+int main(int argc, char **argv) {
+
+   debug::enterTestMode();
+
+   mpi::Environment env( argc, argv );
+
+   walberla::Environment walberlaEnv(argc, argv);
+
+   auto blocks = blockforest::createUniformBlockGrid( 1, 1, 1,
+                                                      32, 32, 32, real_t(1),
+                                                      0, false, false,
+                                                      true, true, true, //periodicity
+                                                      false );
+
+
+   real_t omega = real_t(1.6);
+   Vector3<real_t> initialVel(real_t(0.1), real_t(0), real_t(0));
+
+   using LatticeModel1_T = lbm::FieldLayoutAndVectorizationTest_FZYX_Vec_LatticeModel;
+   LatticeModel1_T latticeModel1(omega);
+   BlockDataID pdfField1ID = lbm::addPdfFieldToStorage(blocks, "pdf field1", latticeModel1, initialVel, real_t(1), field::fzyx);
+   for(auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt) {
+      auto sweep = LatticeModel1_T::Sweep(pdfField1ID);
+      sweep(&(*blockIt));
+   }
+
+   using LatticeModel2_T = lbm::FieldLayoutAndVectorizationTest_FZYX_NoVec_LatticeModel;
+   LatticeModel2_T latticeModel2(omega);
+   BlockDataID pdfField2ID = lbm::addPdfFieldToStorage(blocks, "pdf field2", latticeModel2, initialVel, real_t(1), field::fzyx);
+   for(auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt) {
+      auto sweep = LatticeModel2_T::Sweep(pdfField2ID);
+      sweep(&(*blockIt));
+   }
+
+   checkEquivalence<LatticeModel1_T,LatticeModel2_T>(blocks, pdfField1ID, pdfField2ID, "1 vs 2");
+
+   using LatticeModel3_T = lbm::FieldLayoutAndVectorizationTest_ZYXF_Vec_LatticeModel;
+   LatticeModel3_T latticeModel3(omega);
+   BlockDataID pdfField3ID = lbm::addPdfFieldToStorage(blocks, "pdf field3", latticeModel3, initialVel, real_t(1), field::zyxf);
+   for(auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt) {
+      auto sweep = LatticeModel3_T::Sweep(pdfField3ID);
+      sweep(&(*blockIt));
+   }
+
+   checkEquivalence<LatticeModel2_T,LatticeModel3_T>(blocks, pdfField2ID, pdfField3ID,  "2 vs 3");
+
+   using LatticeModel4_T = lbm::FieldLayoutAndVectorizationTest_ZYXF_NoVec_LatticeModel;
+   LatticeModel4_T latticeModel4(omega);
+   BlockDataID pdfField4ID = lbm::addPdfFieldToStorage(blocks, "pdf field4", latticeModel4, initialVel, real_t(1), field::zyxf);
+   for(auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt) {
+      auto sweep = LatticeModel4_T::Sweep(pdfField4ID);
+      sweep(&(*blockIt));
+   }
+
+   checkEquivalence<LatticeModel3_T,LatticeModel4_T>(blocks, pdfField3ID, pdfField4ID,  "3 vs 4");
+
+   return EXIT_SUCCESS;
+}
+
+} //namespace field_layout_vectorization_test
+
+int main( int argc, char **argv ){
+   field_layout_vectorization_test::main(argc, argv);
+}
diff --git a/tests/lbm/codegen/FieldLayoutAndVectorizationTest.py b/tests/lbm/codegen/FieldLayoutAndVectorizationTest.py
new file mode 100644
index 000000000..1bd424e79
--- /dev/null
+++ b/tests/lbm/codegen/FieldLayoutAndVectorizationTest.py
@@ -0,0 +1,24 @@
+import sympy as sp
+from lbmpy.creationfunctions import create_lb_collision_rule
+from pystencils_walberla import CodeGeneration
+from lbmpy_walberla import generate_lattice_model
+from pystencils_walberla import get_vectorize_instruction_set
+
+from collections import namedtuple
+
+with CodeGeneration() as ctx:
+    omega_shear = sp.symbols("omega")
+    collision_rule = create_lb_collision_rule( stencil='D2Q9', compressible=False, method='srt' )
+
+    SetupDefinition = namedtuple('SetupDefinition', ['name', 'field_layout', 'vectorization_dict'])
+
+    default_vectorize_instruction_set = get_vectorize_instruction_set(ctx)
+
+    configurations = [SetupDefinition('FZYX_Vec',   'fzyx', {'instruction_set': default_vectorize_instruction_set} ),
+                      SetupDefinition('FZYX_NoVec', 'fzyx', {'instruction_set': None} ),
+                      SetupDefinition('ZYXF_Vec',   'zyxf', {'instruction_set': default_vectorize_instruction_set} ), # does/should not vectorize, but instead yield warning
+                      SetupDefinition('ZYXF_NoVec', 'zyxf', {'instruction_set': None} )]
+
+    for conf in configurations:
+        generate_lattice_model(ctx, 'FieldLayoutAndVectorizationTest_'+conf.name+'_LatticeModel', collision_rule,
+                               field_layout=conf.field_layout, refinement_scaling=None, cpu_vectorize_info=conf.vectorization_dict)
-- 
GitLab