From 686a3ad8300540ccd41b219ea9fa14445406402b Mon Sep 17 00:00:00 2001
From: Michael Kuron <mkuron@icp.uni-stuttgart.de>
Date: Tue, 20 Apr 2021 20:06:30 +0200
Subject: [PATCH] Vectorization tests: run with all available instruction sets,
 add test for maskStore

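On x86, the masked-store intrinsics take the mask as an integer vector
(__m256i/__m128i), while pystencils generates the mask as the result of
a floating-point comparison, so the backend has to emit a cast (e.g.
_mm256_castpd_si256) around the mask argument. The previous check
looked up 'dataTypePrefix' and compared it against the misspelled type
name '__mm256d', so it never matched, and it only handled the
double/AVX case. It now covers float and double at both AVX and SSE
vector widths.

The maskStore/maskStoreU entries of the x86 instruction set dictionary
are folded into the base_names table instead of being patched in
afterwards; the maskLoad/maskLoadU entries are dropped.

The tests in test_vectorization.py take the instruction set as a
default argument so that they keep working without pytest, and
test_vectorization_specific.py re-runs them for every other available
instruction set. A new test exercises maskStore for float and double,
and test_alignment_and_correct_ghost_layers requests automatic
alignment (alignment=True) instead of computing it from the vector
width by hand.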
---
 pystencils/backends/cbackend.py               | 12 +++++++--
 pystencils/backends/x86_instruction_sets.py   | 18 ++-----------
 pystencils_tests/test_conditional_vec.py      | 23 ++++++++++++++++
 pystencils_tests/test_vectorization.py        | 26 +++++++++----------
 .../test_vectorization_specific.py            | 14 +++++++---
 5 files changed, 58 insertions(+), 35 deletions(-)

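Note for reviewers: the cast added in cbackend.py is needed because
_mm256_maskstore_pd expects its mask as a __m256i, while _mm256_cmp_pd
produces a __m256d, and the compiler rejects passing one for the other.
A minimal hand-written sketch of the pattern the generated code now
follows (the function name and the assumption that n is a multiple of 4
are illustrative only):

    #include <immintrin.h>

    /* Set every element below 1.0 to 2.0, four doubles at a time (AVX). */
    void set_where_less_than_one(double *data, long n)
    {
        const __m256d ones = _mm256_set1_pd(1.0);
        const __m256d twos = _mm256_set1_pd(2.0);
        for (long i = 0; i < n; i += 4) {
            __m256d v = _mm256_loadu_pd(&data[i]);
            /* all-ones lanes where data[i] < 1.0, but typed __m256d */
            __m256d mask = _mm256_cmp_pd(v, ones, _CMP_LT_OQ);
            /* reinterpret (a no-op cast) to the __m256i the store expects */
            _mm256_maskstore_pd(&data[i], _mm256_castpd_si256(mask), twos);
        }
    }

This is the same operation that test_vec_maskstore in
test_conditional_vec.py checks through the pystencils code path.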
diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py
index 5aabd83d6..988a8e518 100644
--- a/pystencils/backends/cbackend.py
+++ b/pystencils/backends/cbackend.py
@@ -263,8 +263,16 @@ class CBackend:
                 if mask != True:  # NOQA
                     instr = 'maskStore' if aligned else 'maskStoreU'
                     printed_mask = self.sympy_printer.doprint(mask)
-                    if self._vector_instruction_set['dataTypePrefix']['double'] == '__mm256d':
-                        printed_mask = f"_mm256_castpd_si256({printed_mask})"
+                    if data_type.base_type.base_name == 'double':
+                        if self._vector_instruction_set['double'] == '__m256d':
+                            printed_mask = f"_mm256_castpd_si256({printed_mask})"
+                        elif self._vector_instruction_set['double'] == '__m128d':
+                            printed_mask = f"_mm_castpd_si128({printed_mask})"
+                    elif data_type.base_type.base_name == 'float':
+                        if self._vector_instruction_set['float'] == '__m256':
+                            printed_mask = f"_mm256_castps_si256({printed_mask})"
+                        elif self._vector_instruction_set['float'] == '__m128':
+                            printed_mask = f"_mm_castps_si128({printed_mask})"
 
                 rhs_type = get_type_of_expression(node.rhs)
                 if type(rhs_type) is not VectorType:
diff --git a/pystencils/backends/x86_instruction_sets.py b/pystencils/backends/x86_instruction_sets.py
index 78515e1a2..50005c5ae 100644
--- a/pystencils/backends/x86_instruction_sets.py
+++ b/pystencils/backends/x86_instruction_sets.py
@@ -57,23 +57,9 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
         'storeU': 'storeu[0,1]',
         'storeA': 'store[0,1]',
         'stream': 'stream[0,1]',
-        'maskstore': 'mask_store[0, 2, 1]' if instruction_set == 'avx512' else 'maskstore[0, 2, 1]',
-        'maskload': 'mask_load[0, 2, 1]' if instruction_set == 'avx512' else 'maskload[0, 2, 1]'
+        'maskStore': 'mask_store[0, 2, 1]' if instruction_set == 'avx512' else 'maskstore[0, 2, 1]',
+        'maskStoreU': 'mask_storeu[0, 2, 1]' if instruction_set == 'avx512' else 'maskstore[0, 2, 1]',
     }
-    if instruction_set == 'avx512':
-        base_names.update({
-            'maskStore': 'mask_store[0, 2, 1]',
-            'maskStoreU': 'mask_storeu[0, 2, 1]',
-            'maskLoad': 'mask_load[2, 1, 0]',
-            'maskLoadU': 'mask_loadu[2, 1, 0]'
-        })
-    if instruction_set == 'avx':
-        base_names.update({
-            'maskStore': 'maskstore[0, 2, 1]',
-            'maskStoreU': 'maskstore[0, 2, 1]',
-            'maskLoad': 'maskload[0, 1]',
-            'maskLoadU': 'maskloadu[0, 1]'
-        })
 
     for comparison_op, constant in comparisons.items():
         base_names[comparison_op] = f'cmp[0, 1, {constant}]'
diff --git a/pystencils_tests/test_conditional_vec.py b/pystencils_tests/test_conditional_vec.py
index 59f6367cd..1274aa674 100644
--- a/pystencils_tests/test_conditional_vec.py
+++ b/pystencils_tests/test_conditional_vec.py
@@ -75,3 +75,26 @@ def test_boolean_before_loop():
     np.testing.assert_array_equal(g_arr, 1.0)
     kernel(f=f_arr, g=g_arr, t2=-1.0)
     np.testing.assert_array_equal(g_arr, 42.0)
+
+
+@pytest.mark.parametrize('instruction_set', supported_instruction_sets)
+@pytest.mark.parametrize('dtype', ('float', 'double'))
+def test_vec_maskstore(instruction_set, dtype):
+    if instruction_set in ['neon', 'vsx']:
+        pytest.skip('no mask-store instructions available')
+    data_arr = np.zeros((16, 16), dtype=np.float64 if dtype == 'double' else np.float32)
+    data_arr[4:-4, 4:-4] = 1.0
+    data = ps.fields(f"data: {dtype}[2D]", data=data_arr)
+
+    c = [
+        Conditional(data.center() < 1.0, Block([
+            ps.Assignment(data.center(), 2.0)
+        ]))
+    ]
+    ast = ps.create_kernel(c, target='cpu',
+                           cpu_vectorize_info={'instruction_set': instruction_set})
+    ps.show_code(ast)
+    kernel = ast.compile()
+    kernel(data=data_arr)
+    np.testing.assert_equal(data_arr[0:4, :], 2.0)
+    np.testing.assert_equal(data_arr[4:-4, 4:-4], 1.0)
diff --git a/pystencils_tests/test_vectorization.py b/pystencils_tests/test_vectorization.py
index f668c6b81..b7ee2e83b 100644
--- a/pystencils_tests/test_vectorization.py
+++ b/pystencils_tests/test_vectorization.py
@@ -14,7 +14,7 @@ else:
     instruction_set = None
 
 
-def test_vector_type_propagation():
+def test_vector_type_propagation(instruction_set=instruction_set):
     a, b, c, d, e = sp.symbols("a b c d e")
     arr = np.ones((2 ** 2 + 2, 2 ** 3 + 2))
     arr *= 10.0
@@ -33,7 +33,7 @@ def test_vector_type_propagation():
     np.testing.assert_equal(dst[1:-1, 1:-1], 2 * 10.0 + 3)
 
 
-def test_aligned_and_nt_stores(openmp=False):
+def test_aligned_and_nt_stores(instruction_set=instruction_set, openmp=False):
     domain_size = (24, 24)
     # create a datahandling object
     dh = ps.create_data_handling(domain_size, periodicity=(True, True), parallel=False, default_target='cpu')
@@ -63,11 +63,11 @@ def test_aligned_and_nt_stores(openmp=False):
     dh.run_kernel(kernel)
     np.testing.assert_equal(np.sum(dh.cpu_arrays['f']), np.prod(domain_size))
 
-def test_aligned_and_nt_stores_openmp():
-    test_aligned_and_nt_stores(True)
+def test_aligned_and_nt_stores_openmp(instruction_set=instruction_set):
+    test_aligned_and_nt_stores(instruction_set, True)
 
 
-def test_inplace_update():
+def test_inplace_update(instruction_set=instruction_set):
     shape = (9, 9, 3)
     arr = np.ones(shape, order='f')
 
@@ -88,7 +88,7 @@ def test_inplace_update():
     np.testing.assert_equal(arr, 2)
 
 
-def test_vectorization_fixed_size():
+def test_vectorization_fixed_size(instruction_set=instruction_set):
     configurations = []
     # Fixed size - multiple of four
     arr = np.ones((20 + 2, 24 + 2)) * 5.0
@@ -115,7 +115,7 @@ def test_vectorization_fixed_size():
         np.testing.assert_equal(dst[1:-1, 1:-1], 5 * 5.0 + 42.0)
 
 
-def test_vectorization_variable_size():
+def test_vectorization_variable_size(instruction_set=instruction_set):
     f, g = ps.fields("f, g : double[2D]")
     update_rule = [ps.Assignment(g[0, 0], f[0, 0] + f[-1, 0] + f[1, 0] + f[0, 1] + f[0, -1] + 42.0)]
     ast = ps.create_kernel(update_rule)
@@ -131,7 +131,7 @@ def test_vectorization_variable_size():
     np.testing.assert_equal(dst[1:-1, 1:-1], 5 * 5.0 + 42.0)
 
 
-def test_piecewise1():
+def test_piecewise1(instruction_set=instruction_set):
     a, b, c, d, e = sp.symbols("a b c d e")
     arr = np.ones((2 ** 3 + 2, 2 ** 4 + 2)) * 5.0
 
@@ -149,7 +149,7 @@ def test_piecewise1():
     np.testing.assert_equal(dst[1:-1, 1:-1], 5 + 3 + 5.0)
 
 
-def test_piecewise2():
+def test_piecewise2(instruction_set=instruction_set):
     arr = np.zeros((20, 20))
 
     @ps.kernel
@@ -167,7 +167,7 @@ def test_piecewise2():
     np.testing.assert_equal(arr, np.ones_like(arr))
 
 
-def test_piecewise3():
+def test_piecewise3(instruction_set=instruction_set):
     arr = np.zeros((22, 22))
 
     @ps.kernel
@@ -181,7 +181,7 @@ def test_piecewise3():
     ast.compile()
 
 
-def test_logical_operators():
+def test_logical_operators(instruction_set=instruction_set):
     arr = np.zeros((22, 22))
 
     @ps.kernel
@@ -220,7 +220,7 @@ def test_hardware_query():
            any([iset.startswith('sve') for iset in supported_instruction_sets])
 
 
-def test_vectorised_pow():
+def test_vectorised_pow(instruction_set=instruction_set):
     arr = np.zeros((24, 24))
     f, g = ps.fields(f=arr, g=arr)
 
@@ -256,7 +256,7 @@ def test_vectorised_pow():
     ast.compile()
 
 
-def test_vectorised_fast_approximations():
+def test_vectorised_fast_approximations(instruction_set=instruction_set):
     arr = np.zeros((24, 24))
     f, g = ps.fields(f=arr, g=arr)
 
diff --git a/pystencils_tests/test_vectorization_specific.py b/pystencils_tests/test_vectorization_specific.py
index fca50949e..df0b9d943 100644
--- a/pystencils_tests/test_vectorization_specific.py
+++ b/pystencils_tests/test_vectorization_specific.py
@@ -57,15 +57,13 @@ def test_vectorized_abs(instruction_set, dtype):
 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
 @pytest.mark.parametrize('gl_field, gl_kernel', [(1, 0), (0, 1), (1, 1)])
 def test_alignment_and_correct_ghost_layers(gl_field, gl_kernel, instruction_set, dtype):
-    itemsize = 8 if dtype == 'double' else 4
-    alignment = get_vector_instruction_set(dtype, instruction_set)['width'] * itemsize
     dtype = np.float64 if dtype == 'double' else np.float32
 
     domain_size = (128, 128)
     dh = ps.create_data_handling(domain_size, periodicity=(True, True), default_target='cpu')
-    src = dh.add_array("src", values_per_cell=1, dtype=dtype, ghost_layers=gl_field, alignment=alignment)
+    src = dh.add_array("src", values_per_cell=1, dtype=dtype, ghost_layers=gl_field, alignment=True)
     dh.fill(src.name, 1.0, ghost_layers=True)
-    dst = dh.add_array("dst", values_per_cell=1, dtype=dtype, ghost_layers=gl_field, alignment=alignment)
+    dst = dh.add_array("dst", values_per_cell=1, dtype=dtype, ghost_layers=gl_field, alignment=True)
     dh.fill(dst.name, 1.0, ghost_layers=True)
 
     update_rule = ps.Assignment(dst[0, 0], src[0, 0])
@@ -90,3 +88,11 @@ def test_cacheline_size(instruction_set):
     assert cacheline_size > 8 and cacheline_size < 0x100000, "Cache line size is implausible"
     assert cacheline_size % vector_size == 0, "Cache line size should be multiple of vector size"
     assert cacheline_size & (cacheline_size - 1) == 0, "Cache line size is not a power of 2"
+
+
+# The tests in test_vectorization.py are deliberately not parametrized so they can run without pytest; parametrize them over all remaining instruction sets here.
+from pystencils_tests import test_vectorization
+@pytest.mark.parametrize('instruction_set', set(supported_instruction_sets) - {test_vectorization.instruction_set})
+@pytest.mark.parametrize('function', [f for f in test_vectorization.__dict__ if f.startswith('test_') and f != 'test_hardware_query'])
+def test_vectorization_other(instruction_set, function):
+    test_vectorization.__dict__[function](instruction_set)
-- 
GitLab