From c331d24fe78b19f7865ce4eb62c4efdbe0d52239 Mon Sep 17 00:00:00 2001
From: Michael Kuron <m.kuron@gmx.de>
Date: Wed, 21 Apr 2021 19:50:27 +0200
Subject: [PATCH] maskStore improvements

- fix the aligned version (its instruction-set key is now 'maskStoreA', parallel to 'maskStoreU')
- make sure the boundaries of the masked region in the test case do not line up with multiples of the vector width
- implement a fallback (load, blendv, store) for instruction sets that have no native masked-store instruction
---
 pystencils/backends/cbackend.py             |  6 +++++-
 pystencils/backends/x86_instruction_sets.py |  2 +-
 pystencils_tests/test_conditional_vec.py    | 12 ++++++------
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py
index 988a8e518..670588468 100644
--- a/pystencils/backends/cbackend.py
+++ b/pystencils/backends/cbackend.py
@@ -261,7 +261,11 @@ class CBackend:
                 if aligned:
                     instr = 'stream' if nontemporal and 'stream' in self._vector_instruction_set else 'storeA'
                 if mask != True:  # NOQA
-                    instr = 'maskStore' if aligned else 'maskStoreU'
+                    instr = 'maskStoreA' if aligned else 'maskStoreU'
+                    if instr not in self._vector_instruction_set:
+                        self._vector_instruction_set[instr] = self._vector_instruction_set['store' + instr[-1]].format(
+                            '{0}', self._vector_instruction_set['blendv'].format(
+                                self._vector_instruction_set['load' + instr[-1]].format('{0}'), '{1}', '{2}'))
                     printed_mask = self.sympy_printer.doprint(mask)
                     if data_type.base_type.base_name == 'double':
                         if self._vector_instruction_set['double'] == '__m256d':
diff --git a/pystencils/backends/x86_instruction_sets.py b/pystencils/backends/x86_instruction_sets.py
index 50005c5ae..5cf049415 100644
--- a/pystencils/backends/x86_instruction_sets.py
+++ b/pystencils/backends/x86_instruction_sets.py
@@ -57,7 +57,7 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
         'storeU': 'storeu[0,1]',
         'storeA': 'store[0,1]',
         'stream': 'stream[0,1]',
-        'maskStore': 'mask_store[0, 2, 1]' if instruction_set == 'avx512' else 'maskstore[0, 2, 1]',
+        'maskStoreA': 'mask_store[0, 2, 1]' if instruction_set == 'avx512' else 'maskstore[0, 2, 1]',
         'maskStoreU': 'mask_storeu[0, 2, 1]' if instruction_set == 'avx512' else 'maskstore[0, 2, 1]',
     }
 
diff --git a/pystencils_tests/test_conditional_vec.py b/pystencils_tests/test_conditional_vec.py
index 1274aa674..959e20c2b 100644
--- a/pystencils_tests/test_conditional_vec.py
+++ b/pystencils_tests/test_conditional_vec.py
@@ -80,10 +80,8 @@ def test_boolean_before_loop():
 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
 @pytest.mark.parametrize('dtype', ('float', 'double'))
 def test_vec_maskstore(instruction_set, dtype):
-    if instruction_set in ['neon', 'vsx']:
-        pytest.skip('no mask-store instructions available')
     data_arr = np.zeros((16, 16), dtype=np.float64 if dtype == 'double' else np.float32)
-    data_arr[4:-4, 4:-4] = 1.0
+    data_arr[3:-3, 3:-3] = 1.0
     data = ps.fields(f"data: {dtype}[2D]", data=data_arr)
 
     c = [
@@ -93,8 +91,10 @@ def test_vec_maskstore(instruction_set, dtype):
     ]
     ast = ps.create_kernel(c, target='cpu',
                            cpu_vectorize_info={'instruction_set': instruction_set})
-    ps.show_code(ast)
     kernel = ast.compile()
     kernel(data=data_arr)
-    np.testing.assert_equal(data_arr[0:4, :], 2.0)
-    np.testing.assert_equal(data_arr[4:-4, 4:-4], 1.0)
+    np.testing.assert_equal(data_arr[:3, :], 2.0)
+    np.testing.assert_equal(data_arr[-3:, :], 2.0)
+    np.testing.assert_equal(data_arr[:, :3], 2.0)
+    np.testing.assert_equal(data_arr[:, -3:], 2.0)
+    np.testing.assert_equal(data_arr[3:-3, 3:-3], 1.0)
-- 
GitLab