From bcd2d628663a756186ca4cfa6bab68731d597610 Mon Sep 17 00:00:00 2001
From: Michael Kuron <m.kuron@gmx.de>
Date: Thu, 1 Apr 2021 22:41:17 +0200
Subject: [PATCH] Allow fields to be aligned to cacheline size

---
 pystencils/alignedarray.py                    | 12 +++++++--
 pystencils/backends/cbackend.py               |  4 +--
 pystencils/backends/simd_instruction_sets.py  | 25 +++++++++++++++++++
 pystencils_tests/test_vectorization.py        |  6 +----
 .../test_vectorization_specific.py            | 17 +++++++++++--
 5 files changed, 53 insertions(+), 11 deletions(-)

diff --git a/pystencils/alignedarray.py b/pystencils/alignedarray.py
index 70271b0c0..eda9fcaeb 100644
--- a/pystencils/alignedarray.py
+++ b/pystencils/alignedarray.py
@@ -10,20 +10,28 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o
         shape: size of the array
         byte_alignment: alignment in bytes, for the start address of the array holds (a % byte_alignment) == 0
                         By default, use the maximum required by the CPU (or 512 bits if this cannot be detected).
+                        When 'cacheline' is specified, the size of a cache line is used.
         dtype: numpy data type
         byte_offset: offset in bytes for position that should be aligned i.e. (a+byte_offset) % byte_alignment == 0
                     typically used to align first inner cell instead of ghost layer
         order: storage linearization order
         align_inner_coordinate: if True, the start of the innermost coordinate lines are aligned as well
     """
-    if byte_alignment is True:
-        from pystencils.backends.simd_instruction_sets import (get_supported_instruction_sets,
+    if byte_alignment is True or byte_alignment == 'cacheline':
+        from pystencils.backends.simd_instruction_sets import (get_supported_instruction_sets, get_cacheline_size,
                                                                get_vector_instruction_set)
 
         type_name = BasicType.numpy_name_to_c(np.dtype(dtype).name)
         instruction_sets = get_supported_instruction_sets()
         if instruction_sets is None:
             byte_alignment = 64
+        elif byte_alignment == 'cacheline':
+            cacheline_sizes = [get_cacheline_size(is_name) for is_name in instruction_sets]
+            if all([s is None for s in cacheline_sizes]):
+                byte_alignment = max([get_vector_instruction_set(type_name, is_name)['width'] * np.dtype(dtype).itemsize
+                                      for is_name in instruction_sets])
+            else:
+                byte_alignment = max([s for s in cacheline_sizes if s is not None])
         else:
             byte_alignment = max([get_vector_instruction_set(type_name, is_name)['width'] * np.dtype(dtype).itemsize
                                   for is_name in instruction_sets])
diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py
index a924b67fd..9ff44bbef 100644
--- a/pystencils/backends/cbackend.py
+++ b/pystencils/backends/cbackend.py
@@ -317,7 +317,7 @@ class CBackend:
         if self._vector_instruction_set:
             align = self._vector_instruction_set['bytes']
         else:
-            align = node.symbol.dtype.base_type.numpy_dtype.type(0).nbytes
+            align = node.symbol.dtype.base_type.numpy_dtype.itemsize
 
         np_dtype = node.symbol.dtype.base_type.numpy_dtype
         required_size = np_dtype.itemsize * node.size + align
@@ -341,7 +341,7 @@ class CBackend:
         if self._vector_instruction_set:
             align = self._vector_instruction_set['bytes']
         else:
-            align = node.symbol.dtype.base_type.numpy_dtype.type(0).nbytes
+            align = node.symbol.dtype.base_type.numpy_dtype.itemsize
 
         code = "#if defined(_MSC_VER)\n"
         code += "_aligned_free(%s - %d);\n" % (self.sympy_printer.doprint(node.symbol.name), node.offset(align))
diff --git a/pystencils/backends/simd_instruction_sets.py b/pystencils/backends/simd_instruction_sets.py
index 850f8ff6d..9469dc59e 100644
--- a/pystencils/backends/simd_instruction_sets.py
+++ b/pystencils/backends/simd_instruction_sets.py
@@ -15,6 +15,7 @@ def get_vector_instruction_set(data_type='double', instruction_set='avx'):
 
 
 _cache = None
+_cachelinesize = None
 
 
 def get_supported_instruction_sets():
@@ -56,3 +57,27 @@ def get_supported_instruction_sets():
     if flags.issuperset(required_neon_flags):
         result.append("neon")
     return result
+
+
+def get_cacheline_size(instruction_set):
+    """Get the size (in bytes) of a cache block that can be zeroed without memory access.
+       Usually, this is identical to the cache line size."""
+    global _cachelinesize
+
+    instruction_sets = get_vector_instruction_set('double', instruction_set)
+    if 'cachelineSize' not in instruction_sets:
+        return None
+    if _cachelinesize is not None:
+        return _cachelinesize
+
+    import pystencils as ps
+    import numpy as np
+    
+    arr = np.zeros((1, 1), dtype=np.float32)
+    f = ps.Field.create_from_numpy_array('f', arr, index_dimensions=0)
+    ass = [ps.astnodes.CachelineSize(), ps.Assignment(f.center, ps.astnodes.CachelineSize.symbol)]
+    ast = ps.create_kernel(ass, cpu_vectorize_info={'instruction_set': instruction_set})
+    kernel = ast.compile()
+    kernel(**{f.name: arr, ps.astnodes.CachelineSize.symbol.name: 0})
+    _cachelinesize = int(arr[0, 0])
+    return _cachelinesize
diff --git a/pystencils_tests/test_vectorization.py b/pystencils_tests/test_vectorization.py
index 59c85a425..783b9bb34 100644
--- a/pystencils_tests/test_vectorization.py
+++ b/pystencils_tests/test_vectorization.py
@@ -38,12 +38,8 @@ def test_aligned_and_nt_stores(openmp=False):
     # create a datahandling object
     dh = ps.create_data_handling(domain_size, periodicity=(True, True), parallel=False, default_target='cpu')
 
-    if openmp:
-        alignment = 128 if instruction_set == 'vsx' else 64 if instruction_set == 'neon' else True
-    else:
-        alignment = True
-
     # fields
+    alignment = 'cacheline' if openmp else True
     g = dh.add_array("g", values_per_cell=1, alignment=alignment)
     dh.fill("g", 1.0, ghost_layers=True)
     f = dh.add_array("f", values_per_cell=1, alignment=alignment)
diff --git a/pystencils_tests/test_vectorization_specific.py b/pystencils_tests/test_vectorization_specific.py
index 4476e5bf4..fca50949e 100644
--- a/pystencils_tests/test_vectorization_specific.py
+++ b/pystencils_tests/test_vectorization_specific.py
@@ -4,7 +4,8 @@ import numpy as np
 import sympy as sp
 
 import pystencils as ps
-from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
+from pystencils.backends.simd_instruction_sets import (get_cacheline_size, get_supported_instruction_sets,
+                                                       get_vector_instruction_set)
 from pystencils.data_types import cast_func, VectorType
 
 supported_instruction_sets = get_supported_instruction_sets() if get_supported_instruction_sets() else []
@@ -76,4 +77,16 @@ def test_alignment_and_correct_ghost_layers(gl_field, gl_kernel, instruction_set
         with pytest.raises(ValueError):
             dh.run_kernel(kernel)
     else:
-        dh.run_kernel(kernel)
\ No newline at end of file
+        dh.run_kernel(kernel)
+
+
+@pytest.mark.parametrize('instruction_set', supported_instruction_sets)
+def test_cacheline_size(instruction_set):
+    cacheline_size = get_cacheline_size(instruction_set)
+    if cacheline_size is None:
+        pytest.skip()
+    instruction_set = get_vector_instruction_set('double', instruction_set)
+    vector_size = instruction_set['bytes']
+    assert cacheline_size > 8 and cacheline_size < 0x100000, "Cache line size is implausible"
+    assert cacheline_size % vector_size == 0, "Cache line size should be multiple of vector size"
+    assert cacheline_size & (cacheline_size - 1) == 0, "Cache line size is not a power of 2"
-- 
GitLab