Compare revisions: changes are shown as if the source revision was being merged into the target revision (1210 additions, 263 deletions).
# cython: language_level=3str

import cython
@@ -22,20 +19,37 @@ def create_boundary_neighbor_index_list_2d(object[IntegerType, ndim=2] flag_field,
    cdef int xs, ys, x, y
    cdef int dirIdx, num_directions, dx, dy
    cdef int sum_x, sum_y
    cdef float dot, maxn
    cdef int calculated_idx
    xs, ys = flag_field.shape
    boundary_index_list = []
    num_directions = stencil.shape[0]
    for y in range(nr_of_ghost_layers, ys - nr_of_ghost_layers):
        for x in range(nr_of_ghost_layers, xs - nr_of_ghost_layers):
            sum_x = 0; sum_y = 0
            if flag_field[x, y] & fluid_mask:
                for dirIdx in range(num_directions):
                    dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]
                    if flag_field[x + dx, y + dy] & boundary_mask:
                        if single_link:
                            sum_x += dx; sum_y += dy
                        else:
                            boundary_index_list.append((x, y, dirIdx))

            dot = 0; maxn = 0; calculated_idx = 0
            if single_link and (sum_x != 0 or sum_y != 0):
                for dirIdx in range(num_directions):
                    dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]
                    dot = dx * sum_x + dy * sum_y
                    if dot > maxn:
                        maxn = dot
                        calculated_idx = dirIdx
                boundary_index_list.append((x, y, calculated_idx))

    return boundary_index_list
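
# A minimal pure-Python sketch of the single_link selection used above (names are
# illustrative, not part of the module): the offsets of all boundary neighbours are
# accumulated into one combined direction, and the stencil entry with the largest dot
# product against that sum is the single link that gets stored.
def pick_single_link_direction(stencil, boundary_offsets):
    sum_x = sum(dx for dx, dy in boundary_offsets)
    sum_y = sum(dy for dx, dy in boundary_offsets)
    best_idx, best_dot = 0, 0
    for idx, (dx, dy) in enumerate(stencil):
        dot = dx * sum_x + dy * sum_y
        if dot > best_dot:
            best_dot = dot
            best_idx = idx
    return best_idx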
@@ -47,6 +61,10 @@ def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_field,
    cdef int xs, ys, zs, x, y, z
    cdef int dirIdx, num_directions, dx, dy, dz
    cdef int sum_x, sum_y, sum_z
    cdef float dot, maxn
    cdef int calculated_idx
    xs, ys, zs = flag_field.shape
    boundary_index_list = []
    num_directions = stencil.shape[0]
@@ -54,15 +72,27 @@ def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_field,
    for z in range(nr_of_ghost_layers, zs - nr_of_ghost_layers):
        for y in range(nr_of_ghost_layers, ys - nr_of_ghost_layers):
            for x in range(nr_of_ghost_layers, xs - nr_of_ghost_layers):
                sum_x = 0; sum_y = 0; sum_z = 0
                if flag_field[x, y, z] & fluid_mask:
                    for dirIdx in range(num_directions):
                        dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
                        if flag_field[x + dx, y + dy, z + dz] & boundary_mask:
                            if single_link:
                                sum_x += dx; sum_y += dy; sum_z += dz
                            else:
                                boundary_index_list.append((x, y, z, dirIdx))

                dot = 0; maxn = 0; calculated_idx = 0
                if single_link and (sum_x != 0 or sum_y != 0 or sum_z != 0):
                    for dirIdx in range(num_directions):
                        dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
                        dot = dx * sum_x + dy * sum_y + dz * sum_z
                        if dot > maxn:
                            maxn = dot
                            calculated_idx = dirIdx
                    boundary_index_list.append((x, y, z, calculated_idx))

    return boundary_index_list
@@ -75,21 +105,39 @@ def create_boundary_cell_index_list_2d(object[IntegerType, ndim=2] flag_field,
    cdef int xs, ys, x, y
    cdef int dirIdx, num_directions, dx, dy
    cdef int sum_x, sum_y
    cdef float dot, maxn
    cdef int calculated_idx
    xs, ys = flag_field.shape
    boundary_index_list = []
    num_directions = stencil.shape[0]
    for y in range(0, ys):
        for x in range(0, xs):
            sum_x = 0; sum_y = 0
            if flag_field[x, y] & boundary_mask:
                for dirIdx in range(num_directions):
                    dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]
                    if 0 <= x + dx < xs and 0 <= y + dy < ys:
                        if flag_field[x + dx, y + dy] & fluid_mask:
                            if single_link:
                                sum_x += dx; sum_y += dy
                            else:
                                boundary_index_list.append((x, y, dirIdx))

            dot = 0; maxn = 0; calculated_idx = 0
            if single_link and (sum_x != 0 or sum_y != 0):
                for dirIdx in range(num_directions):
                    dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]
                    dot = dx * sum_x + dy * sum_y
                    if dot > maxn:
                        maxn = dot
                        calculated_idx = dirIdx
                boundary_index_list.append((x, y, calculated_idx))

    return boundary_index_list
@@ -101,6 +149,10 @@ def create_boundary_cell_index_list_3d(object[IntegerType, ndim=3] flag_field,
    cdef int xs, ys, zs, x, y, z
    cdef int dirIdx, num_directions, dx, dy, dz
    cdef int sum_x, sum_y, sum_z
    cdef float dot, maxn
    cdef int calculated_idx
    xs, ys, zs = flag_field.shape
    boundary_index_list = []
    num_directions = stencil.shape[0]
@@ -108,14 +160,27 @@ def create_boundary_cell_index_list_3d(object[IntegerType, ndim=3] flag_field,
    for z in range(0, zs):
        for y in range(0, ys):
            for x in range(0, xs):
                sum_x = 0; sum_y = 0; sum_z = 0
                if flag_field[x, y, z] & boundary_mask:
                    for dirIdx in range(num_directions):
                        dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
                        if 0 <= x + dx < xs and 0 <= y + dy < ys and 0 <= z + dz < zs:
                            if flag_field[x + dx, y + dy, z + dz] & fluid_mask:
                                if single_link:
                                    sum_x += dx; sum_y += dy; sum_z += dz
                                else:
                                    boundary_index_list.append((x, y, z, dirIdx))

                dot = 0; maxn = 0; calculated_idx = 0
                if single_link and (sum_x != 0 or sum_y != 0 or sum_z != 0):
                    for dirIdx in range(num_directions):
                        dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
                        dot = dx * sum_x + dy * sum_y + dz * sum_z
                        if dot > maxn:
                            maxn = dot
                            calculated_idx = dirIdx
                    boundary_index_list.append((x, y, z, calculated_idx))

    return boundary_index_list
import sympy as sp
from pystencils.boundaries.boundaryhandling import DEFAULT_FLAG_TYPE
from pystencils.typing import TypedSymbol, create_type
from pystencils.field import Field
from pystencils.integer_functions import bitwise_and
import os
from collections.abc import Hashable
from functools import partial, wraps
from itertools import chain

from functools import lru_cache as memorycache

from joblib import Memory
from appdirs import user_cache_dir

if 'PYSTENCILS_CACHE_DIR' in os.environ:
    cache_dir = os.environ['PYSTENCILS_CACHE_DIR']
else:
    cache_dir = user_cache_dir('pystencils')
disk_cache = Memory(cache_dir, verbose=False).cache
disk_cache_no_fallback = disk_cache


def _wrapper(wrapped_func, cached_func, *args, **kwargs):
    if all(isinstance(a, Hashable) for a in chain(args, kwargs.values())):
        return cached_func(*args, **kwargs)
    else:
        return wrapped_func(*args, **kwargs)


def memorycache_if_hashable(maxsize=128, typed=False):
    def wrapper(func):
        return partial(_wrapper, func, memorycache(maxsize, typed)(func))
    return wrapper
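

# Usage sketch (assumes only the decorator defined above; the function is illustrative):
# hashable arguments are served through functools.lru_cache, while unhashable arguments
# silently fall back to an uncached call instead of raising a TypeError.
@memorycache_if_hashable(maxsize=64)
def double(value):
    return value * 2

double(21)        # hashable: cached via lru_cache
double([1, 2])    # unhashable: dispatched to the plain, uncached function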
def sharedmethodcache(cache_id: str):
    """Decorator for memoization of instance methods, allowing multiple methods to use the same cache.

    This decorator caches results of instance methods per instantiated object of the surrounding class.
    It allows multiple methods to use the same cache, by passing them the same `cache_id` string.
    Cached values are stored in a dictionary, which is added as a member `self.<cache_id>` to the
    `self` object instance. Make sure that this doesn't cause any naming conflicts with other members!
    Of course, for this to be useful, said methods must have the same signature (up to additional kwargs)
    and must return the same result when called with the same arguments."""
    def _decorator(user_method):
        @wraps(user_method)
        def _decorated_func(self, *args, **kwargs):
            objdict = self.__dict__
            cache = objdict.setdefault(cache_id, dict())

            key = args
            for item in kwargs.items():
                key += item

            if key not in cache:
                result = user_method(self, *args, **kwargs)
                cache[key] = result
                return result
            else:
                return cache[key]
        return _decorated_func
    return _decorator
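

# Usage sketch (assumes only the decorator defined above; the class is illustrative):
# both methods share the per-instance dict `self._solve_cache`, keyed by positional
# arguments plus kwargs items, so a result computed by one method is reused by the other.
class Solver:
    @sharedmethodcache("_solve_cache")
    def solve(self, n):
        return n * n  # stands in for an expensive computation

    @sharedmethodcache("_solve_cache")
    def solve_again(self, n):
        return n * n  # same signature and result, as the docstring requires

s = Solver()
s.solve(4)        # computes and stores under key (4,) in s._solve_cache
s.solve_again(4)  # shared-cache hit: this method body is never executed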


def clear_cache():
    """Clears the pystencils cache created by joblib."""
    memory = Memory(cache_dir, verbose=0)
    memory.clear(warn=False)


# Disable memory cache:
# disk_cache = lambda o: o
# disk_cache_no_fallback = lambda o: o
from copy import copy
from collections import defaultdict
from dataclasses import dataclass, field
from types import MappingProxyType
from typing import Union, Tuple, List, Dict, Callable, Any, DefaultDict, Iterable

from pystencils import Target, Backend, Field
from pystencils.typing.typed_sympy import BasicType
from pystencils.typing.utilities import collate_types

import numpy as np

# TODO: NumPy's DTypeLike would be better than `type` for type hinting, but it is too new at the moment
# from numpy.typing import DTypeLike

# TODO: CreateKernelConfig is bloated; think of more classes, better usage, a factory, whatever ...
# Proposition: separate CreateKernelConfig classes for different targets?

@dataclass
class CreateKernelConfig:
    """
    All parameters of CreateKernelConfig are explained below.
    """
    target: Target = Target.CPU
    """
    All targets are defined in :class:`pystencils.enums.Target`
    """
    backend: Backend = None
    """
    All backends are defined in :class:`pystencils.enums.Backend`
    """
    function_name: str = 'kernel'
    """
    Name of the generated function; only important if the generated code is written out
    """
    data_type: Union[type, str, DefaultDict[str, BasicType], Dict[str, BasicType]] = np.float64
    """
    Data type used for all untyped symbols (i.e. non-fields); can also be a dict from symbol name to type.
    If specified as a dict, ideally a defaultdict is used to define a default value for symbols not listed
    in the dict. If a plain dict is provided, it is transformed into a defaultdict internally, and the
    default value is then determined by collating the types listed in the dict.
    """
    default_number_float: Union[type, str, BasicType] = None
    """
    Data type used for all untyped floating point numbers (e.g. 0.5). By default the value of data_type is used.
    If data_type is given as a defaultdict, its default_factory is used.
    """
    default_number_int: Union[type, str, BasicType] = np.int64
    """
    Data type used for all untyped integer numbers (e.g. 1)
    """
    iteration_slice: Tuple = None
    """
    Rectangular subset to iterate over; if not specified, the complete non-ghost-layer part of the field
    is iterated over
    """
    ghost_layers: Union[bool, int, List[Tuple[int]]] = None
    """
    A single integer specifies the ghost layer count at all borders; can also be a sequence of
    pairs ``[(x_lower_gl, x_upper_gl), .... ]``. These layers are excluded from the iteration.
    If left to default, the number of ghost layers is determined automatically from the assignments.
    """
    cpu_openmp: Union[bool, int] = False
    """
    `True` or number of threads for OpenMP parallelization, `False` for no OpenMP. If set to `True`, the
    maximum number of available threads will be chosen.
    """
    cpu_vectorize_info: Dict = None
    """
    A dictionary with the keys 'instruction_set', 'assume_aligned' and 'nontemporal';
    for documentation of these parameters see the vectorize function. Example:
    ``{'instruction_set': 'avx512', 'assume_aligned': True, 'nontemporal': True}``
    """
    cpu_blocking: Tuple[int] = None
    """
    A tuple of block sizes, or `None` if no blocking should be applied
    """
    omp_single_loop: bool = True
    """
    If OpenMP is active: whether multiple outer loops are permitted
    """
    base_pointer_specification: Union[List[Iterable[str]], List[Iterable[int]]] = None
    """
    Specification of how many and which intermediate pointers are created for a field access.
    For example ``[(0), (2, 3)]`` creates one base pointer for coordinates 2 and 3 and writes the offset
    for coordinate zero directly into the field access. These specifications are defined dependent on the
    loop ordering. For more information see `pystencils.transformations.create_intermediate_base_pointer`
    """
    gpu_indexing: str = 'block'
    """
    Either 'block' or 'line', or a custom indexing class; see `pystencils.gpu.AbstractIndexing`
    """
    gpu_indexing_params: MappingProxyType = field(default_factory=lambda: MappingProxyType({}))
    """
    Dict with indexing parameters (constructor parameters of the indexing class),
    e.g. for 'block' one can specify ``{'block_size': (20, 20, 10)}``.
    """
    # TODO Markus rework this docstring
    default_assignment_simplifications: bool = False
    """
    If `True`, default simplifications are first performed on the assignments. If problems occur during the
    simplification, a warning will be issued.
    Furthermore, it is essential to know that this is a two-stage process. The first stage of the process acts
    on the level of the `pystencils.AssignmentCollection`. In this part,
    `pystencils.simp.create_simplification_strategy` from pystencils.simplificationfactory is used to
    apply optimisations like insertion of constants to remove pressure from the registers. Thus the first
    part of the optimisations can only be executed if an `AssignmentCollection` is passed. The second part
    of the optimisation acts on the level of each assignment individually. In this stage, all optimisations
    from `sympy.codegen.rewriting.optims_c99` are applied to each assignment. Thus this stage can also be
    applied if a list of assignments is passed.
    """
    cpu_prepend_optimizations: List[Callable] = field(default_factory=list)
    """
    List of extra optimizations to perform first on the AST.
    """
    use_auto_for_assignments: bool = False
    """
    If set to `True`, ``auto`` can be used in the generated code for data types. This makes the type system
    more robust.
    """
    index_fields: List[Field] = None
    """
    List of index fields, i.e. 1D fields with struct data type. If not `None`, `create_index_kernel`
    instead of `create_domain_kernel` is used.
    """
    coordinate_names: Tuple[str, Any] = ('x', 'y', 'z')
    """
    Names of the coordinate fields in the struct data type.
    """
    allow_double_writes: bool = False
    """
    If `True`, don't check that every field is only written at a single location. This is required,
    for example, for kernels that are compiled with loop step sizes > 1 and handle multiple
    cells at once. Use with care!
    """
    skip_independence_check: bool = False
    """
    By default the assignment list is checked for read/write independence, meaning fields are only written
    at locations where they are also read. Doing so guarantees thread safety. In some cases, e.g. for
    periodicity kernels, this cannot be assured and the check needs to be deactivated. Use with care!
    """
    class DataTypeFactory:
        """Because of pickle, we need a nested class here instead of a lambda in __post_init__"""
        def __init__(self, dt):
            self.dt = dt

        def __call__(self):
            return BasicType(self.dt)

    def _check_type(self, dtype_to_check):
        if isinstance(dtype_to_check, str) and (dtype_to_check == 'float' or dtype_to_check == 'int'):
            self._typing_error()

        if isinstance(dtype_to_check, type) and not hasattr(dtype_to_check, "dtype"):
            # NumPy types are also of type 'type'. However, they have more properties
            self._typing_error()

    @staticmethod
    def _typing_error():
        raise ValueError("It is not possible to use python types (float, int) for datatypes because these "
                         "types are ambiguous. For example float will map to double. "
                         "Also the string version like 'float' is not allowed, e.g. use 'float64' instead")

    def __post_init__(self):
        # ---- Legacy parameters
        if not isinstance(self.target, Target):
            raise ValueError("target must be provided by the 'Target' enum")
        # ---- Auto Backend
        if not self.backend:
            if self.target == Target.CPU:
                self.backend = Backend.C
            elif self.target == Target.GPU:
                self.backend = Backend.CUDA
            else:
                raise NotImplementedError(f'Target {self.target} has no default backend')

        if not isinstance(self.backend, Backend):
            raise ValueError("backend must be provided by the 'Backend' enum")

        # Normalise data types
        for dtype in [self.data_type, self.default_number_float, self.default_number_int]:
            self._check_type(dtype)

        if not isinstance(self.data_type, dict):
            dt = copy(self.data_type)  # The copy is necessary because BasicType has sympy shenanigans
            self.data_type = defaultdict(self.DataTypeFactory(dt))

        if isinstance(self.data_type, dict) and not isinstance(self.data_type, defaultdict):
            for dtype in self.data_type.values():
                self._check_type(dtype)

            dt = collate_types([BasicType(dtype) for dtype in self.data_type.values()])
            dtype_dict = self.data_type
            self.data_type = defaultdict(self.DataTypeFactory(dt), dtype_dict)

        assert isinstance(self.data_type, defaultdict), "At this point data_type must be a defaultdict!"
        for dtype in self.data_type.values():
            self._check_type(dtype)
        self._check_type(self.data_type.default_factory())

        if self.default_number_float is None:
            self.default_number_float = self.data_type.default_factory()

        if not isinstance(self.default_number_float, BasicType):
            self.default_number_float = BasicType(self.default_number_float)
        if not isinstance(self.default_number_int, BasicType):
            self.default_number_int = BasicType(self.default_number_int)
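

# Usage sketch (illustrative, based on the normalisation in __post_init__ above; symbol
# names and values are assumptions): a plain dict passed as data_type becomes a defaultdict
# whose default is the collated type of the listed entries, so unlisted symbols get a fallback.
#
# import numpy as np
# from pystencils.config import CreateKernelConfig
#
# config = CreateKernelConfig(data_type={'phi': np.float32, 'tau': np.float64})
# config.data_type['phi']            # BasicType for float32, as listed
# config.data_type['unlisted_name']  # collated default, here the float64 BasicType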
from pystencils.cpu.cpujit import make_python_function
from pystencils.cpu.kernelcreation import add_openmp, create_indexed_kernel, create_kernel, add_pragmas

__all__ = ['create_kernel', 'create_indexed_kernel', 'add_openmp', 'add_pragmas', 'make_python_function']
@@ -13,7 +13,7 @@ in a configuration file.
3. or in your home directory at ``~/.config/pystencils/config.json`` (Linux) or
   ``%HOMEPATH%\.pystencils\config.json`` (Windows)

If no configuration file is found, a default configuration is created at the above-mentioned location in your home.
So run *pystencils* once, then edit the created configuration file.

@@ -23,7 +23,7 @@ Compiler Config (Linux)
- **'os'**: should be detected automatically as 'linux'
- **'command'**: path to C++ compiler (defaults to 'g++')
- **'flags'**: space separated list of compiler flags. Make sure to activate OpenMP in your compiler
- **'restrict_qualifier'**: the 'restrict' qualifier is not standardized across compilers.
  For most Linux compilers the qualifier is ``__restrict__``

@@ -39,30 +39,37 @@ Then 'cl.exe' is used to compile.
where Visual Studio is installed. This path has to contain a file called 'vcvarsall.bat'
- **'arch'**: 'x86' or 'x64'
- **'flags'**: flags passed to 'cl.exe', make sure OpenMP is activated
- **'restrict_qualifier'**: the 'restrict' qualifier is not standardized across compilers.
  For Windows compilers the qualifier should be ``__restrict``
"""
from appdirs import user_cache_dir, user_config_dir
from collections import OrderedDict

import hashlib
import importlib.util
import json
import os
import platform
import shutil
import subprocess
import sysconfig
import tempfile
import textwrap
import time
import warnings
import pathlib

import numpy as np

from pystencils import FieldType
from pystencils.astnodes import LoopOverCoordinate
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets
from pystencils.include import get_pystencils_include_path
from pystencils.kernel_wrapper import KernelWrapper
from pystencils.typing import BasicType, CastFunc, VectorType, VectorMemoryAccess
from pystencils.utils import atomic_file_write, recursive_dict_update

def make_python_function(kernel_function_node, custom_backend=None):
@@ -74,6 +81,7 @@ def make_python_function(kernel_function_node, custom_backend=None):
    - all symbols which are not defined in the kernel itself are expected as parameters

    :param kernel_function_node: the abstract syntax tree
    :param custom_backend: use own custom printer for code generation
    :return: kernel functor
    """
    result = compile_and_load(kernel_function_node, custom_backend)
@@ -116,15 +124,15 @@ def get_configuration_file_path():
    # 1) Read path from environment variable if found
    if 'PYSTENCILS_CONFIG' in os.environ:
        return os.environ['PYSTENCILS_CONFIG']
    # 2) Look in current directory for pystencils.json
    elif os.path.exists("pystencils.json"):
        return "pystencils.json"
    # 3) Try ~/.pystencils.json
    elif os.path.exists(config_path_in_home):
        return config_path_in_home
    else:
        return config_path_in_home

def create_folder(path, is_file):
@@ -136,52 +144,50 @@ def create_folder(path, is_file):
    pass

def read_config():
    if platform.system().lower() == 'linux':
        default_compiler_config = OrderedDict([
            ('os', 'linux'),
            ('command', 'g++'),
            ('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'),
            ('restrict_qualifier', '__restrict__')
        ])
        if platform.machine().startswith('ppc64') or platform.machine() == 'arm64':
            default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native',
                                                                                        '-mcpu=native')
    elif platform.system().lower() == 'windows':
        default_compiler_config = OrderedDict([
            ('os', 'windows'),
            ('msvc_version', 'latest'),
            ('arch', 'x64'),
            ('flags', '/Ox /fp:fast /OpenMP /arch:avx'),
            ('restrict_qualifier', '__restrict')
        ])
        if platform.machine() == 'ARM64':
            default_compiler_config['arch'] = 'ARM64'
            default_compiler_config['flags'] = default_compiler_config['flags'].replace(' /arch:avx', '')
    elif platform.system().lower() == 'darwin':
        default_compiler_config = OrderedDict([
            ('os', 'darwin'),
            ('command', 'clang++'),
            ('flags', '-Ofast -DNDEBUG -fPIC -march=native -Xclang -fopenmp -std=c++11'),
            ('restrict_qualifier', '__restrict__')
        ])
        if platform.machine() == 'arm64':
            if 'sme' in get_supported_instruction_sets():
                flag = '-march=armv8.7-a+sme '
            else:
                flag = ''
            default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native ', flag)
        for libomp in ['/opt/local/lib/libomp/libomp.dylib', '/usr/local/lib/libomp.dylib',
                       '/opt/homebrew/lib/libomp.dylib']:
            if os.path.exists(libomp):
                default_compiler_config['flags'] += ' ' + libomp
                break
    else:
        raise NotImplementedError('Generation of default compiler flags for %s is not implemented' %
                                  (platform.system(),))

    default_cache_config = OrderedDict([
        ('object_cache', os.path.join(user_cache_dir('pystencils'), 'objectcache')),
        ('clear_cache_on_start', False),
    default_config = OrderedDict([('compiler', default_compiler_config),
                                  ('cache', default_cache_config)])

    from fasteners import InterProcessLock
    config_path = pathlib.Path(get_configuration_file_path())
    config_path.parent.mkdir(parents=True, exist_ok=True)

    config = default_config.copy()

    lockfile = config_path.with_suffix(config_path.suffix + ".lock")
    with InterProcessLock(lockfile):
        if config_path.exists():
            with open(config_path, 'r') as json_config_file:
                loaded_config = json.load(json_config_file)
            config = recursive_dict_update(config, loaded_config)
        else:
            with open(config_path, 'w') as f:
                json.dump(config, f, indent=4)

    if config['cache']['object_cache'] is not False:
        config['cache']['object_cache'] = os.path.expanduser(config['cache']['object_cache']).format(pid=os.getpid())

        clear_cache_on_start = False
        cache_status_file = os.path.join(config['cache']['object_cache'], 'last_config.json')
        if os.path.exists(cache_status_file):
            # check if compiler config has changed
            last_config = json.load(open(cache_status_file, 'r'))
            if set(last_config.items()) != set(config['compiler'].items()):
                clear_cache_on_start = True
            else:
                for key in last_config.keys():
                    if last_config[key] != config['compiler'][key]:
                        clear_cache_on_start = True

        if config['cache']['clear_cache_on_start'] or clear_cache_on_start:
            shutil.rmtree(config['cache']['object_cache'], ignore_errors=True)

        create_folder(config['cache']['object_cache'], False)
        with tempfile.NamedTemporaryFile('w', dir=os.path.dirname(cache_status_file), delete=False) as f:
            json.dump(config['compiler'], f, indent=4)
        os.replace(f.name, cache_status_file)

    if config['compiler']['os'] == 'windows':
        from pystencils.cpu.msvc_detection import get_environment
        msvc_env = get_environment(config['compiler']['msvc_version'], config['compiler']['arch'])
        if 'env' not in config['compiler']:
            config['compiler']['env'] = {}
@@ -257,6 +283,7 @@ def clear_cache():
    create_folder(cache_config['object_cache'], False)


# TODO don't hardcode C type. [1] of tuple output
type_mapping = {
    np.float32: ('PyFloat_AsDouble', 'float'),
    np.float64: ('PyFloat_AsDouble', 'double'),
@@ -266,8 +293,6 @@ type_mapping = {
    np.uint16: ('PyLong_AsUnsignedLong', 'uint16_t'),
    np.uint32: ('PyLong_AsUnsignedLong', 'uint32_t'),
    np.uint64: ('PyLong_AsUnsignedLong', 'uint64_t'),
}
template_extract_scalar = """
@@ -277,14 +302,6 @@
if( PyErr_Occurred() ) {{ return NULL; }}
"""

template_extract_array = """
PyObject * obj_{name} = PyDict_GetItemString(kwargs, "{name}");
if( obj_{name} == NULL) {{ PyErr_SetString(PyExc_TypeError, "Keyword argument '{name}' missing"); return NULL; }};
@@ -352,8 +369,7 @@ def equal_size_check(fields):
        return ""

    ref_field = fields[0]
    cond = [f"(buffer_{field_to_test.name}.shape[{i}] == buffer_{ref_field.name}.shape[{i}])"
            for field_to_test in fields[1:]
            for i in range(fields[0].spatial_dimensions)]
    cond = " && ".join(cond)
@@ -381,26 +397,37 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_checks):
            aligned = False
            if ast_node.assignments:
                aligned = any([a.lhs.args[2] for a in ast_node.assignments
                               if hasattr(a, 'lhs') and isinstance(a.lhs, CastFunc)
                               and hasattr(a.lhs, 'dtype') and isinstance(a.lhs.dtype, VectorType)])

            if ast_node.instruction_set and aligned:
                byte_width = ast_node.instruction_set['width'] * item_size
                if 'cachelineZero' in ast_node.instruction_set:
                    has_openmp, has_nontemporal = False, False
                    for loop in ast_node.atoms(LoopOverCoordinate):
                        has_openmp = has_openmp or any(['#pragma omp' in p for p in loop.prefix_lines])
                        has_nontemporal = has_nontemporal or any([a.args[0].field == field and a.args[3] for a in
                                                                  loop.atoms(VectorMemoryAccess)])
                    if has_openmp and has_nontemporal:
                        cl_size = ast_node.instruction_set['cachelineSize']
                        byte_width = f"({cl_size}) < SIZE_MAX ? ({cl_size}) : ({byte_width})"
                offset = max(max(ast_node.ghost_layers)) * item_size
                offset_cond = f"(((uintptr_t) buffer_{field.name}.buf) + {offset}) % ({byte_width}) == 0"

                message = str(offset) + ". This is probably due to a different number of ghost_layers chosen for " \
                                        "the arrays and the kernel creation. If the number of ghost layers for " \
                                        "the kernel creation is not specified it will choose a suitable value " \
                                        "automatically. This value might not " \
                                        "be compatible with the allocated arrays."
                if type(byte_width) is not int:
                    message += " Note that when both OpenMP and non-temporal stores are enabled, alignment to " \
                               "the cacheline size is required."
                pre_call_code += template_check_array.format(cond=offset_cond, what="offset", name=field.name,
                                                             expected=message)

            if (np_dtype.isbuiltin and FieldType.is_generic(field)
                    and not np.issubdtype(field.dtype.numpy_dtype, np.complexfloating)):
                dtype_cond = f"buffer_{field.name}.format[0] == '{field.dtype.numpy_dtype.char}'"
                pre_call_code += template_check_array.format(cond=dtype_cond, what="data type", name=field.name,
                                                             expected=str(field.dtype.numpy_dtype))
@@ -429,23 +456,14 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_checks):
        elif param.is_field_stride:
            field = param.fields[0]
            item_size = field.dtype.numpy_dtype.itemsize
            parameters.append(f"buffer_{field.name}.strides[{param.symbol.coordinate}] / {item_size}")
        elif param.is_field_shape:
            parameters.append(f"buffer_{param.field_name}.shape[{param.symbol.coordinate}]")
        else:
            extract_function, target_type = type_mapping[param.symbol.dtype.numpy_dtype.type]
            pre_call_code += template_extract_scalar.format(extract_function=extract_function,
                                                            target_type=target_type,
                                                            name=param.symbol.name)

            parameters.append(param.symbol.name)
@@ -465,18 +483,15 @@ def create_module_boilerplate_code(module_name, names):

def load_kernel_from_file(module_name, function_name, path):
    try:
        spec = importlib.util.spec_from_file_location(name=module_name, location=path)
        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)
    except ImportError:
        warnings.warn(f"Could not load {path}, trying one more time in 5 seconds ...")
        time.sleep(5)
        spec = importlib.util.spec_from_file_location(name=module_name, location=path)
        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)

    return getattr(mod, function_name)
@@ -515,12 +530,19 @@ class ExtensionModuleCode:
        headers = {'<math.h>', '<stdint.h>'}
        for ast in self._ast_nodes:
            for field in ast.fields_accessed:
                if isinstance(field.dtype, BasicType) and field.dtype.is_half():
                    # Add the half precision header only if half precision numbers occur in the AST
                    headers.add('"half_precision.h"')
            headers.update(get_headers(ast))

        header_list = sorted(headers)
        header_list.insert(0, '"Python.h"')
        ps_headers = [os.path.join(os.path.dirname(__file__), '..', 'include', h[1:-1]) for h in header_list
                      if os.path.exists(os.path.join(os.path.dirname(__file__), '..', 'include', h[1:-1]))]
        header_hash = b''.join([hashlib.sha256(open(h, 'rb').read()).digest() for h in ps_headers])

        includes = "\n".join([f"#include {include_file}" for include_file in header_list])
        self._code_string += includes
        self._code_string += "\n"
        self._code_string += f"#define RESTRICT {restrict_qualifier} \n"

@@ -529,12 +551,12 @@ class ExtensionModuleCode:
        for ast, name in zip(self._ast_nodes, self._function_names):
            old_name = ast.function_name
            ast.function_name = f"kernel_{name}"
            self._code_string += generate_c(ast, custom_backend=self._custom_backend)
            self._code_string += create_function_boilerplate_code(ast.get_parameters(), name, ast)
            ast.function_name = old_name

        self._code_hash = "mod_" + hashlib.sha256(self._code_string.encode() + header_hash).hexdigest()
        self._code_string += create_module_boilerplate_code(self._code_hash, self._function_names)

    def get_hash_of_code(self):
@@ -546,9 +568,12 @@ class ExtensionModuleCode:
        print(self._code_string, file=file)

def compile_module(code, code_hash, base_dir, compile_flags=None):
    if compile_flags is None:
        compile_flags = []

    compiler_config = get_compiler_config()
    extra_flags = ['-I' + sysconfig.get_paths()['include'], '-I' + get_pystencils_include_path()] + compile_flags

    if compiler_config['os'].lower() == 'windows':
        lib_suffix = '.pyd'
@@ -564,8 +589,11 @@ def compile_module(code, code_hash, base_dir, compile_flags=None):
    object_file = os.path.join(base_dir, code_hash + object_suffix)

    if not os.path.exists(object_file):
        try:
            with open(src_file, 'x') as f:
                code.write_to_file(f)
        except FileExistsError:
            pass

        if windows:
            compile_cmd = ['cl.exe', '/c', '/EHsc'] + compiler_config['flags'].split()
@@ -579,7 +607,6 @@ def compile_module(code, code_hash, base_dir, compile_flags=None):
    # Linking
    if windows:
        config_vars = sysconfig.get_config_vars()
        py_lib = os.path.join(config_vars["installed_base"], "libs",
                              f"python{config_vars['py_version_nodot']}.lib")
@@ -600,7 +627,12 @@ def compile_and_load(ast, custom_backend=None):
    cache_config = get_cache_config()

    compiler_config = get_compiler_config()
    if compiler_config['os'].lower() == 'windows':
        function_prefix = '__declspec(dllexport)'
    elif ast.instruction_set and 'function_prefix' in ast.instruction_set:
        function_prefix = ast.instruction_set['function_prefix']
    else:
        function_prefix = ''

    code = ExtensionModuleCode(custom_backend=custom_backend)
    code.add_function(ast, ast.function_name)

@@ -608,12 +640,17 @@ def compile_and_load(ast, custom_backend=None):
    code.create_code_string(compiler_config['restrict_qualifier'], function_prefix)
    code_hash_str = code.get_hash_of_code()

    compile_flags = []
    if ast.instruction_set and 'compile_flags' in ast.instruction_set:
        compile_flags = ast.instruction_set['compile_flags']

    if cache_config['object_cache'] is False:
        with tempfile.TemporaryDirectory() as base_dir:
            lib_file = compile_module(code, code_hash_str, base_dir, compile_flags=compile_flags)
            result = load_kernel_from_file(code_hash_str, ast.function_name, lib_file)
    else:
        lib_file = compile_module(code, code_hash_str, base_dir=cache_config['object_cache'],
                                  compile_flags=compile_flags)
        result = load_kernel_from_file(code_hash_str, ast.function_name, lib_file)

    return KernelWrapper(result, ast.get_parameters(), ast)
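

# End-to-end sketch (illustrative, using the public pystencils API; field and assignment
# names are assumptions): compile_and_load is normally reached through create_kernel(...)
# followed by .compile() on the resulting kernel AST.
#
# import pystencils as ps
#
# src, dst = ps.fields("src, dst: double[2D]")
# update = ps.Assignment(dst[0, 0], (src[1, 0] + src[-1, 0] + src[0, 1] + src[0, -1]) / 4)
# kernel = ps.create_kernel(update).compile()  # ends up in compile_and_load above
# # kernel(src=..., dst=...) then invokes the cached extension module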
import sympy as sp

import pystencils.astnodes as ast
from pystencils.config import CreateKernelConfig
from pystencils.enums import Target, Backend
from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment
from pystencils.cpu.cpujit import make_python_function
from pystencils.typing import StructType, TypedSymbol, create_type
from pystencils.typing.transformations import add_types
from pystencils.field import Field, FieldType
from pystencils.node_collection import NodeCollection
from pystencils.transformations import (
    filtered_tree_iteration, iterate_loops_by_depth, get_base_buffer_index, get_optimal_loop_ordering,
    make_loop_over_domain, add_outer_loop_over_indexed_elements,
    move_constants_before_loop, parse_base_pointer_info, resolve_buffer_accesses,
    resolve_field_accesses, split_inner_loop)

def create_kernel(assignments: NodeCollection,
                  config: CreateKernelConfig) -> KernelFunction:
    """Creates an abstract syntax tree for a kernel function, by taking a list of update rules.

    Loops are created according to the field accesses in the equations.

@@ -26,35 +25,25 @@ def create_kernel
    Args:
        assignments: list of sympy equations, containing accesses to :class:`pystencils.field.Field`,
                     defining the update rules of the kernel
        config: create kernel config

    Returns:
        AST node representing a function, that can be printed as C or CUDA code
    """
    function_name = config.function_name
    iteration_slice = config.iteration_slice
    ghost_layers = config.ghost_layers
    fields_written = assignments.bound_fields
    fields_read = assignments.rhs_fields

    split_groups = ()
    if 'split_groups' in assignments.simplification_hints:
        split_groups = assignments.simplification_hints['split_groups']
    assignments = assignments.all_assignments

    # TODO Cleanup: move add_types to create_domain_kernel or create_kernel
    assignments = add_types(assignments, config)

    all_fields = fields_read.union(fields_written)
    read_only_fields = set([f.name for f in fields_read - fields_written])

@@ -65,15 +54,31 @@ def create_kernel
    loop_order = get_optimal_loop_ordering(fields_without_buffers)
    loop_node, ghost_layer_info = make_loop_over_domain(body, iteration_slice=iteration_slice,
                                                        ghost_layers=ghost_layers, loop_order=loop_order)
    loop_node = add_outer_loop_over_indexed_elements(loop_node)
    ast_node = KernelFunction(loop_node, Target.CPU, Backend.C, compile_function=make_python_function,
                              ghost_layers=ghost_layer_info, function_name=function_name, assignments=assignments)

    if split_groups:
        type_info = config.data_type

        def type_symbol(term):
            if isinstance(term, Field.Access) or isinstance(term, TypedSymbol):
                return term
            elif isinstance(term, sp.Symbol):
                if isinstance(type_info, str) or not hasattr(type_info, '__getitem__'):
                    return TypedSymbol(term.name, create_type(type_info))
                else:
                    return TypedSymbol(term.name, type_info[term.name])
            else:
                raise ValueError("Term has to be field access or symbol")

        typed_split_groups = [[type_symbol(s) for s in split_group] for split_group in split_groups]
        split_inner_loop(ast_node, typed_split_groups)

    base_pointer_spec = config.base_pointer_specification
    if base_pointer_spec is None:
        base_pointer_spec = []
    base_pointer_info = {field.name: parse_base_pointer_info(base_pointer_spec, loop_order,
                                                             field.spatial_dimensions, field.index_dimensions)
                         for field in fields_without_buffers}

@@ -85,13 +90,14 @@ def create_kernel
    if any(FieldType.is_buffer(f) for f in all_fields):
        resolve_buffer_accesses(ast_node, get_base_buffer_index(ast_node), read_only_fields)
    # TODO think about typing
    resolve_field_accesses(ast_node, read_only_fields, field_to_base_pointer_info=base_pointer_info)
    move_constants_before_loop(ast_node)

    return ast_node
def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, function_name="kernel", def create_indexed_kernel(assignments: NodeCollection,
type_info=None, coordinate_names=('x', 'y', 'z')) -> KernelFunction: config: CreateKernelConfig) -> KernelFunction:
""" """
Similar to :func:`create_kernel`, but here not all cells of a field are updated but only cells with Similar to :func:`create_kernel`, but here not all cells of a field are updated but only cells with
coordinates which are stored in an index field. This traversal method can e.g. be used for boundary handling. coordinates which are stored in an index field. This traversal method can e.g. be used for boundary handling.
...@@ -103,31 +109,39 @@ def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, fu ...@@ -103,31 +109,39 @@ def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, fu
Args: Args:
assignments: list of assignments assignments: list of assignments
index_fields: list of index fields, i.e. 1D fields with struct data type config: Kernel configuration
type_info: see documentation of :func:`create_kernel`
function_name: see documentation of :func:`create_kernel`
coordinate_names: name of the coordinate fields in the struct data type
""" """
fields_read, fields_written, assignments = add_types(assignments, type_info, check_independence_condition=False) function_name = config.function_name
index_fields = config.index_fields
coordinate_names = config.coordinate_names
fields_written = assignments.bound_fields
fields_read = assignments.rhs_fields
all_fields = fields_read.union(fields_written) all_fields = fields_read.union(fields_written)
# extract the index fields based on the name. The original index field might have been modified
index_fields = [idx_field for idx_field in index_fields if idx_field.name in [f.name for f in all_fields]]
non_index_fields = [f for f in all_fields if f not in index_fields]
spatial_coordinates = {f.spatial_dimensions for f in non_index_fields}
assert len(spatial_coordinates) == 1, f"Non-index fields do not have the same number of spatial coordinates. " \
f"Non-index fields are {non_index_fields}, spatial coordinates are " \
f"{spatial_coordinates}"
spatial_coordinates = list(spatial_coordinates)[0]
assignments = assignments.all_assignments
assignments = add_types(assignments, config)
for index_field in index_fields: for index_field in index_fields:
index_field.field_type = FieldType.INDEXED index_field.field_type = FieldType.INDEXED
assert FieldType.is_indexed(index_field) assert FieldType.is_indexed(index_field)
assert index_field.spatial_dimensions == 1, "Index fields have to be 1D" assert index_field.spatial_dimensions == 1, "Index fields have to be 1D"
non_index_fields = [f for f in all_fields if f not in index_fields]
spatial_coordinates = {f.spatial_dimensions for f in non_index_fields}
assert len(spatial_coordinates) == 1, "Non-index fields do not have the same number of spatial coordinates"
spatial_coordinates = list(spatial_coordinates)[0]
def get_coordinate_symbol_assignment(name): def get_coordinate_symbol_assignment(name):
for idx_field in index_fields: for idx_field in index_fields:
assert isinstance(idx_field.dtype, StructType), "Index fields have to have a struct data type" assert isinstance(idx_field.dtype, StructType), "Index fields have to have a struct data type"
data_type = idx_field.dtype data_type = idx_field.dtype
if data_type.has_element(name): if data_type.has_element(name):
rhs = idx_field[0](name) rhs = idx_field[0](name)
lhs = TypedSymbol(name, BasicType(data_type.get_element_type(name))) lhs = TypedSymbol(name, data_type.get_element_type(name))
return SympyAssignment(lhs, rhs) return SympyAssignment(lhs, rhs)
raise ValueError(f"Index {name} not found in any of the passed index fields") raise ValueError(f"Index {name} not found in any of the passed index fields")
...@@ -140,13 +154,11 @@ def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, fu ...@@ -140,13 +154,11 @@ def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, fu
loop_body = Block([]) loop_body = Block([])
loop_node = LoopOverCoordinate(loop_body, coordinate_to_loop_over=0, start=0, stop=index_fields[0].shape[0]) loop_node = LoopOverCoordinate(loop_body, coordinate_to_loop_over=0, start=0, stop=index_fields[0].shape[0])
implement_interpolations(loop_node)
for assignment in assignments: for assignment in assignments:
loop_body.append(assignment) loop_body.append(assignment)
function_body = Block([loop_node]) function_body = Block([loop_node])
ast_node = KernelFunction(function_body, "cpu", "c", make_python_function, ast_node = KernelFunction(function_body, Target.CPU, Backend.C, make_python_function,
ghost_layers=None, function_name=function_name, assignments=assignments) ghost_layers=None, function_name=function_name, assignments=assignments)
fixed_coordinate_mapping = {f.name: coordinate_typed_symbols for f in non_index_fields} fixed_coordinate_mapping = {f.name: coordinate_typed_symbols for f in non_index_fields}
...@@ -202,5 +214,20 @@ def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None, ass ...@@ -202,5 +214,20 @@ def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None, ass
prefix = f"#pragma omp for schedule({schedule})" prefix = f"#pragma omp for schedule({schedule})"
if collapse: if collapse:
prefix += " collapse(%d)" % (collapse, ) prefix += f" collapse({collapse})"
loop_to_parallelize.prefix_lines.append(prefix) loop_to_parallelize.prefix_lines.append(prefix)
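# Usage sketch: parallelise a previously created kernel ast over the outermost loop,
# e.g. with a chunked dynamic schedule and the two outer loops collapsed:
#   add_openmp(ast_node, schedule="dynamic,4", collapse=2)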
def add_pragmas(ast_node, pragma_lines, nesting_depth=-1):
"""Prepends given pragma lines to all loops of specified nesting depth.
Args:
ast_node: pystencils abstract syntax tree
pragma_lines: Iterable of strings containing the pragma lines
nesting_depth: Nesting depth of the loops the pragmas should be applied to.
Outermost loop has depth 0.
A depth of -1 indicates the innermost loops.
"""
loop_nodes = iterate_loops_by_depth(ast_node, nesting_depth)
for n in loop_nodes:
n.prefix_lines += list(pragma_lines)
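# Usage sketch: prepend a vendor-specific hint to every innermost loop of a kernel ast
# (the pragma text is illustrative; any pragma line can be passed):
#   add_pragmas(ast_node, ["#pragma ivdep"], nesting_depth=-1)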
...@@ -3,13 +3,13 @@ from typing import Container, Union ...@@ -3,13 +3,13 @@ from typing import Container, Union
import numpy as np import numpy as np
import sympy as sp import sympy as sp
from sympy.logic.boolalg import BooleanFunction from sympy.logic.boolalg import BooleanFunction, BooleanAtom
import pystencils.astnodes as ast import pystencils.astnodes as ast
from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
from pystencils.data_types import ( from pystencils.typing import (BasicType, PointerType, TypedSymbol, VectorType, CastFunc, collate_types,
PointerType, TypedSymbol, VectorType, cast_func, collate_types, get_type_of_expression, vector_memory_access) get_type_of_expression, VectorMemoryAccess)
from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt from pystencils.functions import DivFunc
from pystencils.field import Field from pystencils.field import Field
from pystencils.integer_functions import modulo_ceil, modulo_floor from pystencils.integer_functions import modulo_ceil, modulo_floor
from pystencils.sympyextensions import fast_subs from pystencils.sympyextensions import fast_subs
...@@ -26,9 +26,58 @@ class vec_all(sp.Function): ...@@ -26,9 +26,58 @@ class vec_all(sp.Function):
nargs = (1,) nargs = (1,)
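# AST node marking a memory fence emitted after non-temporal (streaming) stores, so that
# streamed data is globally visible before the kernel returns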
class NontemporalFence(ast.Node):
def __init__(self):
super(NontemporalFence, self).__init__(parent=None)
@property
def symbols_defined(self):
return set()
@property
def undefined_symbols(self):
return set()
@property
def args(self):
return []
def __eq__(self, other):
return isinstance(other, NontemporalFence)
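# AST node providing cache line size symbols for cache-line-wise non-temporal stores
# (used when the instruction set offers the 'cachelineZero' feature queried in vectorize below)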
class CachelineSize(ast.Node):
symbol = sp.Symbol("_clsize")
mask_symbol = sp.Symbol("_clsize_mask")
last_symbol = sp.Symbol("_cl_lastvec")
def __init__(self):
super(CachelineSize, self).__init__(parent=None)
@property
def symbols_defined(self):
return {self.symbol, self.mask_symbol, self.last_symbol}
@property
def undefined_symbols(self):
return set()
@property
def args(self):
return []
def __eq__(self, other):
return isinstance(other, CachelineSize)
def __hash__(self):
return hash(self.symbol)
def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'best', def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'best',
assume_aligned: bool = False, nontemporal: Union[bool, Container[Union[str, Field]]] = False, assume_aligned: bool = False, nontemporal: Union[bool, Container[Union[str, Field]]] = False,
assume_inner_stride_one: bool = False, assume_sufficient_line_padding: bool = True): assume_inner_stride_one: bool = False, assume_sufficient_line_padding: bool = True):
# TODO Vectorization Revamp we first introduce the remainder loop and then check if we can even vectorise.
# Maybe first copy the ast and return the copied version on failure
"""Explicit vectorization using SIMD vectorization via intrinsics. """Explicit vectorization using SIMD vectorization via intrinsics.
Args: Args:
...@@ -74,57 +123,52 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'best', ...@@ -74,57 +123,52 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'best',
"to differently typed floating point fields") "to differently typed floating point fields")
float_size = field_float_dtypes.pop().numpy_dtype.itemsize float_size = field_float_dtypes.pop().numpy_dtype.itemsize
assert float_size in (8, 4) assert float_size in (8, 4)
vector_is = get_vector_instruction_set('double' if float_size == 8 else 'float', default_float_type = 'float64' if float_size == 8 else 'float32'
instruction_set=instruction_set) vector_is = get_vector_instruction_set(default_float_type, instruction_set=instruction_set)
vector_width = vector_is['width']
kernel_ast.instruction_set = vector_is kernel_ast.instruction_set = vector_is
vectorize_rng(kernel_ast, vector_width) if nontemporal and 'cachelineZero' in vector_is:
vectorize_inner_loops_and_adapt_load_stores(kernel_ast, vector_width, assume_aligned, kernel_ast.use_all_written_field_sizes = True
nontemporal, assume_sufficient_line_padding) strided = 'storeS' in vector_is and 'loadS' in vector_is
insert_vector_casts(kernel_ast) keep_loop_stop = '{loop_stop}' in vector_is['storeA' if assume_aligned and 'storeA' in vector_is else 'storeU']
vectorize_inner_loops_and_adapt_load_stores(kernel_ast, assume_aligned, nontemporal,
strided, keep_loop_stop, assume_sufficient_line_padding,
default_float_type)
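# Usage sketch (kernel_ast stands for a CPU kernel created elsewhere): vectorise in place
# for a fixed-width host ISA, e.g.
#   vectorize(kernel_ast, instruction_set='avx', assume_aligned=True, assume_inner_stride_one=True)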
def vectorize_rng(kernel_ast, vector_width): def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontemporal_fields,
"""Replace scalar result symbols on RNG nodes with vectorial ones""" strided, keep_loop_stop, assume_sufficient_line_padding,
from pystencils.rng import RNGBase default_float_type):
subst = {}
def visit_node(node):
for arg in node.args:
if isinstance(arg, RNGBase):
new_result_symbols = [TypedSymbol(s.name, VectorType(s.dtype, width=vector_width))
for s in arg.result_symbols]
subst.update({s[0]: s[1] for s in zip(arg.result_symbols, new_result_symbols)})
arg._symbols_defined = set(new_result_symbols)
else:
visit_node(arg)
visit_node(kernel_ast)
fast_subs(kernel_ast.body, subst, skip=lambda e: isinstance(e, RNGBase))
def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_aligned, nontemporal_fields,
assume_sufficient_line_padding):
"""Goes over all innermost loops, changes increment to vector width and replaces field accesses by vector type.""" """Goes over all innermost loops, changes increment to vector width and replaces field accesses by vector type."""
all_loops = filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment) all_loops = list(filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment))
inner_loops = [n for n in all_loops if n.is_innermost_loop] inner_loops = [loop for loop in all_loops if loop.is_innermost_loop]
zero_loop_counters = {l.loop_counter_symbol: 0 for l in all_loops} zero_loop_counters = {loop.loop_counter_symbol: 0 for loop in all_loops}
vector_is = ast_node.instruction_set
assert vector_is, "The ast needs to hold information about the instruction_set for the vectorisation"
vector_width = vector_is['width']
vector_int_width = vector_is['intwidth']
for loop_node in inner_loops: for loop_node in inner_loops:
loop_range = loop_node.stop - loop_node.start loop_range = loop_node.stop - loop_node.start
# cut off the loop tail that is not a multiple of the vector width # cut off the loop tail that is not a multiple of the vector width
if assume_aligned and assume_sufficient_line_padding: if keep_loop_stop:
pass
elif assume_aligned and assume_sufficient_line_padding:
loop_range = loop_node.stop - loop_node.start loop_range = loop_node.stop - loop_node.start
new_stop = loop_node.start + modulo_ceil(loop_range, vector_width) new_stop = loop_node.start + modulo_ceil(loop_range, vector_width)
loop_node.stop = new_stop loop_node.stop = new_stop
else: else:
cutting_point = modulo_floor(loop_range, vector_width) + loop_node.start cutting_point = modulo_floor(loop_range, vector_width) + loop_node.start
loop_nodes = [l for l in cut_loop(loop_node, [cutting_point]).args if isinstance(l, ast.LoopOverCoordinate)] # TODO cut_loop calls deepcopy on the loop_node. This is bad as documented in cut_loop
loop_nodes = [loop for loop in cut_loop(loop_node, [cutting_point]).args
if isinstance(loop, ast.LoopOverCoordinate)]
assert len(loop_nodes) in (0, 1, 2) # 2 for main and tail loop, 1 if loop range divisible by vector width assert len(loop_nodes) in (0, 1, 2) # 2 for main and tail loop, 1 if loop range divisible by vector width
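# e.g. a loop over [0, 37) with vector_width 8 is cut at 32: the main loop covers [0, 32)
# in steps of 8, the tail loop handles the remaining iterations [32, 37) in scalar form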
if len(loop_nodes) == 0: if len(loop_nodes) == 0:
continue continue
loop_node = loop_nodes[0] loop_node = loop_nodes[0]
# loop_node is the vectorized one
# Find all array accesses (indexed) that depend on the loop counter as offset # Find all array accesses (indexed) that depend on the loop counter as offset
loop_counter_symbol = ast.LoopOverCoordinate.get_loop_counter_symbol(loop_node.coordinate_to_loop_over) loop_counter_symbol = ast.LoopOverCoordinate.get_loop_counter_symbol(loop_node.coordinate_to_loop_over)
...@@ -133,41 +177,63 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a ...@@ -133,41 +177,63 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a
for indexed in loop_node.atoms(sp.Indexed): for indexed in loop_node.atoms(sp.Indexed):
base, index = indexed.args base, index = indexed.args
if loop_counter_symbol in index.atoms(sp.Symbol): if loop_counter_symbol in index.atoms(sp.Symbol):
if 'loadA' not in vector_is and 'storeA' not in vector_is and 'maskStoreA' not in vector_is:
# don't need to generate the alignment check when there are no aligned load/store instructions
aligned_access = False
else:
if not isinstance(vector_width, int):
raise NotImplementedError('Access alignment cannot be statically determined for sizeless '
'vector ISAs')
aligned_access = (index - loop_counter_symbol).subs(zero_loop_counters) % vector_width == 0
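# e.g. with vector_width == 4, a constant offset of 4 relative to the loop counter counts
# as aligned, an offset of 1 does not (the field base pointers themselves are assumed aligned)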
loop_counter_is_offset = loop_counter_symbol not in (index - loop_counter_symbol).atoms() loop_counter_is_offset = loop_counter_symbol not in (index - loop_counter_symbol).atoms()
aligned_access = (index - loop_counter_symbol).subs(zero_loop_counters) == 0 stride = sp.simplify(index.subs({loop_counter_symbol: loop_counter_symbol + 1}) - index)
if not loop_counter_is_offset: if not loop_counter_is_offset and (not strided or loop_counter_symbol in stride.atoms()):
successful = False successful = False
break break
typed_symbol = base.label typed_symbol = base.label
assert type(typed_symbol.dtype) is PointerType, \ assert type(typed_symbol.dtype) is PointerType, f"Type of access is {typed_symbol.dtype}, {indexed}"
f"Type of access is {typed_symbol.dtype}, {indexed}"
vec_type = VectorType(typed_symbol.dtype.base_type, vector_width) vec_type = VectorType(typed_symbol.dtype.base_type, vector_width)
use_aligned_access = aligned_access and assume_aligned use_aligned_access = aligned_access and assume_aligned
nontemporal = False nontemporal = False
if hasattr(indexed, 'field'): if hasattr(indexed, 'field'):
nontemporal = (indexed.field in nontemporal_fields) or (indexed.field.name in nontemporal_fields) nontemporal = (indexed.field in nontemporal_fields) or (indexed.field.name in nontemporal_fields)
substitutions[indexed] = vector_memory_access(indexed, vec_type, use_aligned_access, nontemporal, True) substitutions[indexed] = VectorMemoryAccess(indexed, vec_type, use_aligned_access, nontemporal, True,
stride if strided else 1)
if nontemporal: if nontemporal:
# insert NontemporalFence after the outermost loop
parent = loop_node.parent parent = loop_node.parent
while type(parent.parent.parent) is not ast.KernelFunction: while type(parent.parent.parent) is not ast.KernelFunction:
parent = parent.parent parent = parent.parent
parent.parent.insert_after(ast.NontemporalFence(), parent, if_not_exists=True) parent.parent.insert_after(NontemporalFence(), parent, if_not_exists=True)
# insert CachelineSize at the beginning of the kernel
parent.parent.insert_front(CachelineSize(), if_not_exists=True)
if not successful: if not successful:
warnings.warn("Could not vectorize loop because of non-consecutive memory access") warnings.warn("Could not vectorize loop because of non-consecutive memory access")
continue continue
loop_node.step = vector_width loop_node.step = vector_width
loop_node.subs(substitutions) loop_node.subs(substitutions)
vector_int_width = ast_node.instruction_set['intwidth'] arg_1 = CastFunc(loop_counter_symbol, VectorType(loop_counter_symbol.dtype, vector_int_width))
vector_loop_counter = cast_func(loop_counter_symbol, VectorType(loop_counter_symbol.dtype, vector_int_width)) \ arg_2 = CastFunc(tuple(range(vector_int_width if type(vector_int_width) is int else 2)),
+ cast_func(tuple(range(vector_int_width)), VectorType(loop_counter_symbol.dtype, vector_int_width)) VectorType(loop_counter_symbol.dtype, vector_int_width))
vector_loop_counter = arg_1 + arg_2
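# every SIMD lane now carries its own loop index: broadcast(ctr) + (0, 1, ..., intwidth - 1);
# for sizeless ISAs only a two-element seed tuple is generated above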
fast_subs(loop_node, {loop_counter_symbol: vector_loop_counter}, fast_subs(loop_node, {loop_counter_symbol: vector_loop_counter},
skip=lambda e: isinstance(e, ast.ResolvedFieldAccess) or isinstance(e, vector_memory_access)) skip=lambda e: isinstance(e, ast.ResolvedFieldAccess) or isinstance(e, VectorMemoryAccess))
mask_conditionals(loop_node) mask_conditionals(loop_node)
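# replace scalar RNG result symbols by vector-typed ones; this inlines, per inner loop,
# what the removed vectorize_rng helper used to do for the whole kernel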
from pystencils.rng import RNGBase
substitutions = {}
for rng in loop_node.atoms(RNGBase):
new_result_symbols = [TypedSymbol(s.name, VectorType(s.dtype, width=vector_width))
for s in rng.result_symbols]
substitutions.update({s[0]: s[1] for s in zip(rng.result_symbols, new_result_symbols)})
rng._symbols_defined = set(new_result_symbols)
fast_subs(loop_node, substitutions, skip=lambda e: isinstance(e, RNGBase))
insert_vector_casts(loop_node, vector_is, default_float_type)
def mask_conditionals(loop_body): def mask_conditionals(loop_body):
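# Turns conditionals inside a vectorised loop into masked execution: the branch condition
# becomes part of the store mask of the enclosed assignments, and block-level conditions
# are wrapped in vec_any() so a block runs if any lane needs it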
def visit_node(node, mask): def visit_node(node, mask):
...@@ -185,8 +251,8 @@ def mask_conditionals(loop_body): ...@@ -185,8 +251,8 @@ def mask_conditionals(loop_body):
node.condition_expr = vec_any(node.condition_expr) node.condition_expr = vec_any(node.condition_expr)
elif isinstance(node, ast.SympyAssignment): elif isinstance(node, ast.SympyAssignment):
if mask is not True: if mask is not True:
s = {ma: vector_memory_access(ma.args[0], ma.args[1], ma.args[2], ma.args[3], sp.And(mask, ma.args[4])) s = {ma: VectorMemoryAccess(*ma.args[0:4], sp.And(mask, ma.args[4]), *ma.args[5:])
for ma in node.atoms(vector_memory_access)} for ma in node.atoms(VectorMemoryAccess)}
node.subs(s) node.subs(s)
else: else:
for arg in node.args: for arg in node.args:
...@@ -195,30 +261,55 @@ def mask_conditionals(loop_body): ...@@ -195,30 +261,55 @@ def mask_conditionals(loop_body):
visit_node(loop_body, mask=True) visit_node(loop_body, mask=True)
def insert_vector_casts(ast_node): def insert_vector_casts(ast_node, instruction_set, default_float_type='double'):
"""Inserts necessary casts from scalar values to vector values.""" """Inserts necessary casts from scalar values to vector values."""
handled_functions = (sp.Add, sp.Mul, fast_division, fast_sqrt, fast_inv_sqrt, vec_any, vec_all) handled_functions = (sp.Add, sp.Mul, vec_any, vec_all, DivFunc, sp.Abs)
def visit_expr(expr): def is_scalar(expr) -> bool:
if isinstance(expr, vector_memory_access): if hasattr(expr, "dtype"):
return vector_memory_access(expr.args[0], expr.args[1], expr.args[2], expr.args[3], if type(expr.dtype) is VectorType:
visit_expr(expr.args[4])) return False
elif isinstance(expr, cast_func): # Else branch: If expr is a CastFunc, then whether the expression
return expr # is scalar is determined by the argument (remember: vector casts
elif expr.func is sp.Abs and 'abs' not in ast_node.instruction_set: # are not inserted yet). Therefore, we must recurse into the args of
new_arg = visit_expr(expr.args[0]) # expr below. Otherwise, this expression is atomic and in that case
base_type = get_type_of_expression(expr.args[0]).base_type if type(expr.args[0]) is vector_memory_access \ # it is assumed to be scalar below.
if isinstance(expr, ast.ResolvedFieldAccess):
# expr.field is not in expr.args
return is_scalar(expr.field)
elif isinstance(expr, (vec_any, vec_all)):
return True
if not hasattr(expr, "args"):
return True
return all(is_scalar(arg) for arg in expr.args)
# TODO Vectorization Revamp: get rid of default_type
def visit_expr(expr, default_type='double', force_vectorize=False):
if isinstance(expr, VectorMemoryAccess):
return VectorMemoryAccess(*expr.args[0:4], visit_expr(expr.args[4], default_type, force_vectorize),
*expr.args[5:])
elif isinstance(expr, CastFunc):
cast_type = expr.args[1]
arg = visit_expr(expr.args[0], default_type, force_vectorize)
assert cast_type in [BasicType('float32'), BasicType('float64')], \
f'Vectorization cannot vectorize type {cast_type}'
return expr.func(arg, VectorType(cast_type, instruction_set['width']))
elif expr.func is sp.Abs and 'abs' not in instruction_set:
new_arg = visit_expr(expr.args[0], default_type, force_vectorize)
base_type = get_type_of_expression(expr.args[0]).base_type if type(expr.args[0]) is VectorMemoryAccess \
else get_type_of_expression(expr.args[0]) else get_type_of_expression(expr.args[0])
pw = sp.Piecewise((-new_arg, new_arg < base_type.numpy_dtype.type(0)), pw = sp.Piecewise((-new_arg, new_arg < CastFunc(0, base_type.numpy_dtype)),
(new_arg, True)) (new_arg, True))
return visit_expr(pw) return visit_expr(pw, default_type, force_vectorize)
elif expr.func in handled_functions or isinstance(expr, sp.Rel) or isinstance(expr, BooleanFunction): elif expr.func in handled_functions or isinstance(expr, sp.Rel) or isinstance(expr, BooleanFunction):
default_type = 'double'
if expr.func is sp.Mul and expr.args[0] == -1: if expr.func is sp.Mul and expr.args[0] == -1:
# special treatment for the unary minus: make sure that the -1 has the same type as the argument # special treatment for the unary minus: make sure that the -1 has the same type as the argument
dtype = int dtype = int
for arg in expr.atoms(vector_memory_access): for arg in expr.atoms(VectorMemoryAccess):
if arg.dtype.base_type.is_float(): if arg.dtype.base_type.is_float():
dtype = arg.dtype.base_type.numpy_dtype.type dtype = arg.dtype.base_type.numpy_dtype.type
for arg in expr.atoms(TypedSymbol): for arg in expr.atoms(TypedSymbol):
...@@ -228,22 +319,42 @@ def insert_vector_casts(ast_node): ...@@ -228,22 +319,42 @@ def insert_vector_casts(ast_node):
if dtype is np.float32: if dtype is np.float32:
default_type = 'float' default_type = 'float'
expr = sp.Mul(dtype(expr.args[0]), *expr.args[1:]) expr = sp.Mul(dtype(expr.args[0]), *expr.args[1:])
new_args = [visit_expr(a) for a in expr.args] new_args = [visit_expr(a, default_type, force_vectorize) for a in expr.args]
arg_types = [get_type_of_expression(a, default_float_type=default_type) for a in new_args] arg_types = [get_type_of_expression(a, default_float_type=default_type) for a in new_args]
if not any(type(t) is VectorType for t in arg_types): if not any(type(t) is VectorType for t in arg_types):
return expr return expr
else: else:
target_type = collate_types(arg_types) target_type = collate_types(arg_types)
casted_args = [ casted_args = [
cast_func(a, target_type) if t != target_type and not isinstance(a, vector_memory_access) else a CastFunc(a, target_type) if t != target_type and not isinstance(a, VectorMemoryAccess) else a
for a, t in zip(new_args, arg_types)] for a, t in zip(new_args, arg_types)]
return expr.func(*casted_args) return expr.func(*casted_args)
elif expr.func is sp.UnevaluatedExpr:
assert expr.args[0].is_Pow or expr.args[0].is_Mul, "UnevaluatedExpr only implemented holding Mul or Pow"
# TODO this is only because cut_loop evaluates the multiplications again due to deepcopy. All this should
# TODO be fixed for real at some point.
if expr.args[0].is_Pow:
base = expr.args[0].base
exp = expr.args[0].exp
expr = sp.UnevaluatedExpr(sp.Mul(*([base] * +exp), evaluate=False))
new_args = [visit_expr(a, default_type, force_vectorize) for a in expr.args[0].args]
arg_types = [get_type_of_expression(a, default_float_type=default_type) for a in new_args]
target_type = collate_types(arg_types)
if not any(type(t) is VectorType for t in arg_types):
target_type = VectorType(target_type, instruction_set['width'])
casted_args = [
CastFunc(a, target_type) if t != target_type and not isinstance(a, VectorMemoryAccess) else a
for a, t in zip(new_args, arg_types)]
return expr.func(expr.args[0].func(*casted_args, evaluate=False))
elif expr.func is sp.Pow: elif expr.func is sp.Pow:
new_arg = visit_expr(expr.args[0]) new_arg = visit_expr(expr.args[0], default_type, force_vectorize)
return expr.func(new_arg, expr.args[1]) return expr.func(new_arg, expr.args[1])
elif expr.func == sp.Piecewise: elif expr.func == sp.Piecewise:
new_results = [visit_expr(a[0]) for a in expr.args] new_results = [visit_expr(a[0], default_type, force_vectorize) for a in expr.args]
new_conditions = [visit_expr(a[1]) for a in expr.args] new_conditions = [visit_expr(a[1], default_type, force_vectorize) for a in expr.args]
types_of_results = [get_type_of_expression(a) for a in new_results] types_of_results = [get_type_of_expression(a) for a in new_results]
types_of_conditions = [get_type_of_expression(a) for a in new_conditions] types_of_conditions = [get_type_of_expression(a) for a in new_conditions]
...@@ -254,41 +365,61 @@ def insert_vector_casts(ast_node): ...@@ -254,41 +365,61 @@ def insert_vector_casts(ast_node):
if type(condition_target_type) is not VectorType and type(result_target_type) is VectorType: if type(condition_target_type) is not VectorType and type(result_target_type) is VectorType:
condition_target_type = VectorType(condition_target_type, width=result_target_type.width) condition_target_type = VectorType(condition_target_type, width=result_target_type.width)
casted_results = [cast_func(a, result_target_type) if t != result_target_type else a casted_results = [CastFunc(a, result_target_type) if t != result_target_type else a
for a, t in zip(new_results, types_of_results)] for a, t in zip(new_results, types_of_results)]
casted_conditions = [cast_func(a, condition_target_type) casted_conditions = [CastFunc(a, condition_target_type)
if t != condition_target_type and a is not True else a if t != condition_target_type and a is not True else a
for a, t in zip(new_conditions, types_of_conditions)] for a, t in zip(new_conditions, types_of_conditions)]
return sp.Piecewise(*[(r, c) for r, c in zip(casted_results, casted_conditions)]) return sp.Piecewise(*[(r, c) for r, c in zip(casted_results, casted_conditions)])
else: elif isinstance(expr, TypedSymbol):
if force_vectorize:
expr_type = get_type_of_expression(expr)
if type(expr_type) is not VectorType:
vector_type = VectorType(expr_type, instruction_set['width'])
return CastFunc(expr, vector_type)
return expr
elif isinstance(expr, (sp.Number, BooleanAtom)):
return expr return expr
else:
raise NotImplementedError(f'Due to defensive programming we handle only specific expressions.\n'
f'The expression {expr} of type {type(expr)} is not known yet.')
def visit_node(node, substitution_dict): def visit_node(node, substitution_dict, default_type='double'):
substitution_dict = substitution_dict.copy() substitution_dict = substitution_dict.copy()
for arg in node.args: for arg in node.args:
if isinstance(arg, ast.SympyAssignment): if isinstance(arg, ast.SympyAssignment):
assignment = arg assignment = arg
# If there is a remainder loop we do not vectorise it, thus lhs will indicate this
# if isinstance(assignment.lhs, ast.ResolvedFieldAccess):
# continue
subs_expr = fast_subs(assignment.rhs, substitution_dict, subs_expr = fast_subs(assignment.rhs, substitution_dict,
skip=lambda e: isinstance(e, ast.ResolvedFieldAccess)) skip=lambda e: isinstance(e, ast.ResolvedFieldAccess))
assignment.rhs = visit_expr(subs_expr)
rhs_type = get_type_of_expression(assignment.rhs) # If either side contains a vectorized subexpression, both sides
# must be fully vectorized.
lhs_scalar = is_scalar(assignment.lhs)
rhs_scalar = is_scalar(subs_expr)
assignment.rhs = visit_expr(subs_expr, default_type, force_vectorize=not (lhs_scalar and rhs_scalar))
if isinstance(assignment.lhs, TypedSymbol): if isinstance(assignment.lhs, TypedSymbol):
lhs_type = assignment.lhs.dtype if lhs_scalar and not rhs_scalar:
if type(rhs_type) is VectorType and type(lhs_type) is not VectorType: lhs_type = get_type_of_expression(assignment.lhs)
rhs_type = get_type_of_expression(assignment.rhs)
new_lhs_type = VectorType(lhs_type, rhs_type.width) new_lhs_type = VectorType(lhs_type, rhs_type.width)
new_lhs = TypedSymbol(assignment.lhs.name, new_lhs_type) new_lhs = TypedSymbol(assignment.lhs.name, new_lhs_type)
substitution_dict[assignment.lhs] = new_lhs substitution_dict[assignment.lhs] = new_lhs
assignment.lhs = new_lhs assignment.lhs = new_lhs
elif isinstance(assignment.lhs, vector_memory_access): elif isinstance(assignment.lhs, VectorMemoryAccess):
assignment.lhs = visit_expr(assignment.lhs) assignment.lhs = visit_expr(assignment.lhs, default_type)
elif isinstance(arg, ast.Conditional): elif isinstance(arg, ast.Conditional):
arg.condition_expr = fast_subs(arg.condition_expr, substitution_dict, arg.condition_expr = fast_subs(arg.condition_expr, substitution_dict,
skip=lambda e: isinstance(e, ast.ResolvedFieldAccess)) skip=lambda e: isinstance(e, ast.ResolvedFieldAccess))
arg.condition_expr = visit_expr(arg.condition_expr) arg.condition_expr = visit_expr(arg.condition_expr, default_type)
visit_node(arg, substitution_dict) visit_node(arg, substitution_dict, default_type)
else: else:
visit_node(arg, substitution_dict) visit_node(arg, substitution_dict, default_type)
visit_node(ast_node, {}) visit_node(ast_node, {}, default_float_type)
import warnings
from typing import Tuple, Union from typing import Tuple, Union
from .datahandling_interface import DataHandling from .datahandling_interface import DataHandling
from ..enums import Target
from .serial_datahandling import SerialDataHandling from .serial_datahandling import SerialDataHandling
try: try:
...@@ -18,10 +21,10 @@ except ImportError: ...@@ -18,10 +21,10 @@ except ImportError:
def create_data_handling(domain_size: Tuple[int, ...], def create_data_handling(domain_size: Tuple[int, ...],
periodicity: Union[bool, Tuple[bool, ...]] = False, periodicity: Union[bool, Tuple[bool, ...]] = False,
default_layout: str = 'SoA', default_layout: str = 'SoA',
default_target: str = 'cpu', default_target: Target = Target.CPU,
parallel: bool = False, parallel: bool = False,
default_ghost_layers: int = 1, default_ghost_layers: int = 1,
opencl_queue=None) -> DataHandling: device_number: Union[int, None] = None) -> DataHandling:
"""Creates a data handling instance. """Creates a data handling instance.
Args: Args:
...@@ -29,12 +32,20 @@ def create_data_handling(domain_size: Tuple[int, ...], ...@@ -29,12 +32,20 @@ def create_data_handling(domain_size: Tuple[int, ...],
periodicity: either True, False for full or no periodicity or a tuple of booleans indicating periodicity periodicity: either True, False for full or no periodicity or a tuple of booleans indicating periodicity
for each coordinate for each coordinate
default_layout: default array layout, that is used if not explicitly specified in 'add_array' default_layout: default array layout, that is used if not explicitly specified in 'add_array'
default_target: either 'cpu' or 'gpu' default_target: `Target`
parallel: if True a parallel domain is created using walberla - each MPI process gets a part of the domain parallel: if True a parallel domain is created using walberla - each MPI process gets a part of the domain
default_ghost_layers: default number of ghost layers if not overwritten in 'add_array' default_ghost_layers: default number of ghost layers if not overwritten in 'add_array'
device_number: If `default_target` is set to 'GPU' and `parallel` is False, a device number should be
specified. If none is given, the device with the largest amount of memory is used. If multiple
devices have the same amount of memory, the one with the lower number is used
""" """
if isinstance(default_target, str):
new_target = Target[default_target.upper()]
warnings.warn(f'Target "{default_target}" as str is deprecated. Use {new_target} instead',
category=DeprecationWarning)
default_target = new_target
if parallel: if parallel:
assert not opencl_queue, "OpenCL is only supported for SerialDataHandling"
if wlb is None: if wlb is None:
raise ValueError("Cannot create parallel data handling because walberla module is not available") raise ValueError("Cannot create parallel data handling because walberla module is not available")
...@@ -63,7 +74,7 @@ def create_data_handling(domain_size: Tuple[int, ...], ...@@ -63,7 +74,7 @@ def create_data_handling(domain_size: Tuple[int, ...],
default_target=default_target, default_target=default_target,
default_layout=default_layout, default_layout=default_layout,
default_ghost_layers=default_ghost_layers, default_ghost_layers=default_ghost_layers,
opencl_queue=opencl_queue) device_number=device_number)
__all__ = ['create_data_handling'] __all__ = ['create_data_handling']
...@@ -115,7 +115,7 @@ class ParallelBlock(Block): ...@@ -115,7 +115,7 @@ class ParallelBlock(Block):
result = wlb.field.toArray(result, with_ghost_layers=self._gls) result = wlb.field.toArray(result, with_ghost_layers=self._gls)
result = self._normalize_array_shape(result) result = self._normalize_array_shape(result)
elif 'GpuField' in type_name: elif 'GpuField' in type_name:
result = wlb.cuda.toGpuArray(result, with_ghost_layers=self._gls) result = wlb.gpu.toGpuArray(result, with_ghost_layers=self._gls)
result = self._normalize_array_shape(result) result = self._normalize_array_shape(result)
return result return result
......
...@@ -3,6 +3,7 @@ from typing import Callable, Dict, Iterable, Optional, Sequence, Tuple, Union ...@@ -3,6 +3,7 @@ from typing import Callable, Dict, Iterable, Optional, Sequence, Tuple, Union
import numpy as np import numpy as np
from pystencils.enums import Target, Backend
from pystencils.field import Field, FieldType from pystencils.field import Field, FieldType
...@@ -16,10 +17,14 @@ class DataHandling(ABC): ...@@ -16,10 +17,14 @@ class DataHandling(ABC):
'gather' function that collects (parts of the) distributed data on a single process. 'gather' function that collects (parts of the) distributed data on a single process.
""" """
_GPU_LIKE_TARGETS = ['gpu', 'opencl'] _GPU_LIKE_TARGETS = [Target.GPU]
_GPU_LIKE_BACKENDS = ['gpucuda', 'opencl'] _GPU_LIKE_BACKENDS = [Backend.CUDA]
# ---------------------------- Adding and accessing data ----------------------------------------------------------- # ---------------------------- Adding and accessing data -----------------------------------------------------------
@property
@abstractmethod
def default_target(self) -> Target:
"""Target Enum indicating the target of the computation"""
@property @property
@abstractmethod @abstractmethod
...@@ -56,7 +61,7 @@ class DataHandling(ABC): ...@@ -56,7 +61,7 @@ class DataHandling(ABC):
layout: memory layout of array, either structure of arrays 'SoA' or array of structures 'AoS'. layout: memory layout of array, either structure of arrays 'SoA' or array of structures 'AoS'.
this is only important if values_per_cell > 1 this is only important if values_per_cell > 1
cpu: allocate field on the CPU cpu: allocate field on the CPU
gpu: allocate field on the GPU, if None, a GPU field is allocated if default_target is 'gpu' gpu: allocate field on the GPU, if None, a GPU field is allocated if default_target is 'GPU'
alignment: either False for no alignment, or the number of bytes to align to alignment: either False for no alignment, or the number of bytes to align to
Returns: Returns:
pystencils field, that can be used to formulate symbolic kernels pystencils field, that can be used to formulate symbolic kernels
...@@ -91,7 +96,7 @@ class DataHandling(ABC): ...@@ -91,7 +96,7 @@ class DataHandling(ABC):
layout: memory layout of array, either structure of arrays 'SoA' or array of structures 'AoS'. layout: memory layout of array, either structure of arrays 'SoA' or array of structures 'AoS'.
this is only important if values_per_cell > 1 this is only important if values_per_cell > 1
cpu: allocate field on the CPU cpu: allocate field on the CPU
gpu: allocate field on the GPU, if None, a GPU field is allocated if default_target is 'gpu' gpu: allocate field on the GPU, if None, a GPU field is allocated if default_target is 'GPU'
alignment: either False for no alignment, or the number of bytes to align to alignment: either False for no alignment, or the number of bytes to align to
Returns: Returns:
Fields representing the just created arrays Fields representing the just created arrays
...@@ -280,7 +285,7 @@ class DataHandling(ABC): ...@@ -280,7 +285,7 @@ class DataHandling(ABC):
names: what data to synchronize: name of array or sequence of names names: what data to synchronize: name of array or sequence of names
stencil: stencil as string defining which neighbors are synchronized e.g. 'D2Q9', 'D3Q19' stencil: stencil as string defining which neighbors are synchronized e.g. 'D2Q9', 'D3Q19'
if None, a full synchronization (i.e. D2Q9 or D3Q27) is done if None, a full synchronization (i.e. D2Q9 or D3Q27) is done
target: either 'cpu' or 'gpu' target: `Target`, either 'CPU' or 'GPU'
kwargs: implementation specific, optional optimization parameters for communication kwargs: implementation specific, optional optimization parameters for communication
Returns: Returns:
...@@ -326,6 +331,7 @@ class DataHandling(ABC): ...@@ -326,6 +331,7 @@ class DataHandling(ABC):
b[array_name][(Ellipsis, *value_idx)].fill(val) b[array_name][(Ellipsis, *value_idx)].fill(val)
else: else:
b[array_name].fill(val) b[array_name].fill(val)
self.to_gpu(array_name)
def min(self, array_name, slice_obj=None, ghost_layers=False, inner_ghost_layers=False, reduce=True): def min(self, array_name, slice_obj=None, ghost_layers=False, inner_ghost_layers=False, reduce=True):
"""Returns the minimum value inside the domain or slice of the domain. """Returns the minimum value inside the domain or slice of the domain.
......
...@@ -7,16 +7,18 @@ import waLBerla as wlb ...@@ -7,16 +7,18 @@ import waLBerla as wlb
from pystencils.datahandling.blockiteration import block_iteration, sliced_block_iteration from pystencils.datahandling.blockiteration import block_iteration, sliced_block_iteration
from pystencils.datahandling.datahandling_interface import DataHandling from pystencils.datahandling.datahandling_interface import DataHandling
from pystencils.enums import Backend
from pystencils.field import Field, FieldType from pystencils.field import Field, FieldType
from pystencils.kernelparameters import FieldPointerSymbol from pystencils.typing.typed_sympy import FieldPointerSymbol
from pystencils.utils import DotDict from pystencils.utils import DotDict
from pystencils import Target
class ParallelDataHandling(DataHandling): class ParallelDataHandling(DataHandling):
GPU_DATA_PREFIX = "gpu_" GPU_DATA_PREFIX = "gpu_"
VTK_COUNTER = 0 VTK_COUNTER = 0
def __init__(self, blocks, default_ghost_layers=1, default_layout='SoA', dim=3, default_target='cpu'): def __init__(self, blocks, default_ghost_layers=1, default_layout='SoA', dim=3, default_target=Target.CPU):
""" """
Creates data handling based on walberla block storage Creates data handling based on walberla block storage
...@@ -27,18 +29,19 @@ class ParallelDataHandling(DataHandling): ...@@ -27,18 +29,19 @@ class ParallelDataHandling(DataHandling):
dim: dimension of scenario, dim: dimension of scenario,
walberla always uses three dimensions, so if dim=2 the extend of the walberla always uses three dimensions, so if dim=2 the extend of the
z coordinate of blocks has to be 1 z coordinate of blocks has to be 1
default_target: either 'cpu' or 'gpu' . If set to 'gpu' for each array also a GPU version is allocated default_target: `Target`, either 'CPU' or 'GPU'. If set to 'GPU', a GPU version of each array is
if not overwritten in add_array, and synchronization functions are for the GPU by default also allocated unless overridden in add_array, and synchronization functions
default to the GPU
""" """
super(ParallelDataHandling, self).__init__() super(ParallelDataHandling, self).__init__()
assert dim in (2, 3) assert dim in (2, 3)
self.blocks = blocks self._blocks = blocks
self.default_ghost_layers = default_ghost_layers self._default_ghost_layers = default_ghost_layers
self.default_layout = default_layout self._default_layout = default_layout
self._fields = DotDict() # maps name to symbolic pystencils field self._fields = DotDict() # maps name to symbolic pystencils field
self._field_name_to_cpu_data_name = {} self._field_name_to_cpu_data_name = {}
self._field_name_to_gpu_data_name = {} self._field_name_to_gpu_data_name = {}
self.data_names = set() self._data_names = set()
self._dim = dim self._dim = dim
self._fieldInformation = {} self._fieldInformation = {}
self._cpu_gpu_pairs = [] self._cpu_gpu_pairs = []
...@@ -52,7 +55,11 @@ class ParallelDataHandling(DataHandling): ...@@ -52,7 +55,11 @@ class ParallelDataHandling(DataHandling):
if self._dim == 2: if self._dim == 2:
assert self.blocks.getDomainCellBB().size[2] == 1 assert self.blocks.getDomainCellBB().size[2] == 1
self.default_target = default_target self._default_target = default_target
@property
def default_target(self):
return self._default_target
@property @property
def dim(self): def dim(self):
...@@ -70,6 +77,22 @@ class ParallelDataHandling(DataHandling): ...@@ -70,6 +77,22 @@ class ParallelDataHandling(DataHandling):
def fields(self): def fields(self):
return self._fields return self._fields
@property
def blocks(self):
return self._blocks
@property
def default_ghost_layers(self):
return self._default_ghost_layers
@property
def default_layout(self):
return self._default_layout
@property
def data_names(self):
return self._data_names
def ghost_layers_of_field(self, name): def ghost_layers_of_field(self, name):
return self._fieldInformation[name]['ghost_layers'] return self._fieldInformation[name]['ghost_layers']
...@@ -94,7 +117,7 @@ class ParallelDataHandling(DataHandling): ...@@ -94,7 +117,7 @@ class ParallelDataHandling(DataHandling):
if ghost_layers is None: if ghost_layers is None:
ghost_layers = self.default_ghost_layers ghost_layers = self.default_ghost_layers
if gpu is None: if gpu is None:
gpu = self.default_target == 'gpu' gpu = self.default_target == Target.GPU
if layout is None: if layout is None:
layout = self.default_layout layout = self.default_layout
if len(self.blocks) == 0: if len(self.blocks) == 0:
...@@ -128,8 +151,8 @@ class ParallelDataHandling(DataHandling): ...@@ -128,8 +151,8 @@ class ParallelDataHandling(DataHandling):
if gpu: if gpu:
if alignment != 0: if alignment != 0:
raise ValueError("Alignment for walberla GPU fields not yet supported") raise ValueError("Alignment for walberla GPU fields not yet supported")
wlb.cuda.addGpuFieldToStorage(self.blocks, self.GPU_DATA_PREFIX + name, dtype, fSize=values_per_cell, wlb.gpu.addGpuFieldToStorage(self.blocks, self.GPU_DATA_PREFIX + name, dtype, fSize=values_per_cell,
usePitchedMem=False, ghostLayers=ghost_layers, layout=layout_map[layout]) usePitchedMem=False, ghostLayers=ghost_layers, layout=layout_map[layout])
if cpu and gpu: if cpu and gpu:
self._cpu_gpu_pairs.append((name, self.GPU_DATA_PREFIX + name)) self._cpu_gpu_pairs.append((name, self.GPU_DATA_PREFIX + name))
...@@ -230,9 +253,9 @@ class ParallelDataHandling(DataHandling): ...@@ -230,9 +253,9 @@ class ParallelDataHandling(DataHandling):
kernel_function(**arg_dict) kernel_function(**arg_dict)
def get_kernel_kwargs(self, kernel_function, **kwargs): def get_kernel_kwargs(self, kernel_function, **kwargs):
if kernel_function.ast.backend == 'gpucuda': if kernel_function.ast.backend == Backend.CUDA:
name_map = self._field_name_to_gpu_data_name name_map = self._field_name_to_gpu_data_name
to_array = wlb.cuda.toGpuArray to_array = wlb.gpu.toGpuArray
else: else:
name_map = self._field_name_to_cpu_data_name name_map = self._field_name_to_cpu_data_name
to_array = wlb.field.toArray to_array = wlb.field.toArray
...@@ -257,7 +280,8 @@ class ParallelDataHandling(DataHandling): ...@@ -257,7 +280,8 @@ class ParallelDataHandling(DataHandling):
for block in self.blocks: for block in self.blocks:
transfer_func(block[self.GPU_DATA_PREFIX + name], block[name]) transfer_func(block[self.GPU_DATA_PREFIX + name], block[name])
else: else:
wlb.cuda.copyFieldToCpu(self.blocks, self.GPU_DATA_PREFIX + name, name) if self.is_on_gpu(name):
wlb.gpu.copyFieldToCpu(self.blocks, self.GPU_DATA_PREFIX + name, name)
def to_gpu(self, name): def to_gpu(self, name):
if name in self._custom_data_transfer_functions: if name in self._custom_data_transfer_functions:
...@@ -265,28 +289,29 @@ class ParallelDataHandling(DataHandling): ...@@ -265,28 +289,29 @@ class ParallelDataHandling(DataHandling):
for block in self.blocks: for block in self.blocks:
transfer_func(block[self.GPU_DATA_PREFIX + name], block[name]) transfer_func(block[self.GPU_DATA_PREFIX + name], block[name])
else: else:
wlb.cuda.copyFieldToGpu(self.blocks, self.GPU_DATA_PREFIX + name, name) if self.is_on_gpu(name):
wlb.gpu.copyFieldToGpu(self.blocks, self.GPU_DATA_PREFIX + name, name)
def is_on_gpu(self, name): def is_on_gpu(self, name):
return (name, self.GPU_DATA_PREFIX + name) in self._cpu_gpu_pairs return (name, self.GPU_DATA_PREFIX + name) in self._cpu_gpu_pairs
def all_to_cpu(self): def all_to_cpu(self):
for cpu_name, gpu_name in self._cpu_gpu_pairs: for cpu_name, gpu_name in self._cpu_gpu_pairs:
wlb.cuda.copyFieldToCpu(self.blocks, gpu_name, cpu_name) wlb.gpu.copyFieldToCpu(self.blocks, gpu_name, cpu_name)
for name in self._custom_data_transfer_functions.keys(): for name in self._custom_data_transfer_functions.keys():
self.to_cpu(name) self.to_cpu(name)
def all_to_gpu(self): def all_to_gpu(self):
for cpu_name, gpu_name in self._cpu_gpu_pairs: for cpu_name, gpu_name in self._cpu_gpu_pairs:
wlb.cuda.copyFieldToGpu(self.blocks, gpu_name, cpu_name) wlb.gpu.copyFieldToGpu(self.blocks, gpu_name, cpu_name)
for name in self._custom_data_transfer_functions.keys(): for name in self._custom_data_transfer_functions.keys():
self.to_gpu(name) self.to_gpu(name)
def synchronization_function_cpu(self, names, stencil=None, buffered=True, stencil_restricted=False, **_): def synchronization_function_cpu(self, names, stencil=None, buffered=True, stencil_restricted=False, **_):
return self.synchronization_function(names, stencil, 'cpu', buffered, stencil_restricted) return self.synchronization_function(names, stencil, Target.CPU, buffered, stencil_restricted)
def synchronization_function_gpu(self, names, stencil=None, buffered=True, stencil_restricted=False, **_): def synchronization_function_gpu(self, names, stencil=None, buffered=True, stencil_restricted=False, **_):
return self.synchronization_function(names, stencil, 'gpu', buffered, stencil_restricted) return self.synchronization_function(names, stencil, Target.GPU, buffered, stencil_restricted)
def synchronization_function(self, names, stencil=None, target=None, buffered=True, stencil_restricted=False): def synchronization_function(self, names, stencil=None, target=None, buffered=True, stencil_restricted=False):
if target is None: if target is None:
...@@ -299,13 +324,13 @@ class ParallelDataHandling(DataHandling): ...@@ -299,13 +324,13 @@ class ParallelDataHandling(DataHandling):
names = [names] names = [names]
create_scheme = wlb.createUniformBufferedScheme if buffered else wlb.createUniformDirectScheme create_scheme = wlb.createUniformBufferedScheme if buffered else wlb.createUniformDirectScheme
if target == 'cpu': if target == Target.CPU:
create_packing = wlb.field.createPackInfo if buffered else wlb.field.createMPIDatatypeInfo create_packing = wlb.field.createPackInfo if buffered else wlb.field.createMPIDatatypeInfo
if buffered and stencil_restricted: if buffered and stencil_restricted:
create_packing = wlb.field.createStencilRestrictedPackInfo create_packing = wlb.field.createStencilRestrictedPackInfo
else: else:
assert target == 'gpu' assert target == Target.GPU
create_packing = wlb.cuda.createPackInfo if buffered else wlb.cuda.createMPIDatatypeInfo create_packing = wlb.gpu.createPackInfo if buffered else wlb.gpu.createMPIDatatypeInfo
names = [self.GPU_DATA_PREFIX + name for name in names] names = [self.GPU_DATA_PREFIX + name for name in names]
sync_function = create_scheme(self.blocks, stencil) sync_function = create_scheme(self.blocks, stencil)
......
...@@ -6,11 +6,10 @@ import numpy as np ...@@ -6,11 +6,10 @@ import numpy as np
from pystencils.datahandling.blockiteration import SerialBlock from pystencils.datahandling.blockiteration import SerialBlock
from pystencils.datahandling.datahandling_interface import DataHandling from pystencils.datahandling.datahandling_interface import DataHandling
from pystencils.datahandling.pycuda import PyCudaArrayHandler, PyCudaNotAvailableHandler from pystencils.enums import Target
from pystencils.datahandling.pyopencl import PyOpenClArrayHandler from pystencils.field import (Field, FieldType, create_numpy_array_with_layout,
from pystencils.field import ( layout_string_to_tuple, spatial_layout_string_to_tuple)
Field, FieldType, create_numpy_array_with_layout, layout_string_to_tuple, from pystencils.gpu.gpu_array_handler import GPUArrayHandler, GPUNotAvailableHandler
spatial_layout_string_to_tuple)
from pystencils.slicing import normalize_slice, remove_ghost_layers from pystencils.slicing import normalize_slice, remove_ghost_layers
from pystencils.utils import DotDict from pystencils.utils import DotDict
...@@ -22,10 +21,9 @@ class SerialDataHandling(DataHandling): ...@@ -22,10 +21,9 @@ class SerialDataHandling(DataHandling):
default_ghost_layers: int = 1, default_ghost_layers: int = 1,
default_layout: str = 'SoA', default_layout: str = 'SoA',
periodicity: Union[bool, Sequence[bool]] = False, periodicity: Union[bool, Sequence[bool]] = False,
default_target: str = 'cpu', default_target: Target = Target.CPU,
opencl_queue=None, array_handler=None,
opencl_ctx=None, device_number=None) -> None:
array_handler=None) -> None:
""" """
Creates a data handling for single node simulations. Creates a data handling for single node simulations.
...@@ -33,8 +31,17 @@ class SerialDataHandling(DataHandling): ...@@ -33,8 +31,17 @@ class SerialDataHandling(DataHandling):
domain_size: size of the spatial domain as tuple domain_size: size of the spatial domain as tuple
default_ghost_layers: default number of ghost layers used, if not overridden in add_array() method default_ghost_layers: default number of ghost layers used, if not overridden in add_array() method
default_layout: default layout used, if not overridden in add_array() method default_layout: default layout used, if not overridden in add_array() method
default_target: either 'cpu' or 'gpu' . If set to 'gpu' for each array also a GPU version is allocated periodicity: List of booleans that indicate which dimensions have periodic boundary conditions.
if not overwritten in add_array, and synchronization functions are for the GPU by default Alternatively, a single boolean can be given, which is used for all dimensions. Defaults to
False (non-periodic)
default_target: `Target`, either 'CPU' or 'GPU'. If set to 'GPU', a GPU version of each array is
also allocated unless overridden in add_array, and synchronization functions
default to the GPU
array_handler: An object that provides the same interface as `GPUArrayHandler`, which is used for creation
and transferring of GPU arrays. Default is to construct a fresh `GPUArrayHandler`
device_number: If `default_target` is set to 'GPU', a device number should be specified. If none is given,
the device with the largest amount of memory is used. If multiple devices have the same
amount of memory, the one with the lower number is used
""" """
super(SerialDataHandling, self).__init__() super(SerialDataHandling, self).__init__()
self._domainSize = tuple(domain_size) self._domainSize = tuple(domain_size)
...@@ -46,17 +53,17 @@ class SerialDataHandling(DataHandling): ...@@ -46,17 +53,17 @@ class SerialDataHandling(DataHandling):
self.custom_data_cpu = DotDict() self.custom_data_cpu = DotDict()
self.custom_data_gpu = DotDict() self.custom_data_gpu = DotDict()
self._custom_data_transfer_functions = {} self._custom_data_transfer_functions = {}
self._opencl_queue = opencl_queue
self._opencl_ctx = opencl_ctx
if not array_handler: if not array_handler:
try: try:
self.array_handler = PyCudaArrayHandler() if device_number is None:
except Exception: import cupy.cuda.runtime
self.array_handler = PyCudaNotAvailableHandler() if cupy.cuda.runtime.getDeviceCount() > 0:
device_number = sorted(range(cupy.cuda.runtime.getDeviceCount()),
if default_target == 'opencl' or opencl_queue: key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]
self.array_handler = PyOpenClArrayHandler(opencl_queue) self.array_handler = GPUArrayHandler(device_number)
except ImportError:
self.array_handler = GPUNotAvailableHandler()
else: else:
self.array_handler = array_handler self.array_handler = array_handler
...@@ -67,9 +74,13 @@ class SerialDataHandling(DataHandling): ...@@ -67,9 +74,13 @@ class SerialDataHandling(DataHandling):
self._periodicity = periodicity self._periodicity = periodicity
self._field_information = {} self._field_information = {}
self.default_target = default_target self._default_target = default_target
self._start_time = time.perf_counter() self._start_time = time.perf_counter()
@property
def default_target(self):
return self._default_target
@property @property
def dim(self): def dim(self):
return len(self._domainSize) return len(self._domainSize)
...@@ -107,7 +118,7 @@ class SerialDataHandling(DataHandling): ...@@ -107,7 +118,7 @@ class SerialDataHandling(DataHandling):
} }
if not hasattr(values_per_cell, '__len__'): if not hasattr(values_per_cell, '__len__'):
values_per_cell = (values_per_cell, ) values_per_cell = (values_per_cell,)
if len(values_per_cell) == 1 and values_per_cell[0] == 1: if len(values_per_cell) == 1 and values_per_cell[0] == 1:
values_per_cell = () values_per_cell = ()
...@@ -128,10 +139,14 @@ class SerialDataHandling(DataHandling): ...@@ -128,10 +139,14 @@ class SerialDataHandling(DataHandling):
else: else:
layout_tuple = spatial_layout_string_to_tuple(layout, self.dim) layout_tuple = spatial_layout_string_to_tuple(layout, self.dim)
# cpu_arr is always created - since there is no create_pycuda_array_with_layout() # cpu_arr is always created - since there is no create_gpu_array_with_layout()
byte_offset = ghost_layers * np.dtype(dtype).itemsize byte_offset = ghost_layers * np.dtype(dtype).itemsize
cpu_arr = create_numpy_array_with_layout(layout=layout_tuple, alignment=alignment,
byte_offset=byte_offset, **kwargs) if gpu:
cpu_arr = self.array_handler.pinned_numpy_array(shape=kwargs['shape'], layout=layout_tuple, dtype=dtype)
else:
cpu_arr = create_numpy_array_with_layout(layout=layout_tuple, alignment=alignment,
byte_offset=byte_offset, **kwargs)
if alignment and gpu: if alignment and gpu:
raise NotImplementedError("Alignment for GPU fields not supported") raise NotImplementedError("Alignment for GPU fields not supported")
...@@ -253,30 +268,30 @@ class SerialDataHandling(DataHandling): ...@@ -253,30 +268,30 @@ class SerialDataHandling(DataHandling):
transfer_func = self._custom_data_transfer_functions[name][1] transfer_func = self._custom_data_transfer_functions[name][1]
transfer_func(self.custom_data_gpu[name], self.custom_data_cpu[name]) transfer_func(self.custom_data_gpu[name], self.custom_data_cpu[name])
else: else:
self.array_handler.download(self.gpu_arrays[name], self.cpu_arrays[name]) if name in self.cpu_arrays.keys() & self.gpu_arrays.keys():
self.array_handler.download(self.gpu_arrays[name], self.cpu_arrays[name])
def to_gpu(self, name): def to_gpu(self, name):
if name in self._custom_data_transfer_functions: if name in self._custom_data_transfer_functions:
transfer_func = self._custom_data_transfer_functions[name][0] transfer_func = self._custom_data_transfer_functions[name][0]
transfer_func(self.custom_data_gpu[name], self.custom_data_cpu[name]) transfer_func(self.custom_data_gpu[name], self.custom_data_cpu[name])
else: else:
self.array_handler.upload(self.gpu_arrays[name], self.cpu_arrays[name]) if name in self.cpu_arrays.keys() & self.gpu_arrays.keys():
self.array_handler.upload(self.gpu_arrays[name], self.cpu_arrays[name])
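The intersection cpu_arrays.keys() & gpu_arrays.keys() turns to_cpu/to_gpu into no-ops for fields that exist on only one side, so callers may synchronise unconditionally. A usage sketch (constructor and add_array arguments abbreviated; exact signatures may differ):

dh = SerialDataHandling(domain_size=(64, 64), default_target=Target.GPU)
dh.add_array('f', gpu=True)  # allocates both the CPU mirror and the GPU array
dh.to_gpu('f')               # uploads, since 'f' is present on both sides
# ... run GPU kernels on dh.gpu_arrays['f'] ...
dh.to_cpu('f')               # downloads the result for host-side processing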
def is_on_gpu(self, name): def is_on_gpu(self, name):
return name in self.gpu_arrays return name in self.gpu_arrays
def synchronization_function_cpu(self, names, stencil_name=None, **_): def synchronization_function_cpu(self, names, stencil_name=None, **_):
return self.synchronization_function(names, stencil_name, target='cpu') return self.synchronization_function(names, stencil_name, target=Target.CPU)
def synchronization_function_gpu(self, names, stencil_name=None, **_): def synchronization_function_gpu(self, names, stencil_name=None, **_):
return self.synchronization_function(names, stencil_name, target='gpu') return self.synchronization_function(names, stencil_name, target=Target.GPU)
def synchronization_function(self, names, stencil=None, target=None, functor=None, **_): def synchronization_function(self, names, stencil=None, target=None, functor=None, **_):
if target is None: if target is None:
target = self.default_target target = self.default_target
if target == 'opencl': assert target in (Target.CPU, Target.GPU)
target = 'gpu'
assert target in ('cpu', 'gpu')
if not hasattr(names, '__len__') or type(names) is str: if not hasattr(names, '__len__') or type(names) is str:
names = [names] names = [names]
...@@ -305,30 +320,28 @@ class SerialDataHandling(DataHandling): ...@@ -305,30 +320,28 @@ class SerialDataHandling(DataHandling):
gls = self._field_information[name]['ghost_layers'] gls = self._field_information[name]['ghost_layers']
values_per_cell = self._field_information[name]['values_per_cell'] values_per_cell = self._field_information[name]['values_per_cell']
if values_per_cell == (): if values_per_cell == ():
values_per_cell = (1, ) values_per_cell = (1,)
if len(values_per_cell) == 1: if len(values_per_cell) == 1:
values_per_cell = values_per_cell[0] values_per_cell = values_per_cell[0]
if len(filtered_stencil) > 0: if len(filtered_stencil) > 0:
if target == 'cpu': if target == Target.CPU:
if functor is None: if functor is None:
from pystencils.slicing import get_periodic_boundary_functor from pystencils.slicing import get_periodic_boundary_functor
functor = get_periodic_boundary_functor functor = get_periodic_boundary_functor
result.append(functor(filtered_stencil, ghost_layers=gls)) result.append(functor(filtered_stencil, ghost_layers=gls))
else: else:
if functor is None: if functor is None:
from pystencils.gpucuda.periodicity import get_periodic_boundary_functor as functor from pystencils.gpu.periodicity import get_periodic_boundary_functor as functor
target = 'gpu' if not isinstance(self.array_handler, PyOpenClArrayHandler) else 'opencl' target = Target.GPU
result.append(functor(filtered_stencil, self._domainSize, result.append(functor(filtered_stencil, self._domainSize,
index_dimensions=self.fields[name].index_dimensions, index_dimensions=self.fields[name].index_dimensions,
index_dim_shape=values_per_cell, index_dim_shape=values_per_cell,
dtype=self.fields[name].dtype.numpy_dtype, dtype=self.fields[name].dtype.numpy_dtype,
ghost_layers=gls, ghost_layers=gls,
target=target, target=target))
opencl_queue=self._opencl_queue,
opencl_ctx=self._opencl_ctx))
if target == 'cpu': if target == Target.CPU:
def result_functor(): def result_functor():
for arr_name, func in zip(names, result): for arr_name, func in zip(names, result):
func(pdfs=self.cpu_arrays[arr_name]) func(pdfs=self.cpu_arrays[arr_name])
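The returned functor performs the periodic ghost-layer exchange for all requested fields in one call. A usage sketch, assuming a periodic field 'f' was registered beforehand:

dh = SerialDataHandling(domain_size=(32, 32), periodicity=(True, True))
dh.add_array('f')
sync_f = dh.synchronization_function(['f'], target=Target.CPU)
sync_f()  # copies opposite boundary slices into the ghost layers of 'f'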
...@@ -379,6 +392,7 @@ class SerialDataHandling(DataHandling): ...@@ -379,6 +392,7 @@ class SerialDataHandling(DataHandling):
raise NotImplementedError("VTK export for fields with more than one index " raise NotImplementedError("VTK export for fields with more than one index "
"coordinate not implemented") "coordinate not implemented")
image_to_vtk(full_file_name, cell_data=cell_data) image_to_vtk(full_file_name, cell_data=cell_data)
return writer return writer
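With the added return statement the writer can be used directly; a usage sketch (output path and step-based file naming are illustrative assumptions):

write_vtk = dh.create_vtk_writer('output/fields', ['f'])
for step in range(100):
    # ... advance the simulation ...
    if step % 10 == 0:
        write_vtk(step)  # emits one VTK image file per call, indexed by step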
def create_vtk_writer_for_flag_array(self, file_name, data_name, masks_to_name, ghost_layers=False): def create_vtk_writer_for_flag_array(self, file_name, data_name, masks_to_name, ghost_layers=False):
...@@ -424,13 +438,19 @@ class SerialDataHandling(DataHandling): ...@@ -424,13 +438,19 @@ class SerialDataHandling(DataHandling):
def world_rank(self): def world_rank(self):
return 0 return 0
def save_all(self, file): def save_all(self, filename, compressed=True, synchronise_data=True):
np.savez_compressed(file, **self.cpu_arrays) if synchronise_data:
for name in (self.cpu_arrays.keys() & self.gpu_arrays.keys()):
self.to_cpu(name)
if compressed:
np.savez_compressed(filename, **self.cpu_arrays)
else:
np.savez(filename, **self.cpu_arrays)
def load_all(self, file): def load_all(self, filename, synchronise_data=True):
if '.npz' not in file: if not filename.endswith('.npz'):
file += '.npz' filename += '.npz'
file_contents = np.load(file) file_contents = np.load(filename)
for arr_name, arr_contents in self.cpu_arrays.items(): for arr_name, arr_contents in self.cpu_arrays.items():
if arr_name not in file_contents: if arr_name not in file_contents:
print(f"Skipping read data {arr_name} because there is no data with this name in data handling") print(f"Skipping read data {arr_name} because there is no data with this name in data handling")
...@@ -440,3 +460,6 @@ class SerialDataHandling(DataHandling): ...@@ -440,3 +460,6 @@ class SerialDataHandling(DataHandling):
f"Read array shape {file_contents[arr_name].shape}, existing array shape {arr_contents.shape}") f"Read array shape {file_contents[arr_name].shape}, existing array shape {arr_contents.shape}")
continue continue
np.copyto(arr_contents, file_contents[arr_name]) np.copyto(arr_contents, file_contents[arr_name])
if synchronise_data:
if arr_name in self.gpu_arrays.keys():
self.to_gpu(arr_name)
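With synchronise_data=True a checkpoint always reflects the freshest copy of each field: GPU-resident arrays are downloaded before saving and re-uploaded after loading. A round-trip sketch:

dh.save_all('checkpoint')   # downloads GPU fields first, then writes checkpoint.npz
# ... later, in a fresh run with identically named and shaped fields ...
dh.load_all('checkpoint')   # restores the CPU arrays and re-uploads GPU copies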
...@@ -3,13 +3,19 @@ from typing import Any, Dict, Optional, Union ...@@ -3,13 +3,19 @@ from typing import Any, Dict, Optional, Union
import sympy as sp import sympy as sp
from pystencils.astnodes import KernelFunction from pystencils.astnodes import KernelFunction
from pystencils.enums import Backend
from pystencils.kernel_wrapper import KernelWrapper from pystencils.kernel_wrapper import KernelWrapper
def to_dot(expr: sp.Expr, graph_style: Optional[Dict[str, Any]] = None, short=True): def to_dot(expr: sp.Expr, graph_style: Optional[Dict[str, Any]] = None, short=True):
"""Show a sympy or pystencils AST as dot graph""" """Show a sympy or pystencils AST as dot graph"""
from pystencils.astnodes import Node from pystencils.astnodes import Node
import graphviz try:
import graphviz
except ImportError:
print("graphviz is not installed. Visualizing the AST is not available")
return
graph_style = {} if graph_style is None else graph_style graph_style = {} if graph_style is None else graph_style
if isinstance(expr, Node): if isinstance(expr, Node):
...@@ -45,12 +51,9 @@ def get_code_obj(ast: Union[KernelFunction, KernelWrapper], custom_backend=None) ...@@ -45,12 +51,9 @@ def get_code_obj(ast: Union[KernelFunction, KernelWrapper], custom_backend=None)
if isinstance(ast, KernelWrapper): if isinstance(ast, KernelWrapper):
ast = ast.ast ast = ast.ast
if ast.backend == 'gpucuda': if ast.backend not in {Backend.C, Backend.CUDA}:
dialect = 'cuda' raise NotImplementedError(f'get_code_obj is not implemented for backend {ast.backend}')
elif ast.backend == 'opencl': dialect = ast.backend
dialect = 'opencl'
else:
dialect = 'c'
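Unsupported backends now fail loudly instead of silently falling back to the C dialect. A usage sketch (kernel construction abbreviated; the create_kernel arguments shown are assumptions):

import pystencils as ps

ast = ps.create_kernel(assignments, target=ps.Target.GPU)
code = get_code_obj(ast)  # renders the generated CUDA code, e.g. in a notebook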
class CodeDisplay: class CodeDisplay:
def __init__(self, ast_input): def __init__(self, ast_input):
......
from enum import Enum, auto
class Target(Enum):
"""
The Target enumeration represents all possible targets that can be used for code generation.
"""
CPU = auto()
"""
Target CPU architecture.
"""
GPU = auto()
"""
Target GPU architecture.
"""
class Backend(Enum):
"""
The Backend enumeration represents all possible backends that can be used for code generation.
Backends and targets must be combined with care. For example, CPU as a target and CUDA as a backend makes no sense.
"""
C = auto()
"""
Use the C Backend of pystencils.
"""
CUDA = auto()
"""
Use the CUDA backend to generate code for NVIDIA GPUs.
"""
...@@ -4,20 +4,30 @@ import sympy as sp ...@@ -4,20 +4,30 @@ import sympy as sp
from pystencils.astnodes import Node from pystencils.astnodes import Node
from pystencils.simp import AssignmentCollection from pystencils.simp import AssignmentCollection
from pystencils.assignment import Assignment
# noinspection PyPep8Naming # noinspection PyPep8Naming
class fast_division(sp.Function): class fast_division(sp.Function):
"""
Produces a fast approximate float division instruction for CUDA kernels
"""
nargs = (2,) nargs = (2,)
# noinspection PyPep8Naming # noinspection PyPep8Naming
class fast_sqrt(sp.Function): class fast_sqrt(sp.Function):
"""
Produces a fast float square root instruction for CUDA kernels
"""
nargs = (1, ) nargs = (1, )
# noinspection PyPep8Naming # noinspection PyPep8Naming
class fast_inv_sqrt(sp.Function): class fast_inv_sqrt(sp.Function):
"""
Produces a fast float reciprocal square root instruction for CUDA kernels
"""
nargs = (1, ) nargs = (1, )
...@@ -32,7 +42,7 @@ def _run(term, visitor): ...@@ -32,7 +42,7 @@ def _run(term, visitor):
return visitor(term) return visitor(term)
def insert_fast_sqrts(term: Union[sp.Expr, List[sp.Expr], AssignmentCollection]): def insert_fast_sqrts(term: Union[sp.Expr, List[sp.Expr], AssignmentCollection, Assignment]):
def visit(expr): def visit(expr):
if isinstance(expr, Node): if isinstance(expr, Node):
return expr return expr
...@@ -48,7 +58,7 @@ def insert_fast_sqrts(term: Union[sp.Expr, List[sp.Expr], AssignmentCollection]) ...@@ -48,7 +58,7 @@ def insert_fast_sqrts(term: Union[sp.Expr, List[sp.Expr], AssignmentCollection])
return _run(term, visit) return _run(term, visit)
def insert_fast_divisions(term: Union[sp.Expr, List[sp.Expr], AssignmentCollection]): def insert_fast_divisions(term: Union[sp.Expr, List[sp.Expr], AssignmentCollection, Assignment]):
def visit(expr): def visit(expr):
if isinstance(expr, Node): if isinstance(expr, Node):
......
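The insert_* helpers rewrite exact operations into their fast-math counterparts before code generation. A sketch of the expected effect on plain sympy expressions, assuming the visitors match square roots and divisions as their names suggest:

import sympy as sp

x, y = sp.symbols('x y')
insert_fast_sqrts(sp.sqrt(x))      # -> fast_sqrt(x)
insert_fast_sqrts(1 / sp.sqrt(x))  # -> fast_inv_sqrt(x)
insert_fast_divisions(x / y)       # -> fast_division(x, y)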
...@@ -228,9 +228,10 @@ class FiniteDifferenceStaggeredStencilDerivation: ...@@ -228,9 +228,10 @@ class FiniteDifferenceStaggeredStencilDerivation:
neighbor: the neighbor direction string or vector at whose staggered position to calculate the derivative neighbor: the neighbor direction string or vector at whose staggered position to calculate the derivative
dim: how many dimensions (2 or 3) dim: how many dimensions (2 or 3)
derivative: a tuple of directions over which to perform derivatives derivative: a tuple of directions over which to perform derivatives
free_weights_prefix: a string used to prefix the names of free weight symbols. If None, no free weights are returned; they are instead fixed to yield the sparsest stencil.
""" """
def __init__(self, neighbor, dim, derivative=tuple()): def __init__(self, neighbor, dim, derivative=tuple(), free_weights_prefix=None):
if type(neighbor) is str: if type(neighbor) is str:
neighbor = direction_string_to_offset(neighbor) neighbor = direction_string_to_offset(neighbor)
if dim == 2: if dim == 2:
...@@ -260,7 +261,7 @@ class FiniteDifferenceStaggeredStencilDerivation: ...@@ -260,7 +261,7 @@ class FiniteDifferenceStaggeredStencilDerivation:
main_points = [neighbor / 2, neighbor / -2, flipped(neighbor / 2, nonzero_indices[0]), main_points = [neighbor / 2, neighbor / -2, flipped(neighbor / 2, nonzero_indices[0]),
flipped(neighbor / -2, nonzero_indices[0])] flipped(neighbor / -2, nonzero_indices[0])]
else: else:
main_points = [neighbor.multiply_elementwise(sp.Matrix(c) / 2) main_points = [sp.Matrix(np.multiply(neighbor, sp.Matrix(c) / 2))
for c in itertools.product([-1, 1], repeat=3)] for c in itertools.product([-1, 1], repeat=3)]
points += main_points points += main_points
zero_indices = [i for i, v in enumerate(neighbor) if v == 0 and i < dim] zero_indices = [i for i, v in enumerate(neighbor) if v == 0 and i < dim]
...@@ -281,7 +282,10 @@ class FiniteDifferenceStaggeredStencilDerivation: ...@@ -281,7 +282,10 @@ class FiniteDifferenceStaggeredStencilDerivation:
# if the weights are underdefined, we can choose the free symbols to find the sparsest stencil # if the weights are underdefined, we can choose the free symbols to find the sparsest stencil
free_weights = set(itertools.chain(*[w.free_symbols for w in weights])) free_weights = set(itertools.chain(*[w.free_symbols for w in weights]))
if len(free_weights) > 0: if free_weights_prefix is not None:
weights = [w.subs({fw: sp.Symbol(f"{free_weights_prefix}_{i}") for i, fw in enumerate(free_weights)})
for w in weights]
elif len(free_weights) > 0:
zero_counts = defaultdict(list) zero_counts = defaultdict(list)
for values in itertools.product([-1, -sp.Rational(1, 2), 0, 1, sp.Rational(1, 2)], for values in itertools.product([-1, -sp.Rational(1, 2), 0, 1, sp.Rational(1, 2)],
repeat=len(free_weights)): repeat=len(free_weights)):
......
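With free_weights_prefix set, underdetermined weights stay symbolic under predictable names instead of being fixed to the sparsest stencil. A constructor-level sketch (import path assumed):

from pystencils.fd.derivation import FiniteDifferenceStaggeredStencilDerivation

# d/dx evaluated at the staggered position towards the eastern neighbor, in 2D
d_dx = FiniteDifferenceStaggeredStencilDerivation('E', dim=2, derivative=(0,))

# same derivative, but keep any free weights as symbols w_0, w_1, ...
d_dx_sym = FiniteDifferenceStaggeredStencilDerivation('E', dim=2, derivative=(0,),
                                                      free_weights_prefix='w')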