Compare revisions

Showing 977 additions and 314 deletions
pystencils/datahandling/parallel_datahandling.py

@@ -9,7 +9,7 @@ from pystencils.datahandling.blockiteration import block_iteration, sliced_block
 from pystencils.datahandling.datahandling_interface import DataHandling
 from pystencils.enums import Backend
 from pystencils.field import Field, FieldType
-from pystencils.kernelparameters import FieldPointerSymbol
+from pystencils.typing.typed_sympy import FieldPointerSymbol
 from pystencils.utils import DotDict
 from pystencils import Target

@@ -151,8 +151,8 @@ class ParallelDataHandling(DataHandling):
         if gpu:
             if alignment != 0:
                 raise ValueError("Alignment for walberla GPU fields not yet supported")
-            wlb.cuda.addGpuFieldToStorage(self.blocks, self.GPU_DATA_PREFIX + name, dtype, fSize=values_per_cell,
-                                          usePitchedMem=False, ghostLayers=ghost_layers, layout=layout_map[layout])
+            wlb.gpu.addGpuFieldToStorage(self.blocks, self.GPU_DATA_PREFIX + name, dtype, fSize=values_per_cell,
+                                         usePitchedMem=False, ghostLayers=ghost_layers, layout=layout_map[layout])
         if cpu and gpu:
             self._cpu_gpu_pairs.append((name, self.GPU_DATA_PREFIX + name))

@@ -255,7 +255,7 @@ class ParallelDataHandling(DataHandling):
     def get_kernel_kwargs(self, kernel_function, **kwargs):
         if kernel_function.ast.backend == Backend.CUDA:
             name_map = self._field_name_to_gpu_data_name
-            to_array = wlb.cuda.toGpuArray
+            to_array = wlb.gpu.toGpuArray
         else:
             name_map = self._field_name_to_cpu_data_name
             to_array = wlb.field.toArray

@@ -280,7 +280,8 @@ class ParallelDataHandling(DataHandling):
             for block in self.blocks:
                 transfer_func(block[self.GPU_DATA_PREFIX + name], block[name])
         else:
-            wlb.cuda.copyFieldToCpu(self.blocks, self.GPU_DATA_PREFIX + name, name)
+            if self.is_on_gpu(name):
+                wlb.gpu.copyFieldToCpu(self.blocks, self.GPU_DATA_PREFIX + name, name)

@@ -288,20 +289,21 @@ class ParallelDataHandling(DataHandling):
     def to_gpu(self, name):
         if name in self._custom_data_transfer_functions:
             for block in self.blocks:
                 transfer_func(block[self.GPU_DATA_PREFIX + name], block[name])
         else:
-            wlb.cuda.copyFieldToGpu(self.blocks, self.GPU_DATA_PREFIX + name, name)
+            if self.is_on_gpu(name):
+                wlb.gpu.copyFieldToGpu(self.blocks, self.GPU_DATA_PREFIX + name, name)

     def is_on_gpu(self, name):
         return (name, self.GPU_DATA_PREFIX + name) in self._cpu_gpu_pairs

     def all_to_cpu(self):
         for cpu_name, gpu_name in self._cpu_gpu_pairs:
-            wlb.cuda.copyFieldToCpu(self.blocks, gpu_name, cpu_name)
+            wlb.gpu.copyFieldToCpu(self.blocks, gpu_name, cpu_name)
         for name in self._custom_data_transfer_functions.keys():
             self.to_cpu(name)

     def all_to_gpu(self):
         for cpu_name, gpu_name in self._cpu_gpu_pairs:
-            wlb.cuda.copyFieldToGpu(self.blocks, gpu_name, cpu_name)
+            wlb.gpu.copyFieldToGpu(self.blocks, gpu_name, cpu_name)
         for name in self._custom_data_transfer_functions.keys():
             self.to_gpu(name)

@@ -328,7 +330,7 @@ class ParallelDataHandling(DataHandling):
             create_packing = wlb.field.createStencilRestrictedPackInfo
         else:
             assert target == Target.GPU
-            create_packing = wlb.cuda.createPackInfo if buffered else wlb.cuda.createMPIDatatypeInfo
+            create_packing = wlb.gpu.createPackInfo if buffered else wlb.gpu.createMPIDatatypeInfo
             names = [self.GPU_DATA_PREFIX + name for name in names]
         sync_function = create_scheme(self.blocks, stencil)
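The pattern these hunks converge on: every transfer goes through the renamed wlb.gpu module, and per-field transfers are now guarded by is_on_gpu. A hypothetical round trip (requires a waLBerla build with its Python bindings; 'dh' stands for a ParallelDataHandling instance created elsewhere with GPU fields):

    # hypothetical usage; 'dh' is a ParallelDataHandling with fields added via gpu=True
    dh.to_gpu('phi')    # silently skipped unless 'phi' is registered as a cpu/gpu pair
    # ... run GPU kernels ...
    dh.all_to_cpu()     # wlb.gpu.copyFieldToCpu for every registered pair, then custom transfers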
pystencils/datahandling/serial_datahandling.py

@@ -6,12 +6,10 @@ import numpy as np
 from pystencils.datahandling.blockiteration import SerialBlock
 from pystencils.datahandling.datahandling_interface import DataHandling
-from pystencils.datahandling.pycuda import PyCudaArrayHandler, PyCudaNotAvailableHandler
-from pystencils.datahandling.pyopencl import PyOpenClArrayHandler
 from pystencils.enums import Target
-from pystencils.field import (
-    Field, FieldType, create_numpy_array_with_layout, layout_string_to_tuple,
-    spatial_layout_string_to_tuple)
+from pystencils.field import (Field, FieldType, create_numpy_array_with_layout,
+                              layout_string_to_tuple, spatial_layout_string_to_tuple)
+from pystencils.gpu.gpu_array_handler import GPUArrayHandler, GPUNotAvailableHandler
 from pystencils.slicing import normalize_slice, remove_ghost_layers
 from pystencils.utils import DotDict

@@ -24,9 +22,8 @@ class SerialDataHandling(DataHandling):
                  default_layout: str = 'SoA',
                  periodicity: Union[bool, Sequence[bool]] = False,
                  default_target: Target = Target.CPU,
-                 opencl_queue=None,
-                 opencl_ctx=None,
-                 array_handler=None) -> None:
+                 array_handler=None,
+                 device_number=None) -> None:
         """
         Creates a data handling for single node simulations.

@@ -34,9 +31,17 @@ class SerialDataHandling(DataHandling):
             domain_size: size of the spatial domain as tuple
             default_ghost_layers: default number of ghost layers used, if not overridden in add_array() method
             default_layout: default layout used, if not overridden in add_array() method
+            periodicity: List of booleans that indicate which dimensions have periodic boundary conditions.
+                         Alternatively, a single boolean can be given, which is used for all dimensions. Defaults to
+                         False (non-periodic)
             default_target: `Target` either 'CPU' or 'GPU'. If set to 'GPU' for each array also a GPU version is
                             allocated if not overwritten in add_array, and synchronization functions are for the GPU by
                             default
+            array_handler: An object that provides the same interface as `GPUArrayHandler`, which is used for creation
+                           and transferring of GPU arrays. Default is to construct a fresh `GPUArrayHandler`
+            device_number: If `default_target` is set to 'GPU', a device number should be specified. If none is given,
+                           the device with the largest amount of memory is used. If multiple devices have the same
+                           amount of memory, the one with the lower number is used
         """
         super(SerialDataHandling, self).__init__()
         self._domainSize = tuple(domain_size)

@@ -48,17 +53,17 @@ class SerialDataHandling(DataHandling):
         self.custom_data_cpu = DotDict()
         self.custom_data_gpu = DotDict()
         self._custom_data_transfer_functions = {}
-        self._opencl_queue = opencl_queue
-        self._opencl_ctx = opencl_ctx

         if not array_handler:
             try:
-                self.array_handler = PyCudaArrayHandler()
-            except Exception:
-                self.array_handler = PyCudaNotAvailableHandler()
-
-            if default_target == Target.OPENCL or opencl_queue:
-                self.array_handler = PyOpenClArrayHandler(opencl_queue)
+                if device_number is None:
+                    import cupy.cuda.runtime
+                    if cupy.cuda.runtime.getDeviceCount() > 0:
+                        device_number = sorted(range(cupy.cuda.runtime.getDeviceCount()),
+                                               key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]
+                self.array_handler = GPUArrayHandler(device_number)
+            except ImportError:
+                self.array_handler = GPUNotAvailableHandler()
         else:
             self.array_handler = array_handler

@@ -134,10 +139,14 @@ class SerialDataHandling(DataHandling):
         else:
             layout_tuple = spatial_layout_string_to_tuple(layout, self.dim)

-        # cpu_arr is always created - since there is no create_pycuda_array_with_layout()
+        # cpu_arr is always created - since there is no create_gpu_array_with_layout()
         byte_offset = ghost_layers * np.dtype(dtype).itemsize
-        cpu_arr = create_numpy_array_with_layout(layout=layout_tuple, alignment=alignment,
-                                                 byte_offset=byte_offset, **kwargs)
+
+        if gpu:
+            cpu_arr = self.array_handler.pinned_numpy_array(shape=kwargs['shape'], layout=layout_tuple, dtype=dtype)
+        else:
+            cpu_arr = create_numpy_array_with_layout(layout=layout_tuple, alignment=alignment,
+                                                     byte_offset=byte_offset, **kwargs)

         if alignment and gpu:
             raise NotImplementedError("Alignment for GPU fields not supported")

@@ -259,14 +268,16 @@ class SerialDataHandling(DataHandling):
             transfer_func = self._custom_data_transfer_functions[name][1]
             transfer_func(self.custom_data_gpu[name], self.custom_data_cpu[name])
         else:
-            self.array_handler.download(self.gpu_arrays[name], self.cpu_arrays[name])
+            if name in self.cpu_arrays.keys() & self.gpu_arrays.keys():
+                self.array_handler.download(self.gpu_arrays[name], self.cpu_arrays[name])

     def to_gpu(self, name):
         if name in self._custom_data_transfer_functions:
             transfer_func = self._custom_data_transfer_functions[name][0]
             transfer_func(self.custom_data_gpu[name], self.custom_data_cpu[name])
         else:
-            self.array_handler.upload(self.gpu_arrays[name], self.cpu_arrays[name])
+            if name in self.cpu_arrays.keys() & self.gpu_arrays.keys():
+                self.array_handler.upload(self.gpu_arrays[name], self.cpu_arrays[name])

     def is_on_gpu(self, name):
         return name in self.gpu_arrays

@@ -280,8 +291,6 @@ class SerialDataHandling(DataHandling):
     def synchronization_function(self, names, stencil=None, target=None, functor=None, **_):
         if target is None:
             target = self.default_target
-        if target == Target.OPENCL:  # TODO potential misuse between Target and Backend
-            target = Target.GPU
         assert target in (Target.CPU, Target.GPU)
         if not hasattr(names, '__len__') or type(names) is str:
             names = [names]

@@ -323,17 +332,14 @@ class SerialDataHandling(DataHandling):
                 result.append(functor(filtered_stencil, ghost_layers=gls))
         else:
             if functor is None:
-                from pystencils.gpucuda.periodicity import get_periodic_boundary_functor as functor
-                target = Target.GPU if not isinstance(self.array_handler,
-                                                      PyOpenClArrayHandler) else Target.OPENCL
+                from pystencils.gpu.periodicity import get_periodic_boundary_functor as functor
+                target = Target.GPU
             result.append(functor(filtered_stencil, self._domainSize,
                                   index_dimensions=self.fields[name].index_dimensions,
                                   index_dim_shape=values_per_cell,
                                   dtype=self.fields[name].dtype.numpy_dtype,
                                   ghost_layers=gls,
-                                  target=target,
-                                  opencl_queue=self._opencl_queue,
-                                  opencl_ctx=self._opencl_ctx))
+                                  target=target))

         if target == Target.CPU:
             def result_functor():

@@ -432,13 +438,19 @@ class SerialDataHandling(DataHandling):
     def world_rank(self):
         return 0

-    def save_all(self, file):
-        np.savez_compressed(file, **self.cpu_arrays)
+    def save_all(self, filename, compressed=True, synchronise_data=True):
+        if synchronise_data:
+            for name in (self.cpu_arrays.keys() & self.gpu_arrays.keys()):
+                self.to_cpu(name)
+        if compressed:
+            np.savez_compressed(filename, **self.cpu_arrays)
+        else:
+            np.savez(filename, **self.cpu_arrays)

-    def load_all(self, file):
-        if '.npz' not in file:
-            file += '.npz'
-        file_contents = np.load(file)
+    def load_all(self, filename, synchronise_data=True):
+        if '.npz' not in filename:
+            filename += '.npz'
+        file_contents = np.load(filename)
         for arr_name, arr_contents in self.cpu_arrays.items():
             if arr_name not in file_contents:
                 print(f"Skipping read data {arr_name} because there is no data with this name in data handling")

@@ -448,3 +460,6 @@ class SerialDataHandling(DataHandling):
                       f"Read array shape {file_contents[arr_name].shape}, existing array shape {arr_contents.shape}")
                 continue
             np.copyto(arr_contents, file_contents[arr_name])
+            if synchronise_data:
+                if arr_name in self.gpu_arrays.keys():
+                    self.to_gpu(arr_name)
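Taken together, the serial data handling now allocates pinned host memory for GPU fields, picks the CUDA device with the largest memory when none is given, and keeps host and device copies in sync on save/load. A minimal usage sketch (assuming cupy and a CUDA device are available; the array and file names are illustrative):

    import pystencils as ps
    from pystencils.datahandling.serial_datahandling import SerialDataHandling

    # GPU default target: every added array also gets a device copy,
    # and the host copy is allocated as pinned memory
    dh = SerialDataHandling(domain_size=(32, 32), default_target=ps.Target.GPU)
    dh.add_array('phi', values_per_cell=1)

    dh.cpu_arrays['phi'].fill(42.0)
    dh.all_to_gpu()

    # device arrays are synchronised back to the host before writing
    dh.save_all('checkpoint', compressed=True, synchronise_data=True)

    # after loading, arrays that also live on the GPU are uploaded again
    dh.load_all('checkpoint', synchronise_data=True)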
pystencils/display_utils.py

@@ -10,7 +10,12 @@ from pystencils.kernel_wrapper import KernelWrapper
 def to_dot(expr: sp.Expr, graph_style: Optional[Dict[str, Any]] = None, short=True):
     """Show a sympy or pystencils AST as dot graph"""
     from pystencils.astnodes import Node
-    import graphviz
+    try:
+        import graphviz
+    except ImportError:
+        print("graphviz is not installed. Visualizing the AST is not available")
+        return

     graph_style = {} if graph_style is None else graph_style

     if isinstance(expr, Node):

@@ -46,7 +51,7 @@ def get_code_obj(ast: Union[KernelFunction, KernelWrapper], custom_backend=None)
     if isinstance(ast, KernelWrapper):
         ast = ast.ast

-    if ast.backend not in {Backend.C, Backend.CUDA, Backend.OPENCL}:
+    if ast.backend not in {Backend.C, Backend.CUDA}:
         raise NotImplementedError(f'get_code_obj is not implemented for backend {ast.backend}')
     dialect = ast.backend
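With the guarded import above, to_dot now degrades gracefully when graphviz is missing instead of raising an ImportError. A small sketch (plain sympy expression, nothing else assumed):

    import sympy as sp
    from pystencils.display_utils import to_dot

    x, y = sp.symbols("x y")
    # returns a dot graph, or prints a notice and returns None if graphviz is not installed
    dot_graph = to_dot(x**2 + y)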
pystencils/enums.py

from enum import Enum, auto


class Target(Enum):
    """
    The Target enumeration represents all possible targets that can be used for the code generation.
    """
    CPU = auto()
    """
    Target CPU architecture.
    """
    GPU = auto()
    """
    Target GPU architecture.
    """


class Backend(Enum):
    """
    The Backend enumeration represents all possible backends that can be used for the code generation.
    Backends and targets must be combined with care. For example CPU as a target and CUDA as a backend makes no sense.
    """
    C = auto()
    """
    Use the C Backend of pystencils.
    """
    CUDA = auto()
    """
    Use the CUDA backend to generate code for NVIDIA GPUs.
    """
pystencils/fast_approximation.py

@@ -4,20 +4,30 @@ import sympy as sp
 from pystencils.astnodes import Node
 from pystencils.simp import AssignmentCollection
+from pystencils.assignment import Assignment


 # noinspection PyPep8Naming
 class fast_division(sp.Function):
+    """
+    Produces special float instructions for CUDA kernels
+    """
     nargs = (2,)


 # noinspection PyPep8Naming
 class fast_sqrt(sp.Function):
+    """
+    Produces special float instructions for CUDA kernels
+    """
     nargs = (1, )


 # noinspection PyPep8Naming
 class fast_inv_sqrt(sp.Function):
+    """
+    Produces special float instructions for CUDA kernels
+    """
     nargs = (1, )

@@ -32,7 +42,7 @@ def _run(term, visitor):
     return visitor(term)


-def insert_fast_sqrts(term: Union[sp.Expr, List[sp.Expr], AssignmentCollection]):
+def insert_fast_sqrts(term: Union[sp.Expr, List[sp.Expr], AssignmentCollection, Assignment]):
     def visit(expr):
         if isinstance(expr, Node):
             return expr

@@ -48,7 +58,7 @@ def insert_fast_sqrts(term: Union[sp.Expr, List[sp.Expr], AssignmentCollection])
     return _run(term, visit)


-def insert_fast_divisions(term: Union[sp.Expr, List[sp.Expr], AssignmentCollection]):
+def insert_fast_divisions(term: Union[sp.Expr, List[sp.Expr], AssignmentCollection, Assignment]):
     def visit(expr):
         if isinstance(expr, Node):
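The widened type hints mean a single Assignment can now be passed directly instead of a whole collection. A short sketch (field names are illustrative):

    import sympy as sp
    import pystencils as ps
    from pystencils.fast_approximation import insert_fast_divisions, insert_fast_sqrts

    f, g = ps.fields("f, g: float32[2D]")
    asm = ps.Assignment(g.center, sp.sqrt(f.center) / (f[1, 0] + f[-1, 0]))

    # sqrt and division are replaced by fast_sqrt / fast_division nodes,
    # which the CUDA backend emits as fast-math instructions
    fast_asm = insert_fast_sqrts(insert_fast_divisions(asm))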
pystencils/fd/finitedifferences.py

@@ -105,9 +105,8 @@ class Discretization2ndOrder:
             return self._discretize_advection(e)
         elif isinstance(e, Diff):
             arg, *indices = diff_args(e)
-            from pystencils.interpolation_astnodes import InterpolatorAccess
-            if not isinstance(arg, (Field.Access, InterpolatorAccess)):
+            if not isinstance(arg, Field.Access):
                 raise ValueError("Only derivatives with field or field accesses as arguments can be discretized")
             return self.spatial_stencil(indices, self.dx, arg)
         else:
pystencils/fd/spatial.py

+from functools import lru_cache
 from typing import Tuple

 import sympy as sp

 from pystencils.astnodes import LoopOverCoordinate
-from pystencils.cache import memorycache
 from pystencils.fd import Diff
 from pystencils.field import Field
 from pystencils.transformations import generic_visit

@@ -136,7 +136,7 @@ def discretize_spatial_staggered(expr, dx, stencil=fd_stencils_standard):
 # -------------------------------------- special stencils --------------------------------------------------------------

-@memorycache(maxsize=1)
+@lru_cache(maxsize=1)
 def forth_order_2d_derivation() -> Tuple[FiniteDifferenceStencilDerivation.Result, ...]:
     # Symmetry, isotropy and 4th order conditions are not enough to fully specify the stencil
     # one weight has to be specifically set to a somewhat arbitrary value
pystencils/field.py

@@ -5,7 +5,7 @@ import pickle
 import re
 from enum import Enum
 from itertools import chain
-from typing import List, Optional, Sequence, Set, Tuple
+from typing import List, Optional, Sequence, Set, Tuple, Union

 import numpy as np
 import sympy as sp

@@ -13,13 +13,13 @@ from sympy.core.cache import cacheit
 import pystencils
 from pystencils.alignedarray import aligned_empty
-from pystencils.data_types import StructType, TypedSymbol, create_type
-from pystencils.kernelparameters import FieldShapeSymbol, FieldStrideSymbol
+from pystencils.typing import StructType, TypedSymbol, BasicType, create_type
+from pystencils.typing.typed_sympy import FieldShapeSymbol, FieldStrideSymbol
 from pystencils.stencil import (
     direction_string_to_offset, inverse_direction, offset_to_direction_string)
 from pystencils.sympyextensions import is_integer_sequence

-__all__ = ['Field', 'fields', 'FieldType', 'AbstractField']
+__all__ = ['Field', 'fields', 'FieldType', 'Field']


 class FieldType(Enum):

@@ -69,81 +69,7 @@ class FieldType(Enum):
         return field.field_type == FieldType.STAGGERED_FLUX


-def fields(description=None, index_dimensions=0, layout=None, field_type=FieldType.GENERIC, **kwargs):
-    """Creates pystencils fields from a string description.
-
-    Examples:
-        Create a 2D scalar and vector field:
-            >>> s, v = fields("s, v(2): double[2D]")
-            >>> assert s.spatial_dimensions == 2 and s.index_dimensions == 0
-            >>> assert (v.spatial_dimensions, v.index_dimensions, v.index_shape) == (2, 1, (2,))
-
-        Create an integer field of shape (10, 20):
-            >>> f = fields("f : int32[10, 20]")
-            >>> f.has_fixed_shape, f.shape
-            (True, (10, 20))
-
-        Numpy arrays can be used as template for shape and data type of field:
-            >>> arr_s, arr_v = np.zeros([20, 20]), np.zeros([20, 20, 2])
-            >>> s, v = fields("s, v(2)", s=arr_s, v=arr_v)
-            >>> assert s.index_dimensions == 0 and s.dtype.numpy_dtype == arr_s.dtype
-            >>> assert v.index_shape == (2,)
-
-        Format string can be left out, field names are taken from keyword arguments.
-            >>> fields(f1=arr_s, f2=arr_s)
-            [f1: double[20,20], f2: double[20,20]]
-
-        The keyword names ``index_dimension`` and ``layout`` have special meaning, don't use them for field names
-            >>> f = fields(f=arr_v, index_dimensions=1)
-            >>> assert f.index_dimensions == 1
-            >>> f = fields("pdfs(19) : float32[3D]", layout='fzyx')
-            >>> f.layout
-            (2, 1, 0)
-    """
-    result = []
-    if description:
-        field_descriptions, dtype, shape = _parse_description(description)
-        layout = 'numpy' if layout is None else layout
-        for field_name, idx_shape in field_descriptions:
-            if field_name in kwargs:
-                arr = kwargs[field_name]
-                idx_shape_of_arr = () if not len(idx_shape) else arr.shape[-len(idx_shape):]
-                assert idx_shape_of_arr == idx_shape
-                f = Field.create_from_numpy_array(field_name, kwargs[field_name], index_dimensions=len(idx_shape),
-                                                  field_type=field_type)
-            elif isinstance(shape, tuple):
-                f = Field.create_fixed_size(field_name, shape + idx_shape, dtype=dtype,
-                                            index_dimensions=len(idx_shape), layout=layout, field_type=field_type)
-            elif isinstance(shape, int):
-                f = Field.create_generic(field_name, spatial_dimensions=shape, dtype=dtype,
-                                         index_shape=idx_shape, layout=layout, field_type=field_type)
-            elif shape is None:
-                f = Field.create_generic(field_name, spatial_dimensions=2, dtype=dtype,
-                                         index_shape=idx_shape, layout=layout, field_type=field_type)
-            else:
-                assert False
-            result.append(f)
-    else:
-        assert layout is None, "Layout can not be specified when creating Field from numpy array"
-        for field_name, arr in kwargs.items():
-            result.append(Field.create_from_numpy_array(field_name, arr, index_dimensions=index_dimensions,
-                                                        field_type=field_type))
-
-    if len(result) == 0:
-        return None
-    elif len(result) == 1:
-        return result[0]
-    else:
-        return result
-
-
-class AbstractField:
-
-    class AbstractAccess:
-        pass
-
-
-class Field(AbstractField):
+class Field:
     """
     With fields one can formulate stencil-like update rules on structured grids.
     This Field class knows about the dimension, memory layout (strides) and optionally about the size of an array.

@@ -330,9 +256,7 @@ class Field(AbstractField):
         self.shape = shape
         self.strides = strides
         self.latex_name: Optional[str] = None
-        self.coordinate_origin: tuple[float, sp.Symbol] = sp.Matrix(tuple(
-            0 for _ in range(self.spatial_dimensions)
-        ))
+        self.coordinate_origin = sp.Matrix([0] * self.spatial_dimensions)
         self.coordinate_transform = sp.eye(self.spatial_dimensions)
         if field_type == FieldType.STAGGERED:
             assert self.staggered_stencil

@@ -341,8 +265,7 @@ class Field(AbstractField):
         if self.has_fixed_shape:
             return Field(new_name, self.field_type, self._dtype, self._layout, self.shape, self.strides)
         else:
-            return Field.create_generic(new_name, self.spatial_dimensions, self.dtype.numpy_dtype,
-                                        self.index_dimensions, self._layout, self.index_shape, self.field_type)
+            return Field(new_name, self.field_type, self.dtype, self.layout, self.shape, self.strides)

     @property
     def spatial_dimensions(self) -> int:

@@ -432,8 +355,8 @@ class Field(AbstractField):
         elif len(index_shape) == 2:
             return sp.Matrix([[self(i, j) for j in range(index_shape[1])] for i in range(index_shape[0])])
         elif len(index_shape) == 3:
-            return sp.Matrix([[[self(i, j, k) for k in range(index_shape[2])]
-                               for j in range(index_shape[1])] for i in range(index_shape[0])])
+            return sp.Array([[[self(i, j, k) for k in range(index_shape[2])]
+                              for j in range(index_shape[1])] for i in range(index_shape[0])])
         else:
             raise NotImplementedError("center_vector is not implemented for more than 3 index dimensions")

@@ -466,35 +389,13 @@ class Field(AbstractField):
         if type(offset) is not tuple:
             offset = (offset,)
         if len(offset) != self.spatial_dimensions:
-            raise ValueError("Wrong number of spatial indices: "
-                             "Got %d, expected %d" % (len(offset), self.spatial_dimensions))
+            raise ValueError(f"Wrong number of spatial indices: Got {len(offset)}, expected {self.spatial_dimensions}")
         return Field.Access(self, offset)

     def absolute_access(self, offset, index):
         assert FieldType.is_custom(self)
         return Field.Access(self, offset, index, is_absolute_access=True)

-    def interpolated_access(self,
-                            offset: Tuple,
-                            interpolation_mode='linear',
-                            address_mode='BORDER',
-                            allow_textures=True):
-        """Provides access to field values at non-integer positions
-
-        ``interpolated_access`` is similar to :func:`Field.absolute_access` except that
-        it allows non-integer offsets and automatic handling of out-of-bound accesses.
-
-        :param offset:              Tuple of spatial coordinates (can be floats)
-        :param interpolation_mode:  One of :class:`pystencils.interpolation_astnodes.InterpolationMode`
-        :param address_mode:        How boundaries are handled can be 'border', 'wrap', 'mirror', 'clamp'
-        :param allow_textures:      Allow implementation by texture accesses on GPUs
-        """
-        from pystencils.interpolation_astnodes import Interpolator
-        return Interpolator(self,
-                            interpolation_mode,
-                            address_mode,
-                            allow_textures=allow_textures).at(offset)
-
     def staggered_access(self, offset, index=None):
         """If this field is a staggered field, it can be accessed using half-integer offsets.
         For example, an offset of ``(0, sp.Rational(1,2))`` or ``"E"`` corresponds to the staggered point to the east

@@ -511,8 +412,7 @@ class Field(AbstractField):
             offset = tuple(direction_string_to_offset(offset, self.spatial_dimensions))
             offset = tuple([o * sp.Rational(1, 2) for o in offset])
         if len(offset) != self.spatial_dimensions:
-            raise ValueError("Wrong number of spatial indices: "
-                             "Got %d, expected %d" % (len(offset), self.spatial_dimensions))
+            raise ValueError(f"Wrong number of spatial indices: Got {len(offset)}, expected {self.spatial_dimensions}")

         prefactor = 1
         neighbor_vec = [0] * len(offset)

@@ -526,8 +426,7 @@ class Field(AbstractField):
             if FieldType.is_staggered_flux(self):
                 prefactor = -1
         if neighbor not in self.staggered_stencil:
-            raise ValueError("{} is not a valid neighbor for the {} stencil".format(offset_orig,
-                                                                                    self.staggered_stencil_name))
+            raise ValueError(f"{offset_orig} is not a valid neighbor for the {self.staggered_stencil_name} stencil")

         offset = tuple(sp.Matrix(offset) - sp.Rational(1, 2) * sp.Matrix(neighbor_vec))

@@ -539,15 +438,13 @@ class Field(AbstractField):
             return prefactor * Field.Access(self, offset, (idx,))
         else:  # this field stores a vector or tensor at each staggered position
             if index is None:
-                raise ValueError("Wrong number of indices: "
-                                 "Got %d, expected %d" % (0, self.index_dimensions - 1))
+                raise ValueError(f"Wrong number of indices: Got 0, expected {self.index_dimensions - 1}")
             if type(index) is np.ndarray:
                 index = tuple(index)
             if type(index) is not tuple:
                 index = (index,)
             if self.index_dimensions != len(index) + 1:
-                raise ValueError("Wrong number of indices: "
-                                 "Got %d, expected %d" % (len(index), self.index_dimensions - 1))
+                raise ValueError(f"Wrong number of indices: Got {len(index)}, expected {self.index_dimensions - 1}")
             return prefactor * Field.Access(self, offset, (idx, *index))

@@ -587,7 +484,7 @@ class Field(AbstractField):
     @property
     def staggered_stencil_name(self):
         assert FieldType.is_staggered(self)
-        return "D%dQ%d" % (self.spatial_dimensions, self.index_shape[0] * 2 + 1)
+        return f"D{self.spatial_dimensions}Q{self.index_shape[0] * 2 + 1}"

     def __call__(self, *args, **kwargs):
         center = tuple([0] * self.spatial_dimensions)

@@ -651,7 +548,7 @@ class Field(AbstractField):
         self.coordinate_origin = -sp.Matrix([i / 2 for i in self.spatial_shape])

     # noinspection PyAttributeOutsideInit,PyUnresolvedReferences
-    class Access(TypedSymbol, AbstractField.AbstractAccess):
+    class Access(TypedSymbol):
         """Class representing a relative access into a `Field`.

         This class behaves like a normal sympy Symbol, it is actually derived from it. One can built up

@@ -705,7 +602,11 @@ class Field(AbstractField):
             if superscript is not None:
                 symbol_name += "^" + superscript

-            obj = super(Field.Access, self).__xnew__(self, symbol_name, field.dtype)
+            if dtype:
+                obj = super(Field.Access, self).__xnew__(self, symbol_name, dtype)
+            else:
+                obj = super(Field.Access, self).__xnew__(self, symbol_name, field.dtype)
+
             obj._field = field
             obj._offsets = []
             for o in offsets:

@@ -747,9 +648,14 @@ class Field(AbstractField):
                 idx = ()

             if len(idx) != self.field.index_dimensions:
-                raise ValueError("Wrong number of indices: "
-                                 "Got %d, expected %d" % (len(idx), self.field.index_dimensions))
-            return Field.Access(self.field, self._offsets, idx, dtype=self.dtype)
+                raise ValueError(f"Wrong number of indices: Got {len(idx)}, expected {self.field.index_dimensions}")
+            if len(idx) == 1 and isinstance(idx[0], str):
+                dtype = BasicType(self.field.dtype.numpy_dtype[idx[0]])
+                return Field.Access(self.field, self._offsets, idx,
+                                    is_absolute_access=self.is_absolute_access, dtype=dtype)
+            else:
+                return Field.Access(self.field, self._offsets, idx,
+                                    is_absolute_access=self.is_absolute_access, dtype=self.dtype)

         def __getitem__(self, *idx):
             return self.__call__(*idx)

@@ -803,7 +709,8 @@ class Field(AbstractField):
             """
             offset_list = list(self.offsets)
             offset_list[coord_id] += offset
-            return Field.Access(self.field, tuple(offset_list), self.index, dtype=self.dtype)
+            return Field.Access(self.field, tuple(offset_list), self.index,
+                                is_absolute_access=self.is_absolute_access, dtype=self.dtype)

         def get_shifted(self, *shift) -> 'Field.Access':
             """Returns a new Access with changed spatial coordinates

@@ -816,6 +723,7 @@ class Field(AbstractField):
             return Field.Access(self.field,
                                 tuple(a + b for a, b in zip(shift, self.offsets)),
                                 self.index,
+                                is_absolute_access=self.is_absolute_access,
                                 dtype=self.dtype)

         def at_index(self, *idx_tuple) -> 'Field.Access':

@@ -826,12 +734,14 @@ class Field(AbstractField):
                 >>> f(0).at_index(8)
                 f_C^8
             """
-            return Field.Access(self.field, self.offsets, idx_tuple, dtype=self.dtype)
+            return Field.Access(self.field, self.offsets, idx_tuple,
+                                is_absolute_access=self.is_absolute_access, dtype=self.dtype)

         def _eval_subs(self, old, new):
             return Field.Access(self.field,
                                 tuple(sp.sympify(a).subs(old, new) for a in self.offsets),
                                 tuple(sp.sympify(a).subs(old, new) for a in self.index),
+                                is_absolute_access=self.is_absolute_access,
                                 dtype=self.dtype)

         @property

@@ -849,7 +759,8 @@ class Field(AbstractField):
         def _hashable_content(self):
             super_class_contents = super(Field.Access, self)._hashable_content()
-            return (super_class_contents, self._field.hashable_contents(), *self._index, *self._offsets)
+            return (super_class_contents, self._field.hashable_contents(), *self._index,
+                    *self._offsets, self._is_absolute_access)

         def _staggered_offset(self, offsets, index):
             assert FieldType.is_staggered(self._field)

@@ -870,15 +781,14 @@ class Field(AbstractField):
             if FieldType.is_staggered(self._field):
                 if self.index and self.field.index_dimensions > 1:
-                    return "{{%s}_{%s}^{%s}}" % (n, offset_str, self.index[1:]
-                                                 if len(self.index) > 2 else self.index[1])
+                    return f"{{{n}}}_{{{offset_str}}}^{{{self.index[1:] if len(self.index) > 2 else self.index[1]}}}"
                 else:
-                    return "{{%s}_{%s}}" % (n, offset_str)
+                    return f"{{{n}}}_{{{offset_str}}}"
             else:
                 if self.index and self.field.index_dimensions > 0:
-                    return "{{%s}_{%s}^{%s}}" % (n, offset_str, self.index if len(self.index) > 1 else self.index[0])
+                    return f"{{{n}}}_{{{offset_str}}}^{{{self.index if len(self.index) > 1 else self.index[0]}}}"
                 else:
-                    return "{{%s}_{%s}}" % (n, offset_str)
+                    return f"{{{n}}}_{{{offset_str}}}"

         def __str__(self):
             n = self._field.latex_name if self._field.latex_name else self._field.name

@@ -901,6 +811,75 @@ class Field(AbstractField):
             return f"{n}[{offset_str}]"


+def fields(description=None, index_dimensions=0, layout=None,
+           field_type=FieldType.GENERIC, **kwargs) -> Union[Field, List[Field]]:
+    """Creates pystencils fields from a string description.
+
+    Examples:
+        Create a 2D scalar and vector field:
+            >>> s, v = fields("s, v(2): double[2D]")
+            >>> assert s.spatial_dimensions == 2 and s.index_dimensions == 0
+            >>> assert (v.spatial_dimensions, v.index_dimensions, v.index_shape) == (2, 1, (2,))
+
+        Create an integer field of shape (10, 20):
+            >>> f = fields("f : int32[10, 20]")
+            >>> f.has_fixed_shape, f.shape
+            (True, (10, 20))
+
+        Numpy arrays can be used as template for shape and data type of field:
+            >>> arr_s, arr_v = np.zeros([20, 20]), np.zeros([20, 20, 2])
+            >>> s, v = fields("s, v(2)", s=arr_s, v=arr_v)
+            >>> assert s.index_dimensions == 0 and s.dtype.numpy_dtype == arr_s.dtype
+            >>> assert v.index_shape == (2,)
+
+        Format string can be left out, field names are taken from keyword arguments.
+            >>> fields(f1=arr_s, f2=arr_s)
+            [f1: double[20,20], f2: double[20,20]]
+
+        The keyword names ``index_dimension`` and ``layout`` have special meaning, don't use them for field names
+            >>> f = fields(f=arr_v, index_dimensions=1)
+            >>> assert f.index_dimensions == 1
+            >>> f = fields("pdfs(19) : float32[3D]", layout='fzyx')
+            >>> f.layout
+            (2, 1, 0)
+    """
+    result = []
+    if description:
+        field_descriptions, dtype, shape = _parse_description(description)
+        layout = 'numpy' if layout is None else layout
+        for field_name, idx_shape in field_descriptions:
+            if field_name in kwargs:
+                arr = kwargs[field_name]
+                idx_shape_of_arr = () if not len(idx_shape) else arr.shape[-len(idx_shape):]
+                assert idx_shape_of_arr == idx_shape
+                f = Field.create_from_numpy_array(field_name, kwargs[field_name], index_dimensions=len(idx_shape),
+                                                  field_type=field_type)
+            elif isinstance(shape, tuple):
+                f = Field.create_fixed_size(field_name, shape + idx_shape, dtype=dtype,
+                                            index_dimensions=len(idx_shape), layout=layout, field_type=field_type)
+            elif isinstance(shape, int):
+                f = Field.create_generic(field_name, spatial_dimensions=shape, dtype=dtype,
+                                         index_shape=idx_shape, layout=layout, field_type=field_type)
+            elif shape is None:
+                f = Field.create_generic(field_name, spatial_dimensions=2, dtype=dtype,
+                                         index_shape=idx_shape, layout=layout, field_type=field_type)
+            else:
+                assert False
+            result.append(f)
+    else:
+        assert layout is None, "Layout can not be specified when creating Field from numpy array"
+        for field_name, arr in kwargs.items():
+            result.append(Field.create_from_numpy_array(field_name, arr, index_dimensions=index_dimensions,
+                                                        field_type=field_type))
+
+    if len(result) == 0:
+        raise ValueError("Could not parse field description")
+    elif len(result) == 1:
+        return result[0]
+    else:
+        return result


 def get_layout_from_strides(strides: Sequence[int], index_dimension_ids: Optional[List[int]] = None):
     index_dimension_ids = [] if index_dimension_ids is None else index_dimension_ids
     coordinates = list(range(len(strides)))

@@ -969,24 +948,35 @@ def create_numpy_array_with_layout(shape, layout, alignment=False, byte_offset=0
 def spatial_layout_string_to_tuple(layout_str: str, dim: int) -> Tuple[int, ...]:
-    if layout_str in ('fzyx', 'zyxf'):
-        assert dim <= 3
-        return tuple(reversed(range(dim)))
-    if layout_str in ('fzyx', 'f', 'reverse_numpy', 'SoA'):
-        return tuple(reversed(range(dim)))
-    elif layout_str in ('c', 'numpy', 'AoS'):
+    if dim <= 0:
+        raise ValueError("Dimensionality must be positive")
+
+    layout_str = layout_str.lower()
+
+    if layout_str in ('fzyx', 'zyxf', 'soa', 'aos'):
+        if dim > 3:
+            raise ValueError(f"Invalid spatial dimensionality for layout descriptor {layout_str}: May be at most 3.")
+        return tuple(reversed(range(dim)))
+
+    if layout_str in ('f', 'reverse_numpy'):
+        return tuple(reversed(range(dim)))
+    elif layout_str in ('c', 'numpy'):
         return tuple(range(dim))
     raise ValueError("Unknown layout descriptor " + layout_str)


 def layout_string_to_tuple(layout_str, dim):
+    if dim <= 0:
+        raise ValueError("Dimensionality must be positive")
+
     layout_str = layout_str.lower()
     if layout_str == 'fzyx' or layout_str == 'soa':
-        assert dim <= 4
+        if dim > 4:
+            raise ValueError(f"Invalid total dimensionality for layout descriptor {layout_str}: May be at most 4.")
         return tuple(reversed(range(dim)))
     elif layout_str == 'zyxf' or layout_str == 'aos':
-        assert dim <= 4
+        if dim > 4:
+            raise ValueError(f"Invalid total dimensionality for layout descriptor {layout_str}: May be at most 4.")
        return tuple(reversed(range(dim - 1))) + (dim - 1,)
     elif layout_str == 'f' or layout_str == 'reverse_numpy':
         return tuple(reversed(range(dim)))

@@ -1060,7 +1050,6 @@ def _parse_part1(d):
 def _parse_description(description):
     def parse_part2(d):
         result = type_description_regex.match(d)
-
         if result:
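One behavioural point worth noting in these hunks: every derived access (get_shifted, at_index, neighbor shifts, _eval_subs) now threads is_absolute_access and the dtype through to the new Access, so absolute accesses no longer silently turn into relative ones. A small sketch of the helpers involved (illustrative field):

    import pystencils as ps

    v = ps.fields("v(3): double[2D]")

    center = v(1)                    # index component 1 at the cell centre: v_C^1
    east = center.get_shifted(1, 0)  # same index component, shifted one cell in x
    other = center.at_index(2)       # same offset, different index component

    assert east.offsets == (1, 0)
    assert other.index == (2,)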
pystencils/functions.py

import sympy as sp
from pystencils.typing import PointerType


class DivFunc(sp.Function):
    """
    DivFunc represents a division operation, since sympy represents divisions with ^-1
    """
    is_Atom = True
    is_real = True

    def __new__(cls, *args, **kwargs):
        if len(args) != 2:
            raise ValueError(f'{cls} takes only 2 arguments, instead {len(args)} received!')
        divisor, dividend, *other_args = args

        return sp.Function.__new__(cls, divisor, dividend, *other_args, **kwargs)

    def _eval_evalf(self, *args, **kwargs):
        return self.divisor.evalf() / self.dividend.evalf()

    @property
    def divisor(self):
        return self.args[0]

    @property
    def dividend(self):
        return self.args[1]


class AddressOf(sp.Function):
    """
    AddressOf is the '&' operation in C. It gets the address of a lvalue.
    """
    is_Atom = True

    def __new__(cls, arg):
        obj = sp.Function.__new__(cls, arg)
        return obj

    @property
    def canonical(self):
        if hasattr(self.args[0], 'canonical'):
            return self.args[0].canonical
        else:
            raise NotImplementedError()

    @property
    def is_commutative(self):
        return self.args[0].is_commutative

    @property
    def dtype(self):
        if hasattr(self.args[0], 'dtype'):
            return PointerType(self.args[0].dtype, restrict=True)
        else:
            raise ValueError(f'pystencils supports only non void pointers. Current address_of type: {self.args[0]}')
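A short sketch of how these two nodes behave on typed expressions (module path inferred from the imports above; the field name is illustrative):

    import pystencils as ps
    from pystencils.functions import AddressOf, DivFunc

    f = ps.fields("f: double[2D]")

    addr = AddressOf(f.center)         # prints as '&...' in generated C code
    print(addr.dtype)                  # restrict-qualified pointer to double

    div = DivFunc(f[1, 0], f.center)   # explicit division node: args[0] / args[1]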
pystencils/gpu/__init__.py (renamed from pystencils/gpucuda/__init__.py)

-from pystencils.gpucuda.cudajit import make_python_function
-from pystencils.gpucuda.kernelcreation import create_cuda_kernel, created_indexed_cuda_kernel
+from pystencils.gpu.gpu_array_handler import GPUArrayHandler, GPUNotAvailableHandler
+from pystencils.gpu.gpujit import make_python_function
+from pystencils.gpu.kernelcreation import create_cuda_kernel, created_indexed_cuda_kernel

 from .indexing import AbstractIndexing, BlockIndexing, LineIndexing

-__all__ = ['create_cuda_kernel', 'created_indexed_cuda_kernel', 'make_python_function',
+__all__ = ['GPUArrayHandler', 'GPUNotAvailableHandler',
+           'create_cuda_kernel', 'created_indexed_cuda_kernel', 'make_python_function',
            'AbstractIndexing', 'BlockIndexing', 'LineIndexing']
pystencils/gpu/gpu_array_handler.py (new file)

try:
    import cupy as cp
    import cupyx as cpx
except ImportError:
    cp = None
    cpx = None

import numpy as np


class GPUArrayHandler:
    def __init__(self, device_number):
        self._device_number = device_number

    def zeros(self, shape, dtype=np.float64, order='C'):
        with cp.cuda.Device(self._device_number):
            return cp.zeros(shape=shape, dtype=dtype, order=order)

    def ones(self, shape, dtype=np.float64, order='C'):
        with cp.cuda.Device(self._device_number):
            return cp.ones(shape=shape, dtype=dtype, order=order)

    def empty(self, shape, dtype=np.float64, order='C'):
        with cp.cuda.Device(self._device_number):
            return cp.empty(shape=shape, dtype=dtype, order=order)

    def to_gpu(self, numpy_array):
        swaps = _get_index_swaps(numpy_array)
        if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
            with cp.cuda.Device(self._device_number):
                gpu_array = cp.asarray(numpy_array.base)
            for a, b in reversed(swaps):
                gpu_array = gpu_array.swapaxes(a, b)
            return gpu_array
        else:
            return cp.asarray(numpy_array)

    def upload(self, array, numpy_array):
        assert self._device_number == array.device.id
        if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
            with cp.cuda.Device(self._device_number):
                array.base.set(numpy_array.base)
        else:
            with cp.cuda.Device(self._device_number):
                array.set(numpy_array)

    def download(self, array, numpy_array):
        assert self._device_number == array.device.id
        if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
            with cp.cuda.Device(self._device_number):
                numpy_array.base[:] = array.base.get()
        else:
            with cp.cuda.Device(self._device_number):
                numpy_array[:] = array.get()

    def randn(self, shape, dtype=np.float64):
        with cp.cuda.Device(self._device_number):
            return cp.random.randn(*shape, dtype=dtype)

    @staticmethod
    def pinned_numpy_array(layout, shape, dtype):
        assert set(layout) == set(range(len(shape))), "Wrong layout descriptor"
        cur_layout = list(range(len(shape)))
        swaps = []
        for i in range(len(layout)):
            if cur_layout[i] != layout[i]:
                index_to_swap_with = cur_layout.index(layout[i])
                swaps.append((i, index_to_swap_with))
                cur_layout[i], cur_layout[index_to_swap_with] = cur_layout[index_to_swap_with], cur_layout[i]
        assert tuple(cur_layout) == tuple(layout)

        shape = list(shape)
        for a, b in swaps:
            shape[a], shape[b] = shape[b], shape[a]

        res = cpx.empty_pinned(tuple(shape), order='c', dtype=dtype)

        for a, b in reversed(swaps):
            res = res.swapaxes(a, b)
        return res

    from_numpy = to_gpu


class GPUNotAvailableHandler:
    def __getattribute__(self, name):
        raise NotImplementedError("Unable to utilise cupy! Please make sure cupy works correctly in your setup!")


def _get_index_swaps(array):
    swaps = []
    if array.base is not None and isinstance(array.base, np.ndarray):
        for stride in array.base.strides:
            index_base = array.base.strides.index(stride)
            index_view = array.strides.index(stride)
            if index_base != index_view and (index_view, index_base) not in swaps:
                swaps.append((index_base, index_view))
    return swaps
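A usage sketch for the handler (assuming cupy is installed and device 0 exists; plain C layout is used so no axis swaps are involved):

    import numpy as np
    from pystencils.gpu.gpu_array_handler import GPUArrayHandler

    handler = GPUArrayHandler(device_number=0)

    gpu_arr = handler.zeros((16, 16), dtype=np.float64)

    # page-locked host buffer, which makes host/device copies faster
    host_arr = handler.pinned_numpy_array(layout=(0, 1), shape=(16, 16), dtype=np.float64)
    host_arr[:] = 1.0

    handler.upload(gpu_arr, host_arr)    # host -> device
    handler.download(gpu_arr, host_arr)  # device -> host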
...@@ -2,13 +2,11 @@ import numpy as np ...@@ -2,13 +2,11 @@ import numpy as np
from pystencils.backends.cbackend import get_headers from pystencils.backends.cbackend import get_headers
from pystencils.backends.cuda_backend import generate_cuda from pystencils.backends.cuda_backend import generate_cuda
from pystencils.data_types import StructType from pystencils.typing import StructType
from pystencils.field import FieldType from pystencils.field import FieldType
from pystencils.gpucuda.texture_utils import ndarray_to_tex from pystencils.include import get_pystencils_include_path
from pystencils.include import get_pycuda_include_path, get_pystencils_include_path
from pystencils.interpolation_astnodes import InterpolatorAccess, TextureCachedField
from pystencils.kernel_wrapper import KernelWrapper from pystencils.kernel_wrapper import KernelWrapper
from pystencils.kernelparameters import FieldPointerSymbol from pystencils.typing import BasicType, FieldPointerSymbol
USE_FAST_MATH = True USE_FAST_MATH = True
...@@ -23,57 +21,49 @@ def get_cubic_interpolation_include_paths(): ...@@ -23,57 +21,49 @@ def get_cubic_interpolation_include_paths():
def make_python_function(kernel_function_node, argument_dict=None, custom_backend=None): def make_python_function(kernel_function_node, argument_dict=None, custom_backend=None):
""" """
Creates a kernel function from an abstract syntax tree which Creates a kernel function from an abstract syntax tree which
was created e.g. by :func:`pystencils.gpucuda.create_cuda_kernel` was created e.g. by :func:`pystencils.gpu.create_cuda_kernel`
or :func:`pystencils.gpucuda.created_indexed_cuda_kernel` or :func:`pystencils.gpu.created_indexed_cuda_kernel`
Args: Args:
kernel_function_node: the abstract syntax tree kernel_function_node: the abstract syntax tree
argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the
returned kernel functor. returned kernel functor.
custom_backend: use own custom printer for code generation
Returns: Returns:
compiled kernel as Python function compiled kernel as Python function
""" """
import pycuda.autoinit # NOQA import cupy as cp
from pycuda.compiler import SourceModule
if argument_dict is None: if argument_dict is None:
argument_dict = {} argument_dict = {}
header_list = ['<cstdint>'] + list(get_headers(kernel_function_node)) headers = get_headers(kernel_function_node)
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list]) if cp.cuda.runtime.is_hip:
headers.add('"gpu_defines.h"')
for field in kernel_function_node.fields_accessed:
if isinstance(field.dtype, BasicType) and field.dtype.is_half():
headers.add('<hip/hip_fp16.h>')
else:
headers.update({'"gpu_defines.h"', '<cstdint>'})
for field in kernel_function_node.fields_accessed:
if isinstance(field.dtype, BasicType) and field.dtype.is_half():
headers.add('<cuda_fp16.h>')
header_list = sorted(headers)
includes = "\n".join([f"#include {include_file}" for include_file in header_list])
code = includes + "\n" code = includes + "\n"
code += "#define FUNC_PREFIX __global__\n" code += "#define FUNC_PREFIX __global__\n"
code += "#define RESTRICT __restrict__\n\n" code += "#define RESTRICT __restrict__\n\n"
code += str(generate_cuda(kernel_function_node, custom_backend=custom_backend)) code += 'extern "C" {\n%s\n}\n' % str(generate_cuda(kernel_function_node, custom_backend=custom_backend))
textures = set(d.interpolator for d in kernel_function_node.atoms(
InterpolatorAccess) if isinstance(d.interpolator, TextureCachedField))
nvcc_options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"] options = ["-w", "-std=c++11"]
if USE_FAST_MATH: if USE_FAST_MATH:
nvcc_options.append("-use_fast_math") options.append("-use_fast_math")
options.append("-I" + get_pystencils_include_path())
# Code for CubicInterpolationCUDA
from pystencils.interpolation_astnodes import InterpolationMode
from os.path import join, dirname, isdir
if any(t.interpolation_mode == InterpolationMode.CUBIC_SPLINE for t in textures):
assert isdir(join(dirname(__file__), ("CubicInterpolationCUDA", "code")),
"Submodule CubicInterpolationCUDA does not exist.\n"
+ "Clone https://github.com/theHamsta/CubicInterpolationCUDA into pystencils.gpucuda")
nvcc_options += ["-I" + join(dirname(__file__), "CubicInterpolationCUDA", "code")]
nvcc_options += ["-I" + join(dirname(__file__), "CubicInterpolationCUDA", "code", "internal")]
needed_dims = set(t.field.spatial_dimensions for t in textures
if t.interpolation_mode == InterpolationMode.CUBIC_SPLINE)
for i in needed_dims:
code = 'extern "C++" {\n#include "cubicTex%iD.cu"\n}\n' % i + code
mod = SourceModule(code, options=nvcc_options, include_dirs=[
get_pystencils_include_path(), get_pycuda_include_path()])
func = mod.get_function(kernel_function_node.function_name)
func = cp.RawKernel(code, kernel_function_node.function_name, options=tuple(options), backend="nvrtc", jitify=True)
parameters = kernel_function_node.get_parameters() parameters = kernel_function_node.get_parameters()
cache = {} cache = {}
...@@ -84,7 +74,10 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen ...@@ -84,7 +74,10 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
for k, v in kwargs.items())) for k, v in kwargs.items()))
try: try:
args, block_and_thread_numbers = cache[key] args, block_and_thread_numbers = cache[key]
func(*args, **block_and_thread_numbers) device = set(a.device.id for a in args if type(a) is cp.ndarray)
assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
with cp.cuda.Device(device.pop()):
func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
except KeyError: except KeyError:
full_arguments = argument_dict.copy() full_arguments = argument_dict.copy()
full_arguments.update(kwargs) full_arguments.update(kwargs)
@@ -95,17 +88,16 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block']) block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid']) block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
# TODO: use texture objects: args = tuple(_build_numpy_argument_list(parameters, full_arguments))
# https://devblogs.nvidia.com/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/
for tex in textures:
tex_ref = mod.get_texref(str(tex))
ndarray_to_tex(tex_ref, full_arguments[tex.field.name], tex.address_mode,
tex.filter_mode, tex.use_normalized_coordinates, tex.read_as_integer)
args = _build_numpy_argument_list(parameters, full_arguments)
cache[key] = (args, block_and_thread_numbers) cache[key] = (args, block_and_thread_numbers)
cache_values.append(kwargs) # keep objects alive such that ids remain unique cache_values.append(kwargs) # keep objects alive such that ids remain unique
func(*args, **block_and_thread_numbers) device = set(a.device.id for a in args if type(a) is cp.ndarray)
# import pycuda.driver as cuda assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
with cp.cuda.Device(device.pop()):
func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
# useful for debugging:
# cp.cuda.runtime.deviceSynchronize()
# cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called # cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called
ast = kernel_function_node ast = kernel_function_node
parameters = kernel_function_node.get_parameters() parameters = kernel_function_node.get_parameters()
@@ -124,8 +116,8 @@ def _build_numpy_argument_list(parameters, argument_dict):
actual_type = array.dtype actual_type = array.dtype
expected_type = param.fields[0].dtype.numpy_dtype expected_type = param.fields[0].dtype.numpy_dtype
if expected_type != actual_type: if expected_type != actual_type:
raise ValueError("Data type mismatch for field '%s'. Expected '%s' got '%s'." % raise ValueError(f"Data type mismatch for field {param.field_name}. "
(param.field_name, expected_type, actual_type)) f"Expected {expected_type} got {actual_type}.")
result.append(array) result.append(array)
elif param.is_field_stride: elif param.is_field_stride:
cast_to_dtype = param.symbol.dtype.numpy_dtype.type cast_to_dtype = param.symbol.dtype.numpy_dtype.type
@@ -160,22 +152,22 @@ def _check_arguments(parameter_specification, argument_dict):
try: try:
field_arr = argument_dict[symbolic_field.name] field_arr = argument_dict[symbolic_field.name]
except KeyError: except KeyError:
raise KeyError("Missing field parameter for kernel call " + str(symbolic_field)) raise KeyError(f"Missing field parameter for kernel call {str(symbolic_field)}")
if symbolic_field.has_fixed_shape: if symbolic_field.has_fixed_shape:
symbolic_field_shape = tuple(int(i) for i in symbolic_field.shape) symbolic_field_shape = tuple(int(i) for i in symbolic_field.shape)
if isinstance(symbolic_field.dtype, StructType): if isinstance(symbolic_field.dtype, StructType):
symbolic_field_shape = symbolic_field_shape[:-1] symbolic_field_shape = symbolic_field_shape[:-1]
if symbolic_field_shape != field_arr.shape: if symbolic_field_shape != field_arr.shape:
raise ValueError("Passed array '%s' has shape %s which does not match expected shape %s" % raise ValueError(f"Passed array {symbolic_field.name} has shape {str(field_arr.shape)} "
(symbolic_field.name, str(field_arr.shape), str(symbolic_field.shape))) f"which does not match expected shape {str(symbolic_field_shape)}")
if symbolic_field.has_fixed_shape: if symbolic_field.has_fixed_shape:
symbolic_field_strides = tuple(int(i) * field_arr.dtype.itemsize for i in symbolic_field.strides) symbolic_field_strides = tuple(int(i) * field_arr.dtype.itemsize for i in symbolic_field.strides)
if isinstance(symbolic_field.dtype, StructType): if isinstance(symbolic_field.dtype, StructType):
symbolic_field_strides = symbolic_field_strides[:-1] symbolic_field_strides = symbolic_field_strides[:-1]
if symbolic_field_strides != field_arr.strides: if symbolic_field_strides != field_arr.strides:
raise ValueError("Passed array '%s' has strides %s which does not match expected strides %s" % raise ValueError(f"Passed array {symbolic_field.name} has strides {str(field_arr.strides)} "
(symbolic_field.name, str(field_arr.strides), str(symbolic_field_strides))) f"which does not match expected strides {str(symbolic_field_strides)}")
if FieldType.is_indexed(symbolic_field): if FieldType.is_indexed(symbolic_field):
index_arr_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions]) index_arr_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
@@ -183,9 +175,9 @@ def _check_arguments(parameter_specification, argument_dict):
array_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions]) array_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
if len(array_shapes) > 1: if len(array_shapes) > 1:
raise ValueError("All passed arrays have to have the same size " + str(array_shapes)) raise ValueError(f"All passed arrays have to have the same size {str(array_shapes)}")
if len(index_arr_shapes) > 1: if len(index_arr_shapes) > 1:
raise ValueError("All passed index arrays have to have the same size " + str(array_shapes)) raise ValueError(f"All passed index arrays have to have the same size {str(array_shapes)}")
if len(index_arr_shapes) > 0: if len(index_arr_shapes) > 0:
return list(index_arr_shapes)[0] return list(index_arr_shapes)[0]
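# End-to-end usage sketch of the compiled GPU kernel functor (editorial note,
# not part of the original source; assumes the public pystencils API of this
# changeset, field names are illustrative):
import cupy as cp
import pystencils as ps

f, g = ps.fields("f, g: double[2D]")
config = ps.CreateKernelConfig(target=ps.Target.GPU)
kernel = ps.create_kernel(ps.Assignment(g.center, 2 * f.center), config=config).compile()
f_arr, g_arr = cp.ones((16, 16)), cp.zeros((16, 16))
kernel(f=f_arr, g=g_arr)  # all arrays must be allocated on the same device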
...
import abc import abc
from functools import partial from functools import partial
import math
from typing import List, Tuple
import sympy as sp import sympy as sp
from sympy.core.cache import cacheit from sympy.core.cache import cacheit
from pystencils.astnodes import Block, Conditional from pystencils.astnodes import Block, Conditional, SympyAssignment
from pystencils.data_types import TypedSymbol, create_type from pystencils.typing import TypedSymbol, create_type
from pystencils.integer_functions import div_ceil, div_floor from pystencils.integer_functions import div_ceil, div_floor
from pystencils.slicing import normalize_slice
from pystencils.sympyextensions import is_integer_sequence, prod from pystencils.sympyextensions import is_integer_sequence, prod
@@ -32,23 +33,58 @@ GRID_DIM = [ThreadIndexingSymbol("gridDim." + coord, create_type("int32")) for c
class AbstractIndexing(abc.ABC): class AbstractIndexing(abc.ABC):
""" """
Abstract base class for all Indexing classes. An Indexing class defines how a multidimensional Abstract base class for all Indexing classes. An Indexing class defines how an iteration space is mapped
field is mapped to CUDA's block and grid system. It calculates indices based on CUDA's thread and block indices to GPU's block and grid system. It calculates indices based on GPU's thread and block indices
and computes the number of blocks and threads a kernel is started with. The Indexing class is created with and computes the number of blocks and threads a kernel is started with.
a pystencils field, a slice to iterate over, and further optional parameters that must have default values. The Indexing class is created with an iteration space that is given as a list of slices to determine start, stop
and the step size for each coordinate. Furthermore, the data_layout is given as a tuple to determine the fast and slow
coordinates. This is important for an optimal mapping of coordinates to GPU threads.
""" """
def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple):
for iter_space in iteration_space:
assert isinstance(iter_space, slice), f"iteration_space must be of type Tuple[slice], " \
f"not tuple of type {type(iter_space)}"
self._iteration_space = iteration_space
self._data_layout = data_layout
self._dim = len(iteration_space)
@property
def iteration_space(self):
"""Iteration space to loop over"""
return self._iteration_space
@property
def data_layout(self):
"""Data layout of the kernels arrays"""
return self._data_layout
@property
def dim(self):
"""Number of spatial dimensions"""
return self._dim
@property @property
@abc.abstractmethod @abc.abstractmethod
def coordinates(self): def coordinates(self):
"""Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic CUDA block and thread indices. """Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic GPU block and thread indices.
These symbolic indices can be obtained with the method `index_variables` """ These symbolic indices can be obtained with the method `index_variables` """
@property @property
def index_variables(self): def index_variables(self):
"""Sympy symbols for CUDA's block and thread indices, and block and grid dimensions. """ """Sympy symbols for GPU's block and thread indices, and block and grid dimensions. """
return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM
@abc.abstractmethod
def get_loop_ctr_assignments(self, loop_counter_symbols) -> List[SympyAssignment]:
"""Adds assignments for the loop counter symbols depending on the gpu threads.
Args:
loop_counter_symbols: typed symbols representing the loop counters
Returns:
assignments for the loop counters
"""
@abc.abstractmethod @abc.abstractmethod
def call_parameters(self, arr_shape): def call_parameters(self, arr_shape):
"""Determine grid and block size for kernel call. """Determine grid and block size for kernel call.
@@ -88,57 +124,79 @@ class AbstractIndexing(abc.ABC):
class BlockIndexing(AbstractIndexing): class BlockIndexing(AbstractIndexing):
"""Generic indexing scheme that maps sub-blocks of an array to CUDA blocks. """Generic indexing scheme that maps sub-blocks of an array to GPU blocks.
Args: Args:
field: pystencils field (common to all Indexing classes) iteration_space: list of slices to determine start, stop and the step size for each coordinate
iteration_slice: slice that defines rectangular subarea which is iterated over data_layout: tuple specifying loop order with innermost loop last.
This is the same format as returned by `Field.layout`.
permute_block_size_dependent_on_layout: if True the block_size is permuted such that the fastest coordinate permute_block_size_dependent_on_layout: if True the block_size is permuted such that the fastest coordinate
gets the largest number of threads gets the largest number of threads
compile_time_block_size: compile in concrete block size, otherwise the cuda variable 'blockDim' is used compile_time_block_size: compile in concrete block size, otherwise the gpu variable 'blockDim' is used
maximum_block_size: maximum block size that is possible for the GPU. Set to 'auto' to let cupy define the
maximum block size from the device properties.
device_number: device number of the used GPU. By default, the zeroth device is used.
""" """
def __init__(self, field, iteration_slice, def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple[int],
block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False, block_size=(128, 2, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
maximum_block_size=(1024, 1024, 64)): maximum_block_size=(1024, 1024, 64), device_number=None):
if field.spatial_dimensions > 3: super(BlockIndexing, self).__init__(iteration_space, data_layout)
raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
if self._dim > 4:
raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
if permute_block_size_dependent_on_layout: if permute_block_size_dependent_on_layout and self._dim < 4:
block_size = self.permute_block_size_according_to_layout(block_size, field.layout) block_size = self.permute_block_size_according_to_layout(block_size, data_layout)
self._block_size = block_size self._block_size = block_size
if maximum_block_size == 'auto': if maximum_block_size == 'auto':
assert device_number is not None, 'If "maximum_block_size" is set to "auto" a device number must be specified'
# Get device limits # Get device limits
import pycuda.driver as cuda import cupy as cp
# noinspection PyUnresolvedReferences # See https://github.com/cupy/cupy/issues/7676
import pycuda.autoinit # NOQA if cp.cuda.runtime.is_hip:
da = cuda.device_attribute maximum_block_size = tuple(cp.cuda.runtime.deviceGetAttribute(i, device_number) for i in range(26, 29))
device = cuda.Context.get_device() else:
maximum_block_size = tuple(device.get_attribute(a) da = cp.cuda.Device(device_number).attributes
for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z)) maximum_block_size = tuple(da[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"])
self._maximum_block_size = maximum_block_size self._maximum_block_size = maximum_block_size
self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
self._dim = field.spatial_dimensions
self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
self._compile_time_block_size = compile_time_block_size self._compile_time_block_size = compile_time_block_size
self._device_number = device_number
@property @property
def coordinates(self): def cuda_indices(self):
offsets = _get_start_from_slice(self._iterationSlice)
block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM
coordinates = [block_index * bs + thread_idx + off indices = [block_index * bs + thread_idx
for block_index, bs, thread_idx, off in zip(BLOCK_IDX, block_size, THREAD_IDX, offsets)] for block_index, bs, thread_idx in zip(BLOCK_IDX, block_size, THREAD_IDX)]
return indices[:self._dim]
return coordinates[:self._dim] @property
def coordinates(self):
if self._dim < 4:
coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space)]
return coordinates[:self._dim]
else:
coordinates = list()
width = self._iteration_space[1].stop - self._iteration_space[1].start
coordinates.append(div_floor(self.cuda_indices[0], width))
coordinates.append(sp.Mod(self.cuda_indices[0], width))
coordinates.append(self.cuda_indices[1] + self._iteration_space[2].start)
coordinates.append(self.cuda_indices[2] + self._iteration_space[3].start)
return coordinates
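# Worked example for the 4D folding above (editorial note; illustrative
# numbers): with iteration_space[1] = slice(0, 8, 1) the width is 8, so a
# linear x index of 19 unfolds to (div_floor(19, 8), sp.Mod(19, 8)) == (2, 3)
# for the two leading coordinates; the y and z indices drive the remaining two.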
def get_loop_ctr_assignments(self, loop_counter_symbols):
return _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space)
def call_parameters(self, arr_shape): def call_parameters(self, arr_shape):
substitution_dict = {sym: value for sym, value in zip(self._symbolic_shape, arr_shape) if sym is not None} numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
widths = _get_widths(numeric_iteration_slice)
if len(widths) > 3:
widths = [widths[0] * widths[1], widths[2], widths[3]]
widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
_get_end_from_slice(self._iterationSlice, arr_shape))]
widths = sp.Matrix(widths).subs(substitution_dict)
extend_bs = (1,) * (3 - len(self._block_size)) extend_bs = (1,) * (3 - len(self._block_size))
block_size = self._block_size + extend_bs block_size = self._block_size + extend_bs
if not self._compile_time_block_size: if not self._compile_time_block_size:
@@ -147,7 +205,8 @@ class BlockIndexing(AbstractIndexing):
for i in range(len(widths)): for i in range(len(widths)):
factor = div_floor(prod(block_size[:i]), prod(adapted_block_size)) factor = div_floor(prod(block_size[:i]), prod(adapted_block_size))
adapted_block_size.append(sp.Min(block_size[i] * factor, widths[i])) adapted_block_size.append(sp.Min(block_size[i] * factor, widths[i]))
block_size = tuple(adapted_block_size) + extend_bs extend_adapted_bs = (1,) * (3 - len(adapted_block_size))
block_size = tuple(adapted_block_size) + extend_adapted_bs
block_size = tuple(sp.Min(bs, max_bs) for bs, max_bs in zip(block_size, self._maximum_block_size)) block_size = tuple(sp.Min(bs, max_bs) for bs, max_bs in zip(block_size, self._maximum_block_size))
grid = tuple(div_ceil(length, block_size) for length, block_size in zip(widths, block_size)) grid = tuple(div_ceil(length, block_size) for length, block_size in zip(widths, block_size))
@@ -158,42 +217,59 @@ class BlockIndexing(AbstractIndexing):
def guard(self, kernel_content, arr_shape): def guard(self, kernel_content, arr_shape):
arr_shape = arr_shape[:self._dim] arr_shape = arr_shape[:self._dim]
conditions = [c < end if len(self._iteration_space) - 1 == len(arr_shape):
for c, end in zip(self.coordinates, _get_end_from_slice(self._iterationSlice, arr_shape))] numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space[1:], arr_shape)
numeric_iteration_slice = [self.iteration_space[0]] + numeric_iteration_slice
else:
assert len(self._iteration_space) == len(arr_shape), "Iteration space must be equal to the array shape"
numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
end = [s.stop if s.stop != 0 else 1 for s in numeric_iteration_slice]
for i, s in enumerate(numeric_iteration_slice):
if s.step and s.step != 1:
end[i] = div_ceil(s.stop - s.start, s.step) + s.start
if self._dim < 4:
conditions = [c < e for c, e in zip(self.coordinates, end)]
else:
end = [end[0] * end[1], end[2], end[3]]
coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space[1:])]
conditions = [c < e for c, e in zip(coordinates, end)]
condition = conditions[0] condition = conditions[0]
for c in conditions[1:]: for c in conditions[1:]:
condition = sp.And(condition, c) condition = sp.And(condition, c)
return Block([Conditional(condition, kernel_content)]) return Block([Conditional(condition, kernel_content)])
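# The guard emits GPU code of roughly this shape (editorial sketch):
#   if (ctr_0 < end_0 && ctr_1 < end_1) { <kernel body> }
# so threads of partially filled blocks skip the body instead of writing
# out of bounds.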
@staticmethod def numeric_iteration_space(self, arr_shape):
def limit_block_size_by_register_restriction(block_size, required_registers_per_thread, device=None): return _get_numeric_iteration_slice(self._iteration_space, arr_shape)
"""Shrinks the block_size if there are too many registers used per multiprocessor.
def limit_block_size_by_register_restriction(self, block_size, required_registers_per_thread):
"""Shrinks the block_size if there are too many registers used per block.
This is not done automatically, since the required_registers_per_thread are not known before compilation. This is not done automatically, since the required_registers_per_thread are not known before compilation.
They can be obtained by ``func.num_regs`` from a pycuda function. They can be obtained by ``func.num_regs`` from a cupy function.
:returns smaller block_size if too many registers are used. Args:
block_size: used block size that is target for limiting
required_registers_per_thread: needed registers per thread
returns: smaller block_size if too many registers are used.
""" """
import pycuda.driver as cuda import cupy as cp
# noinspection PyUnresolvedReferences
import pycuda.autoinit # NOQA
da = cuda.device_attribute
if device is None:
device = cuda.Context.get_device()
available_registers_per_mp = device.get_attribute(da.MAX_REGISTERS_PER_MULTIPROCESSOR)
block = block_size # See https://github.com/cupy/cupy/issues/7676
if cp.cuda.runtime.is_hip:
max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, self._device_number)
else:
device = cp.cuda.Device(self._device_number)
da = device.attributes
max_registers_per_block = da.get("MaxRegistersPerBlock")
result = list(block_size)
while True: while True:
num_threads = 1 required_registers = math.prod(result) * required_registers_per_thread
for t in block: if required_registers <= max_registers_per_block:
num_threads *= t return result
required_registers_per_mt = num_threads * required_registers_per_thread
if required_registers_per_mt <= available_registers_per_mp:
return block
else: else:
largest_grid_entry_idx = max(range(len(block)), key=lambda e: block[e]) largest_list_entry_idx = max(range(len(result)), key=lambda e: result[e])
assert block[largest_grid_entry_idx] >= 2 assert result[largest_list_entry_idx] >= 2
block[largest_grid_entry_idx] //= 2 result[largest_list_entry_idx] //= 2
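# Worked example (editorial note; illustrative numbers): block_size
# (128, 2, 1) with 80 registers per thread needs 128 * 2 * 1 * 80 = 20480
# registers per block. If the device limit were 16384, the largest entry is
# halved to (64, 2, 1), which needs 10240 registers and is returned.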
@staticmethod @staticmethod
def permute_block_size_according_to_layout(block_size, layout): def permute_block_size_according_to_layout(block_size, layout):
@@ -222,42 +298,48 @@ class BlockIndexing(AbstractIndexing):
class LineIndexing(AbstractIndexing): class LineIndexing(AbstractIndexing):
""" """
Indexing scheme that assigns the innermost 'line' i.e. the elements which are adjacent in memory to a 1D CUDA block. Indexing scheme that assigns the innermost 'line' i.e. the elements which are adjacent in memory to a 1D GPU block.
The fastest coordinate is indexed with thread_idx.x, the remaining coordinates are mapped to block_idx.{x,y,z} The fastest coordinate is indexed with thread_idx.x, the remaining coordinates are mapped to block_idx.{x,y,z}
This indexing scheme supports up to 4 spatial dimensions, where the innermost dimension is not larger than the This indexing scheme supports up to 4 spatial dimensions, where the innermost dimension is not larger than the
maximum amount of threads allowed in a CUDA block (which depends on device). maximum number of threads allowed in a GPU block (which depends on the device).
Args:
iteration_space: list of slices to determine start, stop and the step size for each coordinate
data_layout: tuple to determine the fast and slow coordinates.
""" """
def __init__(self, field, iteration_slice): def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple):
available_indices = [THREAD_IDX[0]] + BLOCK_IDX super(LineIndexing, self).__init__(iteration_space, data_layout)
if field.spatial_dimensions > 4:
if len(iteration_space) > 4:
raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions") raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
coordinates = available_indices[:field.spatial_dimensions] @property
def cuda_indices(self):
available_indices = [THREAD_IDX[0]] + BLOCK_IDX
coordinates = available_indices[:self.dim]
fastest_coordinate = field.layout[-1] fastest_coordinate = self.data_layout[-1]
coordinates[0], coordinates[fastest_coordinate] = coordinates[fastest_coordinate], coordinates[0] coordinates[0], coordinates[fastest_coordinate] = coordinates[fastest_coordinate], coordinates[0]
self._coordinates = coordinates return coordinates
self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
@property @property
def coordinates(self): def coordinates(self):
return [i + offset for i, offset in zip(self._coordinates, _get_start_from_slice(self._iterationSlice))] return [i + o.start for i, o in zip(self.cuda_indices, self._iteration_space)]
def call_parameters(self, arr_shape): def get_loop_ctr_assignments(self, loop_counter_symbols):
substitution_dict = {sym: value for sym, value in zip(self._symbolicShape, arr_shape) if sym is not None} return _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space)
widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice), def call_parameters(self, arr_shape):
_get_end_from_slice(self._iterationSlice, arr_shape))] numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
widths = sp.Matrix(widths).subs(substitution_dict) widths = _get_widths(numeric_iteration_slice)
def get_shape_of_cuda_idx(cuda_idx): def get_shape_of_cuda_idx(cuda_idx):
if cuda_idx not in self._coordinates: if cuda_idx not in self.cuda_indices:
return 1 return 1
else: else:
idx = self._coordinates.index(cuda_idx) idx = self.cuda_indices.index(cuda_idx)
return widths[idx] return widths[idx]
return {'block': tuple([get_shape_of_cuda_idx(idx) for idx in THREAD_IDX]), return {'block': tuple([get_shape_of_cuda_idx(idx) for idx in THREAD_IDX]),
@@ -272,30 +354,66 @@ class LineIndexing(AbstractIndexing):
def symbolic_parameters(self): def symbolic_parameters(self):
return set() return set()
def numeric_iteration_space(self, arr_shape):
return _get_numeric_iteration_slice(self._iteration_space, arr_shape)
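# Usage sketch (editorial note; values are illustrative, not original source):
# with layout (0, 1) the fastest coordinate (index 1) is driven by threadIdx.x
# and the remaining coordinate by blockIdx.x:
#   indexing = LineIndexing(iteration_space=(slice(0, 32, 1), slice(0, 64, 1)),
#                           data_layout=(0, 1))
#   indexing.coordinates  # -> [blockIdx.x + 0, threadIdx.x + 0]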
# -------------------------------------- Helper functions -------------------------------------------------------------- # -------------------------------------- Helper functions --------------------------------------------------------------
def _get_start_from_slice(iteration_slice): def _get_numeric_iteration_slice(iteration_slice, arr_shape):
res = [] res = []
for slice_component in iteration_slice: for slice_component, shape in zip(iteration_slice, arr_shape):
if type(slice_component) is slice: result_slice = slice_component
res.append(slice_component.start if slice_component.start is not None else 0) if not isinstance(result_slice.start, int):
else: start = result_slice.start
assert isinstance(slice_component, int) assert len(start.free_symbols) == 1
res.append(slice_component) start = start.subs({symbol: shape for symbol in start.free_symbols})
result_slice = slice(start, result_slice.stop, result_slice.step)
if not isinstance(result_slice.stop, int):
stop = result_slice.stop
assert len(stop.free_symbols) == 1
stop = stop.subs({symbol: shape for symbol in stop.free_symbols})
result_slice = slice(result_slice.start, stop, result_slice.step)
assert isinstance(result_slice.step, int)
res.append(result_slice)
return res return res
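# Example (editorial note; illustrative): with arr_shape == (32,) a symbolic
# slice such as slice(1, _size_f_0 - 1, 1) is resolved to slice(1, 31, 1) by
# substituting the array shape for the single free symbol in the stop
# expression.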
def _get_end_from_slice(iteration_slice, arr_shape): def _get_widths(iteration_slice):
iter_slice = normalize_slice(iteration_slice, arr_shape) widths = []
res = [] for iter_slice in iteration_slice:
for slice_component in iter_slice: step = iter_slice.step
if type(slice_component) is slice: assert isinstance(step, int), f"Step can only be of type int, not of type {type(step)}"
res.append(slice_component.stop) start = iter_slice.start
stop = iter_slice.stop
if step == 1:
if stop - start == 0:
widths.append(1)
else:
widths.append(stop - start)
else: else:
assert isinstance(slice_component, int) width = (stop - start) / step
res.append(slice_component + 1) if isinstance(width, int):
return res widths.append(width)
elif isinstance(width, float):
widths.append(math.ceil(width))
else:
widths.append(div_ceil(stop - start, step))
return widths
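# Worked examples (editorial note; illustrative): slice(0, 10, 1) yields
# width 10; slice(2, 10, 2) yields ceil((10 - 2) / 2) == 4, i.e. the
# iteration visits 2, 4, 6, 8.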
def _loop_ctr_assignments(loop_counter_symbols, coordinates, iteration_space):
loop_ctr_assignments = []
for loop_counter, coordinate, iter_slice in zip(loop_counter_symbols, coordinates, iteration_space):
if isinstance(iter_slice, slice) and iter_slice.step > 1:
offset = (iter_slice.step * iter_slice.start) - iter_slice.start
loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate * iter_slice.step - offset))
elif iter_slice.start == iter_slice.stop:
loop_ctr_assignments.append(SympyAssignment(loop_counter, 0))
else:
loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate))
return loop_ctr_assignments
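# Worked example (editorial note; illustrative): for slice(2, 10, 2) the
# offset is 2 * 2 - 2 == 2, so a thread coordinate c (which already includes
# the start offset 2) is mapped to loop_ctr = 2 * c - 2, i.e. c = 2, 3, 4, 5
# produces the counters 2, 4, 6, 8.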
def indexing_creator_from_params(gpu_indexing, gpu_indexing_params): def indexing_creator_from_params(gpu_indexing, gpu_indexing_params):
...
import numpy as np import sympy as sp
from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment
from pystencils.data_types import StructType, TypedSymbol from pystencils.config import CreateKernelConfig
from pystencils.typing import StructType, TypedSymbol
from pystencils.typing.transformations import add_types
from pystencils.field import Field, FieldType from pystencils.field import Field, FieldType
from pystencils.enums import Target, Backend from pystencils.enums import Target, Backend
from pystencils.gpucuda.cudajit import make_python_function from pystencils.gpu.gpujit import make_python_function
from pystencils.gpucuda.indexing import BlockIndexing from pystencils.node_collection import NodeCollection
from pystencils.gpu.indexing import indexing_creator_from_params
from pystencils.slicing import normalize_slice
from pystencils.transformations import ( from pystencils.transformations import (
add_types, get_base_buffer_index, get_common_shape, implement_interpolations, get_base_buffer_index, get_common_field, get_common_indexed_element, parse_base_pointer_info,
parse_base_pointer_info, resolve_buffer_accesses, resolve_field_accesses, unify_shape_symbols) resolve_buffer_accesses, resolve_field_accesses, unify_shape_symbols)
def create_cuda_kernel(assignments, def create_cuda_kernel(assignments: NodeCollection, config: CreateKernelConfig):
function_name="kernel",
type_info=None, function_name = config.function_name
indexing_creator=BlockIndexing, indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
iteration_slice=None, iteration_slice = config.iteration_slice
ghost_layers=None, ghost_layers = config.ghost_layers
skip_independence_check=False,
use_textures_for_interpolation=True): fields_written = assignments.bound_fields
assert assignments, "Assignments must not be empty!" fields_read = assignments.rhs_fields
fields_read, fields_written, assignments = add_types(assignments, type_info, not skip_independence_check) assignments = assignments.all_assignments
assignments = add_types(assignments, config)
all_fields = fields_read.union(fields_written) all_fields = fields_read.union(fields_written)
read_only_fields = set([f.name for f in fields_read - fields_written]) read_only_fields = set([f.name for f in fields_read - fields_written])
buffers = set([f for f in all_fields if FieldType.is_buffer(f) or FieldType.is_custom(f)]) buffers = set([f for f in all_fields if FieldType.is_buffer(f)])
fields_without_buffers = all_fields - buffers fields_without_buffers = all_fields - buffers
field_accesses = set() field_accesses = set()
num_buffer_accesses = 0 num_buffer_accesses = 0
indexed_elements = set()
for eq in assignments: for eq in assignments:
indexed_elements.update(eq.atoms(sp.Indexed))
field_accesses.update(eq.atoms(Field.Access)) field_accesses.update(eq.atoms(Field.Access))
field_accesses = {e for e in field_accesses if not e.is_absolute_access} field_accesses = {e for e in field_accesses if not e.is_absolute_access}
num_buffer_accesses += sum(1 for access in eq.atoms(Field.Access) if FieldType.is_buffer(access.field)) num_buffer_accesses += sum(1 for access in eq.atoms(Field.Access) if FieldType.is_buffer(access.field))
common_shape = get_common_shape(fields_without_buffers) # common shape and field to form the iteration space
common_field = get_common_field(fields_without_buffers)
common_shape = common_field.spatial_shape
if iteration_slice is None: if iteration_slice is None:
# determine iteration slice from ghost layers # determine iteration slice from ghost layers
@@ -52,17 +63,28 @@ def create_cuda_kernel(assignments,
iteration_slice.append(slice(ghost_layers[i][0], iteration_slice.append(slice(ghost_layers[i][0],
-ghost_layers[i][1] if ghost_layers[i][1] > 0 else None)) -ghost_layers[i][1] if ghost_layers[i][1] > 0 else None))
indexing = indexing_creator(field=list(fields_without_buffers)[0], iteration_slice=iteration_slice) iteration_space = normalize_slice(iteration_slice, common_shape)
coord_mapping = indexing.coordinates else:
iteration_space = normalize_slice(iteration_slice, common_shape)
iteration_space = tuple([s if isinstance(s, slice) else slice(s, s + 1, 1) for s in iteration_space])
loop_counter_symbols = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(len(iteration_space))]
if len(indexed_elements) > 0:
common_indexed_element = get_common_indexed_element(indexed_elements)
index = common_indexed_element.indices[0].atoms(TypedSymbol)
assert len(index) == 1, "index expressions must only contain one symbol representing the index"
indexing = indexing_creator(iteration_space=(slice(0, common_indexed_element.shape[0], 1), *iteration_space),
data_layout=common_field.layout)
extended_ctrs = [index.pop(), *loop_counter_symbols]
loop_counter_assignments = indexing.get_loop_ctr_assignments(extended_ctrs)
else:
indexing = indexing_creator(iteration_space=iteration_space, data_layout=common_field.layout)
loop_counter_assignments = indexing.get_loop_ctr_assignments(loop_counter_symbols)
assignments = loop_counter_assignments + assignments
block = indexing.guard(Block(assignments), common_shape)
cell_idx_assignments = [SympyAssignment(LoopOverCoordinate.get_loop_counter_symbol(i), value)
for i, value in enumerate(coord_mapping)]
cell_idx_symbols = [LoopOverCoordinate.get_loop_counter_symbol(i) for i, _ in enumerate(coord_mapping)]
assignments = cell_idx_assignments + assignments
block = Block(assignments)
block = indexing.guard(block, common_shape)
unify_shape_symbols(block, common_shape=common_shape, fields=fields_without_buffers) unify_shape_symbols(block, common_shape=common_shape, fields=fields_without_buffers)
ast = KernelFunction(block, ast = KernelFunction(block,
@@ -74,19 +96,18 @@ def create_cuda_kernel(assignments,
assignments=assignments) assignments=assignments)
ast.global_variables.update(indexing.index_variables) ast.global_variables.update(indexing.index_variables)
implement_interpolations(ast, implement_by_texture_accesses=use_textures_for_interpolation) base_pointer_spec = config.base_pointer_specification
if base_pointer_spec is None:
base_pointer_spec = [['spatialInner0']] base_pointer_spec = []
base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0], base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0],
f.spatial_dimensions, f.index_dimensions) f.spatial_dimensions, f.index_dimensions)
for f in all_fields} for f in all_fields}
coord_mapping = {f.name: cell_idx_symbols for f in all_fields} coord_mapping = {f.name: loop_counter_symbols for f in all_fields}
loop_strides = list(fields_without_buffers)[0].shape
if any(FieldType.is_buffer(f) for f in all_fields): if any(FieldType.is_buffer(f) for f in all_fields):
resolve_buffer_accesses(ast, get_base_buffer_index(ast, indexing.coordinates, loop_strides), read_only_fields) base_buffer_index = get_base_buffer_index(ast, loop_counter_symbols, iteration_space)
resolve_buffer_accesses(ast, base_buffer_index, read_only_fields)
resolve_field_accesses(ast, read_only_fields, field_to_base_pointer_info=base_pointer_info, resolve_field_accesses(ast, read_only_fields, field_to_base_pointer_info=base_pointer_info,
field_to_fixed_coordinates=coord_mapping) field_to_fixed_coordinates=coord_mapping)
@@ -105,34 +126,41 @@ def create_cuda_kernel(assignments,
return ast return ast
def created_indexed_cuda_kernel(assignments, def created_indexed_cuda_kernel(assignments: NodeCollection, config: CreateKernelConfig):
index_fields,
function_name="kernel", index_fields = config.index_fields
type_info=None, function_name = config.function_name
coordinate_names=('x', 'y', 'z'), coordinate_names = config.coordinate_names
indexing_creator=BlockIndexing, indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
use_textures_for_interpolation=True): fields_written = assignments.bound_fields
fields_read, fields_written, assignments = add_types(assignments, type_info, check_independence_condition=False) fields_read = assignments.rhs_fields
all_fields = fields_read.union(fields_written) all_fields = fields_read.union(fields_written)
read_only_fields = set([f.name for f in fields_read - fields_written]) read_only_fields = set([f.name for f in fields_read - fields_written])
# extract the index fields based on their names. The original index field might have been modified
index_fields = [idx_field for idx_field in index_fields if idx_field.name in [f.name for f in all_fields]]
non_index_fields = [f for f in all_fields if f not in index_fields]
spatial_coordinates = {f.spatial_dimensions for f in non_index_fields}
assert len(spatial_coordinates) == 1, f"Non-index fields do not have the same number of spatial coordinates. " \
f"Non-index fields are {non_index_fields}, spatial coordinates are " \
f"{spatial_coordinates}"
spatial_coordinates = list(spatial_coordinates)[0]
assignments = assignments.all_assignments
assignments = add_types(assignments, config)
for index_field in index_fields: for index_field in index_fields:
index_field.field_type = FieldType.INDEXED index_field.field_type = FieldType.INDEXED
assert FieldType.is_indexed(index_field) assert FieldType.is_indexed(index_field)
assert index_field.spatial_dimensions == 1, "Index fields have to be 1D" assert index_field.spatial_dimensions == 1, "Index fields have to be 1D"
non_index_fields = [f for f in all_fields if f not in index_fields]
spatial_coordinates = {f.spatial_dimensions for f in non_index_fields}
assert len(spatial_coordinates) == 1, "Non-index fields do not have the same number of spatial coordinates"
spatial_coordinates = list(spatial_coordinates)[0]
def get_coordinate_symbol_assignment(name): def get_coordinate_symbol_assignment(name):
for ind_f in index_fields: for ind_f in index_fields:
assert isinstance(ind_f.dtype, StructType), "Index fields have to have a struct data type" assert isinstance(ind_f.dtype, StructType), "Index fields have to have a struct data type"
data_type = ind_f.dtype data_type = ind_f.dtype
if data_type.has_element(name): if data_type.has_element(name):
rhs = ind_f[0](name) rhs = ind_f[0](name)
lhs = TypedSymbol(name, np.int64) lhs = TypedSymbol(name, data_type.get_element_type(name))
return SympyAssignment(lhs, rhs) return SympyAssignment(lhs, rhs)
raise ValueError(f"Index {name} not found in any of the passed index fields") raise ValueError(f"Index {name} not found in any of the passed index fields")
@@ -141,17 +169,19 @@ def created_indexed_cuda_kernel(assignments,
coordinate_typed_symbols = [eq.lhs for eq in coordinate_symbol_assignments] coordinate_typed_symbols = [eq.lhs for eq in coordinate_symbol_assignments]
idx_field = list(index_fields)[0] idx_field = list(index_fields)[0]
indexing = indexing_creator(field=idx_field,
iteration_slice=[slice(None, None, None)] * len(idx_field.spatial_shape)) iteration_space = normalize_slice(tuple([slice(None, None, None)]) * len(idx_field.spatial_shape),
idx_field.spatial_shape)
indexing = indexing_creator(iteration_space=iteration_space,
data_layout=idx_field.layout)
function_body = Block(coordinate_symbol_assignments + assignments) function_body = Block(coordinate_symbol_assignments + assignments)
function_body = indexing.guard(function_body, get_common_shape(index_fields)) function_body = indexing.guard(function_body, get_common_field(index_fields).spatial_shape)
ast = KernelFunction(function_body, Target.GPU, Backend.CUDA, make_python_function, ast = KernelFunction(function_body, Target.GPU, Backend.CUDA, make_python_function,
None, function_name, assignments=assignments) None, function_name, assignments=assignments)
ast.global_variables.update(indexing.index_variables) ast.global_variables.update(indexing.index_variables)
implement_interpolations(ast, implement_by_texture_accesses=use_textures_for_interpolation)
coord_mapping = indexing.coordinates coord_mapping = indexing.coordinates
base_pointer_spec = [['spatialInner0']] base_pointer_spec = [['spatialInner0']]
base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0], base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0],
...
import numpy as np import numpy as np
from itertools import product from itertools import product
import pystencils.gpucuda from pystencils import CreateKernelConfig, create_kernel
import pystencils.opencl from pystencils.gpu import make_python_function
from pystencils import Assignment, Field from pystencils import Assignment, Field
from pystencils.gpucuda.kernelcreation import create_cuda_kernel
from pystencils.enums import Target from pystencils.enums import Target
from pystencils.slicing import get_periodic_boundary_src_dst_slices, normalize_slice from pystencils.slicing import get_periodic_boundary_src_dst_slices, normalize_slice
@@ -27,24 +26,21 @@ def create_copy_kernel(domain_size, from_slice, to_slice, index_dimensions=0, in
eq = Assignment(f(*i), f[tuple(offset)](*i)) eq = Assignment(f(*i), f[tuple(offset)](*i))
update_eqs.append(eq) update_eqs.append(eq)
ast = create_cuda_kernel(update_eqs, iteration_slice=to_slice, skip_independence_check=True) config = CreateKernelConfig(target=Target.GPU, iteration_slice=to_slice, skip_independence_check=True)
ast = create_kernel(update_eqs, config=config)
return ast return ast
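# Usage sketch for the periodicity functor created by
# get_periodic_boundary_functor below (editorial note; the module path, the
# field name 'pdfs' and the shapes are assumptions):
import cupy as cp
from pystencils.gpu.periodicity import get_periodic_boundary_functor

periodic = get_periodic_boundary_functor(stencil=[(1, 0), (-1, 0), (0, 1), (0, -1)],
                                         domain_size=(34, 34))  # shape including ghost layers (assumption)
pdfs_arr = cp.zeros((34, 34))
periodic(pdfs=pdfs_arr)  # copies each boundary slice into the opposite ghost layer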
def get_periodic_boundary_functor(stencil, domain_size, index_dimensions=0, index_dim_shape=1, ghost_layers=1, def get_periodic_boundary_functor(stencil, domain_size, index_dimensions=0, index_dim_shape=1, ghost_layers=1,
thickness=None, dtype=float, target=Target.GPU, opencl_queue=None, opencl_ctx=None): thickness=None, dtype=np.float64, target=Target.GPU):
assert target in {Target.GPU, Target.OPENCL} assert target == Target.GPU
src_dst_slice_tuples = get_periodic_boundary_src_dst_slices(stencil, ghost_layers, thickness) src_dst_slice_tuples = get_periodic_boundary_src_dst_slices(stencil, ghost_layers, thickness)
kernels = [] kernels = []
for src_slice, dst_slice in src_dst_slice_tuples: for src_slice, dst_slice in src_dst_slice_tuples:
ast = create_copy_kernel(domain_size, src_slice, dst_slice, index_dimensions, index_dim_shape, dtype) ast = create_copy_kernel(domain_size, src_slice, dst_slice, index_dimensions, index_dim_shape, dtype)
if target == pystencils.Target.GPU: kernels.append(make_python_function(ast))
kernels.append(pystencils.gpucuda.make_python_function(ast))
else:
ast._target = pystencils.Target.OPENCL
ast._backend = pystencils.Backend.OPENCL
kernels.append(pystencils.opencl.make_python_function(ast, opencl_queue, opencl_ctx))
def functor(pdfs, **_): def functor(pdfs, **_):
for kernel in kernels: for kernel in kernels:
...