5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6
--- a/pystencils/cpu/kernelcreation.py
+++ b/pystencils/cpu/kernelcreation.py
-from typing import Union
 import sympy as sp
 import pystencils.astnodes as ast
-from pystencils.simp.assignment_collection import AssignmentCollection
 from pystencils.config import CreateKernelConfig
 from pystencils.enums import Target, Backend
 from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment
@@ -13,12 +10,13 @@ from pystencils.typing.transformations import add_types
 from pystencils.field import Field, FieldType
 from pystencils.node_collection import NodeCollection
 from pystencils.transformations import (
-    filtered_tree_iteration, get_base_buffer_index, get_optimal_loop_ordering, make_loop_over_domain,
+    filtered_tree_iteration, iterate_loops_by_depth, get_base_buffer_index, get_optimal_loop_ordering,
+    make_loop_over_domain, add_outer_loop_over_indexed_elements,
    move_constants_before_loop, parse_base_pointer_info, resolve_buffer_accesses,
    resolve_field_accesses, split_inner_loop)
-def create_kernel(assignments: Union[AssignmentCollection, NodeCollection],
+def create_kernel(assignments: NodeCollection,
                  config: CreateKernelConfig) -> KernelFunction:
    """Creates an abstract syntax tree for a kernel function, by taking a list of update rules.
@@ -56,6 +54,8 @@ def create_kernel(assignments: Union[AssignmentCollection, NodeCollection],
    loop_order = get_optimal_loop_ordering(fields_without_buffers)
    loop_node, ghost_layer_info = make_loop_over_domain(body, iteration_slice=iteration_slice,
                                                        ghost_layers=ghost_layers, loop_order=loop_order)
+    loop_node = add_outer_loop_over_indexed_elements(loop_node)
    ast_node = KernelFunction(loop_node, Target.CPU, Backend.C, compile_function=make_python_function,
                              ghost_layers=ghost_layer_info, function_name=function_name, assignments=assignments)
@@ -76,7 +76,9 @@ def create_kernel(assignments: Union[AssignmentCollection, NodeCollection],
        typed_split_groups = [[type_symbol(s) for s in split_group] for split_group in split_groups]
        split_inner_loop(ast_node, typed_split_groups)
-    base_pointer_spec = [['spatialInner0'], ['spatialInner1']] if len(loop_order) >= 2 else [['spatialInner0']]
+    base_pointer_spec = config.base_pointer_specification
+    if base_pointer_spec is None:
+        base_pointer_spec = []
    base_pointer_info = {field.name: parse_base_pointer_info(base_pointer_spec, loop_order,
                                                             field.spatial_dimensions, field.index_dimensions)
                         for field in fields_without_buffers}
@@ -94,7 +96,7 @@ def create_kernel(assignments: Union[AssignmentCollection, NodeCollection],
    return ast_node
-def create_indexed_kernel(assignments: Union[AssignmentCollection, NodeCollection],
+def create_indexed_kernel(assignments: NodeCollection,
                          config: CreateKernelConfig) -> KernelFunction:
    """
    Similar to :func:`create_kernel`, but here not all cells of a field are updated but only cells with
@@ -115,21 +117,24 @@ def create_indexed_kernel(assignments: Union[AssignmentCollection, NodeCollectio
    fields_written = assignments.bound_fields
    fields_read = assignments.rhs_fields
+    all_fields = fields_read.union(fields_written)
+    # extract the index fields based on the name. The original index field might have been modified
+    index_fields = [idx_field for idx_field in index_fields if idx_field.name in [f.name for f in all_fields]]
+    non_index_fields = [f for f in all_fields if f not in index_fields]
+    spatial_coordinates = {f.spatial_dimensions for f in non_index_fields}
+    assert len(spatial_coordinates) == 1, f"Non-index fields do not have the same number of spatial coordinates " \
+                                          f"Non index fields are {non_index_fields}, spatial coordinates are " \
+                                          f"{spatial_coordinates}"
+    spatial_coordinates = list(spatial_coordinates)[0]
    assignments = assignments.all_assignments
    assignments = add_types(assignments, config)
-    all_fields = fields_read.union(fields_written)
    for index_field in index_fields:
        index_field.field_type = FieldType.INDEXED
        assert FieldType.is_indexed(index_field)
        assert index_field.spatial_dimensions == 1, "Index fields have to be 1D"
-    non_index_fields = [f for f in all_fields if f not in index_fields]
-    spatial_coordinates = {f.spatial_dimensions for f in non_index_fields}
-    assert len(spatial_coordinates) == 1, "Non-index fields do not have the same number of spatial coordinates"
-    spatial_coordinates = list(spatial_coordinates)[0]
    def get_coordinate_symbol_assignment(name):
        for idx_field in index_fields:
            assert isinstance(idx_field.dtype, StructType), "Index fields have to have a struct data type"
@@ -211,3 +216,18 @@ def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None, ass
        if collapse:
            prefix += f" collapse({collapse})"
        loop_to_parallelize.prefix_lines.append(prefix)
+def add_pragmas(ast_node, pragma_lines, nesting_depth=-1):
+    """Prepends given pragma lines to all loops of specified nesting depth.
+    Args:
+        ast_node: pystencils abstract syntax tree
+        pragma_lines: Iterable of strings containing the pragma lines
+        nesting_depth: Nesting depth of the loops the pragmas should be applied to.
+                       Outermost loop has depth 0.
+                       A depth of -1 indicates the innermost loops.
+    """
+    loop_nodes = iterate_loops_by_depth(ast_node, nesting_depth)
+    for n in loop_nodes:
+        n.prefix_lines += list(pragma_lines)
--- a/pystencils/cpu/msvc_detection.py
+++ b/pystencils/cpu/msvc_detection.py
--- a/pystencils/cpu/vectorization.py
+++ b/pystencils/cpu/vectorization.py
@@ -123,12 +123,14 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'best',
                                  "to differently typed floating point fields")
    float_size = field_float_dtypes.pop().numpy_dtype.itemsize
    assert float_size in (8, 4)
-    default_float_type = 'double' if float_size == 8 else 'float'
+    default_float_type = 'float64' if float_size == 8 else 'float32'
    vector_is = get_vector_instruction_set(default_float_type, instruction_set=instruction_set)
    kernel_ast.instruction_set = vector_is
+    if nontemporal and 'cachelineZero' in vector_is:
+        kernel_ast.use_all_written_field_sizes = True
    strided = 'storeS' in vector_is and 'loadS' in vector_is
-    keep_loop_stop = '{loop_stop}' in vector_is['storeA' if assume_aligned else 'storeU']
+    keep_loop_stop = '{loop_stop}' in vector_is['storeA' if assume_aligned and 'storeA' in vector_is else 'storeU']
    vectorize_inner_loops_and_adapt_load_stores(kernel_ast, assume_aligned, nontemporal,
                                                strided, keep_loop_stop, assume_sufficient_line_padding,
                                                default_float_type)
@@ -138,12 +140,15 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem
                                                strided, keep_loop_stop, assume_sufficient_line_padding,
                                                default_float_type):
    """Goes over all innermost loops, changes increment to vector width and replaces field accesses by vector type."""
-    vector_width = ast_node.instruction_set['width']
+    all_loops = list(filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment))
-    all_loops = filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment)
    inner_loops = [loop for loop in all_loops if loop.is_innermost_loop]
    zero_loop_counters = {loop.loop_counter_symbol: 0 for loop in all_loops}
+    vector_is = ast_node.instruction_set
+    assert vector_is, "The ast needs to hold information about the instruction_set for the vectorisation"
+    vector_width = vector_is['width']
+    vector_int_width = vector_is['intwidth']
    for loop_node in inner_loops:
        loop_range = loop_node.stop - loop_node.start
@@ -156,6 +161,7 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem
            loop_node.stop = new_stop
        else:
            cutting_point = modulo_floor(loop_range, vector_width) + loop_node.start
+            # TODO cut_loop calls deepcopy on the loop_node. This is bad as documented in cut_loop
            loop_nodes = [loop for loop in cut_loop(loop_node, [cutting_point]).args
                          if isinstance(loop, ast.LoopOverCoordinate)]
            assert len(loop_nodes) in (0, 1, 2)  # 2 for main and tail loop, 1 if loop range divisible by vector width
@@ -171,8 +177,15 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem
        for indexed in loop_node.atoms(sp.Indexed):
            base, index = indexed.args
            if loop_counter_symbol in index.atoms(sp.Symbol):
+                if 'loadA' not in vector_is and 'storeA' not in vector_is and 'maskStoreA' not in vector_is:
+                    # don't need to generate the alignment check when there are no aligned load/store instructions
+                    aligned_access = False
+                else:
+                    if not isinstance(vector_width, int):
+                        raise NotImplementedError('Access alignment cannot be statically determined for sizeless '
+                                                  'vector ISAs')
+                    aligned_access = (index - loop_counter_symbol).subs(zero_loop_counters) % vector_width == 0
                loop_counter_is_offset = loop_counter_symbol not in (index - loop_counter_symbol).atoms()
-                aligned_access = (index - loop_counter_symbol).subs(zero_loop_counters) == 0
                stride = sp.simplify(index.subs({loop_counter_symbol: loop_counter_symbol + 1}) - index)
                if not loop_counter_is_offset and (not strided or loop_counter_symbol in stride.atoms()):
                    successful = False
@@ -201,7 +214,6 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem
        loop_node.step = vector_width
        loop_node.subs(substitutions)
-        vector_int_width = ast_node.instruction_set['intwidth']
        arg_1 = CastFunc(loop_counter_symbol, VectorType(loop_counter_symbol.dtype, vector_int_width))
        arg_2 = CastFunc(tuple(range(vector_int_width if type(vector_int_width) is int else 2)),
                         VectorType(loop_counter_symbol.dtype, vector_int_width))
@@ -220,7 +232,7 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem
            substitutions.update({s[0]: s[1] for s in zip(rng.result_symbols, new_result_symbols)})
            rng._symbols_defined = set(new_result_symbols)
        fast_subs(loop_node, substitutions, skip=lambda e: isinstance(e, RNGBase))
-        insert_vector_casts(loop_node, ast_node.instruction_set, default_float_type)
+        insert_vector_casts(loop_node, vector_is, default_float_type)
 def mask_conditionals(loop_body):
@@ -252,24 +264,47 @@ def mask_conditionals(loop_body):
 def insert_vector_casts(ast_node, instruction_set, default_float_type='double'):
    """Inserts necessary casts from scalar values to vector values."""
-    handled_functions = (sp.Add, sp.Mul, vec_any, vec_all, DivFunc, sp.UnevaluatedExpr, sp.Abs)
+    handled_functions = (sp.Add, sp.Mul, vec_any, vec_all, DivFunc, sp.Abs)
+    def is_scalar(expr) -> bool:
+        if hasattr(expr, "dtype"):
+            if type(expr.dtype) is VectorType:
+                return False
+            # Else branch: If expr is a CastFunc, then whether the expression
+            # is scalar is determined by the argument (remember: vector casts
+            # are not inserted yet). Therefore, we must recurse into the args of
+            # expr below. Otherwise, this expression is atomic and in that case
+            # it is assumed to be scalar below.
+        if isinstance(expr, ast.ResolvedFieldAccess):
+            # expr.field is not in expr.args
+            return is_scalar(expr.field)
+        elif isinstance(expr, (vec_any, vec_all)):
+            return True
-    def visit_expr(expr, default_type='double'):  # TODO Vectorization Revamp: get rid of default_type
+        if not hasattr(expr, "args"):
+            return True
+        return all(is_scalar(arg) for arg in expr.args)
+    # TODO Vectorization Revamp: get rid of default_type
+    def visit_expr(expr, default_type='double', force_vectorize=False):
        if isinstance(expr, VectorMemoryAccess):
-            return VectorMemoryAccess(*expr.args[0:4], visit_expr(expr.args[4], default_type), *expr.args[5:])
+            return VectorMemoryAccess(*expr.args[0:4], visit_expr(expr.args[4], default_type, force_vectorize),
+                                      *expr.args[5:])
        elif isinstance(expr, CastFunc):
            cast_type = expr.args[1]
-            arg = visit_expr(expr.args[0])
+            arg = visit_expr(expr.args[0], default_type, force_vectorize)
-            assert cast_type in [BasicType('float32'), BasicType('float64')],\
+            assert cast_type in [BasicType('float32'), BasicType('float64')], \
                f'Vectorization cannot vectorize type {cast_type}'
            return expr.func(arg, VectorType(cast_type, instruction_set['width']))
        elif expr.func is sp.Abs and 'abs' not in instruction_set:
-            new_arg = visit_expr(expr.args[0], default_type)
+            new_arg = visit_expr(expr.args[0], default_type, force_vectorize)
            base_type = get_type_of_expression(expr.args[0]).base_type if type(expr.args[0]) is VectorMemoryAccess \
                else get_type_of_expression(expr.args[0])
            pw = sp.Piecewise((-new_arg, new_arg < CastFunc(0, base_type.numpy_dtype)),
                              (new_arg, True))
-            return visit_expr(pw, default_type)
+            return visit_expr(pw, default_type, force_vectorize)
        elif expr.func in handled_functions or isinstance(expr, sp.Rel) or isinstance(expr, BooleanFunction):
            if expr.func is sp.Mul and expr.args[0] == -1:
                # special treatment for the unary minus: make sure that the -1 has the same type as the argument
@@ -284,7 +319,7 @@ def insert_vector_casts(ast_node, instruction_set, default_float_type='double'):
                    if dtype is np.float32:
                        default_type = 'float'
                    expr = sp.Mul(dtype(expr.args[0]), *expr.args[1:])
-            new_args = [visit_expr(a, default_type) for a in expr.args]
+            new_args = [visit_expr(a, default_type, force_vectorize) for a in expr.args]
            arg_types = [get_type_of_expression(a, default_float_type=default_type) for a in new_args]
            if not any(type(t) is VectorType for t in arg_types):
                return expr
@@ -294,12 +329,32 @@ def insert_vector_casts(ast_node, instruction_set, default_float_type='double'):
                    CastFunc(a, target_type) if t != target_type and not isinstance(a, VectorMemoryAccess) else a
                    for a, t in zip(new_args, arg_types)]
                return expr.func(*casted_args)
+        elif expr.func is sp.UnevaluatedExpr:
+            assert expr.args[0].is_Pow or expr.args[0].is_Mul, "UnevaluatedExpr only implemented holding Mul or Pow"
+            # TODO this is only because cut_loop evaluates the multiplications again due to deepcopy. All this should
+            # TODO be fixed for real at some point.
+            if expr.args[0].is_Pow:
+                base = expr.args[0].base
+                exp = expr.args[0].exp
+                expr = sp.UnevaluatedExpr(sp.Mul(*([base] * +exp), evaluate=False))
+            new_args = [visit_expr(a, default_type, force_vectorize) for a in expr.args[0].args]
+            arg_types = [get_type_of_expression(a, default_float_type=default_type) for a in new_args]
+            target_type = collate_types(arg_types)
+            if not any(type(t) is VectorType for t in arg_types):
+                target_type = VectorType(target_type, instruction_set['width'])
+            casted_args = [
+                CastFunc(a, target_type) if t != target_type and not isinstance(a, VectorMemoryAccess) else a
+                for a, t in zip(new_args, arg_types)]
+            return expr.func(expr.args[0].func(*casted_args, evaluate=False))
        elif expr.func is sp.Pow:
-            new_arg = visit_expr(expr.args[0], default_type)
+            new_arg = visit_expr(expr.args[0], default_type, force_vectorize)
            return expr.func(new_arg, expr.args[1])
        elif expr.func == sp.Piecewise:
-            new_results = [visit_expr(a[0], default_type) for a in expr.args]
+            new_results = [visit_expr(a[0], default_type, force_vectorize) for a in expr.args]
-            new_conditions = [visit_expr(a[1], default_type) for a in expr.args]
+            new_conditions = [visit_expr(a[1], default_type, force_vectorize) for a in expr.args]
            types_of_results = [get_type_of_expression(a) for a in new_results]
            types_of_conditions = [get_type_of_expression(a) for a in new_conditions]
@@ -318,7 +373,14 @@ def insert_vector_casts(ast_node, instruction_set, default_float_type='double'):
                                 for a, t in zip(new_conditions, types_of_conditions)]
            return sp.Piecewise(*[(r, c) for r, c in zip(casted_results, casted_conditions)])
-        elif isinstance(expr, (sp.Number, TypedSymbol, BooleanAtom)):
+        elif isinstance(expr, TypedSymbol):
+            if force_vectorize:
+                expr_type = get_type_of_expression(expr)
+                if type(expr_type) is not VectorType:
+                    vector_type = VectorType(expr_type, instruction_set['width'])
+                    return CastFunc(expr, vector_type)
+            return expr
+        elif isinstance(expr, (sp.Number, BooleanAtom)):
            return expr
        else:
            raise NotImplementedError(f'Due to defensive programming we handle only specific expressions.\n'
@@ -334,11 +396,18 @@ def insert_vector_casts(ast_node, instruction_set, default_float_type='double'):
                # continue
                subs_expr = fast_subs(assignment.rhs, substitution_dict,
                                      skip=lambda e: isinstance(e, ast.ResolvedFieldAccess))
-                assignment.rhs = visit_expr(subs_expr, default_type)
-                rhs_type = get_type_of_expression(assignment.rhs)
+                # If either side contains a vectorized subexpression, both sides
+                # must be fully vectorized.
+                lhs_scalar = is_scalar(assignment.lhs)
+                rhs_scalar = is_scalar(subs_expr)
+                assignment.rhs = visit_expr(subs_expr, default_type, force_vectorize=not (lhs_scalar and rhs_scalar))
                if isinstance(assignment.lhs, TypedSymbol):
-                    lhs_type = assignment.lhs.dtype
+                    if lhs_scalar and not rhs_scalar:
-                    if type(rhs_type) is VectorType and type(lhs_type) is not VectorType:
+                        lhs_type = get_type_of_expression(assignment.lhs)
+                        rhs_type = get_type_of_expression(assignment.rhs)
                        new_lhs_type = VectorType(lhs_type, rhs_type.width)
                        new_lhs = TypedSymbol(assignment.lhs.name, new_lhs_type)
                        substitution_dict[assignment.lhs] = new_lhs

--- a/pystencils/datahandling/__init__.py
+++ b/pystencils/datahandling/__init__.py
@@ -23,7 +23,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
                         default_layout: str = 'SoA',
                         default_target: Target = Target.CPU,
                         parallel: bool = False,
-                         default_ghost_layers: int = 1) -> DataHandling:
+                         default_ghost_layers: int = 1,
+                         device_number: Union[int, None] = None) -> DataHandling:
    """Creates a data handling instance.
    Args:
@@ -34,6 +35,9 @@ def create_data_handling(domain_size: Tuple[int, ...],
        default_target: `Target`
        parallel: if True a parallel domain is created using walberla - each MPI process gets a part of the domain
        default_ghost_layers: default number of ghost layers if not overwritten in 'add_array'
+        device_number: If `default_target` is set to 'GPU' and `parallel` is False, a device number should be
+                       specified. If none is given, the device with the largest amount of memory is used. If multiple
+                       devices have the same amount of memory, the one with the lower number is used
    """
    if isinstance(default_target, str):
        new_target = Target[default_target.upper()]
@@ -69,7 +73,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
                                  periodicity=periodicity,
                                  default_target=default_target,
                                  default_layout=default_layout,
-                                  default_ghost_layers=default_ghost_layers)
+                                  default_ghost_layers=default_ghost_layers,
+                                  device_number=device_number)
 __all__ = ['create_data_handling']
--- a/pystencils/datahandling/blockiteration.py
+++ b/pystencils/datahandling/blockiteration.py
@@ -115,7 +115,7 @@ class ParallelBlock(Block):
            result = wlb.field.toArray(result, with_ghost_layers=self._gls)
            result = self._normalize_array_shape(result)
        elif 'GpuField' in type_name:
-            result = wlb.cuda.toGpuArray(result, with_ghost_layers=self._gls)
+            result = wlb.gpu.toGpuArray(result, with_ghost_layers=self._gls)
            result = self._normalize_array_shape(result)
        return result

--- a/pystencils/datahandling/datahandling_interface.py
+++ b/pystencils/datahandling/datahandling_interface.py
@@ -331,6 +331,7 @@ class DataHandling(ABC):
                b[array_name][(Ellipsis, *value_idx)].fill(val)
            else:
                b[array_name].fill(val)
+            self.to_gpu(array_name)
    def min(self, array_name, slice_obj=None, ghost_layers=False, inner_ghost_layers=False, reduce=True):
        """Returns the minimum value inside the domain or slice of the domain.

--- a/pystencils/datahandling/parallel_datahandling.py
+++ b/pystencils/datahandling/parallel_datahandling.py
@@ -151,8 +151,8 @@ class ParallelDataHandling(DataHandling):
        if gpu:
            if alignment != 0:
                raise ValueError("Alignment for walberla GPU fields not yet supported")
-            wlb.cuda.addGpuFieldToStorage(self.blocks, self.GPU_DATA_PREFIX + name, dtype, fSize=values_per_cell,
+            wlb.gpu.addGpuFieldToStorage(self.blocks, self.GPU_DATA_PREFIX + name, dtype, fSize=values_per_cell,
-                                          usePitchedMem=False, ghostLayers=ghost_layers, layout=layout_map[layout])
+                                         usePitchedMem=False, ghostLayers=ghost_layers, layout=layout_map[layout])
        if cpu and gpu:
            self._cpu_gpu_pairs.append((name, self.GPU_DATA_PREFIX + name))
@@ -255,7 +255,7 @@ class ParallelDataHandling(DataHandling):
    def get_kernel_kwargs(self, kernel_function, **kwargs):
        if kernel_function.ast.backend == Backend.CUDA:
            name_map = self._field_name_to_gpu_data_name
-            to_array = wlb.cuda.toGpuArray
+            to_array = wlb.gpu.toGpuArray
        else:
            name_map = self._field_name_to_cpu_data_name
            to_array = wlb.field.toArray
@@ -280,7 +280,8 @@ class ParallelDataHandling(DataHandling):
            for block in self.blocks:
                transfer_func(block[self.GPU_DATA_PREFIX + name], block[name])
        else:
-            wlb.cuda.copyFieldToCpu(self.blocks, self.GPU_DATA_PREFIX + name, name)
+            if self.is_on_gpu(name):
+                wlb.gpu.copyFieldToCpu(self.blocks, self.GPU_DATA_PREFIX + name, name)
    def to_gpu(self, name):
        if name in self._custom_data_transfer_functions:
@@ -288,20 +289,21 @@ class ParallelDataHandling(DataHandling):
            for block in self.blocks:
                transfer_func(block[self.GPU_DATA_PREFIX + name], block[name])
        else:
-            wlb.cuda.copyFieldToGpu(self.blocks, self.GPU_DATA_PREFIX + name, name)
+            if self.is_on_gpu(name):
+                wlb.gpu.copyFieldToGpu(self.blocks, self.GPU_DATA_PREFIX + name, name)
    def is_on_gpu(self, name):
        return (name, self.GPU_DATA_PREFIX + name) in self._cpu_gpu_pairs
    def all_to_cpu(self):
        for cpu_name, gpu_name in self._cpu_gpu_pairs:
-            wlb.cuda.copyFieldToCpu(self.blocks, gpu_name, cpu_name)
+            wlb.gpu.copyFieldToCpu(self.blocks, gpu_name, cpu_name)
        for name in self._custom_data_transfer_functions.keys():
            self.to_cpu(name)
    def all_to_gpu(self):
        for cpu_name, gpu_name in self._cpu_gpu_pairs:
-            wlb.cuda.copyFieldToGpu(self.blocks, gpu_name, cpu_name)
+            wlb.gpu.copyFieldToGpu(self.blocks, gpu_name, cpu_name)
        for name in self._custom_data_transfer_functions.keys():
            self.to_gpu(name)
@@ -328,7 +330,7 @@ class ParallelDataHandling(DataHandling):
                create_packing = wlb.field.createStencilRestrictedPackInfo
        else:
            assert target == Target.GPU
-            create_packing = wlb.cuda.createPackInfo if buffered else wlb.cuda.createMPIDatatypeInfo
+            create_packing = wlb.gpu.createPackInfo if buffered else wlb.gpu.createMPIDatatypeInfo
            names = [self.GPU_DATA_PREFIX + name for name in names]
        sync_function = create_scheme(self.blocks, stencil)

--- a/pystencils/datahandling/serial_datahandling.py
+++ b/pystencils/datahandling/serial_datahandling.py
@@ -6,11 +6,10 @@ import numpy as np
 from pystencils.datahandling.blockiteration import SerialBlock
 from pystencils.datahandling.datahandling_interface import DataHandling
-from pystencils.datahandling.pycuda import PyCudaArrayHandler, PyCudaNotAvailableHandler
 from pystencils.enums import Target
-from pystencils.field import (
+from pystencils.field import (Field, FieldType, create_numpy_array_with_layout,
-    Field, FieldType, create_numpy_array_with_layout, layout_string_to_tuple,
+                              layout_string_to_tuple, spatial_layout_string_to_tuple)
-    spatial_layout_string_to_tuple)
+from pystencils.gpu.gpu_array_handler import GPUArrayHandler, GPUNotAvailableHandler
 from pystencils.slicing import normalize_slice, remove_ghost_layers
 from pystencils.utils import DotDict
@@ -23,7 +22,8 @@ class SerialDataHandling(DataHandling):
                 default_layout: str = 'SoA',
                 periodicity: Union[bool, Sequence[bool]] = False,
                 default_target: Target = Target.CPU,
-                 array_handler=None) -> None:
+                 array_handler=None,
+                 device_number=None) -> None:
        """
        Creates a data handling for single node simulations.
@@ -31,9 +31,17 @@ class SerialDataHandling(DataHandling):
            domain_size: size of the spatial domain as tuple
            default_ghost_layers: default number of ghost layers used, if not overridden in add_array() method
            default_layout: default layout used, if  not overridden in add_array() method
+            periodicity: List of booleans that indicate which dimensions have periodic boundary conditions.
+                         Alternatively, a single boolean can be given, which is used for all dimensions. Defaults to
+                         False (non-periodic)
            default_target: `Target` either 'CPU' or 'GPU'. If set to 'GPU' for each array also a GPU version is
                            allocated if not overwritten in add_array, and synchronization functions are for the GPU by
                            default
+            array_handler: An object that provides the same interface as `GPUArrayHandler`, which is used for creation
+                           and transferring of GPU arrays. Default is to construct a fresh `GPUArrayHandler`
+            device_number: If `default_target` is set to 'GPU', a device number should be specified. If none is given,
+                           the device with the largest amount of memory is used. If multiple devices have the same
+                           amount of memory, the one with the lower number is used
        """
        super(SerialDataHandling, self).__init__()
        self._domainSize = tuple(domain_size)
@@ -48,9 +56,14 @@ class SerialDataHandling(DataHandling):
        if not array_handler:
            try:
-                self.array_handler = PyCudaArrayHandler()
+                if device_number is None:
-            except Exception:
+                    import cupy.cuda.runtime
-                self.array_handler = PyCudaNotAvailableHandler()
+                    if cupy.cuda.runtime.getDeviceCount() > 0:
+                        device_number = sorted(range(cupy.cuda.runtime.getDeviceCount()),
+                                               key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]
+                self.array_handler = GPUArrayHandler(device_number)
+            except ImportError:
+                self.array_handler = GPUNotAvailableHandler()
        else:
            self.array_handler = array_handler
@@ -126,10 +139,14 @@ class SerialDataHandling(DataHandling):
        else:
            layout_tuple = spatial_layout_string_to_tuple(layout, self.dim)
-        # cpu_arr is always created - since there is no create_pycuda_array_with_layout()
+        # cpu_arr is always created - since there is no create_gpu_array_with_layout()
        byte_offset = ghost_layers * np.dtype(dtype).itemsize
-        cpu_arr = create_numpy_array_with_layout(layout=layout_tuple, alignment=alignment,
-                                                 byte_offset=byte_offset, **kwargs)
+        if gpu:
+            cpu_arr = self.array_handler.pinned_numpy_array(shape=kwargs['shape'], layout=layout_tuple, dtype=dtype)
+        else:
+            cpu_arr = create_numpy_array_with_layout(layout=layout_tuple, alignment=alignment,
+                                                     byte_offset=byte_offset, **kwargs)
        if alignment and gpu:
            raise NotImplementedError("Alignment for GPU fields not supported")
@@ -251,14 +268,16 @@ class SerialDataHandling(DataHandling):
            transfer_func = self._custom_data_transfer_functions[name][1]
            transfer_func(self.custom_data_gpu[name], self.custom_data_cpu[name])
        else:
-            self.array_handler.download(self.gpu_arrays[name], self.cpu_arrays[name])
+            if name in self.cpu_arrays.keys() & self.gpu_arrays.keys():
+                self.array_handler.download(self.gpu_arrays[name], self.cpu_arrays[name])
    def to_gpu(self, name):
        if name in self._custom_data_transfer_functions:
            transfer_func = self._custom_data_transfer_functions[name][0]
            transfer_func(self.custom_data_gpu[name], self.custom_data_cpu[name])
        else:
-            self.array_handler.upload(self.gpu_arrays[name], self.cpu_arrays[name])
+            if name in self.cpu_arrays.keys() & self.gpu_arrays.keys():
+                self.array_handler.upload(self.gpu_arrays[name], self.cpu_arrays[name])
    def is_on_gpu(self, name):
        return name in self.gpu_arrays
@@ -313,7 +332,7 @@ class SerialDataHandling(DataHandling):
                    result.append(functor(filtered_stencil, ghost_layers=gls))
                else:
                    if functor is None:
-                        from pystencils.gpucuda.periodicity import get_periodic_boundary_functor as functor
+                        from pystencils.gpu.periodicity import get_periodic_boundary_functor as functor
                        target = Target.GPU
                    result.append(functor(filtered_stencil, self._domainSize,
                                          index_dimensions=self.fields[name].index_dimensions,
@@ -419,13 +438,19 @@ class SerialDataHandling(DataHandling):
    def world_rank(self):
        return 0
-    def save_all(self, file):
+    def save_all(self, filename, compressed=True, synchronise_data=True):
-        np.savez_compressed(file, **self.cpu_arrays)
+        if synchronise_data:
+            for name in (self.cpu_arrays.keys() & self.gpu_arrays.keys()):
+                self.to_cpu(name)
+        if compressed:
+            np.savez_compressed(filename, **self.cpu_arrays)
+        else:
+            np.savez(filename, **self.cpu_arrays)
-    def load_all(self, file):
+    def load_all(self, filename, synchronise_data=True):
-        if '.npz' not in file:
+        if '.npz' not in filename:
-            file += '.npz'
+            filename += '.npz'
-        file_contents = np.load(file)
+        file_contents = np.load(filename)
        for arr_name, arr_contents in self.cpu_arrays.items():
            if arr_name not in file_contents:
                print(f"Skipping read data {arr_name} because there is no data with this name in data handling")
@@ -435,3 +460,6 @@ class SerialDataHandling(DataHandling):
                      f"Read array shape {file_contents[arr_name].shape}, existing array shape {arr_contents.shape}")
                continue
            np.copyto(arr_contents, file_contents[arr_name])
+            if synchronise_data:
+                if arr_name in self.gpu_arrays.keys():
+                    self.to_gpu(arr_name)
--- a/pystencils/datahandling/vtk.py
+++ b/pystencils/datahandling/vtk.py
--- a/pystencils/display_utils.py
+++ b/pystencils/display_utils.py
--- a/pystencils/enums.py
+++ b/pystencils/enums.py
--- a/pystencils/fast_approximation.py
+++ b/pystencils/fast_approximation.py
--- a/pystencils/fd/__init__.py
+++ b/pystencils/fd/__init__.py
--- a/pystencils/fd/derivation.py
+++ b/pystencils/fd/derivation.py
--- a/pystencils/fd/derivative.py
+++ b/pystencils/fd/derivative.py
--- a/pystencils/fd/finitedifferences.py
+++ b/pystencils/fd/finitedifferences.py
--- a/pystencils/fd/finitevolumes.py
+++ b/pystencils/fd/finitevolumes.py
--- a/pystencils/fd/spatial.py
+++ b/pystencils/fd/spatial.py
--- a/pystencils/field.py
+++ b/pystencils/field.py
@@ -5,7 +5,7 @@ import pickle
 import re
 from enum import Enum
 from itertools import chain
-from typing import List, Optional, Sequence, Set, Tuple
+from typing import List, Optional, Sequence, Set, Tuple, Union
 import numpy as np
 import sympy as sp
@@ -69,74 +69,6 @@ class FieldType(Enum):
        return field.field_type == FieldType.STAGGERED_FLUX
-def fields(description=None, index_dimensions=0, layout=None, field_type=FieldType.GENERIC, **kwargs):
-    """Creates pystencils fields from a string description.
-    Examples:
-        Create a 2D scalar and vector field:
-            >>> s, v = fields("s, v(2): double[2D]")
-            >>> assert s.spatial_dimensions == 2 and s.index_dimensions == 0
-            >>> assert (v.spatial_dimensions, v.index_dimensions, v.index_shape) == (2, 1, (2,))
-        Create an integer field of shape (10, 20):
-            >>> f = fields("f : int32[10, 20]")
-            >>> f.has_fixed_shape, f.shape
-            (True, (10, 20))
-        Numpy arrays can be used as template for shape and data type of field:
-            >>> arr_s, arr_v = np.zeros([20, 20]), np.zeros([20, 20, 2])
-            >>> s, v = fields("s, v(2)", s=arr_s, v=arr_v)
-            >>> assert s.index_dimensions == 0 and s.dtype.numpy_dtype == arr_s.dtype
-            >>> assert v.index_shape == (2,)
-        Format string can be left out, field names are taken from keyword arguments.
-            >>> fields(f1=arr_s, f2=arr_s)
-            [f1: double[20,20], f2: double[20,20]]
-        The keyword names ``index_dimension`` and ``layout`` have special meaning, don't use them for field names
-            >>> f = fields(f=arr_v, index_dimensions=1)
-            >>> assert f.index_dimensions == 1
-            >>> f = fields("pdfs(19) : float32[3D]", layout='fzyx')
-            >>> f.layout
-            (2, 1, 0)
-    """
-    result = []
-    if description:
-        field_descriptions, dtype, shape = _parse_description(description)
-        layout = 'numpy' if layout is None else layout
-        for field_name, idx_shape in field_descriptions:
-            if field_name in kwargs:
-                arr = kwargs[field_name]
-                idx_shape_of_arr = () if not len(idx_shape) else arr.shape[-len(idx_shape):]
-                assert idx_shape_of_arr == idx_shape
-                f = Field.create_from_numpy_array(field_name, kwargs[field_name], index_dimensions=len(idx_shape),
-                                                  field_type=field_type)
-            elif isinstance(shape, tuple):
-                f = Field.create_fixed_size(field_name, shape + idx_shape, dtype=dtype,
-                                            index_dimensions=len(idx_shape), layout=layout, field_type=field_type)
-            elif isinstance(shape, int):
-                f = Field.create_generic(field_name, spatial_dimensions=shape, dtype=dtype,
-                                         index_shape=idx_shape, layout=layout, field_type=field_type)
-            elif shape is None:
-                f = Field.create_generic(field_name, spatial_dimensions=2, dtype=dtype,
-                                         index_shape=idx_shape, layout=layout, field_type=field_type)
-            else:
-                assert False
-            result.append(f)
-    else:
-        assert layout is None, "Layout can not be specified when creating Field from numpy array"
-        for field_name, arr in kwargs.items():
-            result.append(Field.create_from_numpy_array(field_name, arr, index_dimensions=index_dimensions,
-                                                        field_type=field_type))
-    if len(result) == 0:
-        return None
-    elif len(result) == 1:
-        return result[0]
-    else:
-        return result
 class Field:
    """
    With fields one can formulate stencil-like update rules on structured grids.
@@ -324,9 +256,7 @@ class Field:
        self.shape = shape
        self.strides = strides
        self.latex_name: Optional[str] = None
-        self.coordinate_origin: tuple[float, sp.Symbol] = sp.Matrix(tuple(
+        self.coordinate_origin = sp.Matrix([0] * self.spatial_dimensions)
-            0 for _ in range(self.spatial_dimensions)
-        ))
        self.coordinate_transform = sp.eye(self.spatial_dimensions)
        if field_type == FieldType.STAGGERED:
            assert self.staggered_stencil
@@ -335,8 +265,7 @@ class Field:
        if self.has_fixed_shape:
            return Field(new_name, self.field_type, self._dtype, self._layout, self.shape, self.strides)
        else:
-            return Field.create_generic(new_name, self.spatial_dimensions, self.dtype.numpy_dtype,
+            return Field(new_name, self.field_type, self.dtype, self.layout, self.shape, self.strides)
-                                        self.index_dimensions, self._layout, self.index_shape, self.field_type)
    @property
    def spatial_dimensions(self) -> int:
@@ -722,9 +651,11 @@ class Field:
                raise ValueError(f"Wrong number of indices: Got {len(idx)}, expected {self.field.index_dimensions}")
            if len(idx) == 1 and isinstance(idx[0], str):
                dtype = BasicType(self.field.dtype.numpy_dtype[idx[0]])
-                return Field.Access(self.field, self._offsets, idx, dtype=dtype)
+                return Field.Access(self.field, self._offsets, idx,
+                                    is_absolute_access=self.is_absolute_access, dtype=dtype)
            else:
-                return Field.Access(self.field, self._offsets, idx, dtype=self.dtype)
+                return Field.Access(self.field, self._offsets, idx,
+                                    is_absolute_access=self.is_absolute_access, dtype=self.dtype)
        def __getitem__(self, *idx):
            return self.__call__(*idx)
@@ -778,7 +709,8 @@ class Field:
            """
            offset_list = list(self.offsets)
            offset_list[coord_id] += offset
-            return Field.Access(self.field, tuple(offset_list), self.index, dtype=self.dtype)
+            return Field.Access(self.field, tuple(offset_list), self.index,
+                                is_absolute_access=self.is_absolute_access, dtype=self.dtype)
        def get_shifted(self, *shift) -> 'Field.Access':
            """Returns a new Access with changed spatial coordinates
@@ -791,6 +723,7 @@ class Field:
            return Field.Access(self.field,
                                tuple(a + b for a, b in zip(shift, self.offsets)),
                                self.index,
+                                is_absolute_access=self.is_absolute_access,
                                dtype=self.dtype)
        def at_index(self, *idx_tuple) -> 'Field.Access':
@@ -801,12 +734,14 @@ class Field:
                >>> f(0).at_index(8)
                f_C^8
            """
-            return Field.Access(self.field, self.offsets, idx_tuple, dtype=self.dtype)
+            return Field.Access(self.field, self.offsets, idx_tuple,
+                                is_absolute_access=self.is_absolute_access, dtype=self.dtype)
        def _eval_subs(self, old, new):
            return Field.Access(self.field,
                                tuple(sp.sympify(a).subs(old, new) for a in self.offsets),
                                tuple(sp.sympify(a).subs(old, new) for a in self.index),
+                                is_absolute_access=self.is_absolute_access,
                                dtype=self.dtype)
        @property
@@ -824,7 +759,8 @@ class Field:
        def _hashable_content(self):
            super_class_contents = super(Field.Access, self)._hashable_content()
-            return (super_class_contents, self._field.hashable_contents(), *self._index, *self._offsets)
+            return (super_class_contents, self._field.hashable_contents(), *self._index,
+                    *self._offsets, self._is_absolute_access)
        def _staggered_offset(self, offsets, index):
            assert FieldType.is_staggered(self._field)
@@ -875,6 +811,75 @@ class Field:
                    return f"{n}[{offset_str}]"
+def fields(description=None, index_dimensions=0, layout=None,
+           field_type=FieldType.GENERIC, **kwargs) -> Union[Field, List[Field]]:
+    """Creates pystencils fields from a string description.
+    Examples:
+        Create a 2D scalar and vector field:
+            >>> s, v = fields("s, v(2): double[2D]")
+            >>> assert s.spatial_dimensions == 2 and s.index_dimensions == 0
+            >>> assert (v.spatial_dimensions, v.index_dimensions, v.index_shape) == (2, 1, (2,))
+        Create an integer field of shape (10, 20):
+            >>> f = fields("f : int32[10, 20]")
+            >>> f.has_fixed_shape, f.shape
+            (True, (10, 20))
+        Numpy arrays can be used as template for shape and data type of field:
+            >>> arr_s, arr_v = np.zeros([20, 20]), np.zeros([20, 20, 2])
+            >>> s, v = fields("s, v(2)", s=arr_s, v=arr_v)
+            >>> assert s.index_dimensions == 0 and s.dtype.numpy_dtype == arr_s.dtype
+            >>> assert v.index_shape == (2,)
+        Format string can be left out, field names are taken from keyword arguments.
+            >>> fields(f1=arr_s, f2=arr_s)
+            [f1: double[20,20], f2: double[20,20]]
+        The keyword names ``index_dimension`` and ``layout`` have special meaning, don't use them for field names
+            >>> f = fields(f=arr_v, index_dimensions=1)
+            >>> assert f.index_dimensions == 1
+            >>> f = fields("pdfs(19) : float32[3D]", layout='fzyx')
+            >>> f.layout
+            (2, 1, 0)
+    """
+    result = []
+    if description:
+        field_descriptions, dtype, shape = _parse_description(description)
+        layout = 'numpy' if layout is None else layout
+        for field_name, idx_shape in field_descriptions:
+            if field_name in kwargs:
+                arr = kwargs[field_name]
+                idx_shape_of_arr = () if not len(idx_shape) else arr.shape[-len(idx_shape):]
+                assert idx_shape_of_arr == idx_shape
+                f = Field.create_from_numpy_array(field_name, kwargs[field_name], index_dimensions=len(idx_shape),
+                                                  field_type=field_type)
+            elif isinstance(shape, tuple):
+                f = Field.create_fixed_size(field_name, shape + idx_shape, dtype=dtype,
+                                            index_dimensions=len(idx_shape), layout=layout, field_type=field_type)
+            elif isinstance(shape, int):
+                f = Field.create_generic(field_name, spatial_dimensions=shape, dtype=dtype,
+                                         index_shape=idx_shape, layout=layout, field_type=field_type)
+            elif shape is None:
+                f = Field.create_generic(field_name, spatial_dimensions=2, dtype=dtype,
+                                         index_shape=idx_shape, layout=layout, field_type=field_type)
+            else:
+                assert False
+            result.append(f)
+    else:
+        assert layout is None, "Layout can not be specified when creating Field from numpy array"
+        for field_name, arr in kwargs.items():
+            result.append(Field.create_from_numpy_array(field_name, arr, index_dimensions=index_dimensions,
+                                                        field_type=field_type))
+    if len(result) == 0:
+        raise ValueError("Could not parse field description")
+    elif len(result) == 1:
+        return result[0]
+    else:
+        return result
 def get_layout_from_strides(strides: Sequence[int], index_dimension_ids: Optional[List[int]] = None):
    index_dimension_ids = [] if index_dimension_ids is None else index_dimension_ids
    coordinates = list(range(len(strides)))
@@ -943,24 +948,35 @@ def create_numpy_array_with_layout(shape, layout, alignment=False, byte_offset=0
 def spatial_layout_string_to_tuple(layout_str: str, dim: int) -> Tuple[int, ...]:
-    if layout_str in ('fzyx', 'zyxf'):
+    if dim <= 0:
-        assert dim <= 3
+        raise ValueError("Dimensionality must be positive")
-        return tuple(reversed(range(dim)))
+    layout_str = layout_str.lower()
-    if layout_str in ('fzyx', 'f', 'reverse_numpy', 'SoA'):
+    if layout_str in ('fzyx', 'zyxf', 'soa', 'aos'):
+        if dim > 3:
+            raise ValueError(f"Invalid spatial dimensionality for layout descriptor {layout_str}: May be at most 3.")
+        return tuple(reversed(range(dim)))
+    if layout_str in ('f', 'reverse_numpy'):
        return tuple(reversed(range(dim)))
-    elif layout_str in ('c', 'numpy', 'AoS'):
+    elif layout_str in ('c', 'numpy'):
        return tuple(range(dim))
    raise ValueError("Unknown layout descriptor " + layout_str)
 def layout_string_to_tuple(layout_str, dim):
+    if dim <= 0:
+        raise ValueError("Dimensionality must be positive")
    layout_str = layout_str.lower()
    if layout_str == 'fzyx' or layout_str == 'soa':
-        assert dim <= 4
+        if dim > 4:
+            raise ValueError(f"Invalid total dimensionality for layout descriptor {layout_str}: May be at most 4.")
        return tuple(reversed(range(dim)))
    elif layout_str == 'zyxf' or layout_str == 'aos':
-        assert dim <= 4
+        if dim > 4:
+            raise ValueError(f"Invalid total dimensionality for layout descriptor {layout_str}: May be at most 4.")
        return tuple(reversed(range(dim - 1))) + (dim - 1,)
    elif layout_str == 'f' or layout_str == 'reverse_numpy':
        return tuple(reversed(range(dim)))

--- a/pystencils/functions.py
+++ b/pystencils/functions.py
No results found