Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
No results found
Show changes
Commits on Source (4)
Showing
with 168 additions and 1094 deletions
This diff is collapsed.
......@@ -228,8 +228,7 @@ class KernelFunction(Node):
@property
def fields_accessed(self) -> Set[Field]:
"""Set of Field instances: fields which are accessed inside this kernel function"""
from pystencils.interpolation_astnodes import InterpolatorAccess
return set(o.field for o in itertools.chain(self.atoms(ResolvedFieldAccess), self.atoms(InterpolatorAccess)))
return set(o.field for o in itertools.chain(self.atoms(ResolvedFieldAccess)))
@property
def fields_written(self) -> Set[Field]:
......
......@@ -444,7 +444,7 @@ class CustomSympyPrinter(CCodePrinter):
def _print_Pow(self, expr):
"""Don't use std::pow function, for small integer exponents, write as multiplication"""
if not expr.free_symbols:
return self._typed_number(expr.evalf(), get_type_of_expression(expr))
return self._typed_number(expr.evalf(), get_type_of_expression(expr.base))
if expr.exp.is_integer and expr.exp.is_number and 0 < expr.exp < 8:
return f"({self._print(sp.Mul(*[expr.base] * expr.exp, evaluate=False))})"
......
......@@ -4,7 +4,6 @@ from pystencils.astnodes import Node
from pystencils.backends.cbackend import CBackend, CustomSympyPrinter, generate_c
from pystencils.enums import Backend
from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
from pystencils.interpolation_astnodes import DiffInterpolatorAccess, InterpolationMode
with open(join(dirname(__file__), 'cuda_known_functions.txt')) as f:
lines = f.readlines()
......@@ -76,30 +75,6 @@ class CudaSympyPrinter(CustomSympyPrinter):
super(CudaSympyPrinter, self).__init__()
self.known_functions.update(CUDA_KNOWN_FUNCTIONS)
def _print_InterpolatorAccess(self, node):
dtype = node.interpolator.field.dtype.numpy_dtype
if type(node) == DiffInterpolatorAccess:
# cubicTex3D_1st_derivative_x(texture tex, float3 coord)
template = f"cubicTex%iD_1st_derivative_{list(reversed('xyz'[:node.ndim]))[node.diff_coordinate_idx]}(%s, %s)" # noqa
elif node.interpolator.interpolation_mode == InterpolationMode.CUBIC_SPLINE:
template = "cubicTex%iDSimple(%s, %s)"
else:
if dtype.itemsize > 4:
# Use PyCuda hack!
# https://github.com/inducer/pycuda/blob/master/pycuda/cuda/pycuda-helpers.hpp
template = "fp_tex%iD(%s, %s)"
else:
template = "tex%iD(%s, %s)"
code = template % (
node.interpolator.field.spatial_dimensions,
str(node.interpolator),
# + 0.5 comes from Nvidia's staggered indexing
', '.join(self._print(o + 0.5) for o in reversed(node.offsets))
)
return code
def _print_Function(self, expr):
if isinstance(expr, fast_division):
assert len(expr.args) == 2, f"__fdividef has two arguments, but {len(expr.args)} where given"
......
......@@ -11,9 +11,9 @@ from pystencils.cpu.cpujit import make_python_function
from pystencils.data_types import StructType, TypedSymbol, create_type
from pystencils.field import Field, FieldType
from pystencils.transformations import (
add_types, filtered_tree_iteration, get_base_buffer_index, get_optimal_loop_ordering,
implement_interpolations, make_loop_over_domain, move_constants_before_loop,
parse_base_pointer_info, resolve_buffer_accesses, resolve_field_accesses, split_inner_loop)
add_types, filtered_tree_iteration, get_base_buffer_index, get_optimal_loop_ordering, make_loop_over_domain,
move_constants_before_loop, parse_base_pointer_info, resolve_buffer_accesses,
resolve_field_accesses, split_inner_loop)
AssignmentOrAstNodeList = List[Union[Assignment, ast.Node]]
......@@ -73,7 +73,6 @@ def create_kernel(assignments: AssignmentOrAstNodeList, function_name: str = "ke
ghost_layers=ghost_layers, loop_order=loop_order)
ast_node = KernelFunction(loop_node, Target.CPU, Backend.C, compile_function=make_python_function,
ghost_layers=ghost_layer_info, function_name=function_name, assignments=assignments)
implement_interpolations(body)
if split_groups:
typed_split_groups = [[type_symbol(s) for s in split_group] for split_group in split_groups]
......@@ -146,8 +145,6 @@ def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, fu
loop_body = Block([])
loop_node = LoopOverCoordinate(loop_body, coordinate_to_loop_over=0, start=0, stop=index_fields[0].shape[0])
implement_interpolations(loop_node)
for assignment in assignments:
loop_body.append(assignment)
......
......@@ -105,9 +105,8 @@ class Discretization2ndOrder:
return self._discretize_advection(e)
elif isinstance(e, Diff):
arg, *indices = diff_args(e)
from pystencils.interpolation_astnodes import InterpolatorAccess
if not isinstance(arg, (Field.Access, InterpolatorAccess)):
if not isinstance(arg, Field.Access):
raise ValueError("Only derivatives with field or field accesses as arguments can be discretized")
return self.spatial_stencil(indices, self.dx, arg)
else:
......
......@@ -4,9 +4,7 @@ from pystencils.backends.cbackend import get_headers
from pystencils.backends.cuda_backend import generate_cuda
from pystencils.data_types import StructType
from pystencils.field import FieldType
from pystencils.gpucuda.texture_utils import ndarray_to_tex
from pystencils.include import get_pycuda_include_path, get_pystencils_include_path
from pystencils.interpolation_astnodes import InterpolatorAccess, TextureCachedField
from pystencils.kernel_wrapper import KernelWrapper
from pystencils.kernelparameters import FieldPointerSymbol
......@@ -47,29 +45,11 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
code += "#define FUNC_PREFIX __global__\n"
code += "#define RESTRICT __restrict__\n\n"
code += str(generate_cuda(kernel_function_node, custom_backend=custom_backend))
textures = set(d.interpolator for d in kernel_function_node.atoms(
InterpolatorAccess) if isinstance(d.interpolator, TextureCachedField))
nvcc_options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"]
if USE_FAST_MATH:
nvcc_options.append("-use_fast_math")
# Code for CubicInterpolationCUDA
from pystencils.interpolation_astnodes import InterpolationMode
from os.path import join, dirname, isdir
if any(t.interpolation_mode == InterpolationMode.CUBIC_SPLINE for t in textures):
assert isdir(join(dirname(__file__), ("CubicInterpolationCUDA", "code")),
"Submodule CubicInterpolationCUDA does not exist.\n"
+ "Clone https://github.com/theHamsta/CubicInterpolationCUDA into pystencils.gpucuda")
nvcc_options += ["-I" + join(dirname(__file__), "CubicInterpolationCUDA", "code")]
nvcc_options += ["-I" + join(dirname(__file__), "CubicInterpolationCUDA", "code", "internal")]
needed_dims = set(t.field.spatial_dimensions for t in textures
if t.interpolation_mode == InterpolationMode.CUBIC_SPLINE)
for i in needed_dims:
code = 'extern "C++" {\n#include "cubicTex%iD.cu"\n}\n' % i + code
mod = SourceModule(code, options=nvcc_options, include_dirs=[
get_pystencils_include_path(), get_pycuda_include_path()])
func = mod.get_function(kernel_function_node.function_name)
......@@ -95,12 +75,6 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
# TODO: use texture objects:
# https://devblogs.nvidia.com/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/
for tex in textures:
tex_ref = mod.get_texref(str(tex))
ndarray_to_tex(tex_ref, full_arguments[tex.field.name], tex.address_mode,
tex.filter_mode, tex.use_normalized_coordinates, tex.read_as_integer)
args = _build_numpy_argument_list(parameters, full_arguments)
cache[key] = (args, block_and_thread_numbers)
cache_values.append(kwargs) # keep objects alive such that ids remain unique
......
......@@ -7,8 +7,8 @@ from pystencils.enums import Target, Backend
from pystencils.gpucuda.cudajit import make_python_function
from pystencils.gpucuda.indexing import BlockIndexing
from pystencils.transformations import (
add_types, get_base_buffer_index, get_common_shape, implement_interpolations,
parse_base_pointer_info, resolve_buffer_accesses, resolve_field_accesses, unify_shape_symbols)
add_types, get_base_buffer_index, get_common_shape, parse_base_pointer_info,
resolve_buffer_accesses, resolve_field_accesses, unify_shape_symbols)
def create_cuda_kernel(assignments,
......@@ -17,8 +17,7 @@ def create_cuda_kernel(assignments,
indexing_creator=BlockIndexing,
iteration_slice=None,
ghost_layers=None,
skip_independence_check=False,
use_textures_for_interpolation=True):
skip_independence_check=False):
assert assignments, "Assignments must not be empty!"
fields_read, fields_written, assignments = add_types(assignments, type_info, not skip_independence_check)
all_fields = fields_read.union(fields_written)
......@@ -74,8 +73,6 @@ def create_cuda_kernel(assignments,
assignments=assignments)
ast.global_variables.update(indexing.index_variables)
implement_interpolations(ast, implement_by_texture_accesses=use_textures_for_interpolation)
base_pointer_spec = [['spatialInner0']]
base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0],
f.spatial_dimensions, f.index_dimensions)
......@@ -110,8 +107,7 @@ def created_indexed_cuda_kernel(assignments,
function_name="kernel",
type_info=None,
coordinate_names=('x', 'y', 'z'),
indexing_creator=BlockIndexing,
use_textures_for_interpolation=True):
indexing_creator=BlockIndexing):
fields_read, fields_written, assignments = add_types(assignments, type_info, check_independence_condition=False)
all_fields = fields_read.union(fields_written)
read_only_fields = set([f.name for f in fields_read - fields_written])
......@@ -150,8 +146,6 @@ def created_indexed_cuda_kernel(assignments,
None, function_name, assignments=assignments)
ast.global_variables.update(indexing.index_variables)
implement_interpolations(ast, implement_by_texture_accesses=use_textures_for_interpolation)
coord_mapping = indexing.coordinates
base_pointer_spec = [['spatialInner0']]
base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0],
......
# -*- coding: utf-8 -*-
#
# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
#
# Distributed under terms of the GPLv3 license.
"""
"""
import hashlib
import itertools
from enum import Enum
from typing import Set
import sympy as sp
from sympy.core.cache import cacheit
import pystencils
from pystencils.astnodes import Node
from pystencils.data_types import TypedSymbol, cast_func, create_type
try:
import pycuda.driver
except Exception:
pass
_hash = hashlib.md5
class InterpolationMode(str, Enum):
NEAREST_NEIGHBOR = "nearest_neighbour"
NN = NEAREST_NEIGHBOR
LINEAR = "linear"
CUBIC_SPLINE = "cubic_spline"
class _InterpolationSymbol(TypedSymbol):
def __new__(cls, name, field, interpolator):
obj = cls.__xnew_cached_(cls, name, field, interpolator)
return obj
def __new_stage2__(cls, name, field, interpolator):
obj = super().__xnew__(cls, name, 'dummy_symbol_carrying_field' + field.name)
obj.field = field
obj.interpolator = interpolator
return obj
def __getnewargs__(self):
return self.name, self.field, self.interpolator
def __getnewargs_ex__(self):
return (self.name, self.field, self.interpolator), {}
# noinspection SpellCheckingInspection
__xnew__ = staticmethod(__new_stage2__)
# noinspection SpellCheckingInspection
__xnew_cached_ = staticmethod(cacheit(__new_stage2__))
class Interpolator(object):
"""
Implements non-integer accesses on fields using linear interpolation.
On GPU, this interpolator can be implemented by a :class:`.TextureCachedField` for hardware acceleration.
Address modes are different boundary handlings possible choices are like for CUDA textures
**CLAMP**
The signal c[k] is continued outside k=0,...,M-1 so that c[k] = c[0] for k < 0, and c[k] = c[M-1] for k >= M.
**BORDER**
The signal c[k] is continued outside k=0,...,M-1 so that c[k] = 0 for k < 0and for k >= M.
Now, to describe the last two address modes, we are forced to consider normalized coordinates,
so that the 1D input signal samples are assumed to be c[k / M], with k=0,...,M-1.
**WRAP**
The signal c[k / M] is continued outside k=0,...,M-1 so that it is periodic with period equal to M.
In other words, c[(k + p * M) / M] = c[k / M] for any (positive, negative or vanishing) integer p.
**MIRROR**
The signal c[k / M] is continued outside k=0,...,M-1 so that it is periodic with period equal to 2 * M - 2.
In other words, c[l / M] = c[k / M] for any l and k such that (l + k)mod(2 * M - 2) = 0.
Explanations from https://stackoverflow.com/questions/19020963/the-different-addressing-modes-of-cuda-textures
"""
required_global_declarations = []
def __init__(self,
parent_field,
interpolation_mode: InterpolationMode,
address_mode='BORDER',
use_normalized_coordinates=False,
allow_textures=True):
super().__init__()
self.field = parent_field
self.field.field_type = pystencils.field.FieldType.CUSTOM
self.address_mode = address_mode
self.use_normalized_coordinates = use_normalized_coordinates
self.interpolation_mode = interpolation_mode
self.hash_str = hashlib.md5(
f'{self.field}_{address_mode}_{self.field.dtype}_{interpolation_mode}'.encode()).hexdigest()
self.symbol = _InterpolationSymbol(str(self), parent_field, self)
self.allow_textures = allow_textures
@property
def ndim(self):
return self.field.ndim
@property
def _hashable_contents(self):
return (str(self.address_mode),
str(type(self)),
self.hash_str,
self.use_normalized_coordinates)
def at(self, offset):
return InterpolatorAccess(self.symbol, *[sp.S(o) for o in offset])
def __getitem__(self, offset):
return InterpolatorAccess(self.symbol, *[sp.S(o) for o in offset])
def __str__(self):
return f'{self.field.name}_interpolator_{self.reproducible_hash}'
def __repr__(self):
return self.__str__()
def __hash__(self):
return hash(self._hashable_contents)
def __eq__(self, other):
return hash(self) == hash(other)
@property
def reproducible_hash(self):
return _hash(str(self._hashable_contents).encode()).hexdigest()
class LinearInterpolator(Interpolator):
def __init__(self,
parent_field: pystencils.Field,
address_mode='BORDER',
use_normalized_coordinates=False):
super().__init__(parent_field,
InterpolationMode.LINEAR,
address_mode,
use_normalized_coordinates)
class NearestNeightborInterpolator(Interpolator):
def __init__(self,
parent_field: pystencils.Field,
address_mode='BORDER',
use_normalized_coordinates=False):
super().__init__(parent_field,
InterpolationMode.NN,
address_mode,
use_normalized_coordinates)
class InterpolatorAccess(TypedSymbol):
def __new__(cls, field, *offsets):
obj = InterpolatorAccess.__xnew_cached_(cls, field, *offsets)
return obj
def __new_stage2__(cls, symbol, *offsets):
assert offsets is not None
obj = super().__xnew__(cls, '%s_interpolator_%s' %
(symbol.field.name, _hash(str(tuple(offsets)).encode()).hexdigest()),
symbol.field.dtype)
obj.offsets = offsets
obj.symbol = symbol
obj.field = symbol.field
obj.interpolator = symbol.interpolator
return obj
def _hashable_contents(self):
return super()._hashable_content() + ((self.symbol, self.field, tuple(self.offsets), self.symbol.interpolator))
def __str__(self):
return f"{self.field.name}_interpolator({', '.join(str(o) for o in self.offsets)})"
def __repr__(self):
return self.__str__()
def _latex(self, printer, *_):
n = self.field.latex_name if self.field.latex_name else self.field.name
foo = ", ".join(str(printer.doprint(o)) for o in self.offsets)
return f'{n}_{{interpolator}}\\left({foo}\\right)'
@property
def ndim(self):
return len(self.offsets)
@property
def is_texture(self):
return isinstance(self.interpolator, TextureCachedField)
def atoms(self, *types):
if self.offsets:
offsets = set(o for o in self.offsets if isinstance(o, types))
if isinstance(self, *types):
offsets.update([self])
for o in self.offsets:
if hasattr(o, 'atoms'):
offsets.update(set(o.atoms(*types)))
return offsets
else:
return set()
def neighbor(self, coord_id, offset):
offset_list = list(self.offsets)
offset_list[coord_id] += offset
return self.interpolator.at(tuple(offset_list))
@property
def free_symbols(self):
symbols = set()
if self.offsets is not None:
for o in self.offsets:
if hasattr(o, 'free_symbols'):
symbols.update(set(o.free_symbols))
# if hasattr(o, 'atoms'):
# symbols.update(set(o.atoms(sp.Symbol)))
return symbols
@property
def required_global_declarations(self):
required_global_declarations = self.symbol.interpolator.required_global_declarations
if required_global_declarations:
required_global_declarations[0]._symbols_defined.add(self)
return required_global_declarations
@property
def args(self):
return [self.symbol, *self.offsets]
@property
def symbols_defined(self) -> Set[sp.Symbol]:
return {self}
@property
def interpolation_mode(self):
return self.interpolator.interpolation_mode
@property
def _diff_interpolation_vec(self):
return sp.Matrix([DiffInterpolatorAccess(self.symbol, i, *self.offsets)
for i in range(len(self.offsets))])
def diff(self, *symbols, **kwargs):
if symbols == (self,):
return 1
rtn = self._diff_interpolation_vec.T * sp.Matrix(self.offsets).diff(*symbols, **kwargs)
if rtn.shape == (1, 1):
rtn = rtn[0, 0]
return rtn
def implementation_with_stencils(self):
field = self.field
default_int_type = create_type('int64')
use_textures = isinstance(self.interpolator, TextureCachedField)
if use_textures:
def absolute_access(x, _):
return self.symbol.interpolator.at((o for o in x))
else:
absolute_access = field.absolute_access
sum = [0, ] * (field.shape[0] if field.index_dimensions else 1)
offsets = self.offsets
rounding_functions = (sp.floor, lambda x: sp.floor(x) + 1)
for channel_idx in range(field.shape[0] if field.index_dimensions else 1):
if self.interpolation_mode == InterpolationMode.NN:
if use_textures:
sum[channel_idx] = self
else:
sum[channel_idx] = absolute_access([sp.floor(i + 0.5) for i in offsets], channel_idx)
elif self.interpolation_mode == InterpolationMode.LINEAR:
# TODO optimization: implement via lerp: https://devblogs.nvidia.com/lerp-faster-cuda/
for c in itertools.product(rounding_functions, repeat=field.spatial_dimensions):
weight = sp.Mul(*[1 - sp.Abs(f(offset) - offset) for (f, offset) in zip(c, offsets)])
index = [f(offset) for (f, offset) in zip(c, offsets)]
# Hardware boundary handling on GPU
if use_textures:
weight = sp.Mul(*[1 - sp.Abs(f(offset) - offset) for (f, offset) in zip(c, offsets)])
sum[channel_idx] += \
weight * absolute_access(index, channel_idx if field.index_dimensions else ())
# else boundary handling using software
elif str(self.interpolator.address_mode).lower() == 'border':
is_inside_field = sp.And(
*itertools.chain([i >= 0 for i in index],
[idx < field.shape[dim] for (dim, idx) in enumerate(index)]))
index = [cast_func(i, default_int_type) for i in index]
sum[channel_idx] += sp.Piecewise(
(weight * absolute_access(index, channel_idx if field.index_dimensions else ()),
is_inside_field),
(sp.simplify(0), True)
)
elif str(self.interpolator.address_mode).lower() == 'clamp':
index = [sp.Min(sp.Max(0, cast_func(i, default_int_type)), field.spatial_shape[dim] - 1)
for (dim, i) in enumerate(index)]
sum[channel_idx] += weight * \
absolute_access(index, channel_idx if field.index_dimensions else ())
elif str(self.interpolator.address_mode).lower() == 'wrap':
index = [sp.Mod(cast_func(i, default_int_type), field.shape[dim] - 1)
for (dim, i) in enumerate(index)]
index = [cast_func(sp.Piecewise((i, i > 0),
(sp.Abs(cast_func(field.shape[dim] - 1 + i, default_int_type)),
True)), default_int_type)
for (dim, i) in enumerate(index)]
sum[channel_idx] += weight * \
absolute_access(index, channel_idx if field.index_dimensions else ())
# sum[channel_idx] = 0
elif str(self.interpolator.address_mode).lower() == 'mirror':
def triangle_fun(x, half_period):
saw_tooth = cast_func(sp.Abs(cast_func(x, 'int32')), 'int32') % (
cast_func(2 * half_period, create_type('int32')))
return sp.Piecewise((saw_tooth, saw_tooth < half_period),
(2 * half_period - 1 - saw_tooth, True))
index = [cast_func(triangle_fun(i, field.shape[dim]),
default_int_type) for (dim, i) in enumerate(index)]
sum[channel_idx] += weight * \
absolute_access(index, channel_idx if field.index_dimensions else ())
else:
raise NotImplementedError()
elif self.interpolation_mode == InterpolationMode.CUBIC_SPLINE:
raise NotImplementedError("only works with HW interpolation for float32")
sum = [sp.factor(s) for s in sum]
if field.index_dimensions:
return sp.Matrix(sum)
else:
return sum[0]
# noinspection SpellCheckingInspection
__xnew__ = staticmethod(__new_stage2__)
# noinspection SpellCheckingInspection
__xnew_cached_ = staticmethod(cacheit(__new_stage2__))
def __getnewargs__(self):
return (self.symbol, *self.offsets)
def __getnewargs_ex__(self):
return (self.symbol, *self.offsets), {}
class DiffInterpolatorAccess(InterpolatorAccess):
def __new__(cls, symbol, diff_coordinate_idx, *offsets):
if symbol.interpolator.interpolation_mode == InterpolationMode.LINEAR:
from pystencils.fd import Diff, Discretization2ndOrder
return Discretization2ndOrder(1)(Diff(symbol.interpolator.at(offsets), diff_coordinate_idx))
obj = DiffInterpolatorAccess.__xnew_cached_(cls, symbol, diff_coordinate_idx, *offsets)
return obj
def __new_stage2__(self, symbol: sp.Symbol, diff_coordinate_idx, *offsets):
assert offsets is not None
obj = super().__xnew__(self, symbol, *offsets)
obj.diff_coordinate_idx = diff_coordinate_idx
return obj
def __hash__(self):
return hash((self.symbol, self.field, self.diff_coordinate_idx, tuple(self.offsets), self.interpolator))
def __str__(self):
return '%s_diff%i_interpolator(%s)' % (self.field.name, self.diff_coordinate_idx,
', '.join(str(o) for o in self.offsets))
def __repr__(self):
return str(self)
@property
def args(self):
return [self.symbol, self.diff_coordinate_idx, *self.offsets]
@property
def symbols_defined(self) -> Set[sp.Symbol]:
return {self}
@property
def interpolation_mode(self):
return self.interpolator.interpolation_mode
# noinspection SpellCheckingInspection
__xnew__ = staticmethod(__new_stage2__)
# noinspection SpellCheckingInspection
__xnew_cached_ = staticmethod(cacheit(__new_stage2__))
def __getnewargs__(self):
return (self.symbol, self.diff_coordinate_idx, *self.offsets)
def __getnewargs_ex__(self):
return (self.symbol, self.diff_coordinate_idx, *self.offsets), {}
##########################################################################################
# GPU-specific fast specializations (for precision GPUs can also use above nodes/symbols #
##########################################################################################
class TextureCachedField(Interpolator):
def __init__(self, parent_field,
address_mode=None,
filter_mode=None,
interpolation_mode: InterpolationMode = InterpolationMode.LINEAR,
use_normalized_coordinates=False,
read_as_integer=False
):
super().__init__(parent_field, interpolation_mode, address_mode, use_normalized_coordinates)
if address_mode is None:
address_mode = 'border'
if filter_mode is None:
filter_mode = pycuda.driver.filter_mode.LINEAR
self.read_as_integer = read_as_integer
self.required_global_declarations = [TextureDeclaration(self)]
@property
def ndim(self):
return self.field.ndim
@classmethod
def from_interpolator(cls, interpolator: LinearInterpolator):
if (isinstance(interpolator, cls)
or (hasattr(interpolator, 'allow_textures') and not interpolator.allow_textures)):
return interpolator
obj = cls(interpolator.field, interpolator.address_mode, interpolation_mode=interpolator.interpolation_mode)
return obj
def __str__(self):
return f'{self.field.name}_texture_{self.reproducible_hash}'
def __repr__(self):
return self.__str__()
@property
def reproducible_hash(self):
return _hash(str(self._hashable_contents).encode()).hexdigest()
class TextureDeclaration(Node):
"""
A global declaration of a texture. Visible both for device and host code.
.. code:: cpp
// This Node represents the following global declaration
texture<float, cudaTextureType2D, cudaReadModeElementType> x_texture_5acc9fced7b0dc3e;
__device__ kernel(...) {
// kernel acceses x_texture_5acc9fced7b0dc3e with tex2d(...)
}
__host__ launch_kernel(...) {
// Host needs to bind the texture
cudaBindTexture(0, x_texture_5acc9fced7b0dc3e, buffer, N*sizeof(float));
}
This has been deprecated by CUDA in favor of :class:`.TextureObject`.
But texture objects are not yet supported by PyCUDA (https://github.com/inducer/pycuda/pull/174)
"""
def __init__(self, parent_texture):
self.texture = parent_texture
self._symbols_defined = {self.texture.symbol}
@property
def symbols_defined(self) -> Set[sp.Symbol]:
return self._symbols_defined
@property
def args(self) -> Set[sp.Symbol]:
return set()
@property
def headers(self):
headers = ['"pycuda-helpers.hpp"']
if self.texture.interpolation_mode == InterpolationMode.CUBIC_SPLINE:
headers.append('"cubicTex%iD.cu"' % self.texture.ndim)
return headers
def __str__(self):
from pystencils.backends.cuda_backend import CudaBackend
return CudaBackend()(self)
def __repr__(self):
return str(self)
class TextureObject(TextureDeclaration):
"""
A CUDA texture object. Opposed to :class:`.TextureDeclaration` it is not declared globally but
used as a function argument for the kernel call.
Like :class:`.TextureDeclaration` it defines :class:`.TextureAccess` symbols.
Just the printing representation is a bit different.
"""
pass
def dtype_supports_textures(dtype):
"""
Returns whether CUDA natively supports texture fetches with this numpy dtype.
The maximum word size for a texture fetch is four bytes.
With this trick also larger dtypes can be fetched:
https://github.com/inducer/pycuda/blob/master/pycuda/cuda/pycuda-helpers.hpp
"""
if hasattr(dtype, 'numpy_dtype'):
dtype = dtype.numpy_dtype
if isinstance(dtype, type):
return dtype().itemsize <= 4
return dtype.itemsize <= 4
......@@ -83,7 +83,6 @@ class CreateKernelConfig:
Dict with indexing parameters (constructor parameters of indexing class)
e.g. for 'block' one can specify '{'block_size': (20, 20, 10) }'.
"""
use_textures_for_interpolation: bool = True
cpu_prepend_optimizations: List[Callable] = field(default_factory=list)
"""
List of extra optimizations to perform first on the AST.
......@@ -257,8 +256,7 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC
indexing_creator=indexing_creator_from_params(config.gpu_indexing,
config.gpu_indexing_params),
iteration_slice=config.iteration_slice, ghost_layers=config.ghost_layers,
skip_independence_check=config.skip_independence_check,
use_textures_for_interpolation=config.use_textures_for_interpolation)
skip_independence_check=config.skip_independence_check)
if config.backend == Backend.OPENCL:
from pystencils.opencl.opencljit import make_python_function
ast._backend = config.backend
......@@ -334,8 +332,7 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel
config.index_fields,
type_info=config.data_type,
coordinate_names=config.coordinate_names,
indexing_creator=idx_creator,
use_textures_for_interpolation=config.use_textures_for_interpolation)
indexing_creator=idx_creator)
if config.backend == Backend.OPENCL:
from pystencils.opencl.opencljit import make_python_function
ast._backend = config.backend
......@@ -448,7 +445,6 @@ def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclus
inner_assignment = []
for assignment in assignments:
direction = stencil[assignment.lhs.index[0]]
inner_assignment.append(SympyAssignment(assignment.lhs, assignment.rhs))
last_conditional = Conditional(sp.And(*[condition(d) for d in stencil]),
Block(inner_assignment), outer_assignment)
......
......@@ -832,18 +832,12 @@ class KernelConstraintsCheck:
return ast.SympyAssignment(new_lhs, new_rhs)
def process_expression(self, rhs, type_constants=True):
from pystencils.interpolation_astnodes import InterpolatorAccess
self._update_accesses_rhs(rhs)
if isinstance(rhs, AbstractField.AbstractAccess):
self.fields_read.add(rhs.field)
self.fields_read.update(rhs.indirect_addressing_fields)
return rhs
elif isinstance(rhs, InterpolatorAccess):
new_args = [self.process_expression(arg, type_constants) for arg in rhs.offsets]
if new_args:
rhs.offsets = new_args
return rhs
elif isinstance(rhs, ImaginaryUnit):
return TypedImaginaryUnit(create_type(self._type_for_symbol['_complex_type']))
elif isinstance(rhs, TypedSymbol):
......@@ -1339,68 +1333,6 @@ def loop_blocking(ast_node: ast.KernelFunction, block_size) -> int:
return coordinates_taken_into_account
def implement_interpolations(ast_node: ast.Node,
implement_by_texture_accesses: bool = False,
vectorize: bool = False,
use_hardware_interpolation_for_f32=True):
from pystencils.interpolation_astnodes import (InterpolatorAccess, TextureCachedField)
# TODO: perform this function on assignments, when unify_shape_symbols allows differently sized fields
assert not(implement_by_texture_accesses and vectorize), \
"can only implement interpolations either by texture accesses or CPU vectorization"
FLOAT32_T = create_type('float32')
interpolation_accesses = ast_node.atoms(InterpolatorAccess)
if not interpolation_accesses:
return ast_node
def can_use_hw_interpolation(i):
return (use_hardware_interpolation_for_f32
and implement_by_texture_accesses
and i.dtype == FLOAT32_T
and isinstance(i.symbol.interpolator, TextureCachedField))
if implement_by_texture_accesses:
for i in interpolation_accesses:
from pystencils.interpolation_astnodes import _InterpolationSymbol
try:
import pycuda.driver as cuda
texture = TextureCachedField.from_interpolator(i.interpolator)
if can_use_hw_interpolation(i):
texture.filter_mode = cuda.filter_mode.LINEAR
else:
texture.filter_mode = cuda.filter_mode.POINT
texture.read_as_integer = True
except Exception as e:
raise e
i.symbol = _InterpolationSymbol(str(texture), i.symbol.field, texture)
# from pystencils.math_optimizations import ReplaceOptim, optimize_ast
# ImplementInterpolationByStencils = ReplaceOptim(lambda e: isinstance(e, InterpolatorAccess)
# and not can_use_hw_interpolation(i),
# lambda e: e.implementation_with_stencils()
# )
# RemoveConjugate = ReplaceOptim(lambda e: isinstance(e, sp.conjugate),
# lambda e: e.args[0]
# )
if vectorize:
# TODO can be done in _interpolator_access_to_stencils field.absolute_access == simd_gather
raise NotImplementedError()
else:
substitutions = {i: i.implementation_with_stencils()
for i in interpolation_accesses if not can_use_hw_interpolation(i)}
if isinstance(ast_node, AssignmentCollection):
ast_node = ast_node.subs(substitutions)
else:
ast_node.subs(substitutions)
return ast_node
def adjust_c_single_precision_type(type_for_symbol):
"""Replaces every occurrence of 'float' with 'single' to enforce the numpy single precision type."""
def single_factory():
......
# -*- coding: utf-8 -*-
#
# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
#
# Distributed under terms of the GPLv3 license.
"""
"""
from os.path import dirname, join
import numpy as np
import sympy
import pystencils
from pystencils.interpolation_astnodes import LinearInterpolator
try:
import pyconrad.autoinit
except Exception:
import unittest.mock
pyconrad = unittest.mock.MagicMock()
LENNA_FILE = join(dirname(__file__), 'test_data', 'lenna.png')
try:
import skimage.io
lenna = skimage.io.imread(LENNA_FILE, as_gray=True).astype(np.float32)
except Exception:
lenna = np.random.rand(20, 30).astype(np.float32)
def test_rotate_center():
x, y = pystencils.fields('x, y: float32[2d]')
# Rotate around center when setting coordindates origins to field centers
x.set_coordinate_origin_to_field_center()
y.set_coordinate_origin_to_field_center()
rotation_angle = sympy.pi / 5
transform_matrix = sympy.rot_axis3(rotation_angle)[:2, :2]
# Generic matrix transform works like that (for rotation it would be more clever to use transform_matrix.T)
inverse_matrix = transform_matrix.inv()
input_coordinate = x.physical_to_index(inverse_matrix @ y.physical_coordinates)
assignments = pystencils.AssignmentCollection({
y.center(): LinearInterpolator(x).at(input_coordinate)
})
kernel = pystencils.create_kernel(assignments).compile()
rotated = np.zeros_like(lenna)
kernel(x=lenna, y=rotated)
pyconrad.imshow(lenna, "lenna")
pyconrad.imshow(rotated, "rotated")
# If distance in input field is twice as close we will see a smaller image
x.coordinate_transform /= 2
input_coordinate = x.physical_to_index(inverse_matrix @ y.physical_coordinates)
assignments = pystencils.AssignmentCollection({
y.center(): LinearInterpolator(x).at(input_coordinate)
})
kernel = pystencils.create_kernel(assignments).compile()
rotated = np.zeros_like(lenna)
kernel(x=lenna, y=rotated)
pyconrad.imshow(rotated, "rotated smaller")
# Conversely, if output field has samples 3 times closer we will see a bigger image
y.coordinate_transform /= 3
input_coordinate = x.physical_to_index(inverse_matrix @ y.physical_coordinates)
assignments = pystencils.AssignmentCollection({
y.center(): LinearInterpolator(x).at(input_coordinate)
})
kernel = pystencils.create_kernel(assignments).compile()
rotated = np.zeros_like(lenna)
kernel(x=lenna, y=rotated)
pyconrad.imshow(rotated, "rotated bigger")
# coordinate_transform can be any matrix, also with symbols as entries
def main():
test_rotate_center()
if __name__ == '__main__':
main()
# -*- coding: utf-8 -*-
#
# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
#
# Distributed under terms of the GPLv3 license.
"""
"""
import itertools
from os.path import dirname, join
import numpy as np
import pytest
import sympy
import pystencils
from pystencils.interpolation_astnodes import LinearInterpolator
from pystencils.spatial_coordinates import x_, y_
type_map = {
np.float32: 'float32',
np.float64: 'float64',
np.int32: 'int32',
}
try:
import pyconrad.autoinit
except Exception:
import unittest.mock
pyconrad = unittest.mock.MagicMock()
LENNA_FILE = join(dirname(__file__), 'test_data', 'lenna.png')
try:
import skimage.io
lenna = skimage.io.imread(LENNA_FILE, as_gray=True).astype(np.float64)
pyconrad.imshow(lenna)
except Exception:
lenna = np.random.rand(20, 30)
def test_interpolation():
x_f, y_f = pystencils.fields('x,y: float64 [2d]')
assignments = pystencils.AssignmentCollection({
y_f.center(): LinearInterpolator(x_f).at([x_ + 2.7, y_ + 7.2])
})
print(assignments)
ast = pystencils.create_kernel(assignments)
print(ast)
print(pystencils.show_code(ast))
kernel = ast.compile()
pyconrad.imshow(lenna)
out = np.zeros_like(lenna)
kernel(x=lenna, y=out)
pyconrad.imshow(out, "out")
def test_scale_interpolation():
x_f, y_f = pystencils.fields('x,y: float64 [2d]')
for address_mode in ['border', 'wrap', 'clamp', 'mirror']:
assignments = pystencils.AssignmentCollection({
y_f.center(): LinearInterpolator(x_f, address_mode=address_mode).at([0.5 * x_ + 2.7, 0.25 * y_ + 7.2])
})
print(assignments)
ast = pystencils.create_kernel(assignments)
print(ast)
print(pystencils.show_code(ast))
kernel = ast.compile()
out = np.zeros_like(lenna)
kernel(x=lenna, y=out)
pyconrad.imshow(out, "out " + address_mode)
@pytest.mark.parametrize('address_mode', ['border', 'clamp'])
def test_rotate_interpolation(address_mode):
"""
'wrap', 'mirror' currently fails on new sympy due to conjugate()
"""
x_f, y_f = pystencils.fields('x,y: float64 [2d]')
rotation_angle = sympy.pi / 5
transformed = sympy.rot_axis3(rotation_angle)[:2, :2] * sympy.Matrix((x_, y_))
assignments = pystencils.AssignmentCollection({
y_f.center(): LinearInterpolator(x_f, address_mode=address_mode).at(transformed)
})
print(assignments)
ast = pystencils.create_kernel(assignments)
print(ast)
print(pystencils.show_code(ast))
kernel = ast.compile()
out = np.zeros_like(lenna)
kernel(x=lenna, y=out)
pyconrad.imshow(out, "out " + address_mode)
@pytest.mark.parametrize('dtype', (np.int32, np.float32, np.float64))
@pytest.mark.parametrize('address_mode', ('border', 'wrap', 'clamp', 'mirror'))
@pytest.mark.parametrize('use_textures', ('use_textures', False))
def test_rotate_interpolation_gpu(dtype, address_mode, use_textures):
pytest.importorskip('pycuda')
import pycuda.gpuarray as gpuarray
import pycuda.autoinit # noqa
rotation_angle = sympy.pi / 5
scale = 1
if dtype == np.int32:
lenna_gpu = gpuarray.to_gpu(
np.ascontiguousarray(lenna * 255, dtype))
else:
lenna_gpu = gpuarray.to_gpu(
np.ascontiguousarray(lenna, dtype))
x_f, y_f = pystencils.fields(f'x,y: {type_map[dtype]} [2d]', ghost_layers=0)
transformed = scale * \
sympy.rot_axis3(rotation_angle)[:2, :2] * sympy.Matrix((x_, y_)) - sympy.Matrix([2, 2])
assignments = pystencils.AssignmentCollection({
y_f.center(): LinearInterpolator(x_f, address_mode=address_mode).at(transformed)
})
print(assignments)
ast = pystencils.create_kernel(assignments, target=pystencils.Target.GPU,
use_textures_for_interpolation=use_textures)
print(ast)
print(pystencils.show_code(ast))
kernel = ast.compile()
out = gpuarray.zeros_like(lenna_gpu)
kernel(x=lenna_gpu, y=out)
pyconrad.imshow(out,
f"out {address_mode} texture:{use_textures} {type_map[dtype]}")
@pytest.mark.parametrize('address_mode', ['border', 'wrap', 'mirror'])
@pytest.mark.parametrize('dtype', [np.float64, np.float32, np.int32])
@pytest.mark.parametrize('use_textures', ('use_textures', False,))
def test_shift_interpolation_gpu(address_mode, dtype, use_textures):
sver = sympy.__version__.split(".")
if (int(sver[0]) == 1 and int(sver[1]) < 2) and address_mode == 'mirror':
pytest.skip("% printed as fmod on old sympy")
pytest.importorskip('pycuda')
import pycuda.gpuarray as gpuarray
import pycuda.autoinit # noqa
rotation_angle = 0 # sympy.pi / 5
scale = 1
# shift = - sympy.Matrix([1.5, 1.5])
shift = sympy.Matrix((0.0, 0.0))
if dtype == np.int32:
lenna_gpu = gpuarray.to_gpu(
np.ascontiguousarray(lenna * 255, dtype))
else:
lenna_gpu = gpuarray.to_gpu(
np.ascontiguousarray(lenna, dtype))
x_f, y_f = pystencils.fields(f'x,y: {type_map[dtype]} [2d]', ghost_layers=0)
if use_textures:
transformed = scale * sympy.rot_axis3(rotation_angle)[:2, :2] * sympy.Matrix((x_, y_)) + shift
else:
transformed = scale * sympy.rot_axis3(rotation_angle)[:2, :2] * sympy.Matrix((x_, y_)) + shift
assignments = pystencils.AssignmentCollection({
y_f.center(): LinearInterpolator(x_f, address_mode=address_mode).at(transformed)
})
# print(assignments)
ast = pystencils.create_kernel(assignments, target=pystencils.Target.GPU,
use_textures_for_interpolation=use_textures)
# print(ast)
print(pystencils.show_code(ast))
kernel = ast.compile()
out = gpuarray.zeros_like(lenna_gpu)
kernel(x=lenna_gpu, y=out)
pyconrad.imshow(out,
f"out {address_mode} texture:{use_textures} {type_map[dtype]}")
@pytest.mark.parametrize('address_mode', ['border', 'clamp'])
def test_rotate_interpolation_size_change(address_mode):
"""
'wrap', 'mirror' currently fails on new sympy due to conjugate()
"""
x_f, y_f = pystencils.fields('x,y: float64 [2d]')
rotation_angle = sympy.pi / 5
transformed = sympy.rot_axis3(rotation_angle)[:2, :2] * sympy.Matrix((x_, y_))
assignments = pystencils.AssignmentCollection({
y_f.center(): LinearInterpolator(x_f, address_mode=address_mode).at(transformed)
})
print(assignments)
ast = pystencils.create_kernel(assignments)
print(ast)
print(pystencils.show_code(ast))
kernel = ast.compile()
out = np.zeros((100, 150), np.float64)
kernel(x=lenna, y=out)
pyconrad.imshow(out, "small out " + address_mode)
@pytest.mark.parametrize('address_mode, target',
itertools.product(['border', 'wrap', 'clamp', 'mirror'], [pystencils.Target.CPU]))
def test_field_interpolated(address_mode, target):
x_f, y_f = pystencils.fields('x,y: float64 [2d]')
assignments = pystencils.AssignmentCollection({
y_f.center(): x_f.interpolated_access([0.5 * x_ + 2.7, 0.25 * y_ + 7.2], address_mode=address_mode)
})
print(assignments)
ast = pystencils.create_kernel(assignments, target=target)
print(ast)
print(pystencils.show_code(ast))
kernel = ast.compile()
out = np.zeros_like(lenna)
kernel(x=lenna, y=out)
pyconrad.imshow(out, "out " + address_mode)
def test_spatial_derivative():
x, y = pystencils.fields('x, y: float32[2d]')
tx, ty = pystencils.fields('t_x, t_y: float32[2d]')
diff = sympy.diff(x.interpolated_access((tx.center, ty.center)), tx.center)
print("diff: " + str(diff))
......@@ -87,10 +87,25 @@ def test_sqrt_of_integer():
assignments = [ps.Assignment(tmp, sp.sqrt(3)),
ps.Assignment(f[0], tmp)]
arr = np.array([1], dtype=np.float64)
arr_double = np.array([1], dtype=np.float64)
kernel = ps.create_kernel(assignments).compile()
kernel(f=arr)
assert 1.7 < arr[0] < 1.8
kernel(f=arr_double)
assert 1.7 < arr_double[0] < 1.8
f = ps.fields("f: float32[1D]")
tmp = sp.symbols("tmp")
assignments = [ps.Assignment(tmp, sp.sqrt(3)),
ps.Assignment(f[0], tmp)]
arr_single = np.array([1], dtype=np.float32)
config = ps.CreateKernelConfig(data_type="float32")
kernel = ps.create_kernel(assignments, config=config).compile()
kernel(f=arr_single)
code = ps.get_code_str(kernel.ast)
assert "1.7320508075688772f" in code
assert 1.7 < arr_single[0] < 1.8
def test_integer_comparision():
......