Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
No results found
Show changes
Commits on Source (12)
......@@ -212,6 +212,8 @@ class KernelFunction(Node):
argument_symbols = self._body.undefined_symbols - self.global_variables
parameters = [self.Parameter(symbol, get_fields(symbol)) for symbol in argument_symbols]
if hasattr(self, 'indexing'):
parameters += [self.Parameter(s, []) for s in self.indexing.symbolic_parameters()]
parameters.sort(key=lambda p: p.symbol.name)
return parameters
......@@ -252,14 +254,6 @@ class Block(Node):
return self._nodes
def subs(self, subs_dict) -> None:
new_args = []
for a in self.args:
if isinstance(a, SympyAssignment) and a.is_declaration and a.rhs in subs_dict.keys():
subs_dict[a.lhs] = subs_dict[a.rhs]
else:
new_args.append(a)
self._nodes = new_args
for a in self.args:
a.subs(subs_dict)
......
......@@ -5,7 +5,11 @@ from typing import Set
from sympy.printing.ccode import C89CodePrinter
from pystencils.cpu.vectorization import vec_any, vec_all
from pystencils.fast_approximation import fast_division, fast_sqrt, fast_inv_sqrt
from pystencils.data_types import (PointerType, VectorType, address_of,
cast_func, create_type, reinterpret_cast_func,
get_type_of_expression,
vector_memory_access)
from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
try:
from sympy.printing.ccode import C99CodePrinter as CCodePrinter
......@@ -15,8 +19,6 @@ except ImportError:
from pystencils.integer_functions import bitwise_xor, bit_shift_right, bit_shift_left, bitwise_and, \
bitwise_or, modulo_ceil, int_div, int_power_of_2
from pystencils.astnodes import Node, KernelFunction
from pystencils.data_types import create_type, PointerType, get_type_of_expression, VectorType, cast_func, \
vector_memory_access, reinterpret_cast_func
__all__ = ['generate_c', 'CustomCodeNode', 'PrintNode', 'get_headers', 'CustomSympyPrinter']
......@@ -38,10 +40,39 @@ def generate_c(ast_node: Node, signature_only: bool = False, dialect='c') -> str
Returns:
C-like code for the ast node and its descendants
"""
global_declarations = get_global_declarations(ast_node)
for d in global_declarations:
if hasattr(ast_node, "global_variables"):
ast_node.global_variables.update(d.symbols_defined)
else:
ast_node.global_variables = d.symbols_defined
printer = CBackend(signature_only=signature_only,
vector_instruction_set=ast_node.instruction_set,
dialect=dialect)
return printer(ast_node)
code = printer(ast_node)
if not signature_only and isinstance(ast_node, KernelFunction):
code = "\n" + code
for declaration in global_declarations:
code = printer(declaration) + "\n" + code
return code
def get_global_declarations(ast):
global_declarations = []
def visit_node(sub_ast):
if hasattr(sub_ast, "required_global_declarations"):
nonlocal global_declarations
global_declarations += sub_ast.required_global_declarations
if hasattr(sub_ast, "args"):
for node in sub_ast.args:
visit_node(node)
visit_node(ast)
return set(global_declarations)
def get_headers(ast_node: Node) -> Set[str]:
......@@ -276,6 +307,9 @@ class CustomSympyPrinter(CCodePrinter):
if isinstance(expr, reinterpret_cast_func):
arg, data_type = expr.args
return "*((%s)(& %s))" % (PointerType(data_type, restrict=False), self._print(arg))
elif isinstance(expr, address_of):
assert len(expr.args) == 1, "address_of must only have one argument"
return "&(%s)" % self._print(expr.args[0])
elif isinstance(expr, cast_func):
arg, data_type = expr.args
if isinstance(arg, sp.Number):
......
......@@ -14,6 +14,33 @@ from pystencils.utils import all_equal
from sympy.logic.boolalg import Boolean
# noinspection PyPep8Naming
class address_of(sp.Function):
is_Atom = True
def __new__(cls, arg):
obj = sp.Function.__new__(cls, arg)
return obj
@property
def canonical(self):
if hasattr(self.args[0], 'canonical'):
return self.args[0].canonical
else:
raise NotImplementedError()
@property
def is_commutative(self):
return self.args[0].is_commutative
@property
def dtype(self):
if hasattr(self.args[0], 'dtype'):
return PointerType(self.args[0].dtype, restrict=True)
else:
return PointerType('void', restrict=True)
# noinspection PyPep8Naming
class cast_func(sp.Function):
is_Atom = True
......
......@@ -292,7 +292,11 @@ class Field(AbstractField):
self.latex_name = None # type: Optional[str]
def new_field_with_different_name(self, new_name):
return Field(new_name, self.field_type, self._dtype, self._layout, self.shape, self.strides)
if self.has_fixed_shape:
return Field(new_name, self.field_type, self._dtype, self._layout, self.shape, self.strides)
else:
return Field.create_generic(new_name, self.spatial_dimensions, self.dtype.numpy_dtype,
self.index_dimensions, self._layout, self.index_shape, self.field_type)
@property
def spatial_dimensions(self) -> int:
......
......@@ -7,9 +7,7 @@ from pystencils.slicing import normalize_slice
from pystencils.data_types import TypedSymbol, create_type
from functools import partial
from pystencils.sympyextensions import prod
AUTO_BLOCK_SIZE_LIMITING = False
from pystencils.sympyextensions import prod, is_integer_sequence
BLOCK_IDX = [TypedSymbol("blockIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
THREAD_IDX = [TypedSymbol("threadIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
......@@ -66,6 +64,9 @@ class AbstractIndexing(abc.ABC):
"""Return maximal number of threads per block for launch bounds. If this cannot be determined without
knowing the array shape return None for unknown """
@abc.abstractmethod
def symbolic_parameters(self):
"""Set of symbols required in call_parameters code"""
# -------------------------------------------- Implementations ---------------------------------------------------------
......@@ -81,17 +82,26 @@ class BlockIndexing(AbstractIndexing):
compile_time_block_size: compile in concrete block size, otherwise the cuda variable 'blockDim' is used
"""
def __init__(self, field, iteration_slice,
block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False):
block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
maximum_block_size=(1024, 1024, 64)):
if field.spatial_dimensions > 3:
raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
if permute_block_size_dependent_on_layout:
block_size = self.permute_block_size_according_to_layout(block_size, field.layout)
if AUTO_BLOCK_SIZE_LIMITING:
block_size = self.limit_block_size_to_device_maximum(block_size)
self._block_size = block_size
if maximum_block_size == 'auto':
# Get device limits
import pycuda.driver as cuda
# noinspection PyUnresolvedReferences
import pycuda.autoinit # NOQA
da = cuda.device_attribute
device = cuda.Context.get_device()
maximum_block_size = tuple(device.get_attribute(a)
for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z))
self._maximum_block_size = maximum_block_size
self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
self._dim = field.spatial_dimensions
self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
......@@ -122,6 +132,7 @@ class BlockIndexing(AbstractIndexing):
adapted_block_size.append(sp.Min(block_size[i] * factor, widths[i]))
block_size = tuple(adapted_block_size) + extend_bs
block_size = tuple(sp.Min(bs, max_bs) for bs, max_bs in zip(block_size, self._maximum_block_size))
grid = tuple(div_ceil(length, block_size) for length, block_size in zip(widths, block_size))
extend_gr = (1,) * (3 - len(grid))
......@@ -137,66 +148,6 @@ class BlockIndexing(AbstractIndexing):
condition = sp.And(condition, c)
return Block([Conditional(condition, kernel_content)])
@staticmethod
def limit_block_size_to_device_maximum(block_size):
"""Changes block size to match device limits.
* if the total amount of threads is too big for the current device, the biggest coordinate is divided by 2.
* next, if one component is still too big, the component which is too big is divided by 2 and the smallest
component is multiplied by 2, such that the total amount of threads stays the same
Returns:
the altered block_size
"""
# Get device limits
import pycuda.driver as cuda
# noinspection PyUnresolvedReferences
import pycuda.autoinit # NOQA
da = cuda.device_attribute
device = cuda.Context.get_device()
block_size = list(block_size)
max_threads = device.get_attribute(da.MAX_THREADS_PER_BLOCK)
max_block_size = [device.get_attribute(a)
for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z)]
def prod(seq):
result = 1
for e in seq:
result *= e
return result
def get_index_of_too_big_element():
for i, bs in enumerate(block_size):
if bs > max_block_size[i]:
return i
return None
def get_index_of_too_small_element():
for i, bs in enumerate(block_size):
if bs // 2 <= max_block_size[i]:
return i
return None
# Reduce the total number of threads if necessary
while prod(block_size) > max_threads:
item_to_reduce = block_size.index(max(block_size))
for j, block_size_entry in enumerate(block_size):
if block_size_entry > max_block_size[j]:
item_to_reduce = j
block_size[item_to_reduce] //= 2
# Cap individual elements
too_big_element_index = get_index_of_too_big_element()
while too_big_element_index is not None:
too_small_element_index = get_index_of_too_small_element()
block_size[too_small_element_index] *= 2
block_size[too_big_element_index] //= 2
too_big_element_index = get_index_of_too_big_element()
return tuple(block_size)
@staticmethod
def limit_block_size_by_register_restriction(block_size, required_registers_per_thread, device=None):
"""Shrinks the block_size if there are too many registers used per multiprocessor.
......@@ -230,6 +181,8 @@ class BlockIndexing(AbstractIndexing):
@staticmethod
def permute_block_size_according_to_layout(block_size, layout):
"""Returns modified block_size such that the fastest coordinate gets the biggest block dimension"""
if not is_integer_sequence(block_size):
return block_size
sorted_block_size = list(sorted(block_size, reverse=True))
while len(sorted_block_size) > len(layout):
sorted_block_size[0] *= sorted_block_size[-1]
......@@ -241,7 +194,13 @@ class BlockIndexing(AbstractIndexing):
return tuple(result[:len(layout)])
def max_threads_per_block(self):
return prod(self._block_size)
if is_integer_sequence(self._block_size):
return prod(self._block_size)
else:
return None
def symbolic_parameters(self):
return set(b for b in self._block_size if isinstance(b, sp.Symbol))
class LineIndexing(AbstractIndexing):
......@@ -293,6 +252,9 @@ class LineIndexing(AbstractIndexing):
def max_threads_per_block(self):
return None
def symbolic_parameters(self):
return set()
# -------------------------------------- Helper functions --------------------------------------------------------------
......
......@@ -28,7 +28,7 @@ class AssignmentCollection:
# ------------------------------- Creation & Inplace Manipulation --------------------------------------------------
def __init__(self, main_assignments: Union[List[Assignment], Dict[sp.Expr, sp.Expr]],
subexpressions: Union[List[Assignment], Dict[sp.Expr, sp.Expr]],
subexpressions: Union[List[Assignment], Dict[sp.Expr, sp.Expr]] = {},
simplification_hints: Optional[Dict[str, Any]] = None,
subexpression_symbol_generator: Iterator[sp.Symbol] = None) -> None:
if isinstance(main_assignments, Dict):
......
"""
Test of pystencils.data_types.address_of
"""
from pystencils.data_types import address_of, cast_func, PointerType
import pystencils
from pystencils.simp.simplifications import sympy_cse
import sympy
def test_address_of():
x, y = pystencils.fields('x,y: int64[2d]')
s = pystencils.TypedSymbol('s', PointerType('int64'))
assignments = pystencils.AssignmentCollection({
s: address_of(x[0, 0]),
y[0, 0]: cast_func(s, 'int64')
}, {})
ast = pystencils.create_kernel(assignments)
code = pystencils.show_code(ast)
print(code)
assignments = pystencils.AssignmentCollection({
y[0, 0]: cast_func(address_of(x[0, 0]), 'int64')
}, {})
ast = pystencils.create_kernel(assignments)
code = pystencils.show_code(ast)
print(code)
def test_address_of_with_cse():
x, y = pystencils.fields('x,y: int64[2d]')
s = pystencils.TypedSymbol('s', PointerType('int64'))
assignments = pystencils.AssignmentCollection({
y[0, 0]: cast_func(address_of(x[0, 0]), 'int64'),
x[0, 0]: cast_func(address_of(x[0, 0]), 'int64') + 1
}, {})
ast = pystencils.create_kernel(assignments)
code = pystencils.show_code(ast)
assignments_cse = sympy_cse(assignments)
ast = pystencils.create_kernel(assignments_cse)
code = pystencils.show_code(ast)
print(code)
def main():
test_address_of()
test_address_of_with_cse()
if __name__ == '__main__':
main()
......@@ -147,11 +147,6 @@ def test_periodicity():
np.testing.assert_equal(cpu_result, gpu_result)
def test_block_size_limiting():
res = BlockIndexing.limit_block_size_to_device_maximum((4096, 4096, 4096))
assert all(r < 4096 for r in res)
def test_block_indexing():
f = fields("f: [3D]")
bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), permute_block_size_dependent_on_layout=False)
......
# -*- coding: utf-8 -*-
#
# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
#
# Distributed under terms of the GPLv3 license.
"""
"""
import sympy
import pystencils.astnodes
from pystencils.backends.cbackend import CBackend
from pystencils.data_types import TypedSymbol
class BogusDeclaration(pystencils.astnodes.Node):
"""Base class for all AST nodes."""
def __init__(self, parent=None):
self.parent = parent
@property
def args(self):
"""Returns all arguments/children of this node."""
raise set()
@property
def symbols_defined(self):
"""Set of symbols which are defined by this node."""
return {TypedSymbol('Foo', 'double')}
@property
def undefined_symbols(self):
"""Symbols which are used but are not defined inside this node."""
set()
def subs(self, subs_dict):
"""Inplace! substitute, similar to sympy's but modifies the AST inplace."""
for a in self.args:
a.subs(subs_dict)
@property
def func(self):
return self.__class__
def atoms(self, arg_type):
"""Returns a set of all descendants recursively, which are an instance of the given type."""
result = set()
for arg in self.args:
if isinstance(arg, arg_type):
result.add(arg)
result.update(arg.atoms(arg_type))
return result
class BogusUsage(pystencils.astnodes.Node):
"""Base class for all AST nodes."""
def __init__(self, requires_global: bool, parent=None):
self.parent = parent
if requires_global:
self.required_global_declarations = [BogusDeclaration()]
@property
def args(self):
"""Returns all arguments/children of this node."""
return set()
@property
def symbols_defined(self):
"""Set of symbols which are defined by this node."""
return set()
@property
def undefined_symbols(self):
"""Symbols which are used but are not defined inside this node."""
return {TypedSymbol('Foo', 'double')}
def subs(self, subs_dict):
"""Inplace! substitute, similar to sympy's but modifies the AST inplace."""
for a in self.args:
a.subs(subs_dict)
@property
def func(self):
return self.__class__
def atoms(self, arg_type):
"""Returns a set of all descendants recursively, which are an instance of the given type."""
result = set()
for arg in self.args:
if isinstance(arg, arg_type):
result.add(arg)
result.update(arg.atoms(arg_type))
return result
def test_global_definitions_with_global_symbol():
# Teach our printer to print new ast nodes
CBackend._print_BogusUsage = lambda _, __: "// Bogus would go here"
CBackend._print_BogusDeclaration = lambda _, __: "// Declaration would go here"
z, x, y = pystencils.fields("z, y, x: [2d]")
normal_assignments = pystencils.AssignmentCollection([pystencils.Assignment(
z[0, 0], x[0, 0] * sympy.log(x[0, 0] * y[0, 0]))], [])
ast = pystencils.create_kernel(normal_assignments)
print(pystencils.show_code(ast))
ast.body.append(BogusUsage(requires_global=True))
print(pystencils.show_code(ast))
kernel = ast.compile()
assert kernel is not None
assert TypedSymbol('Foo', 'double') not in [p.symbol for p in ast.get_parameters()]
def test_global_definitions_without_global_symbol():
# Teach our printer to print new ast nodes
CBackend._print_BogusUsage = lambda _, __: "// Bogus would go here"
CBackend._print_BogusDeclaration = lambda _, __: "// Declaration would go here"
z, x, y = pystencils.fields("z, y, x: [2d]")
normal_assignments = pystencils.AssignmentCollection([pystencils.Assignment(
z[0, 0], x[0, 0] * sympy.log(x[0, 0] * y[0, 0]))], [])
ast = pystencils.create_kernel(normal_assignments)
print(pystencils.show_code(ast))
ast.body.append(BogusUsage(requires_global=False))
print(pystencils.show_code(ast))
kernel = ast.compile()
assert kernel is not None
assert TypedSymbol('Foo', 'double') in [p.symbol for p in ast.get_parameters()]
def main():
test_global_definitions_with_global_symbol()
test_global_definitions_without_global_symbol()
if __name__ == '__main__':
main()
......@@ -25,6 +25,26 @@ def test_vector_type_propagation():
np.testing.assert_equal(dst[1:-1, 1:-1], 2 * 10.0 + 3)
def test_inplace_update():
shape = (9, 9, 3)
arr = np.ones(shape, order='f')
@ps.kernel
def update_rule(s):
f = ps.fields("f(3) : [2D]", f=arr)
s.tmp0 @= f(0)
s.tmp1 @= f(1)
s.tmp2 @= f(2)
f0, f1, f2 = f(0), f(1), f(2)
f0 @= 2 * s.tmp0
f1 @= 2 * s.tmp0
f2 @= 2 * s.tmp0
ast = ps.create_kernel(update_rule, cpu_vectorize_info={'instruction_set': 'avx'})
kernel = ast.compile()
kernel(f=arr)
np.testing.assert_equal(arr, 2)
def test_vectorization_fixed_size():
configurations = []
# Fixed size - multiple of four
......