Commit 6a01f3e2 authored by Martin Bauer's avatar Martin Bauer
Browse files

Random number generation support for pystencils

- counter-based philox RNG: counter/key is filled with cell coordinate
  and optional external parameters like block position and time step
- works on CPU and GPU - on CPU only for non-vectorized versions

- introduced more flexible "CustomCodeNode" that can inject
  backend-specific hand-written code
parent 3595c16d
......@@ -13,6 +13,7 @@ from .kernel_decorator import kernel
from .stencils import visualize_stencil_expression
from . import fd
__all__ = ['Field', 'FieldType', 'fields',
'TypedSymbol',
'make_slice',
......
......@@ -14,10 +14,10 @@ from pystencils.astnodes import Node, KernelFunction
from pystencils.data_types import create_type, PointerType, get_type_of_expression, VectorType, cast_func, \
vector_memory_access
__all__ = ['generate_c', 'CustomCppCode', 'PrintNode', 'get_headers', 'CustomSympyPrinter']
__all__ = ['generate_c', 'CustomCodeNode', 'PrintNode', 'get_headers', 'CustomSympyPrinter']
def generate_c(ast_node: Node, signature_only: bool = False) -> str:
def generate_c(ast_node: Node, signature_only: bool = False, dialect='c') -> str:
"""Prints an abstract syntax tree node as C or CUDA code.
This function does not need to distinguish between C, C++ or CUDA code, it just prints 'C-like' code as encoded
......@@ -27,12 +27,13 @@ def generate_c(ast_node: Node, signature_only: bool = False) -> str:
Args:
ast_node:
signature_only:
dialect: 'c' or 'cuda'
Returns:
C-like code for the ast node and its descendants
"""
printer = CBackend(signature_only=signature_only,
vector_instruction_set=ast_node.instruction_set)
vector_instruction_set=ast_node.instruction_set,
dialect=dialect)
return printer(ast_node)
......@@ -55,16 +56,15 @@ def get_headers(ast_node: Node) -> Set[str]:
# --------------------------------------- Backend Specific Nodes -------------------------------------------------------
class CustomCppCode(Node):
class CustomCodeNode(Node):
def __init__(self, code, symbols_read, symbols_defined, parent=None):
super(CustomCppCode, self).__init__(parent=parent)
super(CustomCodeNode, self).__init__(parent=parent)
self._code = "\n" + code
self._symbolsRead = set(symbols_read)
self._symbolsDefined = set(symbols_defined)
self.headers = []
@property
def code(self):
def get_code(self, dialect, vector_instruction_set):
return self._code
@property
......@@ -80,7 +80,7 @@ class CustomCppCode(Node):
return self.symbols_defined - self._symbolsRead
class PrintNode(CustomCppCode):
class PrintNode(CustomCodeNode):
# noinspection SpellCheckingInspection
def __init__(self, symbol_to_print):
code = '\nstd::cout << "%s = " << %s << std::endl; \n' % (symbol_to_print.name, symbol_to_print.name)
......@@ -95,7 +95,7 @@ class PrintNode(CustomCppCode):
class CBackend:
def __init__(self, sympy_printer=None,
signature_only=False, vector_instruction_set=None):
signature_only=False, vector_instruction_set=None, dialect='c'):
if sympy_printer is None:
if vector_instruction_set is not None:
self.sympy_printer = VectorizedCustomSympyPrinter(vector_instruction_set)
......@@ -104,13 +104,14 @@ class CBackend:
else:
self.sympy_printer = sympy_printer
self._vectorInstructionSet = vector_instruction_set
self._vector_instruction_set = vector_instruction_set
self._indent = " "
self._dialect = dialect
self._signatureOnly = signature_only
def __call__(self, node):
prev_is = VectorType.instruction_set
VectorType.instruction_set = self._vectorInstructionSet
VectorType.instruction_set = self._vector_instruction_set
result = str(self._print(node))
VectorType.instruction_set = prev_is
return result
......@@ -120,7 +121,6 @@ class CBackend:
method_name = "_print_" + cls.__name__
if hasattr(self, method_name):
return getattr(self, method_name)(node)
raise NotImplementedError("CBackend does not support node of type " + str(type(node)))
def _print_KernelFunction(self, node):
......@@ -170,8 +170,8 @@ class CBackend:
else:
rhs = node.rhs
return self._vectorInstructionSet[instr].format("&" + self.sympy_printer.doprint(node.lhs.args[0]),
self.sympy_printer.doprint(rhs)) + ';'
return self._vector_instruction_set[instr].format("&" + self.sympy_printer.doprint(node.lhs.args[0]),
self.sympy_printer.doprint(rhs)) + ';'
else:
return "%s = %s;" % (self.sympy_printer.doprint(node.lhs), self.sympy_printer.doprint(node.rhs))
......@@ -191,9 +191,8 @@ class CBackend:
align = 64
return "free(%s - %d);" % (self.sympy_printer.doprint(node.symbol.name), node.offset(align))
@staticmethod
def _print_CustomCppCode(node):
return node.code
def _print_CustomCodeNode(self, node):
return node.get_code(self._dialect, self._vector_instruction_set)
def _print_Conditional(self, node):
condition_expr = self.sympy_printer.doprint(node.condition_expr)
......
......@@ -2,7 +2,7 @@ import numpy as np
import sympy as sp
from pystencils.assignment import Assignment
from pystencils import Field, TypedSymbol, create_indexed_kernel
from pystencils.backends.cbackend import CustomCppCode
from pystencils.backends.cbackend import CustomCodeNode
from pystencils.boundaries.createindexlist import numpy_data_type_for_boundary_object, create_boundary_index_array
from pystencils.cache import memorycache
from pystencils.data_types import create_type
......@@ -379,7 +379,7 @@ class BoundaryDataSetter:
return self.index_array[item]
class BoundaryOffsetInfo(CustomCppCode):
class BoundaryOffsetInfo(CustomCodeNode):
# --------------------------- Functions to be used by boundaries --------------------------
......
......@@ -61,6 +61,7 @@ from sysconfig import get_paths
from pystencils import FieldType
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.utils import file_handle_for_atomic_write, atomic_file_write
from pystencils.include import get_pystencils_include_path
def make_python_function(kernel_function_node):
......@@ -463,7 +464,7 @@ class KernelWrapper:
def compile_module(code, code_hash, base_dir):
compiler_config = get_compiler_config()
extra_flags = ['-I' + get_paths()['include']]
extra_flags = ['-I' + get_paths()['include'], '-I' + get_pystencils_include_path()]
if compiler_config['os'].lower() == 'windows':
function_prefix = '__declspec(dllexport)'
......@@ -510,7 +511,7 @@ def compile_module(code, code_hash, base_dir):
def compile_and_load(ast):
cache_config = get_cache_config()
code_hash_str = "mod_" + hashlib.sha256(generate_c(ast).encode()).hexdigest()
code_hash_str = "mod_" + hashlib.sha256(generate_c(ast, dialect='c').encode()).hexdigest()
code = ExtensionModuleCode(module_name=code_hash_str)
code.add_function(ast, ast.function_name)
......
import numpy as np
from pystencils.backends.cbackend import generate_c
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.kernelparameters import FieldPointerSymbol
from pystencils.data_types import StructType
from pystencils.field import FieldType
from pystencils.include import get_pystencils_include_path
def make_python_function(kernel_function_node, argument_dict=None):
......@@ -25,12 +26,15 @@ def make_python_function(kernel_function_node, argument_dict=None):
if argument_dict is None:
argument_dict = {}
code = "#include <cstdint>\n"
header_list = ['<stdint.h>'] + list(get_headers(kernel_function_node))
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
code = includes + "\n"
code += "#define FUNC_PREFIX __global__\n"
code += "#define RESTRICT __restrict__\n\n"
code += str(generate_c(kernel_function_node))
mod = SourceModule(code, options=["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"])
code += str(generate_c(kernel_function_node, dialect='cuda'))
mod = SourceModule(code, options=["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"],
include_dirs=[get_pystencils_include_path()])
func = mod.get_function(kernel_function_node.function_name)
parameters = kernel_function_node.get_parameters()
......
import os
def get_pystencils_include_path():
return os.path.dirname(os.path.realpath(__file__))
#include <cstdint>
#ifndef __CUDA_ARCH__
#define QUALIFIERS inline
#else
#define QUALIFIERS static __forceinline__ __device__
#endif
#define PHILOX_W32_0 (0x9E3779B9)
#define PHILOX_W32_1 (0xBB67AE85)
#define PHILOX_M4x32_0 (0xD2511F53)
#define PHILOX_M4x32_1 (0xCD9E8D57)
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
typedef std::uint32_t uint32;
typedef std::uint64_t uint64;
QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
{
#ifndef __CUDA_ARCH__
// host code
uint64 product = ((uint64)a) * ((uint64)b);
*hip = product >> 32;
return (uint32)product;
#else
// device code
*hip = __umulhi(a,b);
return a*b;
#endif
}
QUALIFIERS void _philox4x32round(uint32* ctr, uint32* key)
{
uint32 hi0;
uint32 hi1;
uint32 lo0 = mulhilo32(PHILOX_M4x32_0, ctr[0], &hi0);
uint32 lo1 = mulhilo32(PHILOX_M4x32_1, ctr[2], &hi1);
ctr[0] = hi1^ctr[1]^key[0];
ctr[1] = lo1;
ctr[2] = hi0^ctr[3]^key[1];
ctr[3] = lo0;
}
QUALIFIERS void _philox4x32bumpkey(uint32* key)
{
key[0] += PHILOX_W32_0;
key[1] += PHILOX_W32_1;
}
QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y)
{
unsigned long long z = (unsigned long long)x ^
((unsigned long long)y << (53 - 32));
return z * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0);
}
QUALIFIERS void philox_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, double & rnd1, double & rnd2)
{
uint32 key[2] = {key0, key1};
uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1 = _uniform_double_hq(ctr[0], ctr[1]);
rnd2 = _uniform_double_hq(ctr[2], ctr[3]);
}
QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
float & rnd1, float & rnd2, float & rnd3, float & rnd4)
{
uint32 key[2] = {key0, key1};
uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1 = ctr[0] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd2 = ctr[1] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd3 = ctr[2] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd4 = ctr[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
}
\ No newline at end of file
......@@ -100,7 +100,7 @@ def generate_benchmark(ast, likwid=False):
args = {
'likwid': likwid,
'kernel_code': generate_c(ast),
'kernel_code': generate_c(ast, dialect='c'),
'kernelName': ast.function_name,
'fields': fields,
'constants': constants,
......
import sympy as sp
import numpy as np
from pystencils import TypedSymbol
from pystencils.astnodes import LoopOverCoordinate
from pystencils.backends.cbackend import CustomCodeNode
philox_two_doubles_call = """
{result_symbols[0].dtype} {result_symbols[0].name};
{result_symbols[1].dtype} {result_symbols[1].name};
philox_double2({parameters}, {result_symbols[0].name}, {result_symbols[1].name});
"""
philox_four_floats_call = """
{result_symbols[0].dtype} {result_symbols[0].name};
{result_symbols[1].dtype} {result_symbols[1].name};
{result_symbols[2].dtype} {result_symbols[2].name};
{result_symbols[3].dtype} {result_symbols[3].name};
philox_float4({parameters},
{result_symbols[0].name}, {result_symbols[1].name}, {result_symbols[2].name}, {result_symbols[3].name});
"""
class PhiloxTwoDoubles(CustomCodeNode):
def __init__(self, dim, time_step=TypedSymbol("time_step", np.uint32), keys=(0, 0)):
self.result_symbols = tuple(TypedSymbol(sp.Dummy().name, np.float64) for _ in range(2))
symbols_read = [s for s in keys if isinstance(s, sp.Symbol)]
super().__init__("", symbols_read=symbols_read, symbols_defined=self.result_symbols)
self._time_step = time_step
self.headers = ['"philox_rand.h"']
self.keys = list(keys)
self._args = (time_step, *sp.sympify(keys))
self._dim = dim
@property
def args(self):
return self._args
@property
def undefined_symbols(self):
result = {a for a in self.args if isinstance(a, sp.Symbol)}
loop_counters = [LoopOverCoordinate.get_loop_counter_symbol(i)
for i in range(self._dim)]
result.update(loop_counters)
return result
def get_code(self, dialect, vector_instruction_set):
parameters = [self._time_step] + [LoopOverCoordinate.get_loop_counter_symbol(i)
for i in range(self._dim)] + self.keys
while len(parameters) < 6:
parameters.append(0)
parameters = parameters[:6]
assert len(parameters) == 6
if dialect == 'cuda' or (dialect == 'c' and vector_instruction_set is None):
return philox_two_doubles_call.format(parameters=', '.join(str(p) for p in parameters),
result_symbols=self.result_symbols)
else:
raise NotImplementedError("Not yet implemented for this backend")
def __repr__(self):
return "{}, {} <- PhiloxRNG".format(*self.result_symbols)
class PhiloxFourFloats(CustomCodeNode):
def __init__(self, dim, time_step=TypedSymbol("time_step", np.uint32), keys=(0, 0)):
self.result_symbols = tuple(TypedSymbol(sp.Dummy().name, np.float32) for _ in range(4))
symbols_read = [s for s in keys if isinstance(s, sp.Symbol)]
super().__init__("", symbols_read=symbols_read, symbols_defined=self.result_symbols)
self._time_step = time_step
self.headers = ['"philox_rand.h"']
self.keys = list(keys)
self._args = (time_step, *sp.sympify(keys))
self._dim = dim
@property
def args(self):
return self._args
@property
def undefined_symbols(self):
result = {a for a in self.args if isinstance(a, sp.Symbol)}
loop_counters = [LoopOverCoordinate.get_loop_counter_symbol(i)
for i in range(self._dim)]
result.update(loop_counters)
return result
def get_code(self, dialect, vector_instruction_set):
parameters = [self._time_step] + [LoopOverCoordinate.get_loop_counter_symbol(i)
for i in range(self._dim)] + self.keys
while len(parameters) < 6:
parameters.append(0)
parameters = parameters[:6]
assert len(parameters) == 6
if dialect == 'cuda' or (dialect == 'c' and vector_instruction_set is None):
return philox_four_floats_call.format(parameters=', '.join(str(p) for p in parameters),
result_symbols=self.result_symbols)
else:
raise NotImplementedError("Not yet implemented for this backend")
def __repr__(self):
return "{}, {}, {}, {} <- PhiloxRNG".format(*self.result_symbols)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment