diff --git a/__init__.py b/__init__.py index d7ede400e193784e4c0b0c4c8ab542737f56ecb6..c44788a53a65a4459bccdbcd5df2583ee6ad835c 100644 --- a/__init__.py +++ b/__init__.py @@ -13,6 +13,7 @@ from .kernel_decorator import kernel from .stencils import visualize_stencil_expression from . import fd + __all__ = ['Field', 'FieldType', 'fields', 'TypedSymbol', 'make_slice', diff --git a/backends/cbackend.py b/backends/cbackend.py index a11ef1305356f9897f0a9b944cb371874f239b74..83dfc48ff65aa88bee5abe2be26154f978c9dc01 100644 --- a/backends/cbackend.py +++ b/backends/cbackend.py @@ -14,10 +14,10 @@ from pystencils.astnodes import Node, KernelFunction from pystencils.data_types import create_type, PointerType, get_type_of_expression, VectorType, cast_func, \ vector_memory_access -__all__ = ['generate_c', 'CustomCppCode', 'PrintNode', 'get_headers', 'CustomSympyPrinter'] +__all__ = ['generate_c', 'CustomCodeNode', 'PrintNode', 'get_headers', 'CustomSympyPrinter'] -def generate_c(ast_node: Node, signature_only: bool = False) -> str: +def generate_c(ast_node: Node, signature_only: bool = False, dialect='c') -> str: """Prints an abstract syntax tree node as C or CUDA code. This function does not need to distinguish between C, C++ or CUDA code, it just prints 'C-like' code as encoded @@ -27,12 +27,13 @@ def generate_c(ast_node: Node, signature_only: bool = False) -> str: Args: ast_node: signature_only: - + dialect: 'c' or 'cuda' Returns: C-like code for the ast node and its descendants """ printer = CBackend(signature_only=signature_only, - vector_instruction_set=ast_node.instruction_set) + vector_instruction_set=ast_node.instruction_set, + dialect=dialect) return printer(ast_node) @@ -55,16 +56,15 @@ def get_headers(ast_node: Node) -> Set[str]: # --------------------------------------- Backend Specific Nodes ------------------------------------------------------- -class CustomCppCode(Node): +class CustomCodeNode(Node): def __init__(self, code, symbols_read, symbols_defined, parent=None): - super(CustomCppCode, self).__init__(parent=parent) + super(CustomCodeNode, self).__init__(parent=parent) self._code = "\n" + code self._symbolsRead = set(symbols_read) self._symbolsDefined = set(symbols_defined) self.headers = [] - @property - def code(self): + def get_code(self, dialect, vector_instruction_set): return self._code @property @@ -80,7 +80,7 @@ class CustomCppCode(Node): return self.symbols_defined - self._symbolsRead -class PrintNode(CustomCppCode): +class PrintNode(CustomCodeNode): # noinspection SpellCheckingInspection def __init__(self, symbol_to_print): code = '\nstd::cout << "%s = " << %s << std::endl; \n' % (symbol_to_print.name, symbol_to_print.name) @@ -95,7 +95,7 @@ class PrintNode(CustomCppCode): class CBackend: def __init__(self, sympy_printer=None, - signature_only=False, vector_instruction_set=None): + signature_only=False, vector_instruction_set=None, dialect='c'): if sympy_printer is None: if vector_instruction_set is not None: self.sympy_printer = VectorizedCustomSympyPrinter(vector_instruction_set) @@ -104,13 +104,14 @@ class CBackend: else: self.sympy_printer = sympy_printer - self._vectorInstructionSet = vector_instruction_set + self._vector_instruction_set = vector_instruction_set self._indent = " " + self._dialect = dialect self._signatureOnly = signature_only def __call__(self, node): prev_is = VectorType.instruction_set - VectorType.instruction_set = self._vectorInstructionSet + VectorType.instruction_set = self._vector_instruction_set result = str(self._print(node)) VectorType.instruction_set = prev_is return result @@ -120,7 +121,6 @@ class CBackend: method_name = "_print_" + cls.__name__ if hasattr(self, method_name): return getattr(self, method_name)(node) - raise NotImplementedError("CBackend does not support node of type " + str(type(node))) def _print_KernelFunction(self, node): @@ -170,8 +170,8 @@ class CBackend: else: rhs = node.rhs - return self._vectorInstructionSet[instr].format("&" + self.sympy_printer.doprint(node.lhs.args[0]), - self.sympy_printer.doprint(rhs)) + ';' + return self._vector_instruction_set[instr].format("&" + self.sympy_printer.doprint(node.lhs.args[0]), + self.sympy_printer.doprint(rhs)) + ';' else: return "%s = %s;" % (self.sympy_printer.doprint(node.lhs), self.sympy_printer.doprint(node.rhs)) @@ -191,9 +191,8 @@ class CBackend: align = 64 return "free(%s - %d);" % (self.sympy_printer.doprint(node.symbol.name), node.offset(align)) - @staticmethod - def _print_CustomCppCode(node): - return node.code + def _print_CustomCodeNode(self, node): + return node.get_code(self._dialect, self._vector_instruction_set) def _print_Conditional(self, node): condition_expr = self.sympy_printer.doprint(node.condition_expr) diff --git a/boundaries/boundaryhandling.py b/boundaries/boundaryhandling.py index 69c992c3b6d5a1ec32425496625b2e08fd3339d2..384c10d4ff30b0ad983fac2dabd28a72df54a979 100644 --- a/boundaries/boundaryhandling.py +++ b/boundaries/boundaryhandling.py @@ -2,7 +2,7 @@ import numpy as np import sympy as sp from pystencils.assignment import Assignment from pystencils import Field, TypedSymbol, create_indexed_kernel -from pystencils.backends.cbackend import CustomCppCode +from pystencils.backends.cbackend import CustomCodeNode from pystencils.boundaries.createindexlist import numpy_data_type_for_boundary_object, create_boundary_index_array from pystencils.cache import memorycache from pystencils.data_types import create_type @@ -379,7 +379,7 @@ class BoundaryDataSetter: return self.index_array[item] -class BoundaryOffsetInfo(CustomCppCode): +class BoundaryOffsetInfo(CustomCodeNode): # --------------------------- Functions to be used by boundaries -------------------------- diff --git a/cpu/cpujit.py b/cpu/cpujit.py index d5c38629be18bf239d1be5d4c8d56edd620e3fdc..56cd2cc3a728e36a2aca585599fcc7066b226051 100644 --- a/cpu/cpujit.py +++ b/cpu/cpujit.py @@ -61,6 +61,7 @@ from sysconfig import get_paths from pystencils import FieldType from pystencils.backends.cbackend import generate_c, get_headers from pystencils.utils import file_handle_for_atomic_write, atomic_file_write +from pystencils.include import get_pystencils_include_path def make_python_function(kernel_function_node): @@ -463,7 +464,7 @@ class KernelWrapper: def compile_module(code, code_hash, base_dir): compiler_config = get_compiler_config() - extra_flags = ['-I' + get_paths()['include']] + extra_flags = ['-I' + get_paths()['include'], '-I' + get_pystencils_include_path()] if compiler_config['os'].lower() == 'windows': function_prefix = '__declspec(dllexport)' @@ -510,7 +511,7 @@ def compile_module(code, code_hash, base_dir): def compile_and_load(ast): cache_config = get_cache_config() - code_hash_str = "mod_" + hashlib.sha256(generate_c(ast).encode()).hexdigest() + code_hash_str = "mod_" + hashlib.sha256(generate_c(ast, dialect='c').encode()).hexdigest() code = ExtensionModuleCode(module_name=code_hash_str) code.add_function(ast, ast.function_name) diff --git a/gpucuda/cudajit.py b/gpucuda/cudajit.py index 8383813a41f65c0db60940ac03dc999e2eb8853e..ee40bd0c5ae81130151fa9c0b0e083a335ef6d3c 100644 --- a/gpucuda/cudajit.py +++ b/gpucuda/cudajit.py @@ -1,8 +1,9 @@ import numpy as np -from pystencils.backends.cbackend import generate_c +from pystencils.backends.cbackend import generate_c, get_headers from pystencils.kernelparameters import FieldPointerSymbol from pystencils.data_types import StructType from pystencils.field import FieldType +from pystencils.include import get_pystencils_include_path def make_python_function(kernel_function_node, argument_dict=None): @@ -25,12 +26,15 @@ def make_python_function(kernel_function_node, argument_dict=None): if argument_dict is None: argument_dict = {} - code = "#include <cstdint>\n" + header_list = ['<stdint.h>'] + list(get_headers(kernel_function_node)) + includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list]) + + code = includes + "\n" code += "#define FUNC_PREFIX __global__\n" code += "#define RESTRICT __restrict__\n\n" - code += str(generate_c(kernel_function_node)) - - mod = SourceModule(code, options=["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"]) + code += str(generate_c(kernel_function_node, dialect='cuda')) + mod = SourceModule(code, options=["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"], + include_dirs=[get_pystencils_include_path()]) func = mod.get_function(kernel_function_node.function_name) parameters = kernel_function_node.get_parameters() diff --git a/include/__init__.py b/include/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..03a19735db12d1de30f46104f85c0cb9f7f6da20 --- /dev/null +++ b/include/__init__.py @@ -0,0 +1,5 @@ +import os + + +def get_pystencils_include_path(): + return os.path.dirname(os.path.realpath(__file__)) diff --git a/include/philox_rand.h b/include/philox_rand.h new file mode 100644 index 0000000000000000000000000000000000000000..bbf1c1ac0ed015539743e4f027ed875209d08f61 --- /dev/null +++ b/include/philox_rand.h @@ -0,0 +1,104 @@ +#include <cstdint> + +#ifndef __CUDA_ARCH__ +#define QUALIFIERS inline +#else +#define QUALIFIERS static __forceinline__ __device__ +#endif + +#define PHILOX_W32_0 (0x9E3779B9) +#define PHILOX_W32_1 (0xBB67AE85) +#define PHILOX_M4x32_0 (0xD2511F53) +#define PHILOX_M4x32_1 (0xCD9E8D57) +#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16) +#define TWOPOW32_INV_FLOAT (2.3283064e-10f) + +typedef std::uint32_t uint32; +typedef std::uint64_t uint64; + + +QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip) +{ +#ifndef __CUDA_ARCH__ + // host code + uint64 product = ((uint64)a) * ((uint64)b); + *hip = product >> 32; + return (uint32)product; +#else + // device code + *hip = __umulhi(a,b); + return a*b; +#endif +} + +QUALIFIERS void _philox4x32round(uint32* ctr, uint32* key) +{ + uint32 hi0; + uint32 hi1; + uint32 lo0 = mulhilo32(PHILOX_M4x32_0, ctr[0], &hi0); + uint32 lo1 = mulhilo32(PHILOX_M4x32_1, ctr[2], &hi1); + + ctr[0] = hi1^ctr[1]^key[0]; + ctr[1] = lo1; + ctr[2] = hi0^ctr[3]^key[1]; + ctr[3] = lo0; +} + +QUALIFIERS void _philox4x32bumpkey(uint32* key) +{ + key[0] += PHILOX_W32_0; + key[1] += PHILOX_W32_1; +} + +QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y) +{ + unsigned long long z = (unsigned long long)x ^ + ((unsigned long long)y << (53 - 32)); + return z * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0); +} + + +QUALIFIERS void philox_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3, + uint32 key0, uint32 key1, double & rnd1, double & rnd2) +{ + uint32 key[2] = {key0, key1}; + uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3}; + _philox4x32round(ctr, key); // 1 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10 + + rnd1 = _uniform_double_hq(ctr[0], ctr[1]); + rnd2 = _uniform_double_hq(ctr[2], ctr[3]); +} + + + +QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3, + uint32 key0, uint32 key1, + float & rnd1, float & rnd2, float & rnd3, float & rnd4) +{ + uint32 key[2] = {key0, key1}; + uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3}; + _philox4x32round(ctr, key); // 1 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10 + + rnd1 = ctr[0] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f); + rnd2 = ctr[1] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f); + rnd3 = ctr[2] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f); + rnd4 = ctr[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f); +} \ No newline at end of file diff --git a/kerncraft_coupling/generate_benchmark.py b/kerncraft_coupling/generate_benchmark.py index ed0e65999efd15a08e4f95de46913640033f1644..2c27f3b0e7e5412fb7cdda248cf1072b8e835ef7 100644 --- a/kerncraft_coupling/generate_benchmark.py +++ b/kerncraft_coupling/generate_benchmark.py @@ -100,7 +100,7 @@ def generate_benchmark(ast, likwid=False): args = { 'likwid': likwid, - 'kernel_code': generate_c(ast), + 'kernel_code': generate_c(ast, dialect='c'), 'kernelName': ast.function_name, 'fields': fields, 'constants': constants, diff --git a/random.py b/random.py new file mode 100644 index 0000000000000000000000000000000000000000..bb3f7e29dde17e74ed4cf6548ccd16875cd5624c --- /dev/null +++ b/random.py @@ -0,0 +1,111 @@ +import sympy as sp +import numpy as np +from pystencils import TypedSymbol +from pystencils.astnodes import LoopOverCoordinate +from pystencils.backends.cbackend import CustomCodeNode + +philox_two_doubles_call = """ +{result_symbols[0].dtype} {result_symbols[0].name}; +{result_symbols[1].dtype} {result_symbols[1].name}; +philox_double2({parameters}, {result_symbols[0].name}, {result_symbols[1].name}); +""" + +philox_four_floats_call = """ +{result_symbols[0].dtype} {result_symbols[0].name}; +{result_symbols[1].dtype} {result_symbols[1].name}; +{result_symbols[2].dtype} {result_symbols[2].name}; +{result_symbols[3].dtype} {result_symbols[3].name}; +philox_float4({parameters}, + {result_symbols[0].name}, {result_symbols[1].name}, {result_symbols[2].name}, {result_symbols[3].name}); + +""" + + +class PhiloxTwoDoubles(CustomCodeNode): + + def __init__(self, dim, time_step=TypedSymbol("time_step", np.uint32), keys=(0, 0)): + self.result_symbols = tuple(TypedSymbol(sp.Dummy().name, np.float64) for _ in range(2)) + + symbols_read = [s for s in keys if isinstance(s, sp.Symbol)] + super().__init__("", symbols_read=symbols_read, symbols_defined=self.result_symbols) + self._time_step = time_step + self.headers = ['"philox_rand.h"'] + self.keys = list(keys) + self._args = (time_step, *sp.sympify(keys)) + self._dim = dim + + @property + def args(self): + return self._args + + @property + def undefined_symbols(self): + result = {a for a in self.args if isinstance(a, sp.Symbol)} + loop_counters = [LoopOverCoordinate.get_loop_counter_symbol(i) + for i in range(self._dim)] + result.update(loop_counters) + return result + + def get_code(self, dialect, vector_instruction_set): + parameters = [self._time_step] + [LoopOverCoordinate.get_loop_counter_symbol(i) + for i in range(self._dim)] + self.keys + + while len(parameters) < 6: + parameters.append(0) + parameters = parameters[:6] + + assert len(parameters) == 6 + + if dialect == 'cuda' or (dialect == 'c' and vector_instruction_set is None): + return philox_two_doubles_call.format(parameters=', '.join(str(p) for p in parameters), + result_symbols=self.result_symbols) + else: + raise NotImplementedError("Not yet implemented for this backend") + + def __repr__(self): + return "{}, {} <- PhiloxRNG".format(*self.result_symbols) + + +class PhiloxFourFloats(CustomCodeNode): + + def __init__(self, dim, time_step=TypedSymbol("time_step", np.uint32), keys=(0, 0)): + self.result_symbols = tuple(TypedSymbol(sp.Dummy().name, np.float32) for _ in range(4)) + symbols_read = [s for s in keys if isinstance(s, sp.Symbol)] + + super().__init__("", symbols_read=symbols_read, symbols_defined=self.result_symbols) + self._time_step = time_step + self.headers = ['"philox_rand.h"'] + self.keys = list(keys) + self._args = (time_step, *sp.sympify(keys)) + self._dim = dim + + @property + def args(self): + return self._args + + @property + def undefined_symbols(self): + result = {a for a in self.args if isinstance(a, sp.Symbol)} + loop_counters = [LoopOverCoordinate.get_loop_counter_symbol(i) + for i in range(self._dim)] + result.update(loop_counters) + return result + + def get_code(self, dialect, vector_instruction_set): + parameters = [self._time_step] + [LoopOverCoordinate.get_loop_counter_symbol(i) + for i in range(self._dim)] + self.keys + + while len(parameters) < 6: + parameters.append(0) + parameters = parameters[:6] + + assert len(parameters) == 6 + + if dialect == 'cuda' or (dialect == 'c' and vector_instruction_set is None): + return philox_four_floats_call.format(parameters=', '.join(str(p) for p in parameters), + result_symbols=self.result_symbols) + else: + raise NotImplementedError("Not yet implemented for this backend") + + def __repr__(self): + return "{}, {}, {}, {} <- PhiloxRNG".format(*self.result_symbols)