diff --git a/pystencils/astnodes.py b/pystencils/astnodes.py index 441de76aa9b2c7a771ceba7ba4010e67023ac9e7..a6215923d21c32c20338eacf4330350f889aee43 100644 --- a/pystencils/astnodes.py +++ b/pystencils/astnodes.py @@ -1,3 +1,4 @@ +import uuid from typing import Any, List, Optional, Sequence, Set, Union import jinja2 @@ -716,3 +717,7 @@ class DestructuringBindingsForFieldClass(Node): def atoms(self, arg_type) -> Set[Any]: return self.body.atoms(arg_type) | {s for s in self.symbols_defined if isinstance(s, arg_type)} + + +def get_dummy_symbol(dtype='bool'): + return TypedSymbol('dummy%s' % uuid.uuid4().hex, create_type(dtype)) diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py index bb621f8037e561115e132ceede66491e6c581a25..88c68b7ea85b796081589ed642e81d1738716b39 100644 --- a/pystencils/backends/cbackend.py +++ b/pystencils/backends/cbackend.py @@ -29,7 +29,12 @@ __all__ = ['generate_c', 'CustomCodeNode', 'PrintNode', 'get_headers', 'CustomSy KERNCRAFT_NO_TERNARY_MODE = False -def generate_c(ast_node: Node, signature_only: bool = False, dialect='c') -> str: +class UnsupportedCDialect(Exception): + def __init__(self): + super(UnsupportedCDialect, self).__init__() + + +def generate_c(ast_node: Node, signature_only: bool = False, dialect='c', custom_backend=None) -> str: """Prints an abstract syntax tree node as C or CUDA code. This function does not need to distinguish between C, C++ or CUDA code, it just prints 'C-like' code as encoded @@ -49,9 +54,16 @@ def generate_c(ast_node: Node, signature_only: bool = False, dialect='c') -> str ast_node.global_variables.update(d.symbols_defined) else: ast_node.global_variables = d.symbols_defined - printer = CBackend(signature_only=signature_only, - vector_instruction_set=ast_node.instruction_set, - dialect=dialect) + if custom_backend: + printer = custom_backend + elif dialect == 'c': + printer = CBackend(signature_only=signature_only, + vector_instruction_set=ast_node.instruction_set) + elif dialect == 'cuda': + from pystencils.backends.cuda_backend import CudaBackend + printer = CudaBackend(signature_only=signature_only) + else: + raise UnsupportedCDialect code = printer(ast_node) if not signature_only and isinstance(ast_node, KernelFunction): code = "\n" + code @@ -138,9 +150,9 @@ class CBackend: def __init__(self, sympy_printer=None, signature_only=False, vector_instruction_set=None, dialect='c'): if sympy_printer is None: if vector_instruction_set is not None: - self.sympy_printer = VectorizedCustomSympyPrinter(vector_instruction_set, dialect) + self.sympy_printer = VectorizedCustomSympyPrinter(vector_instruction_set) else: - self.sympy_printer = CustomSympyPrinter(dialect) + self.sympy_printer = CustomSympyPrinter() else: self.sympy_printer = sympy_printer @@ -161,12 +173,12 @@ class CBackend: method_name = "_print_" + cls.__name__ if hasattr(self, method_name): return getattr(self, method_name)(node) - raise NotImplementedError("CBackend does not support node of type " + str(type(node))) + raise NotImplementedError(self.__class__ + " does not support node of type " + str(type(node))) def _print_KernelFunction(self, node): function_arguments = ["%s %s" % (str(s.symbol.dtype), s.symbol.name) for s in node.get_parameters()] launch_bounds = "" - if self._dialect == 'cuda': + if self.__class__ == 'cuda': max_threads = node.indexing.max_threads_per_block() if max_threads: launch_bounds = "__launch_bounds__({}) ".format(max_threads) @@ -238,10 +250,7 @@ class CBackend: return "free(%s - %d);" % (self.sympy_printer.doprint(node.symbol.name), node.offset(align)) def _print_SkipIteration(self, _): - if self._dialect == 'cuda': - return "return;" - else: - return "continue;" + return "continue;" def _print_CustomCodeNode(self, node): return node.get_code(self._dialect, self._vector_instruction_set) @@ -289,10 +298,9 @@ class CBackend: # noinspection PyPep8Naming class CustomSympyPrinter(CCodePrinter): - def __init__(self, dialect): + def __init__(self): super(CustomSympyPrinter, self).__init__() self._float_type = create_type("float32") - self._dialect = dialect if 'Min' in self.known_functions: del self.known_functions['Min'] if 'Max' in self.known_functions: @@ -344,22 +352,13 @@ class CustomSympyPrinter(CCodePrinter): else: return "((%s)(%s))" % (data_type, self._print(arg)) elif isinstance(expr, fast_division): - if self._dialect == "cuda": - return "__fdividef(%s, %s)" % tuple(self._print(a) for a in expr.args) - else: - return "({})".format(self._print(expr.args[0] / expr.args[1])) + return "({})".format(self._print(expr.args[0] / expr.args[1])) elif isinstance(expr, fast_sqrt): - if self._dialect == "cuda": - return "__fsqrt_rn(%s)" % tuple(self._print(a) for a in expr.args) - else: - return "({})".format(self._print(sp.sqrt(expr.args[0]))) + return "({})".format(self._print(sp.sqrt(expr.args[0]))) elif isinstance(expr, vec_any) or isinstance(expr, vec_all): return self._print(expr.args[0]) elif isinstance(expr, fast_inv_sqrt): - if self._dialect == "cuda": - return "__frsqrt_rn(%s)" % tuple(self._print(a) for a in expr.args) - else: - return "({})".format(self._print(1 / sp.sqrt(expr.args[0]))) + return "({})".format(self._print(1 / sp.sqrt(expr.args[0]))) elif expr.func in infix_functions: return "(%s %s %s)" % (self._print(expr.args[0]), infix_functions[expr.func], self._print(expr.args[1])) elif expr.func == int_power_of_2: @@ -389,8 +388,8 @@ class CustomSympyPrinter(CCodePrinter): class VectorizedCustomSympyPrinter(CustomSympyPrinter): SummandInfo = namedtuple("SummandInfo", ['sign', 'term']) - def __init__(self, instruction_set, dialect): - super(VectorizedCustomSympyPrinter, self).__init__(dialect=dialect) + def __init__(self, instruction_set): + super(VectorizedCustomSympyPrinter, self).__init__() self.instruction_set = instruction_set def _scalarFallback(self, func_name, expr, *args, **kwargs): diff --git a/pystencils/backends/cuda_backend.py b/pystencils/backends/cuda_backend.py new file mode 100644 index 0000000000000000000000000000000000000000..15fbde91d92c42fd604002770f23efaede31f34d --- /dev/null +++ b/pystencils/backends/cuda_backend.py @@ -0,0 +1,89 @@ + +from os.path import dirname, join + +from pystencils.astnodes import Node +from pystencils.backends.cbackend import (CBackend, CustomSympyPrinter, + generate_c) +from pystencils.fast_approximation import (fast_division, fast_inv_sqrt, + fast_sqrt) + +CUDA_KNOWN_FUNCTIONS = None +with open(join(dirname(__file__), 'cuda_known_functions.txt')) as f: + lines = f.readlines() + CUDA_KNOWN_FUNCTIONS = {l.strip(): l.strip() for l in lines if l} + + +def generate_cuda(astnode: Node, signature_only: bool = False) -> str: + """Prints an abstract syntax tree node as CUDA code. + + Args: + ast_node: + signature_only: + + Returns: + C-like code for the ast node and its descendants + """ + return generate_c(astnode, signature_only, dialect='cuda') + + +class CudaBackend(CBackend): + + def __init__(self, sympy_printer=None, + signature_only=False): + if not sympy_printer: + sympy_printer = CudaSympyPrinter() + + super().__init__(sympy_printer, signature_only, dialect='cuda') + + def _print_SharedMemoryAllocation(self, node): + code = "__shared__ {dtype} {name}[{num_elements}];" + return code.format(dtype=node.symbol.dtype, + name=self.sympy_printer.doprint(node.symbol.name), + num_elements='*'.join([str(s) for s in node.shared_mem.shape])) + + def _print_ThreadBlockSynchronization(self, node): + code = "__synchtreads();" + return code + + def _print_TextureDeclaration(self, node): + code = "texture<%s, cudaTextureType%iD, cudaReadModeElementType> %s;" % ( + str(node.texture.field.dtype), + node.texture.field.spatial_dimensions, + node.texture + ) + return code + + def _print_SkipIteration(self, _): + return "return;" + + +class CudaSympyPrinter(CustomSympyPrinter): + language = "CUDA" + + def __init__(self): + super(CudaSympyPrinter, self).__init__() + self.known_functions = CUDA_KNOWN_FUNCTIONS + + def _print_TextureAccess(self, node): + + if node.texture.cubic_bspline_interpolation: + template = "cubicTex%iDSimple<%s>(%s, %s)" + else: + template = "tex%iD<%s>(%s, %s)" + + code = template % ( + node.texture.field.spatial_dimensions, + str(node.texture.field.dtype), + str(node.texture), + ', '.join(self._print(o) for o in node.offsets) + ) + return code + + def _print_Function(self, expr): + if isinstance(expr, fast_division): + return "__fdividef(%s, %s)" % tuple(self._print(a) for a in expr.args) + elif isinstance(expr, fast_sqrt): + return "__fsqrt_rn(%s)" % tuple(self._print(a) for a in expr.args) + elif isinstance(expr, fast_inv_sqrt): + return "__frsqrt_rn(%s)" % tuple(self._print(a) for a in expr.args) + return super()._print_Function(expr) diff --git a/pystencils/backends/cuda_known_functions.txt b/pystencils/backends/cuda_known_functions.txt new file mode 100644 index 0000000000000000000000000000000000000000..42cf554ada57d9055bddc659d53c4eaff664275a --- /dev/null +++ b/pystencils/backends/cuda_known_functions.txt @@ -0,0 +1,293 @@ +__prof_trigger +printf + +__syncthreads +__syncthreads_count +__syncthreads_and +__syncthreads_or +__syncwarp +__threadfence +__threadfence_block +__threadfence_system + +atomicAdd +atomicSub +atomicExch +atomicMin +atomicMax +atomicInc +atomicDec +atomicAnd +atomicOr +atomicXor +atomicCAS + +__all_sync +__any_sync +__ballot_sync +__active_mask + +__shfl_sync +__shfl_up_sync +__shfl_down_sync +__shfl_xor_sync + +__match_any_sync +__match_all_sync + +__isGlobal +__isShared +__isConstant +__isLocal + +tex1Dfetch +tex1D +tex2D +tex3D + +rsqrtf +cbrtf +rcbrtf +hypotf +rhypotf +norm3df +rnorm3df +norm4df +rnorm4df +normf +rnormf +expf +exp2f +exp10f +expm1f +logf +log2f +log10f +log1pf +sinf +cosf +tanf +sincosf +sinpif +cospif +sincospif +asinf +acosf +atanf +atan2f +sinhf +coshf +tanhf +asinhf +acoshf +atanhf +powf +erff +erfcf +erfinvf +erfcinvf +erfcxf +normcdff +normcdfinvf +lgammaf +tgammaf +fmaf +frexpf +ldexpf +scalbnf +scalblnf +logbf +ilogbf +j0f +j1f +jnf +y0f +y1f +ynf +cyl_bessel_i0f +cyl_bessel_i1f +fmodf +remainderf +remquof +modff +fdimf +truncf +roundf +rintf +nearbyintf +ceilf +floorf +lrintf +lroundf +llrintf +llroundf + +sqrt +rsqrt +cbrt +rcbrt +hypot +rhypot +norm3d +rnorm3d +norm4d +rnorm4d +norm +rnorm +exp +exp2 +exp10 +expm1 +log +log2 +log10 +log1p +sin +cos +tan +sincos +sinpi +cospi +sincospi +asin +acos +atan +atan2 +sinh +cosh +tanh +asinh +acosh +atanh +pow +erf +erfc +erfinv +erfcinv +erfcx +normcdf +normcdfinv +lgamma +tgamma +fma +frexp +ldexp +scalbn +scalbln +logb +ilogb +j0 +j1 +jn +y0 +y1 +yn +cyl_bessel_i0 +cyl_bessel_i1 +fmod +remainder +remquo +mod +fdim +trunc +round +rint +nearbyint +ceil +floor +lrint +lround +llrint +llround + +__fdividef +__sinf +__cosf +__tanf +__sincosf +__logf +__log2f +__log10f +__expf +__exp10f +__powf + +__fadd_rn +__fsub_rn +__fmul_rn +__fmaf_rn +__frcp_rn +__fsqrt_rn +__frsqrt_rn +__fdiv_rn + +__fadd_rz +__fsub_rz +__fmul_rz +__fmaf_rz +__frcp_rz +__fsqrt_rz +__frsqrt_rz +__fdiv_rz + +__fadd_ru +__fsub_ru +__fmul_ru +__fmaf_ru +__frcp_ru +__fsqrt_ru +__frsqrt_ru +__fdiv_ru + +__fadd_rd +__fsub_rd +__fmul_rd +__fmaf_rd +__frcp_rd +__fsqrt_rd +__frsqrt_rd +__fdiv_rd + +__fdividef +__expf +__exp10f +__logf +__log2f +__log10f +__sinf +__cosf +__sincosf +__tanf +__powf + +__dadd_rn +__dsub_rn +__dmul_rn +__fma_rn +__ddiv_rn +__drcp_rn +__dsqrt_rn + +__dadd_rz +__dsub_rz +__dmul_rz +__fma_rz +__ddiv_rz +__drcp_rz +__dsqrt_rz + +__dadd_ru +__dsub_ru +__dmul_ru +__fma_ru +__ddiv_ru +__drcp_ru +__dsqrt_ru + +__dadd_rd +__dsub_rd +__dmul_rd +__fma_rd +__ddiv_rd +__drcp_rd +__dsqrt_rd diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py index 3809054ab03056943ecec5167fc935422cf8cfc4..d93955751d48c85b8da06dc84281e2ac7992d328 100644 --- a/pystencils/cpu/cpujit.py +++ b/pystencils/cpu/cpujit.py @@ -57,13 +57,14 @@ from tempfile import TemporaryDirectory import numpy as np from appdirs import user_cache_dir, user_config_dir +from pystencils import FieldType from pystencils.backends.cbackend import generate_c, get_headers -from pystencils.field import FieldType from pystencils.include import get_pystencils_include_path -from pystencils.utils import atomic_file_write, file_handle_for_atomic_write, recursive_dict_update +from pystencils.utils import (atomic_file_write, file_handle_for_atomic_write, + recursive_dict_update) -def make_python_function(kernel_function_node): +def make_python_function(kernel_function_node, custom_backend=None): """ Creates C code from the abstract syntax tree, compiles it and makes it accessible as Python function @@ -74,7 +75,7 @@ def make_python_function(kernel_function_node): :param kernel_function_node: the abstract syntax tree :return: kernel functor """ - result = compile_and_load(kernel_function_node) + result = compile_and_load(kernel_function_node, custom_backend) return result @@ -433,11 +434,12 @@ def run_compile_step(command): class ExtensionModuleCode: - def __init__(self, module_name='generated'): + def __init__(self, module_name='generated', custom_backend=None): self.module_name = module_name self._ast_nodes = [] self._function_names = [] + self._custom_backend = custom_backend def add_function(self, ast, name=None): self._ast_nodes.append(ast) @@ -461,7 +463,7 @@ class ExtensionModuleCode: for ast, name in zip(self._ast_nodes, self._function_names): old_name = ast.function_name ast.function_name = "kernel_" + name - print(generate_c(ast), file=file) + print(generate_c(ast, custom_backend=self._custom_backend), file=file) print(create_function_boilerplate_code(ast.get_parameters(), name), file=file) ast.function_name = old_name print(create_module_boilerplate_code(self.module_name, self._function_names), file=file) @@ -524,10 +526,11 @@ def compile_module(code, code_hash, base_dir): return lib_file -def compile_and_load(ast): +def compile_and_load(ast, custom_backend=None): cache_config = get_cache_config() - code_hash_str = "mod_" + hashlib.sha256(generate_c(ast, dialect='c').encode()).hexdigest() - code = ExtensionModuleCode(module_name=code_hash_str) + code_hash_str = "mod_" + hashlib.sha256(generate_c(ast, dialect='c', + custom_backend=custom_backend).encode()).hexdigest() + code = ExtensionModuleCode(module_name=code_hash_str, custom_backend=custom_backend) code.add_function(ast, ast.function_name) if cache_config['object_cache'] is False: diff --git a/pystencils/display_utils.py b/pystencils/display_utils.py index 77eca2e86034d2ab86ea1d506ac367c761ea3b0f..8cdaa4820444cedd1c4cbf8f2db7d7391e3e6344 100644 --- a/pystencils/display_utils.py +++ b/pystencils/display_utils.py @@ -34,7 +34,7 @@ def highlight_cpp(code: str): return HTML(highlight(code, CppLexer(), HtmlFormatter())) -def show_code(ast: KernelFunction): +def show_code(ast: KernelFunction, custom_backend=None): """Returns an object to display generated code (C/C++ or CUDA) Can either be displayed as HTML in Jupyter notebooks or printed as normal string. @@ -47,11 +47,11 @@ def show_code(ast: KernelFunction): self.ast = ast_input def _repr_html_(self): - return highlight_cpp(generate_c(self.ast, dialect=dialect)).__html__() + return highlight_cpp(generate_c(self.ast, dialect=dialect, custom_backend=custom_backend)).__html__() def __str__(self): - return generate_c(self.ast, dialect=dialect) + return generate_c(self.ast, dialect=dialect, custom_backend=custom_backend) def __repr__(self): - return generate_c(self.ast, dialect=dialect) + return generate_c(self.ast, dialect=dialect, custom_backend=custom_backend) return CodeDisplay(ast) diff --git a/pystencils/gpucuda/cudajit.py b/pystencils/gpucuda/cudajit.py index 40dbfda8fc9e125d291b71a8c134f111b9d16b89..92738e1e7df9c35dca87ff5f3de84ceeb5bcad8a 100644 --- a/pystencils/gpucuda/cudajit.py +++ b/pystencils/gpucuda/cudajit.py @@ -9,7 +9,7 @@ from pystencils.kernelparameters import FieldPointerSymbol USE_FAST_MATH = True -def make_python_function(kernel_function_node, argument_dict=None): +def make_python_function(kernel_function_node, argument_dict=None, custom_backend=None): """ Creates a kernel function from an abstract syntax tree which was created e.g. by :func:`pystencils.gpucuda.create_cuda_kernel` @@ -35,7 +35,7 @@ def make_python_function(kernel_function_node, argument_dict=None): code = includes + "\n" code += "#define FUNC_PREFIX __global__\n" code += "#define RESTRICT __restrict__\n\n" - code += str(generate_c(kernel_function_node, dialect='cuda')) + code += str(generate_c(kernel_function_node, dialect='cuda', custom_backend=custom_backend)) options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"] if USE_FAST_MATH: options.append("-use_fast_math") diff --git a/pystencils_tests/test_cuda_known_functions.py b/pystencils_tests/test_cuda_known_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..c249c144b128c28978bc97ae15438fb6a0bda799 --- /dev/null +++ b/pystencils_tests/test_cuda_known_functions.py @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- +# +# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de> +# +# Distributed under terms of the GPLv3 license. + +""" + +""" +import sympy + +import pystencils +from pystencils.astnodes import get_dummy_symbol +from pystencils.backends.cuda_backend import CudaSympyPrinter +from pystencils.data_types import address_of + + +def test_cuda_known_functions(): + printer = CudaSympyPrinter() + print(printer.known_functions) + + x, y = pystencils.fields('x,y: float32 [2d]') + + assignments = pystencils.AssignmentCollection({ + get_dummy_symbol(): sympy.Function('atomicAdd')(address_of(y.center()), 2), + y.center(): sympy.Function('rsqrtf')(x[0, 0]) + }) + + ast = pystencils.create_kernel(assignments, 'gpu') + print(pystencils.show_code(ast)) + kernel = ast.compile() + assert(kernel is not None) + + +def test_cuda_but_not_c(): + x, y = pystencils.fields('x,y: float32 [2d]') + + assignments = pystencils.AssignmentCollection({ + get_dummy_symbol(): sympy.Function('atomicAdd')(address_of(y.center()), 2), + y.center(): sympy.Function('rsqrtf')(x[0, 0]) + }) + + ast = pystencils.create_kernel(assignments, 'cpu') + code = str(pystencils.show_code(ast)) + assert "Not supported" in code + + +def test_cuda_unknown(): + x, y = pystencils.fields('x,y: float32 [2d]') + + assignments = pystencils.AssignmentCollection({ + get_dummy_symbol(): sympy.Function('wtf')(address_of(y.center()), 2), + }) + + ast = pystencils.create_kernel(assignments, 'gpu') + code = str(pystencils.show_code(ast)) + print(code) + assert "Not supported in CUDA" in code + + +def main(): + test_cuda_known_functions() + test_cuda_but_not_c() + test_cuda_unknown() + + +if __name__ == '__main__': + main() diff --git a/pystencils_tests/test_custom_backends.py b/pystencils_tests/test_custom_backends.py new file mode 100644 index 0000000000000000000000000000000000000000..f68696f137c4438663ce8e3e1a15eaab6ae655a2 --- /dev/null +++ b/pystencils_tests/test_custom_backends.py @@ -0,0 +1,60 @@ +# -*- coding: utf-8 -*- +# +# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de> +# +# Distributed under terms of the GPLv3 license. + +""" + +""" +from subprocess import CalledProcessError + +import pycuda.driver +import pytest +import sympy + +import pystencils +import pystencils.cpu.cpujit +import pystencils.gpucuda.cudajit +from pystencils.backends.cbackend import CBackend +from pystencils.backends.cuda_backend import CudaBackend + + +class ScreamingBackend(CBackend): + + def _print(self, node): + normal_code = super()._print(node) + return normal_code.upper() + + +class ScreamingGpuBackend(CudaBackend): + + def _print(self, node): + normal_code = super()._print(node) + return normal_code.upper() + + +def test_custom_backends(): + z, x, y = pystencils.fields("z, y, x: [2d]") + + normal_assignments = pystencils.AssignmentCollection([pystencils.Assignment( + z[0, 0], x[0, 0] * sympy.log(x[0, 0] * y[0, 0]))], []) + + ast = pystencils.create_kernel(normal_assignments, target='cpu') + print(pystencils.show_code(ast, ScreamingBackend())) + with pytest.raises(CalledProcessError): + pystencils.cpu.cpujit.make_python_function(ast, custom_backend=ScreamingBackend()) + + ast = pystencils.create_kernel(normal_assignments, target='gpu') + print(pystencils.show_code(ast, ScreamingGpuBackend())) + with pytest.raises(pycuda.driver.CompileError): + pystencils.gpucuda.cudajit.make_python_function(ast, custom_backend=ScreamingGpuBackend()) + + +def main(): + + test_custom_backends() + + +if __name__ == "__main__": + main()