diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py
index 00fe25c787035ca00ce4fb9a890046786ff4463e..99ce2d6fbfbb31f87fdc90aec22fa203386935fe 100644
--- a/pystencils/cpu/cpujit.py
+++ b/pystencils/cpu/cpujit.py
@@ -134,11 +134,22 @@ def create_folder(path, is_file):
         pass
 
 
+def get_llc_command():
+    """Try to get an executable for LLVM's IR compiler llc.
+
+    We check whether one of the following is in the PATH: llc, llc-10, llc-9, llc-8, llc-7, llc-6
+    """
+    candidates = ['llc', 'llc-10', 'llc-9', 'llc-8', 'llc-7', 'llc-6']
+    found_executables = (e for e in candidates if shutil.which(e))
+    return next(found_executables, None)
+
+
 def read_config():
     if platform.system().lower() == 'linux':
         default_compiler_config = OrderedDict([
             ('os', 'linux'),
             ('command', 'g++'),
+            ('llc_command', get_llc_command() or 'llc'),
             ('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'),
             ('restrict_qualifier', '__restrict__')
         ])
@@ -146,6 +157,7 @@ def read_config():
         default_compiler_config = OrderedDict([
             ('os', 'windows'),
             ('msvc_version', 'latest'),
+            ('llc_command', get_llc_command() or 'llc'),
             ('arch', 'x64'),
             ('flags', '/Ox /fp:fast /openmp /arch:avx'),
             ('restrict_qualifier', '__restrict')
         ])
@@ -154,6 +166,7 @@ def read_config():
         default_compiler_config = OrderedDict([
             ('os', 'darwin'),
             ('command', 'clang++'),
+            ('llc_command', get_llc_command() or 'llc'),
             ('flags', '-Ofast -DNDEBUG -fPIC -march=native -Xclang -fopenmp -std=c++11'),
             ('restrict_qualifier', '__restrict__')
         ])
diff --git a/pystencils/data_types.py b/pystencils/data_types.py
index faa8867188b36ae3c3f355b09b2dc41d99a98086..fc7f8a4f0e916922297a7314d432627503ba8464 100644
--- a/pystencils/data_types.py
+++ b/pystencils/data_types.py
@@ -331,7 +331,7 @@ def ctypes_from_llvm(data_type):
         raise NotImplementedError('Data type %s of %s is not supported yet' % (type(data_type), data_type))
 
 
-def to_llvm_type(data_type):
+def to_llvm_type(data_type, nvvm_target=False):
     """
     Transforms a given type into ctypes
     :param data_type: Subclass of Type
@@ -340,7 +340,7 @@ def to_llvm_type(data_type):
     if not ir:
         raise _ir_importerror
     if isinstance(data_type, PointerType):
-        return to_llvm_type(data_type.base_type).as_pointer()
+        return to_llvm_type(data_type.base_type).as_pointer(1 if nvvm_target else 0)
     else:
         return to_llvm_type.map[data_type.numpy_dtype]
diff --git a/pystencils/gpucuda/indexing.py b/pystencils/gpucuda/indexing.py
index 4c8701b2599dd9b4570c7d7f8875e0e6b58bef36..eb212119ac8505ac5bf4db3112ef645b8e793f80 100644
--- a/pystencils/gpucuda/indexing.py
+++ b/pystencils/gpucuda/indexing.py
@@ -24,10 +24,10 @@ class ThreadIndexingSymbol(TypedSymbol):
     __xnew_cached_ = staticmethod(cacheit(__new_stage2__))
 
 
-BLOCK_IDX = [ThreadIndexingSymbol("blockIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
-THREAD_IDX = [ThreadIndexingSymbol("threadIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
-BLOCK_DIM = [ThreadIndexingSymbol("blockDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
-GRID_DIM = [ThreadIndexingSymbol("gridDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
+BLOCK_IDX = [ThreadIndexingSymbol("blockIdx." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
+THREAD_IDX = [ThreadIndexingSymbol("threadIdx." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
+BLOCK_DIM = [ThreadIndexingSymbol("blockDim." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
+GRID_DIM = [ThreadIndexingSymbol("gridDim." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
 
 
 class AbstractIndexing(abc.ABC):
diff --git a/pystencils/llvm/kernelcreation.py b/pystencils/llvm/kernelcreation.py
index 38ac7fe6be818729eae3a935a4c14003f066e849..57e5b73876b643196f5529d05df41a11a1ef01e5 100644
--- a/pystencils/llvm/kernelcreation.py
+++ b/pystencils/llvm/kernelcreation.py
@@ -3,7 +3,7 @@ from pystencils.transformations import insert_casts
 
 
 def create_kernel(assignments, function_name="kernel", type_info=None, split_groups=(),
-                  iteration_slice=None, ghost_layers=None):
+                  iteration_slice=None, ghost_layers=None, target='cpu'):
     """
     Creates an abstract syntax tree for a kernel function, by taking a list of update rules.
 
@@ -25,9 +25,21 @@ def create_kernel(assignments, function_name="kernel", type_info=None, split_gro
 
     :return: :class:`pystencils.ast.KernelFunction` node
     """
-    from pystencils.cpu import create_kernel
-    code = create_kernel(assignments, function_name, type_info, split_groups, iteration_slice, ghost_layers)
+    if target == 'cpu':
+        from pystencils.cpu import create_kernel
+        code = create_kernel(assignments, function_name, type_info, split_groups, iteration_slice, ghost_layers)
+        code._backend = 'llvm'
+    elif target == 'gpu':
+        from pystencils.gpucuda.kernelcreation import create_cuda_kernel
+        code = create_cuda_kernel(assignments,
+                                  function_name,
+                                  type_info,
+                                  iteration_slice=iteration_slice,
+                                  ghost_layers=ghost_layers)
+        code._backend = 'llvm_gpu'
+    else:
+        raise NotImplementedError()
     code.body = insert_casts(code.body)
     code._compile_function = make_python_function
-    code._backend = 'llvm'
+
     return code
diff --git a/pystencils/llvm/llvm.py b/pystencils/llvm/llvm.py
index cf2207c0831f0d365033dc25ee4c3c701af9940e..1d5223e9509f274e002d755fa19a1a56f9ec016e 100644
--- a/pystencils/llvm/llvm.py
+++ b/pystencils/llvm/llvm.py
@@ -1,6 +1,7 @@
 import functools
 
 import llvmlite.ir as ir
+import llvmlite.llvmpy.core as lc
 import sympy as sp
 from sympy import Indexed, S
 from sympy.printing.printer import Printer
@@ -12,13 +13,39 @@ from pystencils.data_types import (
 from pystencils.llvm.control_flow import Loop
 
 
-def generate_llvm(ast_node, module=None, builder=None):
+# From Numba
+def set_cuda_kernel(lfunc):
+    from llvmlite.llvmpy.core import MetaData, MetaDataString, Constant, Type
+
+    m = lfunc.module
+
+    ops = lfunc, MetaDataString.get(m, "kernel"), Constant.int(Type.int(), 1)
+    md = MetaData.get(m, ops)
+
+    nmd = m.get_or_insert_named_metadata('nvvm.annotations')
+    nmd.add(md)
+
+    # set nvvm ir version
+    i32 = ir.IntType(32)
+    md_ver = m.add_metadata([i32(1), i32(2), i32(2), i32(0)])
+    m.add_named_metadata('nvvmir.version', md_ver)
+
+
+# From Numba
+def _call_sreg(builder, name):
+    module = builder.module
+    fnty = lc.Type.function(lc.Type.int(), ())
+    fn = module.get_or_insert_function(fnty, name=name)
+    return builder.call(fn, ())
+
+
+def generate_llvm(ast_node, module=None, builder=None, target='cpu'):
     """Prints the ast as llvm code."""
     if module is None:
-        module = ir.Module()
+        module = lc.Module()
     if builder is None:
         builder = ir.IRBuilder()
-    printer = LLVMPrinter(module, builder)
+    printer = LLVMPrinter(module, builder, target=target)
     return printer._print(ast_node)
 
 
@@ -26,7 +53,7 @@ class LLVMPrinter(Printer):
     """Convert expressions to LLVM IR"""
 
-    def __init__(self, module, builder, fn=None, *args, **kwargs):
+    def __init__(self, module, builder, fn=None, target='cpu', *args, **kwargs):
         self.func_arg_map = kwargs.pop("func_arg_map", {})
         super(LLVMPrinter, self).__init__(*args, **kwargs)
         self.fp_type = ir.DoubleType()
@@ -39,6 +66,7 @@ class LLVMPrinter(Printer):
         self.fn = fn
         self.ext_fn = {}  # keep track of wrappers to external functions
         self.tmp_var = {}
+        self.target = target
 
     def _add_tmp_var(self, name, value):
         self.tmp_var[name] = value
@@ -163,7 +191,7 @@ class LLVMPrinter(Printer):
         parameter_type = []
         parameters = func.get_parameters()
         for parameter in parameters:
-            parameter_type.append(to_llvm_type(parameter.symbol.dtype))
+            parameter_type.append(to_llvm_type(parameter.symbol.dtype, nvvm_target=self.target == 'gpu'))
         func_type = ir.FunctionType(return_type, tuple(parameter_type))
         name = func.function_name
         fn = ir.Function(self.module, func_type, name)
@@ -181,6 +209,9 @@ class LLVMPrinter(Printer):
         self._print(func.body)
         self.builder.ret_void()
         self.fn = fn
+        if self.target == 'gpu':
+            set_cuda_kernel(fn)
+
         return fn
 
     def _print_Block(self, block):
@@ -217,8 +248,10 @@
         # (From, to)
         decision = {
+            (create_composite_type_from_string("int32"),
+             create_composite_type_from_string("int64")): functools.partial(self.builder.zext, node, self.integer),
             (create_composite_type_from_string("int16"),
-             create_composite_type_from_string("int64")): lambda: ir.Constant(self.integer, node),
+             create_composite_type_from_string("int64")): functools.partial(self.builder.zext, node, self.integer),
             (create_composite_type_from_string("int"),
              create_composite_type_from_string("double")): functools.partial(self.builder.sitofp, node, self.fp_type),
             (create_composite_type_from_string("int16"),
@@ -295,13 +328,21 @@ class LLVMPrinter(Printer):
             self.builder.branch(after_block)
             self.builder.position_at_end(false_block)
 
-        phi = self.builder.phi(to_llvm_type(get_type_of_expression(piece)))
+        phi = self.builder.phi(to_llvm_type(get_type_of_expression(piece), nvvm_target=self.target == 'gpu'))
         for (val, block) in phi_data:
             phi.add_incoming(val, block)
         return phi
 
-    # Should have a list of math library functions to validate this.
-    # TODO function calls to libs
+    def _print_Conditional(self, node):
+        cond = self._print(node.condition_expr)
+        with self.builder.if_else(cond) as (then, otherwise):
+            with then:
+                self._print(node.true_block)  # emit instructions for when the predicate is true
+            with otherwise:
+                self._print(node.false_block)  # emit instructions for when the predicate is false
+
+        # No return!
+
     def _print_Function(self, expr):
         name = expr.func.__name__
         e0 = self._print(expr.args[0])
@@ -320,3 +361,19 @@ class LLVMPrinter(Printer):
             mro = "None"
         raise TypeError("Unsupported type for LLVM JIT conversion: Expression:\"%s\", Type:\"%s\", MRO:%s"
                         % (expr, type(expr), mro))
+
+    # from: https://llvm.org/docs/NVPTXUsage.html#nvptx-intrinsics
+    INDEXING_FUNCTION_MAPPING = {
+        'blockIdx': 'llvm.nvvm.read.ptx.sreg.ctaid',
+        'threadIdx': 'llvm.nvvm.read.ptx.sreg.tid',
+        'blockDim': 'llvm.nvvm.read.ptx.sreg.ntid',
+        'gridDim': 'llvm.nvvm.read.ptx.sreg.nctaid'
+    }
+
+    def _print_ThreadIndexingSymbol(self, node):
+        symbol_name: str = node.name
+        function_name, dimension = tuple(symbol_name.split("."))
+        function_name = self.INDEXING_FUNCTION_MAPPING[function_name]
+        name = f"{function_name}.{dimension}"
+
+        return self.builder.zext(_call_sreg(self.builder, name), self.integer)
diff --git a/pystencils/llvm/llvmjit.py b/pystencils/llvm/llvmjit.py
index 6dc40151e4495d5fa67f9c6081a442a558b1d09a..0baec5452f80e6d8ca3c1b1d760ef23e510dcb0e 100644
--- a/pystencils/llvm/llvmjit.py
+++ b/pystencils/llvm/llvmjit.py
@@ -1,4 +1,8 @@
 import ctypes as ct
+import subprocess
+from functools import partial
+from itertools import chain
+from os.path import exists, join
 
 import llvmlite.binding as llvm
 import llvmlite.ir as ir
@@ -98,11 +102,12 @@ def make_python_function_incomplete_params(kernel_function_node, argument_dict,
 
 
 def generate_and_jit(ast):
-    gen = generate_llvm(ast)
+    target = 'gpu' if ast._backend == 'llvm_gpu' else 'cpu'
+    gen = generate_llvm(ast, target=target)
     if isinstance(gen, ir.Module):
-        return compile_llvm(gen)
+        return compile_llvm(gen, target, ast)
     else:
-        return compile_llvm(gen.module)
+        return compile_llvm(gen.module, target, ast)
 
 
 def make_python_function(ast, argument_dict={}, func=None):
@@ -117,8 +122,8 @@
     return lambda: func(*args)
 
 
-def compile_llvm(module):
-    jit = Jit()
+def compile_llvm(module, target='cpu', ast=None):
+    jit = CudaJit(ast) if target == "gpu" else Jit()
     jit.parse(module)
     jit.optimize()
     jit.compile()
@@ -224,3 +229,95 @@ class Jit(object):
         fptr = self.fptr[name]
         fptr.jit = self
         return fptr
+
+
+# Following code more or less from numba
+class CudaJit(Jit):
+
+    CUDA_TRIPLE = {32: 'nvptx-nvidia-cuda',
+                   64: 'nvptx64-nvidia-cuda'}
+    MACHINE_BITS = tuple.__itemsize__ * 8
+    data_layout = {
+        32: ('e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-'
+             'f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64'),
+        64: ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-'
+             'f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64')}
+
+    default_data_layout = data_layout[MACHINE_BITS]
+
+    def __init__(self, ast):
+        # super().__init__()
+
+        # self.target = llvm.Target.from_triple(self.CUDA_TRIPLE[self.MACHINE_BITS])
+        self._data_layout = self.data_layout[self.MACHINE_BITS]
+        # self._target_data = llvm.create_target_data(self._data_layout)
+        self.indexing = ast.indexing
+
+    def optimize(self):
+        pmb = llvm.create_pass_manager_builder()
+        pmb.opt_level = 2
+        pmb.disable_unit_at_a_time = False
+        pmb.loop_vectorize = False
+        pmb.slp_vectorize = False
+        # TODO possible to pass for functions
+        pm = llvm.create_module_pass_manager()
+        pm.add_instruction_combining_pass()
+        pm.add_function_attrs_pass()
+        pm.add_constant_merge_pass()
+        pm.add_licm_pass()
+        pmb.populate(pm)
+        pm.run(self.llvmmod)
+        pm.run(self.llvmmod)
+
+    def write_ll(self, file):
+        with open(file, 'w') as f:
+            f.write(str(self.llvmmod))
+
+    def parse(self, module):
+        llvmmod = module
+        llvmmod.triple = self.CUDA_TRIPLE[self.MACHINE_BITS]
+        llvmmod.data_layout = self.default_data_layout
+        llvmmod.verify()
+        llvmmod.name = 'module'
+
+        self._llvmmod = llvm.parse_assembly(str(llvmmod))
+
+    def compile(self):
+        from pystencils.cpu.cpujit import get_cache_config, get_compiler_config, get_llc_command
+        import hashlib
+        compiler_cache = get_cache_config()['object_cache']
+        ir_file = join(compiler_cache, hashlib.md5(str(self._llvmmod).encode()).hexdigest() + '.ll')
+        ptx_file = ir_file.replace('.ll', '.ptx')
+        try:
+            from pycuda.driver import Context
+            arch = "sm_%d%d" % Context.get_device().compute_capability()
+        except Exception:
+            arch = "sm_35"
+
+        if not exists(ptx_file):
+            self.write_ll(ir_file)
+            if 'llc_command' in get_compiler_config():
+                llc_command = get_compiler_config()['llc_command']
+            else:
+                llc_command = get_llc_command() or 'llc'
+
+            subprocess.check_call([llc_command, '-mcpu=' + arch, ir_file, '-o', ptx_file])
+
+        # cubin_file = ir_file.replace('.ll', '.cubin')
+        # if not exists(cubin_file):
+        #     subprocess.check_call(['ptxas', '--gpu-name', arch, ptx_file, '-o', cubin_file])
+        import pycuda.driver
+
+        cuda_module = pycuda.driver.module_from_file(ptx_file)  # also works: cubin_file
+        self.cuda_module = cuda_module
+
+    def __call__(self, func, *args, **kwargs):
+        shape = [a.shape for a in chain(args, kwargs.values()) if hasattr(a, 'shape')][0]
+        block_and_thread_numbers = self.indexing.call_parameters(shape)
+        block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
+        block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
+        self.cuda_module.get_function(func)(*args, **kwargs, **block_and_thread_numbers)
+
+    def get_function_ptr(self, name):
+        return partial(self.__call__, name)
diff --git a/pystencils_tests/test_jacobi_llvm.py b/pystencils_tests/test_jacobi_llvm.py
index 2965cf245c6bf34e001d1ac67ad5b350c1219247..13b22aa4b0235bb98b4789e3b9266bf672985111 100644
--- a/pystencils_tests/test_jacobi_llvm.py
+++ b/pystencils_tests/test_jacobi_llvm.py
@@ -1,6 +1,8 @@
 import numpy as np
+import pytest
 
-from pystencils import Assignment, Field
+from pystencils import Assignment, Field, show_code
+from pystencils.cpu.cpujit import get_llc_command
 from pystencils.llvm import create_kernel, make_python_function
 from pystencils.llvm.llvmjit import generate_and_jit
 
@@ -30,6 +32,39 @@ def test_jacobi_fixed_field_size():
     np.testing.assert_almost_equal(error, 0.0)
 
 
+@pytest.mark.skipif(not get_llc_command(), reason="Test requires llc in $PATH")
+def test_jacobi_fixed_field_size_gpu():
+    size = (30, 20)
+
+    import pycuda.autoinit  # noqa
+    from pycuda.gpuarray import to_gpu
+
+    src_field_llvm = np.random.rand(*size)
+    src_field_py = np.copy(src_field_llvm)
+    dst_field_llvm = np.zeros(size)
+    dst_field_py = np.zeros(size)
+
+    f = Field.create_from_numpy_array("f", src_field_py)
+    d = Field.create_from_numpy_array("d", dst_field_py)
+
+    src_field_llvm = to_gpu(src_field_llvm)
+    dst_field_llvm = to_gpu(dst_field_llvm)
+
+    jacobi = Assignment(d[0, 0], (f[1, 0] + f[-1, 0] + f[0, 1] + f[0, -1]) / 4)
+    ast = create_kernel([jacobi], target='gpu')
+    print(show_code(ast))
+
+    for x in range(1, size[0] - 1):
+        for y in range(1, size[1] - 1):
+            dst_field_py[x, y] = 0.25 * (src_field_py[x - 1, y] + src_field_py[x + 1, y]
+                                         + src_field_py[x, y - 1] + src_field_py[x, y + 1])
+
+    jit = generate_and_jit(ast)
+    jit('kernel', dst_field_llvm, src_field_llvm)
+    error = np.sum(np.abs(dst_field_py - dst_field_llvm.get()))
+    np.testing.assert_almost_equal(error, 0.0)
+
+
 def test_jacobi_variable_field_size():
     size = (3, 3, 3)
     f = Field.create_generic("f", 3)
@@ -52,3 +87,7 @@ def test_jacobi_variable_field_size():
     kernel()
     error = np.sum(np.abs(dst_field_py - dst_field_llvm))
     np.testing.assert_almost_equal(error, 0.0)
+
+
+if __name__ == "__main__":
+    test_jacobi_fixed_field_size_gpu()
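
Usage note (not part of the patch): the sketch below shows how the new LLVM GPU target introduced here is driven end to end. It is distilled from `test_jacobi_fixed_field_size_gpu` above and assumes `pycuda` is installed, an `llc` executable is found by `get_llc_command()`, and a CUDA device is present; `'kernel'` is the default function name produced by `create_kernel`.

```python
import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a CUDA context
from pycuda.gpuarray import to_gpu

from pystencils import Assignment, Field
from pystencils.llvm import create_kernel
from pystencils.llvm.llvmjit import generate_and_jit

# Two fixed-size fields; d becomes the 4-point Jacobi average of f.
src = np.random.rand(30, 20)
dst = np.zeros_like(src)
f = Field.create_from_numpy_array("f", src)
d = Field.create_from_numpy_array("d", dst)
jacobi = Assignment(d[0, 0], (f[1, 0] + f[-1, 0] + f[0, 1] + f[0, -1]) / 4)

# target='gpu' goes through create_cuda_kernel and tags the AST with the
# 'llvm_gpu' backend, so generate_and_jit selects the NVVM/PTX path.
ast = create_kernel([jacobi], target='gpu')

# CudaJit: emits NVVM IR, compiles it to PTX with llc, loads it via pycuda.
jit = generate_and_jit(ast)
dst_gpu, src_gpu = to_gpu(dst), to_gpu(src)
jit('kernel', dst_gpu, src_gpu)  # argument order as in the test: d before f
result = dst_gpu.get()
```

The kernel AST is the same one the CUDA backend builds; only the printer changes, mapping `blockIdx`/`threadIdx`/`blockDim`/`gridDim` to the `llvm.nvvm.read.ptx.sreg.*` intrinsics listed in `INDEXING_FUNCTION_MAPPING`.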