Commit 6d875789 authored by Martin Bauer

Merge branch 'llvm-experiments' into 'master'

Compile CUDA using the LLVM backend

See merge request !53
parents 1db024c5 23e8e04b
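A minimal usage sketch of the new GPU path, mirroring the test added in this merge request. It assumes pycuda is installed, a CUDA device is present, and an `llc` binary is in PATH:

```python
# Minimal usage sketch (assumptions: pycuda installed, CUDA device present,
# llc binary in PATH); follows the test added in this merge request.
import numpy as np
import pycuda.autoinit  # noqa -- creates the CUDA context
from pycuda.gpuarray import to_gpu

from pystencils import Assignment, Field
from pystencils.llvm import create_kernel
from pystencils.llvm.llvmjit import generate_and_jit

size = (30, 20)
src = np.random.rand(*size)
dst = np.zeros(size)

f = Field.create_from_numpy_array("f", src)
d = Field.create_from_numpy_array("d", dst)
jacobi = Assignment(d[0, 0], (f[1, 0] + f[-1, 0] + f[0, 1] + f[0, -1]) / 4)

# target='gpu' routes the LLVM backend through create_cuda_kernel and marks
# the AST with _backend = 'llvm_gpu', so generate_and_jit uses the CudaJit.
ast = create_kernel([jacobi], target='gpu')
jit = generate_and_jit(ast)

src_gpu, dst_gpu = to_gpu(src), to_gpu(dst)
jit('kernel', dst_gpu, src_gpu)  # 'kernel' is the default function_name
result = dst_gpu.get()           # copy the updated field back to the host
```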
@@ -134,11 +134,22 @@ def create_folder(path, is_file):
pass
def get_llc_command():
"""Try to get executable for llvm's IR compiler llc
We try if one of the following is in PATH: llc, llc-10, llc-9, llc-8, llc-7, llc-6
"""
candidates = ['llc', 'llc-10', 'llc-9', 'llc-8', 'llc-7', 'llc-6']
found_executables = (e for e in candidates if shutil.which(e))
return next(found_executables, None)
def read_config():
if platform.system().lower() == 'linux':
default_compiler_config = OrderedDict([
('os', 'linux'),
('command', 'g++'),
('llc_command', get_llc_command() or 'llc'),
('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'),
('restrict_qualifier', '__restrict__')
])
@@ -146,6 +157,7 @@ def read_config():
default_compiler_config = OrderedDict([
('os', 'windows'),
('msvc_version', 'latest'),
('llc_command', get_llc_command() or 'llc'),
('arch', 'x64'),
('flags', '/Ox /fp:fast /openmp /arch:avx'),
('restrict_qualifier', '__restrict')
@@ -154,6 +166,7 @@ def read_config():
default_compiler_config = OrderedDict([
('os', 'darwin'),
('command', 'clang++'),
('llc_command', get_llc_command() or 'llc'),
('flags', '-Ofast -DNDEBUG -fPIC -march=native -Xclang -fopenmp -std=c++11'),
('restrict_qualifier', '__restrict__')
])
@@ -331,7 +331,7 @@ def ctypes_from_llvm(data_type):
raise NotImplementedError('Data type %s of %s is not supported yet' % (type(data_type), data_type))
def to_llvm_type(data_type):
def to_llvm_type(data_type, nvvm_target=False):
"""
Transforms a given type into ctypes
:param data_type: Subclass of Type
@@ -340,7 +340,7 @@ def to_llvm_type(data_type):
if not ir:
raise _ir_importerror
if isinstance(data_type, PointerType):
return to_llvm_type(data_type.base_type).as_pointer()
return to_llvm_type(data_type.base_type).as_pointer(1 if nvvm_target else 0)
else:
return to_llvm_type.map[data_type.numpy_dtype]
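In the NVVM ABI, address space 1 is CUDA global memory. A small illustrative sketch (assuming only llvmlite) of the pointer types the new nvvm_target flag selects:

```python
# Illustrative only: llvmlite pointer types produced with and without the
# nvvm_target flag (address space 1 = CUDA global memory).
import llvmlite.ir as ir

generic_ptr = ir.DoubleType().as_pointer(0)  # CPU path: generic address space
global_ptr = ir.DoubleType().as_pointer(1)   # GPU path: addrspace(1)

print(generic_ptr)  # double*
print(global_ptr)   # double addrspace(1)*
```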
@@ -24,10 +24,10 @@ class ThreadIndexingSymbol(TypedSymbol):
__xnew_cached_ = staticmethod(cacheit(__new_stage2__))
BLOCK_IDX = [ThreadIndexingSymbol("blockIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
THREAD_IDX = [ThreadIndexingSymbol("threadIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
BLOCK_DIM = [ThreadIndexingSymbol("blockDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
GRID_DIM = [ThreadIndexingSymbol("gridDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
BLOCK_IDX = [ThreadIndexingSymbol("blockIdx." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
THREAD_IDX = [ThreadIndexingSymbol("threadIdx." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
BLOCK_DIM = [ThreadIndexingSymbol("blockDim." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
GRID_DIM = [ThreadIndexingSymbol("gridDim." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
class AbstractIndexing(abc.ABC):
@@ -3,7 +3,7 @@ from pystencils.transformations import insert_casts
def create_kernel(assignments, function_name="kernel", type_info=None, split_groups=(),
iteration_slice=None, ghost_layers=None):
iteration_slice=None, ghost_layers=None, target='cpu'):
"""
Creates an abstract syntax tree for a kernel function, by taking a list of update rules.
@@ -25,9 +25,21 @@ def create_kernel(assignments, function_name="kernel", type_info=None, split_gro
:return: :class:`pystencils.ast.KernelFunction` node
"""
from pystencils.cpu import create_kernel
code = create_kernel(assignments, function_name, type_info, split_groups, iteration_slice, ghost_layers)
if target == 'cpu':
from pystencils.cpu import create_kernel
code = create_kernel(assignments, function_name, type_info, split_groups, iteration_slice, ghost_layers)
code._backend = 'llvm'
elif target == 'gpu':
from pystencils.gpucuda.kernelcreation import create_cuda_kernel
code = create_cuda_kernel(assignments,
function_name,
type_info,
iteration_slice=iteration_slice,
ghost_layers=ghost_layers)
code._backend = 'llvm_gpu'
else:
raise NotImplementedError()
code.body = insert_casts(code.body)
code._compile_function = make_python_function
code._backend = 'llvm'
return code
import functools
import llvmlite.ir as ir
import llvmlite.llvmpy.core as lc
import sympy as sp
from sympy import Indexed, S
from sympy.printing.printer import Printer
@@ -12,13 +13,39 @@ from pystencils.data_types import (
from pystencils.llvm.control_flow import Loop
def generate_llvm(ast_node, module=None, builder=None):
# From Numba
def set_cuda_kernel(lfunc):
from llvmlite.llvmpy.core import MetaData, MetaDataString, Constant, Type
m = lfunc.module
ops = lfunc, MetaDataString.get(m, "kernel"), Constant.int(Type.int(), 1)
md = MetaData.get(m, ops)
nmd = m.get_or_insert_named_metadata('nvvm.annotations')
nmd.add(md)
# set nvvm ir version
i32 = ir.IntType(32)
md_ver = m.add_metadata([i32(1), i32(2), i32(2), i32(0)])
m.add_named_metadata('nvvmir.version', md_ver)
# From Numba
def _call_sreg(builder, name):
module = builder.module
fnty = lc.Type.function(lc.Type.int(), ())
fn = module.get_or_insert_function(fnty, name=name)
return builder.call(fn, ())
def generate_llvm(ast_node, module=None, builder=None, target='cpu'):
"""Prints the ast as llvm code."""
if module is None:
module = ir.Module()
module = lc.Module()
if builder is None:
builder = ir.IRBuilder()
printer = LLVMPrinter(module, builder)
printer = LLVMPrinter(module, builder, target=target)
return printer._print(ast_node)
@@ -26,7 +53,7 @@ def generate_llvm(ast_node, module=None, builder=None):
class LLVMPrinter(Printer):
"""Convert expressions to LLVM IR"""
def __init__(self, module, builder, fn=None, *args, **kwargs):
def __init__(self, module, builder, fn=None, target='cpu', *args, **kwargs):
self.func_arg_map = kwargs.pop("func_arg_map", {})
super(LLVMPrinter, self).__init__(*args, **kwargs)
self.fp_type = ir.DoubleType()
@@ -39,6 +66,7 @@ class LLVMPrinter(Printer):
self.fn = fn
self.ext_fn = {} # keep track of wrappers to external functions
self.tmp_var = {}
self.target = target
def _add_tmp_var(self, name, value):
self.tmp_var[name] = value
@@ -163,7 +191,7 @@ class LLVMPrinter(Printer):
parameter_type = []
parameters = func.get_parameters()
for parameter in parameters:
parameter_type.append(to_llvm_type(parameter.symbol.dtype))
parameter_type.append(to_llvm_type(parameter.symbol.dtype, nvvm_target=self.target == 'gpu'))
func_type = ir.FunctionType(return_type, tuple(parameter_type))
name = func.function_name
fn = ir.Function(self.module, func_type, name)
@@ -181,6 +209,9 @@ class LLVMPrinter(Printer):
self._print(func.body)
self.builder.ret_void()
self.fn = fn
if self.target == 'gpu':
set_cuda_kernel(fn)
return fn
def _print_Block(self, block):
@@ -217,8 +248,10 @@ class LLVMPrinter(Printer):
# (From, to)
decision = {
(create_composite_type_from_string("int32"),
create_composite_type_from_string("int64")): functools.partial(self.builder.zext, node, self.integer),
(create_composite_type_from_string("int16"),
create_composite_type_from_string("int64")): lambda: ir.Constant(self.integer, node),
create_composite_type_from_string("int64")): functools.partial(self.builder.zext, node, self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double")): functools.partial(self.builder.sitofp, node, self.fp_type),
(create_composite_type_from_string("int16"),
@@ -295,13 +328,21 @@ class LLVMPrinter(Printer):
self.builder.branch(after_block)
self.builder.position_at_end(false_block)
phi = self.builder.phi(to_llvm_type(get_type_of_expression(piece)))
phi = self.builder.phi(to_llvm_type(get_type_of_expression(piece), nvvm_target=self.target == 'gpu'))
for (val, block) in phi_data:
phi.add_incoming(val, block)
return phi
# Should have a list of math library functions to validate this.
# TODO function calls to libs
def _print_Conditional(self, node):
cond = self._print(node.condition_expr)
with self.builder.if_else(cond) as (then, otherwise):
with then:
self._print(node.true_block) # emit instructions for when the predicate is true
with otherwise:
self._print(node.false_block) # emit instructions for when the predicate is false
# No return!
def _print_Function(self, expr):
name = expr.func.__name__
e0 = self._print(expr.args[0])
@@ -320,3 +361,19 @@ class LLVMPrinter(Printer):
mro = "None"
raise TypeError("Unsupported type for LLVM JIT conversion: Expression:\"%s\", Type:\"%s\", MRO:%s"
% (expr, type(expr), mro))
# from: https://llvm.org/docs/NVPTXUsage.html#nvptx-intrinsics
INDEXING_FUNCTION_MAPPING = {
'blockIdx': 'llvm.nvvm.read.ptx.sreg.ctaid',
'threadIdx': 'llvm.nvvm.read.ptx.sreg.tid',
'blockDim': 'llvm.nvvm.read.ptx.sreg.ntid',
'gridDim': 'llvm.nvvm.read.ptx.sreg.nctaid'
}
def _print_ThreadIndexingSymbol(self, node):
symbol_name: str = node.name
function_name, dimension = tuple(symbol_name.split("."))
function_name = self.INDEXING_FUNCTION_MAPPING[function_name]
name = f"{function_name}.{dimension}"
return self.builder.zext(_call_sreg(self.builder, name), self.integer)
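For reference, a small llvmlite sketch (an illustration under stated assumptions, not the printer code itself) of what this path emits for threadIdx.x: a call to the NVVM special-register intrinsic, zero-extended to the printer's integer type (assumed 64-bit here):

```python
# Sketch of the IR emitted for threadIdx.x: declare the NVVM special-register
# intrinsic, call it, and widen the i32 result (assumes a 64-bit integer type).
import llvmlite.ir as ir

module = ir.Module()
fn = ir.Function(module, ir.FunctionType(ir.VoidType(), ()), name="demo")
builder = ir.IRBuilder(fn.append_basic_block())

sreg_type = ir.FunctionType(ir.IntType(32), ())
tid_x = ir.Function(module, sreg_type, name="llvm.nvvm.read.ptx.sreg.tid.x")
tid = builder.zext(builder.call(tid_x, ()), ir.IntType(64))
builder.ret_void()

print(module)  # the body contains the intrinsic call followed by a zext to i64
```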
import ctypes as ct
import subprocess
from functools import partial
from itertools import chain
from os.path import exists, join
import llvmlite.binding as llvm
import llvmlite.ir as ir
@@ -98,11 +102,12 @@ def make_python_function_incomplete_params(kernel_function_node, argument_dict,
def generate_and_jit(ast):
gen = generate_llvm(ast)
target = 'gpu' if ast._backend == 'llvm_gpu' else 'cpu'
gen = generate_llvm(ast, target=target)
if isinstance(gen, ir.Module):
return compile_llvm(gen)
return compile_llvm(gen, target, ast)
else:
return compile_llvm(gen.module)
return compile_llvm(gen.module, target, ast)
def make_python_function(ast, argument_dict={}, func=None):
@@ -117,8 +122,8 @@ def make_python_function(ast, argument_dict={}, func=None):
return lambda: func(*args)
def compile_llvm(module):
jit = Jit()
def compile_llvm(module, target='cpu', ast=None):
jit = CudaJit(ast) if target == "gpu" else Jit()
jit.parse(module)
jit.optimize()
jit.compile()
@@ -224,3 +229,95 @@ class Jit(object):
fptr = self.fptr[name]
fptr.jit = self
return fptr
# The following code is more or less from Numba
class CudaJit(Jit):
CUDA_TRIPLE = {32: 'nvptx-nvidia-cuda',
64: 'nvptx64-nvidia-cuda'}
MACHINE_BITS = tuple.__itemsize__ * 8
data_layout = {
32: ('e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-'
'f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64'),
64: ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-'
'f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64')}
default_data_layout = data_layout[MACHINE_BITS]
def __init__(self, ast):
# super().__init__()
# self.target = llvm.Target.from_triple(self.CUDA_TRIPLE[self.MACHINE_BITS])
self._data_layout = self.data_layout[self.MACHINE_BITS]
# self._target_data = llvm.create_target_data(self._data_layout)
self.indexing = ast.indexing
def optimize(self):
pmb = llvm.create_pass_manager_builder()
pmb.opt_level = 2
pmb.disable_unit_at_a_time = False
pmb.loop_vectorize = False
pmb.slp_vectorize = False
# TODO: passes could also be run per function via a function pass manager
pm = llvm.create_module_pass_manager()
pm.add_instruction_combining_pass()
pm.add_function_attrs_pass()
pm.add_constant_merge_pass()
pm.add_licm_pass()
pmb.populate(pm)
pm.run(self.llvmmod)
pm.run(self.llvmmod)
def write_ll(self, file):
with open(file, 'w') as f:
f.write(str(self.llvmmod))
def parse(self, module):
llvmmod = module
llvmmod.triple = self.CUDA_TRIPLE[self.MACHINE_BITS]
llvmmod.data_layout = self.default_data_layout
llvmmod.verify()
llvmmod.name = 'module'
self._llvmmod = llvm.parse_assembly(str(llvmmod))
def compile(self):
from pystencils.cpu.cpujit import get_cache_config, get_compiler_config, get_llc_command
import hashlib
compiler_cache = get_cache_config()['object_cache']
ir_file = join(compiler_cache, hashlib.md5(str(self._llvmmod).encode()).hexdigest() + '.ll')
ptx_file = ir_file.replace('.ll', '.ptx')
try:
from pycuda.driver import Context
arch = "sm_%d%d" % Context.get_device().compute_capability()
except Exception:
arch = "sm_35"
if not exists(ptx_file):
self.write_ll(ir_file)
if 'llc_command' in get_compiler_config():
llc_command = get_compiler_config()['llc_command']
else:
llc_command = get_llc_command() or 'llc'
subprocess.check_call([llc_command, '-mcpu=' + arch, ir_file, '-o', ptx_file])
# cubin_file = ir_file.replace('.ll', '.cubin')
# if not exists(cubin_file):
# subprocess.check_call(['ptxas', '--gpu-name', arch, ptx_file, '-o', cubin_file])
import pycuda.driver
cuda_module = pycuda.driver.module_from_file(ptx_file) # also works: cubin_file
self.cuda_module = cuda_module
def __call__(self, func, *args, **kwargs):
shape = [a.shape for a in chain(args, kwargs.values()) if hasattr(a, 'shape')][0]
block_and_thread_numbers = self.indexing.call_parameters(shape)
block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
self.cuda_module.get_function(func)(*args, **kwargs, **block_and_thread_numbers)
def get_function_ptr(self, name):
return partial(self.__call__, name)
import numpy as np
import pytest
from pystencils import Assignment, Field
from pystencils import Assignment, Field, show_code
from pystencils.cpu.cpujit import get_llc_command
from pystencils.llvm import create_kernel, make_python_function
from pystencils.llvm.llvmjit import generate_and_jit
@@ -30,6 +32,39 @@ def test_jacobi_fixed_field_size():
np.testing.assert_almost_equal(error, 0.0)
@pytest.mark.skipif(not get_llc_command(), reason="Test requires llc in $PATH")
def test_jacobi_fixed_field_size_gpu():
size = (30, 20)
import pycuda.autoinit # noqa
from pycuda.gpuarray import to_gpu
src_field_llvm = np.random.rand(*size)
src_field_py = np.copy(src_field_llvm)
dst_field_llvm = np.zeros(size)
dst_field_py = np.zeros(size)
f = Field.create_from_numpy_array("f", src_field_py)
d = Field.create_from_numpy_array("d", dst_field_py)
src_field_llvm = to_gpu(src_field_llvm)
dst_field_llvm = to_gpu(dst_field_llvm)
jacobi = Assignment(d[0, 0], (f[1, 0] + f[-1, 0] + f[0, 1] + f[0, -1]) / 4)
ast = create_kernel([jacobi], target='gpu')
print(show_code(ast))
for x in range(1, size[0] - 1):
for y in range(1, size[1] - 1):
dst_field_py[x, y] = 0.25 * (src_field_py[x - 1, y] + src_field_py[x + 1, y] +
src_field_py[x, y - 1] + src_field_py[x, y + 1])
jit = generate_and_jit(ast)
jit('kernel', dst_field_llvm, src_field_llvm)
error = np.sum(np.abs(dst_field_py - dst_field_llvm.get()))
np.testing.assert_almost_equal(error, 0.0)
def test_jacobi_variable_field_size():
size = (3, 3, 3)
f = Field.create_generic("f", 3)
@@ -52,3 +87,7 @@ def test_jacobi_variable_field_size():
kernel()
error = np.sum(np.abs(dst_field_py - dst_field_llvm))
np.testing.assert_almost_equal(error, 0.0)
if __name__ == "__main__":
test_jacobi_fixed_field_size_gpu()