Commit 64734915 authored by Martin Bauer's avatar Martin Bauer

Merge branch 'opencl-backend' into 'master'

Basic support for OpenCL (experimental)

See merge request pycodegen/pystencils!29
parents e7a8d3ce ba5d035e
Pipeline #17430 passed with stages
in 4 minutes and 9 seconds
......@@ -52,6 +52,7 @@ Without `[interactive]` you get a minimal version with very little dependencies.
All options:
- `gpu`: use this if an Nvidia GPU is available and CUDA is installed
- `pyopencl`: basic OpenCL support (experimental)
- `alltrafos`: pulls in additional dependencies for loop simplification e.g. libisl
- `bench_db`: functionality to store benchmark result in object databases
- `interactive`: installs dependencies to work in Jupyter including image I/O, plotting etc.
......
......@@ -707,8 +707,7 @@ class DestructuringBindingsForFieldClass(Node):
corresponding_field_names = {s.field_name for s in undefined_field_symbols if hasattr(s, 'field_name')}
corresponding_field_names |= {s.field_names[0] for s in undefined_field_symbols if hasattr(s, 'field_names')}
return {TypedSymbol(f, self.CLASS_NAME_TEMPLATE.format(dtype=field_map[f].dtype, ndim=field_map[f].ndim) + '&')
for f in corresponding_field_names} | \
(self.body.undefined_symbols - undefined_field_symbols)
for f in corresponding_field_names} | (self.body.undefined_symbols - undefined_field_symbols)
def subs(self, subs_dict) -> None:
"""Inplace! substitute, similar to sympy's but modifies the AST inplace."""
......
......@@ -59,6 +59,9 @@ def generate_c(ast_node: Node, signature_only: bool = False, dialect='c', custom
elif dialect == 'cuda':
from pystencils.backends.cuda_backend import CudaBackend
printer = CudaBackend(signature_only=signature_only)
elif dialect == 'opencl':
from pystencils.backends.opencl_backend import OpenClBackend
printer = OpenClBackend(signature_only=signature_only)
else:
raise ValueError("Unknown dialect: " + str(dialect))
code = printer(ast_node)
......@@ -166,14 +169,19 @@ class CBackend:
return result
def _print(self, node):
if isinstance(node, str):
return node
for cls in type(node).__mro__:
method_name = "_print_" + cls.__name__
if hasattr(self, method_name):
return getattr(self, method_name)(node)
raise NotImplementedError(self.__class__.__name__ + " does not support node of type " + node.__class__.__name__)
def _print_Type(self, node):
return str(node)
def _print_KernelFunction(self, node):
function_arguments = ["%s %s" % (str(s.symbol.dtype), s.symbol.name) for s in node.get_parameters()]
function_arguments = ["%s %s" % (self._print(s.symbol.dtype), s.symbol.name) for s in node.get_parameters()]
launch_bounds = ""
if self._dialect == 'cuda':
max_threads = node.indexing.max_threads_per_block()
......@@ -208,7 +216,11 @@ class CBackend:
def _print_SympyAssignment(self, node):
if node.is_declaration:
data_type = "const " + str(node.lhs.dtype) + " " if node.is_const else str(node.lhs.dtype) + " "
if node.is_const:
prefix = 'const '
else:
prefix = ''
data_type = prefix + self._print(node.lhs.dtype) + " "
return "%s%s = %s;" % (data_type, self.sympy_printer.doprint(node.lhs),
self.sympy_printer.doprint(node.rhs))
else:
......@@ -277,10 +289,10 @@ class CBackend:
]
destructuring_bindings.sort() # only for code aesthetics
return "{\n" + self._indent + \
("\n" + self._indent).join(destructuring_bindings) + \
"\n" + self._indent + \
("\n" + self._indent).join(self._print(node.body).splitlines()) + \
"\n}"
("\n" + self._indent).join(destructuring_bindings) + \
"\n" + self._indent + \
("\n" + self._indent).join(self._print(node.body).splitlines()) + \
"\n}"
# ------------------------------------------ Helper function & classes -------------------------------------------------
......
acos
acosh
acospi
asin
asinh
asinpi
atan
atan2
atanh
atanpi
atan2pi
cbrt
ceil
copysign
cos
cosh
cospi
erfc
erf
exp
exp2
exp10
expm1
fabs
fdim
floor
fma
fmax
fmax
fmin45
fmin
fmod
fract
frexp
hypot
ilogb
ldexp
lgamma
lgamma_r
log
log2
log10
log1p
logb
mad
maxmag
minmag
modf
nextafter
pow
pown
powr
remquo
intn
remquo
rint
rootn
rootn
round
rsqrt
sin
sincos
sinh
sinpi
sqrt
tan
tanh
tanpi
tgamma
trunc
half_cos
half_divide
half_exp
half_exp2
half_exp10
half_log
half_log2
half_log10
half_powr
half_recip
half_rsqrt
half_sin
half_sqrt
half_tan
native_cos
native_divide
native_exp
native_exp2
native_exp10
native_log
native_log2
native_log10
native_powr
native_recip
native_rsqrt
native_sin
native_sqrt
native_tan
from os.path import dirname, join
import pystencils.data_types
from pystencils.astnodes import Node
from pystencils.backends.cbackend import CustomSympyPrinter, generate_c
from pystencils.backends.cuda_backend import CudaBackend, CudaSympyPrinter
from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
with open(join(dirname(__file__), 'opencl1.1_known_functions.txt')) as f:
lines = f.readlines()
OPENCL_KNOWN_FUNCTIONS = {l.strip(): l.strip() for l in lines if l}
def generate_opencl(astnode: Node, signature_only: bool = False) -> str:
"""Prints an abstract syntax tree node (made for target 'gpu') as OpenCL code.
Args:
astnode: KernelFunction node to generate code for
signature_only: if True only the signature is printed
Returns:
C-like code for the ast node and its descendants
"""
return generate_c(astnode, signature_only, dialect='opencl')
class OpenClBackend(CudaBackend):
def __init__(self,
sympy_printer=None,
signature_only=False):
if not sympy_printer:
sympy_printer = OpenClSympyPrinter()
super().__init__(sympy_printer, signature_only)
self._dialect = 'opencl'
def _print_Type(self, node):
code = super()._print_Type(node)
if isinstance(node, pystencils.data_types.PointerType):
return "__global " + code
else:
return code
def _print_ThreadBlockSynchronization(self, node):
raise NotImplementedError()
def _print_TextureDeclaration(self, node):
raise NotImplementedError()
class OpenClSympyPrinter(CudaSympyPrinter):
language = "OpenCL"
DIMENSION_MAPPING = {
'x': '0',
'y': '1',
'z': '2'
}
INDEXING_FUNCTION_MAPPING = {
'blockIdx': 'get_group_id',
'threadIdx': 'get_local_id',
'blockDim': 'get_local_size',
'gridDim': 'get_global_size'
}
def __init__(self):
CustomSympyPrinter.__init__(self)
self.known_functions = OPENCL_KNOWN_FUNCTIONS
def _print_ThreadIndexingSymbol(self, node):
symbol_name: str = node.name
function_name, dimension = tuple(symbol_name.split("."))
dimension = self.DIMENSION_MAPPING[dimension]
function_name = self.INDEXING_FUNCTION_MAPPING[function_name]
return f"{function_name}({dimension})"
def _print_TextureAccess(self, node):
raise NotImplementedError()
# For math functions, OpenCL is more similar to the C++ printer CustomSympyPrinter
# since built-in math functions are generic.
# In CUDA, you have to differentiate between `sin` and `sinf`
_print_math_func = CustomSympyPrinter._print_math_func
_print_Pow = CustomSympyPrinter._print_Pow
def _print_Function(self, expr):
if isinstance(expr, fast_division):
return "native_divide(%s, %s)" % tuple(self._print(a) for a in expr.args)
elif isinstance(expr, fast_sqrt):
return "native_sqrt(%s)" % tuple(self._print(a) for a in expr.args)
elif isinstance(expr, fast_inv_sqrt):
return "native_rsqrt(%s)" % tuple(self._print(a) for a in expr.args)
return CustomSympyPrinter._print_Function(self, expr)
import abc
from functools import partial
from typing import Tuple # noqa
import sympy as sp
from sympy.core.cache import cacheit
from pystencils.astnodes import Block, Conditional
from pystencils.data_types import TypedSymbol, create_type
......@@ -10,10 +10,24 @@ from pystencils.integer_functions import div_ceil, div_floor
from pystencils.slicing import normalize_slice
from pystencils.sympyextensions import is_integer_sequence, prod
BLOCK_IDX = [TypedSymbol("blockIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
THREAD_IDX = [TypedSymbol("threadIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
BLOCK_DIM = [TypedSymbol("blockDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
GRID_DIM = [TypedSymbol("gridDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
class ThreadIndexingSymbol(TypedSymbol):
def __new__(cls, *args, **kwds):
obj = ThreadIndexingSymbol.__xnew_cached_(cls, *args, **kwds)
return obj
def __new_stage2__(cls, name, dtype, *args, **kwargs):
obj = super(ThreadIndexingSymbol, cls).__xnew__(cls, name, dtype, *args, **kwargs)
return obj
__xnew__ = staticmethod(__new_stage2__)
__xnew_cached_ = staticmethod(cacheit(__new_stage2__))
BLOCK_IDX = [ThreadIndexingSymbol("blockIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
THREAD_IDX = [ThreadIndexingSymbol("threadIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
BLOCK_DIM = [ThreadIndexingSymbol("blockDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
GRID_DIM = [ThreadIndexingSymbol("gridDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
class AbstractIndexing(abc.ABC):
......@@ -69,6 +83,7 @@ class AbstractIndexing(abc.ABC):
def symbolic_parameters(self):
"""Set of symbols required in call_parameters code"""
# -------------------------------------------- Implementations ---------------------------------------------------------
......@@ -82,6 +97,7 @@ class BlockIndexing(AbstractIndexing):
gets the largest amount of threads
compile_time_block_size: compile in concrete block size, otherwise the cuda variable 'blockDim' is used
"""
def __init__(self, field, iteration_slice,
block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
maximum_block_size=(1024, 1024, 64)):
......
#ifndef OPENCL_STDINT
#define OPENCL_STDINT
typedef unsigned int uint;
typedef unsigned int uint_t;
typedef signed char int8_t;
typedef signed short int16_t;
typedef signed int int32_t;
typedef signed long int int64_t;
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef unsigned long int uint64_t;
#endif
import numpy as np
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.gpucuda.cudajit import _build_numpy_argument_list, _check_arguments
from pystencils.include import get_pystencils_include_path
USE_FAST_MATH = True
def make_python_function(kernel_function_node, opencl_queue, opencl_ctx, argument_dict=None, custom_backend=None):
"""
Creates a **OpenCL** kernel function from an abstract syntax tree which
was created for the ``target='gpu'`` e.g. by :func:`pystencils.gpucuda.create_cuda_kernel`
or :func:`pystencils.gpucuda.created_indexed_cuda_kernel`
Args:
opencl_queue: a valid :class:`pyopencl.CommandQueue`
opencl_ctx: a valid :class:`pyopencl.Context`
kernel_function_node: the abstract syntax tree
argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the
returned kernel functor.
Returns:
compiled kernel as Python function
"""
import pyopencl as cl
assert opencl_ctx, "No valid OpenCL context"
assert opencl_queue, "No valid OpenCL queue"
if argument_dict is None:
argument_dict = {}
# Changing of kernel name necessary since compilation with default name "kernel" is not possible (OpenCL keyword!)
kernel_function_node.function_name = "opencl_" + kernel_function_node.function_name
header_list = ['"opencl_stdint.h"'] + list(get_headers(kernel_function_node))
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
code = includes + "\n"
code += "#define FUNC_PREFIX __kernel\n"
code += "#define RESTRICT restrict\n\n"
code += str(generate_c(kernel_function_node, dialect='opencl', custom_backend=custom_backend))
options = []
if USE_FAST_MATH:
options.append("-cl-unsafe-math-optimizations -cl-mad-enable -cl-fast-relaxed-math -cl-finite-math-only")
options.append("-I \"" + get_pystencils_include_path() + "\"")
mod = cl.Program(opencl_ctx, code).build(options=options)
func = getattr(mod, kernel_function_node.function_name)
parameters = kernel_function_node.get_parameters()
cache = {}
cache_values = []
def wrapper(**kwargs):
key = hash(tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
for k, v in kwargs.items()))
try:
args, block_and_thread_numbers = cache[key]
func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args)
except KeyError:
full_arguments = argument_dict.copy()
full_arguments.update(kwargs)
shape = _check_arguments(parameters, full_arguments)
indexing = kernel_function_node.indexing
block_and_thread_numbers = indexing.call_parameters(shape)
block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
block_and_thread_numbers['grid'] = tuple(int(b * g) for (b, g) in zip(block_and_thread_numbers['block'],
block_and_thread_numbers['grid']))
args = _build_numpy_argument_list(parameters, full_arguments)
args = [a.data if hasattr(a, 'data') else a for a in args]
cache[key] = (args, block_and_thread_numbers)
cache_values.append(kwargs) # keep objects alive such that ids remain unique
func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args)
wrapper.ast = kernel_function_node
wrapper.parameters = kernel_function_node.get_parameters()
return wrapper
import numpy as np
import pytest
import sympy as sp
import pystencils
from pystencils.backends.cuda_backend import CudaBackend
from pystencils.backends.opencl_backend import OpenClBackend
from pystencils.opencl.opencljit import make_python_function
try:
import pyopencl as cl
HAS_OPENCL = True
except Exception:
HAS_OPENCL = False
def test_print_opencl():
z, y, x = pystencils.fields("z, y, x: [2d]")
assignments = pystencils.AssignmentCollection({
z[0, 0]: x[0, 0] * sp.log(x[0, 0] * y[0, 0])
})
print(assignments)
ast = pystencils.create_kernel(assignments, target='gpu')
print(ast)
code = pystencils.show_code(ast, custom_backend=CudaBackend())
print(code)
opencl_code = pystencils.show_code(ast, custom_backend=OpenClBackend())
print(opencl_code)
assert "__global double * RESTRICT const _data_x" in str(opencl_code)
assert "__global double * RESTRICT" in str(opencl_code)
assert "get_local_id(0)" in str(opencl_code)
@pytest.mark.skipif(not HAS_OPENCL, reason="Test requires pyopencl")
def test_opencl_jit_fixed_size():
z, y, x = pystencils.fields("z, y, x: [20,30]")
assignments = pystencils.AssignmentCollection({
z[0, 0]: x[0, 0] * sp.log(x[0, 0] * y[0, 0])
})
print(assignments)
ast = pystencils.create_kernel(assignments, target='gpu')
print(ast)
code = pystencils.show_code(ast, custom_backend=CudaBackend())
print(code)
opencl_code = pystencils.show_code(ast, custom_backend=OpenClBackend())
print(opencl_code)
cuda_kernel = ast.compile()
assert cuda_kernel is not None
import pycuda.gpuarray as gpuarray
x_cpu = np.random.rand(20, 30)
y_cpu = np.random.rand(20, 30)
z_cpu = np.random.rand(20, 30)
x = gpuarray.to_gpu(x_cpu)
y = gpuarray.to_gpu(y_cpu)
z = gpuarray.to_gpu(z_cpu)
cuda_kernel(x=x, y=y, z=z)
result_cuda = z.get()
import pyopencl.array as array
ctx = cl.create_some_context(0)
queue = cl.CommandQueue(ctx)
x = array.to_device(queue, x_cpu)
y = array.to_device(queue, y_cpu)
z = array.to_device(queue, z_cpu)
opencl_kernel = make_python_function(ast, queue, ctx)
assert opencl_kernel is not None
opencl_kernel(x=x, y=y, z=z)
result_opencl = z.get(queue)
assert np.allclose(result_cuda, result_opencl)
@pytest.mark.skipif(not HAS_OPENCL, reason="Test requires pyopencl")
def test_opencl_jit():
z, y, x = pystencils.fields("z, y, x: [2d]")
assignments = pystencils.AssignmentCollection({
z[0, 0]: x[0, 0] * sp.log(x[0, 0] * y[0, 0])
})
print(assignments)
ast = pystencils.create_kernel(assignments, target='gpu')
print(ast)
code = pystencils.show_code(ast, custom_backend=CudaBackend())
print(code)
opencl_code = pystencils.show_code(ast, custom_backend=OpenClBackend())
print(opencl_code)
cuda_kernel = ast.compile()
assert cuda_kernel is not None
import pycuda.gpuarray as gpuarray
x_cpu = np.random.rand(20, 30)
y_cpu = np.random.rand(20, 30)
z_cpu = np.random.rand(20, 30)
x = gpuarray.to_gpu(x_cpu)
y = gpuarray.to_gpu(y_cpu)
z = gpuarray.to_gpu(z_cpu)
cuda_kernel(x=x, y=y, z=z)
result_cuda = z.get()
import pyopencl.array as array
ctx = cl.create_some_context(0)
queue = cl.CommandQueue(ctx)
x = array.to_device(queue, x_cpu)
y = array.to_device(queue, y_cpu)
z = array.to_device(queue, z_cpu)
opencl_kernel = make_python_function(ast, queue, ctx)
assert opencl_kernel is not None
opencl_kernel(x=x, y=y, z=z)
result_opencl = z.get(queue)
assert np.allclose(result_cuda, result_opencl)
@pytest.mark.skipif(not HAS_OPENCL, reason="Test requires pyopencl")
def test_opencl_jit_with_parameter():
z, y, x = pystencils.fields("z, y, x: [2d]")
a = sp.Symbol('a')
assignments = pystencils.AssignmentCollection({
z[0, 0]: x[0, 0] * sp.log(x[0, 0] * y[0, 0]) + a
})
print(assignments)
ast = pystencils.create_kernel(assignments, target='gpu')
print(ast)
code = pystencils.show_code(ast, custom_backend=CudaBackend())
print(code)
opencl_code = pystencils.show_code(ast, custom_backend=OpenClBackend())
print(opencl_code)
cuda_kernel = ast.compile()
assert cuda_kernel is not None
import pycuda.gpuarray as gpuarray
x_cpu = np.random.rand(20, 30)
y_cpu = np.random.rand(20, 30)
z_cpu = np.random.rand(20, 30)
x = gpuarray.to_gpu(x_cpu)
y = gpuarray.to_gpu(y_cpu)
z = gpuarray.to_gpu(z_cpu)
cuda_kernel(x=x, y=y, z=z, a=5.)
result_cuda = z.get()
import pyopencl.array as array
ctx = cl.create_some_context(0)
queue = cl.CommandQueue(ctx)
x = array.to_device(queue, x_cpu)
y = array.to_device(queue, y_cpu)
z = array.to_device(queue, z_cpu)
opencl_kernel = make_python_function(ast, queue, ctx)
assert opencl_kernel is not None
opencl_kernel(x=x, y=y, z=z, a=5.)
result_opencl = z.get(queue)
assert np.allclose(result_cuda, result_opencl)
if __name__ == '__main__':
test_opencl_jit()
......@@ -88,7 +88,10 @@ setup(name='pystencils',
url='https://i10git.cs.fau.de/pycodegen/pystencils/',
packages=['pystencils'] + ['pystencils.' + s for s in find_packages('pystencils')],
install_requires=['sympy>=1.1', 'numpy', 'appdirs', 'joblib'],
package_data={'pystencils': ['include/*.h', 'backends/cuda_known_functions.txt']},
package_data={'pystencils': ['include/*.h',
'backends/cuda_known_functions.txt',
'backends/opencl1.1_known_functions.txt']},
ext_modules=cython_extensions("pystencils.boundaries.createindexlistcython"),
classifiers=[
'Development Status :: 4 - Beta',
......@@ -106,6 +109,7 @@ setup(name='pystencils',
},
extras_require={
'gpu': ['pycuda'],
'opencl': ['pyopencl'],
'alltrafos': ['islpy', 'py-cpuinfo'],
'bench_db': ['blitzdb', 'pymongo', 'pandas'],
'interactive': ['matplotlib', 'ipy_table', 'imageio', 'jupyter', 'pyevtk'],
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment