Commit 0800d84a authored by Stephan Seitz

Add auto-tuning for CUDA call parameters

parent f9ba7391
Pipeline #20471 passed in 4 minutes and 44 seconds
......@@ -178,6 +178,11 @@ class KernelFunction(Node):
self.instruction_set = None # used in `vectorize` function to tell the backend which i.s. (SSE,AVX) to use
# function that compiles the node to a Python callable, is set by the backends
self._compile_function = compile_function
self._autotune_options = None
@property
def do_cudaautotune(self):
return self._autotune_options is not None
@property
def target(self):
......
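For context, a minimal sketch of how this new flag is toggled and queried (only `_autotune_options` and `do_cudaautotune` come from the hunk above; the rest is illustrative):

    # 'ast' is assumed to be a KernelFunction produced by create_cuda_kernel
    ast._autotune_options = True   # set by create_kernel when autotune_cuda_callparameters=True
    assert ast.do_cudaautotune     # the CUDA backend checks this property before launching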
from functools import partial
import numpy as np
import pystencils
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.data_types import StructType
from pystencils.field import FieldType
......@@ -77,11 +80,6 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
full_arguments.update(kwargs)
shape = _check_arguments(parameters, full_arguments)
indexing = kernel_function_node.indexing
block_and_thread_numbers = indexing.call_parameters(shape)
block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
# TODO: use texture objects:
# https://devblogs.nvidia.com/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/
for tex in textures:
......@@ -89,6 +87,21 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
ndarray_to_tex(tex_ref, full_arguments[tex.field.name], tex.address_mode,
tex.filter_mode, tex.use_normalized_coordinates, tex.read_as_integer)
args = _build_numpy_argument_list(parameters, full_arguments)
indexing = kernel_function_node.indexing
if kernel_function_node.do_cudaautotune:
block_and_thread_numbers = (
indexing.autotune_call_parameters(partial(func, *args),
shape,
kernel_function_node.function_name,
tuple((k, v.strides, v.shape)
for k, v in kwargs.items()
if (isinstance(v, pycuda.gpuarray.GPUArray)))
+ (str(pystencils.show_code(kernel_function_node)),)))
else:
block_and_thread_numbers = indexing.call_parameters(shape)
block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
cache[key] = (args, block_and_thread_numbers)
cache_values.append(kwargs) # keep objects alive such that ids remain unique
func(*args, **block_and_thread_numbers)
......
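The tuple passed as the last argument to `autotune_call_parameters` above acts as a cache key: it bundles the strides and shapes of all GPU array arguments plus the generated kernel code, so disk-cached tuning results are only reused for genuinely equivalent calls. A sketch of that key construction (the helper name `make_magic_hash` is illustrative, not part of the commit):

    import pycuda.gpuarray

    def make_magic_hash(arguments, kernel_code):
        # one (name, strides, shape) triple per GPU array argument, plus the generated code string
        array_info = tuple((name, arr.strides, arr.shape)
                           for name, arr in arguments.items()
                           if isinstance(arr, pycuda.gpuarray.GPUArray))
        return array_info + (kernel_code,)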
import abc
import timeit
from functools import partial
import sympy as sp
from sympy.core.cache import cacheit
from pystencils.astnodes import Block, Conditional
from pystencils.cache import disk_cache
from pystencils.data_types import TypedSymbol, create_type
from pystencils.integer_functions import div_ceil, div_floor
from pystencils.slicing import normalize_slice
......@@ -83,6 +85,60 @@ class AbstractIndexing(abc.ABC):
def symbolic_parameters(self):
"""Set of symbols required in call_parameters code"""
def autotune_call_parameters(self, partial_function, call_shape, function_name, magic_hash):
"""Autotune call parameters for a specific kernel call
Tries to find the optimum call parameters ``block``, ``grid`` for a kernel function.
Args:
partial_function: Partial PyCUDA function with parameters block and grid missing
"""
import pycuda.driver
@disk_cache
def _autotune_call_parameters(self,
call_shape,
num_profile_calls,
function_name,
block_sizes,
magic_hash # needed for disk_cache
):
BIG_NUMBER = 100000000
current_best = self.call_parameters(call_shape)
best_timing = BIG_NUMBER
print(f'Autotuning function {function_name}')
for block_size in block_sizes:
self._block_size = block_size
if isinstance(self, BlockIndexing):
self._block_size = (
BlockIndexing.permute_block_size_according_to_layout(self._block_size, self._layout))
block_and_thread_numbers = self.call_parameters(call_shape)
block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
# TODO(seitz) can we use the CUDA profiler?: pycuda.driver.start_profiler()
def profile_call():
for i in range(num_profile_calls):
partial_function(**block_and_thread_numbers)
pycuda.driver.Context.synchronize()
current_time = timeit.timeit(profile_call, number=1)
print(f'{block_size} takes {current_time} ({num_profile_calls})')
if current_time < best_timing:
best_timing = current_time
current_best = block_and_thread_numbers
print(f'{current_best} is the best out of {self._autotune_block_sizes or self.AUTOTUNE_BLOCK_SIZES}')
self._block_size = current_best
return current_best
return _autotune_call_parameters(self,
call_shape,
self.AUTOTUNE_NUM_CALLS,
function_name,
self._autotune_block_sizes or self.AUTOTUNE_BLOCK_SIZES,
magic_hash)
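Conceptually, the tuner brute-forces the class-level candidate list `AUTOTUNE_BLOCK_SIZES`: each candidate is (for `BlockIndexing`) permuted to match the field layout, the kernel is launched `AUTOTUNE_NUM_CALLS` times, and the fastest configuration wins; `disk_cache` then memoizes the winner. A stripped-down sketch of the timing step, assuming an active PyCUDA context and a launchable `partial_function`:

    import timeit
    import pycuda.driver

    def time_candidate(partial_function, block_and_thread_numbers, num_profile_calls=10):
        # wall-clock time for num_profile_calls launches of one block/grid candidate
        def profile_call():
            for _ in range(num_profile_calls):
                partial_function(**block_and_thread_numbers)
            pycuda.driver.Context.synchronize()  # wait for all launches before stopping the clock
        return timeit.timeit(profile_call, number=1)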
# -------------------------------------------- Implementations ---------------------------------------------------------
......@@ -97,6 +153,8 @@ class BlockIndexing(AbstractIndexing):
gets the largest amount of threads
compile_time_block_size: compile in concrete block size, otherwise the cuda variable 'blockDim' is used
"""
AUTOTUNE_BLOCK_SIZES = ((16, 16, 1), (32, 1, 1), (64, 1, 1), (96, 1, 1), (128, 1, 1), (160, 1, 1), (192, 1, 1),)
AUTOTUNE_NUM_CALLS = 10
def __init__(self, field, iteration_slice,
block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
......@@ -118,14 +176,17 @@ class BlockIndexing(AbstractIndexing):
maximum_block_size = tuple(device.get_attribute(a)
for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z))
self._layout = field.layout
self._maximum_block_size = maximum_block_size
self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
self._dim = field.spatial_dimensions
self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
self._compile_time_block_size = compile_time_block_size
self._autotune_block_sizes = None
@property
def coordinates(self):
# TODO(seitz): require layout in constructor to rotate the thread indices: thread_idx == fastest
offsets = _get_start_from_slice(self._iterationSlice)
block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM
coordinates = [block_index * bs + thread_idx + off
......@@ -227,6 +288,8 @@ class LineIndexing(AbstractIndexing):
This indexing scheme supports up to 4 spatial dimensions, where the innermost dimension is not larger than the
maximum amount of threads allowed in a CUDA block (which depends on device).
"""
AUTOTUNE_BLOCK_SIZES = ((16, 1, 1), (32, 1, 1), (64, 1, 1), (96, 1, 1), (128, 1, 1), (160, 1, 1), (192, 1, 1),)
AUTOTUNE_NUM_CALLS = 10
def __init__(self, field, iteration_slice):
available_indices = [THREAD_IDX[0]] + BLOCK_IDX
......
from itertools import combinations
from types import MappingProxyType
import sympy as sp
......@@ -27,7 +27,8 @@ def create_kernel(assignments,
gpu_indexing_params=MappingProxyType({}),
use_textures_for_interpolation=True,
cpu_prepend_optimizations=[],
use_auto_for_assignments=False,
autotune_cuda_callparameters=False):
"""
Creates abstract syntax tree (AST) of kernel, using a list of update equations.
......@@ -121,6 +122,8 @@ def create_kernel(assignments,
for a in ast.atoms(SympyAssignment):
a.use_auto = True
if autotune_cuda_callparameters:
ast._autotune_options = True
return ast
......
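End to end, the flag is simply forwarded through `create_kernel`; a usage sketch (assuming a working pycuda installation and the pystencils 'gpu' target; the field setup is illustrative):

    import numpy as np
    import pycuda.autoinit  # noqa: F401  (creates a CUDA context)
    import pycuda.gpuarray as gpuarray
    import pystencils as ps

    src, dst = ps.fields('src, dst: double[2D]')
    update = [ps.Assignment(dst[0, 0],
                            (src[1, 0] + src[-1, 0] + src[0, 1] + src[0, -1]) / 4)]

    ast = ps.create_kernel(update, target='gpu', autotune_cuda_callparameters=True)
    kernel = ast.compile()

    gpu_src = gpuarray.to_gpu(np.random.rand(66, 66))
    gpu_dst = gpuarray.zeros_like(gpu_src)
    kernel(src=gpu_src, dst=gpu_dst)  # the first call triggers the (disk-cached) auto-tuning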
import numpy as np
import pycuda.gpuarray as gpuarray
import pytest
import sympy as sp
from scipy.ndimage import convolve
......@@ -35,6 +36,49 @@ def test_averaging_kernel():
np.testing.assert_almost_equal(reference, dst_arr)
@pytest.mark.parametrize('use_3d', ('use_3d', False))
@pytest.mark.parametrize('use_fortran_layout', ('use_fortran_layout', False))
def test_autotuning(use_fortran_layout, use_3d):
print(f'Use Fortran layout: {use_fortran_layout}')
if use_3d:
size = (256, 256, 256)
else:
size = (256, 256)
src_arr = np.random.rand(*size)
if use_fortran_layout:
src_arr = np.asfortranarray(src_arr)
src_arr = add_ghost_layers(src_arr)
print(src_arr.strides)
dst_arr = np.zeros_like(src_arr)
src_field = Field.create_from_numpy_array('src', src_arr)
dst_field = Field.create_from_numpy_array('dst', dst_arr)
if use_3d:
update_rules = (Assignment(dst_field[0, 0, 0],
(src_field[0, 0, 1] + src_field[0, 0, -1] + src_field[0, 1, 0] + src_field[0, -1, 0])
/ 4),
Assignment(dst_field[0, 0, 0],
(src_field[1, 0, 0] + src_field[-1, 0, 0] + src_field[0, 1, 0] + src_field[0, -1, 0])
/ 4))
else:
update_rules = (Assignment(dst_field[0, 0],
(src_field[0, 1] + src_field[0, -1] + src_field[1, 0] + src_field[-1, 0])
/ 4),
Assignment(dst_field[0, 0],
(src_field[1, 0] + src_field[-1, 0] + src_field[0, 1] + src_field[0, -1])
/ 4))
for i in range(2):
ast = create_cuda_kernel(sympy_cse_on_assignment_list([update_rules[i]]))
ast._autotune_options = True
kernel = make_python_function(ast)
gpu_src_arr = gpuarray.to_gpu(src_arr)
gpu_dst_arr = gpuarray.to_gpu(dst_arr)
kernel(src=gpu_src_arr, dst=gpu_dst_arr)
gpu_dst_arr.get(dst_arr)
def test_variable_sized_fields():
src_field = Field.create_generic('src', spatial_dimensions=2)
dst_field = Field.create_generic('dst', spatial_dimensions=2)
......