Commit 145c5264 authored by Markus Holzer, committed by Michael Kuron

Fix indexing for AMD GPUs

parent b9d9c5cc
1 merge request: !335 Fix indexing for AMD GPUs
import abc
from functools import partial
import math
import sympy as sp
from sympy.core.cache import cacheit
import pystencils
from pystencils.astnodes import Block, Conditional
from pystencils.typing import TypedSymbol, create_type
from pystencils.integer_functions import div_ceil, div_floor
@@ -97,11 +97,14 @@ class BlockIndexing(AbstractIndexing):
permute_block_size_dependent_on_layout: if True, the block_size is permuted such that the fastest coordinate
gets the largest number of threads
compile_time_block_size: compile in concrete block size, otherwise the gpu variable 'blockDim' is used
maximum_block_size: maximum block size that is possible for the GPU. Set to 'auto' to let cupy determine the
maximum block size from the device properties
device_number: device number of the used GPU. By default, the zeroth device is used.
"""
def __init__(self, field, iteration_slice,
block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
maximum_block_size=(1024, 1024, 64)):
maximum_block_size=(1024, 1024, 64), device_number=None):
if field.spatial_dimensions > 3:
raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
@@ -110,17 +113,22 @@ class BlockIndexing(AbstractIndexing):
self._block_size = block_size
if maximum_block_size == 'auto':
assert device_number is not None, 'If "maximum_block_size" is set to "auto", a device number must be specified'
# Get device limits
import cupy as cp
device = cp.cuda.Device(pystencils.GPU_DEVICE)
da = device.attributes
maximum_block_size = tuple(da[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"])
# See https://github.com/cupy/cupy/issues/7676
if cp.cuda.runtime.is_hip:
maximum_block_size = tuple(cp.cuda.runtime.deviceGetAttribute(i, device_number) for i in range(26, 29))
else:
da = cp.cuda.Device(device_number).attributes
maximum_block_size = tuple(da[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"])
self._maximum_block_size = maximum_block_size
self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
self._dim = field.spatial_dimensions
self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
self._compile_time_block_size = compile_time_block_size
self._device_number = device_number
@property
def cuda_indices(self):
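For context, here is a minimal standalone sketch of the device query introduced above, assuming cupy is installed and device 0 is the target GPU. The numeric attribute indices 26-28 are the HIP MaxBlockDimX/Y/Z values referenced in the linked cupy issue; everything else mirrors the constructor code above and is illustrative only.

```python
import cupy as cp

device_number = 0  # illustrative; BlockIndexing passes self._device_number

if cp.cuda.runtime.is_hip:
    # HIP builds of cupy do not expose the MaxBlockDim* names via
    # Device.attributes (see cupy issue 7676), so the numeric
    # hipDeviceAttribute_t values 26..28 (MaxBlockDimX/Y/Z) are queried directly.
    maximum_block_size = tuple(cp.cuda.runtime.deviceGetAttribute(i, device_number)
                               for i in range(26, 29))
else:
    da = cp.cuda.Device(device_number).attributes
    maximum_block_size = tuple(da[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"])

print(maximum_block_size)  # typically (1024, 1024, 64) on CUDA devices
```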
@@ -178,32 +186,34 @@ class BlockIndexing(AbstractIndexing):
def iteration_space(self, arr_shape):
return _iteration_space(self._iterationSlice, arr_shape)
@staticmethod
def limit_block_size_by_register_restriction(block_size, required_registers_per_thread, device=None):
"""Shrinks the block_size if there are too many registers used per multiprocessor.
def limit_block_size_by_register_restriction(self, block_size, required_registers_per_thread):
"""Shrinks the block_size if there are too many registers used per block.
This is not done automatically, since the required_registers_per_thread is not known before compilation.
It can be obtained via ``func.num_regs`` from a cupy function.
:returns smaller block_size if too many registers are used.
Args:
block_size: used block size that is target for limiting
required_registers_per_thread: needed registers per thread
returns: smaller block_size if too many registers are used.
"""
import cupy as cp
device = cp.cuda.Device(pystencils.GPU_DEVICE)
da = device.attributes
available_registers_per_mp = da.get("MaxRegistersPerMultiprocessor")
block = block_size
# See https://github.com/cupy/cupy/issues/7676
if cp.cuda.runtime.is_hip:
max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, self._device_number)
else:
device = cp.cuda.Device(self._device_number)
da = device.attributes
max_registers_per_block = da.get("MaxRegistersPerBlock")
result = list(block_size)
while True:
num_threads = 1
for t in block:
num_threads *= t
required_registers_per_mt = num_threads * required_registers_per_thread
if required_registers_per_mt <= available_registers_per_mp:
return block
required_registers = math.prod(result) * required_registers_per_thread
if required_registers <= max_registers_per_block:
return result
else:
largest_grid_entry_idx = max(range(len(block)), key=lambda e: block[e])
assert block[largest_grid_entry_idx] >= 2
block[largest_grid_entry_idx] //= 2
largest_list_entry_idx = max(range(len(result)), key=lambda e: result[e])
assert result[largest_list_entry_idx] >= 2
result[largest_list_entry_idx] //= 2
@staticmethod
def permute_block_size_according_to_layout(block_size, layout):
......
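As a side note, the register-limiting loop above can be read in isolation: starting from the requested block size, the largest dimension is halved until the whole block fits into the register budget. Below is a self-contained sketch with made-up numbers, not part of the patch.

```python
import math

def shrink_block(block_size, registers_per_thread, max_registers_per_block):
    """Halve the largest block dimension until the block fits the register budget."""
    result = list(block_size)
    while math.prod(result) * registers_per_thread > max_registers_per_block:
        largest_idx = max(range(len(result)), key=lambda i: result[i])
        assert result[largest_idx] >= 2, "block cannot be shrunk any further"
        result[largest_idx] //= 2
    return result

# Example: with 100 registers per thread and a 65536-register budget,
# a (1024, 1, 1) block is reduced to (512, 1, 1).
print(shrink_block([1024, 1, 1], 100, 65536))
```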
@@ -2,6 +2,7 @@ from typing import Union
import numpy as np
import pystencils
from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment
from pystencils.config import CreateKernelConfig
from pystencils.typing import StructType, TypedSymbol
@@ -10,7 +11,7 @@ from pystencils.field import Field, FieldType
from pystencils.enums import Target, Backend
from pystencils.gpu.gpujit import make_python_function
from pystencils.node_collection import NodeCollection
from pystencils.gpu.indexing import indexing_creator_from_params
from pystencils.gpu.indexing import indexing_creator_from_params, BlockIndexing
from pystencils.simp.assignment_collection import AssignmentCollection
from pystencils.transformations import (
get_base_buffer_index, get_common_field, parse_base_pointer_info,
@@ -21,6 +22,8 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection],
config: CreateKernelConfig):
function_name = config.function_name
if isinstance(config.gpu_indexing, BlockIndexing) and "device_number" not in config.gpu_indexing_params:
config.gpu_indexing_params["device_number"] = pystencils.GPU_DEVICE
indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
iteration_slice = config.iteration_slice
ghost_layers = config.ghost_layers
@@ -120,6 +123,8 @@ def created_indexed_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection],
index_fields = config.index_fields
function_name = config.function_name
coordinate_names = config.coordinate_names
if isinstance(config.gpu_indexing, BlockIndexing) and "device_number" not in config.gpu_indexing_params:
config.gpu_indexing_params["device_number"] = pystencils.GPU_DEVICE
indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
fields_written = assignments.bound_fields
......
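With this change, GPU kernel creation falls back to pystencils.GPU_DEVICE when no device_number is given in gpu_indexing_params. A hedged usage sketch follows; the field names, assignment, and block size are illustrative and not taken from the patch.

```python
import pystencils as ps

src, dst = ps.fields("src, dst: float64[2D]")
update = ps.Assignment(dst.center, 2 * src.center)

# device_number can still be set explicitly; if omitted, the default
# device (pystencils.GPU_DEVICE) is used for BlockIndexing.
config = ps.CreateKernelConfig(
    target=ps.Target.GPU,
    gpu_indexing_params={"block_size": (32, 8, 1), "device_number": 0},
)
kernel = ps.create_kernel(update, config=config).compile()
```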
@@ -3,6 +3,7 @@ import cupy as cp
import sympy as sp
from scipy.ndimage import convolve
import pystencils
from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target
from pystencils.gpu import BlockIndexing
from pystencils.simp import sympy_cse_on_assignment_list
@@ -162,8 +163,19 @@ def test_block_indexing():
bi = BlockIndexing(f, make_slice[:, :, :], block_size=(32, 1, 1), permute_block_size_dependent_on_layout=False)
assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2)
bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), maximum_block_size="auto")
bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2),
maximum_block_size="auto", device_number=pystencils.GPU_DEVICE)
# This function should be used if the number of needed registers is known. It can be determined with func.num_regs
blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], 1000)
registers_per_thread = 1000
blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], registers_per_thread)
if cp.cuda.runtime.is_hip:
max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, pystencils.GPU_DEVICE)
else:
device = cp.cuda.Device(pystencils.GPU_DEVICE)
da = device.attributes
max_registers_per_block = da.get("MaxRegistersPerBlock")
assert np.prod(blocks) * registers_per_thread < max_registers_per_block
assert sum(blocks) < sum([1024, 1024, 1])
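For completeness, one way the registers_per_thread value used above can be obtained in practice: cupy reports the register usage of a compiled kernel through its num_regs attribute, which can then be fed into limit_block_size_by_register_restriction. The raw kernel below is a toy example rather than pystencils output, and bi refers to the BlockIndexing instance from the test above.

```python
import cupy as cp

kernel = cp.RawKernel(r'''
extern "C" __global__ void scale(double* data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= 2.0;
}
''', 'scale')

registers_per_thread = kernel.num_regs  # available once the kernel is compiled
limited_block = bi.limit_block_size_by_register_restriction((1024, 1024, 1),
                                                            registers_per_thread)
```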