Skip to content
Snippets Groups Projects

Extend Support for CUDA and HIP kernel invocations

Merged Frederik Hennig requested to merge fhennig/cuda-invoke into master
All threads resolved!
Viewing commit b1b71c32
Show latest version
4 files
+ 146
62
Preferences
Compare changes
Files
4
@@ -29,31 +29,24 @@ class SfgGpuComposer(SfgComposerMixIn):
self._gpu_api_provider: ProvidesGpuRuntimeAPI | None = None
def use_cuda(self):
    """Instruct the GPU composer to use the CUDA runtime API"""
    from ..lang.gpu import CudaAPI

    # Refuse to overwrite a previously selected, different runtime API;
    # re-selecting CUDA itself is a harmless no-op.
    already_selected = self._gpu_api_provider is not None
    if already_selected and not isinstance(self._gpu_api_provider, CudaAPI):
        raise ValueError(
            "Cannot select CUDA GPU API since another API was already chosen"
        )
    self._gpu_api_provider = CudaAPI()
def use_hip(self):
    """Instruct the GPU composer to use the HIP runtime API"""
    from ..lang.gpu import HipAPI

    # Refuse to overwrite a previously selected, different runtime API;
    # re-selecting HIP itself is a harmless no-op.
    already_selected = self._gpu_api_provider is not None
    if already_selected and not isinstance(self._gpu_api_provider, HipAPI):
        raise ValueError(
            "Cannot select HIP GPU API since another API was already chosen"
        )
    self._gpu_api_provider = HipAPI()
@property
def gpu_api(self) -> ProvidesGpuRuntimeAPI:
"""GPU runtime API wrapper currently used by this GPU composer.
Raises:
AttributeError: If no runtime API was set yet (see `use_cuda`, `use_hip`)
"""
if self._gpu_api_provider is None:
raise AttributeError(
"No GPU API was selected - call `use_cuda()` or `use_hip()` first."
@@ -104,7 +97,8 @@ class SfgGpuComposer(SfgComposerMixIn):
This signature accepts kernels generated with an indexing scheme that permits a user-defined
block size, such as `Linear3D <IndexingScheme.Linear3D>`.
The grid size is calculated automatically.
The grid size is calculated automatically by dividing the number of work items in each
dimension by the block size, rounding up.
"""
def gpu_invoke(self, kernel_handle: SfgKernelHandle, **kwargs) -> SfgCallTreeNode:
@@ -144,6 +138,9 @@ class SfgGpuComposer(SfgComposerMixIn):
stream=stmt_stream,
)
def to_uint32_t(expr: AugExpr) -> AugExpr:
    """Wrap *expr* in an explicit C++ ``uint32_t(...)`` conversion expression."""
    target = AugExpr("uint32_t")
    return target.format("uint32_t({})", expr)
match launch_config:
case ManualLaunchConfiguration():
grid_size = kwargs["grid_size"]
@@ -153,12 +150,14 @@ class SfgGpuComposer(SfgComposerMixIn):
case AutomaticLaunchConfiguration():
grid_size_entries = [
self.expr_from_lambda(gs) for gs in launch_config._grid_size
to_uint32_t(self.expr_from_lambda(gs))
for gs in launch_config._grid_size
]
grid_size_var = dim3(const=True).var("__grid_size")
block_size_entries = [
self.expr_from_lambda(bs) for bs in launch_config._block_size
to_uint32_t(self.expr_from_lambda(bs))
for bs in launch_config._block_size
]
block_size_var = dim3(const=True).var("__block_size")
@@ -197,27 +196,16 @@ class SfgGpuComposer(SfgComposerMixIn):
"uint32_t", "uint32_t", "uint32_t", const=True
).var("__work_items")
def _min(a: ExprLike, b: ExprLike):
    """Render a C++ ternary expression selecting the smaller of ``a`` and ``b``."""
    operands = {"a": a, "b": b}
    return AugExpr.format("{a} < {b} ? {a} : {b}", **operands)
def _div_ceil(a: ExprLike, b: ExprLike):
    """Render a C++ integer expression computing ``a / b`` rounded up."""
    operands = {"a": a, "b": b}
    return AugExpr.format("({a} + {b} - 1) / {b}", **operands)
reduced_block_size_entries = [
_min(work_items_var.get(i), bs)
for i, bs in enumerate(
[block_size_var.x, block_size_var.y, block_size_var.z]
)
]
reduced_block_size_var = dim3(const=True).var("__reduced_block_size")
grid_size_entries = [
_div_ceil(work_items_var.get(i), bs)
for i, bs in enumerate(
[
reduced_block_size_var.x,
reduced_block_size_var.y,
reduced_block_size_var.z,
block_size_var.x,
block_size_var.y,
block_size_var.z,
]
)
]
@@ -226,9 +214,8 @@ class SfgGpuComposer(SfgComposerMixIn):
nodes = [
self.init(block_size_var)(*block_size_init_args),
self.init(work_items_var)(*work_items_entries),
self.init(reduced_block_size_var)(*reduced_block_size_entries),
self.init(grid_size_var)(*grid_size_entries),
_render_invocation(grid_size_var, reduced_block_size_var),
_render_invocation(grid_size_var, block_size_var),
]
return SfgBlock(SfgSequence(nodes))