Stephan Seitz / pystencils

Commit 0800d84a, authored Dec 11, 2019 by Stephan Seitz

Add auto-tuning for CUDA call parameters

Parent: f9ba7391
Pipeline #20471 passed in 4 minutes and 44 seconds
Changes: 5 files
pystencils/astnodes.py

@@ -178,6 +178,11 @@ class KernelFunction(Node):
        self.instruction_set = None  # used in `vectorize` function to tell the backend which i.s. (SSE,AVX) to use
        # function that compiles the node to a Python callable, is set by the backends
        self._compile_function = compile_function
        self._autotune_options = None

    @property
    def do_cudaautotune(self):
        return self._autotune_options is not None

    @property
    def target(self):
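The hunk above introduces a simple sentinel pattern: _autotune_options stays None unless auto-tuning is requested, and the read-only do_cudaautotune property merely reports whether it was set. A minimal, self-contained sketch of that behaviour (the class below is an illustrative stand-in, not the real KernelFunction):

class AutotuneFlagSketch:
    """Illustrative stand-in for the sentinel/property pattern added to KernelFunction."""

    def __init__(self):
        self._autotune_options = None  # stays None unless auto-tuning is requested

    @property
    def do_cudaautotune(self):
        return self._autotune_options is not None


node = AutotuneFlagSketch()
assert not node.do_cudaautotune
node._autotune_options = True  # what kernelcreation.py does when autotune_cuda_callparameters=True
assert node.do_cudaautotune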
pystencils/gpucuda/cudajit.py

from functools import partial

import numpy as np

import pystencils
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.data_types import StructType
from pystencils.field import FieldType

@@ -77,11 +80,6 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backend
            full_arguments.update(kwargs)
            shape = _check_arguments(parameters, full_arguments)

            indexing = kernel_function_node.indexing
            block_and_thread_numbers = indexing.call_parameters(shape)
            block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
            block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])

            # TODO: use texture objects:
            # https://devblogs.nvidia.com/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/
            for tex in textures:

@@ -89,6 +87,21 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backend
                ndarray_to_tex(tex_ref, full_arguments[tex.field.name], tex.address_mode, tex.filter_mode,
                               tex.use_normalized_coordinates, tex.read_as_integer)
            args = _build_numpy_argument_list(parameters, full_arguments)
            indexing = kernel_function_node.indexing
            if kernel_function_node.do_cudaautotune:
                block_and_thread_numbers = (indexing.autotune_call_parameters(partial(func, *args),
                                                                              shape,
                                                                              kernel_function_node.function_name,
                                                                              tuple((k, v.strides, v.shape)
                                                                                    for k, v in kwargs.items()
                                                                                    if (isinstance(v, pycuda.gpuarray.GPUArray)))
                                                                              + (str(pystencils.show_code(kernel_function_node)),)))
            else:
                block_and_thread_numbers = indexing.call_parameters(shape)
                block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
                block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
            cache[key] = (args, block_and_thread_numbers)
            cache_values.append(kwargs)  # keep objects alive such that ids remain unique
            func(*args, **block_and_thread_numbers)
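In the autotuning branch above, all kernel arguments are bound up front with functools.partial, so the tuner only has to supply the launch parameters; the docstring in indexing.py below describes the result as a "Partial PyCUDA function with parameters block and grid missing". A small self-contained illustration of that pattern (launch is a hypothetical stand-in for the compiled PyCUDA kernel func):

from functools import partial


def launch(arg_a, arg_b, block, grid):
    # hypothetical stand-in for the compiled PyCUDA kernel `func`
    print(f'launch(arg_a={arg_a}, arg_b={arg_b}, block={block}, grid={grid})')


bound = partial(launch, 'ptr_a', 'ptr_b')   # corresponds to partial(func, *args) above
bound(block=(128, 1, 1), grid=(8, 1, 1))    # the autotuner only supplies block and grid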
pystencils/gpucuda/indexing.py

import abc
import timeit
from functools import partial

import sympy as sp
from sympy.core.cache import cacheit

from pystencils.astnodes import Block, Conditional
from pystencils.cache import disk_cache
from pystencils.data_types import TypedSymbol, create_type
from pystencils.integer_functions import div_ceil, div_floor
from pystencils.slicing import normalize_slice

@@ -83,6 +85,60 @@ class AbstractIndexing(abc.ABC):
    def symbolic_parameters(self):
        """Set of symbols required in call_parameters code"""

    def autotune_call_parameters(self, partial_function, call_shape, function_name, magic_hash):
        """Autotune call parameters for a specific kernel call

        Tries to find the optimum call parameters ``block``, ``grid`` for a kernel function.

        Args:
            partial_function: Partial PyCUDA function with parameters block and grid missing
        """
        import pycuda.driver

        @disk_cache
        def _autotune_call_parameters(self,
                                      call_shape,
                                      num_profile_calls,
                                      function_name,
                                      block_sizes,
                                      magic_hash  # needed for disk_cache
                                      ):
            BIG_NUMBER = 100000000
            current_best = self.call_parameters(call_shape)
            best_timing = BIG_NUMBER
            print(f'Autotuning function {function_name}')
            for block_size in block_sizes:
                self._block_size = block_size
                if isinstance(self, BlockIndexing):
                    self._block_size = (BlockIndexing.permute_block_size_according_to_layout(self._block_size,
                                                                                             self._layout))
                block_and_thread_numbers = self.call_parameters(call_shape)
                block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
                block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])

                # TODO(seitz) can we use the CUDA profiler?: pycuda.driver.start_profiler()
                def profile_call():
                    for i in range(num_profile_calls):
                        partial_function(**block_and_thread_numbers)
                    pycuda.driver.Context.synchronize()

                current_time = timeit.timeit(profile_call, number=1)
                print(f'{block_size} takes {current_time} ({num_profile_calls})')
                if current_time < best_timing:
                    best_timing = current_time
                    current_best = block_and_thread_numbers
            print(f'{current_best} is the best out of {self._autotune_block_sizes or self.AUTOTUNE_BLOCK_SIZES}')
            self._block_size = current_best
            return current_best

        return _autotune_call_parameters(self,
                                         call_shape,
                                         self.AUTOTUNE_NUM_CALLS,
                                         function_name,
                                         self._autotune_block_sizes or self.AUTOTUNE_BLOCK_SIZES,
                                         magic_hash)
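The core of autotune_call_parameters is a plain benchmark loop: launch each candidate configuration a fixed number of times, synchronize the CUDA context so every queued launch is included in the measurement, and keep the fastest. A stripped-down, self-contained sketch of that idea (the helper below is hypothetical, not part of pystencils, and assumes pycuda with an active CUDA context):

import timeit

import pycuda.driver


def pick_best_block(kernel_call, candidate_blocks, num_calls=10):
    """Return the candidate block size with the lowest measured launch time.

    kernel_call(block=...) is expected to launch the kernel with the given block size.
    """
    best_block, best_time = None, float('inf')
    for block in candidate_blocks:
        def profile():
            for _ in range(num_calls):
                kernel_call(block=block)
            pycuda.driver.Context.synchronize()  # wait for all queued launches before stopping the clock

        elapsed = timeit.timeit(profile, number=1)
        if elapsed < best_time:
            best_block, best_time = block, elapsed
    return best_block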
# -------------------------------------------- Implementations ---------------------------------------------------------
@@ -97,6 +153,8 @@ class BlockIndexing(AbstractIndexing):
                             gets the largest amount of threads
        compile_time_block_size: compile in concrete block size, otherwise the cuda variable 'blockDim' is used
    """
    AUTOTUNE_BLOCK_SIZES = ((16, 16, 1), (32, 1, 1), (64, 1, 1), (96, 1, 1), (128, 1, 1), (160, 1, 1), (192, 1, 1),)
    AUTOTUNE_NUM_CALLS = 10

    def __init__(self, field, iteration_slice,
                 block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,

@@ -118,14 +176,17 @@ class BlockIndexing(AbstractIndexing):
            maximum_block_size = tuple(device.get_attribute(a)
                                       for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z))

        self._layout = field.layout
        self._maximum_block_size = maximum_block_size
        self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
        self._dim = field.spatial_dimensions
        self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
        self._compile_time_block_size = compile_time_block_size
        self._autotune_block_sizes = None

    @property
    def coordinates(self):
        # TODO(seitz): require layout in constructor to rotate the thread indices: thread_idx == fastest
        offsets = _get_start_from_slice(self._iterationSlice)
        block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM
        coordinates = [block_index * bs + thread_idx + off

@@ -227,6 +288,8 @@ class LineIndexing(AbstractIndexing):
    This indexing scheme supports up to 4 spatial dimensions, where the innermost dimensions is not larger than the
    maximum amount of threads allowed in a CUDA block (which depends on device).
    """
    AUTOTUNE_BLOCK_SIZES = ((16, 1, 1), (32, 1, 1), (64, 1, 1), (96, 1, 1), (128, 1, 1), (160, 1, 1), (192, 1, 1),)
    AUTOTUNE_NUM_CALLS = 10

    def __init__(self, field, iteration_slice):
        available_indices = [THREAD_IDX[0]] + BLOCK_IDX
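Each entry in AUTOTUNE_BLOCK_SIZES is a candidate CUDA block shape; the matching grid is chosen by the indexing's call_parameters so that whole blocks cover the iteration space. A deliberately simplified stand-in, only to make that relationship concrete (the real computation also respects device limits and the field layout):

from math import ceil


def launch_config(spatial_shape, block):
    """Simplified stand-in for call_parameters: cover every spatial dimension with whole blocks."""
    grid = tuple(ceil(s / b) for s, b in zip(spatial_shape, block))
    return {'block': block, 'grid': grid}


print(launch_config((512, 512), (16, 16)))   # {'block': (16, 16), 'grid': (32, 32)}
print(launch_config((512, 512), (128, 1)))   # {'block': (128, 1), 'grid': (4, 512)}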
pystencils/kernelcreation.py

from itertools import combinations
from types import MappingProxyType

import sympy as sp

@@ -27,7 +27,8 @@ def create_kernel(assignments,
                  gpu_indexing_params=MappingProxyType({}),
                  use_textures_for_interpolation=True,
                  cpu_prepend_optimizations=[],
                  use_auto_for_assignments=False,
                  autotune_cuda_callparameters=False):
    """
    Creates abstract syntax tree (AST) of kernel, using a list of update equations.

@@ -121,6 +122,8 @@ def create_kernel(assignments,
        for a in ast.atoms(SympyAssignment):
            a.use_auto = True
    if autotune_cuda_callparameters:
        ast._autotune_options = True
    return ast
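With the new keyword in place, enabling auto-tuning from user code should only require passing it to create_kernel. A minimal usage sketch, assuming a CUDA-capable GPU with pycuda installed; the field setup mirrors the test below, while array sizes and variable names are illustrative:

import numpy as np
import pycuda.gpuarray as gpuarray

from pystencils import Assignment, Field, create_kernel

src_arr = np.random.rand(258, 258)
dst_arr = np.zeros_like(src_arr)
src = Field.create_from_numpy_array('src', src_arr)
dst = Field.create_from_numpy_array('dst', dst_arr)

update = Assignment(dst[0, 0], (src[0, 1] + src[0, -1] + src[1, 0] + src[-1, 0]) / 4)
ast = create_kernel(update, target='gpu', autotune_cuda_callparameters=True)
kernel = ast.compile()

kernel(src=gpuarray.to_gpu(src_arr), dst=gpuarray.to_gpu(dst_arr))

The first kernel call triggers the block-size search; because _autotune_call_parameters is wrapped in disk_cache with the strides, shapes and generated code as part of the key, later calls with the same setup reuse the cached result.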
pystencils_tests/test_cudagpu.py

import numpy as np
import pycuda.gpuarray as gpuarray
import pytest
import sympy as sp
from scipy.ndimage import convolve

@@ -35,6 +36,49 @@ def test_averaging_kernel():
    np.testing.assert_almost_equal(reference, dst_arr)


@pytest.mark.parametrize('use_3d', ('use_3d', False))
@pytest.mark.parametrize('use_fortran_layout', ('use_fortran_layout', False))
def test_autotuning(use_fortran_layout, use_3d):
    print(f'Use Fortan layout: {use_fortran_layout}')
    if use_3d:
        size = (256, 256, 256)
    else:
        size = (256, 256)
    src_arr = np.random.rand(*size)
    if use_fortran_layout:
        src_arr = np.asfortranarray(src_arr)
    src_arr = add_ghost_layers(src_arr)
    print(src_arr.strides)
    dst_arr = np.zeros_like(src_arr)

    src_field = Field.create_from_numpy_array('src', src_arr)
    dst_field = Field.create_from_numpy_array('dst', dst_arr)

    if use_3d:
        update_rules = (Assignment(dst_field[0, 0, 0],
                                   (src_field[0, 0, 1] + src_field[0, 0, -1]
                                    + src_field[0, 1, 0] + src_field[0, -1, 0]) / 4),
                        Assignment(dst_field[0, 0, 0],
                                   (src_field[1, 0, 0] + src_field[-1, 0, 0]
                                    + src_field[0, 1, 0] + src_field[0, -1, 0]) / 4))
    else:
        update_rules = (Assignment(dst_field[0, 0],
                                   (src_field[0, 1] + src_field[0, -1]
                                    + src_field[1, 0] + src_field[-1, 0]) / 4),
                        Assignment(dst_field[0, 0],
                                   (src_field[1, 0] + src_field[-1, 0]
                                    + src_field[0, 1] + src_field[0, -1]) / 4))

    for i in range(2):
        ast = create_cuda_kernel(sympy_cse_on_assignment_list([update_rules[i]]))
        ast._autotune_options = 1
        kernel = make_python_function(ast)

        gpu_src_arr = gpuarray.to_gpu(src_arr)
        gpu_dst_arr = gpuarray.to_gpu(dst_arr)
        kernel(src=gpu_src_arr, dst=gpu_dst_arr)
        gpu_dst_arr.get(dst_arr)


def test_variable_sized_fields():
    src_field = Field.create_generic('src', spatial_dimensions=2)
    dst_field = Field.create_generic('dst', spatial_dimensions=2)