Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Jonas Plewinski
pystencils
Commits
fbf355ec
Commit
fbf355ec
authored
Mar 29, 2017
by
Martin Bauer
Browse files
pystencils: CUDA kernels can now also iterate over slice
parent
882c84e6
Changes
2
Hide whitespace changes
Inline
Side-by-side
gpucuda/indexing.py
View file @
fbf355ec
...
...
@@ -5,6 +5,7 @@ import pycuda.driver as cuda
import
pycuda.autoinit
from
pystencils.astnodes
import
Conditional
,
Block
from
pystencils.slicing
import
normalizeSlice
BLOCK_IDX
=
list
(
sp
.
symbols
(
"blockIdx.x blockIdx.y blockIdx.z"
))
THREAD_IDX
=
list
(
sp
.
symbols
(
"threadIdx.x threadIdx.y threadIdx.z"
))
...
...
@@ -15,8 +16,7 @@ class AbstractIndexing(abc.ABCMeta('ABC', (object,), {})):
Abstract base class for all Indexing classes. An Indexing class defines how a multidimensional
field is mapped to CUDA's block and grid system. It calculates indices based on CUDA's thread and block indices
and computes the number of blocks and threads a kernel is started with. The Indexing class is created with
a pystencils field and the number of ghost layers as well as further optional parameters that must have default
values.
a pystencils field, a slice to iterate over, and further optional parameters that must have default values.
"""
@
abc
.
abstractproperty
...
...
@@ -55,12 +55,12 @@ class AbstractIndexing(abc.ABCMeta('ABC', (object,), {})):
class
BlockIndexing
(
AbstractIndexing
):
"""Generic indexing scheme that maps sub-blocks of an array to CUDA blocks."""
def
__init__
(
self
,
field
,
ghostLayers
,
blockSize
=
(
256
,
8
,
1
),
permuteBlockSizeDependentOnLayout
=
True
):
def
__init__
(
self
,
field
,
iterationSlice
=
None
,
blockSize
=
(
256
,
8
,
1
),
permuteBlockSizeDependentOnLayout
=
True
):
"""
Creates
:param field: pystencils field (common to all Indexing classes)
:param ghostLayers: number of ghost layers (common to all Indexing classes)
:param blockSize: size of a CUDA block
:param iterationSlice: slice that defines rectangular subarea which is iterated over
:param permuteBlockSizeDependentOnLayout: if True the blockSize is permuted such that the fastest coordinate
gets the largest amount of threads
"""
...
...
@@ -73,20 +73,21 @@ class BlockIndexing(AbstractIndexing):
blockSize
=
self
.
limitBlockSizeToDeviceMaximum
(
blockSize
)
self
.
_blockSize
=
blockSize
self
.
_coordinates
=
[
blockIndex
*
bs
+
threadIndex
+
gl
[
0
]
for
blockIndex
,
bs
,
threadIndex
,
gl
in
zip
(
BLOCK_IDX
,
blockSize
,
THREAD_IDX
,
ghostLayers
)]
self
.
_iterationSlice
=
iterationSlice
offsets
=
_getStartFromSlice
(
self
.
_iterationSlice
)
self
.
_coordinates
=
[
blockIndex
*
bs
+
threadIndex
+
off
for
blockIndex
,
bs
,
threadIndex
,
off
in
zip
(
BLOCK_IDX
,
blockSize
,
THREAD_IDX
,
offsets
)]
self
.
_coordinates
=
self
.
_coordinates
[:
field
.
spatialDimensions
]
self
.
_ghostLayers
=
ghostLayers
@
property
def
coordinates
(
self
):
return
self
.
_coordinates
def
getCallParameters
(
self
,
arrShape
):
dim
=
len
(
self
.
_coordinates
)
arrShape
=
[
s
-
(
gl
[
0
]
+
gl
[
1
])
for
s
,
gl
in
zip
(
arrShape
[:
dim
],
self
.
_ghostLayers
)]
grid
=
tuple
(
math
.
ceil
(
length
/
blockSize
)
for
length
,
blockSize
in
zip
(
arrShape
,
self
.
_blockSize
))
widths
=
[
end
-
start
for
start
,
end
in
zip
(
_getStartFromSlice
(
self
.
_iterationSlice
),
_getEndFromSlice
(
self
.
_iterationSlice
,
arrShape
)
)]
grid
=
tuple
(
math
.
ceil
(
length
/
blockSize
)
for
length
,
blockSize
in
zip
(
widths
,
self
.
_blockSize
))
extendBs
=
(
1
,)
*
(
3
-
len
(
self
.
_blockSize
))
extendGr
=
(
1
,)
*
(
3
-
len
(
grid
))
return
{
'block'
:
self
.
_blockSize
+
extendBs
,
...
...
@@ -95,8 +96,8 @@ class BlockIndexing(AbstractIndexing):
def
guard
(
self
,
kernelContent
,
arrShape
):
dim
=
len
(
self
.
_coordinates
)
arrShape
=
arrShape
[:
dim
]
conditions
=
[
c
<
shapeComponent
-
gl
[
1
]
for
c
,
shapeComponent
,
gl
in
zip
(
self
.
_coordinates
,
arrShape
,
self
.
_ghostLayers
)]
conditions
=
[
c
<
end
for
c
,
end
in
zip
(
self
.
_coordinates
,
_getEndFromSlice
(
self
.
_iterationSlice
,
arrShape
)
)]
condition
=
conditions
[
0
]
for
c
in
conditions
[
1
:]:
condition
=
sp
.
And
(
condition
,
c
)
...
...
@@ -178,7 +179,7 @@ class LineIndexing(AbstractIndexing):
maximum amount of threads allowed in a CUDA block (which depends on device).
"""
def
__init__
(
self
,
field
,
ghostLayers
):
def
__init__
(
self
,
field
,
iterationSlice
=
None
):
availableIndices
=
[
THREAD_IDX
[
0
]]
+
BLOCK_IDX
if
field
.
spatialDimensions
>
4
:
raise
NotImplementedError
(
"This indexing scheme supports at most 4 spatial dimensions"
)
...
...
@@ -188,24 +189,51 @@ class LineIndexing(AbstractIndexing):
fastestCoordinate
=
field
.
layout
[
-
1
]
coordinates
[
0
],
coordinates
[
fastestCoordinate
]
=
coordinates
[
fastestCoordinate
],
coordinates
[
0
]
self
.
_coordiantesNoGhostLayer
=
coordinates
self
.
_coordinates
=
[
i
+
gl
[
0
]
for
i
,
gl
in
zip
(
coordinates
,
ghostLayers
)]
self
.
_ghostLayers
=
ghostLayers
self
.
_coordinates
=
coordinates
self
.
_iterationSlice
=
iterationSlice
@
property
def
coordinates
(
self
):
return
self
.
_coordinates
return
[
i
+
offset
for
i
,
offset
in
zip
(
self
.
_coordinates
,
_getStartFromSlice
(
self
.
_iterationSlice
))]
def
getCallParameters
(
self
,
arrShape
):
widths
=
[
end
-
start
for
start
,
end
in
zip
(
_getStartFromSlice
(
self
.
_iterationSlice
),
_getEndFromSlice
(
self
.
_iterationSlice
,
arrShape
))]
def
getShapeOfCudaIdx
(
cudaIdx
):
if
cudaIdx
not
in
self
.
_coordi
a
ntes
NoGhostLayer
:
if
cudaIdx
not
in
self
.
_coordin
a
tes
:
return
1
else
:
idx
=
self
.
_coordi
a
ntes
NoGhostLayer
.
index
(
cudaIdx
)
return
arrShape
[
idx
]
-
(
self
.
_ghostLayers
[
idx
][
0
]
+
self
.
_ghostLayer
s
[
idx
]
[
1
])
idx
=
self
.
_coordin
a
tes
.
index
(
cudaIdx
)
return
width
s
[
idx
]
return
{
'block'
:
tuple
([
getShapeOfCudaIdx
(
idx
)
for
idx
in
THREAD_IDX
]),
'grid'
:
tuple
([
getShapeOfCudaIdx
(
idx
)
for
idx
in
BLOCK_IDX
])}
def
guard
(
self
,
kernelContent
,
arrShape
):
return
kernelContent
# -------------------------------------- Helper functions --------------------------------------------------------------
def
_getStartFromSlice
(
iterationSlice
):
res
=
[]
for
sliceComponent
in
iterationSlice
:
if
type
(
sliceComponent
)
is
slice
:
res
.
append
(
sliceComponent
.
start
if
sliceComponent
.
start
is
not
None
else
0
)
else
:
assert
isinstance
(
sliceComponent
,
int
)
res
.
append
(
sliceComponent
)
return
res
def
_getEndFromSlice
(
iterationSlice
,
arrShape
):
iterSlice
=
normalizeSlice
(
iterationSlice
,
arrShape
)
res
=
[]
for
sliceComponent
in
iterSlice
:
if
type
(
sliceComponent
)
is
slice
:
res
.
append
(
sliceComponent
.
stop
)
else
:
assert
isinstance
(
sliceComponent
,
int
)
res
.
append
(
sliceComponent
+
1
)
return
res
\ No newline at end of file
gpucuda/kernelcreation.py
View file @
fbf355ec
...
...
@@ -5,8 +5,8 @@ from pystencils.types import TypedSymbol, BasicType, StructType
from
pystencils
import
Field
def
createCUDAKernel
(
listOfEquations
,
functionName
=
"kernel"
,
typeForSymbol
=
None
,
indexingCreator
=
BlockIndexing
,
ghostLayers
=
None
):
def
createCUDAKernel
(
listOfEquations
,
functionName
=
"kernel"
,
typeForSymbol
=
None
,
indexingCreator
=
BlockIndexing
,
iterationSlice
=
None
,
ghostLayers
=
None
):
fieldsRead
,
fieldsWritten
,
assignments
=
typeAllEquations
(
listOfEquations
,
typeForSymbol
)
allFields
=
fieldsRead
.
union
(
fieldsWritten
)
readOnlyFields
=
set
([
f
.
name
for
f
in
fieldsRead
-
fieldsWritten
])
...
...
@@ -16,13 +16,22 @@ def createCUDAKernel(listOfEquations, functionName="kernel", typeForSymbol=None,
fieldAccesses
.
update
(
eq
.
atoms
(
Field
.
Access
))
commonShape
=
getCommonShape
(
allFields
)
if
ghostLayers
is
None
:
requiredGhostLayers
=
max
([
fa
.
requiredGhostLayers
for
fa
in
fieldAccesses
])
ghostLayers
=
[(
requiredGhostLayers
,
requiredGhostLayers
)]
*
len
(
commonShape
)
if
isinstance
(
ghostLayers
,
int
):
ghostLayers
=
[(
ghostLayers
,
ghostLayers
)]
*
len
(
commonShape
)
indexing
=
indexingCreator
(
field
=
list
(
fieldsRead
)[
0
],
ghostLayers
=
ghostLayers
)
if
iterationSlice
is
None
:
# determine iteration slice from ghost layers
if
ghostLayers
is
None
:
# determine required number of ghost layers from field access
requiredGhostLayers
=
max
([
fa
.
requiredGhostLayers
for
fa
in
fieldAccesses
])
ghostLayers
=
[(
requiredGhostLayers
,
requiredGhostLayers
)]
*
len
(
commonShape
)
iterationSlice
=
[]
if
isinstance
(
ghostLayers
,
int
):
for
i
in
range
(
len
(
commonShape
)):
iterationSlice
.
append
(
slice
(
ghostLayers
[
i
],
-
ghostLayers
[
i
]))
else
:
for
i
in
range
(
len
(
commonShape
)):
iterationSlice
.
append
(
slice
(
ghostLayers
[
i
][
0
],
-
ghostLayers
[
i
][
1
]))
indexing
=
indexingCreator
(
field
=
list
(
fieldsRead
)[
0
],
iterationSlice
=
iterationSlice
)
block
=
Block
(
assignments
)
block
=
indexing
.
guard
(
block
,
commonShape
)
...
...
@@ -71,7 +80,7 @@ def createdIndexedCUDAKernel(listOfEquations, indexFields, functionName="kernel"
coordinateTypedSymbols
=
[
eq
.
lhs
for
eq
in
coordinateSymbolAssignments
]
idxField
=
list
(
indexFields
)[
0
]
indexing
=
indexingCreator
(
field
=
idxField
,
ghostLayers
=
[(
0
,
0
)]
*
len
(
idxField
.
shape
))
indexing
=
indexingCreator
(
field
=
idxField
,
iterationSlice
=
[
slice
(
None
,
None
,
None
)]
*
len
(
idxField
.
s
patialS
hape
))
functionBody
=
Block
(
coordinateSymbolAssignments
+
assignments
)
functionBody
=
indexing
.
guard
(
functionBody
,
getCommonShape
(
indexFields
))
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment