Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Tom Harke
pystencils
Commits
771a9b22
Commit
771a9b22
authored
Mar 21, 2017
by
Martin Bauer
Browse files
Caching for jitted cpu and gpu kernels (big speedup for small work sizes)
parent
ff641ec9
Changes
5
Hide whitespace changes
Inline
Side-by-side
cpu/cpujit.py
View file @
771a9b22
...
...
@@ -428,12 +428,19 @@ def makePythonFunctionIncompleteParams(kernelFunctionNode, argumentDict):
func
.
restype
=
None
parameters
=
kernelFunctionNode
.
parameters
cache
=
{}
def
wrapper
(
**
kwargs
):
from
copy
import
copy
fullArguments
=
copy
(
argumentDict
)
fullArguments
.
update
(
kwargs
)
args
=
buildCTypeArgumentList
(
parameters
,
fullArguments
)
func
(
*
args
)
key
=
hash
(
tuple
((
k
,
id
(
v
))
for
k
,
v
in
kwargs
.
items
()))
try
:
args
=
cache
[
key
]
func
(
*
args
)
except
KeyError
:
fullArguments
=
argumentDict
.
copy
()
fullArguments
.
update
(
kwargs
)
args
=
buildCTypeArgumentList
(
parameters
,
fullArguments
)
cache
[
key
]
=
args
func
(
*
args
)
return
wrapper
field.py
View file @
771a9b22
...
...
@@ -95,7 +95,7 @@ class Field(object):
if
spatialDimensions
<
1
:
raise
ValueError
(
"Too many index dimensions. At least one spatial dimension required"
)
fullLayout
=
getLayout
FromNumpy
Array
(
npArray
)
fullLayout
=
getLayout
Of
Array
(
npArray
)
spatialLayout
=
tuple
([
i
for
i
in
fullLayout
if
i
<
spatialDimensions
])
assert
len
(
spatialLayout
)
==
spatialDimensions
...
...
@@ -349,11 +349,11 @@ def extractCommonSubexpressions(equations):
return
equations
def
getLayout
FromNumpy
Array
(
arr
,
indexDimensionIds
=
[]):
def
getLayout
Of
Array
(
arr
,
indexDimensionIds
=
[]):
"""
Returns a list indicating the memory layout (linearization order) of the numpy array.
Example:
>>> getLayout
FromNumpy
Array(np.zeros([3,3,3]))
>>> getLayout
Of
Array(np.zeros([3,3,3]))
(0, 1, 2)
In this example the loop over the zeroth coordinate should be the outermost loop,
...
...
@@ -369,6 +369,37 @@ def getLayoutFromNumpyArray(arr, indexDimensionIds=[]):
return
normalizeLayout
(
result
)
def
createNumpyArrayWithLayout
(
shape
,
layout
):
"""
Creates a numpy array with
:param shape: shape of the resulting array
:param layout: layout as tuple, where the coordinates are ordered from slow to fast
>>> res = createNumpyArrayWithLayout(shape=(2, 3, 4, 5), layout=(3, 2, 0, 1))
>>> res.shape
(2, 3, 4, 5)
>>> getLayoutOfArray(res)
(3, 2, 0, 1)
"""
assert
set
(
layout
)
==
set
(
range
(
len
(
shape
))),
"Wrong layout descriptor"
currentLayout
=
list
(
range
(
len
(
shape
)))
swaps
=
[]
for
i
in
range
(
len
(
layout
)):
if
currentLayout
[
i
]
!=
layout
[
i
]:
indexToSwapWith
=
currentLayout
.
index
(
layout
[
i
])
swaps
.
append
((
i
,
indexToSwapWith
))
currentLayout
[
i
],
currentLayout
[
indexToSwapWith
]
=
currentLayout
[
indexToSwapWith
],
currentLayout
[
i
]
assert
tuple
(
currentLayout
)
==
tuple
(
layout
)
shape
=
list
(
shape
)
for
a
,
b
in
swaps
:
shape
[
a
],
shape
[
b
]
=
shape
[
b
],
shape
[
a
]
res
=
np
.
empty
(
shape
,
order
=
'c'
)
for
a
,
b
in
reversed
(
swaps
):
res
=
res
.
swapaxes
(
a
,
b
)
return
res
def
normalizeLayout
(
layout
):
"""Takes a layout tuple and subtracts the minimum from all entries"""
minEntry
=
min
(
layout
)
...
...
@@ -382,14 +413,12 @@ def computeStrides(shape, layout):
:param layout: layout specification as tuple
:return: strides in elements, not in bytes
"""
layout
=
list
(
reversed
(
layout
))
N
=
len
(
shape
)
assert
len
(
layout
)
==
N
assert
len
(
set
(
layout
))
==
N
strides
=
[
0
]
*
N
product
=
1
for
i
in
range
(
N
):
j
=
layout
.
index
(
i
)
for
j
in
reversed
(
layout
):
strides
[
j
]
=
product
product
*=
shape
[
j
]
return
tuple
(
strides
)
...
...
gpucuda/cudajit.py
View file @
771a9b22
...
...
@@ -28,16 +28,24 @@ def makePythonFunction(kernelFunctionNode, argumentDict={}):
parameters
=
kernelFunctionNode
.
parameters
cache
=
{}
def
wrapper
(
**
kwargs
):
from
copy
import
copy
fullArguments
=
copy
(
argumentDict
)
fullArguments
.
update
(
kwargs
)
shape
=
_checkArguments
(
parameters
,
fullArguments
)
indexing
=
kernelFunctionNode
.
indexing
dictWithBlockAndThreadNumbers
=
indexing
.
getCallParameters
(
shape
)
args
=
_buildNumpyArgumentList
(
parameters
,
fullArguments
)
func
(
*
args
,
**
dictWithBlockAndThreadNumbers
)
key
=
hash
(
tuple
((
k
,
id
(
v
))
for
k
,
v
in
kwargs
.
items
()))
try
:
args
,
dictWithBlockAndThreadNumbers
=
cache
[
key
]
func
(
*
args
,
**
dictWithBlockAndThreadNumbers
)
except
KeyError
:
fullArguments
=
argumentDict
.
copy
()
fullArguments
.
update
(
kwargs
)
shape
=
_checkArguments
(
parameters
,
fullArguments
)
indexing
=
kernelFunctionNode
.
indexing
dictWithBlockAndThreadNumbers
=
indexing
.
getCallParameters
(
shape
)
args
=
_buildNumpyArgumentList
(
parameters
,
fullArguments
)
cache
[
key
]
=
(
args
,
dictWithBlockAndThreadNumbers
)
func
(
*
args
,
**
dictWithBlockAndThreadNumbers
)
#cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called
return
wrapper
...
...
@@ -64,6 +72,7 @@ def _buildNumpyArgumentList(parameters, argumentDict):
param
=
argumentDict
[
arg
.
name
]
expectedType
=
arg
.
dtype
.
numpyDtype
result
.
append
(
expectedType
(
param
))
assert
len
(
result
)
==
len
(
parameters
)
return
result
...
...
gpucuda/indexing.py
View file @
771a9b22
import
abc
import
sympy
as
sp
import
math
import
pycuda.driver
as
cuda
...
...
@@ -8,68 +9,108 @@ from pystencils.astnodes import Conditional, Block
BLOCK_IDX
=
list
(
sp
.
symbols
(
"blockIdx.x blockIdx.y blockIdx.z"
))
THREAD_IDX
=
list
(
sp
.
symbols
(
"threadIdx.x threadIdx.y threadIdx.z"
))
# Part 1:
# given a field and the number of ghost layers, return the x, y and z coordinates
# dependent on CUDA thread and block indices
# Part 2:
# given the actual field size, determine the call parameters i.e. # of blocks and threads
class
AbstractIndexing
(
abc
.
ABCMeta
(
'ABC'
,
(
object
,),
{})):
"""
Abstract base class for all Indexing classes. An Indexing class defines how a multidimensional
field is mapped to CUDA's block and grid system. It calculates indices based on CUDA's thread and block indices
and computes the number of blocks and threads a kernel is started with. The Indexing class is created with
a pystencils field and the number of ghost layers as well as further optional parameters that must have default
values.
"""
class
LineIndexing
:
def
__init__
(
self
,
field
,
ghostLayers
):
availableIndices
=
[
THREAD_IDX
[
0
]]
+
BLOCK_IDX
if
field
.
spatialDimensions
>
4
:
raise
NotImplementedError
(
"This indexing scheme supports at most 4 spatial dimensions"
)
coordinates
=
availableIndices
[:
field
.
spatialDimensions
]
fastestCoordinate
=
field
.
layout
[
-
1
]
coordinates
[
0
],
coordinates
[
fastestCoordinate
]
=
coordinates
[
fastestCoordinate
],
coordinates
[
0
]
self
.
_coordiantesNoGhostLayer
=
coordinates
self
.
_coordinates
=
[
i
+
ghostLayers
for
i
in
coordinates
]
self
.
_ghostLayers
=
ghostLayers
@
abc
.
abstractproperty
def
coordinates
(
self
):
"""Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic CUDA block and thread indices.
These symbolic indices can be obtained with the method `indexVariables` """
@
property
def
coordinates
(
self
):
return
self
.
_coordinates
def
indexVariables
(
self
):
"""Sympy symbols for CUDA's block and thread indices"""
return
BLOCK_IDX
+
THREAD_IDX
@
abc
.
abstractmethod
def
getCallParameters
(
self
,
arrShape
):
def
getShapeOfCudaIdx
(
cudaIdx
):
if
cudaIdx
not
in
self
.
_coordiantesNoGhostLayer
:
return
1
else
:
return
arrShape
[
self
.
_coordiantesNoGhostLayer
.
index
(
cudaIdx
)]
-
2
*
self
.
_ghostLayers
"""
Determine grid and block size for kernel call
:param arrShape: the numeric (not symbolic) shape of the array
:return: dict with keys 'blocks' and 'threads' with tuple values for number of (x,y,z) threads and blocks
the kernel should be started with
"""
@
abc
.
abstractmethod
def
guard
(
self
,
kernelContent
,
arrShape
):
"""
In some indexing schemes not all threads of a block execute the kernel content.
This function can return a Conditional ast node, defining this execution guard.
:param kernelContent: the actual kernel contents which can e.g. be put into the Conditional node as true block
:param arrShape: the numeric or symbolic shape of the field
:return: ast node, which is put inside the kernel function
"""
return
{
'block'
:
tuple
([
getShapeOfCudaIdx
(
idx
)
for
idx
in
THREAD_IDX
]),
'grid'
:
tuple
([
getShapeOfCudaIdx
(
idx
)
for
idx
in
BLOCK_IDX
])}
def
guard
(
self
,
kernelContent
,
arrShape
):
return
kernelContent
# -------------------------------------------- Implementations ---------------------------------------------------------
@
property
def
indexVariables
(
self
):
return
BLOCK_IDX
+
THREAD_IDX
class
BlockIndexing
(
AbstractIndexing
):
"""Generic indexing scheme that maps sub-blocks of an array to CUDA blocks."""
class
BlockIndexing
:
def
__init__
(
self
,
field
,
ghostLayers
,
blockSize
=
(
256
,
8
,
1
),
permuteBlockSizeDependentOnLayout
=
True
):
"""
Creates
:param field: pystencils field (common to all Indexing classes)
:param ghostLayers: number of ghost layers (common to all Indexing classes)
:param blockSize: size of a CUDA block
:param permuteBlockSizeDependentOnLayout: if True the blockSize is permuted such that the fastest coordinate
gets the largest amount of threads
"""
if
field
.
spatialDimensions
>
3
:
raise
NotImplementedError
(
"This indexing scheme supports at most 3 spatial dimensions"
)
if
permuteBlockSizeDependentOnLayout
:
blockSize
=
self
.
permuteBlockSizeAccordingToLayout
(
blockSize
,
field
.
layout
)
self
.
_blockSize
=
self
.
limitBlockSizeToDeviceMaximum
(
blockSize
)
blockSize
=
self
.
limitBlockSizeToDeviceMaximum
(
blockSize
)
self
.
_blockSize
=
blockSize
self
.
_coordinates
=
[
blockIndex
*
bs
+
threadIndex
+
ghostLayers
for
blockIndex
,
bs
,
threadIndex
in
zip
(
BLOCK_IDX
,
blockSize
,
THREAD_IDX
)]
self
.
_coordinates
=
self
.
_coordinates
[:
field
.
spatialDimensions
]
self
.
_ghostLayers
=
ghostLayers
@
property
def
coordinates
(
self
):
return
self
.
_coordinates
def
getCallParameters
(
self
,
arrShape
):
dim
=
len
(
self
.
_coordinates
)
arrShape
=
arrShape
[:
dim
]
grid
=
tuple
(
math
.
ceil
(
length
/
blockSize
)
for
length
,
blockSize
in
zip
(
arrShape
,
self
.
_blockSize
))
extendBs
=
(
1
,)
*
(
3
-
len
(
self
.
_blockSize
))
extendGr
=
(
1
,)
*
(
3
-
len
(
grid
))
return
{
'block'
:
self
.
_blockSize
+
extendBs
,
'grid'
:
grid
+
extendGr
}
def
guard
(
self
,
kernelContent
,
arrShape
):
dim
=
len
(
self
.
_coordinates
)
arrShape
=
arrShape
[:
dim
]
conditions
=
[
c
<
shapeComponent
-
self
.
_ghostLayers
for
c
,
shapeComponent
in
zip
(
self
.
_coordinates
,
arrShape
)]
condition
=
conditions
[
0
]
for
c
in
conditions
[
1
:]:
condition
=
sp
.
And
(
condition
,
c
)
return
Block
([
Conditional
(
condition
,
kernelContent
)])
@
staticmethod
def
limitBlockSizeToDeviceMaximum
(
blockSize
):
"""
Changes blocksize according to match device limits according to the following rules:
1) if the total amount of threads is too big for the current device, the biggest coordinate is divided by 2.
2) next, if one component is still too big, the component which is too big is divided by 2 and the smallest
component is multiplied by 2, such that the total amount of threads stays the same
Returns the altered blockSize
"""
# Get device limits
da
=
cuda
.
device_attribute
device
=
cuda
.
Context
.
get_device
()
...
...
@@ -117,7 +158,7 @@ class BlockIndexing:
@
staticmethod
def
permuteBlockSizeAccordingToLayout
(
blockSize
,
layout
):
"""
T
he fastest coordinate gets the biggest block dimension"""
"""
Returns modified blockSize such that t
he fastest coordinate gets the biggest block dimension"""
sortedBlockSize
=
list
(
sorted
(
blockSize
,
reverse
=
True
))
while
len
(
sortedBlockSize
)
>
len
(
layout
):
sortedBlockSize
[
0
]
*=
sortedBlockSize
[
-
1
]
...
...
@@ -128,34 +169,42 @@ class BlockIndexing:
result
[
l
]
=
bs
return
tuple
(
result
[:
len
(
layout
)])
class
LineIndexing
(
AbstractIndexing
):
"""
Indexing scheme that assigns the innermost 'line' i.e. the elements which are adjacent in memory to a 1D CUDA block.
The fastest coordinate is indexed with threadIdx.x, the remaining coordinates are mapped to blockIdx.{x,y,z}
This indexing scheme supports up to 4 spatial dimensions, where the innermost dimensions is not larger than the
maximum amount of threads allowed in a CUDA block (which depends on device).
"""
def
__init__
(
self
,
field
,
ghostLayers
):
availableIndices
=
[
THREAD_IDX
[
0
]]
+
BLOCK_IDX
if
field
.
spatialDimensions
>
4
:
raise
NotImplementedError
(
"This indexing scheme supports at most 4 spatial dimensions"
)
coordinates
=
availableIndices
[:
field
.
spatialDimensions
]
fastestCoordinate
=
field
.
layout
[
-
1
]
coordinates
[
0
],
coordinates
[
fastestCoordinate
]
=
coordinates
[
fastestCoordinate
],
coordinates
[
0
]
self
.
_coordiantesNoGhostLayer
=
coordinates
self
.
_coordinates
=
[
i
+
ghostLayers
for
i
in
coordinates
]
self
.
_ghostLayers
=
ghostLayers
@
property
def
coordinates
(
self
):
return
self
.
_coordinates
def
getCallParameters
(
self
,
arrShape
):
dim
=
len
(
self
.
_coordinates
)
arrShape
=
arrShape
[:
dim
]
grid
=
tuple
(
math
.
ceil
(
length
/
blockSize
)
for
length
,
blockSize
in
zip
(
arrShape
,
self
.
_blockSize
))
extendBs
=
(
1
,)
*
(
3
-
len
(
self
.
_blockSize
))
extendGr
=
(
1
,)
*
(
3
-
len
(
grid
))
return
{
'block'
:
self
.
_blockSize
+
extendBs
,
'grid'
:
grid
+
extendGr
}
def
guard
(
self
,
kernelContent
,
arrShape
):
dim
=
len
(
self
.
_coordinates
)
arrShape
=
arrShape
[:
dim
]
conditions
=
[
c
<
shapeComponent
-
self
.
_ghostLayers
for
c
,
shapeComponent
in
zip
(
self
.
_coordinates
,
arrShape
)]
condition
=
conditions
[
0
]
for
c
in
conditions
[
1
:]:
condition
=
sp
.
And
(
condition
,
c
)
return
Block
([
Conditional
(
condition
,
kernelContent
)])
def
getShapeOfCudaIdx
(
cudaIdx
):
if
cudaIdx
not
in
self
.
_coordiantesNoGhostLayer
:
return
1
else
:
return
arrShape
[
self
.
_coordiantesNoGhostLayer
.
index
(
cudaIdx
)]
-
2
*
self
.
_ghostLayers
@
property
def
indexVariables
(
self
):
return
BLOCK_IDX
+
THREAD_IDX
return
{
'block'
:
tuple
([
getShapeOfCudaIdx
(
idx
)
for
idx
in
THREAD_IDX
]),
'grid'
:
tuple
([
getShapeOfCudaIdx
(
idx
)
for
idx
in
BLOCK_IDX
])}
if
__name__
==
'__main__'
:
bs
=
BlockIndexing
.
permuteBlockSizeAccordingToLayout
((
256
,
8
,
1
),
(
0
,))
bs
=
BlockIndexing
.
limitBlockSizeToDeviceMaximum
(
bs
)
print
(
bs
)
def
guard
(
self
,
kernelContent
,
arrShape
):
return
kernelContent
gpucuda/kernelcreation.py
View file @
771a9b22
from
pystencils.gpucuda.indexing
import
BlockIndexing
,
LineIndexing
from
pystencils.gpucuda.indexing
import
BlockIndexing
from
pystencils.transformations
import
resolveFieldAccesses
,
typeAllEquations
,
parseBasePointerInfo
,
getCommonShape
from
pystencils.astnodes
import
Block
,
KernelFunction
,
SympyAssignment
from
pystencils.types
import
TypedSymbol
,
BasicType
,
StructType
...
...
@@ -15,7 +15,7 @@ def createCUDAKernel(listOfEquations, functionName="kernel", typeForSymbol=None,
fieldAccesses
.
update
(
eq
.
atoms
(
Field
.
Access
))
requiredGhostLayers
=
max
([
fa
.
requiredGhostLayers
for
fa
in
fieldAccesses
])
indexing
=
indexingCreator
(
list
(
fieldsRead
)[
0
],
requiredGhostLayers
)
indexing
=
indexingCreator
(
field
=
list
(
fieldsRead
)[
0
],
ghostLayers
=
requiredGhostLayers
)
block
=
Block
(
assignments
)
block
=
indexing
.
guard
(
block
,
getCommonShape
(
allFields
))
...
...
@@ -63,7 +63,7 @@ def createdIndexedCUDAKernel(listOfEquations, indexFields, functionName="kernel"
coordinateSymbolAssignments
=
[
getCoordinateSymbolAssignment
(
n
)
for
n
in
coordinateNames
[:
spatialCoordinates
]]
coordinateTypedSymbols
=
[
eq
.
lhs
for
eq
in
coordinateSymbolAssignments
]
indexing
=
indexingCreator
(
list
(
indexFields
)[
0
],
ghostLayers
=
0
)
indexing
=
indexingCreator
(
field
=
list
(
indexFields
)[
0
],
ghostLayers
=
0
)
functionBody
=
Block
(
coordinateSymbolAssignments
+
assignments
)
functionBody
=
indexing
.
guard
(
functionBody
,
getCommonShape
(
indexFields
))
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment