ExaStencils / exastencils-release · Commits

Commit c312aadd, authored Mar 03, 2022 by Richard Angersbach

Merge branch 'devel/fix_cuda_reduction_vect' into 'devel/cuda-enhancements'

# Conflicts:
#   .gitlab-ci.yml

Parents: f620c1ea, a1d0f525
Changes: 7 files
.gitlab-ci.yml

(diff collapsed)
Compiler/src/exastencils/optimization/ir/IR_Vectorization.scala

@@ -79,6 +79,7 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
       case ex : VectorizationException =>
         if (DEBUG) {
           val msg : String = "[vect] unable to vectorize loop: " + ex.msg + " (line " + ex.getStackTrace()(0).getLineNumber + ')'
           Logger.warn(msg)
+          println(msg) // print directly, logger may be silenced by any surrounding strategy
           return List(IR_Comment(msg), node)
         }
@@ -144,6 +145,8 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
   private var alignedResidue : Long = -1
   private val nameTempl : String = "_vec%02d"
+  private var reductionVarArrayAccesses : Option[IR_ArrayAccess] = None

   // init
   pushScope()
@@ -241,6 +244,14 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
     def getAlignedResidue() : Long = {
       alignedResidue
     }

+    def setReductionArrayAccess(arrAcc : IR_ArrayAccess) = {
+      reductionVarArrayAccesses = Some(arrAcc)
+    }
+
+    def getReductionArrayAccess() = {
+      reductionVarArrayAccesses
+    }
   }

   private def containsVarAcc(node : IR_Node, varName : String) : Boolean = {
@@ -262,8 +273,12 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
     val ctx = new LoopCtx(itVar, incr)
     var postLoopStmt : IR_Statement = null
     if (reduction.isDefined) {
-      val target = reduction.get.target
+      val target = Duplicate(reduction.get.target)
       val operator = reduction.get.op

+      target match {
+        case arrAcc : IR_ArrayAccess => ctx.setReductionArrayAccess(arrAcc)
+        case _                       =>
+      }
       val (vecTmp : String, true) = ctx.getName(target)
       val identityElem : IR_Expression =
@@ -602,6 +617,11 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
   private def vectorizeExpr(expr : IR_Expression, ctx : LoopCtx) : IR_Expression = {
     expr match {
+      case arrAcc : IR_ArrayAccess if ctx.getReductionArrayAccess().contains(arrAcc) =>
+        // vec was already added to ctx and declared
+        val (vecTmp : String, false) = ctx.getName(expr)
+        IR_VariableAccess(vecTmp, SIMD_RealDatatype)
+
       // TODO: do not vectorize if base is not aligned?
       case IR_ArrayAccess(base, index, alignedBase) =>
         val (vecTmp : String, njuTmp : Boolean) = ctx.getName(expr)
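The new case in vectorizeExpr relies on LoopCtx handing out stable temporary names: the reduction accumulator is declared before the loop body is processed, so a later lookup for the same node must hit the cache rather than mint a new vector. A minimal sketch of that pattern, with made-up names and String keys standing in for the ExaStencils IR types:

    import scala.collection.mutable

    // Hypothetical stand-in for LoopCtx's name handling: remember one special
    // node (the reduction's array access) and route later lookups through the cache.
    object ReductionAwareNaming {
      private var reductionAccess: Option[String] = None // stands in for Option[IR_ArrayAccess]
      private val cache = mutable.Map[String, String]()
      private var counter = 0

      def setReductionArrayAccess(acc: String): Unit = reductionAccess = Some(acc)

      // mirrors LoopCtx.getName's (name, isNew) result
      def getName(expr: String): (String, Boolean) = {
        val isNew = !cache.contains(expr)
        val name = cache.getOrElseUpdate(expr, { counter += 1; f"_vec$counter%02d" })
        (name, isNew)
      }

      def vectorizeExpr(expr: String): String = expr match {
        // the accumulator was declared up front, so this lookup must not create a name
        case e if reductionAccess.contains(e) =>
          val (tmp, false) = getName(e) // the pattern asserts isNew == false, as in the diff
          tmp
        case e =>
          getName(e)._1
      }

      def main(args: Array[String]): Unit = {
        setReductionArrayAccess("sum[0]")
        getName("sum[0]")                // declared before the body is vectorized
        println(vectorizeExpr("sum[0]")) // _vec01 (reused, not redeclared)
        println(vectorizeExpr("a[i]"))   // _vec02
      }
    }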
Compiler/src/exastencils/parallelization/api/cuda/CUDA_ExtractDeviceCode.scala

@@ -114,7 +114,7 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
         loop.getAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION).contains(CUDA_Util.CUDA_BAND_START) =>

         // remove the annotation first to guarantee single application of this transformation.
-        loop.annotate(CUDA_Util.CUDA_LOOP_ANNOTATION)
+        loop.removeAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION)

         val parallelLoops = (x : IR_ForLoop) => {
           x.hasAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION) &&
@@ -140,11 +140,15 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
         val kernelCount = kernelFunctions.counterMap.getOrElse(fctNameCollector.getCurrentName, -1) + 1

-        val reduction = loop.parallelization.reduction
+        val reduction = Duplicate(loop.parallelization.reduction)
+        val redTarget = if (reduction.isDefined) Some(Duplicate(reduction.get.target)) else None

         // local variable for kernels with reductions
         val localTarget = if (reduction.isDefined)
-          Some(IR_VariableAccess(reduction.get.targetName + "_local_" + kernelCount, CUDA_Util.getReductionDatatype(reduction.get.target)))
+          Some(IR_VariableAccess(reduction.get.targetName + "_local_" + kernelCount, CUDA_Util.getReductionDatatype(redTarget.get)))
         else
           None
@@ -152,17 +156,17 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
         CUDA_GatherVariableAccesses.clear()
         CUDA_GatherVariableAccesses.kernelCount = kernelCount
         if (reduction.isDefined)
-          CUDA_GatherVariableAccesses.reductionTarget = Some(reduction.get.target)
+          CUDA_GatherVariableAccesses.reductionTarget = redTarget
         CUDA_GatherVariableAccesses.applyStandalone(IR_Scope(loop))

         // declare and init local reduction target
         if (localTarget.isDefined) {
           var decl = IR_VariableDeclaration(localTarget.get)
-          var initLocalTarget = CUDA_Util.getReductionDatatype(reduction.get.target) match {
+          var initLocalTarget = CUDA_Util.getReductionDatatype(redTarget.get) match {
             case _ : IR_ScalarDatatype =>
-              ListBuffer[IR_Statement](IR_Assignment(localTarget.get, reduction.get.target))
+              ListBuffer[IR_Statement](IR_Assignment(localTarget.get, redTarget.get))
             case mat : IR_MatrixDatatype =>
-              reduction.get.target match {
+              redTarget.get match {
                 case vAcc : IR_VariableAccess =>
                   IR_GenerateBasicMatrixOperations.loopSetSubmatrixMatPointer(vAcc, localTarget.get, mat.sizeN, mat.sizeM, mat.sizeN, 0, 0).body
@@ -218,7 +222,7 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
         // replace array accesses with accesses to function arguments
         // reduction var is not replaced, but later in IR_HandleReductions
         if (reduction.isDefined)
-          CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = Some(reduction.get.target)
+          CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = redTarget
         else
           CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = None

         CUDA_ReplaceNonReductionVarArrayAccesses.applyStandalone(IR_Scope(kernelBody))
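Most of this change swaps direct references to reduction.get.target for a deep copy held in redTarget. The hazard being avoided: IR nodes are mutable, and later strategies rewrite them in place, so every consumer still holding the same node instance silently sees the rewrite. A tiny illustration of that aliasing problem with a made-up node type (Duplicate in the real code performs a deep copy of the node):

    // Made-up mutable node; ExaStencils IR nodes behave similarly under in-place rewrites.
    final case class VarAccess(var name: String)

    object AliasingDemo {
      def main(args: Array[String]): Unit = {
        val target = VarAccess("redVar")

        val alias = target        // shares the node
        val copy  = target.copy() // stand-in for Duplicate(target)

        // a later transformation renames the node in place
        target.name = "redVar_local_0"

        println(alias.name) // redVar_local_0 -- changed along with target
        println(copy.name)  // redVar -- isolated from the rewrite
      }
    }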
Compiler/src/exastencils/parallelization/api/cuda/CUDA_HandleFragmentLoopsWithReduction.scala
@@ -38,17 +38,18 @@ case class CUDA_HandleFragmentLoopsWithReduction(
   val iter = IR_LoopOverFragments.defIt
-  val redTarget = reduction.target
+  val red = Duplicate(reduction)
+  val redTarget = Duplicate(red.target)

   val reductionDt = CUDA_Util.getReductionDatatype(redTarget)
-  val counter = CUDA_HandleFragmentLoopsWithReduction.getReductionCounter(reduction.targetName)
+  val counter = CUDA_HandleFragmentLoopsWithReduction.getReductionCounter(red.targetName)

   val copies = {
     val innerDt = reductionDt match {
       case scalar : IR_ScalarDatatype          => scalar
       case hodt : IR_HigherDimensionalDatatype => IR_ArrayDatatype(hodt.resolveBaseDatatype, hodt.getSizeArray.product)
     }
-    IR_VariableAccess(reduction.targetName + "_" + counter, IR_ArrayDatatype(innerDt, Knowledge.domain_numFragmentsPerBlock))
+    IR_VariableAccess(red.targetName + "_fragCpy" + counter, IR_ArrayDatatype(innerDt, Knowledge.domain_numFragmentsPerBlock))
   }
   val currCopy = IR_ArrayAccess(copies, iter)
@@ -80,11 +81,17 @@ case class CUDA_HandleFragmentLoopsWithReduction(
       matrixAssignment("std::copy", redTarget, currCopy, hodt.getSizeArray.product)
   }

-  def initCopies() = ListBuffer(
-    IR_VariableDeclaration(copies), // declare copies
-    IR_LoopOverFragments( // init copies
-      copyReductionTarget()),
-    resetReductionTarget()) // reset initial value as it is already in the copies
+  def initCopies() = {
+    val declCopies = IR_VariableDeclaration(copies)
+    val initCopies = IR_LoopOverFragments(copyReductionTarget()).expandSpecial().inner
+    val resetRedTarget = resetReductionTarget() // reset initial value as it is already in the copies
+
+    ListBuffer(declCopies, initCopies, resetRedTarget)
+  }

   def finalizeReduction(body : ListBuffer[IR_Statement]) = {
     // finalize reduction
@@ -99,10 +106,10 @@ case class CUDA_HandleFragmentLoopsWithReduction(
         val src = IR_ArrayAccess(currCopy, idx)

         IR_ForLoop(IR_VariableDeclaration(i, IR_IntegerConstant(0)), IR_Lower(i, mat.sizeM), IR_PreIncrement(i), ListBuffer[IR_Statement](
           IR_ForLoop(IR_VariableDeclaration(j, 0), IR_Lower(j, mat.sizeN), IR_PreIncrement(j), ListBuffer[IR_Statement](
-            IR_Assignment(dst, IR_BinaryOperators.createExpression(reduction.op, dst, src))))))
+            IR_Assignment(dst, IR_BinaryOperators.createExpression(red.op, dst, src))))))
       case _ : IR_ScalarDatatype =>
-        IR_Assignment(redTarget, IR_BinaryOperators.createExpression(reduction.op, redTarget, currCopy))
+        IR_Assignment(redTarget, IR_BinaryOperators.createExpression(red.op, redTarget, currCopy))
     }

     body :+ assign
@@ -110,21 +117,21 @@ case class CUDA_HandleFragmentLoopsWithReduction(
   def replaceAccesses(body : ListBuffer[IR_Statement]) = {
     // replace occurrences
-    CUDA_ReplaceReductionAccesses.redTarget = redTarget
-    CUDA_ReplaceReductionAccesses.replacement = currCopy
+    CUDA_ReplaceReductionAccesses.redTarget = Duplicate(redTarget)
+    CUDA_ReplaceReductionAccesses.replacement = Duplicate(currCopy)
     CUDA_ReplaceReductionAccesses.applyStandalone(IR_Scope(body))
   }

   def addHandling(loop : IR_ForLoop) = {
     replaceAccesses(loop.body)
     loop.body = finalizeReduction(loop.body)
-    initCopies() :+ loop
+    initCopies() :+ Duplicate(loop)
   }

   def addHandling(loop : IR_LoopOverFragments) = {
     replaceAccesses(loop.body)
     loop.body = finalizeReduction(loop.body)
-    initCopies() :+ loop
+    initCopies() :+ Duplicate(loop)
   }

   override def expand() : OutputType = {
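The renamed per-fragment buffer (targetName + "_fragCpy" + counter) gives each fragment its own accumulator slot, so concurrent fragment loops never race on the one reduction variable; finalizeReduction then folds the slots back with the reduction operator. A plain-Scala sketch of that overall shape (the generated code emits IR nodes rather than executing this, and the copy/reset choreography of copyReductionTarget/resetReductionTarget is elided):

    object FragmentReductionSketch {
      // stand-in for the partial result fragment f's kernel would produce
      private def kernelPartial(f: Int): Double = f + 1.0

      def main(args: Array[String]): Unit = {
        val numFragments = 4 // Knowledge.domain_numFragmentsPerBlock in the real code
        var redTarget = 10.0 // reduction variable, carrying an initial value

        // one slot per fragment: "<targetName>_fragCpy<counter>" in the generated code
        val redTarget_fragCpy0 = Array.fill(numFragments)(0.0)

        // each fragment accumulates only into its own slot
        for (f <- 0 until numFragments)
          redTarget_fragCpy0(f) += kernelPartial(f)

        // finalizeReduction: fold the slots back with the reduction operator (here '+')
        redTarget = redTarget_fragCpy0.foldLeft(redTarget)(_ + _)

        println(redTarget) // 10 + 1 + 2 + 3 + 4 = 20.0
      }
    }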
Compiler/src/exastencils/parallelization/api/cuda/CUDA_Kernel.scala
@@ -593,12 +593,12 @@ case class CUDA_Kernel(
     var body = ListBuffer[IR_Statement]()

     if (reduction.isDefined) {
-      def target = reduction.get.target
-      def resultDt = CUDA_Util.getReductionDatatype(target)
-      def baseDt = resultDt.resolveBaseDatatype
+      val target = Duplicate(reduction.get.target)
+      val resultDt = CUDA_Util.getReductionDatatype(target)
+      val baseDt = resultDt.resolveBaseDatatype

-      def bufSize = requiredThreadsPerDim.product
-      def bufAccess = CUDA_ReductionDeviceData(bufSize, resultDt)
+      val bufSize = requiredThreadsPerDim.product
+      val bufAccess = CUDA_ReductionDeviceData(bufSize, resultDt)

       var callArgsReduction = ListBuffer[IR_Expression](bufAccess, bufSize)

       body += CUDA_Memset(bufAccess, 0, bufSize, resultDt)
@@ -621,7 +621,7 @@ case class CUDA_Kernel(
         IR_Return(Some(callDefaultReductionKernel))
       })

-      CUDA_KernelFunctions.get.requiredRedKernels += Tuple2(reduction.get.op, target) // request reduction kernel and wrapper
+      CUDA_KernelFunctions.get.requiredRedKernels += Tuple2(reduction.get.op, Duplicate(target)) // request reduction kernel and wrapper
     } else {
       body += CUDA_FunctionCall(getKernelFctName, callArgs, numBlocksPerDim, numThreadsPerBlock)
     }
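Beyond the added Duplicate, the def-to-val switch changes evaluation semantics: a def re-evaluates its right-hand side at every use, so each mention of bufAccess previously manufactured a fresh CUDA_ReductionDeviceData node, while a val builds the node once and shares it. A self-contained illustration of the difference:

    object DefVsVal {
      final case class Node(id: Int)
      private var built = 0
      private def makeNode(): Node = { built += 1; Node(built) }

      def main(args: Array[String]): Unit = {
        def viaDef = makeNode()  // re-evaluated on every access
        val a = viaDef; val b = viaDef
        println((a, b, built))   // (Node(1),Node(2),2): two distinct nodes

        built = 0
        val viaVal = makeNode()  // evaluated exactly once
        val c = viaVal; val d = viaVal
        println((c, d, built))   // (Node(1),Node(1),1): one shared node
      }
    }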
Compiler/src/exastencils/parallelization/api/cuda/CUDA_Reduction.scala
@@ -88,8 +88,8 @@ object CUDA_HandleReductions extends DefaultStrategy("Handle reductions in devic
       }

       // update local target
-      CUDA_ReplaceReductionAssignments.redTarget = target
-      CUDA_ReplaceReductionAssignments.replacement = kernel.localReductionTarget.get
+      CUDA_ReplaceReductionAssignments.redTarget = Duplicate(target)
+      CUDA_ReplaceReductionAssignments.replacement = Duplicate(kernel.localReductionTarget.get)
       CUDA_ReplaceReductionAssignments.applyStandalone(IR_Scope(kernel.body))

       // set element in global reduction buffer to local result
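CUDA_ReplaceReductionAssignments is one of several small replace strategies in this commit that are configured through two fields (redTarget, replacement) and then applied standalone to a scope; the fix duplicates both so the strategy's later in-place rewrites cannot leak back into the originals. A minimal sketch of such a configure-then-apply replace pass over a toy AST (hypothetical mini-IR, not the ExaStencils node types):

    sealed trait Expr
    final case class Access(name: String)                extends Expr
    final case class BinOp(op: String, l: Expr, r: Expr) extends Expr

    // Toy analogue of a configure-then-apply replace strategy.
    object ReplaceAccesses {
      var redTarget: String = ""
      var replacement: Expr = Access("")

      def applyStandalone(e: Expr): Expr = e match {
        case Access(n) if n == redTarget => replacement
        case BinOp(op, l, r)             => BinOp(op, applyStandalone(l), applyStandalone(r))
        case other                       => other
      }

      def main(args: Array[String]): Unit = {
        redTarget = "redVar"
        replacement = Access("redVar_local_0") // redirect to the kernel-local shadow
        val body = BinOp("+", Access("redVar"), Access("x"))
        println(applyStandalone(body)) // BinOp(+,Access(redVar_local_0),Access(x))
      }
    }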
Testing/run_test.py
@@ -90,10 +90,12 @@ def run_test(generator_path: str, problem_name: str, knowledge_path: str, exa_fi
     elif expected_results_path:
         result_str = result.stdout.decode('utf-8')
         if check_results(result_str, expected_results_path) is True:
             print(f"Test for problem \"{problem_name}\" finished successfully.")
             return result.returncode
+        else:
+            return -1
     else:
         print(f"Test for problem \"{problem_name}\" finished successfully.")
         return result.returncode