Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
ExaStencils
exastencils-release
Commits
c2c43e32
Commit
c2c43e32
authored
Mar 17, 2022
by
Richard Angersbach
Browse files
Merge remote-tracking branch 'origin/master' into devel/matrix_init_fix
parents
d9d5039b
e5f5e5c9
Changes
63
Expand all
Hide whitespace changes
Inline
Side-by-side
.gitlab-ci.yml
View file @
c2c43e32
This diff is collapsed.
Click to expand it.
Compiler/src/exastencils/app/ir/IR_LayerHandler.scala
View file @
c2c43e32
...
...
@@ -52,7 +52,9 @@ import exastencils.stencil.ir._
import
exastencils.timing.ir._
import
exastencils.util._
import
exastencils.util.ir._
import
exastencils.visualization.ir._
import
exastencils.visualization.ir.cimg.IR_ResolveCImgFunctions
import
exastencils.visualization.ir.visit.IR_SetupVisit
import
exastencils.visualization.ir.vtk.IR_ResolveVtkPrinters
/// IR_LayerHandler
...
...
@@ -169,6 +171,9 @@ object IR_DefaultLayerHandler extends IR_LayerHandler {
// simplify indices modified just now, otherwise equality checks will not work later on
IR_GeneralSimplify
.
apply
()
if
(
Knowledge
.
visit_enable
)
IR_SetupVisit
.
apply
()
var
convChanged
=
false
do
{
IR_FindStencilConvolutions
.
changed
=
false
...
...
@@ -185,9 +190,6 @@ object IR_DefaultLayerHandler extends IR_LayerHandler {
IR_ResolveStencilFunction
.
apply
()
if
(
Knowledge
.
experimental_visit_enable
)
IR_SetupVisit
.
apply
()
// resolve new virtual field accesses
IR_ResolveIntegrateOnGrid
.
apply
()
IR_ResolveEvaluateOnGrid
.
apply
()
...
...
Compiler/src/exastencils/applications/ir/IR_HandleMainApplication.scala
View file @
c2c43e32
...
...
@@ -21,7 +21,8 @@ package exastencils.applications.ir
import
scala.collection.mutable.ListBuffer
import
exastencils.base.ir.IR_ImplicitConversion._
import
exastencils.base.ir.
{
IR_Native
,
_
}
import
exastencils.base.ir.IR_Native
import
exastencils.base.ir._
import
exastencils.config.Knowledge
import
exastencils.datastructures._
import
exastencils.logger.Logger
...
...
Compiler/src/exastencils/applications/ns/ir/IR_PrintVtkNNF.scala
View file @
c2c43e32
...
...
@@ -29,7 +29,7 @@ import exastencils.domain.ir.IR_IV_IsValidForDomain
import
exastencils.field.ir._
import
exastencils.parallelization.api.mpi._
import
exastencils.util.ir.IR_Print
import
exastencils.visualization.ir.IR_PrintVtkQuads
import
exastencils.visualization.ir.
vtk.
IR_PrintVtkQuads
/// IR_PrintVtkNNF
...
...
Compiler/src/exastencils/applications/ns/ir/IR_PrintVtkNS.scala
View file @
c2c43e32
...
...
@@ -29,7 +29,7 @@ import exastencils.domain.ir.IR_IV_IsValidForDomain
import
exastencils.field.ir._
import
exastencils.parallelization.api.mpi._
import
exastencils.util.ir.IR_Print
import
exastencils.visualization.ir.IR_PrintVtkQuads
import
exastencils.visualization.ir.
vtk.
IR_PrintVtkQuads
/// IR_PrintVtkNS
...
...
Compiler/src/exastencils/applications/swe/ir/IR_PrintVtkSWE.scala
View file @
c2c43e32
...
...
@@ -32,7 +32,7 @@ import exastencils.grid.ir.IR_AtNode
import
exastencils.logger.Logger
import
exastencils.parallelization.api.mpi._
import
exastencils.util.ir.IR_Print
import
exastencils.visualization.ir.IR_PrintVtkTriangles
import
exastencils.visualization.ir.
vtk.
IR_PrintVtkTriangles
/// IR_PrintVtkSWE
...
...
Compiler/src/exastencils/baseExt/ir/IR_MatOperations/IR_RuntimeMatOps.scala
View file @
c2c43e32
...
...
@@ -311,6 +311,20 @@ object IR_GenerateBasicMatrixOperations {
stmts
}
// generate loops updating 'dest' entry-wise with the 'n_rows' x 'n_cols' block of 'source' located at row 'offset_r', column 'offset_c':
//   dest[(i - offset_r) * n_cols + j - offset_c] = dest[(i - offset_r) * n_cols + j - offset_c] <op> source[i * sourcesize + j]
// 'source' is indexed linearly with a row stride of 'sourcesize'; 'op' is resolved via IR_BinaryOperators.createExpression
def loopCompoundAssignSubmatrixPointer(source : IR_Expression, sourcesize : IR_Expression, dest : IR_Expression, offset_r : IR_Expression, offset_c : IR_Expression, n_rows : IR_Expression, n_cols : IR_Expression, op : String) : IR_Scope = {
  val rowIt = IR_VariableAccess("i", IR_IntegerDatatype)
  val colIt = IR_VariableAccess("j", IR_IntegerDatatype)

  // 'def' on purpose: the left-hand side and the operand of the update each get their own freshly built access node
  def destEntry = IR_ArrayAccess(dest, (rowIt - offset_r) * n_cols + colIt - offset_c)

  def sourceEntry = IR_ArrayAccess(source, rowIt * sourcesize + colIt)

  val update = IR_Assignment(destEntry, IR_BinaryOperators.createExpression(op, destEntry, sourceEntry))

  // inner loop over the columns of the block
  val columnLoop = IR_ForLoop(
    IR_VariableDeclaration(colIt, offset_c),
    IR_Lower(colIt, offset_c + n_cols),
    IR_PreIncrement(colIt),
    ListBuffer[IR_Statement](update))

  // outer loop over the rows of the block
  val rowLoop = IR_ForLoop(
    IR_VariableDeclaration(rowIt, offset_r),
    IR_Lower(rowIt, n_rows + offset_r),
    IR_PreIncrement(rowIt),
    ListBuffer[IR_Statement](columnLoop))

  val scope = IR_Scope(Nil)
  scope.body += rowLoop
  scope
}
// write a submatrix 'source' of n_rows x n_cols to 'destination' at position 'offset_r', 'offset_c'
def
loopSetSubmatrixMat
(
source
:
IR_Expression
,
destination
:
IR_Expression
,
rows_source
:
IR_Expression
,
cols_source
:
IR_Expression
,
offset_r
:
IR_Expression
,
offset_c
:
IR_Expression
)
:
IR_Scope
=
{
if
(!
isScalar
(
offset_r
)
||
!
isScalar
(
offset_c
))
...
...
Compiler/src/exastencils/config/Knowledge.scala
View file @
c2c43e32
...
...
@@ -668,10 +668,10 @@ object Knowledge {
var
experimental_grid_randomMaxOffset
:
Double
=
0.1
/// student project - Richard / visit
// in-situ visualization with VisIt
var
visit_enable
:
Boolean
=
false
var
experimental_visit_addCurveMesh
:
Boolean
=
false
// TODO
var
experimental_visit_enable
:
Boolean
=
false
/// === constraints and resolutions ===
def
update
()
:
Unit
=
{
...
...
Compiler/src/exastencils/config/Platform.scala
View file @
c2c43e32
...
...
@@ -284,6 +284,13 @@ object Platform {
targetCudaCompiler
match
{
case
"NVCC"
=>
flags
+=
s
" -std=c++11 -O3 -DNDEBUG -lineinfo -arch=sm_${ Platform.hw_cuda_capability }${ Platform.hw_cuda_capabilityMinor }"
// cannot find mpi.h from Globals/Globals.h when compiling with nvcc otherwise
if
(
Knowledge
.
mpi_enabled
)
{
val
mpiWrapperFlags
=
s
"$$(shell $resolveCompiler --showme:compile | sed 's/-pthread//g')"
if
(!
Settings
.
makefile_additionalCudaFlags
.
contains
(
mpiWrapperFlags
))
Settings
.
makefile_additionalCudaFlags
+=
mpiWrapperFlags
}
}
flags
...
...
Compiler/src/exastencils/globals/ir/IR_AddInternalVariables.scala
View file @
c2c43e32
...
...
@@ -157,13 +157,14 @@ object IR_AddInternalVariables extends DefaultStrategy("Add internal variables")
case
buf
:
CUDA_ReductionDeviceData
=>
val
id
=
buf
.
resolveAccess
(
buf
.
resolveName
(),
IR_LoopOverFragments
.
defIt
,
IR_NullExpression
,
IR_NullExpression
,
IR_NullExpression
,
IR_NullExpression
).
prettyprint
val
totalSize
:
IR_Expression
=
buf
.
numPoints
*
buf
.
targetDt
.
getSizeArray
.
product
if
(
Knowledge
.
data_genVariableFieldSizes
)
{
if
(
deviceBufferSizes
.
contains
(
id
))
deviceBufferSizes
(
id
).
asInstanceOf
[
IR_Maximum
].
args
+=
Duplicate
(
buf
.
s
ize
)
deviceBufferSizes
(
id
).
asInstanceOf
[
IR_Maximum
].
args
+=
Duplicate
(
totalS
ize
)
else
deviceBufferSizes
+=
(
id
->
IR_Maximum
(
ListBuffer
(
Duplicate
(
buf
.
s
ize
))))
deviceBufferSizes
+=
(
id
->
IR_Maximum
(
ListBuffer
(
Duplicate
(
totalS
ize
))))
}
else
{
val
size
=
IR_SimplifyExpression
.
evalIntegral
(
buf
.
s
ize
)
val
size
=
IR_SimplifyExpression
.
evalIntegral
(
totalS
ize
)
deviceBufferSizes
+=
(
id
->
(
size
max
deviceBufferSizes
.
getOrElse
(
id
,
IR_IntegerConstant
(
0
)).
asInstanceOf
[
IR_IntegerConstant
].
v
))
}
buf
...
...
@@ -214,7 +215,7 @@ object IR_AddInternalVariables extends DefaultStrategy("Add internal variables")
val
id
=
buf
.
resolveAccess
(
buf
.
resolveName
(),
IR_LoopOverFragments
.
defIt
,
IR_NullExpression
,
buf
.
field
.
index
,
buf
.
field
.
level
,
buf
.
neighIdx
).
prettyprint
val
size
=
deviceBufferSizes
(
id
)
deviceBufferAllocs
+=
(
id
->
IR_LoopOverFragments
(
CUDA_Allocate
(
buf
,
size
,
IR_RealDatatype
/*FIXME*/
),
IR_ParallelizationInfo
(
potentiallyParallel
=
true
)))
deviceBufferAllocs
+=
(
id
->
IR_LoopOverFragments
(
CUDA_Allocate
(
buf
,
size
,
buf
.
field
.
resolveBaseDatatype
),
IR_ParallelizationInfo
(
potentiallyParallel
=
true
)))
buf
...
...
@@ -222,7 +223,7 @@ object IR_AddInternalVariables extends DefaultStrategy("Add internal variables")
val
id
=
buf
.
resolveAccess
(
buf
.
resolveName
(),
IR_LoopOverFragments
.
defIt
,
IR_NullExpression
,
IR_NullExpression
,
IR_NullExpression
,
IR_NullExpression
).
prettyprint
val
size
=
deviceBufferSizes
(
id
)
deviceBufferAllocs
+=
(
id
->
IR_LoopOverFragments
(
CUDA_Allocate
(
buf
,
size
,
IR_RealDatatype
/*FIXME*/
),
IR_ParallelizationInfo
(
potentiallyParallel
=
true
)))
deviceBufferAllocs
+=
(
id
->
IR_LoopOverFragments
(
CUDA_Allocate
(
buf
,
size
,
buf
.
baseDt
),
IR_ParallelizationInfo
(
potentiallyParallel
=
true
)))
buf
...
...
Compiler/src/exastencils/optimization/ir/IR_SimplifyExpression.scala
View file @
c2c43e32
...
...
@@ -205,6 +205,10 @@ object IR_SimplifyExpression {
res
=
new
mutable
.
HashMap
[
IR_Expression
,
Long
]()
res
(
m
)
=
1L
case
m
:
IR_MemberFunctionCall
=>
res
=
new
mutable
.
HashMap
[
IR_Expression
,
Long
]()
res
(
m
)
=
1L
case
IR_StringLiteral
(
varName
)
=>
res
=
new
HashMap
[
IR_Expression
,
Long
]()
res
(
IR_VariableAccess
(
varName
,
IR_IntegerDatatype
))
=
1L
// ONLY VariableAccess in res keys, NO StringLiteral
...
...
Compiler/src/exastencils/parallelization/api/cuda/CUDA_ExtractDeviceCode.scala
View file @
c2c43e32
...
...
@@ -21,13 +21,20 @@ package exastencils.parallelization.api.cuda
import
scala.annotation.tailrec
import
scala.collection.mutable
import
scala.collection.mutable.ListBuffer
import
exastencils.base.ir.IR_ImplicitConversion._
import
exastencils.base.ir._
import
exastencils.baseExt.ir.IR_MatOperations.IR_GenerateBasicMatrixOperations
import
exastencils.baseExt.ir._
import
exastencils.config.Knowledge
import
exastencils.core._
import
exastencils.datastructures._
import
exastencils.logger.Logger
import
exastencils.optimization.ir.IR_SimplifyExpression
import
exastencils.parallelization.ir.IR_HasParallelizationInfo
import
exastencils.solver.ir.IR_InlineMatSolveStmts
import
exastencils.util.ir.IR_FctNameCollector
import
exastencils.util.ir.IR_StackCollector
/// CUDA_ExtractHostAndDeviceCode
...
...
@@ -35,10 +42,14 @@ import exastencils.util.ir.IR_FctNameCollector
* This transformation is used to convert annotated code into CUDA kernel code.
*/
object
CUDA_ExtractHostAndDeviceCode
extends
DefaultStrategy
(
"Transform annotated CUDA loop in kernel code"
)
{
val
collector
=
new
IR_FctNameCollector
this
.
register
(
collector
)
val
fctNameCollector
=
new
IR_FctNameCollector
val
stackCollector
=
new
IR_StackCollector
this
.
register
(
fctNameCollector
)
this
.
register
(
stackCollector
)
this
.
onBefore
=
()
=>
this
.
resetCollectors
()
var
enclosingFragmentLoops
:
mutable.HashMap
[
IR_ScopedStatement
with
IR_HasParallelizationInfo
,
IR_Reduction
]
=
mutable
.
HashMap
()
/**
* Collect all loops in the band.
*
...
...
@@ -73,6 +84,31 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
}
}
this
+=
Transformation
(
"Find reductions with enclosing fragment loops"
,
{
case
loop
:
IR_ForLoop
if
loop.hasAnnotation
(
CUDA_Util.CUDA_LOOP_ANNOTATION
)
&&
loop.getAnnotation
(
CUDA_Util.CUDA_LOOP_ANNOTATION
)
.contains
(
CUDA_Util.CUDA_BAND_START
)
=>
val
enclosing
=
stackCollector
.
stack
.
collectFirst
{
case
fragLoop
:
IR_LoopOverFragments
=>
fragLoop
case
fragLoop
@
IR_ForLoop
(
IR_VariableDeclaration
(
_
,
name
,
_
,
_
),
_
,
_
,
_
,
_
)
if
name
==
IR_LoopOverFragments
.
defIt
.
name
=>
fragLoop
}
val
fragLoopIsSerial
=
!
Knowledge
.
omp_enabled
||
(
Knowledge
.
omp_enabled
&&
!
Knowledge
.
omp_parallelizeLoopOverFragments
)
if
(
enclosing
.
isDefined
&&
fragLoopIsSerial
&&
loop
.
parallelization
.
reduction
.
isDefined
)
enclosingFragmentLoops
+=
(
enclosing
.
get
->
loop
.
parallelization
.
reduction
.
get
)
loop
},
false
)
// enclosed by a fragment loop -> create fragment-local copies of the initial value
// and perform reduction after frag loop
this
+=
Transformation
(
"Modify enclosing fragment loops"
,
{
case
fragLoop
:
IR_LoopOverFragments
if
enclosingFragmentLoops.contains
(
fragLoop
)
=>
CUDA_HandleFragmentLoopsWithReduction
(
fragLoop
,
enclosingFragmentLoops
(
fragLoop
))
case
fragLoop
@
IR_ForLoop
(
IR_VariableDeclaration
(
_
,
name
,
_
,
_
),
_
,
_
,
_
,
_
)
if
enclosingFragmentLoops
.
contains
(
fragLoop
)
&&
name
==
IR_LoopOverFragments
.
defIt
.
name
=>
CUDA_HandleFragmentLoopsWithReduction
(
fragLoop
,
enclosingFragmentLoops
(
fragLoop
))
},
false
)
this
+=
new
Transformation
(
"Processing ForLoopStatement nodes"
,
{
case
loop
:
IR_ForLoop
if
loop.hasAnnotation
(
CUDA_Util.CUDA_LOOP_ANNOTATION
)
&&
loop.getAnnotation
(
CUDA_Util.CUDA_LOOP_ANNOTATION
)
.contains
(
CUDA_Util.CUDA_BAND_START
)
=>
...
...
@@ -102,10 +138,74 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
// add kernel and kernel call
val
kernelFunctions
=
CUDA_KernelFunctions
.
get
// collect local variable accesses because these variables need to be passed to the kernel at call
CUDA_GatherVariableAccess
.
clear
()
CUDA_GatherVariableAccess
.
applyStandalone
(
IR_Scope
(
loop
))
val
variableAccesses
=
CUDA_GatherVariableAccess
.
accesses
.
toSeq
.
sortBy
(
_
.
_1
).
map
(
_
.
_2
).
to
[
ListBuffer
]
val
kernelCount
=
kernelFunctions
.
counterMap
.
getOrElse
(
fctNameCollector
.
getCurrentName
,
-
1
)
+
1
val
reduction
=
loop
.
parallelization
.
reduction
// local variable for kernels with reductions
val
localTarget
=
if
(
reduction
.
isDefined
)
Some
(
IR_VariableAccess
(
reduction
.
get
.
targetName
+
"_local_"
+
kernelCount
,
CUDA_Util
.
getReductionDatatype
(
reduction
.
get
.
target
)))
else
None
// collect local accesses because their variables need to be passed to the kernel when calling
CUDA_GatherVariableAccesses
.
clear
()
CUDA_GatherVariableAccesses
.
kernelCount
=
kernelCount
if
(
reduction
.
isDefined
)
CUDA_GatherVariableAccesses
.
reductionTarget
=
Some
(
reduction
.
get
.
target
)
CUDA_GatherVariableAccesses
.
applyStandalone
(
IR_Scope
(
loop
))
// declare and init local reduction target
if
(
localTarget
.
isDefined
)
{
var
decl
=
IR_VariableDeclaration
(
localTarget
.
get
)
var
initLocalTarget
=
CUDA_Util
.
getReductionDatatype
(
reduction
.
get
.
target
)
match
{
case
_
:
IR_ScalarDatatype
=>
ListBuffer
[
IR_Statement
](
IR_Assignment
(
localTarget
.
get
,
reduction
.
get
.
target
))
case
mat
:
IR_MatrixDatatype
=>
reduction
.
get
.
target
match
{
case
vAcc
:
IR_VariableAccess
=>
IR_GenerateBasicMatrixOperations
.
loopSetSubmatrixMatPointer
(
vAcc
,
localTarget
.
get
,
mat
.
sizeN
,
mat
.
sizeM
,
mat
.
sizeN
,
0
,
0
).
body
case
expr
=>
Logger
.
error
(
"Cannot set submatrix for expression: "
+
expr
)
}
}
// also detect accesses coming from the init of the local target
CUDA_GatherVariableAccesses
.
applyStandalone
(
IR_Scope
(
decl
))
CUDA_GatherVariableAccesses
.
applyStandalone
(
IR_Scope
(
initLocalTarget
))
// replace array accesses with accesses to function arguments
CUDA_ReplaceNonReductionVarArrayAccesses
.
reductionTarget
=
None
// actually allow reduction var to be replaced here
CUDA_ReplaceNonReductionVarArrayAccesses
.
applyStandalone
(
IR_Scope
(
decl
))
CUDA_ReplaceNonReductionVarArrayAccesses
.
applyStandalone
(
IR_Scope
(
initLocalTarget
))
kernelBody
.
prepend
(
initLocalTarget
:
_
*
)
kernelBody
.
prepend
(
decl
)
}
// access collections
val
accesses
=
CUDA_GatherVariableAccesses
.
evaluableAccesses
.
toSeq
.
sortBy
(
_
.
_1
).
to
[
ListBuffer
]
val
accessesCopiedToDevice
=
CUDA_GatherVariableAccesses
.
nonEvaluableAccesses
.
toSeq
.
sortBy
(
_
.
_1
).
to
[
ListBuffer
]
// add non-evaluable accesses in form of pointers to device copies
val
deviceArrayCopies
=
accessesCopiedToDevice
.
map
{
case
(
k
,
v
)
=>
val
copyName
=
CUDA_GatherVariableAccesses
.
arrayVariableAccessAsString
(
v
.
_1
)
val
copyDt
=
IR_PointerDatatype
(
v
.
_2
.
resolveBaseDatatype
)
(
k
,
IR_VariableAccess
(
copyName
,
copyDt
))
}.
toMap
// parameters of the kernel
val
params
=
ListBuffer
[
IR_FunctionArgument
]()
params
++=
accesses
.
map
{
case
(
name
,
tup
)
=>
IR_FunctionArgument
(
name
,
tup
.
_2
)
}
params
++=
deviceArrayCopies
.
values
.
map
(
IR_FunctionArgument
(
_
))
// args passed to kernel
val
args
=
ListBuffer
[
IR_Expression
]()
args
++=
accesses
.
map
{
case
(
_
,
tup
)
=>
tup
.
_1
:
IR_Expression
}
args
++=
deviceArrayCopies
.
values
var
extremaMap
=
mutable
.
HashMap
[
String
,
(
Long
,
Long
)]()
...
...
@@ -113,32 +213,78 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
extremaMap
=
m
.
asInstanceOf
[
mutable.HashMap
[
String
,
(
Long
,
Long
)]]
// inline contained calls to solve functions to avoid separate compilation units
IR_InlineMatSolveStmts
.
applyStandalone
(
kernelBody
)
IR_InlineMatSolveStmts
.
applyStandalone
(
IR_Scope
(
kernelBody
))
// replace array accesses with accesses to function arguments
// reduction var is not replaced, but later in IR_HandleReductions
if
(
reduction
.
isDefined
)
CUDA_ReplaceNonReductionVarArrayAccesses
.
reductionTarget
=
Some
(
reduction
.
get
.
target
)
else
CUDA_ReplaceNonReductionVarArrayAccesses
.
reductionTarget
=
None
CUDA_ReplaceNonReductionVarArrayAccesses
.
applyStandalone
(
IR_Scope
(
kernelBody
))
val
kernel
=
CUDA_Kernel
(
kernelFunctions
.
getIdentifier
(
collector
.
getCurrentName
),
kernelCount
,
kernelFunctions
.
getIdentifier
(
fctNameCollector
.
getCurrentName
),
parallelInnerLoops
.
length
,
variableAccesses
.
map
(
s
=>
IR_FunctionArgument
(
s
.
name
,
s
.
datatype
))
,
params
,
Duplicate
(
loopVariables
),
Duplicate
(
lowerBounds
),
Duplicate
(
upperBounds
),
Duplicate
(
stepSize
),
Duplicate
(
kernelBody
),
Duplicate
(
loop
.
parallelization
.
reduction
),
Duplicate
(
reduction
),
Duplicate
(
localTarget
),
Duplicate
(
extremaMap
))
kernelFunctions
.
addKernel
(
Duplicate
(
kernel
))
// copy array variables from host to device if necessary
if
(
deviceArrayCopies
.
nonEmpty
)
{
deviceArrayCopies
foreach
{
case
(
k
,
dstArr
)
=>
val
(
srcArr
,
srcDt
)
=
accessesCopiedToDevice
.
find
(
_
.
_1
==
k
).
get
.
_2
deviceStatements
+=
IR_VariableDeclaration
(
dstArr
)
deviceStatements
+=
CUDA_Allocate
(
dstArr
,
srcDt
.
getSizeArray
.
product
,
srcDt
.
resolveBaseDatatype
)
deviceStatements
+=
CUDA_Memcpy
(
dstArr
,
srcArr
,
srcDt
.
typicalByteSize
,
"cudaMemcpyHostToDevice"
)
}
}
// process return value of kernel wrapper call if reduction is required
if
(
loop
.
parallelization
.
reduction
.
isDefined
)
{
val
red
=
loop
.
parallelization
.
reduction
.
get
deviceStatements
+=
IR_Assignment
(
red
.
target
,
IR_BinaryOperators
.
createExpression
(
red
.
op
,
red
.
target
,
IR_FunctionCall
(
kernel
.
getWrapperFctName
,
variableAccesses
.
map
(
_
.
asInstanceOf
[
IR_Expression
]))))
val
callKernel
=
IR_FunctionCall
(
kernel
.
getWrapperFctName
,
args
)
if
(
reduction
.
isDefined
)
{
val
red
=
Duplicate
(
reduction
.
get
)
val
redTarget
=
Duplicate
(
red
.
target
)
val
reductionDt
=
CUDA_Util
.
getReductionDatatype
(
redTarget
)
reductionDt
match
{
case
mat
:
IR_MatrixDatatype
=>
val
baseDt
=
mat
.
resolveBaseDatatype
// declare and allocate tmp buffer for matrix reduction
val
reductionTmp
=
IR_VariableAccess
(
"reductionTmpMatrix"
,
IR_PointerDatatype
(
baseDt
))
deviceStatements
+=
IR_VariableDeclaration
(
reductionTmp
)
deviceStatements
+=
IR_ArrayAllocation
(
reductionTmp
,
baseDt
,
mat
.
sizeN
*
mat
.
sizeM
)
// call kernel and pass allocated tmp buffer by pointer
callKernel
.
arguments
+=
reductionTmp
deviceStatements
+=
callKernel
// update reduction target
deviceStatements
+=
IR_GenerateBasicMatrixOperations
.
loopCompoundAssignSubmatrixPointer
(
reductionTmp
,
mat
.
sizeN
,
red
.
target
,
0
,
0
,
mat
.
sizeM
,
mat
.
sizeN
,
red
.
op
)
// free allocated buffer
deviceStatements
+=
IR_ArrayFree
(
reductionTmp
)
case
_
:
IR_ScalarDatatype
=>
deviceStatements
+=
IR_Assignment
(
red
.
target
,
IR_BinaryOperators
.
createExpression
(
red
.
op
,
red
.
target
,
callKernel
))
}
}
else
{
deviceStatements
+=
IR_FunctionC
all
(
k
ernel
.
getWrapperFctName
,
variableAccesses
.
map
(
_
.
asInstanceOf
[
IR_Expression
]))
deviceStatements
+=
c
all
K
ernel
}
// destroy device copies
if
(
deviceArrayCopies
.
nonEmpty
)
deviceStatements
++=
deviceArrayCopies
.
keys
.
map
(
CUDA_Free
(
_
))
deviceStatements
},
false
)
}
Compiler/src/exastencils/parallelization/api/cuda/CUDA_GatherVariableAccess.scala
deleted
100644 → 0
View file @
d9d5039b
//=============================================================================
//
// This file is part of the ExaStencils code generation framework. ExaStencils
// is free software: you can redistribute it and/or modify it under the terms
// of the GNU General Public License as published by the Free Software
// Foundation, either version 3 of the License, or (at your option) any later
// version.
//
// ExaStencils is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along
// with ExaStencils. If not, see <http://www.gnu.org/licenses/>.
//
//=============================================================================
package
exastencils.parallelization.api.cuda
import
scala.collection.mutable
import
exastencils.base.ir._
import
exastencils.datastructures._
/**
  * Collects the variable accesses found in the code it is applied to.
  *
  * Every visited IR_VariableAccess whose name is not in `ignoredAccesses` is stored in `accesses`, keyed by its name.
  * The name of every visited IR_VariableDeclaration is added to `ignoredAccesses`, so accesses to it that are visited
  * afterwards are not collected.
  *
  * Call `clear()` before each application to start from an empty state.
  */
object CUDA_GatherVariableAccess extends QuietDefaultStrategy("Gather local VariableAccess nodes") {
  // collected accesses, keyed by variable name
  var accesses = mutable.HashMap[String, IR_VariableAccess]()
  // names that must not be collected: the std streams added by clear() and every variable declared in the visited code
  var ignoredAccesses = mutable.SortedSet[String]()

  /** Resets the collected accesses and the ignore list; the ignore list is re-seeded with the std stream names. */
  def clear() = {
    accesses = mutable.HashMap[String, IR_VariableAccess]()

    // start from a fresh ignore list as well: names declared in a previously visited scope
    // must not hide accesses to equally named variables in the next one
    ignoredAccesses = mutable.SortedSet[String]()
    ignoredAccesses += "std::cout"
    ignoredAccesses += "std::cerr"
    ignoredAccesses += "std::endl"
  }

  this += new Transformation("Searching", {
    case decl : IR_VariableDeclaration =>
      // declared in the visited code -> later accesses to this name are not collected
      ignoredAccesses += decl.name
      decl

    case access : IR_VariableAccess if !ignoredAccesses.contains(access.name) =>
      accesses.put(access.name, access)
      access
  })
}
Compiler/src/exastencils/parallelization/api/cuda/CUDA_GatherVariableAccesses.scala
0 → 100644
View file @
c2c43e32
//=============================================================================
//
// This file is part of the ExaStencils code generation framework. ExaStencils
// is free software: you can redistribute it and/or modify it under the terms
// of the GNU General Public License as published by the Free Software
// Foundation, either version 3 of the License, or (at your option) any later
// version.
//
// ExaStencils is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along
// with ExaStencils. If not, see <http://www.gnu.org/licenses/>.
//
//=============================================================================
package
exastencils.parallelization.api.cuda
import
scala.collection.mutable
import
exastencils.base.ir._
import
exastencils.baseExt.ir.IR_LoopOverFragments
import
exastencils.config.Knowledge
import
exastencils.datastructures._
import
exastencils.logger.Logger
import
exastencils.optimization.ir.EvaluationException
import
exastencils.optimization.ir.IR_SimplifyExpression
import
exastencils.parallelization.api.cuda.CUDA_Util._
object
CUDA_GatherVariableAccesses
extends
QuietDefaultStrategy
(
"Gather local VariableAccess nodes"
)
{
// reduction target of the kernel being processed, if any; set from outside, reset by clear()
var reductionTarget : Option[IR_Expression] = None

// counter of the kernel being processed; used as suffix in arrayVariableAccessAsString, set from outside
var kernelCount : Int = 0

// gathered accesses with their datatype, keyed by arrayAccessAsString
var evaluableAccesses : mutable.HashMap[String, (IR_Access, IR_Datatype)] = mutable.HashMap[String, (IR_Access, IR_Datatype)]()

// gathered array variables with their datatype, looked up by arrayVariableAccessAsString
var nonEvaluableAccesses : mutable.HashMap[String, (IR_VariableAccess, IR_Datatype)] = mutable.HashMap[String, (IR_VariableAccess, IR_Datatype)]()

// names which are not gathered: std streams and variables declared in the visited code
var ignoredAccesses : mutable.SortedSet[String] = mutable.SortedSet[String]()

// names of the base variables of the array accesses handled so far
var ignoredArrayVariableAccesses : mutable.SortedSet[String] = mutable.SortedSet[String]()
// common prefix of all keys derived from accesses to 'base': the name of the accessed variable
def basePrefix(base : IR_VariableAccess) : String = {
  base.name
}
// key for a single array access with evaluable index: the name of 'base' directly followed by the pretty-printed index
def arrayAccessAsString(base : IR_VariableAccess, idx : IR_Expression) : String = {
  val printedIdx = idx.prettyprint()
  s"${ basePrefix(base) }$printedIdx"
}
// true if the access 'base[idx]' was gathered as evaluable access
def containsArrayAccess(base : IR_VariableAccess, idx : IR_Expression) : Boolean = {
  val key = arrayAccessAsString(base, idx)
  evaluableAccesses.contains(key)
}
// name used when the whole array 'base' is handed to a kernel (non-evaluable indices):
// the name of 'base' suffixed with "_deviceCopy_" and the current kernel counter
def arrayVariableAccessAsString(base : IR_VariableAccess) : String = {
  basePrefix(base) + "_deviceCopy_" + kernelCount
}
// true if nonEvaluableAccesses holds an entry under the device copy name of 'base'
def containsArrayVariableAccess(base : IR_VariableAccess) : Boolean = {
  val key = arrayVariableAccessAsString(base)
  nonEvaluableAccesses.contains(key)
}
// true if 'base[idx]' was gathered either as single evaluable access or via the whole array variable
def isReplaceable(base : IR_VariableAccess, idx : IR_Expression) : Boolean = {
  if (containsArrayAccess(base, idx))
    true
  else
    containsArrayVariableAccess(base)
}
/**
  * Builds the replacement for the array access `base[idx]` from the gathered accesses.
  *
  * @param base accessed array variable
  * @param idx  index expression of the access
  * @return - a variable access named arrayAccessAsString(base, idx) with the datatype stored in evaluableAccesses,
  *           if the access was gathered there
  *         - otherwise an array access with index 'idx' into a variable named arrayVariableAccessAsString(base),
  *           if nonEvaluableAccesses holds an entry under that name
  *         - None if the access was not gathered at all
  */
def replaceAccess(base : IR_VariableAccess, idx : IR_Expression) : Option[IR_Expression] = {
  // build the key once and use a single lookup instead of contains + apply
  val scalarName = arrayAccessAsString(base, idx)

  evaluableAccesses.get(scalarName) match {
    case Some((_, datatype)) =>
      // single evaluable access -> plain variable
      Some(IR_VariableAccess(scalarName, datatype))

    case None if containsArrayVariableAccess(base) =>
      // whole array was gathered -> keep the index, redirect the base to the device copy
      Some(IR_ArrayAccess(IR_VariableAccess(arrayVariableAccessAsString(base), base.datatype), idx))

    case None =>
      None
  }
}
// true if IR_SimplifyExpression.evalIntegral can evaluate 'idx', i.e. it throws neither an EvaluationException nor a MatchError
def isEvaluable(idx : IR_Expression) = {
  try {
    IR_SimplifyExpression.evalIntegral(idx)
    true
  } catch {
    case _ : EvaluationException | _ : MatchError => false
  }
}
// shorthand for IR_LoopOverFragments.defIt; not referenced in the visible part of this object
// NOTE(review): presumably the default fragment loop iterator - confirm against IR_LoopOverFragments
val
fragIdx
=
IR_LoopOverFragments
.
defIt
// reset the gathered state; kernelCount is left untouched
def clear() = {
  reductionTarget = None

  // gathered accesses
  evaluableAccesses = mutable.HashMap[String, (IR_Access, IR_Datatype)]()
  nonEvaluableAccesses = mutable.HashMap[String, (IR_VariableAccess, IR_Datatype)]()

  // ignore lists; the std streams are never gathered
  ignoredArrayVariableAccesses = mutable.SortedSet[String]()
  ignoredAccesses = mutable.SortedSet[String]()
  ignoredAccesses ++= Seq("std::cout", "std::cerr", "std::endl")
}
this
+=
new
Transformation
(
"Searching"
,
{
case
decl
:
IR_VariableDeclaration
=>
ignoredAccesses
+=
decl
.
name
decl
case
arrAcc
@
IR_ArrayAccess
(
base
:
IR_VariableAccess
,
idx
,
_
)
if
!
ignoredAccesses
.
contains
(
base
.
name
)
=>
ignoredArrayVariableAccesses
+=
base
.
name
if
(
isEvaluable
(
idx
))
{
// single, evaluable array accesses -> count "base[idx]" as variable access
evaluableAccesses
.
put
(
arrayAccessAsString
(
base
,
idx
),
(
arrAcc
,
base
.
datatype
.
resolveBaseDatatype
))
}
else
{
// we found a non-evaluable index -> remove previous evaluable accesses
evaluableAccesses
.
foreach
{
case
(
k
,
_
)
if
k
.
startsWith
(
basePrefix
(
base
))
&&
k
.
length
>
basePrefix
(
base
).
length
=>
evaluableAccesses
.
remove
(
k
)