Commit 73c32604 authored by Richard Angersbach

Merge remote-tracking branch 'origin/master' into devel/par_io

# Conflicts:
#	Compiler/src/exastencils/app/ir/IR_LayerHandler.scala
#	Compiler/src/exastencils/config/Knowledge.scala
parents 5f8a701a 4c96a571
......@@ -52,6 +52,7 @@ stages:
- python3 --version
- mpirun --version
- nvcc --version
- nvidia-smi
- updatedb
- locate cuda.h
- locate libcudart
......
......@@ -325,10 +325,18 @@ object IR_DefaultLayerHandler extends IR_LayerHandler {
if (Knowledge.data_genVariableFieldSizes)
IR_GenerateIndexManipFcts.apply()
// adapt accesses to device data in case of managed memory
if (Knowledge.cuda_enabled && Knowledge.cuda_useManagedMemory)
CUDA_AdaptDeviceAccessesForMM.apply()
IR_AddInternalVariables.apply()
// resolve possibly newly added constant IVs
IR_ResolveConstIVs.apply()
// adapt allocations and de-allocations before expanding
if (Knowledge.cuda_enabled)
CUDA_AdaptAllocations.apply()
if (Knowledge.useFasterExpand)
IR_ExpandInOnePass.apply()
else
......
......@@ -26,7 +26,6 @@ import exastencils.base.ir._
import exastencils.config.Knowledge
import exastencils.datastructures._
import exastencils.logger.Logger
import exastencils.parallelization.api.cuda._
import exastencils.parallelization.api.mpi._
import exastencils.parallelization.api.omp.OMP_Parallel
......@@ -51,11 +50,6 @@ object IR_HandleMainApplication extends DefaultStrategy("HandleMainApplication")
func.body.append(IR_Native("LIKWID_MARKER_CLOSE"))
}
if (Knowledge.cuda_enabled) {
func.body.prepend(CUDA_Init)
func.body.append(CUDA_Finalize)
}
if (Knowledge.mpi_enabled) {
func.body.prepend(MPI_Init)
func.body.append(MPI_Finalize)
......
......@@ -38,7 +38,7 @@ case class IR_ArrayAllocation(var name : IR_Expression, // no string - could be
case class IR_ScalarAllocation(var name : IR_Expression, // no string - could be an IV
var datatype : IR_Datatype,
) extends IR_Statement {
//override def datatype = IR_UnitDatatype
override def prettyprint(out : PpStream) : Unit = out << name << " = " << "new" << ' ' << datatype << ";"
}
......@@ -49,7 +49,6 @@ case class IR_ArrayFree(var pointer : IR_Expression) extends IR_Statement {
override def prettyprint(out : PpStream) : Unit = out << "delete[] " << pointer << ";"
}
/// IR_ScalarFree
case class IR_ScalarFree(var pointer : IR_Expression) extends IR_Statement {
override def prettyprint(out : PpStream) : Unit = out << "delete " << pointer << ";"
......
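For reference, a minimal standalone sketch (simplified stand-in classes and a hypothetical variable name, not the real IR types) of the C++ these scalar allocation/free nodes print:

object ScalarAllocSketch extends App {
  // mirrors the prettyprint logic above: "name = new datatype;" and "delete pointer;"
  case class ScalarAllocation(name: String, datatype: String) {
    def prettyprint: String = s"$name = new $datatype;"
  }
  case class ScalarFree(pointer: String) {
    def prettyprint: String = s"delete $pointer;"
  }

  println(ScalarAllocation("tmp", "double").prettyprint) // tmp = new double;
  println(ScalarFree("tmp").prettyprint)                 // delete tmp;
}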
......@@ -212,11 +212,15 @@ object IR_MatNodeUtils {
/** Method: split a declaration with init into a declaration and an assignment with the init
*
* @param decl : IR_VariableDeclaration, declaration to be split
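* @param zeroInit : Boolean, if true the declaration is emitted with a 0 initializer instead of no initializer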
* @return list containing variable declaration without init and assignment of that variable with init expresion
* @return list containing variable declaration without init and assignment of that variable with init expression
* */
def splitDeclaration(decl : IR_VariableDeclaration) : ListBuffer[IR_Statement] = {
def splitDeclaration(decl : IR_VariableDeclaration, zeroInit : Boolean = false) : ListBuffer[IR_Statement] = {
val newStmts = ListBuffer[IR_Statement]()
newStmts += IR_VariableDeclaration(decl.datatype, decl.name, None)
if (zeroInit) {
newStmts += IR_VariableDeclaration(decl.datatype, decl.name, IR_IntegerConstant(0))
} else {
newStmts += IR_VariableDeclaration(decl.datatype, decl.name, None)
}
newStmts += IR_Assignment(IR_VariableAccess(decl), decl.initialValue.getOrElse(IR_NullExpression))
newStmts
}
......
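A minimal standalone sketch of the split (simplified stand-in statement types, not the real IR classes): a declaration with an initializer becomes a declaration, optionally 0-initialized, plus an assignment carrying the original initializer.

import scala.collection.mutable.ListBuffer

object SplitDeclSketch extends App {
  sealed trait Stmt
  case class Decl(name: String, init: Option[String]) extends Stmt
  case class Assign(name: String, value: String) extends Stmt

  // mirrors IR_MatNodeUtils.splitDeclaration: declaration (optionally zero-initialized) + assignment
  def splitDeclaration(decl: Decl, zeroInit: Boolean = false): ListBuffer[Stmt] = {
    val stmts = ListBuffer[Stmt]()
    stmts += (if (zeroInit) Decl(decl.name, Some("0")) else Decl(decl.name, None))
    stmts += Assign(decl.name, decl.init.getOrElse("<null>"))
    stmts
  }

  println(splitDeclaration(Decl("mat", Some("inverse(A)")), zeroInit = true))
  // ListBuffer(Decl(mat,Some(0)), Assign(mat,inverse(A)))
}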
......@@ -118,7 +118,7 @@ object IR_GenerateBasicMatrixOperations {
var _j = IR_VariableAccess("_j", IR_IntegerDatatype)
func.body += IR_ForLoop(IR_VariableDeclaration(_i, IR_IntegerConstant(0)), IR_Lower(_i, sizeMLeft), IR_PreIncrement(_i), ListBuffer[IR_Statement](
IR_ForLoop(IR_VariableDeclaration(_j, 0), IR_Lower(_j, sizeNLeft), IR_PreIncrement(_j), ListBuffer[IR_Statement](
IR_IfCondition(IR_Greater(IR_FunctionCall(IR_ExternalFunctionReference.fabs, IR_Subtraction(IR_FunctionCall(IR_ExternalFunctionReference.fabs, IR_HighDimAccess(left, IR_ExpressionIndex(_i, _j))), IR_FunctionCall(IR_ExternalFunctionReference.fabs, IR_HighDimAccess(right, IR_ExpressionIndex(_i, _j))))), precision), ListBuffer[IR_Statement](
IR_IfCondition(IR_Greater(IR_FunctionCall(IR_ExternalFunctionReference.fabs, IR_Subtraction(IR_HighDimAccess(left, IR_ExpressionIndex(_i, _j)), IR_HighDimAccess(right, IR_ExpressionIndex(_i, _j)))), precision), ListBuffer[IR_Statement](
IR_Print(outstream, ListBuffer[IR_Expression](IR_StringConstant("[Test] comparison failed at "), _i, IR_StringConstant(" "), _j, IR_StringConstant("\\n"), IR_HighDimAccess(left, IR_ExpressionIndex(_i, _j)), IR_StringConstant(" vs "), IR_HighDimAccess(right, IR_ExpressionIndex(_i, _j)), IR_StringConstant("\\n"))),
if (returnStmt) IR_Return(IR_IntegerConstant(-1)) else IR_NullStatement
), ListBuffer[IR_Statement]())
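Why the predicate changed, as a minimal sketch with illustrative values: the old check compared absolute values and therefore misses sign flips, the new check compares the actual difference.

object CompareCheckSketch extends App {
  val (a, b, eps) = (1.0, -1.0, 1e-12)
  val oldCheck = math.abs(math.abs(a) - math.abs(b)) > eps // false -> mismatch NOT reported
  val newCheck = math.abs(a - b) > eps                     // true  -> mismatch reported
  println((oldCheck, newCheck)) // (false,true)
}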
......@@ -692,9 +692,9 @@ object IR_GenerateRuntimeInversion {
// copy A and invert
//TODO use algorithm that exploits structure -> receive matrix structure information from classifier -> e.g. blockdiagonal
// blocksize of the diagonal blocks of A if A is a blockdiagonal matrix -> later this information comes from the classifier?
func.body += IR_VariableDeclaration(A)
func.body += IR_VariableDeclaration(A, IR_IntegerConstant(0))
func.body += IR_GenerateBasicMatrixOperations.loopCopySubmatrix(in, A, 0, 0, n, n)
func.body += IR_VariableDeclaration(A_inv)
func.body += IR_VariableDeclaration(A_inv, IR_IntegerConstant(0))
if (structureA == "blockdiagonal")
func.body += IR_GenerateRuntimeInversion.blockdiagonalInlined(A, blockSizeA, A_inv)
else if (structureA == "diagonal")
......@@ -706,33 +706,33 @@ object IR_GenerateRuntimeInversion {
func.body ++= IR_GenerateBasicMatrixOperations.printMatrix(A_inv)
// copy B
func.body += IR_VariableDeclaration(B)
func.body += IR_VariableDeclaration(B, IR_IntegerConstant(0))
func.body += IR_GenerateBasicMatrixOperations.loopCopySubmatrix(in, B, 0, n, n, m)
// copy C
func.body += IR_VariableDeclaration(C)
func.body += IR_VariableDeclaration(C, IR_IntegerConstant(0))
func.body += IR_GenerateBasicMatrixOperations.loopCopySubmatrix(in, C, n, 0, m, n)
// copy D
func.body += IR_VariableDeclaration(D)
func.body += IR_VariableDeclaration(D, IR_IntegerConstant(0))
func.body += IR_GenerateBasicMatrixOperations.loopCopySubmatrix(in, D, n, n, m, m)
// calculate S
func.body += IR_VariableDeclaration(S)
func.body += IR_VariableDeclaration(CA_inv)
func.body += IR_VariableDeclaration(S, IR_IntegerConstant(0))
func.body += IR_VariableDeclaration(CA_inv, IR_IntegerConstant(0))
func.body += IR_GenerateBasicMatrixOperations.multAtSubmatrix(C, A_inv, CA_inv, m, n, n, 0, 0)
func.body += IR_VariableDeclaration(CA_invB)
func.body += IR_VariableDeclaration(CA_invB, IR_IntegerConstant(0))
func.body += IR_GenerateBasicMatrixOperations.multAtSubmatrix(CA_inv, B, CA_invB, m, m, n, 0, 0)
func.body += IR_GenerateBasicMatrixOperations.subAtSubmatrix(D, CA_invB, S, m, m, m, 0, 0)
// calculate S_inv
func.body += IR_VariableDeclaration(S_inv)
func.body += IR_VariableDeclaration(S_inv, IR_IntegerConstant(0))
func.body += IR_GenerateRuntimeInversion.inverse(S, S_inv, IR_MatShape("filled"))
// calculate upper right result block
func.body += IR_VariableDeclaration(A_invB)
func.body += IR_VariableDeclaration(A_invB, IR_IntegerConstant(0))
func.body += IR_GenerateBasicMatrixOperations.multAtSubmatrix(A_inv, B, A_invB, n, m, n, 0, 0)
func.body += IR_VariableDeclaration(A_invBS_inv)
func.body += IR_VariableDeclaration(A_invBS_inv, IR_IntegerConstant(0))
func.body += IR_GenerateBasicMatrixOperations.multAtSubmatrix(A_invB, S_inv, A_invBS_inv, n, m, m, 0, 0)
func.body += IR_GenerateBasicMatrixOperations.negAtSubmatrix(A_invBS_inv, out, n + m, n, m, 0, n_asInt)
......@@ -745,7 +745,7 @@ object IR_GenerateRuntimeInversion {
func.body ++= IR_GenerateBasicMatrixOperations.printMatrix(S_inv)
// calculate lower left result block
func.body += IR_VariableDeclaration(S_invCA_inv)
func.body += IR_VariableDeclaration(S_invCA_inv, IR_IntegerConstant(0))
func.body += IR_GenerateBasicMatrixOperations.multAtSubmatrix(S_inv, CA_inv, S_invCA_inv, m, n, m, 0, 0)
func.body += IR_GenerateBasicMatrixOperations.negAtSubmatrix(S_invCA_inv, out, m + n, m, n, n_asInt, 0)
......@@ -753,7 +753,7 @@ object IR_GenerateRuntimeInversion {
func.body ++= IR_GenerateBasicMatrixOperations.printMatrix(S_invCA_inv)
// calculate upper left result block
func.body += IR_VariableDeclaration(A_invBS_invCA_inv)
func.body += IR_VariableDeclaration(A_invBS_invCA_inv, IR_IntegerConstant(0))
func.body += IR_GenerateBasicMatrixOperations.multAtSubmatrix(A_invB, S_invCA_inv, A_invBS_invCA_inv, n, n, m, 0, 0)
func.body += IR_GenerateBasicMatrixOperations.addAtSubmatrix(A_inv, A_invBS_invCA_inv, out, n + m, n, n, 0, 0)
......
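For reference, the standard block (Schur complement) inverse that the variables above (A_inv, CA_inv, CA_invB, S, S_inv, A_invB, A_invBS_inv, S_invCA_inv, A_invBS_invCA_inv) assemble piece by piece; the lower-right block S^{-1} is handled outside the visible hunks:

$$
\begin{pmatrix} A & B \\ C & D \end{pmatrix}^{-1}
=
\begin{pmatrix}
A^{-1} + A^{-1} B S^{-1} C A^{-1} & -A^{-1} B S^{-1} \\
-S^{-1} C A^{-1} & S^{-1}
\end{pmatrix},
\qquad S = D - C A^{-1} B .
$$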
......@@ -276,8 +276,12 @@ object IR_ResolveMatFuncs extends DefaultStrategy("Resolve matFuncs") {
*/
this += new Transformation("Insert resolvables and resolve", {
case decl @ IR_VariableDeclaration(_, _, Some(r : IR_RuntimeMNode), _) if r.resolveAtRuntime =>
IR_MatNodeUtils.splitDeclaration(decl)
case decl @ IR_VariableDeclaration(dt, _, Some(r : IR_RuntimeMNode), _) if r.resolveAtRuntime =>
if (dt.isInstanceOf[IR_MatrixDatatype]) {
IR_MatNodeUtils.splitDeclaration(decl, zeroInit = true)
} else {
IR_MatNodeUtils.splitDeclaration(decl)
}
// not to resolve at runtime
case r : IR_RuntimeMNode if !r.resolveAtRuntime =>
......
......@@ -601,6 +601,16 @@ object Knowledge {
// apply spatial blocking with read-only cache
var cuda_spatialBlockingWithROC : Boolean = false
// use pinned memory to allocate host field data and buffers
var cuda_usePinnedHostMemory : Boolean = true
// use managed memory instead of host and device variants for field data and buffers
var cuda_useManagedMemory : Boolean = false
// replace device variants of field data and buffers with device pointers derived from host counterparts
var cuda_useZeroCopy : Boolean = false
// only relevant if cuda_useManagedMemory == true; replace cuda memcpy with asynchronous prefetches
var cuda_genAsyncPrefetch : Boolean = true
// if true, the first dimension of the block size is enlarged if the kernel dimensionality is lower than the global dimensionality
var cuda_foldBlockSizeForRedDimensionality : Boolean = true
......@@ -733,9 +743,6 @@ object Knowledge {
var experimental_CTPivotElimination : Boolean = false
var experimental_QRPivot : Boolean = false
// eliminate occurrences of cudaContext - required for PizDaint
var experimental_eliminateCudaContext : Boolean = false
// generate cuda kernels independently of them being parallel or not
var experimental_cuda_generateKernelForNonParallel : Boolean = false
......@@ -758,7 +765,6 @@ object Knowledge {
var visit_enable : Boolean = false
var experimental_visit_addCurveMesh : Boolean = false
/// === constraints and resolutions ===
def update() : Unit = {
// NOTE: it is required to call update at least once
......@@ -860,7 +866,13 @@ object Knowledge {
Constraints.condWarn(cuda_enabled && opt_conventionalCSE && !useDblPrecision, "Double precision should be used if CUDA is enabled and CSE should be applied!")
Constraints.condEnsureValue(useDblPrecision, true, cuda_enabled && opt_conventionalCSE)
Constraints.condError(!cuda_memory_transfer_elimination_options.contains(cuda_eliminate_memory_transfers), "Invalid value for \"cuda_eliminate_memory_transfers\". Should be one of: " + cuda_memory_transfer_elimination_options.mkString(",") )
Constraints.condError(!cuda_memory_transfer_elimination_options.contains(cuda_eliminate_memory_transfers), "Invalid value for \"cuda_eliminate_memory_transfers\". Should be one of: " + cuda_memory_transfer_elimination_options.mkString(","))
Constraints.condEnsureValue(cuda_usePinnedHostMemory, true, cuda_useZeroCopy)
Constraints.condError(cuda_useManagedMemory && cuda_usePinnedHostMemory, "cuda_useManagedMemory and cuda_usePinnedHostMemory are mutually exclusive")
Constraints.condError(cuda_useManagedMemory && cuda_useZeroCopy, "cuda_useManagedMemory and cuda_useZeroCopy are mutually exclusive")
Constraints.condEnsureValue(data_alignFieldPointers, false, cuda_useManagedMemory || cuda_usePinnedHostMemory)
Constraints.condEnsureValue(data_alignTmpBufferPointers, false, cuda_useManagedMemory || cuda_usePinnedHostMemory)
Constraints.condWarn(experimental_splitLoopsForAsyncComm && !comm_onlyAxisNeighbors, s"Using asynchronous communication without comm_onlyAxisNeighbors leads to problems with stencils containing diagonal entries")
......
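A minimal standalone sketch of how the new memory-mode constraints resolve (plain vars and require(), not the real Constraints helpers): zero-copy forces pinned host memory, managed memory excludes the other two modes, and either mode turns off aligned field/buffer pointers.

object CudaMemoryConstraintsSketch extends App {
  var cuda_useManagedMemory    = false
  var cuda_usePinnedHostMemory = false
  var cuda_useZeroCopy         = true
  var data_alignFieldPointers  = true

  // condEnsureValue(cuda_usePinnedHostMemory, true, cuda_useZeroCopy)
  if (cuda_useZeroCopy) cuda_usePinnedHostMemory = true
  // condError: managed memory is mutually exclusive with pinned host memory and zero copy
  require(!(cuda_useManagedMemory && cuda_usePinnedHostMemory), "managed memory excludes pinned host memory")
  require(!(cuda_useManagedMemory && cuda_useZeroCopy), "managed memory excludes zero copy")
  // condEnsureValue(data_align*, false, managed || pinned)
  if (cuda_useManagedMemory || cuda_usePinnedHostMemory) data_alignFieldPointers = false

  println(s"pinned=$cuda_usePinnedHostMemory aligned=$data_alignFieldPointers") // pinned=true aligned=false
}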
......@@ -203,7 +203,7 @@ object IR_AddInternalVariables extends DefaultStrategy("Add internal variables")
bufferAllocs += (id -> IR_LoopOverFragments(
IR_ArrayAllocation(
buf,
if(buf.field.layout.datatype.isInstanceOf[IR_ComplexDatatype]) buf.field.layout.datatype
if (buf.field.layout.datatype.isInstanceOf[IR_ComplexDatatype]) buf.field.layout.datatype
else IR_RealDatatype,
size
), IR_ParallelizationInfo(potentiallyParallel = true)))
......@@ -259,9 +259,11 @@ object IR_AddInternalVariables extends DefaultStrategy("Add internal variables")
func.body += genericAlloc._2
for (deviceAlloc <- deviceFieldAllocs.toSeq.sortBy(_._1) ++ deviceBufferAllocs.toSeq.sortBy(_._1))
if ("Condition" == Knowledge.cuda_preferredExecution)
func.body += IR_IfCondition(IR_Negation(Knowledge.cuda_executionCondition), deviceAlloc._2)
else if ("MSVC" == Platform.targetCompiler /*&& Platform.targetCompilerVersion <= 11*/ ) // fix for https://support.microsoft.com/en-us/kb/315481
if ("Condition" == Knowledge.cuda_preferredExecution) {
val loop = deviceAlloc._2.asInstanceOf[IR_LoopOverFragments]
loop.body = ListBuffer(IR_IfCondition(IR_Negation(Knowledge.cuda_executionCondition), loop.body))
func.body += loop
} else if ("MSVC" == Platform.targetCompiler /*&& Platform.targetCompilerVersion <= 11*/ ) // fix for https://support.microsoft.com/en-us/kb/315481
func.body += IR_Scope(deviceAlloc._2)
else
func.body += deviceAlloc._2
......
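A minimal standalone sketch (hypothetical simplified AST nodes, not the real IR) of the restructuring above: the execution-condition guard now sits inside the fragment loop instead of wrapping it, so the allocation keeps IR_LoopOverFragments as its outermost construct.

object GuardInsideLoopSketch extends App {
  sealed trait Node
  case class If(cond: String, body: List[Node]) extends Node
  case class LoopOverFragments(var body: List[Node]) extends Node
  case class Alloc(what: String) extends Node

  val deviceAlloc = LoopOverFragments(List(Alloc("deviceFieldData")))

  // before: if (!(executionCondition)) { loop over fragments { allocate } }
  val before: Node = If("!executionCondition", List(deviceAlloc))

  // after: loop over fragments { if (!(executionCondition)) { allocate } }
  val after: LoopOverFragments = {
    val loop = LoopOverFragments(deviceAlloc.body)
    loop.body = List(If("!executionCondition", loop.body))
    loop
  }

  println(before)
  println(after)
}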
......@@ -79,6 +79,7 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
case ex : VectorizationException =>
if (DEBUG) {
val msg : String = "[vect] unable to vectorize loop: " + ex.msg + " (line " + ex.getStackTrace()(0).getLineNumber + ')'
Logger.warn(msg)
println(msg) // print directly, logger may be silenced by any surrounding strategy
return List(IR_Comment(msg), node)
}
......@@ -144,6 +145,8 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
private var alignedResidue : Long = -1
private val nameTempl : String = "_vec%02d"
private var reductionVarArrayAccesses : Option[IR_ArrayAccess] = None
// init
pushScope()
......@@ -241,6 +244,14 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
def getAlignedResidue() : Long = {
alignedResidue
}
def setReductionArrayAccess(arrAcc : IR_ArrayAccess) = {
reductionVarArrayAccesses = Some(arrAcc)
}
def getReductionArrayAccess() = {
reductionVarArrayAccesses
}
}
private def containsVarAcc(node : IR_Node, varName : String) : Boolean = {
......@@ -262,8 +273,12 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
val ctx = new LoopCtx(itVar, incr)
var postLoopStmt : IR_Statement = null
if (reduction.isDefined) {
val target = reduction.get.target
val target = Duplicate(reduction.get.target)
val operator = reduction.get.op
target match {
case arrAcc : IR_ArrayAccess => ctx.setReductionArrayAccess(arrAcc)
case _ =>
}
val (vecTmp : String, true) = ctx.getName(target)
val identityElem : IR_Expression =
......@@ -602,6 +617,11 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
private def vectorizeExpr(expr : IR_Expression, ctx : LoopCtx) : IR_Expression = {
expr match {
case arrAcc : IR_ArrayAccess if ctx.getReductionArrayAccess().contains(arrAcc) =>
// vec was already added to ctx and declared
val (vecTmp : String, false) = ctx.getName(expr)
IR_VariableAccess(vecTmp, SIMD_RealDatatype)
// TODO: do not vectorize if base is not aligned?
case IR_ArrayAccess(base, index, alignedBase) =>
val (vecTmp : String, njuTmp : Boolean) = ctx.getName(expr)
......
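A minimal sketch (simplified stand-ins, not the real LoopCtx/vectorizer) of the new lookup: the reduction target's array access is remembered, and when it reappears during expression vectorization it is mapped to the already-declared accumulator vector instead of being gathered from memory like an ordinary array access.

object ReductionAccessSketch extends App {
  case class ArrayAccess(base: String, index: Int)

  var reductionVarArrayAccess: Option[ArrayAccess] = None
  def setReductionArrayAccess(a: ArrayAccess): Unit = reductionVarArrayAccess = Some(a)

  def vectorizeExpr(e: ArrayAccess): String =
    if (reductionVarArrayAccess.contains(e)) "_vec00 /* reuse reduction accumulator */"
    else s"load(${e.base}[${e.index}])"

  setReductionArrayAccess(ArrayAccess("s", 0))
  println(vectorizeExpr(ArrayAccess("s", 0))) // _vec00 /* reuse reduction accumulator */
  println(vectorizeExpr(ArrayAccess("a", 1))) // load(a[1])
}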
......@@ -31,19 +31,16 @@ object CUDA_AddGlobals extends NoTraversalStrategy("Extend globals for CUDA") {
override def doWork() : Unit = {
val globals = IR_GlobalCollection.get
if (!Knowledge.experimental_eliminateCudaContext)
globals.variables += IR_VariableDeclaration("CUcontext", "cudaContext")
globals.variables += IR_VariableDeclaration("CUdevice", "cudaDevice")
val initFunc = globals.functions.find(_.name == "initGlobals").get.asInstanceOf[IR_Function]
initFunc.body ++= ListBuffer[IR_Statement](
IR_VariableDeclaration(IR_IntegerDatatype, "deviceCount", 0),
"cuDeviceGetCount(&deviceCount)",
"cudaGetDeviceCount(&deviceCount)",
IR_Assert(IR_Lower(Knowledge.cuda_deviceId, "deviceCount"),
ListBuffer("\"Invalid device id (\"", Knowledge.cuda_deviceId, "\") must be smaller than the number of devices (\"", "deviceCount", "\")\""),
IR_FunctionCall("exit", 1)),
s"cuDeviceGet(&cudaDevice, ${ Knowledge.cuda_deviceId })")
IR_FunctionCall(IR_ExternalFunctionReference("exit"), 1)),
s"cudaSetDevice(${ Knowledge.cuda_deviceId })"
)
// print device info (name)
if (!Knowledge.testing_enabled) {
......@@ -53,10 +50,6 @@ object CUDA_AddGlobals extends NoTraversalStrategy("Extend globals for CUDA") {
IR_RawPrint("\"Using CUDA device \"", Knowledge.cuda_deviceId, "\": \"", "devProp.name", "std::endl"))
}
// create context
if (!Knowledge.experimental_eliminateCudaContext)
initFunc.body += "cuCtxCreate(&cudaContext, 0, cudaDevice)"
// set L1 cache and shared memory configuration for this device
if (Knowledge.cuda_useSharedMemory)
initFunc.body += "cudaDeviceSetCacheConfig(cudaFuncCachePreferShared)"
......
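A sketch of the device setup the runtime-API variant now emits; the generated C++ is shown as plain strings and deviceId stands in for Knowledge.cuda_deviceId. cudaGetDeviceCount and cudaSetDevice replace the former driver-API sequence (cuInit/cuDeviceGetCount/cuDeviceGet/cuCtxCreate), so no explicit CUcontext is created or destroyed anymore.

object CudaInitSketch extends App {
  val deviceId = 0
  val initGlobals = List(
    "int deviceCount = 0;",
    "cudaGetDeviceCount(&deviceCount);",
    s"assert($deviceId < deviceCount); // emitted as IR_Assert with exit(1) in the generator",
    s"cudaSetDevice($deviceId);"
  )
  initGlobals.foreach(println)
}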
......@@ -19,28 +19,8 @@
package exastencils.parallelization.api.cuda
import exastencils.base.ir._
import exastencils.config.Knowledge
import exastencils.prettyprinting.PpStream
/// CUDA_Init
case object CUDA_Init extends CUDA_DeviceStatement {
override def prettyprint(out : PpStream) : Unit = {
if (!Knowledge.experimental_eliminateCudaContext)
out << "cuInit(0);"
}
}
/// CUDA_Finalize
case object CUDA_Finalize extends CUDA_DeviceStatement {
override def prettyprint(out : PpStream) : Unit = {
// has to be done after all other de-initialization statements
if (!Knowledge.experimental_eliminateCudaContext)
out << "cuCtxDestroy(cudaContext);"
}
}
/// CUDA_DeviceSynchronize
case class CUDA_DeviceSynchronize() extends CUDA_HostStatement with IR_Expandable {
......
......@@ -48,16 +48,16 @@ case class CUDA_CheckError(var exp : IR_Expression) extends CUDA_HostStatement w
def print = IR_RawPrint("\"CUDA error in file (\"", "__FILE__", "\"), line (\"", "__LINE__", "\"): \"", status,
"\" -> \"", IR_FunctionCall(IR_ExternalFunctionReference("cudaGetErrorString"), status), "std::endl")
def printAndExit : ListBuffer[IR_Statement] = ListBuffer(print, IR_FunctionCall(IR_ExternalFunctionReference("exit"), 1))
ListBuffer(
IR_VariableDeclaration(status, exp),
IR_IfCondition("cudaSuccess" Neq status,
print,
printAndExit,
ListBuffer(
IR_Assignment(status, IR_FunctionCall(IR_ExternalFunctionReference("cudaGetLastError"))),
IR_IfCondition("cudaSuccess" Neq status,
ListBuffer[IR_Statement](
print,
IR_FunctionCall(IR_ExternalFunctionReference("exit"), 1))
printAndExit
))))
}
}
......@@ -114,7 +114,7 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
loop.getAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION).contains(CUDA_Util.CUDA_BAND_START) =>
// remove the annotation first to guarantee single application of this transformation.
loop.annotate(CUDA_Util.CUDA_LOOP_ANNOTATION)
loop.removeAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION)
val parallelLoops = (x : IR_ForLoop) => {
x.hasAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION) &&
......@@ -140,11 +140,15 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
val kernelCount = kernelFunctions.counterMap.getOrElse(fctNameCollector.getCurrentName, -1) + 1
val reduction = loop.parallelization.reduction
val reduction = Duplicate(loop.parallelization.reduction)
val redTarget = if (reduction.isDefined)
Some(Duplicate(reduction.get.target))
else
None
// local variable for kernels with reductions
val localTarget = if (reduction.isDefined)
Some(IR_VariableAccess(reduction.get.targetName + "_local_" + kernelCount, CUDA_Util.getReductionDatatype(reduction.get.target)))
Some(IR_VariableAccess(reduction.get.targetName + "_local_" + kernelCount, CUDA_Util.getReductionDatatype(redTarget.get)))
else
None
......@@ -152,17 +156,17 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
CUDA_GatherVariableAccesses.clear()
CUDA_GatherVariableAccesses.kernelCount = kernelCount
if (reduction.isDefined)
CUDA_GatherVariableAccesses.reductionTarget = Some(reduction.get.target)
CUDA_GatherVariableAccesses.reductionTarget = redTarget
CUDA_GatherVariableAccesses.applyStandalone(IR_Scope(loop))
// declare and init local reduction target
if (localTarget.isDefined) {
var decl = IR_VariableDeclaration(localTarget.get)
var initLocalTarget = CUDA_Util.getReductionDatatype(reduction.get.target) match {
var initLocalTarget = CUDA_Util.getReductionDatatype(redTarget.get) match {
case _ : IR_ScalarDatatype =>
ListBuffer[IR_Statement](IR_Assignment(localTarget.get, reduction.get.target))
ListBuffer[IR_Statement](IR_Assignment(localTarget.get, redTarget.get))
case mat : IR_MatrixDatatype =>
reduction.get.target match {
redTarget.get match {
case vAcc : IR_VariableAccess =>
IR_GenerateBasicMatrixOperations.loopSetSubmatrixMatPointer(
vAcc, localTarget.get, mat.sizeN, mat.sizeM, mat.sizeN, 0, 0).body
......@@ -218,7 +222,7 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
// replace array accesses with accesses to function arguments
// reduction var is not replaced, but later in IR_HandleReductions
if (reduction.isDefined)
CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = Some(reduction.get.target)
CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = redTarget
else
CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = None
CUDA_ReplaceNonReductionVarArrayAccesses.applyStandalone(IR_Scope(kernelBody))
......
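A minimal standalone sketch (hypothetical mutable node, not the real IR or the Duplicate helper) of the aliasing issue that duplicating the reduction target avoids: kernel extraction rewrites nodes in the loop body in place, and a shared reference would silently change with them, while a deep copy keeps pointing at the original target.

object DuplicateTargetSketch extends App {
  case class VarAccess(var name: String)

  val targetInLoop = VarAccess("s")               // node embedded in the loop body
  val aliased      = targetInLoop                 // shared reference (old behaviour)
  val duplicated   = VarAccess(targetInLoop.name) // what Duplicate(...) conceptually produces

  targetInLoop.name = "s_kernel_arg"              // a later transform renames the node in place

  println(aliased.name)    // s_kernel_arg -> stale for the reduction bookkeeping
  println(duplicated.name) // s            -> still the original reduction target
}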
......@@ -23,6 +23,7 @@ import scala.collection.mutable._
import exastencils.base.ir._
import exastencils.core.collectors.Collector
import exastencils.datastructures._
import exastencils.domain.ir.IR_IV_NeighborFragmentIdx
import exastencils.field.ir._
import exastencils.logger.Logger
......@@ -85,6 +86,12 @@ class CUDA_GatherFieldAccess extends Collector {
}
}
// also consider neighbor fragment accesses
access.fragIdx match {
case neigh : IR_IV_NeighborFragmentIdx => identifier += s"_n${ neigh.neighIdx }"
case _ =>
}
if (isRead)
fieldAccesses.put("read_" + identifier, access)
if (isWrite)
......
......@@ -25,6 +25,7 @@ import exastencils.base.ir._
import exastencils.config._
import exastencils.datastructures.Transformation._
import exastencils.datastructures._
import exastencils.domain.ir.IR_IV_NeighborFragmentIdx
import exastencils.field.ir._
import exastencils.optimization.ir.IR_SimplifyExpression
......@@ -49,6 +50,12 @@ object CUDA_GatherFieldAccessLike extends QuietDefaultStrategy("Gather local Fie
}
}
// also consider neighbor fragment accesses
access.fragIdx match {
case neigh : IR_IV_NeighborFragmentIdx => identifier += s"_n${ neigh.neighIdx }"
case _ =>
}
identifier
}
......
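A minimal sketch (simplified, not the real CUDA_GatherFieldAccess / CUDA_GatherFieldAccessLike collectors) of the identifier suffixing added in both collectors: accesses routed through a neighbor fragment index get a distinct "_n<idx>" suffix, so they are not merged with the access to the local fragment's data when kernel arguments and transfers are set up.

object NeighborAccessIdSketch extends App {
  def identifierFor(fieldId: String, neighborFragIdx: Option[Int]): String =
    neighborFragIdx match {
      case Some(n) => s"${fieldId}_n$n"
      case None    => fieldId
    }

  println(identifierFor("read_u", None))    // read_u
  println(identifierFor("read_u", Some(3))) // read_u_n3
}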
......@@ -38,17 +38,18 @@ case class CUDA_HandleFragmentLoopsWithReduction(
val iter = IR_LoopOverFragments.defIt
val redTarget = reduction.target
val red = Duplicate(reduction)
val redTarget = Duplicate(red.target)
val reductionDt = CUDA_Util.getReductionDatatype(redTarget)
val counter = CUDA_HandleFragmentLoopsWithReduction.getReductionCounter(reduction.targetName)
val counter = CUDA_HandleFragmentLoopsWithReduction.getReductionCounter(red.targetName)
val copies = {
val innerDt = reductionDt match {
case scalar : IR_ScalarDatatype => scalar
case hodt : IR_HigherDimensionalDatatype => IR_ArrayDatatype(hodt.resolveBaseDatatype, hodt.getSizeArray.product)
}
IR_VariableAccess(reduction.targetName + "_" + counter, IR_ArrayDatatype(innerDt, Knowledge.domain_numFragmentsPerBlock))
IR_VariableAccess(red.targetName + "_fragCpy" + counter, IR_ArrayDatatype(innerDt, Knowledge.domain_numFragmentsPerBlock))
}
val currCopy = IR_ArrayAccess(copies, iter)
......@@ -80,11 +81,17 @@ case class CUDA_HandleFragmentLoopsWithReduction(
matrixAssignment("std::copy", redTarget, currCopy, hodt.getSizeArray.product)
}
def initCopies() = ListBuffer(
IR_VariableDeclaration(copies), // declare copies
IR_LoopOverFragments( // init copies
copyReductionTarget()),
resetReductionTarget()) // reset initial value as it is already in the copies
def initCopies() = {
val declCopies = IR_VariableDeclaration(copies)
val initCopies = IR_LoopOverFragments(
copyReductionTarget()).expandSpecial().inner
val resetRedTarget = resetReductionTarget() // reset initial value as it is already in the copies
ListBuffer(
declCopies,
initCopies,
resetRedTarget)
}
def finalizeReduction(body : ListBuffer[IR_Statement]) = {
// finalize reduction
......@@ -99,10 +106,10 @@ case class CUDA_HandleFragmentLoopsWithReduction(
val src = IR_ArrayAccess(currCopy, idx)
IR_ForLoop(IR_VariableDeclaration(i, IR_IntegerConstant(0)), IR_Lower(i, mat.sizeM), IR_PreIncrement(i), ListBuffer[IR_Statement](
IR_ForLoop(IR_VariableDeclaration(j, 0), IR_Lower(j, mat.sizeN), IR_PreIncrement(j), ListBuffer[IR_Statement](
IR_Assignment(dst, IR_BinaryOperators.createExpression(reduction.op, dst, src))))))
IR_Assignment(dst, IR_BinaryOperators.createExpression(red.op, dst, src))))))
case _ : IR_ScalarDatatype =>
IR_Assignment(redTarget, IR_BinaryOperators.createExpression(reduction.op, redTarget, currCopy))
IR_Assignment(redTarget, IR_BinaryOperators.createExpression(red.op, redTarget, currCopy))
}
body :+ assign
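A minimal sketch of finalizeReduction for a scalar target (plain Scala values, not the real IR): after the fragment loop, every fragment's private copy in the "<target>_fragCpy<counter>" array is folded back into the reduction target with the reduction operator.

object FragmentReductionSketch extends App {
  val op: (Double, Double) => Double = _ + _       // red.op, here "+"
  val copies = Array(0.5, 1.0, 2.0, 4.0)           // per-fragment partial results
  var redTarget = 0.0
  for (fragmentIdx <- copies.indices)
    redTarget = op(redTarget, copies(fragmentIdx)) // IR_Assignment(redTarget, op(redTarget, currCopy))
  println(redTarget) // 7.5
}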
......@@ -110,21 +117,21 @@ case class CUDA_HandleFragmentLoopsWithReduction(
def replaceAccesses(body : ListBuffer[IR_Statement]) = {
// replace occurrences
CUDA_ReplaceReductionAccesses.redTarget = redTarget
CUDA_ReplaceReductionAccesses.replacement = currCopy
CUDA_ReplaceReduc