Commit 4c96a571 authored by Sara Faghih-Naini

Merge branch 'devel/fix_cuda_nodePos_comm' into 'master'

devel/fix_cuda_nodePos_comm

See merge request exastencils/exastencils!72
parents 7e484447 8d4071ca
@@ -52,6 +52,7 @@ stages:
- python3 --version
- mpirun --version
- nvcc --version
- nvidia-smi
- updatedb
- locate cuda.h
- locate libcudart
...
@@ -325,10 +325,18 @@ object IR_DefaultLayerHandler extends IR_LayerHandler {
if (Knowledge.data_genVariableFieldSizes)
IR_GenerateIndexManipFcts.apply()
// adapt accesses to device data in case of managed memory
if (Knowledge.cuda_enabled && Knowledge.cuda_useManagedMemory)
CUDA_AdaptDeviceAccessesForMM.apply()
IR_AddInternalVariables.apply()
// resolve possibly newly added constant IVs
IR_ResolveConstIVs.apply()
// adapt allocations and de-allocations before expanding
if (Knowledge.cuda_enabled)
CUDA_AdaptAllocations.apply()
if (Knowledge.useFasterExpand)
IR_ExpandInOnePass.apply()
else
...
@@ -26,7 +26,6 @@ import exastencils.base.ir._
import exastencils.config.Knowledge
import exastencils.datastructures._
import exastencils.logger.Logger
import exastencils.parallelization.api.cuda._
import exastencils.parallelization.api.mpi._
import exastencils.parallelization.api.omp.OMP_Parallel
@@ -51,11 +50,6 @@ object IR_HandleMainApplication extends DefaultStrategy("HandleMainApplication")
func.body.append(IR_Native("LIKWID_MARKER_CLOSE"))
}
if (Knowledge.cuda_enabled) {
func.body.prepend(CUDA_Init)
func.body.append(CUDA_Finalize)
}
if (Knowledge.mpi_enabled) {
func.body.prepend(MPI_Init)
func.body.append(MPI_Finalize)
...
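With the switch to the CUDA runtime API (see the CUDA_AddGlobals and CUDA_Init/CUDA_Finalize changes below), main() no longer needs explicit CUDA setup or teardown. Roughly, the generated main now only keeps the MPI bracket (hand-written approximation, not generator output):

    #include <mpi.h>

    int main(int argc, char** argv) {
      MPI_Init(&argc, &argv);            // prepended if mpi_enabled
      // cuInit(0) ... cuCtxDestroy(cudaContext) are no longer emitted here
      // ... generated application code ...
      MPI_Finalize();                    // appended if mpi_enabled
      return 0;
    }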
@@ -38,7 +38,7 @@ case class IR_ArrayAllocation(var name : IR_Expression, // no string - could be
case class IR_ScalarAllocation(var name : IR_Expression, // no string - could be an IV
var datatype : IR_Datatype,
) extends IR_Statement {
//override def datatype = IR_UnitDatatype
override def prettyprint(out : PpStream) : Unit = out << name << " = " << "new" << ' ' << datatype << ";"
}
@@ -49,7 +49,6 @@ case class IR_ArrayFree(var pointer : IR_Expression) extends IR_Statement {
override def prettyprint(out : PpStream) : Unit = out << "delete[] " << pointer << ";"
}
/// IR_ScalarFree
case class IR_ScalarFree(var pointer : IR_Expression) extends IR_Statement {
override def prettyprint(out : PpStream) : Unit = out << "delete " << pointer << ";"
...
@@ -515,6 +515,16 @@ object Knowledge {
// apply spatial blocking with read-only cache
var cuda_spatialBlockingWithROC : Boolean = false
// use pinned memory to allocate host field data and buffers
var cuda_usePinnedHostMemory : Boolean = true
// use managed memory instead of host and device variants for field data and buffers
var cuda_useManagedMemory : Boolean = false
// replace device variants of field data and buffers with device pointers derived from host counterparts
var cuda_useZeroCopy : Boolean = false
// only relevant if cuda_useManagedMemory == true; replace cuda memcpy with asynchronous prefetches
var cuda_genAsyncPrefetch : Boolean = true
// if true, the first dimension of the block size is enlarged if the kernel dimensionality is lower than the global dimensionality
var cuda_foldBlockSizeForRedDimensionality : Boolean = true
@@ -647,9 +657,6 @@ object Knowledge {
var experimental_CTPivotElimination : Boolean = false
var experimental_QRPivot : Boolean = false
// eliminate occurrences of cudaContext - required for PizDaint
var experimental_eliminateCudaContext : Boolean = false
// generate cuda kernels independently of them being parallel or not
var experimental_cuda_generateKernelForNonParallel : Boolean = false
@@ -672,7 +679,6 @@ object Knowledge {
var visit_enable : Boolean = false
var experimental_visit_addCurveMesh : Boolean = false
/// === constraints and resolutions ===
def update() : Unit = {
// NOTE: it is required to call update at least once
@@ -774,7 +780,13 @@ object Knowledge {
Constraints.condWarn(cuda_enabled && opt_conventionalCSE && !useDblPrecision, "Double precision should be used if CUDA is enabled and CSE should be applied!")
Constraints.condEnsureValue(useDblPrecision, true, cuda_enabled && opt_conventionalCSE)
Constraints.condError(!cuda_memory_transfer_elimination_options.contains(cuda_eliminate_memory_transfers), "Invalid value for \"cuda_eliminate_memory_transfers\". Should be one of: " + cuda_memory_transfer_elimination_options.mkString(",") )
Constraints.condError(!cuda_memory_transfer_elimination_options.contains(cuda_eliminate_memory_transfers), "Invalid value for \"cuda_eliminate_memory_transfers\". Should be one of: " + cuda_memory_transfer_elimination_options.mkString(","))
Constraints.condEnsureValue(cuda_usePinnedHostMemory, true, cuda_useZeroCopy)
Constraints.condError(cuda_useManagedMemory && cuda_usePinnedHostMemory, "cuda_useManagedMemory and cuda_usePinnedHostMemory are mutually exclusive")
Constraints.condError(cuda_useManagedMemory && cuda_useZeroCopy, "cuda_useManagedMemory and cuda_useZeroCopy are mutually exclusive")
Constraints.condEnsureValue(data_alignFieldPointers, false, cuda_useManagedMemory || cuda_usePinnedHostMemory)
Constraints.condEnsureValue(data_alignTmpBufferPointers, false, cuda_useManagedMemory || cuda_usePinnedHostMemory)
Constraints.condWarn(experimental_splitLoopsForAsyncComm && !comm_onlyAxisNeighbors, s"Using asynchronous communication with comm_onlyAxisNeighbors leads to problems with stencils containing diagonal entries")
...
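The new flags select between three mutually exclusive allocation schemes for field data and buffers, plus an optional prefetch mode for managed memory. As a rough hand-written illustration of what each option corresponds to in the CUDA runtime API (not generator output; all names are invented, and only one scheme would be active at a time):

    #include <cuda_runtime.h>

    void allocateFieldData(size_t numElems) {
      size_t numBytes = numElems * sizeof(double);
      double *hostData = nullptr, *deviceData = nullptr;

      // cuda_usePinnedHostMemory: page-locked host buffer plus a separate device copy
      cudaMallocHost((void**)&hostData, numBytes);
      cudaMalloc((void**)&deviceData, numBytes);

      // cuda_useZeroCopy: the device pointer is derived from the pinned host buffer
      cudaHostGetDevicePointer((void**)&deviceData, hostData, 0);

      // cuda_useManagedMemory: a single allocation accessible from host and device
      double *managedData = nullptr;
      cudaMallocManaged((void**)&managedData, numBytes);

      // cuda_genAsyncPrefetch (managed memory only): prefetch instead of explicit memcpy
      cudaMemPrefetchAsync(managedData, numBytes, 0 /* device id */, 0 /* stream */);
    }

The constraints above also disable data_alignFieldPointers and data_alignTmpBufferPointers in these modes, presumably because the buffers are then allocated through the CUDA runtime rather than through the generator's aligned host allocation.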
@@ -203,7 +203,7 @@ object IR_AddInternalVariables extends DefaultStrategy("Add internal variables")
bufferAllocs += (id -> IR_LoopOverFragments(
IR_ArrayAllocation(
buf,
if(buf.field.layout.datatype.isInstanceOf[IR_ComplexDatatype]) buf.field.layout.datatype
if (buf.field.layout.datatype.isInstanceOf[IR_ComplexDatatype]) buf.field.layout.datatype
else IR_RealDatatype,
size
), IR_ParallelizationInfo(potentiallyParallel = true)))
@@ -259,9 +259,11 @@ object IR_AddInternalVariables extends DefaultStrategy("Add internal variables")
func.body += genericAlloc._2
for (deviceAlloc <- deviceFieldAllocs.toSeq.sortBy(_._1) ++ deviceBufferAllocs.toSeq.sortBy(_._1))
if ("Condition" == Knowledge.cuda_preferredExecution)
if ("Condition" == Knowledge.cuda_preferredExecution) {
func.body += IR_IfCondition(IR_Negation(Knowledge.cuda_executionCondition), deviceAlloc._2)
val loop = deviceAlloc._2.asInstanceOf[IR_LoopOverFragments]
else if ("MSVC" == Platform.targetCompiler /*&& Platform.targetCompilerVersion <= 11*/ ) // fix for https://support.microsoft.com/en-us/kb/315481
loop.body = ListBuffer(IR_IfCondition(IR_Negation(Knowledge.cuda_executionCondition), loop.body))
func.body += loop
} else if ("MSVC" == Platform.targetCompiler /*&& Platform.targetCompilerVersion <= 11*/ ) // fix for https://support.microsoft.com/en-us/kb/315481
func.body += IR_Scope(deviceAlloc._2)
else
func.body += deviceAlloc._2
...
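With this change, the node appended to func.body for the "Condition" execution mode is still the IR_LoopOverFragments itself; the negated execution condition is now checked inside each iteration instead of wrapping the whole loop. A rough sketch of the resulting shape of the generated code (all names and the condition are placeholders):

    #include <cuda_runtime.h>

    void allocDeviceFields(double** fieldDeviceData, int numFragments,
                           size_t numBytes, bool executionCondition) {
      for (int fragmentIdx = 0; fragmentIdx < numFragments; ++fragmentIdx)
        if (!executionCondition)   // guard sits inside the fragment loop body now
          cudaMalloc((void**)&fieldDeviceData[fragmentIdx], numBytes);
    }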
@@ -79,6 +79,7 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
case ex : VectorizationException =>
if (DEBUG) {
val msg : String = "[vect] unable to vectorize loop: " + ex.msg + " (line " + ex.getStackTrace()(0).getLineNumber + ')'
Logger.warn(msg)
println(msg) // print directly, logger may be silenced by any surrounding strategy
return List(IR_Comment(msg), node)
}
@@ -144,6 +145,8 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
private var alignedResidue : Long = -1
private val nameTempl : String = "_vec%02d"
private var reductionVarArrayAccesses : Option[IR_ArrayAccess] = None
// init
pushScope()
@@ -241,6 +244,14 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
def getAlignedResidue() : Long = {
alignedResidue
}
def setReductionArrayAccess(arrAcc : IR_ArrayAccess) = {
reductionVarArrayAccesses = Some(arrAcc)
}
def getReductionArrayAccess() = {
reductionVarArrayAccesses
}
}
private def containsVarAcc(node : IR_Node, varName : String) : Boolean = {
@@ -262,8 +273,12 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
val ctx = new LoopCtx(itVar, incr)
var postLoopStmt : IR_Statement = null
if (reduction.isDefined) {
val target = reduction.get.target
val target = Duplicate(reduction.get.target)
val operator = reduction.get.op
target match {
case arrAcc : IR_ArrayAccess => ctx.setReductionArrayAccess(arrAcc)
case _ =>
}
val (vecTmp : String, true) = ctx.getName(target)
val identityElem : IR_Expression =
@@ -602,6 +617,11 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
private def vectorizeExpr(expr : IR_Expression, ctx : LoopCtx) : IR_Expression = {
expr match {
case arrAcc : IR_ArrayAccess if ctx.getReductionArrayAccess().contains(arrAcc) =>
// vec was already added to ctx and declared
val (vecTmp : String, false) = ctx.getName(expr)
IR_VariableAccess(vecTmp, SIMD_RealDatatype)
// TODO: do not vectorize if base is not aligned?
case IR_ArrayAccess(base, index, alignedBase) =>
val (vecTmp : String, njuTmp : Boolean) = ctx.getName(expr)
...
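The new reductionVarArrayAccesses bookkeeping lets the vectorizer recognize the reduction target itself when it is an array access (for instance a matrix entry): every occurrence of that exact access is mapped to the accumulator vector already declared for the reduction, instead of being vectorized as a fresh load. A hand-written AVX sketch of the intended result for a double-precision sum reduction into s[0] (illustrative only, not ExaStencils output):

    #include <immintrin.h>

    void reduceSum(const double* a, double* s, int n) {
      __m256d acc = _mm256_set1_pd(0.0);                    // accumulator standing in for s[0]
      int i = 0;
      for (; i + 4 <= n; i += 4)
        acc = _mm256_add_pd(acc, _mm256_loadu_pd(&a[i]));   // occurrences of s[0] reuse acc
      double tmp[4];
      _mm256_storeu_pd(tmp, acc);
      s[0] += tmp[0] + tmp[1] + tmp[2] + tmp[3];            // post-loop horizontal reduction
      for (; i < n; ++i)                                    // scalar remainder
        s[0] += a[i];
    }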
@@ -31,19 +31,16 @@ object CUDA_AddGlobals extends NoTraversalStrategy("Extend globals for CUDA") {
override def doWork() : Unit = {
val globals = IR_GlobalCollection.get
if (!Knowledge.experimental_eliminateCudaContext)
globals.variables += IR_VariableDeclaration("CUcontext", "cudaContext")
globals.variables += IR_VariableDeclaration("CUdevice", "cudaDevice")
val initFunc = globals.functions.find(_.name == "initGlobals").get.asInstanceOf[IR_Function]
initFunc.body ++= ListBuffer[IR_Statement](
IR_VariableDeclaration(IR_IntegerDatatype, "deviceCount", 0),
"cuDeviceGetCount(&deviceCount)",
"cudaGetDeviceCount(&deviceCount)",
IR_Assert(IR_Lower(Knowledge.cuda_deviceId, "deviceCount"),
ListBuffer("\"Invalid device id (\"", Knowledge.cuda_deviceId, "\") must be smaller than the number of devices (\"", "deviceCount", "\")\""),
IR_FunctionCall("exit", 1)),
IR_FunctionCall(IR_ExternalFunctionReference("exit"), 1)),
s"cuDeviceGet(&cudaDevice, ${ Knowledge.cuda_deviceId })")
s"cudaSetDevice(${ Knowledge.cuda_deviceId })"
)
// print device info (name)
if (!Knowledge.testing_enabled) {
@@ -53,10 +50,6 @@ object CUDA_AddGlobals extends NoTraversalStrategy("Extend globals for CUDA") {
IR_RawPrint("\"Using CUDA device \"", Knowledge.cuda_deviceId, "\": \"", "devProp.name", "std::endl"))
}
// create context
if (!Knowledge.experimental_eliminateCudaContext)
initFunc.body += "cuCtxCreate(&cudaContext, 0, cudaDevice)"
// set L1 cache and shared memory configuration for this device
if (Knowledge.cuda_useSharedMemory)
initFunc.body += "cudaDeviceSetCacheConfig(cudaFuncCachePreferShared)"
...
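Device initialization now relies solely on the CUDA runtime API, so the driver-API context handling (cuInit, cuDeviceGet, cuCtxCreate and the cudaContext/cudaDevice globals) disappears. A hand-written approximation of the generated initGlobals body (device id hard-coded to 0 here, assertion message shortened):

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    void initDevice() {
      int deviceCount = 0;
      cudaGetDeviceCount(&deviceCount);
      if (!(0 /* cuda_deviceId */ < deviceCount)) {
        printf("Invalid device id, must be smaller than the number of devices (%d)\n", deviceCount);
        exit(1);
      }
      cudaSetDevice(0 /* cuda_deviceId */);
      // optionally followed by cudaDeviceSetCacheConfig(cudaFuncCachePreferShared)
      // when cuda_useSharedMemory is set
    }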
@@ -19,28 +19,8 @@
package exastencils.parallelization.api.cuda
import exastencils.base.ir._
import exastencils.config.Knowledge
import exastencils.prettyprinting.PpStream
/// CUDA_Init
case object CUDA_Init extends CUDA_DeviceStatement {
override def prettyprint(out : PpStream) : Unit = {
if (!Knowledge.experimental_eliminateCudaContext)
out << "cuInit(0);"
}
}
/// CUDA_Finalize
case object CUDA_Finalize extends CUDA_DeviceStatement {
override def prettyprint(out : PpStream) : Unit = {
// has to be done after all other de-initialization statements
if (!Knowledge.experimental_eliminateCudaContext)
out << "cuCtxDestroy(cudaContext);"
}
}
/// CUDA_DeviceSynchronize
case class CUDA_DeviceSynchronize() extends CUDA_HostStatement with IR_Expandable {
...
@@ -48,16 +48,16 @@ case class CUDA_CheckError(var exp : IR_Expression) extends CUDA_HostStatement w
def print = IR_RawPrint("\"CUDA error in file (\"", "__FILE__", "\"), line (\"", "__LINE__", "\"): \"", status,
"\" -> \"", IR_FunctionCall(IR_ExternalFunctionReference("cudaGetErrorString"), status), "std::endl")
def printAndExit : ListBuffer[IR_Statement] = ListBuffer(print, IR_FunctionCall(IR_ExternalFunctionReference("exit"), 1))
ListBuffer(
IR_VariableDeclaration(status, exp),
IR_IfCondition("cudaSuccess" Neq status,
print,
printAndExit,
ListBuffer(
IR_Assignment(status, IR_FunctionCall(IR_ExternalFunctionReference("cudaGetLastError"))),
IR_IfCondition("cudaSuccess" Neq status,
ListBuffer[IR_Statement](
printAndExit
print,
IR_FunctionCall(IR_ExternalFunctionReference("exit"), 1))
))))
}
}
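The new printAndExit helper bundles the diagnostic print with exit(1), so both the immediate status check and the cudaGetLastError fallback now terminate on failure. Roughly, the generated check expands to something like the following (hand-written approximation around a made-up cudaMemcpy call):

    #include <cstdlib>
    #include <iostream>
    #include <cuda_runtime.h>

    void copyToDevice(double* dst, const double* src, size_t numBytes) {
      cudaError_t status = cudaMemcpy(dst, src, numBytes, cudaMemcpyHostToDevice);
      if (cudaSuccess != status) {
        std::cout << "CUDA error in file (" << __FILE__ << "), line (" << __LINE__ << "): "
                  << status << " -> " << cudaGetErrorString(status) << std::endl;
        exit(1);
      } else {
        status = cudaGetLastError();
        if (cudaSuccess != status) {
          std::cout << "CUDA error in file (" << __FILE__ << "), line (" << __LINE__ << "): "
                    << status << " -> " << cudaGetErrorString(status) << std::endl;
          exit(1);
        }
      }
    }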
@@ -114,7 +114,7 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
loop.getAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION).contains(CUDA_Util.CUDA_BAND_START) =>
// remove the annotation first to guarantee single application of this transformation.
loop.annotate(CUDA_Util.CUDA_LOOP_ANNOTATION)
loop.removeAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION)
val parallelLoops = (x : IR_ForLoop) => {
x.hasAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION) &&
@@ -140,11 +145,15 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
val kernelCount = kernelFunctions.counterMap.getOrElse(fctNameCollector.getCurrentName, -1) + 1
val reduction = loop.parallelization.reduction
val reduction = Duplicate(loop.parallelization.reduction)
val redTarget = if (reduction.isDefined)
Some(Duplicate(reduction.get.target))
else
None
// local variable for kernels with reductions
val localTarget = if (reduction.isDefined)
Some(IR_VariableAccess(reduction.get.targetName + "_local_" + kernelCount, CUDA_Util.getReductionDatatype(reduction.get.target)))
Some(IR_VariableAccess(reduction.get.targetName + "_local_" + kernelCount, CUDA_Util.getReductionDatatype(redTarget.get)))
else
None
@@ -152,17 +156,17 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
CUDA_GatherVariableAccesses.clear()
CUDA_GatherVariableAccesses.kernelCount = kernelCount
if (reduction.isDefined)
CUDA_GatherVariableAccesses.reductionTarget = Some(reduction.get.target)
CUDA_GatherVariableAccesses.reductionTarget = redTarget
CUDA_GatherVariableAccesses.applyStandalone(IR_Scope(loop))
// declare and init local reduction target
if (localTarget.isDefined) {
var decl = IR_VariableDeclaration(localTarget.get)
var initLocalTarget = CUDA_Util.getReductionDatatype(reduction.get.target) match {
var initLocalTarget = CUDA_Util.getReductionDatatype(redTarget.get) match {
case _ : IR_ScalarDatatype =>
ListBuffer[IR_Statement](IR_Assignment(localTarget.get, reduction.get.target))
ListBuffer[IR_Statement](IR_Assignment(localTarget.get, redTarget.get))
case mat : IR_MatrixDatatype =>
reduction.get.target match {
redTarget.get match {
case vAcc : IR_VariableAccess =>
IR_GenerateBasicMatrixOperations.loopSetSubmatrixMatPointer(
vAcc, localTarget.get, mat.sizeN, mat.sizeM, mat.sizeN, 0, 0).body
@@ -218,7 +222,7 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
// replace array accesses with accesses to function arguments
// reduction var is not replaced, but later in IR_HandleReductions
if (reduction.isDefined)
CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = Some(reduction.get.target)
CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = redTarget
else
CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = None
CUDA_ReplaceNonReductionVarArrayAccesses.applyStandalone(IR_Scope(kernelBody))
...
@@ -23,6 +23,7 @@ import scala.collection.mutable._
import exastencils.base.ir._
import exastencils.core.collectors.Collector
import exastencils.datastructures._
import exastencils.domain.ir.IR_IV_NeighborFragmentIdx
import exastencils.field.ir._
import exastencils.logger.Logger
@@ -85,6 +86,12 @@ class CUDA_GatherFieldAccess extends Collector {
}
}
// also consider neighbor fragment accesses
access.fragIdx match {
case neigh : IR_IV_NeighborFragmentIdx => identifier += s"_n${ neigh.neighIdx }"
case _ =>
}
if (isRead)
fieldAccesses.put("read_" + identifier, access)
if (isWrite)
...
@@ -25,6 +25,7 @@ import exastencils.base.ir._
import exastencils.config._
import exastencils.datastructures.Transformation._
import exastencils.datastructures._
import exastencils.domain.ir.IR_IV_NeighborFragmentIdx
import exastencils.field.ir._
import exastencils.optimization.ir.IR_SimplifyExpression
@@ -49,6 +50,12 @@ object CUDA_GatherFieldAccessLike extends QuietDefaultStrategy("Gather local Fie
}
}
// also consider neighbor fragment accesses
access.fragIdx match {
case neigh : IR_IV_NeighborFragmentIdx => identifier += s"_n${ neigh.neighIdx }"
case _ =>
}
identifier
}
...
@@ -38,17 +38,18 @@ case class CUDA_HandleFragmentLoopsWithReduction(
val iter = IR_LoopOverFragments.defIt
val redTarget = reduction.target
val red = Duplicate(reduction)
val redTarget = Duplicate(red.target)
val reductionDt = CUDA_Util.getReductionDatatype(redTarget)
val counter = CUDA_HandleFragmentLoopsWithReduction.getReductionCounter(reduction.targetName)
val counter = CUDA_HandleFragmentLoopsWithReduction.getReductionCounter(red.targetName)
val copies = {
val innerDt = reductionDt match {
case scalar : IR_ScalarDatatype => scalar
case hodt : IR_HigherDimensionalDatatype => IR_ArrayDatatype(hodt.resolveBaseDatatype, hodt.getSizeArray.product)