Commit b11f6489 authored by Richard Angersbach

Distinguish between (non-)evaluable array/matrix accesses and set up a device copy in the non-evaluable case.
parent 5a953879
@@ -103,12 +103,35 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
// add kernel and kernel call
val kernelFunctions = CUDA_KernelFunctions.get
val kernelCount = kernelFunctions.counterMap.getOrElse(collector.getCurrentName, -1) + 1
// collect local accesses because these variables need to be passed to the kernel at call
CUDA_GatherVariableAccesses.clear()
CUDA_GatherVariableAccesses.kernelCount = kernelCount
if (loop.parallelization.reduction.isDefined)
CUDA_GatherVariableAccesses.reductionTarget = Some(loop.parallelization.reduction.get.target)
CUDA_GatherVariableAccesses.applyStandalone(IR_Scope(loop))
val accesses = CUDA_GatherVariableAccesses.accesses.toSeq.sortBy(_._1).to[ListBuffer]
val accesses = CUDA_GatherVariableAccesses.evaluableAccesses.toSeq.sortBy(_._1).to[ListBuffer]
val accessesCopiedToDevice = CUDA_GatherVariableAccesses.nonEvaluableAccesses.toSeq.sortBy(_._1).to[ListBuffer]
// add non-evaluable accesses in the form of pointers to device copies
val deviceArrayCopies = accessesCopiedToDevice.map {
case (k,v) =>
val copyName = CUDA_GatherVariableAccesses.arrayVariableAccessAsString(v._1)
val copyDt = IR_PointerDatatype(v._2.resolveBaseDatatype)
(k, IR_VariableAccess(copyName, copyDt))
}.toMap
// parameters of the kernel
val params = ListBuffer[IR_FunctionArgument]()
params ++= accesses.map { case (name, tup) => IR_FunctionArgument(name, tup._2) }
params ++= deviceArrayCopies.values.map(IR_FunctionArgument(_))
// args passed to kernel
val args = ListBuffer[IR_Expression]()
args ++= accesses.map { case (_, tup) => tup._1 : IR_Expression }
args ++= deviceArrayCopies.values
var extremaMap = mutable.HashMap[String, (Long, Long)]()
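To make the new split concrete: a minimal, self-contained sketch (stand-in types instead of the ExaStencils IR; the loop body and all names are hypothetical) of how accesses fall into the two buffers and which kernel-interface names they produce:

object AccessSplitSketch {
  // stand-ins for IR index expressions: "evaluable" here simply means "a constant"
  sealed trait Idx { def pp : String }
  case class Const(v : Long) extends Idx { def pp = v.toString }
  case class Sym(name : String) extends Idx { def pp = name }

  def isEvaluable(idx : Idx) = idx.isInstanceOf[Const]

  def main(args : Array[String]) : Unit = {
    val kernelCount = 0
    // accesses gathered from a body like "sum += a[2] + b[j]" with j unknown at compile time
    val found = Seq(("a", Const(2) : Idx), ("b", Sym("j") : Idx))
    // evaluable -> scalar kernel argument passed by value, named base + prettyprinted index
    val evaluable = found.collect { case (base, idx) if isEvaluable(idx) => base + idx.pp }
    // non-evaluable -> whole array copied to the device, pointer argument passed instead
    val deviceCopies = found.collect { case (base, idx) if !isEvaluable(idx) => s"${base}_deviceCopy_$kernelCount" }
    println(evaluable)    // List(a2)
    println(deviceCopies) // List(b_deviceCopy_0)
  }
}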
@@ -119,6 +142,7 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
IR_InlineMatSolveStmts.applyStandalone(kernelBody)
// replace array accesses with accesses to function arguments, ignore reduction variable
CUDA_ReplaceArrayAccesses.kernelCount = kernelCount
if (loop.parallelization.reduction.isDefined)
CUDA_ReplaceArrayAccesses.reductionTarget = Some(loop.parallelization.reduction.get.target)
else
@@ -128,7 +152,7 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
val kernel = CUDA_Kernel(
kernelFunctions.getIdentifier(collector.getCurrentName),
parallelInnerLoops.length,
accesses.map { case (name, tup) => IR_FunctionArgument(name, tup._2) },
params,
Duplicate(loopVariables),
Duplicate(lowerBounds),
Duplicate(upperBounds),
@@ -139,8 +163,17 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
kernelFunctions.addKernel(Duplicate(kernel))
// copy array variables from host to device if necessary
if (deviceArrayCopies.nonEmpty) {
deviceArrayCopies foreach { case (k, dstArr) =>
val (srcArr, srcDt) = accessesCopiedToDevice.find(_._1 == k).get._2
deviceStatements += IR_VariableDeclaration(dstArr)
deviceStatements += CUDA_Memcpy(dstArr, srcArr, srcDt.typicalByteSize, "cudaMemcpyHostToDevice")
}
}
// process return value of kernel wrapper call if reduction is required
val callKernel = IR_FunctionCall(kernel.getWrapperFctName, accesses.map { case (_, tup) => tup._1 : IR_Expression })
val callKernel = IR_FunctionCall(kernel.getWrapperFctName, args)
if (loop.parallelization.reduction.isDefined) {
val red = loop.parallelization.reduction.get
CUDA_Util.getReductionDatatype(red.target) match {
@@ -174,6 +207,10 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
deviceStatements += callKernel
}
// destroy device copies
if (deviceArrayCopies.nonEmpty)
deviceStatements ++= deviceArrayCopies.keys.map(CUDA_Free(_))
deviceStatements
}, false)
}
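Taken together, the hunks above give each non-evaluable base a declare/copy/call/free lifecycle on the host side. A condensed sketch of the assembled statements, assuming the ExaStencils IR on the classpath (DeviceCopySketch and statementsFor are illustrative names; the node constructors mirror the diff above):

import scala.collection.mutable.ListBuffer

import exastencils.base.ir._
import exastencils.parallelization.api.cuda._

object DeviceCopySketch {
  // declare the device pointer, copy the host array over, and free it after the call
  def statementsFor(srcArr : IR_VariableAccess, srcDt : IR_Datatype, kernelCount : Int) : ListBuffer[IR_Statement] = {
    val dstArr = IR_VariableAccess(s"${srcArr.name}_deviceCopy_$kernelCount", IR_PointerDatatype(srcDt.resolveBaseDatatype))
    val stmts = ListBuffer[IR_Statement]()
    stmts += IR_VariableDeclaration(dstArr)
    stmts += CUDA_Memcpy(dstArr, srcArr, srcDt.typicalByteSize, "cudaMemcpyHostToDevice")
    // ... the kernel wrapper call goes here, with dstArr appended to its argument list ...
    stmts += CUDA_Free(dstArr)
    stmts
  }
}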
@@ -24,19 +24,46 @@ import exastencils.base.ir._
import exastencils.baseExt.ir.IR_LoopOverFragments
import exastencils.config.Knowledge
import exastencils.datastructures._
import exastencils.logger.Logger
import exastencils.optimization.ir.EvaluationException
import exastencils.optimization.ir.IR_SimplifyExpression
import exastencils.parallelization.api.cuda.CUDA_Util._
object CUDA_GatherVariableAccesses extends QuietDefaultStrategy("Gather local VariableAccess nodes") {
var reductionTarget : Option[IR_Expression] = None
var kernelCount : Int = 0
var accesses = mutable.HashMap[String, (IR_Access, IR_Datatype)]()
var evaluableAccesses = mutable.HashMap[String, (IR_Access, IR_Datatype)]()
var nonEvaluableAccesses = mutable.HashMap[String, (IR_VariableAccess, IR_Datatype)]()
var ignoredAccesses = mutable.SortedSet[String]()
var ignoredMatrixVariableAccesses = mutable.SortedSet[String]()
def arrayAccessAsString(base : IR_VariableAccess, idx : IR_Expression) = base.name + idx.prettyprint()
def containsArrayAccess(base : IR_VariableAccess, idx : IR_Expression) = accesses.contains(arrayAccessAsString(base, idx))
var ignoredArrayVariableAccesses = mutable.SortedSet[String]()
def basePrefix(base : IR_VariableAccess) = base.name
// regular, evaluable indexed array accesses
def arrayAccessAsString(base : IR_VariableAccess, idx : IR_Expression) = basePrefix(base) + idx.prettyprint()
def containsArrayAccess(base : IR_VariableAccess, idx : IR_Expression) = evaluableAccesses.contains(arrayAccessAsString(base, idx))
// array variable accesses for the case that the whole array is passed to the kernel as an argument (for non-evaluable indices)
def arrayVariableAccessAsString(base : IR_VariableAccess) = s"${basePrefix(base)}_deviceCopy_$kernelCount"
def containsArrayVariableAccess(base : IR_VariableAccess) = nonEvaluableAccesses.contains(arrayVariableAccessAsString(base))
def isReplaceable(base : IR_VariableAccess, idx : IR_Expression) =
containsArrayAccess(base, idx) || containsArrayVariableAccess(base)
def replaceAccess(base : IR_VariableAccess, idx : IR_Expression) : Option[IR_Expression] = {
if (isReplaceable(base, idx)) {
if (containsArrayAccess(base, idx)) {
val name = arrayAccessAsString(base, idx)
Some(IR_VariableAccess(name, evaluableAccesses(name)._2))
} else if (containsArrayVariableAccess(base)) {
val name = arrayVariableAccessAsString(base)
Some(IR_ArrayAccess(IR_VariableAccess(name, base.datatype), idx))
} else {
Logger.error("Error while gathering variables for CUDA kernels")
}
} else {
None
}
}
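A short usage sketch of replaceAccess (hypothetical: a and i are IR_VariableAccess nodes for an array base and a runtime index, and the gather pass above has already run with kernelCount = 0):

// evaluable index: "a2" was recorded in evaluableAccesses,
// so the whole access collapses into the scalar kernel argument
CUDA_GatherVariableAccesses.replaceAccess(a, IR_IntegerConstant(2))
// -> Some(IR_VariableAccess("a2", <resolved base datatype>))

// non-evaluable index: "a_deviceCopy_0" was recorded in nonEvaluableAccesses,
// so the index is kept and only the base is redirected to the device copy
CUDA_GatherVariableAccesses.replaceAccess(a, i)
// -> Some(IR_ArrayAccess(IR_VariableAccess("a_deviceCopy_0", a.datatype), i))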
def isEvaluable(idx : IR_Expression) = {
var ret = true
@@ -53,8 +80,9 @@ object CUDA_GatherVariableAccesses extends QuietDefaultStrategy("Gather local Va
def clear() = {
reductionTarget = None
accesses = mutable.HashMap[String, (IR_Access, IR_Datatype)]()
ignoredMatrixVariableAccesses = mutable.SortedSet[String]()
evaluableAccesses = mutable.HashMap[String, (IR_Access, IR_Datatype)]()
nonEvaluableAccesses = mutable.HashMap[String, (IR_VariableAccess, IR_Datatype)]()
ignoredArrayVariableAccesses = mutable.SortedSet[String]()
ignoredAccesses = mutable.SortedSet[String]()
ignoredAccesses += "std::cout"
ignoredAccesses += "std::cerr"
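The body of isEvaluable is elided in this diff; judging from the newly added imports (IR_SimplifyExpression, EvaluationException), it boils down to attempting compile-time evaluation of the index and treating failure as non-evaluable. A plausible minimal shape, not the actual implementation:

def isEvaluableSketch(idx : IR_Expression) : Boolean =
  try { IR_SimplifyExpression.evalIntegral(idx); true }
  catch { case _ : EvaluationException => false }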
@@ -67,24 +95,35 @@ object CUDA_GatherVariableAccesses extends QuietDefaultStrategy("Gather local Va
decl
case arrAcc @ IR_ArrayAccess(base : IR_VariableAccess, idx, _) if !ignoredAccesses.contains(base.name) =>
ignoredMatrixVariableAccesses += base.name
if (isEvaluable(idx))
accesses.put(arrayAccessAsString(base, idx), (arrAcc, base.datatype.resolveBaseDatatype))
ignoredArrayVariableAccesses += base.name
if (isEvaluable(idx)) {
// single, evaluable array accesses -> count "base[idx]" as variable access
evaluableAccesses.put(arrayAccessAsString(base, idx), (arrAcc, base.datatype.resolveBaseDatatype))
} else {
// we found a non-evaluable index -> remove previous evaluable accesses
evaluableAccesses.foreach {
case (k, _) if k.startsWith(basePrefix(base)) && k.length > basePrefix(base).length => evaluableAccesses.remove(k)
case _ =>
}
// copy "base" to device data and pass device pointer to the kernel -> count as single variable access to "base"
nonEvaluableAccesses.put(arrayVariableAccessAsString(base), (base, base.datatype))
}
// it can happen that no fragmentIdx is accessed in a loop, but the resulting CudaReductionBuffer requires it
if (Knowledge.domain_numFragmentsPerBlock > 1 && isReductionVariableAccess(reductionTarget, arrAcc))
accesses.put(fragIdx.name, (fragIdx, fragIdx.datatype))
evaluableAccesses.put(fragIdx.name, (fragIdx, fragIdx.datatype))
arrAcc
case vAcc : IR_VariableAccess if !ignoredAccesses.contains(vAcc.name) && !ignoredMatrixVariableAccesses.contains(vAcc.name) =>
accesses.put(vAcc.name, (vAcc, vAcc.datatype))
case vAcc : IR_VariableAccess if !ignoredAccesses.contains(vAcc.name) && !ignoredArrayVariableAccesses.contains(vAcc.name) =>
evaluableAccesses.put(vAcc.name, (vAcc, vAcc.datatype))
vAcc
// same phenomenon: fragmentIdx is required by CudaReductionBuffer, but not present in loop body
case expr : IR_Expression if Knowledge.domain_numFragmentsPerBlock > 1 && isReductionTarget(reductionTarget, expr) =>
accesses.put(fragIdx.name, (fragIdx, fragIdx.datatype))
evaluableAccesses.put(fragIdx.name, (fragIdx, fragIdx.datatype))
expr
})
}
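One subtlety in the transformation above: discovering a non-evaluable index demotes the whole base, discarding evaluable accesses of that base gathered earlier. A self-contained mock of that bookkeeping (plain strings instead of IR nodes, names hypothetical):

object DemotionSketch {
  import scala.collection.mutable

  val evaluableAccesses = mutable.HashMap("a2" -> "a[2]", "b3" -> "b[3]")
  val nonEvaluableAccesses = mutable.HashMap[String, String]()

  // mirrors the non-evaluable branch: drop stale entries for this base, register the device copy
  def demote(base : String, kernelCount : Int) : Unit = {
    evaluableAccesses.keys.toList.foreach { k =>
      if (k.startsWith(base) && k.length > base.length) evaluableAccesses.remove(k)
    }
    nonEvaluableAccesses.put(s"${base}_deviceCopy_$kernelCount", base)
  }

  def main(args : Array[String]) : Unit = {
    demote("a", 0)
    println(evaluableAccesses)    // Map(b3 -> b[3]) -- "a2" was discarded
    println(nonEvaluableAccesses) // Map(a_deviceCopy_0 -> a)
  }
}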
@@ -573,8 +573,7 @@ case class CUDA_Kernel(var identifier : String,
callArgs += CUDA_FieldDeviceData(fieldAccess._2.field, Duplicate(fieldAccess._2.slot), Duplicate(fieldAccess._2.fragIdx))
if (Knowledge.cuda_useSharedMemory && fieldForSharedMemory.nonEmpty) {
fieldNames.foreach(field => callArgs += CUDA_FieldDeviceData(fieldForSharedMemory(field).field, Duplicate(fieldForSharedMemory(field).slot))
)
fieldNames.foreach(field => callArgs += CUDA_FieldDeviceData(fieldForSharedMemory(field).field, Duplicate(fieldForSharedMemory(field).slot)))
}
for (bufferAccess <- bufferAccesses) {
@@ -11,17 +11,19 @@ import exastencils.parallelization.api.cuda.CUDA_Util._
object CUDA_ReplaceArrayAccesses extends QuietDefaultStrategy("Replace array accesses in kernel") {
var reductionTarget : Option[IR_Expression] = None
var kernelCount : Int = 0
object CUDA_ReplaceReductionVariableAccesses extends QuietDefaultStrategy("") {
this += Transformation("Replace accesses to reduction vars", {
case arrAcc @ IR_ArrayAccess(base : IR_VariableAccess, idx, _) if CUDA_GatherVariableAccesses.containsArrayAccess(base, idx) =>
val name = CUDA_GatherVariableAccesses.arrayAccessAsString(base, idx)
val acc = CUDA_GatherVariableAccesses.accesses(name)
private val compoundAssignmentOps = List("+=", "-=", "*=", "/=")
object CUDA_ReplaceNonReductionVariableAccesses extends QuietDefaultStrategy("Replace accesses to non-reduction vars") {
/* replace array accesses of non-reduction targets as follows:
- evaluable index -> variable access (kernel argument)
- non-evaluable index -> change base of array access to a device copy (pointer passed as kernel argument) */
this += Transformation("Replace accesses to non-reduction vars", {
case arrAcc @ IR_ArrayAccess(base : IR_VariableAccess, idx, _) if CUDA_GatherVariableAccesses.isReplaceable(base, idx) =>
// replace array access if not the reduction target
if (!isReductionTarget(reductionTarget, arrAcc)) {
IR_VariableAccess(name, acc._2)
CUDA_GatherVariableAccesses.replaceAccess(base, idx).get
} else {
arrAcc
}
@@ -29,40 +31,51 @@ object CUDA_ReplaceArrayAccesses extends QuietDefaultStrategy("Replace array acc
}
this += new Transformation("Searching", {
// handling for assignments containing the reduction variable
case IR_Assignment(dst @ IR_ArrayAccess(base : IR_VariableAccess, idx, _), src, op) if List("+=", "-=", "*=", "/=").contains(op) && isReductionVariableAccess(reductionTarget, dst) =>
/* handling for assignments containing the reduction variable
-> do not replace the reduction variable on the lhs for the sake of another replacement strategy (CUDA_HandleReductions),
where reduction variable accesses are replaced with device reduction buffer accesses
-> only replace for the rhs (reduction target accesses are replaced with kernel arg accesses) */
case assign @ IR_Assignment(dst @ IR_ArrayAccess(base : IR_VariableAccess, idx, _), src, op) if compoundAssignmentOps.contains(op) && isReductionVariableAccess(reductionTarget, dst) =>
// handling for compound assignments
// only replace access for rhs
val name = CUDA_GatherVariableAccesses.arrayAccessAsString(base, idx)
val acc = CUDA_GatherVariableAccesses.accesses(name)
val replacement = IR_VariableAccess(name, acc._2)
IR_Assignment(dst, IR_BinaryOperators.createExpression(op.replace("=", ""), replacement, src))
if (CUDA_GatherVariableAccesses.isReplaceable(base, idx))
IR_Assignment(dst, IR_BinaryOperators.createExpression(op.replace("=", ""), CUDA_GatherVariableAccesses.replaceAccess(base, idx).get, src))
else
assign
case _ @ IR_Assignment(dst @ IR_ArrayAccess(_ : IR_VariableAccess, _, _), src, "=") if isReductionVariableAccess(reductionTarget, dst) =>
// allow reduction variable on rhs to be replaced
// regular assignments: only allow reduction variable on rhs to be replaced
src match {
// replace immediately
case _ @ IR_ArrayAccess(base : IR_VariableAccess, idx, _) if isReductionVariableAccess(reductionTarget, dst) && CUDA_GatherVariableAccesses.containsArrayAccess(base, idx) =>
val name = CUDA_GatherVariableAccesses.arrayAccessAsString(base, idx)
val acc = CUDA_GatherVariableAccesses.accesses(name)
val replacement = IR_VariableAccess(name, acc._2)
IR_Assignment(dst, replacement)
case _ @ IR_ArrayAccess(base : IR_VariableAccess, idx, _) if CUDA_GatherVariableAccesses.isReplaceable(base, idx) =>
IR_Assignment(dst, CUDA_GatherVariableAccesses.replaceAccess(base, idx).get)
// traverse tree and replace
case _ : IR_Expression =>
// allow replacement of reduction variables on rhs
val tmp = Duplicate(reductionTarget.get)
reductionTarget = None
CUDA_ReplaceReductionVariableAccesses.applyStandalone(src)
CUDA_ReplaceNonReductionVariableAccesses.applyStandalone(src)
reductionTarget = Some(tmp)
IR_Assignment(dst, src)
}
/* lhs is not the reduction variable:
-> no special handling needed
-> simply replace accesses on rhs */
case assign @ IR_Assignment(dst, src, _) =>
var tmp : Option[IR_Expression] = None
if (reductionTarget.isDefined)
tmp = Some(Duplicate(reductionTarget.get))
reductionTarget = None
CUDA_ReplaceNonReductionVariableAccesses.applyStandalone(src)
reductionTarget = tmp
assign
// special case: unresolved matrix expressions
case e : IR_MatrixExpression =>
// traverse tree and replace
CUDA_ReplaceReductionVariableAccesses.applyStandalone(e)
CUDA_ReplaceNonReductionVariableAccesses.applyStandalone(e)
e
})
}
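As a worked trace of the compound-assignment case above (hypothetical: reduction target red with evaluable index 2, gathered as kernel argument red2):

// input statement inside the kernel:
//   red[2] += x
// result of the transformation:
//   red[2] = red2 + x
// the lhs stays an array access on purpose, so that CUDA_HandleReductions can later
// redirect it to the device reduction buffer; only the rhs occurrence is replaced,
// and the compound op is unfolded via IR_BinaryOperators.createExpression("+", red2, x)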