Commit 7a759c1f authored by Richard Angersbach
Browse files

Use deep copies of reduction targets for safety.

parent 46a3df8e
......@@ -262,7 +262,7 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
val ctx = new LoopCtx(itVar, incr)
var postLoopStmt : IR_Statement = null
if (reduction.isDefined) {
val target = reduction.get.target
val target = Duplicate(reduction.get.target)
val operator = reduction.get.op
val (vecTmp : String, true) = ctx.getName(target)
......
......@@ -140,11 +140,15 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
val kernelCount = kernelFunctions.counterMap.getOrElse(fctNameCollector.getCurrentName, -1) + 1
val reduction = loop.parallelization.reduction
val reduction = Duplicate(loop.parallelization.reduction)
val redTarget = if (reduction.isDefined)
Some(Duplicate(reduction.get.target))
else
None
// local variable for kernels with reductions
val localTarget = if (reduction.isDefined)
Some(IR_VariableAccess(reduction.get.targetName + "_local_" + kernelCount, CUDA_Util.getReductionDatatype(reduction.get.target)))
Some(IR_VariableAccess(reduction.get.targetName + "_local_" + kernelCount, CUDA_Util.getReductionDatatype(redTarget.get)))
else
None
......@@ -152,17 +156,17 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
CUDA_GatherVariableAccesses.clear()
CUDA_GatherVariableAccesses.kernelCount = kernelCount
if (reduction.isDefined)
CUDA_GatherVariableAccesses.reductionTarget = Some(reduction.get.target)
CUDA_GatherVariableAccesses.reductionTarget = redTarget
CUDA_GatherVariableAccesses.applyStandalone(IR_Scope(loop))
// declare and init local reduction target
if (localTarget.isDefined) {
var decl = IR_VariableDeclaration(localTarget.get)
var initLocalTarget = CUDA_Util.getReductionDatatype(reduction.get.target) match {
var initLocalTarget = CUDA_Util.getReductionDatatype(redTarget.get) match {
case _ : IR_ScalarDatatype =>
ListBuffer[IR_Statement](IR_Assignment(localTarget.get, reduction.get.target))
ListBuffer[IR_Statement](IR_Assignment(localTarget.get, redTarget.get))
case mat : IR_MatrixDatatype =>
reduction.get.target match {
redTarget.get match {
case vAcc : IR_VariableAccess =>
IR_GenerateBasicMatrixOperations.loopSetSubmatrixMatPointer(
vAcc, localTarget.get, mat.sizeN, mat.sizeM, mat.sizeN, 0, 0).body
......@@ -218,7 +222,7 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
// replace array accesses with accesses to function arguments
// reduction var is not replaced, but later in IR_HandleReductions
if (reduction.isDefined)
CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = Some(reduction.get.target)
CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = redTarget
else
CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = None
CUDA_ReplaceNonReductionVarArrayAccesses.applyStandalone(IR_Scope(kernelBody))
......
......@@ -38,17 +38,18 @@ case class CUDA_HandleFragmentLoopsWithReduction(
val iter = IR_LoopOverFragments.defIt
val redTarget = Duplicate(reduction.target)
val red = Duplicate(reduction)
val redTarget = Duplicate(red.target)
val reductionDt = CUDA_Util.getReductionDatatype(redTarget)
val counter = CUDA_HandleFragmentLoopsWithReduction.getReductionCounter(reduction.targetName)
val counter = CUDA_HandleFragmentLoopsWithReduction.getReductionCounter(red.targetName)
val copies = {
val innerDt = reductionDt match {
case scalar : IR_ScalarDatatype => scalar
case hodt : IR_HigherDimensionalDatatype => IR_ArrayDatatype(hodt.resolveBaseDatatype, hodt.getSizeArray.product)
}
IR_VariableAccess(reduction.targetName + "_" + counter, IR_ArrayDatatype(innerDt, Knowledge.domain_numFragmentsPerBlock))
IR_VariableAccess(red.targetName + "_fragCpy" + counter, IR_ArrayDatatype(innerDt, Knowledge.domain_numFragmentsPerBlock))
}
val currCopy = IR_ArrayAccess(copies, iter)
......@@ -107,10 +108,10 @@ case class CUDA_HandleFragmentLoopsWithReduction(
val src = IR_ArrayAccess(currCopy, idx)
IR_ForLoop(IR_VariableDeclaration(i, IR_IntegerConstant(0)), IR_Lower(i, mat.sizeM), IR_PreIncrement(i), ListBuffer[IR_Statement](
IR_ForLoop(IR_VariableDeclaration(j, 0), IR_Lower(j, mat.sizeN), IR_PreIncrement(j), ListBuffer[IR_Statement](
IR_Assignment(dst, IR_BinaryOperators.createExpression(reduction.op, dst, src))))))
IR_Assignment(dst, IR_BinaryOperators.createExpression(red.op, dst, src))))))
case _ : IR_ScalarDatatype =>
IR_Assignment(redTarget, IR_BinaryOperators.createExpression(reduction.op, redTarget, currCopy))
IR_Assignment(redTarget, IR_BinaryOperators.createExpression(red.op, redTarget, currCopy))
}
body :+ assign
......@@ -118,21 +119,21 @@ case class CUDA_HandleFragmentLoopsWithReduction(
def replaceAccesses(body : ListBuffer[IR_Statement]) = {
// replace occurrences
CUDA_ReplaceReductionAccesses.redTarget = redTarget
CUDA_ReplaceReductionAccesses.replacement = currCopy
CUDA_ReplaceReductionAccesses.redTarget = Duplicate(redTarget)
CUDA_ReplaceReductionAccesses.replacement = Duplicate(currCopy)
CUDA_ReplaceReductionAccesses.applyStandalone(IR_Scope(body))
}
def addHandling(loop : IR_ForLoop) = {
replaceAccesses(loop.body)
loop.body = finalizeReduction(loop.body)
initCopies() :+ loop
initCopies() :+ Duplicate(loop)
}
def addHandling(loop : IR_LoopOverFragments) = {
replaceAccesses(loop.body)
loop.body = finalizeReduction(loop.body)
initCopies() :+ loop
initCopies() :+ Duplicate(loop)
}
override def expand() : OutputType = {
......
......@@ -593,12 +593,12 @@ case class CUDA_Kernel(
var body = ListBuffer[IR_Statement]()
if (reduction.isDefined) {
def target = reduction.get.target
def resultDt = CUDA_Util.getReductionDatatype(target)
def baseDt = resultDt.resolveBaseDatatype
val target = Duplicate(reduction.get.target)
val resultDt = CUDA_Util.getReductionDatatype(target)
val baseDt = resultDt.resolveBaseDatatype
def bufSize = requiredThreadsPerDim.product
def bufAccess = CUDA_ReductionDeviceData(bufSize, resultDt)
val bufSize = requiredThreadsPerDim.product
val bufAccess = CUDA_ReductionDeviceData(bufSize, resultDt)
var callArgsReduction = ListBuffer[IR_Expression](bufAccess, bufSize)
body += CUDA_Memset(bufAccess, 0, bufSize, resultDt)
......@@ -621,7 +621,7 @@ case class CUDA_Kernel(
IR_Return(Some(callDefaultReductionKernel))
})
CUDA_KernelFunctions.get.requiredRedKernels += Tuple2(reduction.get.op, target) // request reduction kernel and wrapper
CUDA_KernelFunctions.get.requiredRedKernels += Tuple2(reduction.get.op, Duplicate(target)) // request reduction kernel and wrapper
} else {
body += CUDA_FunctionCall(getKernelFctName, callArgs, numBlocksPerDim, numThreadsPerBlock)
}
......
......@@ -88,8 +88,8 @@ object CUDA_HandleReductions extends DefaultStrategy("Handle reductions in devic
}
// update local target
CUDA_ReplaceReductionAssignments.redTarget = target
CUDA_ReplaceReductionAssignments.replacement = kernel.localReductionTarget.get
CUDA_ReplaceReductionAssignments.redTarget = Duplicate(target)
CUDA_ReplaceReductionAssignments.replacement = Duplicate(kernel.localReductionTarget.get)
CUDA_ReplaceReductionAssignments.applyStandalone(IR_Scope(kernel.body))
// set element in global reduction buffer to local result
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment