Commit c312aadd authored by Richard Angersbach
Browse files

Merge branch 'devel/fix_cuda_reduction_vect' into 'devel/cuda-enhancements'

# Conflicts:
#   .gitlab-ci.yml
parents f620c1ea a1d0f525
This diff is collapsed.
......@@ -79,6 +79,7 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
case ex : VectorizationException =>
if (DEBUG) {
val msg : String = "[vect] unable to vectorize loop: " + ex.msg + " (line " + ex.getStackTrace()(0).getLineNumber + ')'
Logger.warn(msg)
println(msg) // print directly, logger may be silenced by any surrounding strategy
return List(IR_Comment(msg), node)
}
......@@ -144,6 +145,8 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
private var alignedResidue : Long = -1
private val nameTempl : String = "_vec%02d"
private var reductionVarArrayAccesses : Option[IR_ArrayAccess] = None
// init
pushScope()
......@@ -241,6 +244,14 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
/** Returns the residue (offset modulo vector length) of the aligned accesses in this loop context. */
def getAlignedResidue() : Long = alignedResidue
/** Remembers the array access acting as the reduction target so it can be recognized later during vectorization. */
def setReductionArrayAccess(arrAcc : IR_ArrayAccess) : Unit = reductionVarArrayAccesses = Some(arrAcc)
/** The array access registered as reduction target, if any was set for this loop context. */
def getReductionArrayAccess() : Option[IR_ArrayAccess] = reductionVarArrayAccesses
}
private def containsVarAcc(node : IR_Node, varName : String) : Boolean = {
......@@ -262,8 +273,12 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
val ctx = new LoopCtx(itVar, incr)
var postLoopStmt : IR_Statement = null
if (reduction.isDefined) {
val target = reduction.get.target
val target = Duplicate(reduction.get.target)
val operator = reduction.get.op
target match {
case arrAcc : IR_ArrayAccess => ctx.setReductionArrayAccess(arrAcc)
case _ =>
}
val (vecTmp : String, true) = ctx.getName(target)
val identityElem : IR_Expression =
......@@ -602,6 +617,11 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
private def vectorizeExpr(expr : IR_Expression, ctx : LoopCtx) : IR_Expression = {
expr match {
case arrAcc : IR_ArrayAccess if ctx.getReductionArrayAccess().contains(arrAcc) =>
// vec was already added to ctx and declared
val (vecTmp : String, false) = ctx.getName(expr)
IR_VariableAccess(vecTmp, SIMD_RealDatatype)
// TODO: do not vectorize if base is not aligned?
case IR_ArrayAccess(base, index, alignedBase) =>
val (vecTmp : String, njuTmp : Boolean) = ctx.getName(expr)
......
......@@ -114,7 +114,7 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
loop.getAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION).contains(CUDA_Util.CUDA_BAND_START) =>
// remove the annotation first to guarantee single application of this transformation.
loop.annotate(CUDA_Util.CUDA_LOOP_ANNOTATION)
loop.removeAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION)
val parallelLoops = (x : IR_ForLoop) => {
x.hasAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION) &&
......@@ -140,11 +140,15 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
val kernelCount = kernelFunctions.counterMap.getOrElse(fctNameCollector.getCurrentName, -1) + 1
val reduction = loop.parallelization.reduction
val reduction = Duplicate(loop.parallelization.reduction)
val redTarget = if (reduction.isDefined)
Some(Duplicate(reduction.get.target))
else
None
// local variable for kernels with reductions
val localTarget = if (reduction.isDefined)
Some(IR_VariableAccess(reduction.get.targetName + "_local_" + kernelCount, CUDA_Util.getReductionDatatype(reduction.get.target)))
Some(IR_VariableAccess(reduction.get.targetName + "_local_" + kernelCount, CUDA_Util.getReductionDatatype(redTarget.get)))
else
None
......@@ -152,17 +156,17 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
CUDA_GatherVariableAccesses.clear()
CUDA_GatherVariableAccesses.kernelCount = kernelCount
if (reduction.isDefined)
CUDA_GatherVariableAccesses.reductionTarget = Some(reduction.get.target)
CUDA_GatherVariableAccesses.reductionTarget = redTarget
CUDA_GatherVariableAccesses.applyStandalone(IR_Scope(loop))
// declare and init local reduction target
if (localTarget.isDefined) {
var decl = IR_VariableDeclaration(localTarget.get)
var initLocalTarget = CUDA_Util.getReductionDatatype(reduction.get.target) match {
var initLocalTarget = CUDA_Util.getReductionDatatype(redTarget.get) match {
case _ : IR_ScalarDatatype =>
ListBuffer[IR_Statement](IR_Assignment(localTarget.get, reduction.get.target))
ListBuffer[IR_Statement](IR_Assignment(localTarget.get, redTarget.get))
case mat : IR_MatrixDatatype =>
reduction.get.target match {
redTarget.get match {
case vAcc : IR_VariableAccess =>
IR_GenerateBasicMatrixOperations.loopSetSubmatrixMatPointer(
vAcc, localTarget.get, mat.sizeN, mat.sizeM, mat.sizeN, 0, 0).body
......@@ -218,7 +222,7 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
// replace array accesses with accesses to function arguments
// reduction var is not replaced, but later in IR_HandleReductions
if (reduction.isDefined)
CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = Some(reduction.get.target)
CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = redTarget
else
CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = None
CUDA_ReplaceNonReductionVarArrayAccesses.applyStandalone(IR_Scope(kernelBody))
......
......@@ -38,17 +38,18 @@ case class CUDA_HandleFragmentLoopsWithReduction(
val iter = IR_LoopOverFragments.defIt
val redTarget = reduction.target
val red = Duplicate(reduction)
val redTarget = Duplicate(red.target)
val reductionDt = CUDA_Util.getReductionDatatype(redTarget)
val counter = CUDA_HandleFragmentLoopsWithReduction.getReductionCounter(reduction.targetName)
val counter = CUDA_HandleFragmentLoopsWithReduction.getReductionCounter(red.targetName)
val copies = {
val innerDt = reductionDt match {
case scalar : IR_ScalarDatatype => scalar
case hodt : IR_HigherDimensionalDatatype => IR_ArrayDatatype(hodt.resolveBaseDatatype, hodt.getSizeArray.product)
}
IR_VariableAccess(reduction.targetName + "_" + counter, IR_ArrayDatatype(innerDt, Knowledge.domain_numFragmentsPerBlock))
IR_VariableAccess(red.targetName + "_fragCpy" + counter, IR_ArrayDatatype(innerDt, Knowledge.domain_numFragmentsPerBlock))
}
val currCopy = IR_ArrayAccess(copies, iter)
......@@ -80,11 +81,17 @@ case class CUDA_HandleFragmentLoopsWithReduction(
matrixAssignment("std::copy", redTarget, currCopy, hodt.getSizeArray.product)
}
def initCopies() = ListBuffer(
IR_VariableDeclaration(copies), // declare copies
IR_LoopOverFragments( // init copies
copyReductionTarget()),
resetReductionTarget()) // reset initial value as it is already in the copies
// Builds the statement list that sets up per-fragment copies of the reduction target:
// declare the copy array, fill each fragment's slot with the target's current value,
// then reset the original target (its value now lives in the copies).
def initCopies() = {
  // declaration of the per-fragment copy array (one slot per fragment)
  val declCopies = IR_VariableDeclaration(copies)
  // NOTE(review): expandSpecial().inner splices the expanded fragment-loop body in
  // directly — presumably so surrounding strategies do not re-visit the wrapper
  // node a second time; confirm against IR_LoopOverFragments.expandSpecial.
  val initCopies = IR_LoopOverFragments(
    copyReductionTarget()).expandSpecial().inner
  val resetRedTarget = resetReductionTarget() // reset initial value as it is already in the copies
  // order matters: declare, then copy, then reset
  ListBuffer(
    declCopies,
    initCopies,
    resetRedTarget)
}
def finalizeReduction(body : ListBuffer[IR_Statement]) = {
// finalize reduction
......@@ -99,10 +106,10 @@ case class CUDA_HandleFragmentLoopsWithReduction(
val src = IR_ArrayAccess(currCopy, idx)
IR_ForLoop(IR_VariableDeclaration(i, IR_IntegerConstant(0)), IR_Lower(i, mat.sizeM), IR_PreIncrement(i), ListBuffer[IR_Statement](
IR_ForLoop(IR_VariableDeclaration(j, 0), IR_Lower(j, mat.sizeN), IR_PreIncrement(j), ListBuffer[IR_Statement](
IR_Assignment(dst, IR_BinaryOperators.createExpression(reduction.op, dst, src))))))
IR_Assignment(dst, IR_BinaryOperators.createExpression(red.op, dst, src))))))
case _ : IR_ScalarDatatype =>
IR_Assignment(redTarget, IR_BinaryOperators.createExpression(reduction.op, redTarget, currCopy))
IR_Assignment(redTarget, IR_BinaryOperators.createExpression(red.op, redTarget, currCopy))
}
body :+ assign
......@@ -110,21 +117,21 @@ case class CUDA_HandleFragmentLoopsWithReduction(
def replaceAccesses(body : ListBuffer[IR_Statement]) = {
// replace occurrences
CUDA_ReplaceReductionAccesses.redTarget = redTarget
CUDA_ReplaceReductionAccesses.replacement = currCopy
CUDA_ReplaceReductionAccesses.redTarget = Duplicate(redTarget)
CUDA_ReplaceReductionAccesses.replacement = Duplicate(currCopy)
CUDA_ReplaceReductionAccesses.applyStandalone(IR_Scope(body))
}
def addHandling(loop : IR_ForLoop) = {
replaceAccesses(loop.body)
loop.body = finalizeReduction(loop.body)
initCopies() :+ loop
initCopies() :+ Duplicate(loop)
}
def addHandling(loop : IR_LoopOverFragments) = {
replaceAccesses(loop.body)
loop.body = finalizeReduction(loop.body)
initCopies() :+ loop
initCopies() :+ Duplicate(loop)
}
override def expand() : OutputType = {
......
......@@ -593,12 +593,12 @@ case class CUDA_Kernel(
var body = ListBuffer[IR_Statement]()
if (reduction.isDefined) {
def target = reduction.get.target
def resultDt = CUDA_Util.getReductionDatatype(target)
def baseDt = resultDt.resolveBaseDatatype
val target = Duplicate(reduction.get.target)
val resultDt = CUDA_Util.getReductionDatatype(target)
val baseDt = resultDt.resolveBaseDatatype
def bufSize = requiredThreadsPerDim.product
def bufAccess = CUDA_ReductionDeviceData(bufSize, resultDt)
val bufSize = requiredThreadsPerDim.product
val bufAccess = CUDA_ReductionDeviceData(bufSize, resultDt)
var callArgsReduction = ListBuffer[IR_Expression](bufAccess, bufSize)
body += CUDA_Memset(bufAccess, 0, bufSize, resultDt)
......@@ -621,7 +621,7 @@ case class CUDA_Kernel(
IR_Return(Some(callDefaultReductionKernel))
})
CUDA_KernelFunctions.get.requiredRedKernels += Tuple2(reduction.get.op, target) // request reduction kernel and wrapper
CUDA_KernelFunctions.get.requiredRedKernels += Tuple2(reduction.get.op, Duplicate(target)) // request reduction kernel and wrapper
} else {
body += CUDA_FunctionCall(getKernelFctName, callArgs, numBlocksPerDim, numThreadsPerBlock)
}
......
......@@ -88,8 +88,8 @@ object CUDA_HandleReductions extends DefaultStrategy("Handle reductions in devic
}
// update local target
CUDA_ReplaceReductionAssignments.redTarget = target
CUDA_ReplaceReductionAssignments.replacement = kernel.localReductionTarget.get
CUDA_ReplaceReductionAssignments.redTarget = Duplicate(target)
CUDA_ReplaceReductionAssignments.replacement = Duplicate(kernel.localReductionTarget.get)
CUDA_ReplaceReductionAssignments.applyStandalone(IR_Scope(kernel.body))
// set element in global reduction buffer to local result
......
......@@ -90,10 +90,12 @@ def run_test(generator_path: str, problem_name: str, knowledge_path: str, exa_fi
elif expected_results_path:
result_str = result.stdout.decode('utf-8')
if check_results(result_str, expected_results_path) is True:
print(f"Test for problem \"{problem_name}\" finished successfully.")
return result.returncode
else:
return -1
else:
print(f"Test for problem \"{problem_name}\" finished successfully.")
return result.returncode
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment