package exastencils.parallelization.api.cuda

import scala.collection.mutable
import scala.collection.mutable.ListBuffer

import exastencils.base.ir.IR_ImplicitConversion._
import exastencils.base.ir._
import exastencils.baseExt.ir._
import exastencils.config.Knowledge
import exastencils.core.Duplicate
import exastencils.datastructures.QuietDefaultStrategy
import exastencils.datastructures.Transformation
import exastencils.datastructures.Transformation.OutputType
import exastencils.logger.Logger
import exastencils.parallelization.ir.IR_HasParallelizationInfo

/// CUDA_HandleFragmentLoopsWithReduction
// - for multi-fragment reductions
// - uses fragment-local copies of the reduction variable's initial value
// - otherwise, a value already updated by another fragment would enter the kernel's
//   reduction again and cause over-accumulation
object CUDA_HandleFragmentLoopsWithReduction {

  private var reductionCounters : mutable.HashMap[String, Int] = mutable.HashMap()

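  // returns the current counter for the given reduction target and increments it afterwards;
  // used to give each handled reduction a uniquely named buffer of fragment-local copies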
  def getReductionCounter(targetName : String) = {
    val c = reductionCounters.getOrElseUpdate(targetName, 0)
    reductionCounters(targetName) += 1
    c
  }
}

case class CUDA_HandleFragmentLoopsWithReduction(
    var fragLoop : IR_ScopedStatement with IR_HasParallelizationInfo,
    var reduction : IR_Reduction
) extends IR_Statement with IR_Expandable {

  val iter = IR_LoopOverFragments.defIt

  val red = Duplicate(reduction)
  val redTarget = Duplicate(red.target)
  val reductionDt = CUDA_Util.getReductionDatatype(redTarget)

  val counter = CUDA_HandleFragmentLoopsWithReduction.getReductionCounter(red.targetName)

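  // buffer holding one copy of the reduction target per fragment; scalar targets are stored
  // directly, higher-dimensional targets are linearized into an array of their base datatype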
  val copies = {
    val innerDt = reductionDt match {
      case scalar : IR_ScalarDatatype => scalar
      case hodt : IR_HigherDimensionalDatatype => IR_ArrayDatatype(hodt.resolveBaseDatatype, hodt.getSizeArray.product)
    }
    IR_VariableAccess(red.targetName + "_fragCpy" + counter, IR_ArrayDatatype(innerDt, Knowledge.domain_numFragmentsPerBlock))
  }
  val currCopy = IR_ArrayAccess(copies, iter) // copy belonging to the currently processed fragment

  // replace occurrences with copy
  private object CUDA_ReplaceReductionAccesses extends QuietDefaultStrategy("Replace accesses to reduction targets") {
    var redTarget : IR_Expression = IR_NullExpression
    var replacement : IR_Expression = IR_NullExpression

    this += new Transformation("Replace", {
      case red : IR_Reduction => red // leave the reduction node itself untouched; only plain accesses to the target are replaced
      case expr : IR_Expression if expr == redTarget => Duplicate(replacement)
    }, recursive = false)
  }

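  // emits a call of the form "stdFunc(dst, dst + size, src)", matching the (first, last, value/output)
  // signatures of std::fill and std::copy used below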
  def matrixAssignment(stdFunc : String, dst : IR_Expression, src : IR_Expression, size : Int) =
    IR_ExpressionStatement(IR_FunctionCall(IR_ExternalFunctionReference(stdFunc, IR_UnitDatatype),
      ListBuffer[IR_Expression](Duplicate(dst), Duplicate(dst) + IR_IntegerConstant(size), src)))

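  // resets the reduction target to zero (plain assignment for scalars, std::fill otherwise)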
  def resetReductionTarget() = reductionDt match {
    case _ : IR_ScalarDatatype =>
      IR_Assignment(redTarget, 0)
    case hodt : IR_HigherDimensionalDatatype =>
      matrixAssignment("std::fill", redTarget, 0.0, hodt.getSizeArray.product)
  }

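  // copies the reduction target's current value into the slot of the current fragment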
  def copyReductionTarget() = reductionDt match {
    case _ : IR_ScalarDatatype =>
      IR_Assignment(currCopy, redTarget)
    case hodt : IR_HigherDimensionalDatatype =>
      matrixAssignment("std::copy", redTarget, currCopy, hodt.getSizeArray.product)
  }

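  // declares the copy buffer, seeds each fragment's copy with the reduction target's
  // initial value and resets the target afterwards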
  def initCopies() = {
    val declCopies = IR_VariableDeclaration(copies)
    val initCopies = IR_LoopOverFragments(
      copyReductionTarget()).expandSpecial().inner
    val resetRedTarget = resetReductionTarget() // reset initial value as it is already in the copies

    initCopies.parallelization.noVect = true

    ListBuffer(
      declCopies,
      initCopies,
      resetRedTarget)
  }

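  // appends the combination of the reduction target with the current fragment's copy
  // (element-wise for matrix targets) to the given loop body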
  def finalizeReduction(body : ListBuffer[IR_Statement]) = {
    // finalize reduction
    val assign = reductionDt match {
      case mat : IR_MatrixDatatype =>

        // update reduction target
        val i = IR_VariableAccess("_i", IR_IntegerDatatype)
        val j = IR_VariableAccess("_j", IR_IntegerDatatype)
        val idx = i * mat.sizeN + j
        val dst = IR_ArrayAccess(redTarget, idx)
        val src = IR_ArrayAccess(currCopy, idx)
        IR_ForLoop(IR_VariableDeclaration(i, IR_IntegerConstant(0)), IR_Lower(i, mat.sizeM), IR_PreIncrement(i), ListBuffer[IR_Statement](
          IR_ForLoop(IR_VariableDeclaration(j, 0), IR_Lower(j, mat.sizeN), IR_PreIncrement(j), ListBuffer[IR_Statement](
            IR_Assignment(dst, IR_BinaryOperators.createExpression(red.op, dst, src))))))

      case _ : IR_ScalarDatatype =>
        IR_Assignment(redTarget, IR_BinaryOperators.createExpression(red.op, redTarget, currCopy))
    }

    body :+ assign
  }

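  // redirects accesses to the reduction target within the given body to the current fragment's copy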
  def replaceAccesses(body : ListBuffer[IR_Statement]) = {
    // replace occurrences
    CUDA_ReplaceReductionAccesses.redTarget = Duplicate(redTarget)
    CUDA_ReplaceReductionAccesses.replacement = Duplicate(currCopy)
    CUDA_ReplaceReductionAccesses.applyStandalone(IR_Scope(body))
  }

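  // adapts the given fragment loop: accesses to the reduction target are redirected to the
  // fragment-local copies, the per-fragment finalization is appended to the loop body,
  // and the statements initializing the copies are prepended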
  def addHandling(loop : IR_ForLoop) = {
    replaceAccesses(loop.body)
    loop.body = finalizeReduction(loop.body)
    initCopies() :+ Duplicate(loop)
  }

  def addHandling(loop : IR_LoopOverFragments) = {
    replaceAccesses(loop.body)
    loop.body = finalizeReduction(loop.body)
    initCopies() :+ Duplicate(loop)
  }

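  // expects either an IR_LoopOverFragments or an IR_ForLoop over the fragment iterator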
  override def expand() : OutputType = {
    fragLoop match {
      case loop : IR_LoopOverFragments => addHandling(loop)
      case loop @ IR_ForLoop(IR_VariableDeclaration(_, name, _, _), _, _, _, _) if name == iter.name => addHandling(loop)
      case _ => Logger.error("Invalid argument for \"fragLoop\" passed to CUDA_HandleFragmentLoopsWithReduction")
    }
  }
}