Commit c2c43e32 authored by Richard Angersbach

Merge remote-tracking branch 'origin/master' into devel/matrix_init_fix

parents d9d5039b e5f5e5c9
......@@ -52,7 +52,9 @@ import exastencils.stencil.ir._
import exastencils.timing.ir._
import exastencils.util._
import exastencils.util.ir._
- import exastencils.visualization.ir._
+ import exastencils.visualization.ir.cimg.IR_ResolveCImgFunctions
+ import exastencils.visualization.ir.visit.IR_SetupVisit
+ import exastencils.visualization.ir.vtk.IR_ResolveVtkPrinters
/// IR_LayerHandler
......@@ -169,6 +171,9 @@ object IR_DefaultLayerHandler extends IR_LayerHandler {
// simplify indices modified just now, otherwise equality checks will not work later on
IR_GeneralSimplify.apply()
+ if (Knowledge.visit_enable)
+   IR_SetupVisit.apply()
var convChanged = false
do {
IR_FindStencilConvolutions.changed = false
......@@ -185,9 +190,6 @@ object IR_DefaultLayerHandler extends IR_LayerHandler {
IR_ResolveStencilFunction.apply()
- if (Knowledge.experimental_visit_enable)
-   IR_SetupVisit.apply()
// resolve new virtual field accesses
IR_ResolveIntegrateOnGrid.apply()
IR_ResolveEvaluateOnGrid.apply()
......
......@@ -21,7 +21,8 @@ package exastencils.applications.ir
import scala.collection.mutable.ListBuffer
import exastencils.base.ir.IR_ImplicitConversion._
- import exastencils.base.ir.{ IR_Native, _ }
+ import exastencils.base.ir.IR_Native
+ import exastencils.base.ir._
import exastencils.config.Knowledge
import exastencils.datastructures._
import exastencils.logger.Logger
......
......@@ -29,7 +29,7 @@ import exastencils.domain.ir.IR_IV_IsValidForDomain
import exastencils.field.ir._
import exastencils.parallelization.api.mpi._
import exastencils.util.ir.IR_Print
- import exastencils.visualization.ir.IR_PrintVtkQuads
+ import exastencils.visualization.ir.vtk.IR_PrintVtkQuads
/// IR_PrintVtkNNF
......
......@@ -29,7 +29,7 @@ import exastencils.domain.ir.IR_IV_IsValidForDomain
import exastencils.field.ir._
import exastencils.parallelization.api.mpi._
import exastencils.util.ir.IR_Print
- import exastencils.visualization.ir.IR_PrintVtkQuads
+ import exastencils.visualization.ir.vtk.IR_PrintVtkQuads
/// IR_PrintVtkNS
......
......@@ -32,7 +32,7 @@ import exastencils.grid.ir.IR_AtNode
import exastencils.logger.Logger
import exastencils.parallelization.api.mpi._
import exastencils.util.ir.IR_Print
- import exastencils.visualization.ir.IR_PrintVtkTriangles
+ import exastencils.visualization.ir.vtk.IR_PrintVtkTriangles
/// IR_PrintVtkSWE
......
......@@ -311,6 +311,20 @@ object IR_GenerateBasicMatrixOperations {
stmts
}
// compound-assign ('op') the n_rows x n_cols submatrix read from position 'offset_r', 'offset_c' in 'source' (with row stride 'sourcesize') onto 'dest'
def loopCompoundAssignSubmatrixPointer(source : IR_Expression, sourcesize : IR_Expression, dest : IR_Expression, offset_r : IR_Expression, offset_c : IR_Expression, n_rows : IR_Expression, n_cols : IR_Expression, op : String) : IR_Scope = {
var stmts = IR_Scope(Nil)
var i = IR_VariableAccess("i", IR_IntegerDatatype)
var j = IR_VariableAccess("j", IR_IntegerDatatype)
stmts.body += IR_ForLoop(IR_VariableDeclaration(i, offset_r), IR_Lower(i, n_rows + offset_r), IR_PreIncrement(i), ListBuffer[IR_Statement](
IR_ForLoop(IR_VariableDeclaration(j, offset_c), IR_Lower(j, offset_c + n_cols), IR_PreIncrement(j), ListBuffer[IR_Statement](
IR_Assignment(IR_ArrayAccess(dest, (i - offset_r) * n_cols + j - offset_c),
IR_BinaryOperators.createExpression(op, IR_ArrayAccess(dest, (i - offset_r) * n_cols + j - offset_c), IR_ArrayAccess(source, i * sourcesize + j)))
))
))
stmts
}
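
For orientation, a minimal sketch of the C++ loop nest this helper generates, assuming op == "+" and a double base type (function and parameter names here are illustrative, not generated identifiers):

    // compound-assigns the n_rows x n_cols submatrix of 'source' starting at
    // (offset_r, offset_c) onto the compact result buffer 'dest'
    void compoundAssignSubmatrix(const double* source, int sourcesize, double* dest,
                                 int offset_r, int offset_c, int n_rows, int n_cols) {
      for (int i = offset_r; i < n_rows + offset_r; ++i)
        for (int j = offset_c; j < offset_c + n_cols; ++j)
          // 'dest' is indexed compactly; 'source' with its full row stride 'sourcesize'
          dest[(i - offset_r) * n_cols + (j - offset_c)] += source[i * sourcesize + j];
    }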
// write a submatrix 'source' of rows_source x cols_source to 'destination' at position 'offset_r', 'offset_c'
def loopSetSubmatrixMat(source : IR_Expression, destination : IR_Expression, rows_source : IR_Expression, cols_source : IR_Expression, offset_r : IR_Expression, offset_c : IR_Expression) : IR_Scope = {
if (!isScalar(offset_r) || !isScalar(offset_c))
......
......@@ -668,10 +668,10 @@ object Knowledge {
var experimental_grid_randomMaxOffset : Double = 0.1
/// student project - Richard / visit
+ // in-situ visualization with VisIt
+ var visit_enable : Boolean = false
var experimental_visit_addCurveMesh : Boolean = false
- // TODO
- var experimental_visit_enable : Boolean = false
/// === constraints and resolutions ===
def update() : Unit = {
......
......@@ -284,6 +284,13 @@ object Platform {
targetCudaCompiler match {
case "NVCC" =>
flags += s" -std=c++11 -O3 -DNDEBUG -lineinfo -arch=sm_${ Platform.hw_cuda_capability }${ Platform.hw_cuda_capabilityMinor }"
// otherwise mpi.h, included via Globals/Globals.h, is not found when compiling with nvcc
if (Knowledge.mpi_enabled) {
val mpiWrapperFlags = s"$$(shell $resolveCompiler --showme:compile | sed 's/-pthread//g')"
if (!Settings.makefile_additionalCudaFlags.contains(mpiWrapperFlags))
Settings.makefile_additionalCudaFlags += mpiWrapperFlags
}
}
flags
......
......@@ -157,13 +157,14 @@ object IR_AddInternalVariables extends DefaultStrategy("Add internal variables")
case buf : CUDA_ReductionDeviceData =>
val id = buf.resolveAccess(buf.resolveName(), IR_LoopOverFragments.defIt, IR_NullExpression, IR_NullExpression, IR_NullExpression, IR_NullExpression).prettyprint
+ val totalSize : IR_Expression = buf.numPoints * buf.targetDt.getSizeArray.product
if (Knowledge.data_genVariableFieldSizes) {
if (deviceBufferSizes.contains(id))
- deviceBufferSizes(id).asInstanceOf[IR_Maximum].args += Duplicate(buf.size)
+ deviceBufferSizes(id).asInstanceOf[IR_Maximum].args += Duplicate(totalSize)
else
- deviceBufferSizes += (id -> IR_Maximum(ListBuffer(Duplicate(buf.size))))
+ deviceBufferSizes += (id -> IR_Maximum(ListBuffer(Duplicate(totalSize))))
} else {
- val size = IR_SimplifyExpression.evalIntegral(buf.size)
+ val size = IR_SimplifyExpression.evalIntegral(totalSize)
deviceBufferSizes += (id -> (size max deviceBufferSizes.getOrElse(id, IR_IntegerConstant(0)).asInstanceOf[IR_IntegerConstant].v))
}
buf
......@@ -214,7 +215,7 @@ object IR_AddInternalVariables extends DefaultStrategy("Add internal variables")
val id = buf.resolveAccess(buf.resolveName(), IR_LoopOverFragments.defIt, IR_NullExpression, buf.field.index, buf.field.level, buf.neighIdx).prettyprint
val size = deviceBufferSizes(id)
- deviceBufferAllocs += (id -> IR_LoopOverFragments(CUDA_Allocate(buf, size, IR_RealDatatype /*FIXME*/), IR_ParallelizationInfo(potentiallyParallel = true)))
+ deviceBufferAllocs += (id -> IR_LoopOverFragments(CUDA_Allocate(buf, size, buf.field.resolveBaseDatatype), IR_ParallelizationInfo(potentiallyParallel = true)))
buf
......@@ -222,7 +223,7 @@ object IR_AddInternalVariables extends DefaultStrategy("Add internal variables")
val id = buf.resolveAccess(buf.resolveName(), IR_LoopOverFragments.defIt, IR_NullExpression, IR_NullExpression, IR_NullExpression, IR_NullExpression).prettyprint
val size = deviceBufferSizes(id)
- deviceBufferAllocs += (id -> IR_LoopOverFragments(CUDA_Allocate(buf, size, IR_RealDatatype /*FIXME*/), IR_ParallelizationInfo(potentiallyParallel = true)))
+ deviceBufferAllocs += (id -> IR_LoopOverFragments(CUDA_Allocate(buf, size, buf.baseDt), IR_ParallelizationInfo(potentiallyParallel = true)))
buf
......
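
The intent of these two fixes, sketched as host-side allocation code for a hypothetical reduction producing a 3x3 matrix of doubles per point (sizes and names are illustrative): the device buffer must hold numPoints times the number of entries of the target datatype, and it must be typed after the target's base datatype instead of a hard-coded real type.

    #include <cuda_runtime.h>

    // illustrative sizing of a reduction device buffer for a 3x3 matrix target
    double* allocReductionBuffer(size_t numPoints) {
      const size_t totalSize = numPoints * 3 * 3; // numPoints * product of target dimensions
      double* buf = nullptr;
      cudaMalloc((void**) &buf, totalSize * sizeof(double)); // base datatype, not a fixed real type
      return buf;
    }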
......@@ -205,6 +205,10 @@ object IR_SimplifyExpression {
res = new mutable.HashMap[IR_Expression, Long]()
res(m) = 1L
+ case m : IR_MemberFunctionCall =>
+ res = new mutable.HashMap[IR_Expression, Long]()
+ res(m) = 1L
case IR_StringLiteral(varName) =>
res = new HashMap[IR_Expression, Long]()
res(IR_VariableAccess(varName, IR_IntegerDatatype)) = 1L // ONLY VariableAccess in res keys, NO StringLiteral
......
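
With the new case, a member function call is treated as an atomic term with coefficient 1 when a linear combination is extracted, analogous to the preceding case. For a hypothetical expression 2*i + vec.size(), the extraction now yields the map { i -> 2, vec.size() -> 1 } instead of failing with an EvaluationException.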
......@@ -21,13 +21,20 @@ package exastencils.parallelization.api.cuda
import scala.annotation.tailrec
import scala.collection.mutable
import scala.collection.mutable.ListBuffer
import exastencils.base.ir.IR_ImplicitConversion._
import exastencils.base.ir._
import exastencils.baseExt.ir.IR_MatOperations.IR_GenerateBasicMatrixOperations
import exastencils.baseExt.ir._
import exastencils.config.Knowledge
import exastencils.core._
import exastencils.datastructures._
import exastencils.logger.Logger
import exastencils.optimization.ir.IR_SimplifyExpression
import exastencils.parallelization.ir.IR_HasParallelizationInfo
import exastencils.solver.ir.IR_InlineMatSolveStmts
import exastencils.util.ir.IR_FctNameCollector
import exastencils.util.ir.IR_StackCollector
/// CUDA_ExtractHostAndDeviceCode
......@@ -35,10 +42,14 @@ import exastencils.util.ir.IR_FctNameCollector
* This transformation is used to convert annotated code into CUDA kernel code.
*/
object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotated CUDA loop in kernel code") {
- val collector = new IR_FctNameCollector
- this.register(collector)
+ val fctNameCollector = new IR_FctNameCollector
+ val stackCollector = new IR_StackCollector
+ this.register(fctNameCollector)
+ this.register(stackCollector)
+ this.onBefore = () => this.resetCollectors()
var enclosingFragmentLoops : mutable.HashMap[IR_ScopedStatement with IR_HasParallelizationInfo, IR_Reduction] = mutable.HashMap()
/**
* Collect all loops in the band.
*
......@@ -73,6 +84,31 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
}
}
this += Transformation("Find reductions with enclosing fragment loops", {
case loop : IR_ForLoop if loop.hasAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION) &&
loop.getAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION).contains(CUDA_Util.CUDA_BAND_START) =>
val enclosing = stackCollector.stack.collectFirst {
case fragLoop : IR_LoopOverFragments => fragLoop
case fragLoop @ IR_ForLoop(IR_VariableDeclaration(_, name, _, _), _, _, _, _) if name == IR_LoopOverFragments.defIt.name => fragLoop
}
val fragLoopIsSerial = !Knowledge.omp_enabled || (Knowledge.omp_enabled && !Knowledge.omp_parallelizeLoopOverFragments)
if (enclosing.isDefined && fragLoopIsSerial && loop.parallelization.reduction.isDefined)
enclosingFragmentLoops += (enclosing.get -> loop.parallelization.reduction.get)
loop
}, false)
// enclosed by a fragment loop -> create fragment-local copies of the initial value
// and perform reduction after frag loop
this += Transformation("Modify enclosing fragment loops", {
case fragLoop : IR_LoopOverFragments if enclosingFragmentLoops.contains(fragLoop) =>
CUDA_HandleFragmentLoopsWithReduction(fragLoop, enclosingFragmentLoops(fragLoop))
case fragLoop @ IR_ForLoop(IR_VariableDeclaration(_, name, _, _), _, _, _, _) if enclosingFragmentLoops.contains(fragLoop) && name == IR_LoopOverFragments.defIt.name =>
CUDA_HandleFragmentLoopsWithReduction(fragLoop, enclosingFragmentLoops(fragLoop))
}, false)
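
In spirit, the rewrite performed by CUDA_HandleFragmentLoopsWithReduction corresponds to host code of this shape (a rough sketch; launchReductionKernel, the double type, and the "+" operator are illustrative assumptions): each fragment reduces into its own copy of the initial value, and the copies are folded into the target only after the fragment loop, so one fragment's kernel cannot clobber another's partial result.

    #include <vector>

    double launchReductionKernel(int fragmentIdx, double init); // hypothetical per-fragment kernel wrapper

    double reduceOverFragments(int nFragments, double initialValue) {
      // fragment-local copies of the initial value ...
      std::vector<double> partial(nFragments, initialValue);
      for (int f = 0; f < nFragments; ++f)
        partial[f] = launchReductionKernel(f, partial[f]);
      // ... combined only after the fragment loop (exact seeding elided here)
      double target = partial[0];
      for (int f = 1; f < nFragments; ++f)
        target += partial[f];
      return target;
    }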
this += new Transformation("Processing ForLoopStatement nodes", {
case loop : IR_ForLoop if loop.hasAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION) &&
loop.getAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION).contains(CUDA_Util.CUDA_BAND_START) =>
......@@ -102,10 +138,74 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
// add kernel and kernel call
val kernelFunctions = CUDA_KernelFunctions.get
- // collect local variable accesses because these variables need to be passed to the kernel at call
- CUDA_GatherVariableAccess.clear()
- CUDA_GatherVariableAccess.applyStandalone(IR_Scope(loop))
- val variableAccesses = CUDA_GatherVariableAccess.accesses.toSeq.sortBy(_._1).map(_._2).to[ListBuffer]
+ val kernelCount = kernelFunctions.counterMap.getOrElse(fctNameCollector.getCurrentName, -1) + 1
val reduction = loop.parallelization.reduction
// local variable for kernels with reductions
val localTarget = if (reduction.isDefined)
Some(IR_VariableAccess(reduction.get.targetName + "_local_" + kernelCount, CUDA_Util.getReductionDatatype(reduction.get.target)))
else
None
// collect local accesses because their variables need to be passed to the kernel when calling
CUDA_GatherVariableAccesses.clear()
CUDA_GatherVariableAccesses.kernelCount = kernelCount
if (reduction.isDefined)
CUDA_GatherVariableAccesses.reductionTarget = Some(reduction.get.target)
CUDA_GatherVariableAccesses.applyStandalone(IR_Scope(loop))
// declare and init local reduction target
if (localTarget.isDefined) {
var decl = IR_VariableDeclaration(localTarget.get)
var initLocalTarget = CUDA_Util.getReductionDatatype(reduction.get.target) match {
case _ : IR_ScalarDatatype =>
ListBuffer[IR_Statement](IR_Assignment(localTarget.get, reduction.get.target))
case mat : IR_MatrixDatatype =>
reduction.get.target match {
case vAcc : IR_VariableAccess =>
IR_GenerateBasicMatrixOperations.loopSetSubmatrixMatPointer(
vAcc, localTarget.get, mat.sizeN, mat.sizeM, mat.sizeN, 0, 0).body
case expr =>
Logger.error("Cannot set submatrix for expression: " + expr)
}
}
// also detect accesses coming from the init of the local target
CUDA_GatherVariableAccesses.applyStandalone(IR_Scope(decl))
CUDA_GatherVariableAccesses.applyStandalone(IR_Scope(initLocalTarget))
// replace array accesses with accesses to function arguments
CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = None // actually allow reduction var to be replaced here
CUDA_ReplaceNonReductionVarArrayAccesses.applyStandalone(IR_Scope(decl))
CUDA_ReplaceNonReductionVarArrayAccesses.applyStandalone(IR_Scope(initLocalTarget))
kernelBody.prepend(initLocalTarget : _*)
kernelBody.prepend(decl)
}
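
The preamble assembled here corresponds to kernel code along these lines (a sketch with hypothetical names; the matrix case uses the element-wise copy loop emitted by IR_GenerateBasicMatrixOperations):

    // scalar case:  double s_local_0 = s;
    // matrix case (3x3), with 's' the reduction target passed to the kernel:
    __device__ void preambleSketch(const double* s) {
      double s_local_0[9]; // declared local target, named <target>_local_<kernelCount>
      for (int i = 0; i < 3; ++i)
        for (int j = 0; j < 3; ++j)
          s_local_0[i * 3 + j] = s[i * 3 + j];
    }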
// access collections
val accesses = CUDA_GatherVariableAccesses.evaluableAccesses.toSeq.sortBy(_._1).to[ListBuffer]
val accessesCopiedToDevice = CUDA_GatherVariableAccesses.nonEvaluableAccesses.toSeq.sortBy(_._1).to[ListBuffer]
// add non-evaluable accesses in form of pointers to device copies
val deviceArrayCopies = accessesCopiedToDevice.map {
case (k, v) =>
val copyName = CUDA_GatherVariableAccesses.arrayVariableAccessAsString(v._1)
val copyDt = IR_PointerDatatype(v._2.resolveBaseDatatype)
(k, IR_VariableAccess(copyName, copyDt))
}.toMap
// parameters of the kernel
val params = ListBuffer[IR_FunctionArgument]()
params ++= accesses.map { case (name, tup) => IR_FunctionArgument(name, tup._2) }
params ++= deviceArrayCopies.values.map(IR_FunctionArgument(_))
// args passed to kernel
val args = ListBuffer[IR_Expression]()
args ++= accesses.map { case (_, tup) => tup._1 : IR_Expression }
args ++= deviceArrayCopies.values
var extremaMap = mutable.HashMap[String, (Long, Long)]()
......@@ -113,32 +213,78 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
extremaMap = m.asInstanceOf[mutable.HashMap[String, (Long, Long)]]
// inline contained calls to solve functions to avoid separate compilation units
- IR_InlineMatSolveStmts.applyStandalone(kernelBody)
+ IR_InlineMatSolveStmts.applyStandalone(IR_Scope(kernelBody))
// replace array accesses with accesses to function arguments
// reduction var is not replaced, but later in IR_HandleReductions
if (reduction.isDefined)
CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = Some(reduction.get.target)
else
CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = None
CUDA_ReplaceNonReductionVarArrayAccesses.applyStandalone(IR_Scope(kernelBody))
val kernel = CUDA_Kernel(
- kernelFunctions.getIdentifier(collector.getCurrentName),
+ kernelCount,
+ kernelFunctions.getIdentifier(fctNameCollector.getCurrentName),
parallelInnerLoops.length,
- variableAccesses.map(s => IR_FunctionArgument(s.name, s.datatype)),
+ params,
Duplicate(loopVariables),
Duplicate(lowerBounds),
Duplicate(upperBounds),
Duplicate(stepSize),
Duplicate(kernelBody),
- Duplicate(loop.parallelization.reduction),
+ Duplicate(reduction),
+ Duplicate(localTarget),
Duplicate(extremaMap))
kernelFunctions.addKernel(Duplicate(kernel))
// copy array variables from host to device if necessary
if (deviceArrayCopies.nonEmpty) {
deviceArrayCopies foreach { case (k, dstArr) =>
val (srcArr, srcDt) = accessesCopiedToDevice.find(_._1 == k).get._2
deviceStatements += IR_VariableDeclaration(dstArr)
deviceStatements += CUDA_Allocate(dstArr, srcDt.getSizeArray.product, srcDt.resolveBaseDatatype)
deviceStatements += CUDA_Memcpy(dstArr, srcArr, srcDt.typicalByteSize, "cudaMemcpyHostToDevice")
}
}
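
Concretely, for one captured array v of N doubles (v and N are illustrative; the _deviceCopy_<kernelCount> suffix follows CUDA_GatherVariableAccesses below), the emitted sequence corresponds to:

    #include <cuda_runtime.h>

    void mirrorArrayToDevice(const double* v, size_t N, double** v_deviceCopy_0) {
      // declaration + allocation of the device copy, then host-to-device transfer
      cudaMalloc((void**) v_deviceCopy_0, N * sizeof(double));
      cudaMemcpy(*v_deviceCopy_0, v, N * sizeof(double), cudaMemcpyHostToDevice);
      // the kernel wrapper is later called with *v_deviceCopy_0 in place of v,
      // and the copy is released with cudaFree ("destroy device copies" below)
    }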
// process return value of kernel wrapper call if reduction is required
- if (loop.parallelization.reduction.isDefined) {
- val red = loop.parallelization.reduction.get
- deviceStatements += IR_Assignment(red.target,
- IR_BinaryOperators.createExpression(red.op, red.target,
- IR_FunctionCall(kernel.getWrapperFctName, variableAccesses.map(_.asInstanceOf[IR_Expression]))))
+ val callKernel = IR_FunctionCall(kernel.getWrapperFctName, args)
+ if (reduction.isDefined) {
val red = Duplicate(reduction.get)
val redTarget = Duplicate(red.target)
val reductionDt = CUDA_Util.getReductionDatatype(redTarget)
reductionDt match {
case mat : IR_MatrixDatatype =>
val baseDt = mat.resolveBaseDatatype
// declare and allocate tmp buffer for matrix reduction
val reductionTmp = IR_VariableAccess("reductionTmpMatrix", IR_PointerDatatype(baseDt))
deviceStatements += IR_VariableDeclaration(reductionTmp)
deviceStatements += IR_ArrayAllocation(reductionTmp, baseDt, mat.sizeN * mat.sizeM)
// call kernel and pass allocated tmp buffer by pointer
callKernel.arguments += reductionTmp
deviceStatements += callKernel
// update reduction target
deviceStatements += IR_GenerateBasicMatrixOperations.loopCompoundAssignSubmatrixPointer(
reductionTmp, mat.sizeN, red.target, 0, 0, mat.sizeM, mat.sizeN, red.op)
// free allocated buffer
deviceStatements += IR_ArrayFree(reductionTmp)
case _ : IR_ScalarDatatype =>
deviceStatements += IR_Assignment(red.target, IR_BinaryOperators.createExpression(red.op, red.target, callKernel))
}
} else {
- deviceStatements += IR_FunctionCall(kernel.getWrapperFctName, variableAccesses.map(_.asInstanceOf[IR_Expression]))
+ deviceStatements += callKernel
}
// destroy device copies
if (deviceArrayCopies.nonEmpty)
deviceStatements ++= deviceArrayCopies.keys.map(CUDA_Free(_))
deviceStatements
}, false)
}
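
Taken together, the matrix branch above corresponds to generated host code of roughly this shape for a 3x3 double matrix and op "+" (myKernelWrapper and target are placeholders for the generated wrapper and the reduction target):

    void myKernelWrapper(double* reductionTmp); // hypothetical generated wrapper

    void matrixReductionSketch(double* target) {
      // declare and allocate the temporary buffer for the matrix reduction
      double* reductionTmpMatrix = new double[3 * 3];
      // the wrapper receives the buffer as an additional argument and writes its result there
      myKernelWrapper(reductionTmpMatrix);
      // fold the buffer into the reduction target (loopCompoundAssignSubmatrixPointer)
      for (int i = 0; i < 3; ++i)
        for (int j = 0; j < 3; ++j)
          target[i * 3 + j] += reductionTmpMatrix[i * 3 + j];
      delete[] reductionTmpMatrix; // free the allocated buffer (IR_ArrayFree)
    }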
//=============================================================================
//
// This file is part of the ExaStencils code generation framework. ExaStencils
// is free software: you can redistribute it and/or modify it under the terms
// of the GNU General Public License as published by the Free Software
// Foundation, either version 3 of the License, or (at your option) any later
// version.
//
// ExaStencils is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along
// with ExaStencils. If not, see <http://www.gnu.org/licenses/>.
//
//=============================================================================
package exastencils.parallelization.api.cuda
import scala.collection.mutable
import exastencils.base.ir._
import exastencils.datastructures._
object CUDA_GatherVariableAccess extends QuietDefaultStrategy("Gather local VariableAccess nodes") {
var accesses = mutable.HashMap[String, IR_VariableAccess]()
var ignoredAccesses = mutable.SortedSet[String]()
def clear() = {
accesses = mutable.HashMap[String, IR_VariableAccess]()
ignoredAccesses += "std::cout"
ignoredAccesses += "std::cerr"
ignoredAccesses += "std::endl"
}
this += new Transformation("Searching", {
case decl : IR_VariableDeclaration =>
ignoredAccesses += decl.name
decl
case access : IR_VariableAccess if !ignoredAccesses.contains(access.name) =>
accesses.put(access.name, access)
access
})
}
//=============================================================================
//
// This file is part of the ExaStencils code generation framework. ExaStencils
// is free software: you can redistribute it and/or modify it under the terms
// of the GNU General Public License as published by the Free Software
// Foundation, either version 3 of the License, or (at your option) any later
// version.
//
// ExaStencils is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along
// with ExaStencils. If not, see <http://www.gnu.org/licenses/>.
//
//=============================================================================
package exastencils.parallelization.api.cuda
import scala.collection.mutable
import exastencils.base.ir._
import exastencils.baseExt.ir.IR_LoopOverFragments
import exastencils.config.Knowledge
import exastencils.datastructures._
import exastencils.logger.Logger
import exastencils.optimization.ir.EvaluationException
import exastencils.optimization.ir.IR_SimplifyExpression
import exastencils.parallelization.api.cuda.CUDA_Util._
object CUDA_GatherVariableAccesses extends QuietDefaultStrategy("Gather local VariableAccess nodes") {
var reductionTarget : Option[IR_Expression] = None
var kernelCount : Int = 0
var evaluableAccesses = mutable.HashMap[String, (IR_Access, IR_Datatype)]()
var nonEvaluableAccesses = mutable.HashMap[String, (IR_VariableAccess, IR_Datatype)]()
var ignoredAccesses = mutable.SortedSet[String]()
var ignoredArrayVariableAccesses = mutable.SortedSet[String]()
def basePrefix(base : IR_VariableAccess) = base.name
// regular, evaluable indexed array accesses
def arrayAccessAsString(base : IR_VariableAccess, idx : IR_Expression) = basePrefix(base) + idx.prettyprint()
def containsArrayAccess(base : IR_VariableAccess, idx : IR_Expression) = evaluableAccesses.contains(arrayAccessAsString(base, idx))
// array variable accesses for the case that the whole array is passed to the kernel as an argument (non-evaluable indices)
def arrayVariableAccessAsString(base : IR_VariableAccess) = s"${basePrefix(base)}_deviceCopy_$kernelCount"
def containsArrayVariableAccess(base : IR_VariableAccess) = nonEvaluableAccesses.contains(arrayVariableAccessAsString(base))
def isReplaceable(base : IR_VariableAccess, idx : IR_Expression) =
containsArrayAccess(base, idx) || containsArrayVariableAccess(base)
def replaceAccess(base : IR_VariableAccess, idx : IR_Expression) : Option[IR_Expression] = {
if (isReplaceable(base, idx)) {
if (containsArrayAccess(base, idx)) {
val name = arrayAccessAsString(base, idx)
Some(IR_VariableAccess(name, evaluableAccesses(name)._2))
} else if (containsArrayVariableAccess(base)) {
val name = arrayVariableAccessAsString(base)
Some(IR_ArrayAccess(IR_VariableAccess(name, base.datatype), idx))
} else {
Logger.error("Error while gathering variables for CUDA kernels")
}
} else {
None
}
}
def isEvaluable(idx : IR_Expression) = {
var ret = true
try {
IR_SimplifyExpression.evalIntegral(idx)
} catch {
case _ : EvaluationException => ret = false
case _ : MatchError => ret = false
}
ret
}
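
For example (names hypothetical): v[2] has an evaluable index, so "v2" is recorded in evaluableAccesses and later passed to the kernel as a plain scalar argument; v[i] with a loop-dependent i is not evaluable, so the whole array lands in nonEvaluableAccesses and is mirrored to the device as "v_deviceCopy_<kernelCount>" and indexed there.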
val fragIdx = IR_LoopOverFragments.defIt
def clear() = {
reductionTarget = None
evaluableAccesses = mutable.HashMap[String, (IR_Access, IR_Datatype)]()
nonEvaluableAccesses = mutable.HashMap[String, (IR_VariableAccess, IR_Datatype)]()
ignoredArrayVariableAccesses = mutable.SortedSet[String]()
ignoredAccesses = mutable.SortedSet[String]()
ignoredAccesses += "std::cout"
ignoredAccesses += "std::cerr"
ignoredAccesses += "std::endl"
}
this += new Transformation("Searching", {
case decl : IR_VariableDeclaration =>
ignoredAccesses += decl.name
decl
case arrAcc @ IR_ArrayAccess(base : IR_VariableAccess, idx, _) if !ignoredAccesses.contains(base.name) =>
ignoredArrayVariableAccesses += base.name
if (isEvaluable(idx)) {
// single, evaluable array accesses -> count "base[idx]" as variable access
evaluableAccesses.put(arrayAccessAsString(base, idx), (arrAcc, base.datatype.resolveBaseDatatype))
} else {
// we found a non-evaluable index -> remove previous evaluable accesses
evaluableAccesses.foreach {
case (k, _) if k.startsWith(basePrefix(base)) && k.length > basePrefix(base).length => evaluableAccesses.remove(k)