Commit c2c43e32 authored by Richard Angersbach

Merge remote-tracking branch 'origin/master' into devel/matrix_init_fix

parents d9d5039b e5f5e5c9
This diff is collapsed.
@@ -52,7 +52,9 @@ import exastencils.stencil.ir._
 import exastencils.timing.ir._
 import exastencils.util._
 import exastencils.util.ir._
-import exastencils.visualization.ir._
+import exastencils.visualization.ir.cimg.IR_ResolveCImgFunctions
+import exastencils.visualization.ir.visit.IR_SetupVisit
+import exastencils.visualization.ir.vtk.IR_ResolveVtkPrinters
 /// IR_LayerHandler
@@ -169,6 +171,9 @@ object IR_DefaultLayerHandler extends IR_LayerHandler {
     // simplify indices modified just now, otherwise equality checks will not work later on
     IR_GeneralSimplify.apply()
+    if (Knowledge.visit_enable)
+      IR_SetupVisit.apply()
     var convChanged = false
     do {
       IR_FindStencilConvolutions.changed = false
@@ -185,9 +190,6 @@ object IR_DefaultLayerHandler extends IR_LayerHandler {
       IR_ResolveStencilFunction.apply()
-      if (Knowledge.experimental_visit_enable)
-        IR_SetupVisit.apply()
      // resolve new virtual field accesses
      IR_ResolveIntegrateOnGrid.apply()
      IR_ResolveEvaluateOnGrid.apply()
...
@@ -21,7 +21,8 @@ package exastencils.applications.ir
 import scala.collection.mutable.ListBuffer
 import exastencils.base.ir.IR_ImplicitConversion._
-import exastencils.base.ir.{ IR_Native, _ }
+import exastencils.base.ir.IR_Native
+import exastencils.base.ir._
 import exastencils.config.Knowledge
 import exastencils.datastructures._
 import exastencils.logger.Logger
...
@@ -29,7 +29,7 @@ import exastencils.domain.ir.IR_IV_IsValidForDomain
 import exastencils.field.ir._
 import exastencils.parallelization.api.mpi._
 import exastencils.util.ir.IR_Print
-import exastencils.visualization.ir.IR_PrintVtkQuads
+import exastencils.visualization.ir.vtk.IR_PrintVtkQuads
 /// IR_PrintVtkNNF
...
@@ -29,7 +29,7 @@ import exastencils.domain.ir.IR_IV_IsValidForDomain
 import exastencils.field.ir._
 import exastencils.parallelization.api.mpi._
 import exastencils.util.ir.IR_Print
-import exastencils.visualization.ir.IR_PrintVtkQuads
+import exastencils.visualization.ir.vtk.IR_PrintVtkQuads
 /// IR_PrintVtkNS
...
@@ -32,7 +32,7 @@ import exastencils.grid.ir.IR_AtNode
 import exastencils.logger.Logger
 import exastencils.parallelization.api.mpi._
 import exastencils.util.ir.IR_Print
-import exastencils.visualization.ir.IR_PrintVtkTriangles
+import exastencils.visualization.ir.vtk.IR_PrintVtkTriangles
 /// IR_PrintVtkSWE
...
@@ -311,6 +311,20 @@ object IR_GenerateBasicMatrixOperations {
     stmts
   }
+  // compound-assign ('op') the n_rows x n_cols submatrix located at 'offset_r', 'offset_c' in 'source'
+  // (a flat array with row stride 'sourcesize') to the flat array 'dest'
+  def loopCompoundAssignSubmatrixPointer(source : IR_Expression, sourcesize : IR_Expression, dest : IR_Expression, offset_r : IR_Expression, offset_c : IR_Expression, n_rows : IR_Expression, n_cols : IR_Expression, op : String) : IR_Scope = {
+    var stmts = IR_Scope(Nil)
+    var i = IR_VariableAccess("i", IR_IntegerDatatype)
+    var j = IR_VariableAccess("j", IR_IntegerDatatype)
+    stmts.body += IR_ForLoop(IR_VariableDeclaration(i, offset_r), IR_Lower(i, n_rows + offset_r), IR_PreIncrement(i), ListBuffer[IR_Statement](
+      IR_ForLoop(IR_VariableDeclaration(j, offset_c), IR_Lower(j, offset_c + n_cols), IR_PreIncrement(j), ListBuffer[IR_Statement](
+        IR_Assignment(IR_ArrayAccess(dest, (i - offset_r) * n_cols + j - offset_c),
+          IR_BinaryOperators.createExpression(op, IR_ArrayAccess(dest, (i - offset_r) * n_cols + j - offset_c), IR_ArrayAccess(source, i * sourcesize + j)))
+      ))
+    ))
+    stmts
+  }
   // write a submatrix 'source' of n_rows x n_cols to 'destination' at position 'offset_r', 'offset_c'
   def loopSetSubmatrixMat(source : IR_Expression, destination : IR_Expression, rows_source : IR_Expression, cols_source : IR_Expression, offset_r : IR_Expression, offset_c : IR_Expression) : IR_Scope = {
     if (!isScalar(offset_r) || !isScalar(offset_c))
...
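For clarity, here is a minimal, self-contained Scala sketch (not part of the commit) of the index arithmetic the generated loops perform on plain row-major arrays; all names are illustrative.

// Illustrative model: compound-assign the n_rows x n_cols submatrix of 'source' starting at
// (offset_r, offset_c) -- 'sourcesize' is the row stride of 'source' -- onto the flat array 'dest'.
object CompoundAssignSubmatrixSketch {
  def compoundAssign(source: Array[Double], sourcesize: Int, dest: Array[Double],
                     offset_r: Int, offset_c: Int, n_rows: Int, n_cols: Int,
                     op: (Double, Double) => Double): Unit = {
    for (i <- offset_r until offset_r + n_rows; j <- offset_c until offset_c + n_cols) {
      val d = (i - offset_r) * n_cols + (j - offset_c)  // flat index into dest
      dest(d) = op(dest(d), source(i * sourcesize + j)) // e.g. op == (_ + _) for "+="
    }
  }

  def main(args: Array[String]): Unit = {
    val source = Array(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0) // 3x3, row stride 3
    val dest   = Array(10.0, 10.0, 10.0, 10.0)                      // 2x2 target
    compoundAssign(source, 3, dest, 1, 1, 2, 2, _ + _)
    println(dest.mkString(", ")) // 14.0, 15.0, 17.0, 18.0
  }
}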
@@ -668,10 +668,10 @@ object Knowledge {
   var experimental_grid_randomMaxOffset : Double = 0.1
-  /// student project - Richard / visit
-  // TODO
-  var experimental_visit_enable : Boolean = false
+  // in-situ visualization with VisIt
+  var visit_enable : Boolean = false
+  var experimental_visit_addCurveMesh : Boolean = false
   /// === constraints and resolutions ===
   def update() : Unit = {
...
@@ -284,6 +284,13 @@ object Platform {
     targetCudaCompiler match {
       case "NVCC" =>
         flags += s" -std=c++11 -O3 -DNDEBUG -lineinfo -arch=sm_${ Platform.hw_cuda_capability }${ Platform.hw_cuda_capabilityMinor }"
+        // otherwise mpi.h cannot be found from Globals/Globals.h when compiling with nvcc
+        if (Knowledge.mpi_enabled) {
+          val mpiWrapperFlags = s"$$(shell $resolveCompiler --showme:compile | sed 's/-pthread//g')"
+          if (!Settings.makefile_additionalCudaFlags.contains(mpiWrapperFlags))
+            Settings.makefile_additionalCudaFlags += mpiWrapperFlags
+        }
     }
     flags
...
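The added block pulls the MPI compile flags from the MPI compiler wrapper into the CUDA flags so that nvcc can locate mpi.h; -pthread is filtered out, presumably because nvcc does not accept it. A stand-alone sketch of the string that gets appended, assuming an Open MPI wrapper such as mpicxx for resolveCompiler (names are illustrative, not the generator's API):

// Hypothetical stand-alone model of the flag construction above.
object MpiCudaFlagsSketch {
  def mpiWrapperFlags(resolveCompiler: String): String =
    s"$$(shell $resolveCompiler --showme:compile | sed 's/-pthread//g')"

  def main(args: Array[String]): Unit = {
    var additionalCudaFlags = Vector.empty[String]
    val flags = mpiWrapperFlags("mpicxx")
    if (!additionalCudaFlags.contains(flags)) // append only once, mirroring the guard above
      additionalCudaFlags :+= flags
    // expands in the generated Makefile to: $(shell mpicxx --showme:compile | sed 's/-pthread//g')
    println(additionalCudaFlags.mkString(" "))
  }
}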
@@ -157,13 +157,14 @@ object IR_AddInternalVariables extends DefaultStrategy("Add internal variables")
     case buf : CUDA_ReductionDeviceData =>
       val id = buf.resolveAccess(buf.resolveName(), IR_LoopOverFragments.defIt, IR_NullExpression, IR_NullExpression, IR_NullExpression, IR_NullExpression).prettyprint
+      val totalSize : IR_Expression = buf.numPoints * buf.targetDt.getSizeArray.product
       if (Knowledge.data_genVariableFieldSizes) {
         if (deviceBufferSizes.contains(id))
-          deviceBufferSizes(id).asInstanceOf[IR_Maximum].args += Duplicate(buf.size)
+          deviceBufferSizes(id).asInstanceOf[IR_Maximum].args += Duplicate(totalSize)
         else
-          deviceBufferSizes += (id -> IR_Maximum(ListBuffer(Duplicate(buf.size))))
+          deviceBufferSizes += (id -> IR_Maximum(ListBuffer(Duplicate(totalSize))))
       } else {
-        val size = IR_SimplifyExpression.evalIntegral(buf.size)
+        val size = IR_SimplifyExpression.evalIntegral(totalSize)
         deviceBufferSizes += (id -> (size max deviceBufferSizes.getOrElse(id, IR_IntegerConstant(0)).asInstanceOf[IR_IntegerConstant].v))
       }
       buf
@@ -214,7 +215,7 @@ object IR_AddInternalVariables extends DefaultStrategy("Add internal variables")
       val id = buf.resolveAccess(buf.resolveName(), IR_LoopOverFragments.defIt, IR_NullExpression, buf.field.index, buf.field.level, buf.neighIdx).prettyprint
       val size = deviceBufferSizes(id)
-      deviceBufferAllocs += (id -> IR_LoopOverFragments(CUDA_Allocate(buf, size, IR_RealDatatype /*FIXME*/), IR_ParallelizationInfo(potentiallyParallel = true)))
+      deviceBufferAllocs += (id -> IR_LoopOverFragments(CUDA_Allocate(buf, size, buf.field.resolveBaseDatatype), IR_ParallelizationInfo(potentiallyParallel = true)))
       buf
@@ -222,7 +223,7 @@ object IR_AddInternalVariables extends DefaultStrategy("Add internal variables")
       val id = buf.resolveAccess(buf.resolveName(), IR_LoopOverFragments.defIt, IR_NullExpression, IR_NullExpression, IR_NullExpression, IR_NullExpression).prettyprint
       val size = deviceBufferSizes(id)
-      deviceBufferAllocs += (id -> IR_LoopOverFragments(CUDA_Allocate(buf, size, IR_RealDatatype /*FIXME*/), IR_ParallelizationInfo(potentiallyParallel = true)))
+      deviceBufferAllocs += (id -> IR_LoopOverFragments(CUDA_Allocate(buf, size, buf.baseDt), IR_ParallelizationInfo(potentiallyParallel = true)))
       buf
...
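The new totalSize accounts for reductions over non-scalar targets: the device buffer must hold one copy of the target datatype per reduced point. A tiny stand-alone sketch of the arithmetic (assuming the size array lists the per-dimension extents of the target datatype; names are illustrative):

// Hypothetical model of the size computation: a reduction over 'numPoints' values of a
// matrix-valued target needs numPoints * (rows * cols) scalar slots in the device buffer.
object ReductionBufferSizeSketch {
  def totalSize(numPoints: Long, targetDtSizes: Seq[Long]): Long =
    numPoints * targetDtSizes.product

  def main(args: Array[String]): Unit = {
    println(totalSize(1024, Seq(1)))    // scalar target: 1024
    println(totalSize(1024, Seq(3, 3))) // 3x3 matrix target: 9216
  }
}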
@@ -205,6 +205,10 @@ object IR_SimplifyExpression {
       res = new mutable.HashMap[IR_Expression, Long]()
       res(m) = 1L
+    case m : IR_MemberFunctionCall =>
+      res = new mutable.HashMap[IR_Expression, Long]()
+      res(m) = 1L
     case IR_StringLiteral(varName) =>
       res = new HashMap[IR_Expression, Long]()
       res(IR_VariableAccess(varName, IR_IntegerDatatype)) = 1L // ONLY VariableAccess in res keys, NO StringLiteral
...
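The new case lets the simplifier keep a member function call as an opaque term with coefficient 1 instead of aborting the extraction. A rough stand-alone model of that idea (not the ExaStencils API), using strings as term keys:

// Illustrative model: the simplifier represents an integral expression as a map from (opaque)
// terms to integer coefficients; a member function call such as vec.size() simply becomes one
// more opaque key with coefficient 1.
object OpaqueTermSumSketch {
  type Sum = Map[String, Long]

  def add(a: Sum, b: Sum): Sum =
    (a.keySet ++ b.keySet).map(k => k -> (a.getOrElse(k, 0L) + b.getOrElse(k, 0L))).toMap

  def main(args: Array[String]): Unit = {
    val lhs: Sum = Map("i" -> 2L)          // 2*i
    val rhs: Sum = Map("vec.size()" -> 1L) // opaque member call, coefficient 1
    println(add(lhs, rhs))                 // Map(i -> 2, vec.size() -> 1)
  }
}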
@@ -21,13 +21,20 @@ package exastencils.parallelization.api.cuda
 import scala.annotation.tailrec
 import scala.collection.mutable
 import scala.collection.mutable.ListBuffer
 import exastencils.base.ir.IR_ImplicitConversion._
 import exastencils.base.ir._
+import exastencils.baseExt.ir.IR_MatOperations.IR_GenerateBasicMatrixOperations
+import exastencils.baseExt.ir._
+import exastencils.config.Knowledge
 import exastencils.core._
 import exastencils.datastructures._
+import exastencils.logger.Logger
 import exastencils.optimization.ir.IR_SimplifyExpression
+import exastencils.parallelization.ir.IR_HasParallelizationInfo
 import exastencils.solver.ir.IR_InlineMatSolveStmts
 import exastencils.util.ir.IR_FctNameCollector
+import exastencils.util.ir.IR_StackCollector
 /// CUDA_ExtractHostAndDeviceCode
@@ -35,10 +42,14 @@ import exastencils.util.ir.IR_FctNameCollector
  * This transformation is used to convert annotated code into CUDA kernel code.
  */
 object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotated CUDA loop in kernel code") {
-  val collector = new IR_FctNameCollector
-  this.register(collector)
+  val fctNameCollector = new IR_FctNameCollector
+  val stackCollector = new IR_StackCollector
+  this.register(fctNameCollector)
+  this.register(stackCollector)
   this.onBefore = () => this.resetCollectors()
+  var enclosingFragmentLoops : mutable.HashMap[IR_ScopedStatement with IR_HasParallelizationInfo, IR_Reduction] = mutable.HashMap()
   /**
    * Collect all loops in the band.
    *
@@ -73,6 +84,31 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
     }
   }
+  this += Transformation("Find reductions with enclosing fragment loops", {
+    case loop : IR_ForLoop if loop.hasAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION) &&
+      loop.getAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION).contains(CUDA_Util.CUDA_BAND_START) =>
+      val enclosing = stackCollector.stack.collectFirst {
+        case fragLoop : IR_LoopOverFragments => fragLoop
+        case fragLoop @ IR_ForLoop(IR_VariableDeclaration(_, name, _, _), _, _, _, _) if name == IR_LoopOverFragments.defIt.name => fragLoop
+      }
+      val fragLoopIsSerial = !Knowledge.omp_enabled || (Knowledge.omp_enabled && !Knowledge.omp_parallelizeLoopOverFragments)
+      if (enclosing.isDefined && fragLoopIsSerial && loop.parallelization.reduction.isDefined)
+        enclosingFragmentLoops += (enclosing.get -> loop.parallelization.reduction.get)
+      loop
+  }, false)
+  // enclosed by a fragment loop -> create fragment-local copies of the initial value
+  // and perform reduction after frag loop
+  this += Transformation("Modify enclosing fragment loops", {
+    case fragLoop : IR_LoopOverFragments if enclosingFragmentLoops.contains(fragLoop) =>
+      CUDA_HandleFragmentLoopsWithReduction(fragLoop, enclosingFragmentLoops(fragLoop))
+    case fragLoop @ IR_ForLoop(IR_VariableDeclaration(_, name, _, _), _, _, _, _) if enclosingFragmentLoops.contains(fragLoop) && name == IR_LoopOverFragments.defIt.name =>
+      CUDA_HandleFragmentLoopsWithReduction(fragLoop, enclosingFragmentLoops(fragLoop))
+  }, false)
   this += new Transformation("Processing ForLoopStatement nodes", {
     case loop : IR_ForLoop if loop.hasAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION) &&
       loop.getAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION).contains(CUDA_Util.CUDA_BAND_START) =>
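The first new transformation records, for every CUDA band-start loop that carries a reduction, the innermost enclosing fragment loop, provided that loop is not parallelized over fragments via OpenMP. A simplified, self-contained sketch of the ancestor lookup (hypothetical node model, not the ExaStencils IR):

// Simplified model: walk the collector's ancestor stack from the innermost node outwards and
// pick the first fragment loop; only loops matching the fragment induction variable count.
object EnclosingFragmentLoopSketch {
  sealed trait Node
  case class LoopOverFragments(id: Int)             extends Node
  case class ForLoop(inductionVar: String, id: Int) extends Node
  case class Other(desc: String)                    extends Node

  val fragmentsDefIt = "fragmentIdx" // stand-in for IR_LoopOverFragments.defIt.name

  def enclosingFragmentLoop(stack: List[Node]): Option[Node] = stack.collectFirst {
    case l : LoopOverFragments                          => l
    case l @ ForLoop(name, _) if name == fragmentsDefIt => l
  }

  def main(args: Array[String]): Unit = {
    val ancestors = List(Other("condition"), ForLoop("fragmentIdx", 7), LoopOverFragments(1))
    println(enclosingFragmentLoop(ancestors)) // Some(ForLoop(fragmentIdx,7)) -- innermost match wins
  }
}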
@@ -102,10 +138,74 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
       // add kernel and kernel call
       val kernelFunctions = CUDA_KernelFunctions.get
-      // collect local variable accesses because these variables need to be passed to the kernel at call
-      CUDA_GatherVariableAccess.clear()
-      CUDA_GatherVariableAccess.applyStandalone(IR_Scope(loop))
-      val variableAccesses = CUDA_GatherVariableAccess.accesses.toSeq.sortBy(_._1).map(_._2).to[ListBuffer]
+      val kernelCount = kernelFunctions.counterMap.getOrElse(fctNameCollector.getCurrentName, -1) + 1
+      val reduction = loop.parallelization.reduction
+      // local variable for kernels with reductions
+      val localTarget = if (reduction.isDefined)
+        Some(IR_VariableAccess(reduction.get.targetName + "_local_" + kernelCount, CUDA_Util.getReductionDatatype(reduction.get.target)))
+      else
+        None
+      // collect local accesses because their variables need to be passed to the kernel when calling
+      CUDA_GatherVariableAccesses.clear()
+      CUDA_GatherVariableAccesses.kernelCount = kernelCount
+      if (reduction.isDefined)
+        CUDA_GatherVariableAccesses.reductionTarget = Some(reduction.get.target)
+      CUDA_GatherVariableAccesses.applyStandalone(IR_Scope(loop))
+      // declare and init local reduction target
+      if (localTarget.isDefined) {
+        var decl = IR_VariableDeclaration(localTarget.get)
+        var initLocalTarget = CUDA_Util.getReductionDatatype(reduction.get.target) match {
+          case _ : IR_ScalarDatatype =>
+            ListBuffer[IR_Statement](IR_Assignment(localTarget.get, reduction.get.target))
+          case mat : IR_MatrixDatatype =>
+            reduction.get.target match {
+              case vAcc : IR_VariableAccess =>
+                IR_GenerateBasicMatrixOperations.loopSetSubmatrixMatPointer(
+                  vAcc, localTarget.get, mat.sizeN, mat.sizeM, mat.sizeN, 0, 0).body
+              case expr =>
+                Logger.error("Cannot set submatrix for expression: " + expr)
+            }
+        }
+        // also detect accesses coming from the init of the local target
+        CUDA_GatherVariableAccesses.applyStandalone(IR_Scope(decl))
+        CUDA_GatherVariableAccesses.applyStandalone(IR_Scope(initLocalTarget))
+        // replace array accesses with accesses to function arguments
+        CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = None // actually allow reduction var to be replaced here
+        CUDA_ReplaceNonReductionVarArrayAccesses.applyStandalone(IR_Scope(decl))
+        CUDA_ReplaceNonReductionVarArrayAccesses.applyStandalone(IR_Scope(initLocalTarget))
+        kernelBody.prepend(initLocalTarget : _*)
+        kernelBody.prepend(decl)
+      }
+      // access collections
+      val accesses = CUDA_GatherVariableAccesses.evaluableAccesses.toSeq.sortBy(_._1).to[ListBuffer]
+      val accessesCopiedToDevice = CUDA_GatherVariableAccesses.nonEvaluableAccesses.toSeq.sortBy(_._1).to[ListBuffer]
+      // add non-evaluable accesses in form of pointers to device copies
+      val deviceArrayCopies = accessesCopiedToDevice.map {
+        case (k, v) =>
+          val copyName = CUDA_GatherVariableAccesses.arrayVariableAccessAsString(v._1)
+          val copyDt = IR_PointerDatatype(v._2.resolveBaseDatatype)
+          (k, IR_VariableAccess(copyName, copyDt))
+      }.toMap
+      // parameters of the kernel
+      val params = ListBuffer[IR_FunctionArgument]()
+      params ++= accesses.map { case (name, tup) => IR_FunctionArgument(name, tup._2) }
+      params ++= deviceArrayCopies.values.map(IR_FunctionArgument(_))
+      // args passed to kernel
+      val args = ListBuffer[IR_Expression]()
+      args ++= accesses.map { case (_, tup) => tup._1 : IR_Expression }
+      args ++= deviceArrayCopies.values
       var extremaMap = mutable.HashMap[String, (Long, Long)]()
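The hunk above also reworks the kernel interface bookkeeping: a fragment-local reduction variable named after the reduction target and the running kernel count, and paired parameter/argument lists in which evaluable accesses are passed by value and non-evaluable array accesses as pointers to their device copies. A small stand-alone sketch of that bookkeeping, with hypothetical names:

// Hypothetical model of the per-kernel bookkeeping shown above.
object KernelBookkeepingSketch {
  case class Param(name: String, datatype: String)

  def localReductionName(targetName: String, kernelCount: Int): String =
    targetName + "_local_" + kernelCount

  def main(args: Array[String]): Unit = {
    println(localReductionName("totalError", 2)) // totalError_local_2

    val evaluable    = Seq("numCells" -> "int", "dt" -> "double") // passed by value
    val deviceCopies = Seq("coeff_dev_0" -> "double*")            // pointers to device copies

    val params   = (evaluable ++ deviceCopies).map { case (n, t) => Param(n, t) }
    val callArgs = (evaluable ++ deviceCopies).map(_._1)          // same order as the parameters

    println(params.mkString(", "))   // Param(numCells,int), Param(dt,double), Param(coeff_dev_0,double*)
    println(callArgs.mkString(", ")) // numCells, dt, coeff_dev_0
  }
}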
@@ -113,32 +213,78 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
         extremaMap = m.asInstanceOf[mutable.HashMap[String, (Long, Long)]]
       // inline contained calls to solve functions to avoid separate compilation units
-      IR_InlineMatSolveStmts.applyStandalone(kernelBody)
+      IR_InlineMatSolveStmts.applyStandalone(IR_Scope(kernelBody))
+      // replace array accesses with accesses to function arguments
+      // reduction var is not replaced, but later in IR_HandleReductions
+      if (reduction.isDefined)
+        CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = Some(reduction.get.target)
+      else
+        CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = None
+      CUDA_ReplaceNonReductionVarArrayAccesses.applyStandalone(IR_Scope(kernelBody))
       val kernel = CUDA_Kernel(
-        kernelFunctions.getIdentifier(collector.getCurrentName),
+        kernelCount,
+        kernelFunctions.getIdentifier(fctNameCollector.getCurrentName),
         parallelInnerLoops.length,
-        variableAccesses.map(s => IR_FunctionArgument(s.name, s.datatype)),
+        params,
         Duplicate(loopVariables),
         Duplicate(lowerBounds),
         Duplicate(upperBounds),
         Duplicate(stepSize),
         Duplicate(kernelBody),
-        Duplicate(loop.parallelization.reduction),
+        Duplicate(reduction),
+        Duplicate(localTarget),
         Duplicate(extremaMap))
       kernelFunctions.addKernel(Duplicate(kernel))
+      // copy array variables from host to device if necessary
+      if (deviceArrayCopies.nonEmpty) {
+        deviceArrayCopies foreach { case (k, dstArr) =>
+          val (srcArr, srcDt) = accessesCopiedToDevice.find(_._1 == k).get._2
+          deviceStatements += IR_VariableDeclaration(dstArr)
+          deviceStatements += CUDA_Allocate(dstArr, srcDt.getSizeArray.product, srcDt.resolveBaseDatatype)
+          deviceStatements += CUDA_Memcpy(dstArr, srcArr, srcDt.typicalByteSize, "cudaMemcpyHostToDevice")
+        }
+      }
       // process return value of kernel wrapper call if reduction is required
-      if (loop.parallelization.reduction.isDefined) {
-        val red = loop.parallelization.reduction.get
-        deviceStatements += IR_Assignment(red.target,
-          IR_BinaryOperators.createExpression(red.op, red.target,
-            IR_FunctionCall(kernel.getWrapperFctName, variableAccesses.map(_.asInstanceOf[IR_Expression]))))
+      val callKernel = IR_FunctionCall(kernel.getWrapperFctName, args)
+      if (reduction.isDefined) {
+        val red = Duplicate(reduction.get)
+        val redTarget = Duplicate(red.target)
+        val reductionDt = CUDA_Util.getReductionDatatype(redTarget)
+        reductionDt match {
+          case mat : IR_MatrixDatatype =>
+            val baseDt = mat.resolveBaseDatatype
+            // declare and allocate tmp buffer for matrix reduction
+            val reductionTmp = IR_VariableAccess("reductionTmpMatrix", IR_PointerDatatype(baseDt))
+            deviceStatements += IR_VariableDeclaration(reductionTmp)
+            deviceStatements += IR_ArrayAllocation(reductionTmp, baseDt, mat.sizeN * mat.sizeM)
+            // call kernel and pass allocated tmp buffer by pointer
+            callKernel.arguments += reductionTmp
+            deviceStatements += callKernel
+            // update reduction target
+            deviceStatements += IR_GenerateBasicMatrixOperations.loopCompoundAssignSubmatrixPointer(
+              reductionTmp, mat.sizeN, red.target, 0, 0, mat.sizeM, mat.sizeN, red.op)
+            // free allocated buffer
+            deviceStatements += IR_ArrayFree(reductionTmp)
+          case _ : IR_ScalarDatatype =>
+            deviceStatements += IR_Assignment(red.target, IR_BinaryOperators.createExpression(red.op, red.target, callKernel))
+        }
       } else {
-        deviceStatements += IR_FunctionCall(kernel.getWrapperFctName, variableAccesses.map(_.asInstanceOf[IR_Expression]))
+        deviceStatements += callKernel
       }
+      // destroy device copies
+      if (deviceArrayCopies.nonEmpty)
+        deviceStatements ++= deviceArrayCopies.keys.map(CUDA_Free(_))
       deviceStatements
   }, false)
 }
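For the matrix-valued reduction path added above, the generated host code allocates a temporary flat buffer, lets the kernel wrapper write its partial result there, folds it into the reduction target with the reduction operator, and frees the buffer again. A minimal, self-contained Scala sketch of that epilogue (illustrative only, not generated code):

// Hypothetical host-side model of the matrix-reduction epilogue.
object MatrixReductionEpilogueSketch {
  def main(args: Array[String]): Unit = {
    val (rows, cols) = (2, 2)
    val target = Array(1.0, 1.0, 1.0, 1.0)            // host-side reduction target (row-major 2x2)
    val reductionTmp = new Array[Double](rows * cols) // stands in for the allocated tmp buffer

    // "kernel call": pretend the device produced these partial sums in reductionTmp
    Array(0.5, 1.5, 2.5, 3.5).copyToArray(reductionTmp)

    // fold the partial result into the target with the reduction operator (here: "+")
    for (i <- 0 until rows; j <- 0 until cols)
      target(i * cols + j) += reductionTmp(i * cols + j)

    println(target.mkString(", ")) // 1.5, 2.5, 3.5, 4.5
    // reductionTmp would be freed at this point in the generated code
  }
}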
//=============================================================================
//
// This file is part of the ExaStencils code generation framework. ExaStencils
// is free software: you can redistribute it and/or modify it under the terms
// of the GNU General Public License as published by the Free Software
// Foundation, either version 3 of the License, or (at your option) any later
// version.
//
// ExaStencils is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along
// with ExaStencils. If not, see <http://www.gnu.org/licenses/>.
//
//=============================================================================
package exastencils.parallelization.api.cuda
import scala.collection.mutable
import exastencils.base.ir._
import exastencils.datastructures._
object CUDA_GatherVariableAccess extends QuietDefaultStrategy("Gather local VariableAccess nodes") {
var accesses = mutable.HashMap[String, IR_VariableAccess]()
var ignoredAccesses = mutable.SortedSet[String]()
def clear() = {
accesses = mutable.HashMap[String, IR_VariableAccess]()
ignoredAccesses += "std::cout"
ignoredAccesses += "std::cerr"
ignoredAccesses += "std::endl"
}
this += new Transformation("Searching", {
case decl : IR_VariableDeclaration =>
ignoredAccesses += decl.name
decl
case access : IR_VariableAccess if !ignoredAccesses.contains(access.name) =>
accesses.put(access.name, access)