Commit 4c96a571 authored by Sara Faghih-Naini

Merge branch 'devel/fix_cuda_nodePos_comm' into 'master'

devel/fix_cuda_nodePos_comm

See merge request exastencils/exastencils!72
parents 7e484447 8d4071ca
......@@ -52,6 +52,7 @@ stages:
- python3 --version
- mpirun --version
- nvcc --version
- nvidia-smi
- updatedb
- locate cuda.h
- locate libcudart
......
......@@ -325,10 +325,18 @@ object IR_DefaultLayerHandler extends IR_LayerHandler {
if (Knowledge.data_genVariableFieldSizes)
IR_GenerateIndexManipFcts.apply()
// adapt accesses to device data in case of managed memory
if (Knowledge.cuda_enabled && Knowledge.cuda_useManagedMemory)
CUDA_AdaptDeviceAccessesForMM.apply()
IR_AddInternalVariables.apply()
// resolve possibly newly added constant IVs
IR_ResolveConstIVs.apply()
// adapt allocations and de-allocations before expanding
if (Knowledge.cuda_enabled)
CUDA_AdaptAllocations.apply()
if (Knowledge.useFasterExpand)
IR_ExpandInOnePass.apply()
else
......
......@@ -26,7 +26,6 @@ import exastencils.base.ir._
import exastencils.config.Knowledge
import exastencils.datastructures._
import exastencils.logger.Logger
import exastencils.parallelization.api.cuda._
import exastencils.parallelization.api.mpi._
import exastencils.parallelization.api.omp.OMP_Parallel
......@@ -51,11 +50,6 @@ object IR_HandleMainApplication extends DefaultStrategy("HandleMainApplication")
func.body.append(IR_Native("LIKWID_MARKER_CLOSE"))
}
if (Knowledge.cuda_enabled) {
func.body.prepend(CUDA_Init)
func.body.append(CUDA_Finalize)
}
if (Knowledge.mpi_enabled) {
func.body.prepend(MPI_Init)
func.body.append(MPI_Finalize)
......
......@@ -38,7 +38,7 @@ case class IR_ArrayAllocation(var name : IR_Expression, // no string - could be
case class IR_ScalarAllocation(var name : IR_Expression, // no string - could be an IV
var datatype : IR_Datatype,
) extends IR_Statement {
//override def datatype = IR_UnitDatatype
override def prettyprint(out : PpStream) : Unit = out << name << " = " << "new" << ' ' << datatype << ";"
}
......@@ -49,7 +49,6 @@ case class IR_ArrayFree(var pointer : IR_Expression) extends IR_Statement {
override def prettyprint(out : PpStream) : Unit = out << "delete[] " << pointer << ";"
}
/// IR_ScalarFree
case class IR_ScalarFree(var pointer : IR_Expression) extends IR_Statement {
override def prettyprint(out : PpStream) : Unit = out << "delete " << pointer << ";"
......
......@@ -515,6 +515,16 @@ object Knowledge {
// apply spatial blocking with read-only cache
var cuda_spatialBlockingWithROC : Boolean = false
// use pinned memory to allocate host field data and buffers
var cuda_usePinnedHostMemory : Boolean = true
// use managed memory instead of host and device variants for field data and buffers
var cuda_useManagedMemory : Boolean = false
// replace device variants of field data and buffers with device pointers derived from host counterparts
var cuda_useZeroCopy : Boolean = false
// only relevant if cuda_useManagedMemory == true; replace cuda memcpy with asynchronous prefetches
var cuda_genAsyncPrefetch : Boolean = true
// if true, the first dimension of the block size is enlarged if the kernel dimensionality is lower than the global dimensionality
var cuda_foldBlockSizeForRedDimensionality : Boolean = true
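Sketch (not generator output): with cuda_useZeroCopy, no separate device allocation is made; the generated host code roughly corresponds to the following, where hostData/deviceData and numElements are illustrative names:
double *hostData = nullptr, *deviceData = nullptr;
cudaHostAlloc((void **) &hostData, numElements * sizeof(double), cudaHostAllocMapped); // pinned, mapped host memory
cudaHostGetDevicePointer((void **) &deviceData, hostData, 0); // device pointer derived from the host counterpart
// kernels operate on deviceData directly, so no explicit host<->device memcpy is required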
......@@ -647,9 +657,6 @@ object Knowledge {
var experimental_CTPivotElimination : Boolean = false
var experimental_QRPivot : Boolean = false
// eliminate occurrences of cudaContext - required for PizDaint
var experimental_eliminateCudaContext : Boolean = false
// generate cuda kernels independently of them being parallel or not
var experimental_cuda_generateKernelForNonParallel : Boolean = false
......@@ -672,7 +679,6 @@ object Knowledge {
var visit_enable : Boolean = false
var experimental_visit_addCurveMesh : Boolean = false
/// === constraints and resolutions ===
def update() : Unit = {
// NOTE: it is required to call update at least once
......@@ -774,7 +780,13 @@ object Knowledge {
Constraints.condWarn(cuda_enabled && opt_conventionalCSE && !useDblPrecision, "Double precision should be used if CUDA is enabled and CSE should be applied!")
Constraints.condEnsureValue(useDblPrecision, true, cuda_enabled && opt_conventionalCSE)
Constraints.condError(!cuda_memory_transfer_elimination_options.contains(cuda_eliminate_memory_transfers), "Invalid value for \"cuda_eliminate_memory_transfers\". Should be one of: " + cuda_memory_transfer_elimination_options.mkString(",") )
Constraints.condError(!cuda_memory_transfer_elimination_options.contains(cuda_eliminate_memory_transfers), "Invalid value for \"cuda_eliminate_memory_transfers\". Should be one of: " + cuda_memory_transfer_elimination_options.mkString(","))
Constraints.condEnsureValue(cuda_usePinnedHostMemory, true, cuda_useZeroCopy)
Constraints.condError(cuda_useManagedMemory && cuda_usePinnedHostMemory, "cuda_useManagedMemory and cuda_usePinnedHostMemory are mutually exclusive")
Constraints.condError(cuda_useManagedMemory && cuda_useZeroCopy, "cuda_useManagedMemory and cuda_useZeroCopy are mutually exclusive")
Constraints.condEnsureValue(data_alignFieldPointers, false, cuda_useManagedMemory || cuda_usePinnedHostMemory)
Constraints.condEnsureValue(data_alignTmpBufferPointers, false, cuda_useManagedMemory || cuda_usePinnedHostMemory)
Constraints.condWarn(experimental_splitLoopsForAsyncComm && !comm_onlyAxisNeighbors, s"Using asynchronous communication with comm_onlyAxisNeighbors leads to problems with stencils containing diagonal entries")
......
......@@ -203,7 +203,7 @@ object IR_AddInternalVariables extends DefaultStrategy("Add internal variables")
bufferAllocs += (id -> IR_LoopOverFragments(
IR_ArrayAllocation(
buf,
if(buf.field.layout.datatype.isInstanceOf[IR_ComplexDatatype]) buf.field.layout.datatype
if (buf.field.layout.datatype.isInstanceOf[IR_ComplexDatatype]) buf.field.layout.datatype
else IR_RealDatatype,
size
), IR_ParallelizationInfo(potentiallyParallel = true)))
......@@ -259,9 +259,11 @@ object IR_AddInternalVariables extends DefaultStrategy("Add internal variables")
func.body += genericAlloc._2
for (deviceAlloc <- deviceFieldAllocs.toSeq.sortBy(_._1) ++ deviceBufferAllocs.toSeq.sortBy(_._1))
if ("Condition" == Knowledge.cuda_preferredExecution)
func.body += IR_IfCondition(IR_Negation(Knowledge.cuda_executionCondition), deviceAlloc._2)
else if ("MSVC" == Platform.targetCompiler /*&& Platform.targetCompilerVersion <= 11*/ ) // fix for https://support.microsoft.com/en-us/kb/315481
if ("Condition" == Knowledge.cuda_preferredExecution) {
val loop = deviceAlloc._2.asInstanceOf[IR_LoopOverFragments]
loop.body = ListBuffer(IR_IfCondition(IR_Negation(Knowledge.cuda_executionCondition), loop.body))
func.body += loop
} else if ("MSVC" == Platform.targetCompiler /*&& Platform.targetCompilerVersion <= 11*/ ) // fix for https://support.microsoft.com/en-us/kb/315481
func.body += IR_Scope(deviceAlloc._2)
else
func.body += deviceAlloc._2
......
......@@ -79,6 +79,7 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
case ex : VectorizationException =>
if (DEBUG) {
val msg : String = "[vect] unable to vectorize loop: " + ex.msg + " (line " + ex.getStackTrace()(0).getLineNumber + ')'
Logger.warn(msg)
println(msg) // print directly, logger may be silenced by any surrounding strategy
return List(IR_Comment(msg), node)
}
......@@ -144,6 +145,8 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
private var alignedResidue : Long = -1
private val nameTempl : String = "_vec%02d"
private var reductionVarArrayAccesses : Option[IR_ArrayAccess] = None
// init
pushScope()
......@@ -241,6 +244,14 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
def getAlignedResidue() : Long = {
alignedResidue
}
def setReductionArrayAccess(arrAcc : IR_ArrayAccess) = {
reductionVarArrayAccesses = Some(arrAcc)
}
def getReductionArrayAccess() = {
reductionVarArrayAccesses
}
}
private def containsVarAcc(node : IR_Node, varName : String) : Boolean = {
......@@ -262,8 +273,12 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
val ctx = new LoopCtx(itVar, incr)
var postLoopStmt : IR_Statement = null
if (reduction.isDefined) {
val target = reduction.get.target
val target = Duplicate(reduction.get.target)
val operator = reduction.get.op
target match {
case arrAcc : IR_ArrayAccess => ctx.setReductionArrayAccess(arrAcc)
case _ =>
}
val (vecTmp : String, true) = ctx.getName(target)
val identityElem : IR_Expression =
......@@ -602,6 +617,11 @@ private object VectorizeInnermost extends PartialFunction[Node, Transformation.O
private def vectorizeExpr(expr : IR_Expression, ctx : LoopCtx) : IR_Expression = {
expr match {
case arrAcc : IR_ArrayAccess if ctx.getReductionArrayAccess().contains(arrAcc) =>
// vec was already added to ctx and declared
val (vecTmp : String, false) = ctx.getName(expr)
IR_VariableAccess(vecTmp, SIMD_RealDatatype)
// TODO: do not vectorize if base is not aligned?
case IR_ArrayAccess(base, index, alignedBase) =>
val (vecTmp : String, njuTmp : Boolean) = ctx.getName(expr)
......
......@@ -31,19 +31,16 @@ object CUDA_AddGlobals extends NoTraversalStrategy("Extend globals for CUDA") {
override def doWork() : Unit = {
val globals = IR_GlobalCollection.get
if (!Knowledge.experimental_eliminateCudaContext)
globals.variables += IR_VariableDeclaration("CUcontext", "cudaContext")
globals.variables += IR_VariableDeclaration("CUdevice", "cudaDevice")
val initFunc = globals.functions.find(_.name == "initGlobals").get.asInstanceOf[IR_Function]
initFunc.body ++= ListBuffer[IR_Statement](
IR_VariableDeclaration(IR_IntegerDatatype, "deviceCount", 0),
"cuDeviceGetCount(&deviceCount)",
"cudaGetDeviceCount(&deviceCount)",
IR_Assert(IR_Lower(Knowledge.cuda_deviceId, "deviceCount"),
ListBuffer("\"Invalid device id (\"", Knowledge.cuda_deviceId, "\") must be smaller than the number of devices (\"", "deviceCount", "\")\""),
IR_FunctionCall("exit", 1)),
s"cuDeviceGet(&cudaDevice, ${ Knowledge.cuda_deviceId })")
IR_FunctionCall(IR_ExternalFunctionReference("exit"), 1)),
s"cudaSetDevice(${ Knowledge.cuda_deviceId })"
)
// print device info (name)
if (!Knowledge.testing_enabled) {
......@@ -53,10 +50,6 @@ object CUDA_AddGlobals extends NoTraversalStrategy("Extend globals for CUDA") {
IR_RawPrint("\"Using CUDA device \"", Knowledge.cuda_deviceId, "\": \"", "devProp.name", "std::endl"))
}
// create context
if (!Knowledge.experimental_eliminateCudaContext)
initFunc.body += "cuCtxCreate(&cudaContext, 0, cudaDevice)"
// set L1 cache and shared memory configuration for this device
if (Knowledge.cuda_useSharedMemory)
initFunc.body += "cudaDeviceSetCacheConfig(cudaFuncCachePreferShared)"
......
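Sketch of the runtime-API initialization that the rewritten initGlobals roughly corresponds to (deviceId stands for the configured cuda_deviceId; illustrative only, not the literal generated code):
int deviceCount = 0;
cudaGetDeviceCount(&deviceCount); // runtime API instead of cuDeviceGetCount
assert(deviceId < deviceCount && "Invalid device id: must be smaller than the number of devices");
cudaSetDevice(deviceId); // replaces cuDeviceGet + cuCtxCreate; the runtime API manages the context implicitly
cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); // only emitted if cuda_useSharedMemory is set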
......@@ -19,28 +19,8 @@
package exastencils.parallelization.api.cuda
import exastencils.base.ir._
import exastencils.config.Knowledge
import exastencils.prettyprinting.PpStream
/// CUDA_Init
case object CUDA_Init extends CUDA_DeviceStatement {
override def prettyprint(out : PpStream) : Unit = {
if (!Knowledge.experimental_eliminateCudaContext)
out << "cuInit(0);"
}
}
/// CUDA_Finalize
case object CUDA_Finalize extends CUDA_DeviceStatement {
override def prettyprint(out : PpStream) : Unit = {
// has to be done after all other de-initialization statements
if (!Knowledge.experimental_eliminateCudaContext)
out << "cuCtxDestroy(cudaContext);"
}
}
/// CUDA_DeviceSynchronize
case class CUDA_DeviceSynchronize() extends CUDA_HostStatement with IR_Expandable {
......
......@@ -48,16 +48,16 @@ case class CUDA_CheckError(var exp : IR_Expression) extends CUDA_HostStatement w
def print = IR_RawPrint("\"CUDA error in file (\"", "__FILE__", "\"), line (\"", "__LINE__", "\"): \"", status,
"\" -> \"", IR_FunctionCall(IR_ExternalFunctionReference("cudaGetErrorString"), status), "std::endl")
def printAndExit : ListBuffer[IR_Statement] = ListBuffer(print, IR_FunctionCall(IR_ExternalFunctionReference("exit"), 1))
ListBuffer(
IR_VariableDeclaration(status, exp),
IR_IfCondition("cudaSuccess" Neq status,
print,
printAndExit,
ListBuffer(
IR_Assignment(status, IR_FunctionCall(IR_ExternalFunctionReference("cudaGetLastError"))),
IR_IfCondition("cudaSuccess" Neq status,
ListBuffer[IR_Statement](
print,
IR_FunctionCall(IR_ExternalFunctionReference("exit"), 1))
printAndExit
))))
}
}
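The expansion above roughly yields this host-side pattern (a sketch; cudaDeviceSynchronize stands in for the wrapped expression 'exp'):
cudaError_t status = cudaDeviceSynchronize();
if (cudaSuccess != status) {
  std::cout << "CUDA error in file (" << __FILE__ << "), line (" << __LINE__ << "): "
            << status << " -> " << cudaGetErrorString(status) << std::endl;
  exit(1);
}
status = cudaGetLastError(); // also catch errors from earlier asynchronous operations
if (cudaSuccess != status) {
  // same print-and-exit path
}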
......@@ -114,7 +114,7 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
loop.getAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION).contains(CUDA_Util.CUDA_BAND_START) =>
// remove the annotation first to guarantee single application of this transformation.
loop.annotate(CUDA_Util.CUDA_LOOP_ANNOTATION)
loop.removeAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION)
val parallelLoops = (x : IR_ForLoop) => {
x.hasAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION) &&
......@@ -140,11 +140,15 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
val kernelCount = kernelFunctions.counterMap.getOrElse(fctNameCollector.getCurrentName, -1) + 1
val reduction = loop.parallelization.reduction
val reduction = Duplicate(loop.parallelization.reduction)
val redTarget = if (reduction.isDefined)
Some(Duplicate(reduction.get.target))
else
None
// local variable for kernels with reductions
val localTarget = if (reduction.isDefined)
Some(IR_VariableAccess(reduction.get.targetName + "_local_" + kernelCount, CUDA_Util.getReductionDatatype(reduction.get.target)))
Some(IR_VariableAccess(reduction.get.targetName + "_local_" + kernelCount, CUDA_Util.getReductionDatatype(redTarget.get)))
else
None
......@@ -152,17 +156,17 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
CUDA_GatherVariableAccesses.clear()
CUDA_GatherVariableAccesses.kernelCount = kernelCount
if (reduction.isDefined)
CUDA_GatherVariableAccesses.reductionTarget = Some(reduction.get.target)
CUDA_GatherVariableAccesses.reductionTarget = redTarget
CUDA_GatherVariableAccesses.applyStandalone(IR_Scope(loop))
// declare and init local reduction target
if (localTarget.isDefined) {
var decl = IR_VariableDeclaration(localTarget.get)
var initLocalTarget = CUDA_Util.getReductionDatatype(reduction.get.target) match {
var initLocalTarget = CUDA_Util.getReductionDatatype(redTarget.get) match {
case _ : IR_ScalarDatatype =>
ListBuffer[IR_Statement](IR_Assignment(localTarget.get, reduction.get.target))
ListBuffer[IR_Statement](IR_Assignment(localTarget.get, redTarget.get))
case mat : IR_MatrixDatatype =>
reduction.get.target match {
redTarget.get match {
case vAcc : IR_VariableAccess =>
IR_GenerateBasicMatrixOperations.loopSetSubmatrixMatPointer(
vAcc, localTarget.get, mat.sizeN, mat.sizeM, mat.sizeN, 0, 0).body
......@@ -218,7 +222,7 @@ object CUDA_ExtractHostAndDeviceCode extends DefaultStrategy("Transform annotate
// replace array accesses with accesses to function arguments
// reduction var is not replaced, but later in IR_HandleReductions
if (reduction.isDefined)
CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = Some(reduction.get.target)
CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = redTarget
else
CUDA_ReplaceNonReductionVarArrayAccesses.reductionTarget = None
CUDA_ReplaceNonReductionVarArrayAccesses.applyStandalone(IR_Scope(kernelBody))
......
......@@ -23,6 +23,7 @@ import scala.collection.mutable._
import exastencils.base.ir._
import exastencils.core.collectors.Collector
import exastencils.datastructures._
import exastencils.domain.ir.IR_IV_NeighborFragmentIdx
import exastencils.field.ir._
import exastencils.logger.Logger
......@@ -85,6 +86,12 @@ class CUDA_GatherFieldAccess extends Collector {
}
}
// also consider neighbor fragment accesses
access.fragIdx match {
case neigh : IR_IV_NeighborFragmentIdx => identifier += s"_n${ neigh.neighIdx }"
case _ =>
}
if (isRead)
fieldAccesses.put("read_" + identifier, access)
if (isWrite)
......
......@@ -25,6 +25,7 @@ import exastencils.base.ir._
import exastencils.config._
import exastencils.datastructures.Transformation._
import exastencils.datastructures._
import exastencils.domain.ir.IR_IV_NeighborFragmentIdx
import exastencils.field.ir._
import exastencils.optimization.ir.IR_SimplifyExpression
......@@ -49,6 +50,12 @@ object CUDA_GatherFieldAccessLike extends QuietDefaultStrategy("Gather local Fie
}
}
// also consider neighbor fragment accesses
access.fragIdx match {
case neigh : IR_IV_NeighborFragmentIdx => identifier += s"_n${ neigh.neighIdx }"
case _ =>
}
identifier
}
......
......@@ -38,17 +38,18 @@ case class CUDA_HandleFragmentLoopsWithReduction(
val iter = IR_LoopOverFragments.defIt
val redTarget = reduction.target
val red = Duplicate(reduction)
val redTarget = Duplicate(red.target)
val reductionDt = CUDA_Util.getReductionDatatype(redTarget)
val counter = CUDA_HandleFragmentLoopsWithReduction.getReductionCounter(reduction.targetName)
val counter = CUDA_HandleFragmentLoopsWithReduction.getReductionCounter(red.targetName)
val copies = {
val innerDt = reductionDt match {
case scalar : IR_ScalarDatatype => scalar
case hodt : IR_HigherDimensionalDatatype => IR_ArrayDatatype(hodt.resolveBaseDatatype, hodt.getSizeArray.product)
}
IR_VariableAccess(reduction.targetName + "_" + counter, IR_ArrayDatatype(innerDt, Knowledge.domain_numFragmentsPerBlock))
IR_VariableAccess(red.targetName + "_fragCpy" + counter, IR_ArrayDatatype(innerDt, Knowledge.domain_numFragmentsPerBlock))
}
val currCopy = IR_ArrayAccess(copies, iter)
......@@ -80,11 +81,17 @@ case class CUDA_HandleFragmentLoopsWithReduction(
matrixAssignment("std::copy", redTarget, currCopy, hodt.getSizeArray.product)
}
def initCopies() = ListBuffer(
IR_VariableDeclaration(copies), // declare copies
IR_LoopOverFragments( // init copies
copyReductionTarget()),
resetReductionTarget()) // reset initial value as it is already in the copies
def initCopies() = {
val declCopies = IR_VariableDeclaration(copies)
val initCopies = IR_LoopOverFragments(
copyReductionTarget()).expandSpecial().inner
val resetRedTarget = resetReductionTarget() // reset initial value as it is already in the copies
ListBuffer(
declCopies,
initCopies,
resetRedTarget)
}
def finalizeReduction(body : ListBuffer[IR_Statement]) = {
// finalize reduction
......@@ -99,10 +106,10 @@ case class CUDA_HandleFragmentLoopsWithReduction(
val src = IR_ArrayAccess(currCopy, idx)
IR_ForLoop(IR_VariableDeclaration(i, IR_IntegerConstant(0)), IR_Lower(i, mat.sizeM), IR_PreIncrement(i), ListBuffer[IR_Statement](
IR_ForLoop(IR_VariableDeclaration(j, 0), IR_Lower(j, mat.sizeN), IR_PreIncrement(j), ListBuffer[IR_Statement](
IR_Assignment(dst, IR_BinaryOperators.createExpression(reduction.op, dst, src))))))
IR_Assignment(dst, IR_BinaryOperators.createExpression(red.op, dst, src))))))
case _ : IR_ScalarDatatype =>
IR_Assignment(redTarget, IR_BinaryOperators.createExpression(reduction.op, redTarget, currCopy))
IR_Assignment(redTarget, IR_BinaryOperators.createExpression(red.op, redTarget, currCopy))
}
body :+ assign
......@@ -110,21 +117,21 @@ case class CUDA_HandleFragmentLoopsWithReduction(
def replaceAccesses(body : ListBuffer[IR_Statement]) = {
// replace occurrences
CUDA_ReplaceReductionAccesses.redTarget = redTarget
CUDA_ReplaceReductionAccesses.replacement = currCopy
CUDA_ReplaceReductionAccesses.redTarget = Duplicate(redTarget)
CUDA_ReplaceReductionAccesses.replacement = Duplicate(currCopy)
CUDA_ReplaceReductionAccesses.applyStandalone(IR_Scope(body))
}
def addHandling(loop : IR_ForLoop) = {
replaceAccesses(loop.body)
loop.body = finalizeReduction(loop.body)
initCopies() :+ loop
initCopies() :+ Duplicate(loop)
}
def addHandling(loop : IR_LoopOverFragments) = {
replaceAccesses(loop.body)
loop.body = finalizeReduction(loop.body)
initCopies() :+ loop
initCopies() :+ Duplicate(loop)
}
override def expand() : OutputType = {
......
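Conceptually, the fragment-loop handling above corresponds to host code of roughly this shape for a scalar '+' reduction (a sketch; the _fragCpy name follows the scheme above, numFragmentsPerBlock stands for Knowledge.domain_numFragmentsPerBlock):
double redTarget_fragCpy0[numFragmentsPerBlock]; // one copy of the reduction target per fragment
for (int f = 0; f < numFragmentsPerBlock; ++f)
  redTarget_fragCpy0[f] = redTarget; // init copies
redTarget = 0; // reset initial value, it is already in the copies
for (int f = 0; f < numFragmentsPerBlock; ++f) {
  // fragment loop body: accesses to redTarget are replaced by redTarget_fragCpy0[f]
  redTarget = redTarget + redTarget_fragCpy0[f]; // finalize with the reduction operator
}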
......@@ -593,12 +593,12 @@ case class CUDA_Kernel(
var body = ListBuffer[IR_Statement]()
if (reduction.isDefined) {
def target = reduction.get.target
def resultDt = CUDA_Util.getReductionDatatype(target)
def baseDt = resultDt.resolveBaseDatatype
val target = Duplicate(reduction.get.target)
val resultDt = CUDA_Util.getReductionDatatype(target)
val baseDt = resultDt.resolveBaseDatatype
def bufSize = requiredThreadsPerDim.product
def bufAccess = CUDA_ReductionDeviceData(bufSize, resultDt)
val bufSize = requiredThreadsPerDim.product
val bufAccess = CUDA_ReductionDeviceData(bufSize, resultDt)
var callArgsReduction = ListBuffer[IR_Expression](bufAccess, bufSize)
body += CUDA_Memset(bufAccess, 0, bufSize, resultDt)
......@@ -621,7 +621,7 @@ case class CUDA_Kernel(
IR_Return(Some(callDefaultReductionKernel))
})
CUDA_KernelFunctions.get.requiredRedKernels += Tuple2(reduction.get.op, target) // request reduction kernel and wrapper
CUDA_KernelFunctions.get.requiredRedKernels += Tuple2(reduction.get.op, Duplicate(target)) // request reduction kernel and wrapper
} else {
body += CUDA_FunctionCall(getKernelFctName, callArgs, numBlocksPerDim, numThreadsPerBlock)
}
......
......@@ -25,6 +25,8 @@ import exastencils.base.ir._
import exastencils.baseExt.ir._
import exastencils.communication.ir._
import exastencils.config.Knowledge
import exastencils.datastructures.DefaultStrategy
import exastencils.datastructures.Transformation
import exastencils.datastructures.Transformation.Output
import exastencils.field.ir._
......@@ -33,7 +35,37 @@ import exastencils.field.ir._
case class CUDA_Allocate(var pointer : IR_Expression, var numElements : IR_Expression, var datatype : IR_Datatype) extends CUDA_HostStatement with IR_Expandable {
override def expand() : Output[IR_Statement] = {
CUDA_CheckError(
IR_FunctionCall("cudaMalloc",
IR_FunctionCall(IR_ExternalFunctionReference("cudaMalloc"),
IR_Cast(IR_PointerDatatype(IR_PointerDatatype(IR_UnitDatatype)), IR_AddressOf(pointer)),
numElements * IR_SizeOf(datatype)))
}
}
/// CUDA_AllocateHost
case class CUDA_AllocateHost(var pointer : IR_Expression, var numElements : IR_Expression, var datatype : IR_Datatype) extends CUDA_HostStatement with IR_Expandable {
override def expand() : Output[IR_Statement] = {
if (Knowledge.cuda_useZeroCopy) {
CUDA_CheckError(
IR_FunctionCall(IR_ExternalFunctionReference("cudaHostAlloc"),
IR_Cast(IR_PointerDatatype(IR_PointerDatatype(IR_UnitDatatype)), IR_AddressOf(pointer)),
numElements * IR_SizeOf(datatype),
"cudaHostAllocMapped"))
} else {
CUDA_CheckError(
IR_FunctionCall(IR_ExternalFunctionReference("cudaMallocHost"),
IR_Cast(IR_PointerDatatype(IR_PointerDatatype(IR_UnitDatatype)), IR_AddressOf(pointer)),
numElements * IR_SizeOf(datatype)))
}
}
}
/// CUDA_AllocateManaged
case class CUDA_AllocateManaged(var pointer : IR_Expression, var numElements : IR_Expression, var datatype : IR_Datatype) extends CUDA_HostStatement with IR_Expandable {
override def expand() : Output[IR_Statement] = {
CUDA_CheckError(
IR_FunctionCall(IR_ExternalFunctionReference("cudaMallocManaged"),
IR_Cast(IR_PointerDatatype(IR_PointerDatatype(IR_UnitDatatype)), IR_AddressOf(pointer)),
numElements * IR_SizeOf(datatype)))
}
......@@ -42,19 +74,38 @@ case class CUDA_Allocate(var pointer : IR_Expression, var numElements : IR_Expre
/// CUDA_Free
case class CUDA_Free(var pointer : IR_Expression) extends CUDA_HostStatement with IR_Expandable {
override def expand() = IR_ExpressionStatement(IR_FunctionCall("cudaFree", pointer))
override def expand() = IR_ExpressionStatement(IR_FunctionCall(IR_ExternalFunctionReference("cudaFree"), pointer))
}
/// CUDA_FreeHost
case class CUDA_FreeHost(var pointer : IR_Expression) extends CUDA_HostStatement with IR_Expandable {
override def expand() = IR_ExpressionStatement(IR_FunctionCall(IR_ExternalFunctionReference("cudaFreeHost"), pointer))
}
/// CUDA_Memcpy
case class CUDA_Memcpy(var dest : IR_Expression, var src : IR_Expression, var sizeInBytes : IR_Expression, var direction : String) extends CUDA_HostStatement with IR_Expandable {
override def expand() = CUDA_CheckError(IR_FunctionCall("cudaMemcpy", dest, src, sizeInBytes, direction))
override def expand() = CUDA_CheckError(IR_FunctionCall(IR_ExternalFunctionReference("cudaMemcpy"), dest, src, sizeInBytes, direction))
}
/// CUDA_MemPrefetch
case class CUDA_MemPrefetch(var pointer : IR_Expression, var sizeInBytes : IR_Expression, var target : String) extends CUDA_HostStatement with IR_Expandable {
override def expand() = CUDA_CheckError(IR_FunctionCall(IR_ExternalFunctionReference("cudaMemPrefetchAsync "), pointer, sizeInBytes, target))
}
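For the managed-memory path, CUDA_AllocateManaged together with CUDA_MemPrefetch roughly expand to the following (sketch; data, numElements and deviceId are illustrative names):
double *data = nullptr;
cudaMallocManaged((void **) &data, numElements * sizeof(double)); // one allocation visible to host and device
// with cuda_genAsyncPrefetch, explicit memcpys are replaced by asynchronous prefetches
cudaMemPrefetchAsync(data, numElements * sizeof(double), deviceId, 0); // migrate pages to the device before a kernel launch
cudaMemPrefetchAsync(data, numElements * sizeof(double), cudaCpuDeviceId, 0); // or back to the host before CPU access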
/// CUDA_Memset