**********************
Plotting and Animation
**********************
.. automodule:: pystencils.plot
:members:
***************************************
Assignment Collection & Simplifications
***************************************
AssignmentCollection
====================
.. autoclass:: pystencils.AssignmentCollection
:members:
SimplificationStrategy
======================
.. autoclass:: pystencils.simp.SimplificationStrategy
:members:
Simplifications
===============
.. automodule:: pystencils.simp.simplifications
:members:
Subexpression insertion
=======================
Subexpression insertion inlines subexpressions whose extraction does not reduce the number of FLOPs (see the sketch below).
For example, a constant value kept as a subexpression leads to a new variable in the generated code that occupies
a register slot; likewise, a subexpression that is just a single variable can simply be inserted into all assignments that use it.
.. automodule:: pystencils.simp.subexpression_insertion
:members:
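A minimal sketch of the idea, assuming the ``AssignmentCollection`` API documented above
(the method name ``new_without_subexpressions`` is an assumption here; check this module's
members for the exact insertion helpers)::

    import sympy as sp
    import pystencils as ps

    x, y, c, result = sp.symbols("x y c result")

    # 'c' is a constant: kept as a subexpression it occupies a register slot
    # without saving any FLOPs, so it is a candidate for insertion.
    ac = ps.AssignmentCollection(
        main_assignments=[ps.Assignment(result, c * x + c * y)],
        subexpressions=[ps.Assignment(c, sp.Rational(1, 2))],
    )
    inlined = ac.new_without_subexpressions()  # c is inserted into the main assignment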
*******
Stencil
*******
.. automodule:: pystencils.stencil
:members:
Tutorials
=========
These tutorials are a good place to start if you are new to pystencils.
All tutorials and demos listed here are based on Jupyter notebooks that you can find in the pystencils repository.
It is a good idea to download and run them yourself, so you can play around with the code.
.. toctree::
:maxdepth: 1
/notebooks/01_tutorial_getting_started.ipynb
/notebooks/02_tutorial_basic_kernels.ipynb
/notebooks/03_tutorial_datahandling.ipynb
/notebooks/04_tutorial_advection_diffusion.ipynb
/notebooks/05_tutorial_phasefield_spinodal_decomposition.ipynb
/notebooks/06_tutorial_phasefield_dentritic_growth.ipynb
/notebooks/demo_assignment_collection.ipynb
/notebooks/demo_plotting_and_animation.ipynb
/notebooks/demo_derivatives.ipynb
/notebooks/demo_benchmark.ipynb
/notebooks/demo_wave_equation.ipynb
from pystencils.equationcollection.equationcollection import EquationCollection
from pystencils.equationcollection.simplificationstrategy import SimplificationStrategy
import sympy as sp
from copy import deepcopy
from pystencils.sympyextensions import fastSubs, countNumberOfOperations, sortEquationsTopologically
class EquationCollection(object):
"""
A collection of equations with subexpression definitions, also represented as equations,
that are used in the main equations. EquationCollections can be passed to simplification methods.
These simplification methods can change the subexpressions, but the number and
left-hand sides of the main equations themselves are not altered.
Additionally, a dictionary of simplification hints is stored, which is set by the functions that create
equation collections to pass information to the simplification system.
:ivar mainEquations: list of sympy equations
:ivar subexpressions: list of sympy equations defining subexpressions used in main equations
:ivar simplificationHints: dictionary that is used to annotate the equation collection with hints that are
used by the simplification system. See documentation of the simplification rules for
potentially required hints and their meaning.
"""
# ----------------------------------------- Creation ---------------------------------------------------------------
def __init__(self, equations, subExpressions, simplificationHints=None, subexpressionSymbolNameGenerator=None):
self.mainEquations = equations
self.subexpressions = subExpressions
if simplificationHints is None:
simplificationHints = {}
self.simplificationHints = simplificationHints
if subexpressionSymbolNameGenerator is None:
self.subexpressionSymbolNameGenerator = SymbolGen()
else:
self.subexpressionSymbolNameGenerator = subexpressionSymbolNameGenerator
@property
def mainTerms(self):
return []
def copy(self, mainEquations=None, subexpressions=None):
res = deepcopy(self)
if mainEquations is not None:
res.mainEquations = mainEquations
if subexpressions is not None:
res.subexpressions = subexpressions
return res
def copyWithSubstitutionsApplied(self, substitutionDict, addSubstitutionsAsSubexpressions=False,
substituteOnLhs=True):
"""
Returns a new equation collection where terms are substituted according to the passed `substitutionDict`.
Substitutions are made in the subexpressions and in the main equations.
"""
if substituteOnLhs:
newSubexpressions = [fastSubs(eq, substitutionDict) for eq in self.subexpressions]
newEquations = [fastSubs(eq, substitutionDict) for eq in self.mainEquations]
else:
newSubexpressions = [sp.Eq(eq.lhs, fastSubs(eq.rhs, substitutionDict)) for eq in self.subexpressions]
newEquations = [sp.Eq(eq.lhs, fastSubs(eq.rhs, substitutionDict)) for eq in self.mainEquations]
if addSubstitutionsAsSubexpressions:
newSubexpressions = [sp.Eq(b, a) for a, b in substitutionDict.items()] + newSubexpressions
newSubexpressions = sortEquationsTopologically(newSubexpressions)
return self.copy(newEquations, newSubexpressions)
def addSimplificationHint(self, key, value):
"""
Adds an entry to the simplificationHints dictionary, and checks that it does not exist yet
"""
assert key not in self.simplificationHints, "This hint already exists"
self.simplificationHints[key] = value
# ---------------------------------------------- Properties -------------------------------------------------------
@property
def allEquations(self):
"""Subexpression and main equations in one sequence"""
return self.subexpressions + self.mainEquations
@property
def freeSymbols(self):
"""All symbols used in the equation collection, which have not been defined inside the equation system"""
freeSymbols = set()
for eq in self.allEquations:
freeSymbols.update(eq.rhs.atoms(sp.Symbol))
return freeSymbols - self.boundSymbols
@property
def boundSymbols(self):
"""Set of all symbols which occur on left-hand-sides i.e. all symbols which are defined."""
boundSymbolsSet = set([eq.lhs for eq in self.allEquations])
assert len(boundSymbolsSet) == len(self.subexpressions) + len(self.mainEquations), \
"Not in SSA form - same symbol assigned multiple times"
return boundSymbolsSet
@property
def definedSymbols(self):
"""All symbols that occur as left-hand-sides of the main equations"""
return set([eq.lhs for eq in self.mainEquations])
@property
def operationCount(self):
"""See :func:`countNumberOfOperations` """
return countNumberOfOperations(self.allEquations)
def get(self, symbols, fromMainEquationsOnly=False):
"""Return the equations which have symbols as left hand sides"""
if not hasattr(symbols, "__len__"):
symbols = [symbols] if isinstance(symbols, sp.Symbol) else list(symbols)
symbols = set(symbols)
if not fromMainEquationsOnly:
eqsToSearchIn = self.allEquations
else:
eqsToSearchIn = self.mainEquations
return [eq for eq in eqsToSearchIn if eq.lhs in symbols]
# ----------------------------------------- Display and Printing -------------------------------------------------
def _repr_html_(self):
def makeHtmlEquationTable(equations):
noBorder = 'style="border:none"'
htmlTable = '<table style="border:none; width: 100%; ">'
line = '<tr {nb}> <td {nb}>$${eq}$$</td> </tr> '
for eq in equations:
formatDict = {'eq': sp.latex(eq),
'nb': noBorder, }
htmlTable += line.format(**formatDict)
htmlTable += "</table>"
return htmlTable
result = ""
if len(self.subexpressions) > 0:
result += "<div>Subexpressions:</div>"
result += makeHtmlEquationTable(self.subexpressions)
result += "<div>Main Equations:</div>"
result += makeHtmlEquationTable(self.mainEquations)
return result
def __repr__(self):
return "Equation Collection for " + ",".join([str(eq.lhs) for eq in self.mainEquations])
def __str__(self):
result = "Subexpressions\n"
for eq in self.subexpressions:
result += str(eq) + "\n"
result += "Main Equations\n"
for eq in self.mainEquations:
result += str(eq) + "\n"
return result
# ------------------------------------- Manipulation ------------------------------------------------------------
def merge(self, other):
"""Returns a new collection which contains self and other. Subexpressions are renamed if they clash."""
ownDefs = set([e.lhs for e in self.mainEquations])
otherDefs = set([e.lhs for e in other.mainEquations])
assert len(ownDefs.intersection(otherDefs)) == 0, "Cannot merge, since both collection define the same symbols"
ownSubexpressionSymbols = {e.lhs: e.rhs for e in self.subexpressions}
substitutionDict = {}
processedOtherSubexpressionEquations = []
for otherSubexpressionEq in other.subexpressions:
if otherSubexpressionEq.lhs in ownSubexpressionSymbols:
if otherSubexpressionEq.rhs == ownSubexpressionSymbols[otherSubexpressionEq.lhs]:
continue # exactly the same subexpression equation exists already
else:
# different definition - a new name has to be introduced
newLhs = next(self.subexpressionSymbolNameGenerator)
newEq = sp.Eq(newLhs, fastSubs(otherSubexpressionEq.rhs, substitutionDict))
processedOtherSubexpressionEquations.append(newEq)
substitutionDict[otherSubexpressionEq.lhs] = newLhs
else:
processedOtherSubexpressionEquations.append(fastSubs(otherSubexpressionEq, substitutionDict))
processedOtherMainEquations = [fastSubs(eq, substitutionDict) for eq in other.mainEquations]
return self.copy(self.mainEquations + processedOtherMainEquations,
self.subexpressions + processedOtherSubexpressionEquations)
def getDependentSymbols(self, symbolSequence):
"""Returns a list of symbols that depend on the passed symbols."""
queue = list(symbolSequence)
def addSymbolsFromExpr(expr):
dependentSymbols = expr.atoms(sp.Symbol)
for ds in dependentSymbols:
queue.append(ds)
handledSymbols = set()
eqMap = {e.lhs: e.rhs for e in self.allEquations}
while len(queue) > 0:
e = queue.pop(0)
if e in handledSymbols:
continue
if e in eqMap:
addSymbolsFromExpr(eqMap[e])
handledSymbols.add(e)
return handledSymbols
def extract(self, symbolsToExtract):
"""
Creates a new equation collection with equations that have symbolsToExtract as left-hand-sides and
only the necessary subexpressions that are used in these equations
"""
symbolsToExtract = set(symbolsToExtract)
dependentSymbols = self.getDependentSymbols(symbolsToExtract)
newEquations = []
for eq in self.allEquations:
if eq.lhs in symbolsToExtract:
newEquations.append(eq)
newSubExpr = [eq for eq in self.subexpressions if eq.lhs in dependentSymbols and eq.lhs not in symbolsToExtract]
return EquationCollection(newEquations, newSubExpr)
def newWithoutUnusedSubexpressions(self):
"""Returns a new equation collection containing only the subexpressions that
are used/referenced in the equations"""
allLhs = [eq.lhs for eq in self.mainEquations]
return self.extract(allLhs)
def insertSubexpression(self, symbol):
newSubexpressions = []
subsDict = None
for se in self.subexpressions:
if se.lhs == symbol:
subsDict = {se.lhs: se.rhs}
else:
newSubexpressions.append(se)
if subsDict is None:
return self
newSubexpressions = [sp.Eq(eq.lhs, fastSubs(eq.rhs, subsDict)) for eq in newSubexpressions]
newEqs = [sp.Eq(eq.lhs, fastSubs(eq.rhs, subsDict)) for eq in self.mainEquations]
return self.copy(newEqs, newSubexpressions)
def insertSubexpressions(self, subexpressionSymbolsToKeep=set()):
"""Returns a new equation collection by inserting all subexpressions into the main equations"""
if len(self.subexpressions) == 0:
return self.copy()
subexpressionSymbolsToKeep = set(subexpressionSymbolsToKeep)
keptSubexpressions = []
if self.subexpressions[0].lhs in subexpressionSymbolsToKeep:
subsDict = {}
keptSubexpressions = [self.subexpressions[0]]
else:
subsDict = {self.subexpressions[0].lhs: self.subexpressions[0].rhs}
subExpr = [e for e in self.subexpressions]
for i in range(1, len(subExpr)):
subExpr[i] = fastSubs(subExpr[i], subsDict)
if subExpr[i].lhs in subexpressionSymbolsToKeep:
keptSubexpressions.append(subExpr[i])
else:
subsDict[subExpr[i].lhs] = subExpr[i].rhs
newEq = [fastSubs(eq, subsDict) for eq in self.mainEquations]
return self.copy(newEq, keptSubexpressions)
def lambdify(self, symbols, module=None, fixedSymbols={}):
"""
Returns a function to evaluate this equation collection
:param symbols: symbol(s) which are the parameter for the created function
:param module: same as the sympy.lambdify parameter of the same name, i.e. which module to use, e.g. 'numpy'
:param fixedSymbols: dictionary with substitutions, that are applied before lambdification
"""
eqs = self.copyWithSubstitutionsApplied(fixedSymbols).insertSubexpressions().mainEquations
lambdas = {eq.lhs: sp.lambdify(symbols, eq.rhs, module) for eq in eqs}
def f(*args, **kwargs):
return {s: f(*args, **kwargs) for s, f in lambdas.items()}
return f
class SymbolGen:
def __init__(self):
self._ctr = 0
def __iter__(self):
return self
def __next__(self):
self._ctr += 1
return sp.Symbol("xi_" + str(self._ctr))
def next(self):
return self.__next__()
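# Usage sketch (illustrative, not part of the original module; assumes the
# imports above resolve). Build a small collection and inline its subexpression:
if __name__ == '__main__':
    xi, x, y = sp.symbols("xi x y")
    ec = EquationCollection([sp.Eq(y, xi + 2 * x)], [sp.Eq(xi, x * x)])
    print(ec)  # prints the subexpression and the main equation
    print(ec.insertSubexpressions().mainEquations)  # [Eq(y, x**2 + 2*x)]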
import sympy as sp
from pystencils.equationcollection.equationcollection import EquationCollection
from pystencils.sympyextensions import replaceAdditive
def sympyCseOnEquationList(eqs):
ec = EquationCollection(eqs, [])
return sympyCSE(ec).allEquations
def sympyCSE(equationCollection):
"""
Searches for common subexpressions inside the equation collection, in both the existing subexpressions as well
as the equations themselves, using sympy's subexpression detection. Returns a new equation collection
with the additional subexpressions found.
"""
symbolGen = equationCollection.subexpressionSymbolNameGenerator
replacements, newEq = sp.cse(equationCollection.subexpressions + equationCollection.mainEquations,
symbols=symbolGen)
replacementEqs = [sp.Eq(*r) for r in replacements]
modifiedSubexpressions = newEq[:len(equationCollection.subexpressions)]
modifiedUpdateEquations = newEq[len(equationCollection.subexpressions):]
newSubexpressions = replacementEqs + modifiedSubexpressions
topologicallySortedPairs = sp.cse_main.reps_toposort([[e.lhs, e.rhs] for e in newSubexpressions])
newSubexpressions = [sp.Eq(a[0], a[1]) for a in topologicallySortedPairs]
return equationCollection.copy(modifiedUpdateEquations, newSubexpressions)
def applyOnAllEquations(equationCollection, operation):
"""Applies the given operation to the right-hand sides of all main equations in the collection"""
result = [sp.Eq(eq.lhs, operation(eq.rhs)) for eq in equationCollection.mainEquations]
return equationCollection.copy(result)
def applyOnAllSubexpressions(equationCollection, operation):
"""Applies the given operation to the right-hand sides of all subexpressions in the collection"""
result = [sp.Eq(eq.lhs, operation(eq.rhs)) for eq in equationCollection.subexpressions]
return equationCollection.copy(equationCollection.mainEquations, result)
def subexpressionSubstitutionInExistingSubexpressions(equationCollection):
"""Goes through the subexpression list and substitutes earlier subexpressions into the right-hand sides of later ones"""
result = []
for outerCtr, s in enumerate(equationCollection.subexpressions):
newRhs = s.rhs
for innerCtr in range(outerCtr):
subExpr = equationCollection.subexpressions[innerCtr]
newRhs = replaceAdditive(newRhs, subExpr.lhs, subExpr.rhs, requiredMatchReplacement=1.0)
newRhs = newRhs.subs(subExpr.rhs, subExpr.lhs)
result.append(sp.Eq(s.lhs, newRhs))
return equationCollection.copy(equationCollection.mainEquations, result)
def subexpressionSubstitutionInMainEquations(equationCollection):
"""Replaces already existing subexpressions in the equations of the equationCollection"""
result = []
for s in equationCollection.mainEquations:
newRhs = s.rhs
for subExpr in equationCollection.subexpressions:
newRhs = replaceAdditive(newRhs, subExpr.lhs, subExpr.rhs, requiredMatchReplacement=1.0)
result.append(sp.Eq(s.lhs, newRhs))
return equationCollection.copy(result)
def addSubexpressionsForDivisions(equationCollection):
r"""Introduces subexpressions for all divisions which have no constant in the denominator.
For example :math:`\frac{1}{x}` is replaced, whereas :math:`\frac{1}{3}` is not."""
divisors = set()
def searchDivisors(term):
if term.func == sp.Pow:
if term.exp.is_integer and term.exp.is_number and term.exp < 0:
divisors.add(term)
else:
for a in term.args:
searchDivisors(a)
for eq in equationCollection.allEquations:
searchDivisors(eq.rhs)
newSymbolGen = equationCollection.subexpressionSymbolNameGenerator
substitutions = {divisor: newSymbol for newSymbol, divisor in zip(newSymbolGen, divisors)}
return equationCollection.copyWithSubstitutionsApplied(substitutions, True)
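# Usage sketch (illustrative, not part of the original module): the shared
# division 1/x becomes a subexpression, while the constant 1/3 is left alone:
if __name__ == '__main__':
    x, a, b = sp.symbols("x a b")
    ec = EquationCollection([sp.Eq(a, 1 / x + sp.Rational(1, 3)), sp.Eq(b, 2 / x)], [])
    print(addSubexpressionsForDivisions(ec))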
from itertools import chain
import numpy as np
import sympy as sp
from sympy.core.cache import cacheit
from sympy.tensor import IndexedBase
from pystencils.data_types import TypedSymbol, createType, createCompositeTypeFromString
from pystencils.sympyextensions import isIntegerSequence
class Field(object):
"""
With fields one can formulate stencil-like update rules on structured grids.
This Field class knows about the dimension, memory layout (strides) and optionally about the size of an array.
Creating Fields:
To create a field use one of the static create* members. There are two options:
1. create a kernel with fixed loop sizes i.e. the shape of the array is already known. This is usually the
case if just-in-time compilation directly from Python is done. (see :func:`Field.createFromNumpyArray`)
2. create a more general kernel that works for variable array sizes. This can be used to create kernels
beforehand for a library. (see :func:`Field.createGeneric`)
Dimensions:
A field has spatial and index dimensions, where the spatial dimensions come first.
The interpretation is that the field has multiple cells in (usually) two or three dimensional space which are
looped over. Additionally N values are stored per cell. In this case spatialDimensions is two or three,
and indexDimensions is one (with index shape N). If you want to store a matrix on each point in a two dimensional grid, there
are four dimensions, two spatial and two index dimensions: ``len(arr.shape) == spatialDims + indexDims``
Indexing:
When accessing (indexing) a field the result is a FieldAccess which is derived from sympy Symbol.
First specify the spatial offsets in [], then in case indexDimension>0 the indices in ()
e.g. ``f[-1,0,0](7)``
Example without index dimensions:
>>> a = np.zeros([10, 10])
>>> f = Field.createFromNumpyArray("f", a, indexDimensions=0)
>>> jacobi = ( f[-1,0] + f[1,0] + f[0,-1] + f[0,1] ) / 4
Example with index dimensions: LBM stream pull (a subset of D2Q9 directions shown)
>>> stencil = np.array([[0,0], [0,1], [0,-1]])
>>> src = Field.createGeneric("src", spatialDimensions=2, indexDimensions=1)
>>> dst = Field.createGeneric("dst", spatialDimensions=2, indexDimensions=1)
>>> for i, offset in enumerate(stencil):
... sp.Eq(dst[0,0](i), src[-offset](i))
Eq(dst_C^0, src_C^0)
Eq(dst_C^1, src_S^1)
Eq(dst_C^2, src_N^2)
"""
@staticmethod
def createGeneric(fieldName, spatialDimensions, dtype=np.float64, indexDimensions=0, layout='numpy',
indexShape=None):
"""
Creates a generic field where the field size is not fixed, i.e. the resulting kernel can be called with arrays of different sizes
:param fieldName: symbolic name for the field
:param dtype: numpy data type of the array the kernel is called with later
:param spatialDimensions: see documentation of Field
:param indexDimensions: see documentation of Field
:param layout: tuple specifying the loop ordering of the spatial dimensions e.g. (2, 1, 0 ) means that
the outer loop loops over dimension 2, the second outer over dimension 1, and the inner loop
over dimension 0. Also allowed: the strings 'numpy' (0,1,..d) or 'reverseNumpy' (d, ..., 1, 0)
:param indexShape: optional shape of the index dimensions i.e. maximum values allowed for each index dimension,
has to be a list or tuple
"""
if isinstance(layout, str):
layout = spatialLayoutStringToTuple(layout, dim=spatialDimensions)
shapeSymbol = IndexedBase(TypedSymbol(Field.SHAPE_PREFIX + fieldName, Field.SHAPE_DTYPE), shape=(1,))
strideSymbol = IndexedBase(TypedSymbol(Field.STRIDE_PREFIX + fieldName, Field.STRIDE_DTYPE), shape=(1,))
totalDimensions = spatialDimensions + indexDimensions
if indexShape is None or len(indexShape) == 0:
shape = tuple([shapeSymbol[i] for i in range(totalDimensions)])
else:
shape = tuple([shapeSymbol[i] for i in range(spatialDimensions)] + list(indexShape))
assert len(shape) == totalDimensions
strides = tuple([strideSymbol[i] for i in range(totalDimensions)])
npDataType = np.dtype(dtype)
if npDataType.fields is not None:
if indexDimensions != 0:
raise ValueError("Structured arrays/fields are not allowed to have an index dimension")
shape += (1,)
strides += (1,)
return Field(fieldName, dtype, layout, shape, strides)
@staticmethod
def createFromNumpyArray(fieldName, npArray, indexDimensions=0):
"""
Creates a field based on the layout, data type, and shape of a given numpy array.
Kernels created for this kind of field can only be called with arrays of the same layout, shape and type.
:param fieldName: symbolic name for the field
:param npArray: numpy array
:param indexDimensions: see documentation of Field
"""
spatialDimensions = len(npArray.shape) - indexDimensions
if spatialDimensions < 1:
raise ValueError("Too many index dimensions. At least one spatial dimension required")
fullLayout = getLayoutOfArray(npArray)
spatialLayout = tuple([i for i in fullLayout if i < spatialDimensions])
assert len(spatialLayout) == spatialDimensions
strides = tuple([s // np.dtype(npArray.dtype).itemsize for s in npArray.strides])
shape = tuple(int(s) for s in npArray.shape)
npDataType = np.dtype(npArray.dtype)
if npDataType.fields is not None:
if indexDimensions != 0:
raise ValueError("Structured arrays/fields are not allowed to have an index dimension")
shape += (1,)
strides += (1,)
return Field(fieldName, npArray.dtype, spatialLayout, shape, strides)
@staticmethod
def createFixedSize(fieldName, shape, indexDimensions=0, dtype=np.float64, layout='numpy'):
"""
Creates a field with fixed size, i.e. the resulting kernel can only be called with arrays of the same size and layout
:param fieldName: symbolic name for the field
:param shape: overall shape of the array
:param indexDimensions: how many of the trailing dimensions are interpreted as index (as opposed to spatial)
:param dtype: numpy data type of the array the kernel is called with later
:param layout: full layout of array, not only spatial dimensions
"""
spatialDimensions = len(shape) - indexDimensions
assert spatialDimensions >= 1
if isinstance(layout, str):
layout = layoutStringToTuple(layout, spatialDimensions + indexDimensions)
shape = tuple(int(s) for s in shape)
strides = computeStrides(shape, layout)
npDataType = np.dtype(dtype)
if npDataType.fields is not None:
if indexDimensions != 0:
raise ValueError("Structured arrays/fields are not allowed to have an index dimension")
shape += (1,)
strides += (1,)
spatialLayout = list(layout)
for i in range(spatialDimensions, len(layout)):
spatialLayout.remove(i)
return Field(fieldName, dtype, tuple(spatialLayout), shape, strides)
def __init__(self, fieldName, dtype, layout, shape, strides):
"""Do not use directly. Use static create* methods"""
self._fieldName = fieldName
self._dtype = createType(dtype)
self._layout = normalizeLayout(layout)
self.shape = shape
self.strides = strides
# index fields are currently only used for boundary handling
# the coordinates are not the loop counters in that case, but are read from this index field
self.isIndexField = False
def newFieldWithDifferentName(self, newName):
return Field(newName, self._dtype, self._layout, self.shape, self.strides)
@property
def spatialDimensions(self):
return len(self._layout)
@property
def indexDimensions(self):
return len(self.shape) - len(self._layout)
@property
def layout(self):
return self._layout
@property
def name(self):
return self._fieldName
@property
def spatialShape(self):
return self.shape[:self.spatialDimensions]
@property
def indexShape(self):
return self.shape[self.spatialDimensions:]
@property
def hasFixedShape(self):
return isIntegerSequence(self.shape)
@property
def hasFixedIndexShape(self):
return isIntegerSequence(self.indexShape)
@property
def spatialStrides(self):
return self.strides[:self.spatialDimensions]
@property
def indexStrides(self):
return self.strides[self.spatialDimensions:]
@property
def dtype(self):
return self._dtype
def __repr__(self):
return self._fieldName
def neighbor(self, coordId, offset):
offsetList = [0] * self.spatialDimensions
offsetList[coordId] = offset
return Field.Access(self, tuple(offsetList))
def neighbors(self, stencil):
return [self.__getitem__(s) for s in stencil]
@property
def center(self):
center = tuple([0] * self.spatialDimensions)
return Field.Access(self, center)
def __getitem__(self, offset):
if type(offset) is np.ndarray:
offset = tuple(offset)
if type(offset) is str:
offset = tuple(directionStringToOffset(offset, self.spatialDimensions))
if type(offset) is not tuple:
offset = (offset,)
if len(offset) != self.spatialDimensions:
raise ValueError("Wrong number of spatial indices: "
"Got %d, expected %d" % (len(offset), self.spatialDimensions))
return Field.Access(self, offset)
def __call__(self, *args, **kwargs):
center = tuple([0]*self.spatialDimensions)
return Field.Access(self, center)(*args, **kwargs)
def __hash__(self):
return hash((self._layout, self.shape, self.strides, self._dtype, self._fieldName))
def __eq__(self, other):
selfTuple = (self.shape, self.strides, self.name, self.dtype)
otherTuple = (other.shape, other.strides, other.name, other.dtype)
return selfTuple == otherTuple
PREFIX = "f"
STRIDE_PREFIX = PREFIX + "stride_"
SHAPE_PREFIX = PREFIX + "shape_"
STRIDE_DTYPE = createCompositeTypeFromString("const int *")
SHAPE_DTYPE = createCompositeTypeFromString("const int *")
DATA_PREFIX = PREFIX + "d_"
class Access(sp.Symbol):
def __new__(cls, name, *args, **kwargs):
obj = Field.Access.__xnew_cached_(cls, name, *args, **kwargs)
return obj
def __new_stage2__(self, field, offsets=(0, 0, 0), idx=None):
fieldName = field.name
offsetsAndIndex = tuple(chain(offsets, idx)) if idx is not None else tuple(offsets)  # materialized, it is iterated twice
constantOffsets = not any([isinstance(o, sp.Basic) for o in offsetsAndIndex])
if not idx:
idx = tuple([0] * field.indexDimensions)
if constantOffsets:
offsetName = offsetToDirectionString(offsets)
if field.indexDimensions == 0:
superscript = None
elif field.indexDimensions == 1:
superscript = str(idx[0])
else:
idxStr = ",".join([str(e) for e in idx])
superscript = idxStr
else:
offsetName = "%0.10X" % (abs(hash(tuple(offsetsAndIndex))))
superscript = None
symbolName = "%s_%s" % (fieldName, offsetName)
if superscript is not None:
symbolName += "^" + superscript
obj = super(Field.Access, self).__xnew__(self, symbolName)
obj._field = field
obj._offsets = []
for o in offsets:
if isinstance(o, sp.Basic):
obj._offsets.append(o)
else:
obj._offsets.append(int(o))
obj._offsetName = offsetName
obj._superscript = superscript
obj._index = idx
return obj
def __getnewargs__(self):
return self.field, self.offsets, self.index
__xnew__ = staticmethod(__new_stage2__)
__xnew_cached_ = staticmethod(cacheit(__new_stage2__))
def __call__(self, *idx):
if self._index != tuple([0]*self.field.indexDimensions):
raise ValueError("Indexing an already indexed Field.Access")
idx = tuple(idx)
if len(idx) != self.field.indexDimensions and idx != (0,):
raise ValueError("Wrong number of indices: "
"Got %d, expected %d" % (len(idx), self.field.indexDimensions))
return Field.Access(self.field, self._offsets, idx)
def __getitem__(self, *idx):
return self.__call__(*idx)
def __iter__(self):
"""This is necessary to work with parts of sympy that test if an object is iterable (e.g. simplify).
The __getitem__ would make it iterable"""
raise TypeError("Field access is not iterable")
@property
def field(self):
return self._field
@property
def offsets(self):
return self._offsets
@offsets.setter
def offsets(self, value):
self._offsets = value
@property
def requiredGhostLayers(self):
return int(np.max(np.abs(self._offsets)))
@property
def nrOfCoordinates(self):
return len(self._offsets)
@property
def offsetName(self):
return self._offsetName
def _latex(self, arg):
if self._superscript:
return "{{%s}_{%s}^{%s}}" % (self._field.name, self._offsetName, self._superscript)
else:
return "{{%s}_{%s}}" % (self._field.name, self._offsetName)
@property
def index(self):
return self._index
def getNeighbor(self, *offsets):
return Field.Access(self.field, offsets, self.index)
def getShifted(self, *shift):
return Field.Access(self.field, tuple(a + b for a, b in zip(shift, self.offsets)), self.index)
def _hashable_content(self):
superClassContents = list(super(Field.Access, self)._hashable_content())
t = tuple(superClassContents + [hash(self._field), self._index] + self._offsets)
return t
def extractCommonSubexpressions(equations):
"""
Uses sympy to find common subexpressions in equations and returns
them in a topologically sorted order, ready for evaluation.
Usually called before the list of equations is passed to :func:`createKernel`
"""
replacements, newEq = sp.cse(equations)
# Workaround for older sympy versions: here subexpressions (temporary = True) are extracted,
# which leads to problems in Piecewise functions which have to have a default case indicated by True
symbolsEqualToTrue = {r[0]: True for r in replacements if r[1] is sp.true}
replacementEqs = [sp.Eq(*r) for r in replacements if r[1] is not sp.true]
equations = replacementEqs + newEq
topologicallySortedPairs = sp.cse_main.reps_toposort([[e.lhs, e.rhs] for e in equations])
equations = [sp.Eq(a[0], a[1].subs(symbolsEqualToTrue)) for a in topologicallySortedPairs]
return equations
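# Usage sketch (illustrative): the shared term (a + b) is pulled out once:
#     a, b, c, d = sp.symbols("a b c d")
#     extractCommonSubexpressions([sp.Eq(c, (a + b)**2), sp.Eq(d, (a + b)**3)])
#     # -> [Eq(x0, a + b), Eq(c, x0**2), Eq(d, x0**3)]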
def getLayoutFromStrides(strides, indexDimensionIds=[]):
coordinates = list(range(len(strides)))
relevantStrides = [stride for i, stride in enumerate(strides) if i not in indexDimensionIds]
result = [x for (y, x) in sorted(zip(relevantStrides, coordinates), key=lambda pair: pair[0], reverse=True)]
return normalizeLayout(result)
def getLayoutOfArray(arr, indexDimensionIds=[]):
"""
Returns a list indicating the memory layout (linearization order) of the numpy array.
Example:
>>> getLayoutOfArray(np.zeros([3,3,3]))
(0, 1, 2)
In this example the loop over the zeroth coordinate should be the outermost loop,
followed by the first and second. Elements arr[x,y,0] and arr[x,y,1] are adjacent in memory.
Normally constructed numpy arrays have this order, however by stride tricks or other frameworks, arrays
with different memory layout can be created.
The indexDimensionIds parameter specifies which dimensions should be ignored when determining the layout (e.g. index dimensions).
"""
return getLayoutFromStrides(arr.strides, indexDimensionIds)
def createNumpyArrayWithLayout(shape, layout):
"""
Creates a numpy array with the given shape and memory layout
:param shape: shape of the resulting array
:param layout: layout as tuple, where the coordinates are ordered from slow to fast
>>> res = createNumpyArrayWithLayout(shape=(2, 3, 4, 5), layout=(3, 2, 0, 1))
>>> res.shape
(2, 3, 4, 5)
>>> getLayoutOfArray(res)
(3, 2, 0, 1)
"""
assert set(layout) == set(range(len(shape))), "Wrong layout descriptor"
currentLayout = list(range(len(shape)))
swaps = []
for i in range(len(layout)):
if currentLayout[i] != layout[i]:
indexToSwapWith = currentLayout.index(layout[i])
swaps.append((i, indexToSwapWith))
currentLayout[i], currentLayout[indexToSwapWith] = currentLayout[indexToSwapWith], currentLayout[i]
assert tuple(currentLayout) == tuple(layout)
shape = list(shape)
for a, b in swaps:
shape[a], shape[b] = shape[b], shape[a]
res = np.empty(shape, order='c')
for a, b in reversed(swaps):
res = res.swapaxes(a, b)
return res
def spatialLayoutStringToTuple(layoutStr, dim):
if layoutStr in ('fzyx', 'zyxf'):
assert dim <= 3
return tuple(reversed(range(dim)))
if layoutStr in ('f', 'reverseNumpy', 'SoA'):
return tuple(reversed(range(dim)))
elif layoutStr in ('c', 'numpy', 'AoS'):
return tuple(range(dim))
raise ValueError("Unknown layout descriptor " + layoutStr)
def layoutStringToTuple(layoutStr, dim):
layoutStr = layoutStr.lower()
if layoutStr == 'fzyx' or layoutStr == 'soa':
assert dim <= 4
return tuple(reversed(range(dim)))
elif layoutStr == 'zyxf' or layoutStr == 'aos':
assert dim <= 4
return tuple(reversed(range(dim - 1))) + (dim-1,)
elif layoutStr == 'f' or layoutStr == 'reverseNumpy':
return tuple(reversed(range(dim)))
elif layoutStr == 'c' or layoutStr == 'numpy':
return tuple(range(dim))
raise ValueError("Unknown layout descriptor " + layoutStr)
def normalizeLayout(layout):
"""Takes a layout tuple and subtracts the minimum from all entries"""
minEntry = min(layout)
return tuple(i - minEntry for i in layout)
def computeStrides(shape, layout):
"""
Computes strides assuming no padding exists
:param shape: shape (size) of array
:param layout: layout specification as tuple
:return: strides in elements, not in bytes
"""
N = len(shape)
assert len(layout) == N
assert len(set(layout)) == N
strides = [0] * N
product = 1
for j in reversed(layout):
strides[j] = product
product *= shape[j]
return tuple(strides)
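# Example: for a C-ordered array of shape (2, 3, 4) with layout (0, 1, 2),
# computeStrides((2, 3, 4), (0, 1, 2)) == (12, 4, 1): the last coordinate
# varies fastest and has stride 1 (strides are in elements, not bytes).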
def offsetComponentToDirectionString(coordinateId, value):
"""
Translates numerical offset to string notation.
x offsets are labeled with east 'E' and 'W',
y offsets with north 'N' and 'S' and
z offsets with top 'T' and bottom 'B'
If the absolute value of the offset is bigger than 1, this number is prefixed.
:param coordinateId: integer 0, 1 or 2 standing for x,y and z
:param value: integer offset
Example:
>>> offsetComponentToDirectionString(0, 1)
'E'
>>> offsetComponentToDirectionString(1, 2)
'2N'
"""
nameComponents = (('W', 'E'), # west, east
('S', 'N'), # south, north
('B', 'T'), # bottom, top
)
if value == 0:
result = ""
elif value < 0:
result = nameComponents[coordinateId][0]
else:
result = nameComponents[coordinateId][1]
if abs(value) > 1:
result = "%d%s" % (abs(value), result)
return result
def offsetToDirectionString(offsetTuple):
"""
Translates numerical offset to string notation.
For details see :func:`offsetComponentToDirectionString`
:param offsetTuple: 3-tuple with x,y,z offset
Example:
>>> offsetToDirectionString([1, -1, 0])
'SE'
>>> offsetToDirectionString(([-3, 0, -2]))
'2B3W'
"""
names = ["", "", ""]
for i in range(len(offsetTuple)):
names[i] = offsetComponentToDirectionString(i, offsetTuple[i])
name = "".join(reversed(names))
if name == "":
name = "C"
return name
def directionStringToOffset(directionStr, dim=3):
"""
Reverse mapping of :func:`offsetToDirectionString`
:param directionStr: string representation of offset
:param dim: dimension of offset, i.e the length of the returned list
>>> directionStringToOffset('NW', dim=3)
array([-1, 1, 0])
>>> directionStringToOffset('NW', dim=2)
array([-1, 1])
>>> directionStringToOffset(offsetToDirectionString([3,-2,1]))
array([ 3, -2, 1])
"""
offsetMap = {
'C': np.array([0, 0, 0]),
'W': np.array([-1, 0, 0]),
'E': np.array([1, 0, 0]),
'S': np.array([0, -1, 0]),
'N': np.array([0, 1, 0]),
'B': np.array([0, 0, -1]),
'T': np.array([0, 0, 1]),
}
offset = np.array([0, 0, 0])
while len(directionStr) > 0:
factor = 1
firstNonDigit = 0
while directionStr[firstNonDigit].isdigit():
firstNonDigit += 1
if firstNonDigit > 0:
factor = int(directionStr[:firstNonDigit])
directionStr = directionStr[firstNonDigit:]
curOffset = offsetMap[directionStr[0]]
offset += factor * curOffset
directionStr = directionStr[1:]
return offset[:dim]
if __name__ == '__main__':
f = Field.createGeneric('f', spatialDimensions=2, indexDimensions=1)
fa = f[0, 1](4) ** 2
print(fa)
print(sp.latex(fa))
from pystencils.gpucuda.kernelcreation import createCUDAKernel, createdIndexedCUDAKernel
from pystencils.gpucuda.cudajit import makePythonFunction
import numpy as np
from pystencils.backends.cbackend import generateC
from pystencils.transformations import symbolNameToVariableName
from pystencils.data_types import StructType, getBaseType
def makePythonFunction(kernelFunctionNode, argumentDict={}):
"""
Creates a kernel function from an abstract syntax tree which
was created e.g. by :func:`pystencils.gpucuda.createCUDAKernel`
or :func:`pystencils.gpucuda.createdIndexedCUDAKernel`
:param kernelFunctionNode: the abstract syntax tree
:param argumentDict: parameters passed here are already fixed. Remaining parameters have to be passed to the
returned kernel functor.
:return: kernel functor
"""
import pycuda.autoinit
from pycuda.compiler import SourceModule
code = "#include <cstdint>\n"
code += "#define FUNC_PREFIX __global__\n"
code += "#define RESTRICT __restrict__\n\n"
code += str(generateC(kernelFunctionNode))
mod = SourceModule(code, options=["-w", "-std=c++11"])
func = mod.get_function(kernelFunctionNode.functionName)
parameters = kernelFunctionNode.parameters
cache = {}
cacheValues = []
def wrapper(**kwargs):
key = hash(tuple((k, id(v)) for k, v in kwargs.items()))
try:
args, dictWithBlockAndThreadNumbers = cache[key]
func(*args, **dictWithBlockAndThreadNumbers)
except KeyError:
fullArguments = argumentDict.copy()
fullArguments.update(kwargs)
shape = _checkArguments(parameters, fullArguments)
indexing = kernelFunctionNode.indexing
dictWithBlockAndThreadNumbers = indexing.getCallParameters(shape)
dictWithBlockAndThreadNumbers['block'] = tuple(int(i) for i in dictWithBlockAndThreadNumbers['block'])
dictWithBlockAndThreadNumbers['grid'] = tuple(int(i) for i in dictWithBlockAndThreadNumbers['grid'])
args = _buildNumpyArgumentList(parameters, fullArguments)
cache[key] = (args, dictWithBlockAndThreadNumbers)
cacheValues.append(kwargs) # keep objects alive such that ids remain unique
func(*args, **dictWithBlockAndThreadNumbers)
#cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called
return wrapper
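# Usage sketch (illustrative; requires pycuda and a CUDA device). 'ast' is the
# result of createCUDAKernel; field arguments are passed by their symbolic
# names as keyword arguments:
#     kernel = makePythonFunction(ast)
#     kernel(src=srcGpuArray, dst=dstGpuArray)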
def _buildNumpyArgumentList(parameters, argumentDict):
import pycuda.driver as cuda
argumentDict = {symbolNameToVariableName(k): v for k, v in argumentDict.items()}
result = []
for arg in parameters:
if arg.isFieldArgument:
field = argumentDict[arg.fieldName]
if arg.isFieldPtrArgument:
actualType = field.dtype
expectedType = arg.dtype.baseType.numpyDtype
if expectedType != actualType:
raise ValueError("Data type mismatch for field '%s'. Expected '%s' got '%s'." %
(arg.fieldName, expectedType, actualType))
result.append(field)
elif arg.isFieldStrideArgument:
dtype = getBaseType(arg.dtype).numpyDtype
strideArr = np.array(field.strides, dtype=dtype) // field.dtype.itemsize
result.append(cuda.In(strideArr))
elif arg.isFieldShapeArgument:
dtype = getBaseType(arg.dtype).numpyDtype
shapeArr = np.array(field.shape, dtype=dtype)
result.append(cuda.In(shapeArr))
else:
assert False
else:
param = argumentDict[arg.name]
expectedType = arg.dtype.numpyDtype
result.append(expectedType.type(param))
assert len(result) == len(parameters)
return result
def _checkArguments(parameterSpecification, argumentDict):
"""
Checks if the parameters passed to the kernel match the description in the AST function node.
If not, a ValueError is raised; on success, returns the array shape that determines the CUDA blocks and threads.
"""
argumentDict = {symbolNameToVariableName(k): v for k, v in argumentDict.items()}
arrayShapes = set()
indexArrShapes = set()
for arg in parameterSpecification:
if arg.isFieldArgument:
try:
fieldArr = argumentDict[arg.fieldName]
except KeyError:
raise KeyError("Missing field parameter for kernel call " + arg.fieldName)
symbolicField = arg.field
if arg.isFieldPtrArgument:
if symbolicField.hasFixedShape:
symbolicFieldShape = tuple(int(i) for i in symbolicField.shape)
if isinstance(symbolicField.dtype, StructType):
symbolicFieldShape = symbolicFieldShape[:-1]
if symbolicFieldShape != fieldArr.shape:
raise ValueError("Passed array '%s' has shape %s which does not match expected shape %s" %
(arg.fieldName, str(fieldArr.shape), str(symbolicField.shape)))
if symbolicField.hasFixedShape:
symbolicFieldStrides = tuple(int(i) * fieldArr.dtype.itemsize for i in symbolicField.strides)
if isinstance(symbolicField.dtype, StructType):
symbolicFieldStrides = symbolicFieldStrides[:-1]
if symbolicFieldStrides != fieldArr.strides:
raise ValueError("Passed array '%s' has strides %s which does not match expected strides %s" %
(arg.fieldName, str(fieldArr.strides), str(symbolicFieldStrides)))
if symbolicField.isIndexField:
indexArrShapes.add(fieldArr.shape[:symbolicField.spatialDimensions])
else:
arrayShapes.add(fieldArr.shape[:symbolicField.spatialDimensions])
if len(arrayShapes) > 1:
raise ValueError("All passed arrays have to have the same size " + str(arrayShapes))
if len(indexArrShapes) > 1:
raise ValueError("All passed index arrays have to have the same size " + str(indexArrShapes))
if len(indexArrShapes) > 0:
return list(indexArrShapes)[0]
else:
return list(arrayShapes)[0]
import abc
import sympy as sp
from pystencils.astnodes import Conditional, Block
from pystencils.slicing import normalizeSlice
from pystencils.data_types import TypedSymbol, createType
from functools import partial
AUTO_BLOCKSIZE_LIMITING = True
BLOCK_IDX = [TypedSymbol("blockIdx." + coord, createType("int")) for coord in ('x', 'y', 'z')]
THREAD_IDX = [TypedSymbol("threadIdx." + coord, createType("int")) for coord in ('x', 'y', 'z')]
class AbstractIndexing(abc.ABCMeta('ABC', (object,), {})):
"""
Abstract base class for all Indexing classes. An Indexing class defines how a multidimensional
field is mapped to CUDA's block and grid system. It calculates indices based on CUDA's thread and block indices
and computes the number of blocks and threads a kernel is started with. The Indexing class is created with
a pystencils field, a slice to iterate over, and further optional parameters that must have default values.
"""
@abc.abstractproperty
def coordinates(self):
"""Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic CUDA block and thread indices.
These symbolic indices can be obtained with the method `indexVariables` """
@property
def indexVariables(self):
"""Sympy symbols for CUDA's block and thread indices"""
return BLOCK_IDX + THREAD_IDX
@abc.abstractmethod
def getCallParameters(self, arrShape):
"""
Determine grid and block size for kernel call
:param arrShape: the numeric (not symbolic) shape of the array
:return: dict with keys 'block' and 'grid', with tuple values for the number of (x,y,z) threads and blocks
the kernel should be started with
"""
@abc.abstractmethod
def guard(self, kernelContent, arrShape):
"""
In some indexing schemes not all threads of a block execute the kernel content.
This function can return a Conditional ast node, defining this execution guard.
:param kernelContent: the actual kernel contents which can e.g. be put into the Conditional node as true block
:param arrShape: the numeric or symbolic shape of the field
:return: ast node, which is put inside the kernel function
"""
# -------------------------------------------- Implementations ---------------------------------------------------------
class BlockIndexing(AbstractIndexing):
"""Generic indexing scheme that maps sub-blocks of an array to CUDA blocks."""
def __init__(self, field, iterationSlice=None,
blockSize=(256, 8, 1), permuteBlockSizeDependentOnLayout=True):
"""
Creates a block indexing scheme.
:param field: pystencils field (common to all Indexing classes)
:param iterationSlice: slice that defines the rectangular subarea which is iterated over
:param blockSize: size of a CUDA block, i.e. the number of threads per (x, y, z) block dimension
:param permuteBlockSizeDependentOnLayout: if True the blockSize is permuted such that the fastest coordinate
gets the largest amount of threads
"""
if field.spatialDimensions > 3:
raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
if permuteBlockSizeDependentOnLayout:
blockSize = self.permuteBlockSizeAccordingToLayout(blockSize, field.layout)
if AUTO_BLOCKSIZE_LIMITING:
blockSize = self.limitBlockSizeToDeviceMaximum(blockSize)
self._blockSize = blockSize
self._iterationSlice = normalizeSlice(iterationSlice, field.spatialShape)
self._dim = field.spatialDimensions
self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatialShape]
@property
def coordinates(self):
offsets = _getStartFromSlice(self._iterationSlice)
coordinates = [blockIndex * bs + threadIdx + off
for blockIndex, bs, threadIdx, off in zip(BLOCK_IDX, self._blockSize, THREAD_IDX, offsets)]
return coordinates[:self._dim]
def getCallParameters(self, arrShape):
substitutionDict = {sym: value for sym, value in zip(self._symbolicShape, arrShape) if sym is not None}
widths = [end - start for start, end in zip(_getStartFromSlice(self._iterationSlice),
_getEndFromSlice(self._iterationSlice, arrShape))]
widths = sp.Matrix(widths).subs(substitutionDict)
grid = tuple(sp.ceiling(length / blockSize) for length, blockSize in zip(widths, self._blockSize))
extendBs = (1,) * (3 - len(self._blockSize))
extendGr = (1,) * (3 - len(grid))
return {'block': self._blockSize + extendBs,
'grid': grid + extendGr}
def guard(self, kernelContent, arrShape):
arrShape = arrShape[:self._dim]
conditions = [c < end
for c, end in zip(self.coordinates, _getEndFromSlice(self._iterationSlice, arrShape))]
condition = conditions[0]
for c in conditions[1:]:
condition = sp.And(condition, c)
return Block([Conditional(condition, kernelContent)])
@staticmethod
def limitBlockSizeToDeviceMaximum(blockSize):
"""
Shrinks the block size to match the device limits, according to the following rules:
1) if the total amount of threads is too big for the current device, the biggest coordinate is divided by 2.
2) next, if one component is still too big, the component which is too big is divided by 2 and the smallest
component is multiplied by 2, such that the total amount of threads stays the same
Returns the altered blockSize
"""
# Get device limits
import pycuda.driver as cuda
import pycuda.autoinit
da = cuda.device_attribute
device = cuda.Context.get_device()
blockSize = list(blockSize)
maxThreads = device.get_attribute(da.MAX_THREADS_PER_BLOCK)
maxBlockSize = [device.get_attribute(a)
for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z)]
def prod(seq):
result = 1
for e in seq:
result *= e
return result
def getIndexOfTooBigElement(blockSize):
for i, bs in enumerate(blockSize):
if bs > maxBlockSize[i]:
return i
return None
def getIndexOfTooSmallElement(blockSize):
for i, bs in enumerate(blockSize):
if bs // 2 <= maxBlockSize[i]:
return i
return None
# Reduce the total number of threads if necessary
while prod(blockSize) > maxThreads:
itemToReduce = blockSize.index(max(blockSize))
for i, bs in enumerate(blockSize):
if bs > maxBlockSize[i]:
itemToReduce = i
blockSize[itemToReduce] //= 2
# Cap individual elements
tooBigElementIndex = getIndexOfTooBigElement(blockSize)
while tooBigElementIndex is not None:
tooSmallElementIndex = getIndexOfTooSmallElement(blockSize)
blockSize[tooSmallElementIndex] *= 2
blockSize[tooBigElementIndex] //= 2
tooBigElementIndex = getIndexOfTooBigElement(blockSize)
return tuple(blockSize)
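# Example: on a device with MAX_THREADS_PER_BLOCK == 1024, a requested block
# size of (1024, 2, 1) has 2048 threads, so its biggest component is halved,
# giving (512, 2, 1), which satisfies both the total and per-dimension limits.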
@staticmethod
def limitBlockSizeByRegisterRestriction(blockSize, requiredRegistersPerThread, device=None):
"""Shrinks the blockSize if there are too many registers used per multiprocessor.
This is not done automatically, since the requiredRegistersPerThread are not known before compilation.
They can be obtained by ``func.num_regs`` from a pycuda function.
:returns smaller blockSize if too many registers are used.
"""
import pycuda.driver as cuda
import pycuda.autoinit
da = cuda.device_attribute
if device is None:
device = cuda.Context.get_device()
availableRegistersPerMP = device.get_attribute(da.MAX_REGISTERS_PER_MULTIPROCESSOR)
block = list(blockSize)  # work on a mutable copy, components are halved below
while True:
numThreads = 1
for t in block:
numThreads *= t
requiredRegistersPerMT = numThreads * requiredRegistersPerThread
if requiredRegistersPerMT <= availableRegistersPerMP:
return block
else:
largestGridEntryIdx = max(range(len(block)), key=lambda e: block[e])
assert block[largestGridEntryIdx] >= 2
block[largestGridEntryIdx] //= 2
@staticmethod
def permuteBlockSizeAccordingToLayout(blockSize, layout):
"""Returns modified blockSize such that the fastest coordinate gets the biggest block dimension"""
sortedBlockSize = list(sorted(blockSize, reverse=True))
while len(sortedBlockSize) > len(layout):
sortedBlockSize[0] *= sortedBlockSize[-1]
sortedBlockSize = sortedBlockSize[:-1]
result = list(blockSize)
for l, bs in zip(reversed(layout), sortedBlockSize):
result[l] = bs
return tuple(result[:len(layout)])
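# Example: for a C-layout field (layout (0, 1, 2)) coordinate 2 is the fastest,
# so permuteBlockSizeAccordingToLayout((256, 8, 1), (0, 1, 2)) returns
# (1, 8, 256): the largest block dimension moves to the fastest coordinate.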
class LineIndexing(AbstractIndexing):
"""
Indexing scheme that assigns the innermost 'line' i.e. the elements which are adjacent in memory to a 1D CUDA block.
The fastest coordinate is indexed with threadIdx.x, the remaining coordinates are mapped to blockIdx.{x,y,z}
This indexing scheme supports up to 4 spatial dimensions, where the innermost dimension must not be larger than the
maximum number of threads allowed in a CUDA block (which depends on the device).
"""
def __init__(self, field, iterationSlice=None):
availableIndices = [THREAD_IDX[0]] + BLOCK_IDX
if field.spatialDimensions > 4:
raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
coordinates = availableIndices[:field.spatialDimensions]
fastestCoordinate = field.layout[-1]
coordinates[0], coordinates[fastestCoordinate] = coordinates[fastestCoordinate], coordinates[0]
self._coordinates = coordinates
self._iterationSlice = normalizeSlice(iterationSlice, field.spatialShape)
self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatialShape]
@property
def coordinates(self):
return [i + offset for i, offset in zip(self._coordinates, _getStartFromSlice(self._iterationSlice))]
def getCallParameters(self, arrShape):
substitutionDict = {sym: value for sym, value in zip(self._symbolicShape, arrShape) if sym is not None}
widths = [end - start for start, end in zip(_getStartFromSlice(self._iterationSlice),
_getEndFromSlice(self._iterationSlice, arrShape))]
widths = sp.Matrix(widths).subs(substitutionDict)
def getShapeOfCudaIdx(cudaIdx):
if cudaIdx not in self._coordinates:
return 1
else:
idx = self._coordinates.index(cudaIdx)
return widths[idx]
return {'block': tuple([getShapeOfCudaIdx(idx) for idx in THREAD_IDX]),
'grid': tuple([getShapeOfCudaIdx(idx) for idx in BLOCK_IDX])}
def guard(self, kernelContent, arrShape):
return kernelContent
# -------------------------------------- Helper functions --------------------------------------------------------------
def _getStartFromSlice(iterationSlice):
res = []
for sliceComponent in iterationSlice:
if type(sliceComponent) is slice:
res.append(sliceComponent.start if sliceComponent.start is not None else 0)
else:
assert isinstance(sliceComponent, int)
res.append(sliceComponent)
return res
def _getEndFromSlice(iterationSlice, arrShape):
iterSlice = normalizeSlice(iterationSlice, arrShape)
res = []
for sliceComponent in iterSlice:
if type(sliceComponent) is slice:
res.append(sliceComponent.stop)
else:
assert isinstance(sliceComponent, int)
res.append(sliceComponent + 1)
return res
def indexingCreatorFromParams(gpuIndexing, gpuIndexingParams):
if isinstance(gpuIndexing, str):
if gpuIndexing == 'block':
indexingCreator = BlockIndexing
elif gpuIndexing == 'line':
indexingCreator = LineIndexing
else:
raise ValueError("Unknown GPU indexing %s. Valid values are 'block' and 'line'" % (gpuIndexing,))
if gpuIndexingParams:
indexingCreator = partial(indexingCreator, **gpuIndexingParams)
return indexingCreator
else:
return gpuIndexing
from functools import partial
from pystencils.gpucuda.indexing import BlockIndexing
from pystencils.transformations import resolveFieldAccesses, typeAllEquations, parseBasePointerInfo, getCommonShape, \
substituteArrayAccessesWithConstants
from pystencils.astnodes import Block, KernelFunction, SympyAssignment, LoopOverCoordinate
from pystencils.data_types import TypedSymbol, BasicType, StructType
from pystencils import Field
from pystencils.gpucuda.cudajit import makePythonFunction
def createCUDAKernel(listOfEquations, functionName="kernel", typeForSymbol=None, indexingCreator=BlockIndexing,
iterationSlice=None, ghostLayers=None):
fieldsRead, fieldsWritten, assignments = typeAllEquations(listOfEquations, typeForSymbol)
allFields = fieldsRead.union(fieldsWritten)
readOnlyFields = set([f.name for f in fieldsRead - fieldsWritten])
fieldAccesses = set()
for eq in listOfEquations:
fieldAccesses.update(eq.atoms(Field.Access))
commonShape = getCommonShape(allFields)
if iterationSlice is None:
# determine iteration slice from ghost layers
if ghostLayers is None:
# determine required number of ghost layers from field access
requiredGhostLayers = max([fa.requiredGhostLayers for fa in fieldAccesses])
ghostLayers = [(requiredGhostLayers, requiredGhostLayers)] * len(commonShape)
iterationSlice = []
if isinstance(ghostLayers, int):
for i in range(len(commonShape)):
iterationSlice.append(slice(ghostLayers, -ghostLayers if ghostLayers > 0 else None))
else:
for i in range(len(commonShape)):
iterationSlice.append(slice(ghostLayers[i][0], -ghostLayers[i][1] if ghostLayers[i][1] > 0 else None))
indexing = indexingCreator(field=list(allFields)[0], iterationSlice=iterationSlice)
block = Block(assignments)
block = indexing.guard(block, commonShape)
ast = KernelFunction(block, functionName=functionName, ghostLayers=ghostLayers)
ast.globalVariables.update(indexing.indexVariables)
coordMapping = indexing.coordinates
basePointerInfo = [['spatialInner0']]
basePointerInfos = {f.name: parseBasePointerInfo(basePointerInfo, [2, 1, 0], f) for f in allFields}
coordMapping = {f.name: coordMapping for f in allFields}
resolveFieldAccesses(ast, readOnlyFields, fieldToFixedCoordinates=coordMapping,
fieldToBasePointerInfo=basePointerInfos)
substituteArrayAccessesWithConstants(ast)
# add the function which determines #blocks and #threads as additional member to KernelFunction node
# this is used by the jit
# If loop counter symbols have been explicitly used in the update equations (e.g. for built in periodicity),
# they are defined here
undefinedLoopCounters = {LoopOverCoordinate.isLoopCounterSymbol(s): s for s in ast.body.undefinedSymbols
if LoopOverCoordinate.isLoopCounterSymbol(s) is not None}
for i, loopCounter in undefinedLoopCounters.items():
ast.body.insertFront(SympyAssignment(loopCounter, indexing.coordinates[i]))
ast.indexing = indexing
ast.compile = partial(makePythonFunction, ast)
return ast
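# Usage sketch (illustrative; requires pycuda and a CUDA device). Build a
# Jacobi-like update on a 2D field and compile it:
#     import sympy as sp
#     f = Field.createGeneric('f', spatialDimensions=2)
#     g = Field.createGeneric('g', spatialDimensions=2)
#     update = sp.Eq(g[0, 0], (f[1, 0] + f[-1, 0] + f[0, 1] + f[0, -1]) / 4)
#     ast = createCUDAKernel([update])
#     kernel = ast.compile()  # call with GPU arrays: kernel(f=..., g=...)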
def createdIndexedCUDAKernel(listOfEquations, indexFields, functionName="kernel", typeForSymbol=None,
coordinateNames=('x', 'y', 'z'), indexingCreator=BlockIndexing):
fieldsRead, fieldsWritten, assignments = typeAllEquations(listOfEquations, typeForSymbol)
allFields = fieldsRead.union(fieldsWritten)
readOnlyFields = set([f.name for f in fieldsRead - fieldsWritten])
for indexField in indexFields:
indexField.isIndexField = True
assert indexField.spatialDimensions == 1, "Index fields have to be 1D"
nonIndexFields = [f for f in allFields if f not in indexFields]
spatialCoordinates = {f.spatialDimensions for f in nonIndexFields}
assert len(spatialCoordinates) == 1, "Non-index fields do not have the same number of spatial coordinates"
spatialCoordinates = list(spatialCoordinates)[0]
def getCoordinateSymbolAssignment(name):
for indexField in indexFields:
assert isinstance(indexField.dtype, StructType), "Index fields have to have a struct datatype"
dataType = indexField.dtype
if dataType.hasElement(name):
rhs = indexField[0](name)
lhs = TypedSymbol(name, BasicType(dataType.getElementType(name)))
return SympyAssignment(lhs, rhs)
raise ValueError("Index %s not found in any of the passed index fields" % (name,))
coordinateSymbolAssignments = [getCoordinateSymbolAssignment(n) for n in coordinateNames[:spatialCoordinates]]
coordinateTypedSymbols = [eq.lhs for eq in coordinateSymbolAssignments]
idxField = list(indexFields)[0]
indexing = indexingCreator(field=idxField, iterationSlice=[slice(None, None, None)] * len(idxField.spatialShape))
functionBody = Block(coordinateSymbolAssignments + assignments)
functionBody = indexing.guard(functionBody, getCommonShape(indexFields))
ast = KernelFunction(functionBody, functionName=functionName)
ast.globalVariables.update(indexing.indexVariables)
coordMapping = indexing.coordinates
basePointerInfo = [['spatialInner0']]
basePointerInfos = {f.name: parseBasePointerInfo(basePointerInfo, [2, 1, 0], f) for f in allFields}
coordMapping = {f.name: coordMapping for f in indexFields}
coordMapping.update({f.name: coordinateTypedSymbols for f in nonIndexFields})
resolveFieldAccesses(ast, readOnlyFields, fieldToFixedCoordinates=coordMapping,
fieldToBasePointerInfo=basePointerInfos)
substituteArrayAccessesWithConstants(ast)
# add the function which determines #blocks and #threads as additional member to KernelFunction node
# this is used by the jit
ast.indexing = indexing
ast.compile = partial(makePythonFunction, ast)
return ast
import sympy as sp
import numpy as np
from pystencils import Field
from pystencils.slicing import normalizeSlice, getPeriodicBoundarySrcDstSlices
from pystencils.gpucuda import makePythonFunction
from pystencils.gpucuda.kernelcreation import createCUDAKernel
def createCopyKernel(domainSize, fromSlice, toSlice, indexDimensions=0, indexDimShape=1, dtype=np.float64):
"""Copies a rectangular part of an array to another non-overlapping part"""
if indexDimensions not in (0, 1):
raise NotImplementedError("Works only for one or zero index coordinates")
f = Field.createGeneric("pdfs", len(domainSize), indexDimensions=indexDimensions, dtype=dtype)
normalizedFromSlice = normalizeSlice(fromSlice, f.spatialShape)
normalizedToSlice = normalizeSlice(toSlice, f.spatialShape)
offset = [s1.start - s2.start for s1, s2 in zip(normalizedFromSlice, normalizedToSlice)]
    assert offset == [s1.stop - s2.stop for s1, s2 in zip(normalizedFromSlice, normalizedToSlice)], \
        "Slices have to have the same size"
updateEqs = []
for i in range(indexDimShape):
eq = sp.Eq(f(i), f[tuple(offset)](i))
updateEqs.append(eq)
ast = createCUDAKernel(updateEqs, iterationSlice=toSlice)
return makePythonFunction(ast)
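# A minimal usage sketch for createCopyKernel (illustrative only; assumes a working
# pycuda setup, and the array size and slices below are made up for the example):
#
#   import numpy as np
#   import pycuda.autoinit  # noqa: F401 -- creates the CUDA context
#   import pycuda.gpuarray as gpuarray
#
#   arr = gpuarray.to_gpu(np.zeros((16, 16)))
#   # copy the left inner column into the right ghost column
#   copyKernel = createCopyKernel((16, 16), fromSlice=(slice(1, 2), slice(None)),
#                                 toSlice=(slice(-1, None), slice(None)))
#   copyKernel(pdfs=arr)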
def getPeriodicBoundaryFunctor(stencil, domainSize, indexDimensions=0, indexDimShape=1, ghostLayers=1,
thickness=None, dtype=float):
srcDstSliceTuples = getPeriodicBoundarySrcDstSlices(stencil, ghostLayers, thickness)
kernels = []
for srcSlice, dstSlice in srcDstSliceTuples:
kernels.append(createCopyKernel(domainSize, srcSlice, dstSlice, indexDimensions, indexDimShape, dtype))
def functor(pdfs, **kwargs):
for kernel in kernels:
kernel(pdfs=pdfs)
return functor
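# A sketch of GPU periodicity handling (the D2Q4-like stencil and array below are
# illustrative; stencil entries are direction offset tuples):
#
#   import numpy as np
#   import pycuda.autoinit  # noqa: F401
#   import pycuda.gpuarray as gpuarray
#
#   stencil = [(1, 0), (-1, 0), (0, 1), (0, -1)]
#   periodicity = getPeriodicBoundaryFunctor(stencil, domainSize=(18, 18))
#   arr = gpuarray.to_gpu(np.zeros((18, 18)))
#   periodicity(arr)  # copies the periodic ghost layers in-place on the GPU array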
from .kerncraft_interface import PyStencilsKerncraftKernel, KerncraftParameters
from jinja2 import Template
from pystencils.cpu import generateC
from pystencils.sympyextensions import prod
from pystencils.data_types import getBaseType
benchmarkTemplate = Template("""
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
{%- if likwid %}
#include <likwid.h>
{%- endif %}
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(double *);
extern int var_false;
{{kernelCode}}
int main(int argc, char **argv)
{
{%- if likwid %}
likwid_markerInit();
likwid_markerThreadInit();
{%- endif %}
{%- for fieldName, dataType, size in fields %}
// Initialization {{fieldName}}
  {{dataType}} * {{fieldName}} = aligned_malloc(sizeof({{dataType}}) * {{size}}, 32);
for (int i = 0; i < {{size}}; ++i)
{{fieldName}}[i] = 0.23;
if(var_false)
dummy({{fieldName}});
{%- endfor %}
{%- for constantName, dataType in constants %}
// Constant {{constantName}}
{{dataType}} {{constantName}};
{{constantName}} = 0.23;
if(var_false)
dummy(& {{constantName}});
{%- endfor %}
int repeat = atoi(argv[1]);
{%- if likwid %}
likwid_markerStartRegion("loop");
{%- endif %}
for (; repeat > 0; --repeat)
{
{{kernelName}}({{callArgumentList}});
// Dummy calls
{%- for fieldName, dataType, size in fields %}
if(var_false) dummy({{fieldName}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy(&{{constantName}});
{%- endfor %}
}
{%- if likwid %}
likwid_markerStopRegion("loop");
{%- endif %}
{%- if likwid %}
likwid_markerClose();
{%- endif %}
}
""")
def generateBenchmark(ast, likwid=False):
accessedFields = {f.name: f for f in ast.fieldsAccessed}
constants = []
fields = []
callParameters = []
for p in ast.parameters:
if not p.isFieldArgument:
constants.append((p.name, str(p.dtype)))
callParameters.append(p.name)
else:
assert p.isFieldPtrArgument, "Benchmark implemented only for kernels with fixed loop size"
field = accessedFields[p.fieldName]
dtype = str(getBaseType(p.dtype))
fields.append((p.fieldName, dtype, prod(field.shape)))
callParameters.append(p.fieldName)
args = {
'likwid': likwid,
'kernelCode': generateC(ast),
'kernelName': ast.functionName,
'fields': fields,
'constants': constants,
'callArgumentList': ",".join(callParameters),
}
return benchmarkTemplate.render(**args)
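# A short sketch of how the template above gets filled (the averaging kernel is
# illustrative; fixed-size fields are required because the benchmark only supports
# kernels with a fixed loop size):
#
#   import sympy as sp
#   from pystencils import Field
#   from pystencils.cpu import createKernel
#
#   src = Field.createFixedSize('src', (30, 30))
#   dst = Field.createFixedSize('dst', (30, 30))
#   ast = createKernel([sp.Eq(dst[0, 0], (src[1, 0] + src[-1, 0]) * 0.5)])
#   with open('benchmark.c', 'w') as f:
#       f.write(generateBenchmark(ast, likwid=False))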
from tempfile import TemporaryDirectory
import sympy as sp
import os
from collections import defaultdict
import subprocess
import kerncraft
import kerncraft.kernel
from kerncraft.machinemodel import MachineModel
from kerncraft.models import ECM, Benchmark
from kerncraft.iaca import iaca_analyse_instrumented_binary, iaca_instrumentation
from pystencils.kerncraft_coupling.generate_benchmark import generateBenchmark
from pystencils.astnodes import LoopOverCoordinate, SympyAssignment, ResolvedFieldAccess
from pystencils.field import getLayoutFromStrides
from pystencils.sympyextensions import countNumberOfOperationsInAst
from pystencils.utils import DotDict
class PyStencilsKerncraftKernel(kerncraft.kernel.Kernel):
"""
Implementation of kerncraft's kernel interface for pystencils CPU kernels.
Analyses a list of equations assuming they will be executed on a CPU
"""
LIKWID_BASE = '/usr/local/likwid'
def __init__(self, ast, machine=None):
super(PyStencilsKerncraftKernel, self).__init__(machine)
self.ast = ast
self.temporaryDir = TemporaryDirectory()
# Loops
        innerLoops = [loop for loop in ast.atoms(LoopOverCoordinate) if loop.isInnermostLoop]
if len(innerLoops) == 0:
raise ValueError("No loop found in pystencils AST")
elif len(innerLoops) > 1:
raise ValueError("pystencils AST contains multiple inner loops - only one can be analyzed")
else:
innerLoop = innerLoops[0]
self._loop_stack = []
curNode = innerLoop
while curNode is not None:
if isinstance(curNode, LoopOverCoordinate):
loopCounterSym = curNode.loopCounterSymbol
loopInfo = (loopCounterSym.name, curNode.start, curNode.stop, curNode.step)
self._loop_stack.append(loopInfo)
curNode = curNode.parent
self._loop_stack = list(reversed(self._loop_stack))
# Data sources & destinations
self.sources = defaultdict(list)
self.destinations = defaultdict(list)
reads, writes = searchResolvedFieldAccessesInAst(innerLoop)
for accesses, targetDict in [(reads, self.sources), (writes, self.destinations)]:
for fa in accesses:
coord = [sp.Symbol(LoopOverCoordinate.getLoopCounterName(i), positive=True, integer=True) + off
for i, off in enumerate(fa.offsets)]
coord += list(fa.idxCoordinateValues)
layout = getLayoutFromStrides(fa.field.strides)
permutedCoord = [coord[i] for i in layout]
targetDict[fa.field.name].append(permutedCoord)
# Variables (arrays)
fieldsAccessed = ast.fieldsAccessed
for field in fieldsAccessed:
layout = getLayoutFromStrides(field.strides)
permutedShape = list(field.shape[i] for i in layout)
self.set_variable(field.name, str(field.dtype), tuple(permutedShape))
for param in ast.parameters:
if not param.isFieldArgument:
self.set_variable(param.name, str(param.dtype), None)
self.sources[param.name] = [None]
# data type
self.datatype = list(self.variables.values())[0][0]
# flops
operationCount = countNumberOfOperationsInAst(innerLoop)
self._flops = {
'+': operationCount['adds'],
'*': operationCount['muls'],
'/': operationCount['divs'],
}
        # drop operation types that do not occur in this kernel
        self._flops = {op: count for op, count in self._flops.items() if count > 0}
self.check()
def iaca_analysis(self, micro_architecture, asm_block='auto',
pointer_increment='auto_with_manual_fallback', verbose=False):
compiler, compiler_args = self._machine.get_compiler()
if '-std=c99' not in compiler_args:
compiler_args += ['-std=c99']
headerPath = kerncraft.get_header_path()
compilerCmd = [compiler] + compiler_args + ['-I' + headerPath]
srcFile = os.path.join(self.temporaryDir.name, "source.c")
asmFile = os.path.join(self.temporaryDir.name, "source.s")
iacaAsmFile = os.path.join(self.temporaryDir.name, "source.iaca.s")
dummySrcFile = os.path.join(headerPath, "dummy.c")
dummyAsmFile = os.path.join(self.temporaryDir.name, "dummy.s")
binaryFile = os.path.join(self.temporaryDir.name, "binary")
# write source code to file
with open(srcFile, 'w') as f:
f.write(generateBenchmark(self.ast, likwid=False))
# compile to asm files
subprocess.check_output(compilerCmd + [srcFile, '-S', '-o', asmFile])
subprocess.check_output(compilerCmd + [dummySrcFile, '-S', '-o', dummyAsmFile])
with open(asmFile) as read, open(iacaAsmFile, 'w') as write:
instrumentedAsmBlock = iaca_instrumentation(read, write)
# assemble asm files to executable
subprocess.check_output(compilerCmd + [iacaAsmFile, dummyAsmFile, '-o', binaryFile])
result = iaca_analyse_instrumented_binary(binaryFile, micro_architecture)
return result, instrumentedAsmBlock
def build(self, lflags=None, verbose=False):
compiler, compiler_args = self._machine.get_compiler()
if '-std=c99' not in compiler_args:
compiler_args.append('-std=c99')
headerPath = kerncraft.get_header_path()
cmd = [compiler] + compiler_args + [
'-I' + os.path.join(self.LIKWID_BASE, 'include'),
'-L' + os.path.join(self.LIKWID_BASE, 'lib'),
'-I' + headerPath,
'-Wl,-rpath=' + os.path.join(self.LIKWID_BASE, 'lib'),
]
dummySrcFile = os.path.join(headerPath, 'dummy.c')
srcFile = os.path.join(self.temporaryDir.name, "source_likwid.c")
binFile = os.path.join(self.temporaryDir.name, "benchmark")
with open(srcFile, 'w') as f:
f.write(generateBenchmark(self.ast, likwid=True))
subprocess.check_output(cmd + [srcFile, dummySrcFile, '-pthread', '-llikwid', '-o', binFile])
return binFile
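# A rough usage sketch (requires a kerncraft machine description YAML; the file name
# below is illustrative and the exact model constructor signature depends on the
# installed kerncraft version):
#
#   machine = MachineModel(path_to_yaml='SandyBridgeEP_E5-2680v3.yml')
#   kernel = PyStencilsKerncraftKernel(ast, machine)
#   model = ECM(kernel, machine, KerncraftParameters())
#   model.analyze()
#   model.report()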
class KerncraftParameters(DotDict):
def __init__(self):
self['asm_block'] = 'auto'
self['asm_increment'] = 0
self['cores'] = 1
self['cache_predictor'] = 'SIM'
self['verbose'] = 0
self['pointer_increment'] = 'auto'
# ------------------------------------------- Helper functions ---------------------------------------------------------
def searchResolvedFieldAccessesInAst(ast):
def visit(node, reads, writes):
if not isinstance(node, SympyAssignment):
for a in node.args:
visit(a, reads, writes)
return
for expr, accesses in [(node.lhs, writes), (node.rhs, reads)]:
accesses.update(expr.atoms(ResolvedFieldAccess))
readAccesses = set()
writeAccesses = set()
visit(ast, readAccesses, writeAccesses)
return readAccesses, writeAccesses
from pystencils.equationcollection import EquationCollection
from pystencils.gpucuda.indexing import indexingCreatorFromParams
def createKernel(equations, target='cpu', dataType="double", iterationSlice=None, ghostLayers=None,
                 cpuOpenMP=True, cpuVectorizeInfo=None,
                 gpuIndexing='block', gpuIndexingParams=None):
"""
Creates abstract syntax tree (AST) of kernel, using a list of update equations.
:param equations: either be a plain list of equations or a EquationCollection object
:param target: 'cpu', 'llvm' or 'gpu'
:param dataType: data type used for all untyped symbols (i.e. non-fields), can also be a dict from symbol name
to type
:param iterationSlice: rectangular subset to iterate over, if not specified the complete non-ghost layer part of the
field is iterated over
:param ghostLayers: if left to default, the number of necessary ghost layers is determined automatically
a single integer specifies the ghost layer count at all borders, can also be a sequence of
pairs [(xLowerGl, xUpperGl), .... ]
CPU specific Parameters:
:param cpuOpenMP: True or number of threads for OpenMP parallelization, False for no OpenMP
:param cpuVectorizeInfo: pair of instruction set name ('sse, 'avx', 'avx512') and data type ('float', 'double')
GPU specific Parameters
:param gpuIndexing: either 'block' or 'line' , or custom indexing class (see gpucuda/indexing.py)
:param gpuIndexingParams: dict with indexing parameters (constructor parameters of indexing class)
e.g. for 'block' one can specify {'blockSize': (20, 20, 10) }
:return: abstract syntax tree object, that can either be printed as source code or can be compiled with
through its compile() function
"""
    # ---- Normalizing parameters
    if gpuIndexingParams is None:
        gpuIndexingParams = {}
    splitGroups = ()
if isinstance(equations, EquationCollection):
if 'splitGroups' in equations.simplificationHints:
splitGroups = equations.simplificationHints['splitGroups']
equations = equations.allEquations
# ---- Creating ast
if target == 'cpu':
from pystencils.cpu import createKernel
from pystencils.cpu import addOpenMP
ast = createKernel(equations, typeForSymbol=dataType, splitGroups=splitGroups,
iterationSlice=iterationSlice, ghostLayers=ghostLayers)
if cpuOpenMP:
addOpenMP(ast, numThreads=cpuOpenMP)
if cpuVectorizeInfo:
import pystencils.backends.simd_instruction_sets as vec
from pystencils.vectorization import vectorize
vecParams = cpuVectorizeInfo
vec.selectedInstructionSet = vec.x86VectorInstructionSet(instructionSet=vecParams[0], dataType=vecParams[1])
vectorize(ast)
return ast
elif target == 'llvm':
from pystencils.llvm import createKernel
ast = createKernel(equations, typeForSymbol=dataType, splitGroups=splitGroups,
iterationSlice=iterationSlice, ghostLayers=ghostLayers)
return ast
elif target == 'gpu':
from pystencils.gpucuda import createCUDAKernel
ast = createCUDAKernel(equations, typeForSymbol=dataType,
indexingCreator=indexingCreatorFromParams(gpuIndexing, gpuIndexingParams),
iterationSlice=iterationSlice, ghostLayers=ghostLayers)
return ast
else:
raise ValueError("Unknown target %s. Has to be one of 'cpu', 'gpu' or 'llvm' " % (target,))
def createIndexedKernel(equations, indexFields, target='cpu', dataType="double", coordinateNames=('x', 'y', 'z'),
                        cpuOpenMP=True,
                        gpuIndexing='block', gpuIndexingParams=None):
"""
Similar to :func:`createKernel`, but here not all cells of a field are updated but only cells with
coordinates which are stored in an index field. This traversal method can e.g. be used for boundary handling.
The coordinates are stored in a separated indexField, which is a one dimensional array with struct data type.
This struct has to contain fields named 'x', 'y' and for 3D fields ('z'). These names are configurable with the
'coordinateNames' parameter. The struct can have also other fields that can be read and written in the kernel, for
example boundary parameters.
indexFields: list of index fields, i.e. 1D fields with struct data type
coordinateNames: name of the coordinate fields in the struct data type
"""
    if gpuIndexingParams is None:
        gpuIndexingParams = {}
    if isinstance(equations, EquationCollection):
        equations = equations.allEquations
if target == 'cpu':
from pystencils.cpu import createIndexedKernel
from pystencils.cpu import addOpenMP
        ast = createIndexedKernel(equations, indexFields=indexFields, typeForSymbol=dataType,
                                  coordinateNames=coordinateNames)
if cpuOpenMP:
addOpenMP(ast, numThreads=cpuOpenMP)
return ast
elif target == 'llvm':
raise NotImplementedError("Indexed kernels are not yet supported in LLVM backend")
elif target == 'gpu':
from pystencils.gpucuda import createdIndexedCUDAKernel
ast = createdIndexedCUDAKernel(equations, indexFields, typeForSymbol=dataType, coordinateNames=coordinateNames,
indexingCreator=indexingCreatorFromParams(gpuIndexing, gpuIndexingParams))
return ast
else:
raise ValueError("Unknown target %s. Has to be either 'cpu' or 'gpu'" % (target,))
from .kernelcreation import createKernel, createIndexedKernel
from .llvmjit import compileLLVM, generate_and_jit, Jit, makePythonFunction
from .llvm import generateLLVM
import llvmlite.ir as ir
class Loop(object):
def __init__(self, builder, start_val, stop_val, step_val=1, loop_name='loop', phi_name="_phi"):
self.builder = builder
self.start_val = start_val
self.stop_val = stop_val
self.step_val = step_val
self.loop_name = loop_name
self.phi_name = phi_name
def __enter__(self):
self.loop_end, self.after, phi = self._for_loop(self.start_val, self.stop_val, self.step_val, self.loop_name,
self.phi_name)
return phi
def _for_loop(self, start_val, stop_val, step_val, loop_name, phi_name):
        # TODO: clarify the integer width and signedness of the loop counter
        # (currently fixed to i64 and compared with an unsigned comparison below)
integer = ir.IntType(64)
# Loop block
pre_loop_bb = self.builder.block
loop_bb = self.builder.append_basic_block(name='loop_' + loop_name)
self.builder.branch(loop_bb)
# Insert an explicit fall through from the current block to loop_bb
self.builder.position_at_start(loop_bb)
# Add phi
phi = self.builder.phi(integer, name=phi_name)
phi.add_incoming(start_val, pre_loop_bb)
loop_end_bb = self.builder.append_basic_block(name=loop_name + "_end_bb")
self.builder.position_at_start(loop_end_bb)
next_var = self.builder.add(phi, step_val, name=loop_name + '_next_it')
cond = self.builder.icmp_unsigned('<', next_var, stop_val, name=loop_name + "_cond")
after_bb = self.builder.append_basic_block(name=loop_name + "_after_bb")
self.builder.cbranch(cond, loop_bb, after_bb)
phi.add_incoming(next_var, loop_end_bb)
self.builder.position_at_end(loop_bb)
return loop_end_bb, after_bb, phi
def __exit__(self, exc_type, exc, exc_tb):
self.builder.branch(self.loop_end)
self.builder.position_at_end(self.after)
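# A minimal llvmlite sketch showing the Loop context manager (illustrative; the
# generated function just runs an empty loop from 0 to its argument n). Note that
# start, stop and step have to be IR values, not Python ints:
#
#   import llvmlite.ir as ir
#
#   i64 = ir.IntType(64)
#   module = ir.Module()
#   fn = ir.Function(module, ir.FunctionType(ir.VoidType(), [i64]), name='empty_loop')
#   builder = ir.IRBuilder(fn.append_basic_block('entry'))
#   start = ir.Constant(i64, 0)
#   step = ir.Constant(i64, 1)
#   with Loop(builder, start, fn.args[0], step, 'i') as counter:
#       pass  # loop body instructions would be emitted here via `builder`;
#             # `counter` is the phi node holding the current iteration value
#   builder.ret_void()
#   print(module)  # dump the generated LLVM IR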