Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Showing 397 additions and 43063 deletions.
......@@ -5,6 +5,7 @@ API Reference
:maxdepth: 3
kernel_compile_and_call.rst
enums.rst
simplifications.rst
datahandling.rst
configuration.rst
......
************
Enumerations
************
.. automodule:: pystencils.enums
:members:
......@@ -8,9 +8,14 @@ Creating kernels
.. autofunction:: pystencils.create_kernel
.. autofunction:: pystencils.create_indexed_kernel
.. autoclass:: pystencils.CreateKernelConfig
:members:
.. autofunction:: pystencils.create_staggered_kernel
.. autofunction:: pystencils.kernelcreation.create_domain_kernel
.. autofunction:: pystencils.kernelcreation.create_indexed_kernel
.. autofunction:: pystencils.kernelcreation.create_staggered_kernel
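A minimal usage sketch of ``create_kernel`` (the field names, shape, and the Jacobi-style update are illustrative, not prescribed by pystencils):

.. code-block:: python

    import pystencils as ps

    src, dst = ps.fields("src, dst: float64[2D]")
    update = ps.Assignment(dst[0, 0],
                           (src[1, 0] + src[-1, 0] + src[0, 1] + src[0, -1]) / 4)
    kernel = ps.create_kernel(update).compile()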
Code printing
......@@ -22,11 +27,11 @@ Code printing
GPU Indexing
-------------
.. autoclass:: pystencils.gpucuda.AbstractIndexing
.. autoclass:: pystencils.gpu.AbstractIndexing
:members:
.. autoclass:: pystencils.gpucuda.BlockIndexing
.. autoclass:: pystencils.gpu.BlockIndexing
:members:
.. autoclass:: pystencils.gpucuda.LineIndexing
.. autoclass:: pystencils.gpu.LineIndexing
:members:
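A sketch of selecting an indexing scheme via ``CreateKernelConfig`` (the ``gpu_indexing`` and ``gpu_indexing_params`` argument names are assumed here; check the class documentation above for the exact spelling):

.. code-block:: python

    import pystencils as ps

    config = ps.CreateKernelConfig(target=ps.Target.GPU,
                                   gpu_indexing='block',
                                   gpu_indexing_params={'block_size': (32, 8, 1)})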
......@@ -10,13 +10,27 @@ AssignmentCollection
:members:
SimplificationStrategy
======================
.. autoclass:: pystencils.simp.SimplificationStrategy
:members:
Simplifications
===============
.. automodule:: pystencils.simp
:members:
.. automodule:: pystencils.simp.simplifications
:members:
Subexpression insertion
=======================
The subexpression insertion routines insert subexpressions back into the main assignments when keeping them separate would not reduce the number of FLOPs.
For example, a constant value kept as a subexpression leads to an extra variable in the generated code, which occupies a register slot;
such a single value can instead simply be inserted into all assignments that use it.
.. automodule:: pystencils.simp.subexpression_insertion
:members:
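A minimal sketch of running a simplification on an assignment collection (the expressions here are made up for illustration):

.. code-block:: python

    import sympy as sp
    import pystencils as ps
    from pystencils.simp import sympy_cse

    f = ps.fields("f: float64[2D]")
    x, y = sp.symbols("x, y")
    ac = ps.AssignmentCollection(
        [ps.Assignment(f[0, 0], (x + y) ** 2 + sp.sqrt((x + y) ** 2))])
    ac = sympy_cse(ac)  # extracts (x + y)**2 as a common subexpression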
......
import subprocess
from distutils.version import StrictVersion
def version_number_from_git(tag_prefix='release/', sha_length=10, version_format="{version}.dev{commits}+{sha}"):
def get_released_versions():
tags = sorted(subprocess.getoutput('git tag').split('\n'))
versions = [t[len(tag_prefix):] for t in tags if t.startswith(tag_prefix)]
return versions
def tag_from_version(v):
return tag_prefix + v
def increment_version(v):
parsed_version = [int(i) for i in v.split('.')]
parsed_version[-1] += 1
return '.'.join(str(i) for i in parsed_version)
version_strings = get_released_versions()
version_strings.sort(key=StrictVersion)
latest_release = version_strings[-1]
commits_since_tag = subprocess.getoutput('git rev-list {}..HEAD --count'.format(tag_from_version(latest_release)))
sha = subprocess.getoutput('git rev-parse HEAD')[:sha_length]
is_dirty = len(subprocess.getoutput("git status --untracked-files=no -s")) > 0
if int(commits_since_tag) == 0:
version_string = latest_release
else:
next_version = increment_version(latest_release)
version_string = version_format.format(version=next_version, commits=commits_since_tag, sha=sha)
if is_dirty:
version_string += ".dirty"
return version_string
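# Minimal usage sketch (assumes this file is executed inside a git checkout
# that has at least one 'release/...' tag):
if __name__ == '__main__':
    print(version_number_from_git())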
#!/usr/bin/env bash
# Checks run before every push
# has to be copied to .git/hooks
echo "Running pre-push hook"
echo "Running flake8 check"
flake8 --append-config=.flake8 pystencils
# $? stores exit value of the last command
if [ $? -ne 0 ]; then
echo "flake8 failed"
exit 1
fi
python3 setup.py quicktest
# $? stores exit value of the last command
if [ $? -ne 0 ]; then
echo "quicktest failed"
exit 1
fi
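# Installation sketch (from the repository root; the hook must be named
# 'pre-push' inside .git/hooks, as noted above):
#   cp pre-push .git/hooks/pre-push && chmod +x .git/hooks/pre-push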
[project]
name = "pystencils"
description = "Speeding up stencil computations on CPUs and GPUs"
dynamic = ["version"]
readme = "README.md"
authors = [
{ name = "Martin Bauer" },
{ name = "Jan Hönig " },
{ name = "Markus Holzer" },
{ name = "Frederik Hennig" },
{ email = "cs10-codegen@fau.de" },
]
license = { file = "COPYING.txt" }
requires-python = ">=3.10"
dependencies = ["sympy>=1.9,<=1.12.1", "numpy>=1.8.0", "appdirs", "joblib", "pyyaml", "fasteners"]
classifiers = [
"Development Status :: 4 - Beta",
"Framework :: Jupyter",
"Topic :: Software Development :: Code Generators",
"Topic :: Scientific/Engineering :: Physics",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
]
[project.urls]
"Bug Tracker" = "https://i10git.cs.fau.de/pycodegen/pystencils/-/issues"
"Documentation" = "https://pycodegen.pages.i10git.cs.fau.de/pystencils/"
"Source Code" = "https://i10git.cs.fau.de/pycodegen/pystencils"
[project.optional-dependencies]
gpu = ['cupy']
alltrafos = ['islpy', 'py-cpuinfo']
bench_db = ['blitzdb', 'pymongo', 'pandas']
interactive = [
'matplotlib',
'ipy_table',
'imageio',
'jupyter',
'pyevtk',
'rich',
'graphviz',
]
use_cython = [
'Cython'
]
doc = [
'sphinx',
'sphinx_rtd_theme',
'nbsphinx',
'sphinxcontrib-bibtex',
'sphinx_autodoc_typehints',
'pandoc',
]
tests = [
'pytest',
'pytest-cov',
'pytest-html',
'ansi2html',
'pytest-xdist',
'flake8',
'nbformat',
'nbconvert',
'ipython',
'matplotlib',
'py-cpuinfo',
'randomgen>=1.18',
]
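# The optional extras above can be combined at install time; an illustrative
# invocation:
#   pip install pystencils[gpu,interactive,tests]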
[build-system]
requires = [
"setuptools>=61",
"versioneer[toml]>=0.29",
# 'Cython'
]
build-backend = "setuptools.build_meta"
[tool.setuptools.package-data]
pystencils = [
"include/*.h",
"boundaries/createindexlistcython.pyx"
]
[tool.setuptools.packages.find]
where = ["src"]
include = ["pystencils", "pystencils.*"]
namespaces = false
[tool.versioneer]
# See the docstring in versioneer.py for instructions. Note that you must
# re-run 'versioneer.py setup' after changing this section, and commit the
# resulting files.
VCS = "git"
style = "pep440"
versionfile_source = "src/pystencils/_version.py"
versionfile_build = "pystencils/_version.py"
tag_prefix = "release/"
parentdir_prefix = "pystencils-"
"""
Provides tools for generating auto-differentiable operations.
See https://github.com/theHamsta/pystencils_autodiff
Installation:
.. code-block:: bash
pip install pystencils-autodiff
"""
raise NotImplementedError('pystencils-autodiff is not installed. Run `pip install pystencils-autodiff`')
from os.path import dirname, join
from pystencils.astnodes import Node
from pystencils.backends.cbackend import CBackend, CustomSympyPrinter, generate_c
from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
from pystencils.interpolation_astnodes import DiffInterpolatorAccess, InterpolationMode
with open(join(dirname(__file__), 'cuda_known_functions.txt')) as f:
    lines = f.readlines()
CUDA_KNOWN_FUNCTIONS = {fn.strip(): fn.strip() for fn in lines if fn.strip()}
def generate_cuda(astnode: Node, signature_only: bool = False) -> str:
"""Prints an abstract syntax tree node as CUDA code.
Args:
astnode: KernelFunction node to generate code for
signature_only: if True, only the signature is printed
Returns:
C-like code for the ast node and its descendants
"""
return generate_c(astnode, signature_only, dialect='cuda')
class CudaBackend(CBackend):
def __init__(self, sympy_printer=None,
signature_only=False):
if not sympy_printer:
sympy_printer = CudaSympyPrinter()
super().__init__(sympy_printer, signature_only, dialect='cuda')
def _print_SharedMemoryAllocation(self, node):
code = "__shared__ {dtype} {name}[{num_elements}];"
return code.format(dtype=node.symbol.dtype,
name=self.sympy_printer.doprint(node.symbol.name),
num_elements='*'.join([str(s) for s in node.shared_mem.shape]))
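    # For a 16x16 tile of doubles this emits, e.g. (symbol name illustrative):
    #   __shared__ double sharedTile[16*16];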
@staticmethod
def _print_ThreadBlockSynchronization(node):
code = "__synchtreads();"
return code
def _print_TextureDeclaration(self, node):
if node.texture.field.dtype.numpy_dtype.itemsize > 4:
code = "texture<fp_tex_%s, cudaTextureType%iD, cudaReadModeElementType> %s;" % (
str(node.texture.field.dtype),
node.texture.field.spatial_dimensions,
node.texture
)
else:
code = "texture<%s, cudaTextureType%iD, cudaReadModeElementType> %s;" % (
str(node.texture.field.dtype),
node.texture.field.spatial_dimensions,
node.texture
)
return code
def _print_SkipIteration(self, _):
return "return;"
class CudaSympyPrinter(CustomSympyPrinter):
language = "CUDA"
def __init__(self):
super(CudaSympyPrinter, self).__init__()
self.known_functions.update(CUDA_KNOWN_FUNCTIONS)
def _print_InterpolatorAccess(self, node):
dtype = node.interpolator.field.dtype.numpy_dtype
if type(node) == DiffInterpolatorAccess:
# cubicTex3D_1st_derivative_x(texture tex, float3 coord)
template = f"cubicTex%iD_1st_derivative_{list(reversed('xyz'[:node.ndim]))[node.diff_coordinate_idx]}(%s, %s)" # noqa
elif node.interpolator.interpolation_mode == InterpolationMode.CUBIC_SPLINE:
template = "cubicTex%iDSimple(%s, %s)"
else:
if dtype.itemsize > 4:
# Use PyCuda hack!
# https://github.com/inducer/pycuda/blob/master/pycuda/cuda/pycuda-helpers.hpp
template = "fp_tex%iD(%s, %s)"
else:
template = "tex%iD(%s, %s)"
code = template % (
node.interpolator.field.spatial_dimensions,
str(node.interpolator),
# + 0.5 comes from Nvidia's staggered indexing
', '.join(self._print(o + 0.5) for o in reversed(node.offsets))
)
return code
def _print_Function(self, expr):
if isinstance(expr, fast_division):
return "__fdividef(%s, %s)" % tuple(self._print(a) for a in expr.args)
elif isinstance(expr, fast_sqrt):
return "__fsqrt_rn(%s)" % tuple(self._print(a) for a in expr.args)
elif isinstance(expr, fast_inv_sqrt):
return "__frsqrt_rn(%s)" % tuple(self._print(a) for a in expr.args)
return super()._print_Function(expr)
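# For instance, fast_division(a, b) prints as "__fdividef(a, b)" and
# fast_sqrt(x) as "__fsqrt_rn(x)" (argument names illustrative).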
__prof_trigger
printf
__syncthreads
__syncthreads_count
__syncthreads_and
__syncthreads_or
__syncwarp
__threadfence
__threadfence_block
__threadfence_system
atomicAdd
atomicSub
atomicExch
atomicMin
atomicMax
atomicInc
atomicDec
atomicAnd
atomicOr
atomicXor
atomicCAS
__all_sync
__any_sync
__ballot_sync
__active_mask
__shfl_sync
__shfl_up_sync
__shfl_down_sync
__shfl_xor_sync
__match_any_sync
__match_all_sync
__isGlobal
__isShared
__isConstant
__isLocal
tex1Dfetch
tex1D
tex2D
tex3D
sqrtf
rsqrtf
cbrtf
rcbrtf
hypotf
rhypotf
norm3df
rnorm3df
norm4df
rnorm4df
normf
rnormf
expf
exp2f
exp10f
expm1f
logf
log2f
log10f
log1pf
sinf
cosf
tanf
sincosf
sinpif
cospif
sincospif
asinf
acosf
atanf
atan2f
sinhf
coshf
tanhf
asinhf
acoshf
atanhf
powf
erff
erfcf
erfinvf
erfcinvf
erfcxf
normcdff
normcdfinvf
lgammaf
tgammaf
fmaf
frexpf
ldexpf
scalbnf
scalblnf
logbf
ilogbf
j0f
j1f
jnf
y0f
y1f
ynf
cyl_bessel_i0f
cyl_bessel_i1f
fmodf
remainderf
remquof
modff
fdimf
truncf
roundf
rintf
nearbyintf
ceilf
floorf
lrintf
lroundf
llrintf
llroundf
sqrt
rsqrt
cbrt
rcbrt
hypot
rhypot
norm3d
rnorm3d
norm4d
rnorm4d
norm
rnorm
exp
exp2
exp10
expm1
log
log2
log10
log1p
sin
cos
tan
sincos
sinpi
cospi
sincospi
asin
acos
atan
atan2
sinh
cosh
tanh
asinh
acosh
atanh
pow
erf
erfc
erfinv
erfcinv
erfcx
normcdf
normcdfinv
lgamma
tgamma
fma
frexp
ldexp
scalbn
scalbln
logb
ilogb
j0
j1
jn
y0
y1
yn
cyl_bessel_i0
cyl_bessel_i1
fmod
remainder
remquo
mod
fdim
trunc
round
rint
nearbyint
ceil
floor
lrint
lround
llrint
llround
__fdividef
__sinf
__cosf
__tanf
__sincosf
__logf
__log2f
__log10f
__expf
__exp10f
__powf
__fadd_rn
__fsub_rn
__fmul_rn
__fmaf_rn
__frcp_rn
__fsqrt_rn
__frsqrt_rn
__fdiv_rn
__fadd_rz
__fsub_rz
__fmul_rz
__fmaf_rz
__frcp_rz
__fsqrt_rz
__frsqrt_rz
__fdiv_rz
__fadd_ru
__fsub_ru
__fmul_ru
__fmaf_ru
__frcp_ru
__fsqrt_ru
__frsqrt_ru
__fdiv_ru
__fadd_rd
__fsub_rd
__fmul_rd
__fmaf_rd
__frcp_rd
__fsqrt_rd
__frsqrt_rd
__fdiv_rd
__fdividef
__expf
__exp10f
__logf
__log2f
__log10f
__sinf
__cosf
__sincosf
__tanf
__powf
__dadd_rn
__dsub_rn
__dmul_rn
__fma_rn
__ddiv_rn
__drcp_rn
__dsqrt_rn
__dadd_rz
__dsub_rz
__dmul_rz
__fma_rz
__ddiv_rz
__drcp_rz
__dsqrt_rz
__dadd_ru
__dsub_ru
__dmul_ru
__fma_ru
__ddiv_ru
__drcp_ru
__dsqrt_ru
__dadd_rd
__dsub_rd
__dmul_rd
__fma_rd
__ddiv_rd
__drcp_rd
__dsqrt_rd
acos
acosh
acospi
asin
asinh
asinpi
atan
atan2
atanh
atanpi
atan2pi
cbrt
ceil
copysign
cos
cosh
cospi
erfc
erf
exp
exp2
exp10
expm1
fabs
fdim
floor
fma
fmax
fmax
fmin
fmin
fmod
fract
frexp
hypot
ilogb
ldexp
lgamma
lgamma_r
log
log2
log10
log1p
logb
mad
maxmag
minmag
modf
nextafter
pow
pown
powr
remquo
remquo
rint
rootn
rootn
round
rsqrt
sin
sincos
sinh
sinpi
sqrt
tan
tanh
tanpi
tgamma
trunc
half_cos
half_divide
half_exp
half_exp2
half_exp10
half_log
half_log2
half_log10
half_powr
half_recip
half_rsqrt
half_sin
half_sqrt
half_tan
native_cos
native_divide
native_exp
native_exp2
native_exp10
native_log
native_log2
native_log10
native_powr
native_recip
native_rsqrt
native_sin
native_sqrt
native_tan
from os.path import dirname, join
import pystencils.data_types
from pystencils.astnodes import Node
from pystencils.backends.cbackend import CustomSympyPrinter, generate_c
from pystencils.backends.cuda_backend import CudaBackend, CudaSympyPrinter
from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
with open(join(dirname(__file__), 'opencl1.1_known_functions.txt')) as f:
    lines = f.readlines()
OPENCL_KNOWN_FUNCTIONS = {fn.strip(): fn.strip() for fn in lines if fn.strip()}
def generate_opencl(astnode: Node, signature_only: bool = False) -> str:
"""Prints an abstract syntax tree node (made for target 'gpu') as OpenCL code.
Args:
astnode: KernelFunction node to generate code for
signature_only: if True, only the signature is printed
Returns:
C-like code for the ast node and its descendants
"""
return generate_c(astnode, signature_only, dialect='opencl')
class OpenClBackend(CudaBackend):
def __init__(self,
sympy_printer=None,
signature_only=False):
if not sympy_printer:
sympy_printer = OpenClSympyPrinter()
super().__init__(sympy_printer, signature_only)
self._dialect = 'opencl'
def _print_Type(self, node):
code = super()._print_Type(node)
if isinstance(node, pystencils.data_types.PointerType):
return "__global " + code
else:
return code
def _print_ThreadBlockSynchronization(self, node):
raise NotImplementedError()
def _print_TextureDeclaration(self, node):
raise NotImplementedError()
class OpenClSympyPrinter(CudaSympyPrinter):
language = "OpenCL"
DIMENSION_MAPPING = {
'x': '0',
'y': '1',
'z': '2'
}
INDEXING_FUNCTION_MAPPING = {
'blockIdx': 'get_group_id',
'threadIdx': 'get_local_id',
'blockDim': 'get_local_size',
'gridDim': 'get_global_size'
}
def __init__(self):
CustomSympyPrinter.__init__(self)
self.known_functions = OPENCL_KNOWN_FUNCTIONS
def _print_Type(self, node):
code = super()._print_Type(node)
if isinstance(node, pystencils.data_types.PointerType):
return "__global " + code
else:
return code
def _print_ThreadIndexingSymbol(self, node):
symbol_name: str = node.name
function_name, dimension = tuple(symbol_name.split("."))
dimension = self.DIMENSION_MAPPING[dimension]
function_name = self.INDEXING_FUNCTION_MAPPING[function_name]
return f"(int) {function_name}({dimension})"
def _print_TextureAccess(self, node):
raise NotImplementedError()
# For math functions, OpenCL is more similar to the C++ printer CustomSympyPrinter
# since built-in math functions are generic.
# In CUDA, you have to differentiate between `sin` and `sinf`
try:
_print_math_func = CustomSympyPrinter._print_math_func
except AttributeError:
pass
_print_Pow = CustomSympyPrinter._print_Pow
def _print_Function(self, expr):
if isinstance(expr, fast_division):
return "native_divide(%s, %s)" % tuple(self._print(a) for a in expr.args)
elif isinstance(expr, fast_sqrt):
return "native_sqrt(%s)" % tuple(self._print(a) for a in expr.args)
elif isinstance(expr, fast_inv_sqrt):
return "native_rsqrt(%s)" % tuple(self._print(a) for a in expr.args)
return CustomSympyPrinter._print_Function(self, expr)
import os
from collections.abc import Hashable
from functools import partial
from itertools import chain
try:
from functools import lru_cache as memorycache
except ImportError:
from backports.functools_lru_cache import lru_cache as memorycache
try:
from joblib import Memory
from appdirs import user_cache_dir
if 'PYSTENCILS_CACHE_DIR' in os.environ:
cache_dir = os.environ['PYSTENCILS_CACHE_DIR']
else:
cache_dir = user_cache_dir('pystencils')
disk_cache = Memory(cache_dir, verbose=False).cache
disk_cache_no_fallback = disk_cache
except ImportError:
# fallback to in-memory caching if joblib is not available
disk_cache = memorycache(maxsize=64)
def disk_cache_no_fallback(o):
return o
def _wrapper(wrapped_func, cached_func, *args, **kwargs):
if all(isinstance(a, Hashable) for a in chain(args, kwargs.values())):
return cached_func(*args, **kwargs)
else:
return wrapped_func(*args, **kwargs)
def memorycache_if_hashable(maxsize=128, typed=False):
def wrapper(func):
return partial(_wrapper, func, memorycache(maxsize, typed)(func))
return wrapper
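# Minimal usage sketch: hashable arguments hit the LRU cache, unhashable ones
# fall back to a direct call.
#
#   @memorycache_if_hashable(maxsize=32)
#   def squared_norm(v):
#       return sum(i * i for i in v)
#
#   squared_norm((1, 2, 3))  # tuple is hashable -> result is memoized
#   squared_norm([1, 2, 3])  # list is unhashable -> computed, not cached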
# Disable memory cache:
# disk_cache = lambda o: o
# disk_cache_no_fallback = lambda o: o
try:
import pycuda.gpuarray as gpuarray
except ImportError:
gpuarray = None
import numpy as np
import pystencils
class PyCudaArrayHandler:
def __init__(self):
import pycuda.autoinit # NOQA
def zeros(self, shape, dtype=np.float64, order='C'):
return gpuarray.zeros(shape, dtype, order)
def ones(self, shape, dtype, order='C'):
return gpuarray.ones(shape, dtype, order)
def empty(self, shape, dtype=np.float64, layout=None):
if layout:
cpu_array = pystencils.field.create_numpy_array_with_layout(shape, dtype, layout)
return self.to_gpu(cpu_array)
else:
return gpuarray.empty(shape, dtype)
def to_gpu(self, array):
return gpuarray.to_gpu(array)
def upload(self, gpuarray, numpy_array):
gpuarray.set(numpy_array)
def download(self, gpuarray, numpy_array):
gpuarray.get(numpy_array)
def randn(self, shape, dtype=np.float64):
cpu_array = np.random.randn(*shape).astype(dtype)
return self.to_gpu(cpu_array)
from_numpy = to_gpu
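# Minimal usage sketch (requires pycuda and a CUDA device; shapes illustrative):
#
#   handler = PyCudaArrayHandler()
#   gpu_arr = handler.zeros((16, 16))      # allocate and zero-fill on device
#   host = np.empty((16, 16))
#   handler.download(gpu_arr, host)        # copy device -> host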