Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
No results found
Show changes
Showing
with 0 additions and 2321 deletions
import numpy as np
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.gpucuda.cudajit import _build_numpy_argument_list, _check_arguments
from pystencils.include import get_pystencils_include_path
from pystencils.kernel_wrapper import KernelWrapper
USE_FAST_MATH = True
_global_cl_ctx = None
_global_cl_queue = None
def get_global_cl_queue():
return _global_cl_queue
def get_global_cl_ctx():
return _global_cl_ctx
def init_globally(device_index=0):
import pyopencl as cl
global _global_cl_ctx
global _global_cl_queue
_global_cl_ctx = cl.create_some_context(device_index)
_global_cl_queue = cl.CommandQueue(_global_cl_ctx)
def init_globally_with_context(opencl_ctx, opencl_queue):
global _global_cl_ctx
global _global_cl_queue
_global_cl_ctx = opencl_ctx
_global_cl_queue = opencl_queue
def clear_global_ctx():
global _global_cl_ctx
global _global_cl_queue
_global_cl_ctx = None
_global_cl_queue = None
def make_python_function(kernel_function_node, opencl_queue, opencl_ctx, argument_dict=None, custom_backend=None):
"""
Creates a **OpenCL** kernel function from an abstract syntax tree which
was created for the ``target='gpu'`` e.g. by :func:`pystencils.gpucuda.create_cuda_kernel`
or :func:`pystencils.gpucuda.created_indexed_cuda_kernel`
Args:
opencl_queue: a valid :class:`pyopencl.CommandQueue`
opencl_ctx: a valid :class:`pyopencl.Context`
kernel_function_node: the abstract syntax tree
argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the
returned kernel functor.
Returns:
compiled kernel as Python function
"""
import pyopencl as cl
if not opencl_ctx:
opencl_ctx = _global_cl_ctx
if not opencl_queue:
opencl_queue = _global_cl_queue
assert opencl_ctx, "No valid OpenCL context!\n" \
"Use `import pystencils.opencl.autoinit` if you want it to be automatically created"
assert opencl_queue, "No valid OpenCL queue!\n" \
"Use `import pystencils.opencl.autoinit` if you want it to be automatically created"
if argument_dict is None:
argument_dict = {}
# check if double precision is supported and required
if any([d.double_fp_config == 0 for d in opencl_ctx.devices]):
for param in kernel_function_node.get_parameters():
if param.symbol.dtype.base_type:
if param.symbol.dtype.base_type.numpy_dtype == np.float64:
raise ValueError('OpenCL device does not support double precision')
else:
if param.symbol.dtype.numpy_dtype == np.float64:
raise ValueError('OpenCL device does not support double precision')
# Changing of kernel name necessary since compilation with default name "kernel" is not possible (OpenCL keyword!)
kernel_function_node.function_name = "opencl_" + kernel_function_node.function_name
header_list = ['"opencl_stdint.h"'] + list(get_headers(kernel_function_node))
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
code = includes + "\n"
code += "#define FUNC_PREFIX __kernel\n"
code += "#define RESTRICT restrict\n\n"
code += str(generate_c(kernel_function_node, dialect='opencl', custom_backend=custom_backend))
options = []
if USE_FAST_MATH:
options.append("-cl-unsafe-math-optimizations")
options.append("-cl-mad-enable")
options.append("-cl-fast-relaxed-math")
options.append("-cl-finite-math-only")
options.append("-I")
options.append(get_pystencils_include_path())
mod = cl.Program(opencl_ctx, code).build(options=options)
func = getattr(mod, kernel_function_node.function_name)
parameters = kernel_function_node.get_parameters()
cache = {}
cache_values = []
def wrapper(**kwargs):
key = hash(tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
for k, v in kwargs.items()))
try:
args, block_and_thread_numbers = cache[key]
func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args)
except KeyError:
full_arguments = argument_dict.copy()
full_arguments.update(kwargs)
assert not any(isinstance(a, np.ndarray)
for a in full_arguments.values()), 'Calling a OpenCL kernel with a Numpy array!'
assert not any('pycuda' in str(type(a))
for a in full_arguments.values()), 'Calling a OpenCL kernel with a PyCUDA array!'
shape = _check_arguments(parameters, full_arguments)
indexing = kernel_function_node.indexing
block_and_thread_numbers = indexing.call_parameters(shape)
block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
block_and_thread_numbers['grid'] = tuple(int(b * g) for (b, g) in zip(block_and_thread_numbers['block'],
block_and_thread_numbers['grid']))
args = _build_numpy_argument_list(parameters, full_arguments)
args = [a.data if hasattr(a, 'data') else a for a in args]
cache[key] = (args, block_and_thread_numbers)
cache_values.append(kwargs) # keep objects alive such that ids remain unique
func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args)
wrapper.ast = kernel_function_node
wrapper.parameters = kernel_function_node.get_parameters()
wrapper = KernelWrapper(wrapper, parameters, kernel_function_node)
return wrapper
import numpy as np
import sympy as sp
from pystencils import TypedSymbol
from pystencils.astnodes import LoopOverCoordinate
from pystencils.backends.cbackend import CustomCodeNode
def _get_rng_template(name, data_type, num_vars):
if data_type is np.float32:
c_type = "float"
elif data_type is np.float64:
c_type = "double"
template = "\n"
for i in range(num_vars):
template += "{{result_symbols[{}].dtype}} {{result_symbols[{}].name}};\n".format(i, i)
template += ("{}_{}{}({{parameters}}, " + ", ".join(["{{result_symbols[{}].name}}"] * num_vars) + ");\n") \
.format(name, c_type, num_vars, *tuple(range(num_vars)))
return template
def _get_rng_code(template, dialect, vector_instruction_set, time_step, offsets, keys, dim, result_symbols):
parameters = [time_step] + [LoopOverCoordinate.get_loop_counter_symbol(i) + offsets[i]
for i in range(dim)] + [0] * (3 - dim) + list(keys)
if dialect == 'cuda' or (dialect == 'c' and vector_instruction_set is None):
return template.format(parameters=', '.join(str(p) for p in parameters),
result_symbols=result_symbols)
else:
raise NotImplementedError("Not yet implemented for this backend")
class RNGBase(CustomCodeNode):
def __init__(self, dim, time_step=TypedSymbol("time_step", np.uint32), offsets=(0, 0, 0), keys=None):
if keys is None:
keys = (0,) * self._num_keys
if len(keys) != self._num_keys:
raise ValueError("Provided {} keys but need {}".format(len(keys), self._num_keys))
if len(offsets) != 3:
raise ValueError("Provided {} offsets but need {}".format(len(offsets), 3))
self.result_symbols = tuple(TypedSymbol(sp.Dummy().name, self._data_type) for _ in range(self._num_vars))
symbols_read = [s for s in keys if isinstance(s, sp.Symbol)]
super().__init__("", symbols_read=symbols_read, symbols_defined=self.result_symbols)
self._time_step = time_step
self._offsets = offsets
self.headers = ['"{}_rand.h"'.format(self._name)]
self.keys = tuple(keys)
self._args = sp.sympify((dim, time_step, keys))
self._dim = dim
@property
def args(self):
return self._args
@property
def undefined_symbols(self):
result = {a for a in (self._time_step, *self._offsets, *self.keys) if isinstance(a, sp.Symbol)}
loop_counters = [LoopOverCoordinate.get_loop_counter_symbol(i)
for i in range(self._dim)]
result.update(loop_counters)
return result
def fast_subs(self, _):
return self # nothing to replace inside this node - would destroy intermediate "dummy" by re-creating them
def get_code(self, dialect, vector_instruction_set):
template = _get_rng_template(self._name, self._data_type, self._num_vars)
return _get_rng_code(template, dialect, vector_instruction_set,
self._time_step, self._offsets, self.keys, self._dim, self.result_symbols)
def __repr__(self):
return (", ".join(['{}'] * self._num_vars) + " \\leftarrow {}RNG").format(*self.result_symbols,
self._name.capitalize())
class PhiloxTwoDoubles(RNGBase):
_name = "philox"
_data_type = np.float64
_num_vars = 2
_num_keys = 2
class PhiloxFourFloats(RNGBase):
_name = "philox"
_data_type = np.float32
_num_vars = 4
_num_keys = 2
class AESNITwoDoubles(RNGBase):
_name = "aesni"
_data_type = np.float64
_num_vars = 2
_num_keys = 4
class AESNIFourFloats(RNGBase):
_name = "aesni"
_data_type = np.float32
_num_vars = 4
_num_keys = 4
def random_symbol(assignment_list, seed=TypedSymbol("seed", np.uint32), rng_node=PhiloxTwoDoubles, *args, **kwargs):
counter = 0
while True:
node = rng_node(*args, keys=(counter, seed), **kwargs)
inserted = False
for symbol in node.result_symbols:
if not inserted:
assignment_list.insert(0, node)
inserted = True
yield symbol
counter += 1
# Disable gmpy backend until this bug is resolved if joblib serialize
# See https://github.com/sympy/sympy/pull/13530
import os
import warnings
os.environ['MPMATH_NOGMPY'] = '1'
try:
import mpmath.libmp
# In case the user has imported sympy first, then pystencils
if mpmath.libmp.BACKEND == 'gmpy':
warnings.warn("You are using the gmpy backend. You might encounter an error 'argument is not an mpz sympy'. "
"This is due to a known bug in sympy/gmpy library. "
"To prevent this, import pystencils first then sympy or set the environment variable "
"MPMATH_NOGMPY=1")
except ImportError:
pass
__all__ = []
from sympy.abc import a, b, c, d, e, f
import pystencils
from pystencils.data_types import cast_func, create_type
def test_type_interference():
x = pystencils.fields('x: float32[3d]')
assignments = pystencils.AssignmentCollection({
a: cast_func(10, create_type('float64')),
b: cast_func(10, create_type('uint16')),
e: 11,
c: b,
f: c + b,
d: c + b + x.center + e,
x.center: c + b + x.center
})
ast = pystencils.create_kernel(assignments)
code = str(pystencils.show_code(ast))
print(code)
assert 'double a' in code
assert 'uint16_t b' in code
assert 'uint16_t f' in code
assert 'int64_t e' in code
# FIXME
# FIXME performance counters might be wrong. This will only affect the Benchmark model
# FIXME bandwidth measurements need validation
# FIXME
kerncraft version: 0.7.2
model name: Intel(R) Xeon(R) Gold 5122 CPU @ 3.60GHz
model type: Intel Core Skylake SP
sockets: 2
cores per socket: 4
threads per core: 2
NUMA domains per socket: 1
cores per NUMA domain: 4
clock: 3.6 GHz
FLOPs per cycle:
SP:
total: 64
FMA: 64
ADD: 32
MUL: 32
DP:
total: 32
FMA: 32
ADD: 16
MUL: 16
micro-architecture: SKX
compiler:
!!omap
- icc: -O3 -fno-alias -xCORE-AVX512
- clang: -O3 -march=skylake-avx512 -D_POSIX_C_SOURCE=200112L
- gcc: -O3 -march=skylake-avx512
cacheline size: 64 B
overlapping model:
ports: ["0", "0DV", "1", "2", "3", "4", "5", "6", "7"]
performance counter metric:
Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_6:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_7:PMC[0-3])
non-overlapping model:
ports: ["2D", "3D"]
performance counter metric: T_OL + T_L1L2 + T_L2L3 + T_L3MEM
memory hierarchy:
- level: L1
performance counter metrics:
accesses: MEM_INST_RETIRED_ALL_LOADS:PMC[0-3]
misses: L1D_REPLACEMENT:PMC[0-3]
evicts: L2_TRANS_L1D_WB:PMC[0-3]
cache per group:
sets: 64
ways: 8
cl_size: 64
replacement_policy: 'LRU'
write_allocate: True
write_back: True
load_from: L2
store_to: L2
size per group: 32.00 kB
groups: 8
cores per group: 1
threads per group: 2
- level: L2
non-overlap upstream throughput: [64 B/cy, 'half-duplex']
performance counter metrics:
accesses: L1D_REPLACEMENT:PMC[0-3]
misses: L2_LINES_IN_ALL:PMC[0-3]
evicts: L2_TRANS_L2_WB:PMC[0-3]
cache per group:
sets: 1024
ways: 16
cl_size: 64
replacement_policy: 'LRU'
write_allocate: True
write_back: True
load_from: null # L3 is a victim cache, thus unless a hit in L3, misses get forwarded to MEM
victims_to: L3 # all victims, modified or not are passed onto L3
store_to: L3
size per group: 1.00 MB
groups: 8
cores per group: 1
threads per group: 2
- level: L3
non-overlap upstream throughput: [16 B/cy, 'full-duplex']
performance counter metrics:
accesses: L2_LINES_IN_ALL:PMC[0-3]
# FIXME not all misses in L2 lead to loads from L3, only the hits do
misses: (CAS_COUNT_RD:MBOX0C[01] + CAS_COUNT_WR:MBOX0C[01] +
CAS_COUNT_RD:MBOX1C[01] + CAS_COUNT_WR:MBOX1C[01] +
CAS_COUNT_RD:MBOX2C[01] + CAS_COUNT_WR:MBOX2C[01] +
CAS_COUNT_RD:MBOX3C[01] + CAS_COUNT_WR:MBOX3C[01] +
CAS_COUNT_RD:MBOX4C[01] + CAS_COUNT_WR:MBOX4C[01] +
CAS_COUNT_RD:MBOX5C[01] + CAS_COUNT_WR:MBOX5C[01])
evicts: L2_TRANS_L2_WB:PMC[0-3]
cache per group:
sets: 16896
# TODO is actuall something else, but necessary to get to 16.5 MB
ways: 16
# TODO is actually 11, but pycachesim only supports powers of two
cl_size: 64
replacement_policy: 'LRU'
write_allocate: False
write_back: True
size per group: 16.50 MB
groups: 2
cores per group: 4
threads per group: 8
- level: MEM
cores per group: 4
threads per group: 8
non-overlap upstream throughput: ['full socket memory bandwidth', 'half-duplex']
penalty cycles per read stream: 0
size per group:
benchmarks:
kernels:
load:
read streams:
streams: 1
bytes: 8.00 B
read+write streams:
streams: 0
bytes: 0.00 B
write streams:
streams: 0
bytes: 0.00 B
FLOPs per iteration: 0
copy:
read streams:
streams: 1
bytes: 8.00 B
read+write streams:
streams: 0
bytes: 0.00 B
write streams:
streams: 1
bytes: 8.00 B
FLOPs per iteration: 0
update:
read streams:
streams: 1
bytes: 8.00 B
read+write streams:
streams: 1
bytes: 8.00 B
write streams:
streams: 1
bytes: 8.00 B
FLOPs per iteration: 0
triad:
read streams:
streams: 3
bytes: 24.00 B
read+write streams:
streams: 0
bytes: 0.00 B
write streams:
streams: 1
bytes: 8.00 B
FLOPs per iteration: 2
daxpy:
read streams:
streams: 2
bytes: 16.00 B
read+write streams:
streams: 1
bytes: 8.00 B
write streams:
streams: 1
bytes: 8.00 B
FLOPs per iteration: 2
measurements:
L1:
1:
threads per core: 1
cores:
- 1
- 2
- 3
- 4
threads:
- 1
- 2
- 3
- 4
size per core:
- 21.12 kB
- 21.12 kB
- 21.12 kB
- 21.12 kB
size per thread:
- 21.12 kB
- 21.12 kB
- 21.12 kB
- 21.12 kB
total size:
- 21.12 kB
- 42.24 kB
- 63.36 kB
- 84.48 kB
results:
load:
- 42.98 GB/s
- 85.08 GB/s
- 127.45 GB/s
- 169.92 GB/s
copy:
- 56.07 GB/s
- 111.50 GB/s
- 164.90 GB/s
- 221.50 GB/s
update:
- 56.54 GB/s
- 112.25 GB/s
- 168.50 GB/s
- 224.75 GB/s
triad:
- 45.90 GB/s
- 89.81 GB/s
- 127.29 GB/s
- 169.57 GB/s
daxpy:
- 36.62 GB/s
- 71.30 GB/s
- 103.52 GB/s
- 135.26 GB/s
2:
threads per core: 2
cores:
- 1
- 2
- 3
- 4
threads:
- 2
- 4
- 6
- 8
size per core:
- 21.12 kB
- 21.12 kB
- 21.12 kB
- 21.12 kB
size per thread:
- 10.56 kB
- 10.56 kB
- 10.56 kB
- 10.56 kB
total size:
- 21.12 kB
- 42.24 kB
- 63.36 kB
- 84.48 kB
results:
load:
- 49.61 GB/s
- 98.80 GB/s
- 147.98 GB/s
- 198.22 GB/s
copy:
- 55.98 GB/s
- 111.56 GB/s
- 167.08 GB/s
- 220.42 GB/s
update:
- 56.53 GB/s
- 112.72 GB/s
- 168.95 GB/s
- 225.31 GB/s
triad:
- 54.01 GB/s
- 104.58 GB/s
- 153.02 GB/s
- 200.93 GB/s
daxpy:
- 41.11 GB/s
- 80.28 GB/s
- 115.71 GB/s
- 152.81 GB/s
L2:
1:
threads per core: 1
cores:
- 1
- 2
- 3
- 4
threads:
- 1
- 2
- 3
- 4
size per core:
- 660.00 kB
- 660.00 kB
- 660.00 kB
- 660.00 kB
size per thread:
- 660.00 kB
- 660.00 kB
- 660.00 kB
- 660.00 kB
total size:
- 660.00 kB
- 1.32 MB
- 1.98 MB
- 2.64 MB
results:
load:
- 27.15 GB/s
- 54.09 GB/s
- 80.61 GB/s
- 106.41 GB/s
copy:
- 43.53 GB/s
- 90.07 GB/s
- 127.73 GB/s
- 171.81 GB/s
update:
- 50.38 GB/s
- 98.47 GB/s
- 147.91 GB/s
- 197.20 GB/s
triad:
- 43.38 GB/s
- 83.72 GB/s
- 124.83 GB/s
- 166.04 GB/s
daxpy:
- 36.29 GB/s
- 71.29 GB/s
- 103.33 GB/s
- 136.48 GB/s
2:
threads per core: 2
cores:
- 1
- 2
- 3
- 4
threads:
- 2
- 4
- 6
- 8
size per core:
- 660.00 kB
- 660.00 kB
- 660.00 kB
- 660.00 kB
size per thread:
- 330.00 kB
- 330.00 kB
- 330.00 kB
- 330.00 kB
total size:
- 660.00 kB
- 1.32 MB
- 1.98 MB
- 2.64 MB
results:
load:
- 35.29 GB/s
- 70.28 GB/s
- 104.67 GB/s
- 139.63 GB/s
copy:
- 42.23 GB/s
- 83.70 GB/s
- 124.33 GB/s
- 167.50 GB/s
update:
- 50.09 GB/s
- 99.77 GB/s
- 149.87 GB/s
- 198.82 GB/s
triad:
- 52.38 GB/s
- 100.00 GB/s
- 147.40 GB/s
- 193.31 GB/s
daxpy:
- 41.14 GB/s
- 80.22 GB/s
- 116.23 GB/s
- 155.08 GB/s
L3:
1:
threads per core: 1
cores:
- 1
- 2
- 3
- 4
threads:
- 1
- 2
- 3
- 4
size per core:
- 10.56 MB
- 5.28 MB
- 3.52 MB
- 2.64 MB
size per thread:
- 10.56 MB
- 5.28 MB
- 3.52 MB
- 2.64 MB
total size:
- 10.56 MB
- 10.56 MB
- 10.56 MB
- 10.56 MB
results:
load:
- 22.40 GB/s
- 44.77 GB/s
- 65.71 GB/s
- 89.26 GB/s
copy:
- 25.32 GB/s
- 49.70 GB/s
- 72.89 GB/s
- 98.62 GB/s
update:
- 41.24 GB/s
- 81.14 GB/s
- 122.22 GB/s
- 166.44 GB/s
triad:
- 25.61 GB/s
- 50.02 GB/s
- 73.23 GB/s
- 98.95 GB/s
daxpy:
- 32.07 GB/s
- 62.65 GB/s
- 89.91 GB/s
- 120.65 GB/s
2:
threads per core: 2
cores:
- 1
- 2
- 3
- 4
threads:
- 2
- 4
- 6
- 8
size per core:
- 10.56 MB
- 5.28 MB
- 3.52 MB
- 2.64 MB
size per thread:
- 5.28 MB
- 2.64 MB
- 1.76 MB
- 1.32 MB
total size:
- 10.56 MB
- 10.56 MB
- 10.56 MB
- 10.56 MB
results:
load:
- 26.18 GB/s
- 51.85 GB/s
- 75.82 GB/s
- 101.39 GB/s
copy:
- 26.22 GB/s
- 51.83 GB/s
- 76.40 GB/s
- 102.84 GB/s
update:
- 43.51 GB/s
- 86.75 GB/s
- 129.86 GB/s
- 174.54 GB/s
triad:
- 26.39 GB/s
- 51.80 GB/s
- 76.27 GB/s
- 102.66 GB/s
daxpy:
- 37.43 GB/s
- 73.16 GB/s
- 106.53 GB/s
- 142.76 GB/s
MEM:
1:
threads per core: 1
cores:
- 1
- 2
- 3
- 4
threads:
- 1
- 2
- 3
- 4
size per core:
- 240.00 MB
- 120.00 MB
- 80.00 MB
- 60.00 MB
size per thread:
- 240.00 MB
- 120.00 MB
- 80.00 MB
- 60.00 MB
total size:
- 240.00 MB
- 240.00 MB
- 240.00 MB
- 240.00 MB
results:
load:
- 12.03 GB/s
- 24.38 GB/s
- 34.83 GB/s
- 45.05 GB/s
copy:
- 12.32 GB/s
- 24.40 GB/s
- 32.82 GB/s
- 37.00 GB/s
update:
- 20.83 GB/s
- 40.25 GB/s
- 48.81 GB/s
- 54.84 GB/s
triad:
- 11.64 GB/s
- 23.17 GB/s
- 34.78 GB/s
- 42.97 GB/s
daxpy:
- 17.69 GB/s
- 34.02 GB/s
- 48.12 GB/s
- 55.73 GB/s
2:
threads per core: 2
cores:
- 1
- 2
- 3
- 4
threads:
- 2
- 4
- 6
- 8
size per core:
- 240.00 MB
- 120.00 MB
- 80.00 MB
- 60.00 MB
size per thread:
- 120.00 MB
- 60.00 MB
- 40.00 MB
- 30.00 MB
total size:
- 240.00 MB
- 240.00 MB
- 240.00 MB
- 240.00 MB
results:
load:
- 15.33 GB/s
- 28.32 GB/s
- 41.34 GB/s
- 53.02 GB/s
copy:
- 13.96 GB/s
- 26.61 GB/s
- 34.39 GB/s
- 38.96 GB/s
update:
- 26.47 GB/s
- 47.82 GB/s
- 56.70 GB/s
- 62.78 GB/s
triad:
- 14.42 GB/s
- 26.66 GB/s
- 36.94 GB/s
- 44.01 GB/s
daxpy:
- 20.96 GB/s
- 39.12 GB/s
- 51.55 GB/s
- 58.37 GB/s
import math
import os
import time
import numpy as np
import sympy as sp
from git import Repo
from influxdb import InfluxDBClient
from kerncraft.machinemodel import MachineModel
from kerncraft.models import ECM, Benchmark, Roofline, RooflineIACA
from kerncraft.prefixedunit import PrefixedUnit
from pystencils import Assignment, Field, create_kernel
from pystencils.kerncraft_coupling import KerncraftParameters, PyStencilsKerncraftKernel
def output_benchmark(analysis):
output = {}
keys = ['Runtime (per repetition) [s]', 'Iterations per repetition',
'Runtime (per cacheline update) [cy/CL]', 'MEM volume (per repetition) [B]',
'Performance [MFLOP/s]', 'Performance [MLUP/s]', 'Performance [MIt/s]', 'MEM BW [MByte/s]']
copies = {key: analysis[key] for key in keys}
output.update(copies)
for cache, metrics in analysis['data transfers'].items():
for metric_name, metric_value in metrics.items():
fixed = metric_value.with_prefix('')
output[cache + ' ' + metric_name + ' ' + fixed.prefix + fixed.unit] = fixed.value
for level, value in analysis['ECM'].items():
output['Phenomenological ECM ' + level + ' cy/CL'] = value
return output
def output_ecm(analysis):
output = {}
keys = ['T_nOL', 'T_OL', 'cl throughput', 'uops']
copies = {key: analysis[key] for key in keys}
output.update(copies)
if 'memory bandwidth kernel' in analysis:
output['memory bandwidth kernel' + analysis['memory bandwidth kernel'] + analysis['memory bandwidth'].prefix +
analysis['memory bandwidth'].unit] = analysis['memory bandwidth'].value
output['scaling cores'] = int(analysis['scaling cores']) if not math.isinf(analysis['scaling cores']) else -1
for key, value in analysis['cycles']:
output[key] = value
return output
def output_roofline(analysis):
output = {}
keys = ['min performance'] # 'bottleneck level'
copies = {key: analysis[key] for key in keys}
output.update(copies)
# TODO save bottleneck information (compute it here)
# fixed = analysis['max_flops'].with_prefix('G')
# output['max GFlop/s'] = fixed.value
# if analysis['min performance'] > max_flops:
# # CPU bound
# print('CPU bound with {} cores(s)'.format(self._args.cores), file=output_file)
# print('{!s} due to CPU max. FLOP/s'.format(max_flops), file=output_file)
# else:
# Memory bound
bottleneck = analysis['mem bottlenecks'][analysis['bottleneck level']]
output['bottleneck GFlop/s'] = bottleneck['performance'].with_prefix('G').value
output['bottleneck level'] = bottleneck['level']
output['bottleneck bw kernel'] = bottleneck['bw kernel']
output['bottleneck arithmetic intensity'] = bottleneck['arithmetic intensity']
for i, level in enumerate(analysis['mem bottlenecks']):
if level is None:
continue
for key, value in level.items():
if isinstance(value, PrefixedUnit):
fixed = value.with_prefix('G')
output['level ' + str(i) + ' ' + key + ' [' + fixed.prefix + fixed.unit + ']'] = 'inf' if isinstance(
fixed.value, float) and math.isinf(fixed.value) else fixed.value
else:
output['level ' + str(i) + ' ' + key] = 'inf' if isinstance(value, float) and math.isinf(
value) else value
return output
def output_roofline_iaca(analysis):
output = {}
keys = ['min performance'] # 'bottleneck level'
copies = {key: analysis[key] for key in keys}
# output.update(copies)
# TODO save bottleneck information (compute it here)
# fixed = analysis['max_flops'].with_prefix('G')
# output['max GFlop/s'] = fixed.value
# if analysis['min performance'] > max_flops:
# # CPU bound
# print('CPU bound with {} cores(s)'.format(self._args.cores), file=output_file)
# print('{!s} due to CPU max. FLOP/s'.format(max_flops), file=output_file)
# else:
# Memory bound
bottleneck = analysis['mem bottlenecks'][analysis['bottleneck level']]
output['bottleneck GFlop/s'] = bottleneck['performance'].with_prefix('G').value
output['bottleneck level'] = bottleneck['level']
output['bottleneck bw kernel'] = bottleneck['bw kernel']
output['bottleneck arithmetic intensity'] = bottleneck['arithmetic intensity']
for i, level in enumerate(analysis['mem bottlenecks']):
if level is None:
continue
for key, value in level.items():
if isinstance(value, PrefixedUnit):
fixed = value.with_prefix('G')
output['level ' + str(i) + ' ' + key + ' [' + fixed.prefix + fixed.unit + ']'] = 'inf' if isinstance(
fixed.value, float) and math.isinf(fixed.value) else fixed.value
else:
output['level ' + str(i) + ' ' + key] = 'inf' if isinstance(value, float) and math.isinf(
value) else value
return output
def report_analysis(ast, models, machine, tags, fields=None):
kernel = PyStencilsKerncraftKernel(ast, machine)
client = InfluxDBClient('i10grafana.informatik.uni-erlangen.de', 8086, 'pystencils',
'roggan', 'pystencils')
repo = Repo(search_parent_directories=True)
commit = repo.head.commit
point_time = int(time.time())
for model in models:
benchmark = model(kernel, machine, KerncraftParameters())
benchmark.analyze()
analysis = benchmark.results
if model is Benchmark:
output = output_benchmark(analysis)
elif model is ECM:
output = output_ecm(analysis)
elif model is Roofline:
output = output_roofline(analysis)
elif model is RooflineIACA:
output = output_roofline_iaca(analysis)
else:
raise ValueError('No valid model for analysis given!')
if fields is not None:
output.update(fields)
output['commit'] = commit.hexsha
json_body = [
{
'measurement': model.__name__,
'tags': tags,
'time': point_time,
'fields': output
}
]
client.write_points(json_body, time_precision='s')
def main():
size = [20, 200, 200]
arr = np.zeros(size)
a = Field.create_from_numpy_array('a', arr, index_dimensions=0)
b = Field.create_from_numpy_array('b', arr, index_dimensions=0)
s = sp.Symbol("s")
rhs = a[0, -1, 0] + a[0, 1, 0] + \
a[-1, 0, 0] + a[1, 0, 0] + \
a[0, 0, -1] + a[0, 0, 1]
update_rule = Assignment(b[0, 0, 0], s * rhs)
ast = create_kernel([update_rule])
input_folder = "./"
machine_file_path = os.path.join(input_folder, "SkylakeSP_Gold-5122_allinclusive.yaml")
machine = MachineModel(path_to_yaml=machine_file_path)
tags = {
'host': os.uname()[1],
'project': 'pystencils',
'kernel': 'jacobi_3D ' + str(size)
}
report_analysis(ast, [ECM, Roofline, RooflineIACA, Benchmark], machine, tags)
if __name__ == '__main__':
main()
import numpy as np
import sympy as sp
from pystencils import Assignment, Field, create_kernel
def meassure():
size = [30, 50, 3]
arr = np.zeros(size)
a = Field.create_from_numpy_array('a', arr, index_dimensions=1)
b = Field.create_from_numpy_array('b', arr, index_dimensions=1)
s = sp.Symbol("s")
rhs = a[0, -1](0) + a[0, 1] + a[-1, 0] + a[1, 0]
updateRule = Assignment(b[0, 0], s * rhs)
print(updateRule)
ast = create_kernel([updateRule])
# benchmark = generate_benchmark(ast)
# main = benchmark[0]
# kernel = benchmark[1]
# with open('src/main.cpp', 'w') as file:
# file.write(main)
# with open('src/kernel.cpp', 'w') as file:
# file.write(kernel)
func = ast.compile({'omega': 2/3})
from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark
from pystencils.kerncraft_coupling import BenchmarkAnalysis
from pystencils.kerncraft_coupling.kerncraft_interface import PyStencilsKerncraftKernel, KerncraftParameters
from kerncraft.machinemodel import MachineModel
from kerncraft.models import ECMData
machineFilePath = "../pystencils_tests/kerncraft_inputs/default_machine_file.yaml"
machine = MachineModel(path_to_yaml=machineFilePath)
benchmark = BenchmarkAnalysis(ast, machine)
#TODO what do i want to do with benchmark?
kernel = PyStencilsKerncraftKernel(ast)
model = ECMData(kernel, machine, KerncraftParameters())
model.analyze()
model.report()
if __name__ == "__main__":
meassure()
/*
* Copyright (2008-2009) Intel Corporation All Rights Reserved.
* The source code contained or described herein and all documents
* related to the source code ("Material") are owned by Intel Corporation
* or its suppliers or licensors. Title to the Material remains with
* Intel Corporation or its suppliers and licensors. The Material
* contains trade secrets and proprietary and confidential information
* of Intel or its suppliers and licensors. The Material is protected
* by worldwide copyright and trade secret laws and treaty provisions.
* No part of the Material may be used, copied, reproduced, modified,
* published, uploaded, posted, transmitted, distributed, or disclosed
* in any way without Intel(R)s prior express written permission.
*
* No license under any patent, copyright, trade secret or other
* intellectual property right is granted to or conferred upon you by
* disclosure or delivery of the Materials, either expressly, by implication,
* inducement, estoppel or otherwise. Any license under such intellectual
* property rights must be express and approved by Intel in writing.
*/
#if defined (__GNUC__)
#define IACA_SSC_MARK( MARK_ID ) \
__asm__ __volatile__ ( \
"\n\t movl $"#MARK_ID", %%ebx" \
"\n\t .byte 0x64, 0x67, 0x90" \
: : : "memory" );
#else
#define IACA_SSC_MARK(x) {__asm mov ebx, x\
__asm _emit 0x64 \
__asm _emit 0x67 \
__asm _emit 0x90 }
#endif
#define IACA_START {IACA_SSC_MARK(111)}
#define IACA_END {IACA_SSC_MARK(222)}
#ifdef _WIN64
#include <intrin.h>
#define IACA_VC64_START __writegsbyte(111, 111);
#define IACA_VC64_END __writegsbyte(222, 222);
#endif
/**************** asm *****************
;START_MARKER
mov ebx, 111
db 0x64, 0x67, 0x90
;END_MARKER
mov ebx, 222
db 0x64, 0x67, 0x90
**************************************/
#include "iacaMarks.h"
int main(int argc, char * argv[]){
int a = 0;
for(int i = 0; i < argc+100000; i++){
IACA_START
a += i;
}
IACA_END
return a;
}
double a[30][50][3];
double b[30][50][3];
double s;
for(int j=1; j<30-1; ++j)
for(int i=1; i<50-1; ++i)
b[j][i] = ( a[j][i-1] + a[j][i+1]
+ a[j-1][i] + a[j+1][i]) * s;
double a[M][N][N];
double b[M][N][N];
double s;
for(int k=1; k<M-1; ++k)
for(int j=1; j<N-1; ++j)
for(int i=1; i<N-1; ++i)
b[k][j][i] = ( a[k][j][i-1] + a[k][j][i+1]
+ a[k][j-1][i] + a[k][j+1][i]
+ a[k-1][j][i] + a[k+1][j][i]) * s;
kerncraft version: 0.7.3
clock: 2.7 GHz
cores per socket: 8
cores per NUMA domain: 8
NUMA domains per socket: 1
model type: Intel Core SandyBridge EP processor
model name: Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz
sockets: 2
threads per core: 2
cacheline size: 64 B
compiler:
!!omap
- icc: -O3 -xAVX -fno-alias -qopenmp
- clang: -O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200112L -fopenmp
- gcc: -O3 -march=corei7-avx -D_POSIX_C_SOURCE=200112L -fopenmp
micro-architecture: SNB
FLOPs per cycle:
SP: {total: 16, ADD: 8, MUL: 8}
DP: {total: 8, ADD: 4, MUL: 4}
overlapping model:
ports: ["0", "0DV", "1", "2", "3", "4", "5"]
performance counter metric:
Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3])
non-overlapping model:
ports: ["2D", "3D"]
performance counter metric: T_OL + T_L1L2 + T_L2L3 + T_L3MEM
write-allocate: True
memory hierarchy:
- level: L1
cache per group: {
'sets': 64, 'ways': 8, 'cl_size': 64, # 32 kB
'replacement_policy': 'LRU',
'write_allocate': True, 'write_back': True,
'load_from': 'L2', 'store_to': 'L2'}
cores per group: 1
threads per group: 2
groups: 16
performance counter metrics:
accesses: MEM_UOPS_RETIRED_LOADS:PMC[0-3]
misses: L1D_REPLACEMENT:PMC[0-3]
evicts: L1D_M_EVICT:PMC[0-3]
- level: L2
cache per group: {
'sets': 512, 'ways': 8, 'cl_size': 64, # 256 kB
'replacement_policy': 'LRU',
'write_allocate': True, 'write_back': True,
'load_from': 'L3', 'store_to': 'L3'}
cores per group: 1
threads per group: 2
groups: 16
non-overlap upstream throughput: [32 B/cy, 'half-duplex']
performance counter metrics:
accesses: L1D_REPLACEMENT:PMC[0-3]
misses: L2_LINES_IN_ALL:PMC[0-3]
evicts: L2_TRANS_L2_WB:PMC[0-3]
- level: L3
cache per group: {
'sets': 20480, 'ways': 16, 'cl_size': 64, # 20 MB
'replacement_policy': 'LRU',
'write_allocate': True, 'write_back': True}
cores per group: 8
threads per group: 16
groups: 2
non-overlap upstream throughput: [32 B/cy, 'half-duplex']
performance counter metrics:
accesses: L2_LINES_IN_ALL:PMC[0-3]
misses: (CAS_COUNT_RD:MBOX0C[01] + CAS_COUNT_RD:MBOX1C[01] +
CAS_COUNT_RD:MBOX2C[01] + CAS_COUNT_RD:MBOX3C[01])
evicts: (CAS_COUNT_WR:MBOX0C[01] + CAS_COUNT_WR:MBOX1C[01] +
CAS_COUNT_WR:MBOX2C[01] + CAS_COUNT_WR:MBOX3C[01])
- level: MEM
cores per group: 8
non-overlap upstream throughput: ['full socket memory bandwidth', 'half-duplex']
size per group: null
threads per group: 16
benchmarks:
kernels:
copy:
FLOPs per iteration: 0
read streams: {bytes: 8.00 B, streams: 1}
read+write streams: {bytes: 0.00 B, streams: 0}
write streams: {bytes: 8.00 B, streams: 1}
daxpy:
FLOPs per iteration: 2
read streams: {bytes: 16.00 B, streams: 2}
read+write streams: {bytes: 8.00 B, streams: 1}
write streams: {bytes: 8.00 B, streams: 1}
load:
FLOPs per iteration: 0
read streams: {bytes: 8.00 B, streams: 1}
read+write streams: {bytes: 0.00 B, streams: 0}
write streams: {bytes: 0.00 B, streams: 0}
triad:
FLOPs per iteration: 2
read streams: {bytes: 24.00 B, streams: 3}
read+write streams: {bytes: 0.00 B, streams: 0}
write streams: {bytes: 8.00 B, streams: 1}
update:
FLOPs per iteration: 0
read streams: {bytes: 8.00 B, streams: 1}
read+write streams: {bytes: 8.00 B, streams: 1}
write streams: {bytes: 8.00 B, streams: 1}
measurements:
L1:
1:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [81.98 GB/s, 163.75 GB/s, 245.62 GB/s, 327.69 GB/s, 409.41 GB/s, 489.83
GB/s, 571.67 GB/s, 653.50 GB/s]
daxpy: [71.55 GB/s, 143.01 GB/s, 214.86 GB/s, 286.26 GB/s, 355.60 GB/s,
426.71 GB/s, 497.45 GB/s, 568.97 GB/s]
load: [61.92 GB/s, 122.79 GB/s, 183.01 GB/s, 244.30 GB/s, 306.76 GB/s, 368.46
GB/s, 427.41 GB/s, 490.88 GB/s]
triad: [81.61 GB/s, 163.25 GB/s, 244.92 GB/s, 326.65 GB/s, 406.69 GB/s,
487.76 GB/s, 569.10 GB/s, 650.39 GB/s]
update: [84.03 GB/s, 168.02 GB/s, 252.10 GB/s, 335.94 GB/s, 419.90 GB/s,
503.88 GB/s, 587.86 GB/s, 671.88 GB/s]
size per core: [16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB,
16.00 kB, 16.00 kB]
size per thread: [16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00
kB, 16.00 kB, 16.00 kB]
threads: [1, 2, 3, 4, 5, 6, 7, 8]
threads per core: 1
total size: [16.00 kB, 32.00 kB, 48.00 kB, 64.00 kB, 80.00 kB, 96.00 kB, 112.00
kB, 128.00 kB]
2:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [79.53 GB/s, 158.70 GB/s, 238.20 GB/s, 317.62 GB/s, 397.09 GB/s, 476.33
GB/s, 555.69 GB/s, 634.96 GB/s]
daxpy: [70.94 GB/s, 141.90 GB/s, 212.97 GB/s, 283.91 GB/s, 354.93 GB/s,
425.85 GB/s, 496.74 GB/s, 567.40 GB/s]
load: [57.01 GB/s, 114.11 GB/s, 171.11 GB/s, 228.13 GB/s, 285.15 GB/s, 342.11
GB/s, 399.11 GB/s, 456.11 GB/s]
triad: [79.48 GB/s, 159.03 GB/s, 238.53 GB/s, 318.04 GB/s, 392.11 GB/s,
477.10 GB/s, 538.36 GB/s, 636.02 GB/s]
update: [82.75 GB/s, 165.55 GB/s, 248.50 GB/s, 331.32 GB/s, 414.06 GB/s,
496.82 GB/s, 579.83 GB/s, 662.36 GB/s]
size per core: [16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB,
16.00 kB, 16.00 kB]
size per thread: [8.00 kB, 8.00 kB, 8.00 kB, 8.00 kB, 8.00 kB, 8.00 kB, 8.00
kB, 8.00 kB]
threads: [2, 4, 6, 8, 10, 12, 14, 16]
threads per core: 2
total size: [16.00 kB, 32.00 kB, 48.00 kB, 64.00 kB, 80.00 kB, 96.00 kB, 112.00
kB, 128.00 kB]
L2:
1:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [41.28 GB/s, 81.96 GB/s, 120.28 GB/s, 160.70 GB/s, 203.22 GB/s, 239.97
GB/s, 271.13 GB/s, 307.01 GB/s]
daxpy: [48.85 GB/s, 98.62 GB/s, 143.29 GB/s, 197.76 GB/s, 230.58 GB/s, 284.98
GB/s, 334.22 GB/s, 385.72 GB/s]
load: [38.51 GB/s, 76.67 GB/s, 114.73 GB/s, 152.90 GB/s, 188.69 GB/s, 223.64
GB/s, 265.21 GB/s, 289.41 GB/s]
triad: [40.92 GB/s, 83.49 GB/s, 124.48 GB/s, 165.24 GB/s, 206.74 GB/s, 237.90
GB/s, 274.96 GB/s, 329.09 GB/s]
update: [50.37 GB/s, 100.05 GB/s, 145.43 GB/s, 196.82 GB/s, 244.07 GB/s,
301.62 GB/s, 336.88 GB/s, 403.78 GB/s]
size per core: [128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00
kB, 128.00 kB, 128.00 kB]
size per thread: [128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00
kB, 128.00 kB, 128.00 kB]
threads: [1, 2, 3, 4, 5, 6, 7, 8]
threads per core: 1
total size: [128.00 kB, 256.00 kB, 384.00 kB, 512.00 kB, 640.00 kB, 768.00
kB, 0.90 MB, 1.02 MB]
2:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [42.17 GB/s, 83.47 GB/s, 124.57 GB/s, 163.78 GB/s, 202.56 GB/s, 242.80
GB/s, 276.95 GB/s, 311.36 GB/s]
daxpy: [50.87 GB/s, 98.72 GB/s, 152.12 GB/s, 193.48 GB/s, 251.36 GB/s, 301.72
GB/s, 352.55 GB/s, 365.28 GB/s]
load: [39.62 GB/s, 79.03 GB/s, 118.03 GB/s, 157.85 GB/s, 196.48 GB/s, 237.44
GB/s, 276.81 GB/s, 309.71 GB/s]
triad: [44.80 GB/s, 88.35 GB/s, 125.13 GB/s, 169.94 GB/s, 209.60 GB/s, 260.15
GB/s, 300.75 GB/s, 333.08 GB/s]
update: [49.80 GB/s, 100.70 GB/s, 150.56 GB/s, 196.44 GB/s, 251.90 GB/s,
280.93 GB/s, 352.74 GB/s, 399.27 GB/s]
size per core: [128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00
kB, 128.00 kB, 128.00 kB]
size per thread: [64.00 kB, 64.00 kB, 64.00 kB, 64.00 kB, 64.00 kB, 64.00
kB, 64.00 kB, 64.00 kB]
threads: [2, 4, 6, 8, 10, 12, 14, 16]
threads per core: 2
total size: [128.00 kB, 256.00 kB, 384.00 kB, 512.00 kB, 640.00 kB, 768.00
kB, 0.90 MB, 1.02 MB]
L3:
1:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [23.21 GB/s, 46.01 GB/s, 67.96 GB/s, 90.17 GB/s, 111.47 GB/s, 133.14
GB/s, 153.84 GB/s, 174.92 GB/s]
daxpy: [30.35 GB/s, 60.32 GB/s, 90.00 GB/s, 119.71 GB/s, 148.87 GB/s, 178.39
GB/s, 207.10 GB/s, 236.25 GB/s]
load: [23.35 GB/s, 46.52 GB/s, 69.57 GB/s, 92.60 GB/s, 115.77 GB/s, 138.89
GB/s, 161.82 GB/s, 184.11 GB/s]
triad: [25.18 GB/s, 50.08 GB/s, 74.33 GB/s, 98.78 GB/s, 122.66 GB/s, 146.78
GB/s, 170.52 GB/s, 194.47 GB/s]
update: [32.67 GB/s, 64.65 GB/s, 95.98 GB/s, 127.29 GB/s, 157.67 GB/s, 188.22
GB/s, 217.41 GB/s, 246.99 GB/s]
size per core: [1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25
MB, 1.25 MB]
size per thread: [1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25
MB, 1.25 MB]
threads: [1, 2, 3, 4, 5, 6, 7, 8]
threads per core: 1
total size: [1.25 MB, 2.50 MB, 3.75 MB, 5.00 MB, 6.25 MB, 7.50 MB, 8.75 MB,
10.00 MB]
2:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [23.83 GB/s, 47.25 GB/s, 69.84 GB/s, 92.61 GB/s, 114.31 GB/s, 136.48
GB/s, 157.55 GB/s, 178.99 GB/s]
daxpy: [31.52 GB/s, 62.72 GB/s, 93.43 GB/s, 124.29 GB/s, 154.55 GB/s, 185.18
GB/s, 215.10 GB/s, 245.24 GB/s]
load: [27.63 GB/s, 54.93 GB/s, 81.57 GB/s, 108.63 GB/s, 134.91 GB/s, 161.72
GB/s, 188.15 GB/s, 214.94 GB/s]
triad: [25.90 GB/s, 51.76 GB/s, 76.73 GB/s, 102.29 GB/s, 126.17 GB/s, 152.10
GB/s, 176.71 GB/s, 200.64 GB/s]
update: [34.10 GB/s, 67.67 GB/s, 100.62 GB/s, 133.50 GB/s, 165.61 GB/s,
197.74 GB/s, 228.73 GB/s, 259.05 GB/s]
size per core: [1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25
MB, 1.25 MB]
size per thread: [625.00 kB, 625.00 kB, 625.00 kB, 625.00 kB, 625.00 kB, 625.00
kB, 625.00 kB, 625.00 kB]
threads: [2, 4, 6, 8, 10, 12, 14, 16]
threads per core: 2
total size: [1.25 MB, 2.50 MB, 3.75 MB, 5.00 MB, 6.25 MB, 7.50 MB, 8.75 MB,
10.00 MB]
MEM:
1:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [11.60 GB/s, 21.29 GB/s, 25.94 GB/s, 27.28 GB/s, 27.47 GB/s, 27.36
GB/s, 27.21 GB/s, 27.12 GB/s]
daxpy: [17.33 GB/s, 31.89 GB/s, 38.65 GB/s, 40.50 GB/s, 40.81 GB/s, 40.62
GB/s, 40.59 GB/s, 40.26 GB/s]
load: [12.01 GB/s, 23.04 GB/s, 32.79 GB/s, 40.21 GB/s, 43.39 GB/s, 44.14
GB/s, 44.42 GB/s, 44.40 GB/s]
triad: [12.73 GB/s, 24.27 GB/s, 30.43 GB/s, 31.46 GB/s, 31.77 GB/s, 31.74
GB/s, 31.65 GB/s, 31.52 GB/s]
update: [18.91 GB/s, 32.43 GB/s, 37.28 GB/s, 39.98 GB/s, 40.99 GB/s, 40.92
GB/s, 40.61 GB/s, 40.34 GB/s]
size per core: [40.00 MB, 20.00 MB, 13.33 MB, 10.00 MB, 8.00 MB, 6.67 MB,
5.71 MB, 5.00 MB]
size per thread: [40.00 MB, 20.00 MB, 13.33 MB, 10.00 MB, 8.00 MB, 6.67 MB,
5.71 MB, 5.00 MB]
threads: [1, 2, 3, 4, 5, 6, 7, 8]
threads per core: 1
total size: [40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB]
2:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [10.92 GB/s, 20.62 GB/s, 25.34 GB/s, 26.22 GB/s, 26.32 GB/s, 26.31
GB/s, 26.22 GB/s, 26.16 GB/s]
daxpy: [17.15 GB/s, 31.96 GB/s, 38.12 GB/s, 39.19 GB/s, 39.38 GB/s, 39.16
GB/s, 39.06 GB/s, 38.87 GB/s]
load: [13.49 GB/s, 25.92 GB/s, 36.16 GB/s, 41.56 GB/s, 43.34 GB/s, 43.40
GB/s, 43.01 GB/s, 42.66 GB/s]
triad: [12.38 GB/s, 23.17 GB/s, 28.69 GB/s, 29.98 GB/s, 30.50 GB/s, 30.59
GB/s, 30.75 GB/s, 30.70 GB/s]
update: [19.67 GB/s, 34.93 GB/s, 39.93 GB/s, 40.79 GB/s, 40.43 GB/s, 40.03
GB/s, 39.62 GB/s, 39.33 GB/s]
size per core: [40.00 MB, 20.00 MB, 13.33 MB, 10.00 MB, 8.00 MB, 6.67 MB,
5.71 MB, 5.00 MB]
size per thread: [20.00 MB, 10.00 MB, 6.67 MB, 5.00 MB, 4.00 MB, 3.33 MB,
2.86 MB, 2.50 MB]
threads: [2, 4, 6, 8, 10, 12, 14, 16]
threads per core: 2
total size: [40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB]
"""
Test of pystencils.data_types.address_of
"""
import pystencils
from pystencils.data_types import PointerType, address_of, cast_func, create_type
from pystencils.simp.simplifications import sympy_cse
def test_address_of():
x, y = pystencils.fields('x,y: int64[2d]')
s = pystencils.TypedSymbol('s', PointerType(create_type('int64')))
assignments = pystencils.AssignmentCollection({
s: address_of(x[0, 0]),
y[0, 0]: cast_func(s, create_type('int64'))
}, {})
ast = pystencils.create_kernel(assignments)
code = pystencils.show_code(ast)
print(code)
assignments = pystencils.AssignmentCollection({
y[0, 0]: cast_func(address_of(x[0, 0]), create_type('int64'))
}, {})
ast = pystencils.create_kernel(assignments)
code = pystencils.show_code(ast)
print(code)
def test_address_of_with_cse():
x, y = pystencils.fields('x,y: int64[2d]')
s = pystencils.TypedSymbol('s', PointerType(create_type('int64')))
assignments = pystencils.AssignmentCollection({
y[0, 0]: cast_func(address_of(x[0, 0]), create_type('int64')) + s,
x[0, 0]: cast_func(address_of(x[0, 0]), create_type('int64')) + 1
}, {})
ast = pystencils.create_kernel(assignments)
code = pystencils.show_code(ast)
assignments_cse = sympy_cse(assignments)
ast = pystencils.create_kernel(assignments_cse)
code = pystencils.show_code(ast)
print(code)
import sympy as sp
from pystencils import Assignment, AssignmentCollection
from pystencils.astnodes import Conditional
from pystencils.simp.assignment_collection import SymbolGen
def test_assignment_collection():
x, y, z, t = sp.symbols("x y z t")
symbol_gen = SymbolGen("a")
ac = AssignmentCollection([Assignment(z, x + y)],
[], subexpression_symbol_generator=symbol_gen)
lhs = ac.add_subexpression(t)
assert lhs == sp.Symbol("a_0")
ac.subexpressions.append(Assignment(t, 3))
ac.topological_sort(sort_main_assignments=False, sort_subexpressions=True)
assert ac.subexpressions[0].lhs == t
assert ac.new_with_inserted_subexpression(sp.Symbol("not_defined")) == ac
ac_inserted = ac.new_with_inserted_subexpression(t)
ac_inserted2 = ac.new_without_subexpressions({lhs})
assert all(a == b for a, b in zip(ac_inserted.all_assignments, ac_inserted2.all_assignments))
print(ac_inserted)
assert ac_inserted.subexpressions[0] == Assignment(lhs, 3)
assert 'a_0' in str(ac_inserted)
assert '<table' in ac_inserted._repr_html_()
def test_free_and_defined_symbols():
x, y, z, t = sp.symbols("x y z t")
a, b = sp.symbols("a b")
symbol_gen = SymbolGen("a")
ac = AssignmentCollection([Assignment(z, x + y), Conditional(t > 0, Assignment(a, b+1), Assignment(a, b+2))],
[], subexpression_symbol_generator=symbol_gen)
print(ac)
print(ac.__repr__)
%% Cell type:markdown id: tags:
# pystencils - LLVM generation
The generation of LLVM code is simliar but not identical as seen with the C++ version. For the generation itself a python module ``llvmlite`` is used. This module provides the necessary support and bindings for LLVM. In order to generate from the AST to llvm, the AST needs to be transformed to support type conversions. This is the biggest difference to the C++ version. C++ doesn't need that since the language itself handles the casts.
In this example a simple weighted Jacobi kernel is generated, so the focus remains on the part of LLVM generation.
%% Cell type:code id: tags:
``` python
import sympy as sp
import numpy as np
import ctypes
from pystencils import Field, Assignment
from pystencils import create_kernel
from pystencils.display_utils import to_dot
sp.init_printing()
```
%% Cell type:markdown id: tags:
The numpy arrays (with inital values) create *Field*s for the update Rule. Later those arrays are used for the computation itself.
%% Cell type:code id: tags:
``` python
src_arr = np.zeros((30, 20))
src_arr[0,:] = 1.0
src_arr[:,0] = 1.0
dst_arr = src_arr.copy()
src_field = Field.create_from_numpy_array('src', src_arr)
dst_field = Field.create_from_numpy_array('dst', dst_arr)
```
%% Cell type:markdown id: tags:
Using the *Field* objects and the additional *Symbol* $\omega$ for the weight the update rule is specified as a *sympy* equation.
%% Cell type:code id: tags:
``` python
omega = sp.symbols("omega")
update_rule = Assignment(dst_field[0,0], omega * (src_field[0,1] + src_field[0,-1] + src_field[1,0] + src_field[-1,0]) / 4
+ (1.-omega)*src_field[0,0])
update_rule
```
%% Output
ω⋅(src_E + src_N + src_S + src_W)
dst_C := src_C⋅(-ω + 1.0) + ─────────────────────────────────
4
%% Cell type:markdown id: tags:
With this update rule an abstract syntax tree (AST) can be created. This AST can be used to print the LLVM code. The creation follows the same routines as the C++ version does. However at the end there are two more steps. In order to generate LLVM, type casting and pointer arithmetic had to be introduced (which C++ does for you).
%% Cell type:code id: tags:
``` python
ast = create_kernel([update_rule], target='llvm')
print(str(ast))
```
%% Output
KernelFunction kernel([<double * RESTRICT fd_dst>, <double * RESTRICT const fd_src>, <double omega>])
Block for(ctr_0=1; ctr_0<29; ctr_0+=1)
Block fd_dst_C ← pointer_arithmetic_func(fd_dst, 20*ctr_0)
fd_src_C ← pointer_arithmetic_func(fd_src, 20*ctr_0)
fd_src_E ← pointer_arithmetic_func(fd_src, 20*ctr_0 + 20)
fd_src_W ← pointer_arithmetic_func(fd_src, 20*ctr_0 - 20)
for(ctr_1=1; ctr_1<19; ctr_1+=1)
Block fd_dst_C[ctr_1] ← omega*(fd_src_C[ctr_1 + 1] + fd_src_C[ctr_1 - 1] + fd_src_E[ctr_1] + fd_src_W[ctr_1])/4 + (omega*cast_func(-1, double) + 1.0)*fd_src_C[ctr_1]
%% Cell type:markdown id: tags:
It is possible to examine the AST further.
%% Cell type:code id: tags:
``` python
to_dot(ast)
```
%% Output
<graphviz.files.Source at 0x7f2d14f276a0>
%% Cell type:markdown id: tags:
With transformed AST it is now possible to generate and compile the AST into LLVM. Notice that unlike in C++ version, no files are writen to the hard drive (although it is possible).
There are multiple ways how to generate and compile the AST. The most simple one is simillar to the C++ version. Using the ``compile()`` function of the generated AST
You can also manually create a python function with ``make_python_function``.
Another option is obtaining the jit itself with ``generate_and_jit``.
The function ``generate_and_jit`` first generates and the compiles the AST.
If even more controll is needed, it is possible to use the functions ``generateLLVM`` and ``compileLLVM`` to achieve the same. For further controll, instead of calling ``compileLLVM`` the jit object itself can be created and its necessary functions for the compilation have to be run manually (``parse``, (``optimize``,) ``compile``)
%% Cell type:code id: tags:
``` python
kernel = ast.compile()
#kernel = make_python_function(ast)
# Or alternativally
#jit = generate_and_jit(ast)
# Call: jit('kernel', src_arr, dst_arr)
```
%% Cell type:markdown id: tags:
The compiled function(s) can be used now. Either call the function (with arguments, if not given before) or call the jit object with the function's name and its arguments. Here, numpy arrays are automatically adjusted with ctypes.
The functions and arguments can be read as well.
**All of the information the jit object has comes from the module which was parsed. If you parse a second module and don't run the compilation step, the module and the compiled code are not the same anymore, thus leading to false information**
%% Cell type:code id: tags:
``` python
#jit.print_functions()
```
%% Cell type:code id: tags:
``` python
for i in range(100):
kernel(src=src_arr, dst=dst_arr, omega=2/3)
src_arr, dst_arr = dst_arr, src_arr
```
%% Cell type:markdown id: tags:
The output is drawn with matplotlib.
%% Cell type:code id: tags:
``` python
import matplotlib.pyplot as plt
from matplotlib import cm
fig = plt.figure()
ax = fig.add_subplot(111)
ax.imshow(dst_arr, cmap=cm.jet)
plt.show()
```
%% Output
%% Cell type:code id: tags:
``` python
```
# -*- coding: utf-8 -*-
#
# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
#
# Distributed under terms of the GPLv3 license.
"""
"""
import itertools
import numpy as np
import pytest
import sympy
from sympy.functions import im, re
import pystencils
from pystencils import AssignmentCollection
from pystencils.data_types import TypedSymbol, create_type
X, Y = pystencils.fields('x, y: complex64[2d]')
A, B = pystencils.fields('a, b: float32[2d]')
S1, S2, T = sympy.symbols('S1, S2, T')
TEST_ASSIGNMENTS = [
AssignmentCollection({X[0, 0]: 1j}),
AssignmentCollection({
S1: re(Y.center),
S2: im(Y.center),
X[0, 0]: 2j * S1 + S2
}),
AssignmentCollection({
A.center: re(Y.center),
B.center: im(Y.center),
}),
AssignmentCollection({
Y.center: re(Y.center) + X.center + 2j,
}),
AssignmentCollection({
T: 2 + 4j,
Y.center: X.center / T,
})
]
SCALAR_DTYPES = ['float32', 'float64']
@pytest.mark.parametrize("assignment, scalar_dtypes",
itertools.product(TEST_ASSIGNMENTS, (np.float32,)))
@pytest.mark.parametrize('target', ('cpu', 'gpu'))
def test_complex_numbers(assignment, scalar_dtypes, target):
ast = pystencils.create_kernel(assignment,
target=target,
data_type=scalar_dtypes)
code = str(pystencils.show_code(ast))
print(code)
assert "Not supported" not in code
kernel = ast.compile()
assert kernel is not None
X, Y = pystencils.fields('x, y: complex128[2d]')
A, B = pystencils.fields('a, b: float64[2d]')
S1, S2 = sympy.symbols('S1, S2')
T128 = TypedSymbol('ts', create_type('complex128'))
TEST_ASSIGNMENTS = [
AssignmentCollection({X[0, 0]: 1j}),
AssignmentCollection({
S1: re(Y.center),
S2: im(Y.center),
X[0, 0]: 2j * S1 + S2
}),
AssignmentCollection({
A.center: re(Y.center),
B.center: im(Y.center),
}),
AssignmentCollection({
Y.center: re(Y.center) + X.center + 2j,
}),
AssignmentCollection({
T128: 2 + 4j,
Y.center: X.center / T128,
})
]
SCALAR_DTYPES = ['float64']
@pytest.mark.parametrize("assignment", TEST_ASSIGNMENTS)
@pytest.mark.parametrize('target', ('cpu', 'gpu'))
def test_complex_numbers_64(assignment, target):
ast = pystencils.create_kernel(assignment,
target=target,
data_type='double')
code = str(pystencils.show_code(ast))
print(code)
assert "Not supported" not in code
kernel = ast.compile()
assert kernel is not None
@pytest.mark.parametrize('dtype', (np.float32, np.float64))
@pytest.mark.parametrize('target', ('cpu', 'gpu'))
@pytest.mark.parametrize('with_complex_argument', ('with_complex_argument', False))
def test_complex_execution(dtype, target, with_complex_argument):
complex_dtype = f'complex{64 if dtype ==np.float32 else 128}'
x, y = pystencils.fields(f'x, y: {complex_dtype}[2d]')
x_arr = np.zeros((20, 30), complex_dtype)
y_arr = np.zeros((20, 30), complex_dtype)
if with_complex_argument:
a = pystencils.TypedSymbol('a', create_type(complex_dtype))
else:
a = (2j+1)
assignments = AssignmentCollection({
y.center: x.center + a
})
if target == 'gpu':
from pycuda.gpuarray import zeros
x_arr = zeros((20, 30), complex_dtype)
y_arr = zeros((20, 30), complex_dtype)
kernel = pystencils.create_kernel(assignments, target=target, data_type=dtype).compile()
if with_complex_argument:
kernel(x=x_arr, y=y_arr, a=2j+1)
else:
kernel(x=x_arr, y=y_arr)
if target == 'gpu':
y_arr = y_arr.get()
assert np.allclose(y_arr, 2j+1)
import numpy as np
import sympy as sp
import pystencils as ps
from pystencils.astnodes import Block, Conditional
from pystencils.cpu.vectorization import vec_all, vec_any
def test_vec_any():
data_arr = np.zeros((15, 15))
data_arr[3:9, 2:7] = 1.0
data = ps.fields("data: double[2D]", data=data_arr)
c = [
ps.Assignment(sp.Symbol("t1"), vec_any(data.center() > 0.0)),
Conditional(vec_any(data.center() > 0.0), Block([
ps.Assignment(data.center(), 2.0)
]))
]
ast = ps.create_kernel(c, target='cpu',
cpu_vectorize_info={'instruction_set': 'avx'})
kernel = ast.compile()
kernel(data=data_arr)
np.testing.assert_equal(data_arr[3:9, 0:8], 2.0)
def test_vec_all():
data_arr = np.zeros((15, 15))
data_arr[3:9, 2:7] = 1.0
data = ps.fields("data: double[2D]", data=data_arr)
c = [
Conditional(vec_all(data.center() > 0.0), Block([
ps.Assignment(data.center(), 2.0)
]))
]
ast = ps.create_kernel(c, target='cpu',
cpu_vectorize_info={'instruction_set': 'avx'})
kernel = ast.compile()
before = data_arr.copy()
kernel(data=data_arr)
np.testing.assert_equal(data_arr, before)
def test_boolean_before_loop():
t1, t2 = sp.symbols('t1, t2')
f_arr = np.ones((10, 10))
g_arr = np.zeros_like(f_arr)
f, g = ps.fields("f, g : double[2D]", f=f_arr, g=g_arr)
a = [
ps.Assignment(t1, t2 > 0),
ps.Assignment(g[0, 0],
sp.Piecewise((f[0, 0], t1), (42, True)))
]
ast = ps.create_kernel(a, cpu_vectorize_info={'instruction_set': 'avx'})
kernel = ast.compile()
kernel(f=f_arr, g=g_arr, t2=1.0)
print(g)
np.testing.assert_array_equal(g_arr, 1.0)
kernel(f=f_arr, g=g_arr, t2=-1.0)
np.testing.assert_array_equal(g_arr, 42.0)
import sympy
import pystencils
from pystencils.astnodes import get_dummy_symbol
from pystencils.backends.cuda_backend import CudaSympyPrinter
from pystencils.data_types import address_of
def test_cuda_known_functions():
printer = CudaSympyPrinter()
print(printer.known_functions)
x, y = pystencils.fields('x,y: float32 [2d]')
assignments = pystencils.AssignmentCollection({
get_dummy_symbol(): sympy.Function('atomicAdd')(address_of(y.center()), 2),
y.center(): sympy.Function('rsqrtf')(x[0, 0])
})
ast = pystencils.create_kernel(assignments, 'gpu')
print(pystencils.show_code(ast))
kernel = ast.compile()
assert(kernel is not None)
def test_cuda_but_not_c():
x, y = pystencils.fields('x,y: float32 [2d]')
assignments = pystencils.AssignmentCollection({
get_dummy_symbol(): sympy.Function('atomicAdd')(address_of(y.center()), 2),
y.center(): sympy.Function('rsqrtf')(x[0, 0])
})
ast = pystencils.create_kernel(assignments, 'cpu')
print(pystencils.show_code(ast))
def test_cuda_unknown():
x, y = pystencils.fields('x,y: float32 [2d]')
assignments = pystencils.AssignmentCollection({
get_dummy_symbol(): sympy.Function('wtf')(address_of(y.center()), 2),
})
ast = pystencils.create_kernel(assignments, 'gpu')
code = str(pystencils.show_code(ast))
print(code)
import numpy as np
import waLBerla as wlb
from pystencils.datahandling.parallel_datahandling import ParallelDataHandling
from pystencils_tests.test_datahandling import (
access_and_gather, kernel_execution_jacobi, reduction, synchronization, vtk_output)
def test_access_and_gather():
block_size = (4, 7, 1)
num_blocks = (3, 2, 1)
cells = tuple(a * b for a, b in zip(block_size, num_blocks))
blocks = wlb.createUniformBlockGrid(blocks=num_blocks, cellsPerBlock=block_size, oneBlockPerProcess=False,
periodic=(1, 1, 1))
dh = ParallelDataHandling(blocks, default_ghost_layers=2)
access_and_gather(dh, cells)
synchronization(dh, test_gpu=False)
if hasattr(wlb, 'cuda'):
synchronization(dh, test_gpu=True)
def test_gpu():
if not hasattr(wlb, 'cuda'):
print("Skip GPU tests because walberla was built without CUDA")
return
block_size = (4, 7, 1)
num_blocks = (3, 2, 1)
blocks = wlb.createUniformBlockGrid(blocks=num_blocks, cellsPerBlock=block_size, oneBlockPerProcess=False)
dh = ParallelDataHandling(blocks, default_ghost_layers=2)
dh.add_array('v', values_per_cell=3, dtype=np.int64, ghost_layers=2, gpu=True)
for b in dh.iterate():
b['v'].fill(42)
dh.all_to_gpu()
for b in dh.iterate():
b['v'].fill(0)
dh.to_cpu('v')
for b in dh.iterate():
np.testing.assert_equal(b['v'], 42)
def test_kernel():
for gpu in (True, False):
if gpu and not hasattr(wlb, 'cuda'):
print("Skipping CUDA tests because walberla was built without GPU support")
continue
# 3D
blocks = wlb.createUniformBlockGrid(blocks=(3, 2, 4), cellsPerBlock=(3, 2, 5), oneBlockPerProcess=False)
dh = ParallelDataHandling(blocks)
kernel_execution_jacobi(dh, 'gpu')
reduction(dh)
# 2D
blocks = wlb.createUniformBlockGrid(blocks=(3, 2, 1), cellsPerBlock=(3, 2, 1), oneBlockPerProcess=False)
dh = ParallelDataHandling(blocks, dim=2)
kernel_execution_jacobi(dh, 'gpu')
reduction(dh)
def test_vtk_output():
blocks = wlb.createUniformBlockGrid(blocks=(3, 2, 4), cellsPerBlock=(3, 2, 5), oneBlockPerProcess=False)
dh = ParallelDataHandling(blocks)
vtk_output(dh)
%% Cell type:code id: tags:
``` python
from pystencils.session import *
from pystencils.fd.derivation import *
```
%% Cell type:markdown id: tags:
# 2D standard stencils
%% Cell type:code id: tags:
``` python
stencil = [(-1, 0), (1, 0), (0, -1), (0, 1), (0, 0)]
standard_2d_00 = FiniteDifferenceStencilDerivation((0,0), stencil)
f = ps.fields("f: [2D]")
standard_2d_00_res = standard_2d_00.get_stencil()
res = standard_2d_00_res.apply(f.center)
expected = f[-1, 0] - 2 * f[0, 0] + f[1, 0]
assert res == expected
```
%% Cell type:code id: tags:
``` python
assert standard_2d_00_res.accuracy == 2
assert not standard_2d_00_res.is_isotropic
standard_2d_00_res
```
%% Output
Finite difference stencil of accuracy 2, isotropic error: False
%% Cell type:code id: tags:
``` python
standard_2d_00.get_stencil().as_matrix()
```
%% Output
$$\left[\begin{matrix}0 & 0 & 0\\1 & -2 & 1\\0 & 0 & 0\end{matrix}\right]$$
⎡0 0 0⎤
⎢ ⎥
⎢1 -2 1⎥
⎢ ⎥
⎣0 0 0⎦
%% Cell type:markdown id: tags:
# 2D isotropic stencils
## second x-derivative
%% Cell type:code id: tags:
``` python
stencil = [(i, j) for i in (-1, 0, 1) for j in (-1, 0, 1)]
isotropic_2d_00 = FiniteDifferenceStencilDerivation((0,0), stencil)
isotropic_2d_00_res = isotropic_2d_00.get_stencil(isotropic=True)
assert isotropic_2d_00_res.is_isotropic
assert isotropic_2d_00_res.accuracy == 2
isotropic_2d_00_res
```
%% Output
Finite difference stencil of accuracy 2, isotropic error: True
%% Cell type:code id: tags:
``` python
isotropic_2d_00_res.as_matrix()
```
%% Output
$$\left[\begin{matrix}\frac{1}{12} & - \frac{1}{6} & \frac{1}{12}\\\frac{5}{6} & - \frac{5}{3} & \frac{5}{6}\\\frac{1}{12} & - \frac{1}{6} & \frac{1}{12}\end{matrix}\right]$$
⎡1/12 -1/6 1/12⎤
⎢ ⎥
⎢5/6 -5/3 5/6 ⎥
⎢ ⎥
⎣1/12 -1/6 1/12⎦
%% Cell type:code id: tags:
``` python
plt.figure(figsize=(2,2))
isotropic_2d_00_res.visualize()
```
%% Output
%% Cell type:code id: tags:
``` python
expected_result = sp.Matrix([[1, -2, 1], [10, -20, 10], [1, -2, 1]]) / 12
assert expected_result == isotropic_2d_00_res.as_matrix()
```
%% Cell type:code id: tags:
``` python
type(isotropic_2d_00_res.as_matrix())
```
%% Output
sympy.matrices.dense.MutableDenseMatrix
%% Cell type:code id: tags:
``` python
type(expected_result)
```
%% Output
sympy.matrices.dense.MutableDenseMatrix
%% Cell type:markdown id: tags:
## Isotropic laplacian
%% Cell type:code id: tags:
``` python
isotropic_2d_11 = FiniteDifferenceStencilDerivation((1,1), stencil)
isotropic_2d_11_res = isotropic_2d_11.get_stencil(isotropic=True)
iso_laplacian = isotropic_2d_00_res.as_matrix() + isotropic_2d_11_res.as_matrix()
iso_laplacian
```
%% Output
$$\left[\begin{matrix}\frac{1}{6} & \frac{2}{3} & \frac{1}{6}\\\frac{2}{3} & - \frac{10}{3} & \frac{2}{3}\\\frac{1}{6} & \frac{2}{3} & \frac{1}{6}\end{matrix}\right]$$
⎡1/6 2/3 1/6⎤
⎢ ⎥
⎢2/3 -10/3 2/3⎥
⎢ ⎥
⎣1/6 2/3 1/6⎦
%% Cell type:code id: tags:
``` python
expected_result = sp.Matrix([[1, 4, 1], [4, -20, 4], [1, 4, 1]]) / 6
assert iso_laplacian == expected_result
```
%% Cell type:markdown id: tags:
# stencils for staggered fields
%% Cell type:code id: tags:
``` python
half = sp.Rational(1, 2)
fd_points_ex = (
(half, 0),
(-half, 0),
(half, 1),
(half, -1),
(-half, 1),
(-half, -1)
)
assert set(fd_points_ex) == set(FiniteDifferenceStaggeredStencilDerivation("E", 2).stencil)
fd_points_ey = (
(0, half),
(0, -half),
(-1,-half),
(-1, half),
(1, -half),
(1, half)
)
assert set(fd_points_ey) == set(FiniteDifferenceStaggeredStencilDerivation("N",2).stencil)
fd_points_c = (
(half, half),
(-half, -half),
(half, -half),
(-half, half)
)
assert set(fd_points_c) == set(FiniteDifferenceStaggeredStencilDerivation("NE",2).stencil)
assert len(FiniteDifferenceStaggeredStencilDerivation("E",3).points) == 10
assert len(FiniteDifferenceStaggeredStencilDerivation("NE",3).points) == 12
assert len(FiniteDifferenceStaggeredStencilDerivation("TNE",3).points) == 8
```
%% Cell type:code id: tags:
``` python
c = ps.fields("c: [2D]")
c3 = ps.fields("c3: [3D]")
assert FiniteDifferenceStaggeredStencilDerivation("E", 2, (0,)).apply(c.center) == c[1, 0] - c[0, 0]
assert FiniteDifferenceStaggeredStencilDerivation("W", 2, (0,)).apply(c.center) == c[0, 0] - c[-1, 0]
assert FiniteDifferenceStaggeredStencilDerivation("N", 2, (1,)).apply(c.center) == c[0, 1] - c[0, 0]
assert FiniteDifferenceStaggeredStencilDerivation("S", 2, (1,)).apply(c.center) == c[0, 0] - c[0, -1]
assert FiniteDifferenceStaggeredStencilDerivation("E", 3, (0,)).apply(c3.center) == c3[1, 0, 0] - c3[0, 0, 0]
assert FiniteDifferenceStaggeredStencilDerivation("W", 3, (0,)).apply(c3.center) == c3[0, 0, 0] - c3[-1, 0, 0]
assert FiniteDifferenceStaggeredStencilDerivation("N", 3, (1,)).apply(c3.center) == c3[0, 1, 0] - c3[0, 0, 0]
assert FiniteDifferenceStaggeredStencilDerivation("S", 3, (1,)).apply(c3.center) == c3[0, 0, 0] - c3[0, -1, 0]
assert FiniteDifferenceStaggeredStencilDerivation("T", 3, (2,)).apply(c3.center) == c3[0, 0, 1] - c3[0, 0, 0]
assert FiniteDifferenceStaggeredStencilDerivation("B", 3, (2,)).apply(c3.center) == c3[0, 0, 0] - c3[0, 0, -1]
assert FiniteDifferenceStaggeredStencilDerivation("S", 2, (0,)).apply(c.center) == \
(c[1, 0] + c[1, -1] - c[-1, 0] - c[-1, -1])/4
assert FiniteDifferenceStaggeredStencilDerivation("NE", 2, (0,)).apply(c.center) + \
FiniteDifferenceStaggeredStencilDerivation("NE", 2, (1,)).apply(c.center) == c[1, 1] - c[0, 0]
assert FiniteDifferenceStaggeredStencilDerivation("NE", 3, (0,)).apply(c3.center) + \
FiniteDifferenceStaggeredStencilDerivation("NE", 3, (1,)).apply(c3.center) == c3[1, 1, 0] - c3[0, 0, 0]
```
%% Cell type:code id: tags:
``` python
d = FiniteDifferenceStaggeredStencilDerivation("NE", 2, (0, 1))
assert d.apply(c.center) == c[0,0] + c[1,1] - c[1,0] - c[0,1]
d.visualize()
```
%% Output
%% Cell type:code id: tags:
``` python
v3 = ps.fields("v(3): [3D]")
for i in range(*v3.index_shape):
assert FiniteDifferenceStaggeredStencilDerivation("E", 3, (0,)).apply(v3.center_vector[i]) == \
v3[1,0,0](i) - v3[0,0,0](i)
```
%% Cell type:code id: tags:
``` python
```