Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
No results found
Show changes
Showing
with 0 additions and 3386 deletions
#pragma once
extern "C++" {
#ifdef __CUDA_ARCH__
template <typename DTYPE_T, std::size_t DIMENSION> struct PyStencilsField {
DTYPE_T *data;
DTYPE_T shape[DIMENSION];
DTYPE_T stride[DIMENSION];
};
#else
#include <array>
template <typename DTYPE_T, std::size_t DIMENSION> struct PyStencilsField {
DTYPE_T *data;
std::array<DTYPE_T, DIMENSION> shape;
std::array<DTYPE_T, DIMENSION> stride;
};
#endif
}
#if !defined(__AES__) || !defined(__SSE4_1__)
#error AES-NI and SSE4.1 need to be enabled
#endif
#include <emmintrin.h> // SSE2
#include <wmmintrin.h> // AES
#ifdef __AVX512VL__
#include <immintrin.h> // AVX*
#else
#include <smmintrin.h> // SSE4
#ifdef __FMA__
#include <immintrin.h> // FMA
#endif
#endif
#include <cstdint>
#define QUALIFIERS inline
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
typedef std::uint32_t uint32;
typedef std::uint64_t uint64;
QUALIFIERS __m128i aesni1xm128i(const __m128i & in, const __m128i & k) {
__m128i x = _mm_xor_si128(k, in);
x = _mm_aesenc_si128(x, k); // 1
x = _mm_aesenc_si128(x, k); // 2
x = _mm_aesenc_si128(x, k); // 3
x = _mm_aesenc_si128(x, k); // 4
x = _mm_aesenc_si128(x, k); // 5
x = _mm_aesenc_si128(x, k); // 6
x = _mm_aesenc_si128(x, k); // 7
x = _mm_aesenc_si128(x, k); // 8
x = _mm_aesenc_si128(x, k); // 9
x = _mm_aesenclast_si128(x, k); // 10
return x;
}
QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
{
#ifdef __AVX512VL__
return _mm_cvtepu32_ps(v);
#else
__m128i v2 = _mm_srli_epi32(v, 1);
__m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
__m128 v2f = _mm_cvtepi32_ps(v2);
__m128 v1f = _mm_cvtepi32_ps(v1);
return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
#endif
}
#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5
__attribute__((optimize("no-associative-math")))
#endif
QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
{
#ifdef __AVX512VL__
return _mm_cvtepu64_pd(x);
#else
__m128i xH = _mm_srli_epi64(x, 32);
xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84
__m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
return _mm_add_pd(f, _mm_castsi128_pd(xL));
#endif
}
QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
double & rnd1, double & rnd2)
{
// pack input and call AES
__m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
c128 = aesni1xm128i(c128, k128);
// convert 32 to 64 bit and put 0th and 2nd element into x, 1st and 3rd element into y
__m128i x = _mm_and_si128(c128, _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff));
__m128i y = _mm_and_si128(c128, _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0));
y = _mm_srli_si128(y, 4);
// calculate z = x ^ y << (53 - 32))
__m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm_xor_si128(x, z);
// convert uint64 to double
__m128d rs = _my_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
rs = _mm_fmadd_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE), _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#else
rs = _mm_mul_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE));
rs = _mm_add_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#endif
// store result
alignas(16) double rr[2];
_mm_store_pd(rr, rs);
rnd1 = rr[0];
rnd2 = rr[1];
}
QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
float & rnd1, float & rnd2, float & rnd3, float & rnd4)
{
// pack input and call AES
__m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
c128 = aesni1xm128i(c128, k128);
// convert uint32 to float
__m128 rs = _my_cvtepu32_ps(c128);
// calculate rs * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
rs = _mm_fmadd_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#else
rs = _mm_mul_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rs = _mm_add_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#endif
// store result
alignas(16) float r[4];
_mm_store_ps(r, rs);
rnd1 = r[0];
rnd2 = r[1];
rnd3 = r[2];
rnd4 = r[3];
}
This diff is collapsed.
#ifndef OPENCL_STDINT
#define OPENCL_STDINT
typedef unsigned int uint_t;
typedef signed char int8_t;
typedef signed short int16_t;
typedef signed int int32_t;
typedef signed long int int64_t;
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef unsigned long int uint64_t;
#endif
#include <cstdint>
#ifndef __CUDA_ARCH__
#define QUALIFIERS inline
#else
#define QUALIFIERS static __forceinline__ __device__
#endif
#define PHILOX_W32_0 (0x9E3779B9)
#define PHILOX_W32_1 (0xBB67AE85)
#define PHILOX_M4x32_0 (0xD2511F53)
#define PHILOX_M4x32_1 (0xCD9E8D57)
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
typedef std::uint32_t uint32;
typedef std::uint64_t uint64;
QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
{
#ifndef __CUDA_ARCH__
// host code
uint64 product = ((uint64)a) * ((uint64)b);
*hip = product >> 32;
return (uint32)product;
#else
// device code
*hip = __umulhi(a,b);
return a*b;
#endif
}
QUALIFIERS void _philox4x32round(uint32* ctr, uint32* key)
{
uint32 hi0;
uint32 hi1;
uint32 lo0 = mulhilo32(PHILOX_M4x32_0, ctr[0], &hi0);
uint32 lo1 = mulhilo32(PHILOX_M4x32_1, ctr[2], &hi1);
ctr[0] = hi1^ctr[1]^key[0];
ctr[1] = lo1;
ctr[2] = hi0^ctr[3]^key[1];
ctr[3] = lo0;
}
QUALIFIERS void _philox4x32bumpkey(uint32* key)
{
key[0] += PHILOX_W32_0;
key[1] += PHILOX_W32_1;
}
QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y)
{
uint64 z = (uint64)x ^ ((uint64)y << (53 - 32));
return z * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0);
}
QUALIFIERS void philox_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, double & rnd1, double & rnd2)
{
uint32 key[2] = {key0, key1};
uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1 = _uniform_double_hq(ctr[0], ctr[1]);
rnd2 = _uniform_double_hq(ctr[2], ctr[3]);
}
QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
float & rnd1, float & rnd2, float & rnd3, float & rnd4)
{
uint32 key[2] = {key0, key1};
uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1 = ctr[0] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd2 = ctr[1] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd3 = ctr[2] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd4 = ctr[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
}
\ No newline at end of file
This diff is collapsed.
from .generate_benchmark import generate_benchmark, run_c_benchmark
from .kerncraft_interface import KerncraftParameters, PyStencilsKerncraftKernel
__all__ = ['PyStencilsKerncraftKernel', 'KerncraftParameters', 'generate_benchmark', 'run_c_benchmark']
import subprocess
import warnings
import tempfile
from pathlib import Path
from jinja2 import Environment, PackageLoader, StrictUndefined
from pystencils.astnodes import PragmaBlock
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.cpu.cpujit import get_compiler_config, run_compile_step
from pystencils.data_types import get_base_type
from pystencils.include import get_pystencils_include_path
from pystencils.sympyextensions import prod
def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
"""Return C code of a benchmark program for the given kernel.
Args:
ast: the pystencils AST object as returned by create_kernel
likwid: if True likwid markers are added to the code
openmp: relevant only if likwid=True, to generated correct likwid initialization code
timing: add timing output to the code, prints time per iteration to stdout
Returns:
C code as string
"""
accessed_fields = {f.name: f for f in ast.fields_accessed}
constants = []
fields = []
call_parameters = []
for p in ast.get_parameters():
if not p.is_field_parameter:
constants.append((p.symbol.name, str(p.symbol.dtype)))
call_parameters.append(p.symbol.name)
else:
assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
field = accessed_fields[p.field_name]
dtype = str(get_base_type(p.symbol.dtype))
fields.append((p.field_name, dtype, prod(field.shape)))
call_parameters.append(p.field_name)
header_list = get_headers(ast)
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
# Strip "#pragma omp parallel" from within kernel, because main function takes care of that
# when likwid and openmp are enabled
if likwid and openmp:
if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock):
ast.body.args[0].pragma_line = ''
jinja_context = {
'likwid': likwid,
'openmp': openmp,
'kernel_code': generate_c(ast, dialect='c'),
'kernelName': ast.function_name,
'fields': fields,
'constants': constants,
'call_argument_list': ",".join(call_parameters),
'includes': includes,
'timing': timing,
}
env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
return env.get_template('benchmark.c').render(**jinja_context)
def run_c_benchmark(ast, inner_iterations, outer_iterations=3, path=None):
"""Runs the given kernel with outer loop in C
Args:
ast: pystencils ast which is used to compile the benchmark file
inner_iterations: timings are recorded around this many iterations
outer_iterations: number of timings recorded
path: path where the benchmark file is stored. If None a tmp folder is created
Returns:
list of times per iterations for each outer iteration
"""
import kerncraft
benchmark_code = generate_benchmark(ast, timing=True)
if path is None:
path = tempfile.mkdtemp()
if isinstance(path, str):
path = Path(path)
with open(path / 'bench.c', 'w') as f:
f.write(benchmark_code)
kerncraft_path = Path(kerncraft.__file__).parent
extra_flags = ['-I' + get_pystencils_include_path(),
'-I' + str(kerncraft_path / 'headers')]
compiler_config = get_compiler_config()
compile_cmd = [compiler_config['command']] + compiler_config['flags'].split()
compile_cmd += [*extra_flags,
kerncraft_path / 'headers' / 'timing.c',
kerncraft_path / 'headers' / 'dummy.c',
path / 'bench.c',
'-o', path / 'bench',
]
run_compile_step(compile_cmd)
time_pre_estimation_per_iteration = float(subprocess.check_output(['./' / path / 'bench', str(10)]))
benchmark_time_limit = 20
if benchmark_time_limit / time_pre_estimation_per_iteration < inner_iterations:
warn = (f"A benchmark run with {inner_iterations} inner_iterations will probably take longer than "
f"{benchmark_time_limit} seconds for this kernel")
warnings.warn(warn)
results = []
for _ in range(outer_iterations):
benchmark_time = float(subprocess.check_output(['./' / path / 'bench', str(inner_iterations)]))
results.append(benchmark_time)
return results
import warnings
import fcntl
from collections import defaultdict
from tempfile import TemporaryDirectory
from typing import Optional
from jinja2 import Environment, PackageLoader, StrictUndefined
import sympy as sp
from kerncraft.kerncraft import KernelCode
from kerncraft.machinemodel import MachineModel
from pystencils.astnodes import (KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment)
from pystencils.field import get_layout_from_strides
from pystencils.sympyextensions import count_operations_in_ast
from pystencils.transformations import filtered_tree_iteration
from pystencils.utils import DotDict
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.cpu.kernelcreation import add_openmp
class PyStencilsKerncraftKernel(KernelCode):
"""
Implementation of kerncraft's kernel interface for pystencils CPU kernels.
Analyses a list of equations assuming they will be executed on a CPU
"""
LIKWID_BASE = '/usr/local/likwid'
def __init__(self, ast: KernelFunction, machine: Optional[MachineModel] = None,
assumed_layout='SoA', debug_print=False, filename=None):
"""Create a kerncraft kernel using a pystencils AST
Args:
ast: pystencils ast
machine: kerncraft machine model - specify this if kernel needs to be compiled
assumed_layout: either 'SoA' or 'AoS' - if fields have symbolic sizes the layout of the index
coordinates is not known. In this case either a structures of array (SoA) or
array of structures (AoS) layout is assumed
debug_print: print debug information
filename: used for caching
"""
super(KernelCode, self).__init__(machine=machine)
# Initialize state
self.asm_block = None
self._filename = filename
self.kernel_ast = ast
self.temporary_dir = TemporaryDirectory()
self._keep_intermediates = debug_print
# Loops
inner_loops = [l for l in filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment)
if l.is_innermost_loop]
if len(inner_loops) == 0:
raise ValueError("No loop found in pystencils AST")
else:
if len(inner_loops) > 1:
warnings.warn("pystencils AST contains multiple inner loops. "
"Only one can be analyzed - choosing first one")
inner_loop = inner_loops[0]
self._loop_stack = []
cur_node = inner_loop
while cur_node is not None:
if isinstance(cur_node, LoopOverCoordinate):
loop_counter_sym = cur_node.loop_counter_symbol
loop_info = (loop_counter_sym.name, cur_node.start, cur_node.stop, 1)
# If the correct step were to be provided, all access within that step length will
# also need to be passed to kerncraft: cur_node.step)
self._loop_stack.append(loop_info)
cur_node = cur_node.parent
self._loop_stack = list(reversed(self._loop_stack))
# Data sources & destinations
self.sources = defaultdict(list)
self.destinations = defaultdict(list)
def get_layout_tuple(f):
if f.has_fixed_shape:
return get_layout_from_strides(f.strides)
else:
layout_list = list(f.layout)
for _ in range(f.index_dimensions):
layout_list.insert(0 if assumed_layout == 'SoA' else -1, max(layout_list) + 1)
return layout_list
reads, writes = search_resolved_field_accesses_in_ast(inner_loop)
for accesses, target_dict in [(reads, self.sources), (writes, self.destinations)]:
for fa in accesses:
coord = [sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i), positive=True, integer=True) + off
for i, off in enumerate(fa.offsets)]
coord += list(fa.idx_coordinate_values)
layout = get_layout_tuple(fa.field)
permuted_coord = [sp.sympify(coord[i]) for i in layout]
target_dict[fa.field.name].append(permuted_coord)
# Variables (arrays)
fields_accessed = ast.fields_accessed
for field in fields_accessed:
layout = get_layout_tuple(field)
permuted_shape = list(field.shape[i] for i in layout)
self.set_variable(field.name, tuple([str(field.dtype)]), tuple(permuted_shape))
# Scalars may be safely ignored
# for param in ast.get_parameters():
# if not param.is_field_parameter:
# # self.set_variable(param.symbol.name, str(param.symbol.dtype), None)
# self.sources[param.symbol.name] = [None]
# data type
self.datatype = list(self.variables.values())[0][0]
# flops
operation_count = count_operations_in_ast(inner_loop)
self._flops = {
'+': operation_count['adds'],
'*': operation_count['muls'],
'/': operation_count['divs'],
}
for k in [k for k, v in self._flops.items() if v == 0]:
del self._flops[k]
self.check()
if debug_print:
from pprint import pprint
print("----------------------------- Loop Stack --------------------------")
pprint(self._loop_stack)
print("----------------------------- Sources -----------------------------")
pprint(self.sources)
print("----------------------------- Destinations ------------------------")
pprint(self.destinations)
print("----------------------------- FLOPS -------------------------------")
pprint(self._flops)
def get_kernel_header(self, name='pystencils_kernel'):
file_name = "pystencils_kernel.h"
file_path = self.get_intermediate_location(file_name, machine_and_compiler_dependent=False)
lock_mode, lock_fp = self.lock_intermediate(file_path)
if lock_mode == fcntl.LOCK_EX:
function_signature = generate_c(self.kernel_ast, dialect='c', signature_only=True)
jinja_context = {
'function_signature': function_signature,
}
env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
file_header = env.get_template('kernel.h').render(**jinja_context)
with open(file_path, 'w') as f:
f.write(file_header)
fcntl.flock(lock_fp, fcntl.LOCK_SH) # degrade to shared lock
return file_path, lock_fp
def get_kernel_code(self, openmp=False, name='pystencils_kernl'):
"""
Generate and return compilable source code.
Args:
openmp: if true, openmp code will be generated
name: kernel name
"""
filename = 'pystencils_kernl'
if openmp:
filename += '-omp'
filename += '.c'
file_path = self.get_intermediate_location(filename, machine_and_compiler_dependent=False)
lock_mode, lock_fp = self.lock_intermediate(file_path)
if lock_mode == fcntl.LOCK_EX:
header_list = get_headers(self.kernel_ast)
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
if openmp:
add_openmp(self.kernel_ast)
kernel_code = generate_c(self.kernel_ast, dialect='c')
jinja_context = {
'includes': includes,
'kernel_code': kernel_code,
}
env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
file_header = env.get_template('kernel.c').render(**jinja_context)
with open(file_path, 'w') as f:
f.write(file_header)
fcntl.flock(lock_fp, fcntl.LOCK_SH) # degrade to shared lock
return file_path, lock_fp
class KerncraftParameters(DotDict):
def __init__(self, **kwargs):
super(KerncraftParameters, self).__init__(**kwargs)
self['asm_block'] = 'auto'
self['asm_increment'] = 0
self['cores'] = 1
self['cache_predictor'] = 'SIM'
self['verbose'] = 0
self['pointer_increment'] = 'auto'
self['iterations'] = 10
self['unit'] = 'cy/CL'
self['ignore_warnings'] = True
self['incore_model'] = 'OSACA'
# ------------------------------------------- Helper functions ---------------------------------------------------------
def search_resolved_field_accesses_in_ast(ast):
def visit(node, reads, writes):
if not isinstance(node, SympyAssignment):
for a in node.args:
visit(a, reads, writes)
return
for expr, accesses in [(node.lhs, writes), (node.rhs, reads)]:
accesses.update(expr.atoms(ResolvedFieldAccess))
read_accesses = set()
write_accesses = set()
visit(ast, read_accesses, write_accesses)
return read_accesses, write_accesses
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include <stdio.h>
{{ includes }}
{%- if likwid %}
#include <likwid.h>
{%- endif %}
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(void *);
void timing(double* wcTime, double* cpuTime);
extern int var_false;
{{kernel_code}}
int main(int argc, char **argv)
{
{%- if likwid %}
likwid_markerInit();
{%- endif %}
{%- for field_name, dataType, size in fields %}
// Initialization {{field_name}}
double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
for (unsigned long long i = 0; i < {{size}}; ++i)
{{field_name}}[i] = 0.23;
if(var_false)
dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
// Constant {{constantName}}
{{dataType}} {{constantName}};
{{constantName}} = 0.23;
if(var_false)
dummy(& {{constantName}});
{%- endfor %}
{%- if likwid and openmp %}
#pragma omp parallel
{
likwid_markerRegisterRegion("loop");
#pragma omp barrier
{%- elif likwid %}
likwid_markerRegisterRegion("loop");
{%- endif %}
for(int warmup = 1; warmup >= 0; --warmup) {
int repeat = 2;
if(warmup == 0) {
repeat = atoi(argv[1]);
{%- if likwid %}
likwid_markerStartRegion("loop");
{%- endif %}
}
{%- if timing %}
double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
timing(&wcStartTime, &cpuStartTime);
{%- endif %}
for (; repeat > 0; --repeat)
{
{{kernelName}}({{call_argument_list}});
// Dummy calls
{%- for field_name, dataType, size in fields %}
if(var_false) dummy((void*){{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy((void*)&{{constantName}});
{%- endfor %}
}
{%- if timing %}
timing(&wcEndTime, &cpuEndTime);
if( warmup == 0)
printf("%e\n", (wcEndTime - wcStartTime) / atoi(argv[1]) );
{%- endif %}
}
{%- if likwid %}
likwid_markerStopRegion("loop");
{%- if openmp %}
}
{%- endif %}
{%- endif %}
{%- if likwid %}
likwid_markerClose();
{%- endif %}
}
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include <stdio.h>
{{ includes }}
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(void *);
void timing(double* wcTime, double* cpuTime);
extern int var_false;
{{kernel_code}}
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.