Commit e9eb29b9 authored by Martin Bauer
parent 649e82b4

PACXX benchmark generation

import os
from time import perf_counter
import subprocess
from tempfile import TemporaryDirectory
from pystencils import create_data_handling
from pystencils.backends.cbackend import CBackend, generate_c
from jinja2 import Environment, FileSystemLoader

script_path = os.path.dirname(os.path.realpath(__file__))

# machine-specific path to the local PACXX installation
PAXX_ROOT = '/local/bauer/code/pacxx/install'
DEFAULT_PAXX_COMPILE_OPTIONS = ('-Ofast', '-march=native')


def generate_benchmark_code(target_file, kernel_ast, target):
    assert target in ('cpu', 'gpu')
    assert hasattr(kernel_ast, 'indexing'), "AST has to be a CUDA kernel in order to create a PACXX kernel from it"
    backend = CBackend()
    function_body = kernel_ast.body
    f_sizes = {f.shape[-1] for f in kernel_ast.fields_accessed}
    assert len(f_sizes) == 1

    env = Environment(loader=FileSystemLoader(script_path))
    result = env.get_template("benchmark_template.cpp").render(f_size=f_sizes.pop(),
                                                               code=backend(function_body),
                                                               target=target)
    with open(target_file, 'w') as f:
        f.write(result)


def pacxx_compile(source, executable, options=DEFAULT_PAXX_COMPILE_OPTIONS):
    command = ['pacxx++', *options, source, '-o', executable]
    env = os.environ.copy()
    env['PATH'] = "{}:{}".format(env.get('PATH', ''), os.path.join(PAXX_ROOT, 'bin'))
    env['LD_LIBRARY_PATH'] = "{}:{}".format(env.get('LD_LIBRARY_PATH', ''), os.path.join(PAXX_ROOT, 'lib'))
    try:
        subprocess.check_output(command, env=env, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        print(" ".join(command))
        print(e.output.decode('utf8'))
        raise e
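
# Example: pacxx_compile('code.cpp', 'bench') invokes, with PACXX's bin/ and lib/
# directories appended to PATH and LD_LIBRARY_PATH,
#     pacxx++ -Ofast -march=native code.cpp -o bench

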
def run_paxx_benchmark(executable, domain_size, iterations):
    assert len(domain_size) == 3
    arguments = [executable, *domain_size, iterations]
    arguments = [str(e) for e in arguments]
    output = subprocess.check_output(arguments)
    return float(output) / iterations
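
# Note: the generated benchmark executable prints its total runtime in seconds to stdout
# (see benchmark_template.cpp below), so run_paxx_benchmark() returns seconds per outer iteration.

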
def paxx_benchmark(ast, domain_size, iterations, target='cpu', compile_options=DEFAULT_PAXX_COMPILE_OPTIONS):
    """Generates, compiles and runs the kernel with PACXX.

    Args:
        ast: pystencils AST object (has to be generated for CUDA, even when run on CPU with pacxx)
        domain_size: x, y, z extent of the spatial domain
        iterations: number of outer iterations
        target: either 'cpu' or 'gpu' to specify where pacxx should run the kernel
        compile_options: compile options for pacxx

    Returns:
        seconds for one outer iteration
    """
    with TemporaryDirectory() as base_dir:
        code = os.path.join(base_dir, 'code.cpp')
        executable = os.path.join(base_dir, 'bench')
        generate_benchmark_code(code, ast, target)
        pacxx_compile(code, executable, compile_options)
        time_per_iteration = run_paxx_benchmark(executable, domain_size, iterations)
        return time_per_iteration
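
# Minimal usage sketch (not part of the original commit): the helper name and the
# domain/iteration numbers below are illustrative. The AST has to be created for the
# GPU target because generate_benchmark_code() only accepts CUDA kernel ASTs, even
# when the PACXX binary is later run on the CPU.
def _example_paxx_benchmark_run():
    from lbmpy.creationfunctions import create_lb_ast
    cuda_ast = create_lb_ast(stencil='D3Q19', relaxation_rate=1.8,
                             optimization={'target': 'gpu'})
    # compile and run the generated PACXX benchmark on the CPU for 100 outer iterations
    return paxx_benchmark(cuda_ast, domain_size=(128, 64, 32), iterations=100, target='cpu')

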
def lbm_performance_compare(domain_size, iterations, **lb_params):
    """Runs the benchmark with pacxx and with the normal pystencils backends.

    Args:
        domain_size: 3-tuple with the size of the spatial domain
        iterations: number of outer iterations
        **lb_params: parameters passed to lbmpy to choose the lattice Boltzmann algorithm & optimization options

    Returns:
        dictionary with the measured time per iteration for each backend
    """
    import pycuda.driver as drv
    from lbmpy.creationfunctions import create_lb_ast

    if 'optimization' not in lb_params:
        lb_params['optimization'] = {}
    lb_params['optimization']['target'] = 'cpu'
    cpu_ast = create_lb_ast(**lb_params)
    lb_params['optimization']['target'] = 'gpu'
    gpu_ast = create_lb_ast(**lb_params)

    # print kernel code of the CPU and GPU versions - just for comparison, the files are not used
    with open("pystencils_cpu_code.c", 'w') as f:
        print(generate_c(cpu_ast), file=f)
    with open("pystencils_gpu_code.cu", 'w') as f:
        print(generate_c(gpu_ast), file=f)

    cpu_kernel = cpu_ast.compile()
    gpu_kernel = gpu_ast.compile()

    f_sizes = {f.shape[-1] for f in cpu_ast.fields_accessed}
    assert len(f_sizes) == 1
    f_size = f_sizes.pop()

    dh = create_data_handling(domain_size, default_target='gpu', default_layout='fzyx')
    dh.add_array('src', values_per_cell=f_size)
    dh.add_array('dst', values_per_cell=f_size)
    dh.fill('src', 0)
    dh.fill('dst', 0)

    # to keep it simple, the outer loop is run directly from Python;
    # make the domain size large enough, otherwise we only measure the Python call overhead
    def run_benchmark(kernel):
        dh.all_to_gpu()
        for i in range(10):  # warm-up
            dh.run_kernel(kernel)
        drv.Context.synchronize()

        start = perf_counter()
        for i in range(iterations):
            dh.run_kernel(kernel)
        drv.Context.synchronize()
        return (perf_counter() - start) / iterations

    return {
        'pystencils_cpu': run_benchmark(cpu_kernel),
        'pystencils_gpu': run_benchmark(gpu_kernel),
        'pacxx_cpu': paxx_benchmark(gpu_ast, domain_size, iterations, target='cpu'),
        'pacxx_gpu': paxx_benchmark(gpu_ast, domain_size, iterations, target='gpu'),
    }


if __name__ == '__main__':
    no_opt = {
        'openmp': 8,  # number of threads - pacxx also uses the hyper-threading cores
        'split': False,
        'vectorization': False,
        'gpu_indexing_params': {'block_size': (64, 8, 1)},
    }
    only_vectorization = {
        'openmp': 4,
        'split': False,
        'gpu_indexing_params': {'block_size': (64, 8, 1)},
        'vectorization': {'instruction_set': 'avx',
                          'assume_inner_stride_one': True,
                          'nontemporal': False},
    }
    best = {
        'openmp': 4,
        'split': True,
        'gpu_indexing_params': {'block_size': (64, 8, 1)},
        'vectorization': {'instruction_set': 'avx',
                          'assume_inner_stride_one': True,
                          'nontemporal': True},
    }
    # choose one of the optimization configurations above (no_opt, only_vectorization, best)
    res = lbm_performance_compare(stencil='D3Q19', relaxation_rate=1.8, compressible=False,
                                  domain_size=(512, 128, 32), iterations=500,
                                  optimization=only_vectorization)
    # positive percentages mean that the pacxx kernel is slower than the corresponding pystencils kernel
    cpu_speedup = ((res['pacxx_cpu'] / res['pystencils_cpu']) - 1) * 100
    gpu_speedup = ((res['pacxx_gpu'] / res['pystencils_gpu']) - 1) * 100
    print("Time for one kernel call [s]")
    for config_name, time in res.items():
        print(" {0: <16}: {1}".format(config_name, time))
    print("CPU {:.02f}% GPU {:.02f}%".format(cpu_speedup, gpu_speedup))

benchmark_template.cpp:

#include <PACXX.h>
#include <vector>
#include <sstream>
#include <iostream>
#include <chrono>

using namespace pacxx::v2;

size_t division_round_up(size_t a, size_t b)
{
    if (a % b == 0)
        return a / b;
    else
        return (a / b) + 1;
}
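// For example, division_round_up(512, 64) == 8 and division_round_up(130, 64) == 3;
// it is used below to compute how many blocks are needed to cover each grid dimension.
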
int main(int argc, char** argv)
{
    {% if target == 'cpu' %}
    Executor::Create<NativeRuntime>(0);
    {% elif target == 'gpu' %}
    Executor::Create<CUDARuntime>(0);
    {% endif %}

    if (argc != 5) {
        std::cout << "Usage: ./benchmark xSize ySize zSize iterations" << std::endl;
        return 1;
    }

    Dimension3 domainSize;
    int64_t iterations;
    auto &exec = Executor::get(0);

    std::stringstream(argv[1]) >> domainSize.x;
    std::stringstream(argv[2]) >> domainSize.y;
    std::stringstream(argv[3]) >> domainSize.z;
    std::stringstream(argv[4]) >> iterations;

    // add ghost layers to be comparable to the pystencils native backend
    domainSize.x += 2;
    domainSize.y += 2;
    domainSize.z += 2;

    int64_t totalSize = domainSize.x * domainSize.y * domainSize.z * {{f_size}};
    std::vector<double> src(totalSize, 0.0);
    std::vector<double> dst(totalSize, 0.0);

    auto &dsrc = exec.allocate<double>(src.size());
    auto &ddst = exec.allocate<double>(dst.size());
    dsrc.upload(src.data(), src.size());
    ddst.upload(dst.data(), dst.size());

    double *_data_src = dsrc.get();
    double *_data_dst = ddst.get();

    const int64_t _size_src_0 = domainSize.x;
    const int64_t _size_src_1 = domainSize.y;
    const int64_t _size_src_2 = domainSize.z;

    // fzyx layout
    const int64_t _stride_src_0 = 1;
    const int64_t _stride_src_1 = domainSize.x;
    const int64_t _stride_src_2 = domainSize.x * domainSize.y;
    const int64_t _stride_src_3 = domainSize.x * domainSize.y * domainSize.z;
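    // With these strides, value f of cell (x, y, z) is located at
    // _data_src[x * _stride_src_0 + y * _stride_src_1 + z * _stride_src_2 + f * _stride_src_3].
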
    auto pacxxKernel = [=](range &config) {
        struct Vec3D { int x; int y; int z; };
        const Vec3D blockDim  = { config.get_block_size(0), config.get_block_size(1), config.get_block_size(2) };
        const Vec3D blockIdx  = { config.get_block(0), config.get_block(1), config.get_block(2) };
        const Vec3D threadIdx = { config.get_local(0), config.get_local(1), config.get_local(2) };
        {{ code|indent(8) }}
    };
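    // blockDim, blockIdx and threadIdx inside the lambda mimic the CUDA built-ins, so the
    // kernel body generated from the CUDA AST can be pasted in unchanged.
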
    size_t blockSize[] = {64, 8, 1};
    KernelConfiguration config({ division_round_up(domainSize.x - 2, blockSize[0]),
                                 division_round_up(domainSize.y - 2, blockSize[1]),
                                 division_round_up(domainSize.z - 2, blockSize[2]) },
                               { blockSize[0],
                                 blockSize[1],
                                 blockSize[2] });
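    // Example grid: for the 512 x 128 x 32 domain used by the Python driver (plus the two
    // ghost layers added above), the interior is 512 x 128 x 32 cells, giving a grid of
    // 8 x 16 x 32 blocks of 64 x 8 x 1 threads.
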
    // warm up
    for (int64_t i = 0; i < 10; ++i) {
        exec.launch(pacxxKernel, config);
    }
    exec.synchronize();

    auto start = std::chrono::high_resolution_clock::now();
    for (int64_t i = 0; i < iterations; ++i) {
        exec.launch(pacxxKernel, config);
    }
    exec.synchronize();
    auto duration = std::chrono::high_resolution_clock::now() - start;
    auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(duration);

    // print total runtime in seconds; the Python driver divides by the iteration count
    std::cout << ns.count() * 1e-9 << std::endl;
}