Commit e9eb29b9 authored by Martin Bauer

PACXX benchmark generation

parent 649e82b4
import os
from time import perf_counter
import subprocess
from tempfile import TemporaryDirectory
from pystencils import create_data_handling
from pystencils.backends.cbackend import CBackend
from jinja2 import Environment, FileSystemLoader
from pystencils.backends.cbackend import generate_c
script_path = os.path.dirname(os.path.realpath(__file__))
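# path to the local PACXX installation - adjust PAXX_ROOT to the respective system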
PAXX_ROOT = '/local/bauer/code/pacxx/install'
DEFAULT_PAXX_COMPILE_OPTIONS = ('-Ofast', '-march=native')
def generate_benchmark_code(target_file, kernel_ast, target):
assert target in ('cpu', 'gpu')
assert hasattr(kernel_ast, 'indexing'), "AST has to be a CUDA kernel in order to create a PACXX kernel from it"
backend = CBackend()
function_body = kernel_ast.body
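    # all accessed fields must store the same number of values per cell, since the
    # benchmark template allocates the src/dst arrays with a single f_size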
f_sizes = {f.shape[-1] for f in kernel_ast.fields_accessed}
assert len(f_sizes) == 1
env = Environment(loader=FileSystemLoader(script_path))
result = env.get_template("benchmark_template.cpp").render(f_size=f_sizes.pop(),
code=backend(function_body),
target=target)
with open(target_file, 'w') as f:
f.write(result)
def pacxx_compile(source, executable, options=DEFAULT_PAXX_COMPILE_OPTIONS):
command = ['pacxx++', *options, source, '-o', executable, ]
env = os.environ.copy()
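    # make the pacxx toolchain usable from the subprocess: append its bin/ and
    # lib/ directories to PATH and LD_LIBRARY_PATH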
env['PATH'] = "{}:{}".format(env.get('PATH', ''), os.path.join(PAXX_ROOT, 'bin'))
env['LD_LIBRARY_PATH'] = "{}:{}".format(env.get('LD_LIBRARY_PATH', ''), os.path.join(PAXX_ROOT, 'lib'))
try:
subprocess.check_output(command, env=env, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
print(" ".join(command))
print(e.output.decode('utf8'))
raise e
def run_paxx_benchmark(executable, domain_size, iterations):
assert len(domain_size) == 3
arguments = [executable, *domain_size, iterations]
arguments = [str(e) for e in arguments]
output = subprocess.check_output(arguments)
return float(output) / iterations
def paxx_benchmark(ast, domain_size, iterations, target='cpu', compile_options=DEFAULT_PAXX_COMPILE_OPTIONS):
"""Generates, compiles and runs the kernel with PAXX
Args:
ast: pystencils AST object (has to be generated for CUDA, even when run on CPU with pacxx)
domain_size: x, y, z extent of spatial domain
iterations: number of outer iterations
target: either 'cpu' or 'gpu' to specify where pacxx should run the kernel
compile_options: compile options for pacxx
Returns:
seconds for one outer iteration
"""
with TemporaryDirectory() as base_dir:
code = os.path.join(base_dir, 'code.cpp')
executable = os.path.join(base_dir, 'bench')
generate_benchmark_code(code, ast, target)
pacxx_compile(code, executable, compile_options)
time_per_iteration = run_paxx_benchmark(executable, domain_size, iterations)
return time_per_iteration
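# Minimal usage sketch (hypothetical names, assuming a pystencils AST generated
# for CUDA, e.g. with lbmpy's create_lb_ast and optimization={'target': 'gpu'}):
#
#   time_cpu = paxx_benchmark(gpu_ast, domain_size=(256, 128, 32), iterations=100, target='cpu')
#   print("seconds per outer iteration:", time_cpu)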
def lbm_performance_compare(domain_size, iterations, **lb_params):
"""Runs benchmark with pacxx and with normal pystencils backends.
Args:
domain_size: 3-tuple with size of spatial domain
iterations: number of outer iterations
**lb_params: parameters passed to lbmpy to choose lattice Boltzmann algorithm & optimization options
Returns:
dictionary with measurements of time per iteration for different backends
"""
import pycuda.driver as drv
from lbmpy.creationfunctions import create_lb_ast
if 'optimization' not in lb_params:
lb_params['optimization'] = {}
lb_params['optimization']['target'] = 'cpu'
cpu_ast = create_lb_ast(**lb_params)
lb_params['optimization']['target'] = 'gpu'
gpu_ast = create_lb_ast(**lb_params)
    # print the kernel code of the CPU and GPU versions - just for comparison, the files are not used further
with open("pystencils_cpu_code.c", 'w') as f:
print(generate_c(cpu_ast), file=f)
with open("pystencils_gpu_code.cu", 'w') as f:
print(generate_c(gpu_ast), file=f)
cpu_kernel = cpu_ast.compile()
gpu_kernel = gpu_ast.compile()
f_sizes = {f.shape[-1] for f in cpu_ast.fields_accessed}
assert len(f_sizes) == 1
f_size = f_sizes.pop()
dh = create_data_handling(domain_size, default_target='gpu', default_layout='fzyx')
dh.add_array('src', values_per_cell=f_size)
dh.add_array('dst', values_per_cell=f_size)
dh.fill('src', 0)
dh.fill('dst', 0)
    # to keep it simple we run the outer loop directly from Python
    # make the domain size large enough, otherwise we mostly measure the Python call overhead
def run_benchmark(kernel):
dh.all_to_gpu()
for i in range(10): # warmup
dh.run_kernel(kernel)
drv.Context.synchronize()
start = perf_counter()
for i in range(iterations):
dh.run_kernel(kernel)
drv.Context.synchronize()
return (perf_counter() - start) / iterations
return {
'pystencils_cpu': run_benchmark(cpu_kernel),
'pystencils_gpu': run_benchmark(gpu_kernel),
'pacxx_cpu': paxx_benchmark(gpu_ast, domain_size, iterations, target='cpu'),
'pacxx_gpu': paxx_benchmark(gpu_ast, domain_size, iterations, target='gpu'),
}
if __name__ == '__main__':
no_opt = {
        'openmp': 8,  # number of threads - pacxx also uses HT cores
'split': False,
'vectorization': False,
'gpu_indexing_params': {'block_size': (64, 8, 1)},
}
only_vectorization = {
'openmp': 4,
'split': False,
'gpu_indexing_params': {'block_size': (64, 8, 1)},
'vectorization': {'instruction_set': 'avx',
'assume_inner_stride_one': True,
'nontemporal': False},
}
best = {
'openmp': 4,
'split': True,
'gpu_indexing_params': {'block_size': (64, 8, 1)},
'vectorization': {'instruction_set': 'avx',
'assume_inner_stride_one': True,
'nontemporal': True}
}
res = lbm_performance_compare(stencil='D3Q19', relaxation_rate=1.8, compressible=False,
domain_size=(512, 128, 32), iterations=500,
optimization=only_vectorization)
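    # relative runtime difference in percent: positive values mean the pacxx kernel
    # took longer per iteration than the corresponding pystencils kernel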
cpu_speedup = ((res['pacxx_cpu'] / res['pystencils_cpu']) - 1) * 100
gpu_speedup = ((res['pacxx_gpu'] / res['pystencils_gpu']) - 1) * 100
print("Time for one kernel call [s]")
for config_name, time in res.items():
print(" {0: <16}: {1}".format(config_name, time))
print("CPU {:.02f}% GPU {:.02f}%".format(cpu_speedup, gpu_speedup))
#include <PACXX.h>
#include <vector>
#include <sstream>
#include <iostream>
#include <chrono>
using namespace pacxx::v2;
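// integer division rounded up, e.g. division_round_up(130, 64) == 3
// (used below to compute how many blocks are needed to cover the domain)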
size_t division_round_up(size_t a, size_t b)
{
if( a % b == 0)
return a / b;
else
return (a / b) + 1;
}
int main(int argc, char** argv)
{
{% if target == 'cpu' %}
Executor::Create<NativeRuntime>(0);
{% elif target == 'gpu' %}
Executor::Create<CUDARuntime>(0);
{% endif %}
if( argc != 5 ) {
std::cout << "Usage: ./benchmark xSize ySize zSize iterations" << std::endl;
return 1;
}
Dimension3 domainSize;
int64_t iterations;
auto &exec = Executor::get(0);
std::stringstream( argv[1] ) >> domainSize.x;
std::stringstream( argv[2] ) >> domainSize.y;
std::stringstream( argv[3] ) >> domainSize.z;
std::stringstream( argv[4] ) >> iterations;
    // add ghost layers to be comparable to the pystencils native backend
domainSize.x += 2;
domainSize.y += 2;
domainSize.z += 2;
int64_t totalSize = domainSize.x * domainSize.y * domainSize.z * {{f_size}};
std::vector<double> src( totalSize, 0.0 );
std::vector<double> dst( totalSize, 0.0 );
auto & dsrc = exec.allocate<double>(src.size());
auto & ddst = exec.allocate<double>(dst.size());
dsrc.upload(src.data(), src.size());
ddst.upload(dst.data(), dst.size());
double * _data_src = dsrc.get();
double * _data_dst = ddst.get();
const int64_t _size_src_0 = domainSize.x;
const int64_t _size_src_1 = domainSize.y;
const int64_t _size_src_2 = domainSize.z;
// fzyx layout
const int64_t _stride_src_0 = 1;
const int64_t _stride_src_1 = domainSize.x;
const int64_t _stride_src_2 = domainSize.x * domainSize.y;
const int64_t _stride_src_3 = domainSize.x * domainSize.y * domainSize.z;
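    // with this fzyx layout the linear index of value f at cell (x, y, z) is
    // x * _stride_src_0 + y * _stride_src_1 + z * _stride_src_2 + f * _stride_src_3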
auto pacxxKernel = [=]( range & config ) {
struct Vec3D {int x; int y; int z; };
const Vec3D blockDim = { config.get_block_size(0), config.get_block_size(1), config.get_block_size(2) };
const Vec3D blockIdx = { config.get_block(0), config.get_block(1), config.get_block(2) };
const Vec3D threadIdx = { config.get_local(0), config.get_local(1), config.get_local(2) };
{{ code|indent(8) }}
};
size_t blockSize[] = {64, 8, 1};
KernelConfiguration config( { division_round_up(domainSize.x - 2, blockSize[0]),
division_round_up(domainSize.y - 2, blockSize[1]),
                                  division_round_up(domainSize.z - 2, blockSize[2]) },
{ blockSize[0],
blockSize[1],
blockSize[2] });
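    // the grid covers only the interior cells (domain size minus the two ghost
    // layers added above), matching the iteration space of the generated kernel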
// warm up
for( int64_t i = 0; i < 10; ++i ) {
exec.launch(pacxxKernel, config);
}
exec.synchronize();
auto start = std::chrono::high_resolution_clock::now();
for( int64_t i = 0; i < iterations; ++i ) {
exec.launch(pacxxKernel, config);
}
exec.synchronize();
auto duration = std::chrono::high_resolution_clock::now() - start;
auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(duration);
std::cout << ns.count() * 1e-9 << std::endl;
}