From e9eb29b9358651e21f0a567d0bdc6a4ea04d2466 Mon Sep 17 00:00:00 2001
From: Martin Bauer <martin.bauer@fau.de>
Date: Tue, 8 Jan 2019 16:13:03 +0100
Subject: [PATCH] PACXX benchmark generation

---
 pacxx/benchmark.py           | 209 +++++++++++++++++++++++++++++++++++
 pacxx/benchmark_template.cpp | 109 +++++++++++++++++++++
 2 files changed, 318 insertions(+)
 create mode 100644 pacxx/benchmark.py
 create mode 100644 pacxx/benchmark_template.cpp

diff --git a/pacxx/benchmark.py b/pacxx/benchmark.py
new file mode 100644
index 000000000..2d9b8962a
--- /dev/null
+++ b/pacxx/benchmark.py
@@ -0,0 +1,209 @@
+import os
+from time import perf_counter
+import subprocess
+from tempfile import TemporaryDirectory
+
+from pystencils import create_data_handling
+from pystencils.backends.cbackend import CBackend
+from jinja2 import Environment, FileSystemLoader
+from pystencils.backends.cbackend import generate_c
+
+script_path = os.path.dirname(os.path.realpath(__file__))
+PAXX_ROOT = '/local/bauer/code/pacxx/install'
+DEFAULT_PAXX_COMPILE_OPTIONS = ('-Ofast', '-march=native')
+
+
+def generate_benchmark_code(target_file, kernel_ast, target):
+    """Renders benchmark_template.cpp for the given kernel and writes the result to a file.
+
+    Args:
+        target_file: path of the C++ source file to write
+        kernel_ast: pystencils AST object, has to be generated for CUDA
+        target: either 'cpu' or 'gpu', selects the pacxx runtime that is created in the template
+    """
+    assert target in ('cpu', 'gpu')
+    assert hasattr(kernel_ast, 'indexing'), "AST has to be a CUDA kernel in order to create a PACXX kernel from it"
+    backend = CBackend()
+
+    function_body = kernel_ast.body
+    # the template allocates all buffers with one size, so the last shape entry has to be the same for all fields
+    f_sizes = {f.shape[-1] for f in kernel_ast.fields_accessed}
+    assert len(f_sizes) == 1
+
+    env = Environment(loader=FileSystemLoader(script_path))
+    result = env.get_template("benchmark_template.cpp").render(f_size=f_sizes.pop(),
+                                                               code=backend(function_body),
+                                                               target=target)
+
+    with open(target_file, 'w') as f:
+        f.write(result)
+
+
+def _extended_search_path(current, new_entry):
+    """Returns the search path 'current' with 'new_entry' appended.
+
+    An unset or empty variable yields just 'new_entry'. Joining unconditionally would create an empty
+    path component, which lookups of PATH and LD_LIBRARY_PATH treat as the current working directory.
+    """
+    return os.pathsep.join([current, new_entry]) if current else new_entry
+
+
+def pacxx_compile(source, executable, options=DEFAULT_PAXX_COMPILE_OPTIONS):
+    """Compiles a C++ source file with pacxx++.
+
+    Args:
+        source: path of the C++ file to compile
+        executable: path of the binary to create
+        options: additional command line options for pacxx++
+
+    Raises:
+        subprocess.CalledProcessError: if the compiler fails, command and compiler output are printed before
+    """
+    command = ['pacxx++', *options, source, '-o', executable, ]
+    env = os.environ.copy()
+    env['PATH'] = _extended_search_path(env.get('PATH'), os.path.join(PAXX_ROOT, 'bin'))
+    env['LD_LIBRARY_PATH'] = _extended_search_path(env.get('LD_LIBRARY_PATH'), os.path.join(PAXX_ROOT, 'lib'))
+    try:
+        subprocess.check_output(command, env=env, stderr=subprocess.STDOUT)
+    except subprocess.CalledProcessError as e:
+        print(" ".join(command))
+        print(e.output.decode('utf8'))
+        raise
+
+
+def run_paxx_benchmark(executable, domain_size, iterations):
+    """Runs a compiled benchmark binary.
+
+    Args:
+        executable: path of the binary created by pacxx_compile
+        domain_size: x, y, z extent of spatial domain
+        iterations: number of outer iterations
+
+    Returns:
+        seconds for one outer iteration
+    """
+    assert len(domain_size) == 3
+    arguments = [executable, *domain_size, iterations]
+    arguments = [str(e) for e in arguments]
+    # a binary built from benchmark_template.cpp prints the total time of all timed iterations in seconds
+    output = subprocess.check_output(arguments)
+    return float(output) / iterations
+
+
+def paxx_benchmark(ast, domain_size, iterations, target='cpu', compile_options=DEFAULT_PAXX_COMPILE_OPTIONS):
+    """Generates, compiles and runs the kernel with PAXX
+
+    Args:
+        ast: pystencils AST object (has to be generated for CUDA, even when run on CPU with pacxx)
+        domain_size: x, y, z extent of spatial domain
+        iterations: number of outer iterations
+        target: either 'cpu' or 'gpu' to specify where pacxx should run the kernel
+        compile_options: compile options for pacxx
+
+    Returns:
+        seconds for one outer iteration
+    """
+    with TemporaryDirectory() as base_dir:
+        code = os.path.join(base_dir, 'code.cpp')
+        executable = os.path.join(base_dir, 'bench')
+        generate_benchmark_code(code, ast, target)
+        pacxx_compile(code, executable, compile_options)
+        time_per_iteration = run_paxx_benchmark(executable, domain_size, iterations)
+    return time_per_iteration
+
+
+def lbm_performance_compare(domain_size, iterations, **lb_params):
+    """Runs benchmark with pacxx and with normal pystencils backends.
+
+    Args:
+        domain_size: 3-tuple with size of spatial domain
+        iterations: number of outer iterations
+        **lb_params: parameters passed to lbmpy to choose lattice Boltzmann algorithm & optimization options
+
+    Returns:
+        dictionary with measurements of time per iteration for different backends
+    """
+    import pycuda.driver as drv
+
+    from lbmpy.creationfunctions import create_lb_ast
+    # work on a copy: the 'target' entry is overwritten below and the dict of the caller has to stay untouched
+    lb_params['optimization'] = dict(lb_params.get('optimization', {}))
+
+    lb_params['optimization']['target'] = 'cpu'
+    cpu_ast = create_lb_ast(**lb_params)
+    lb_params['optimization']['target'] = 'gpu'
+    gpu_ast = create_lb_ast(**lb_params)
+
+    # print kernel code of CPU or GPU version - just for comparison, files are not used
+    with open("pystencils_cpu_code.c", 'w') as f:
+        print(generate_c(cpu_ast), file=f)
+    with open("pystencils_gpu_code.cu", 'w') as f:
+        print(generate_c(gpu_ast), file=f)
+
+    cpu_kernel = cpu_ast.compile()
+    gpu_kernel = gpu_ast.compile()
+    f_sizes = {f.shape[-1] for f in cpu_ast.fields_accessed}
+    assert len(f_sizes) == 1
+    f_size = f_sizes.pop()
+
+    dh = create_data_handling(domain_size, default_target='gpu', default_layout='fzyx')
+    dh.add_array('src', values_per_cell=f_size)
+    dh.add_array('dst', values_per_cell=f_size)
+    dh.fill('src', 0)
+    dh.fill('dst', 0)
+
+    # to keep it simple we run outer loop directly from Python
+    # make domain size large enough, otherwise we measure the python call overhead
+    def run_benchmark(kernel):
+        dh.all_to_gpu()
+        for _ in range(10):  # warmup
+            dh.run_kernel(kernel)
+        drv.Context.synchronize()
+        start = perf_counter()
+        for _ in range(iterations):
+            dh.run_kernel(kernel)
+        drv.Context.synchronize()
+        return (perf_counter() - start) / iterations
+
+    return {
+        'pystencils_cpu': run_benchmark(cpu_kernel),
+        'pystencils_gpu': run_benchmark(gpu_kernel),
+        'pacxx_cpu': paxx_benchmark(gpu_ast, domain_size, iterations, target='cpu'),
+        'pacxx_gpu': paxx_benchmark(gpu_ast, domain_size, iterations, target='gpu'),
+    }
+
+
+if __name__ == '__main__':
+    no_opt = {
+        'openmp': 8,  # number of threads - pacxx uses also HT cores
+        'split': False,
+        'vectorization': False,
+        'gpu_indexing_params': {'block_size': (64, 8, 1)},
+    }
+    only_vectorization = {
+        'openmp': 4,
+        'split': False,
+        'gpu_indexing_params': {'block_size': (64, 8, 1)},
+        'vectorization': {'instruction_set': 'avx',
+                          'assume_inner_stride_one': True,
+                          'nontemporal': False},
+    }
+    best = {
+        'openmp': 4,
+        'split': True,
+        'gpu_indexing_params': {'block_size': (64, 8, 1)},
+        'vectorization': {'instruction_set': 'avx',
+                          'assume_inner_stride_one': True,
+                          'nontemporal': True}
+    }
+    res = lbm_performance_compare(stencil='D3Q19', relaxation_rate=1.8, compressible=False,
+                                  domain_size=(512, 128, 32), iterations=500,
+                                  optimization=only_vectorization)
+    # relative runtime difference in percent, positive values mean that pacxx takes longer than pystencils
+    cpu_time_diff = ((res['pacxx_cpu'] / res['pystencils_cpu']) - 1) * 100
+    gpu_time_diff = ((res['pacxx_gpu'] / res['pystencils_gpu']) - 1) * 100
+    print("Time for one kernel call [s]")
+    for config_name, seconds in res.items():
+        print(" {0: <16}: {1}".format(config_name, seconds))
+
+    print("CPU {:.02f}% GPU {:.02f}%".format(cpu_time_diff, gpu_time_diff))
diff --git a/pacxx/benchmark_template.cpp b/pacxx/benchmark_template.cpp
new file mode 100644
index 000000000..153d677b5
--- /dev/null
+++ b/pacxx/benchmark_template.cpp
@@ -0,0 +1,109 @@
+#include <PACXX.h>
+#include <cstdint>
+#include <vector>
+#include <sstream>
+#include <iostream>
+#include <chrono>
+
+
+using namespace pacxx::v2;
+
+// integer division a / b, rounded up to the next integer
+size_t division_round_up(size_t a, size_t b)
+{
+    if( a % b == 0)
+        return a / b;
+    else
+        return (a / b) + 1;
+}
+
+int main(int argc, char** argv)
+{
+    {% if target == 'cpu' %}
+    Executor::Create<NativeRuntime>(0);
+    {% elif target == 'gpu' %}
+    Executor::Create<CUDARuntime>(0);
+    {% endif %}
+
+    if( argc != 5 ) {
+        std::cout << "Usage: ./benchmark xSize ySize zSize iterations" << std::endl;
+        return 1;
+    }
+    Dimension3 domainSize;
+    int64_t iterations;
+    auto &exec = Executor::get(0);
+
+    std::stringstream( argv[1] ) >> domainSize.x;
+    std::stringstream( argv[2] ) >> domainSize.y;
+    std::stringstream( argv[3] ) >> domainSize.z;
+    std::stringstream( argv[4] ) >> iterations;
+
+    // add ghost layers to be comparable to pystencils native backend
+    domainSize.x += 2;
+    domainSize.y += 2;
+    domainSize.z += 2;
+
+    int64_t totalSize = domainSize.x * domainSize.y * domainSize.z * {{f_size}};
+
+    std::vector<double> src( totalSize, 0.0 );
+    std::vector<double> dst( totalSize, 0.0 );
+
+    auto & dsrc = exec.allocate<double>(src.size());
+    auto & ddst = exec.allocate<double>(dst.size());
+
+    dsrc.upload(src.data(), src.size());
+    ddst.upload(dst.data(), dst.size());
+
+    double * _data_src = dsrc.get();
+    double * _data_dst = ddst.get();
+
+    const int64_t _size_src_0 = domainSize.x;
+    const int64_t _size_src_1 = domainSize.y;
+    const int64_t _size_src_2 = domainSize.z;
+
+    // fzyx layout
+    const int64_t _stride_src_0 = 1;
+    const int64_t _stride_src_1 = domainSize.x;
+    const int64_t _stride_src_2 = domainSize.x * domainSize.y;
+    const int64_t _stride_src_3 = domainSize.x * domainSize.y * domainSize.z;
+
+    auto pacxxKernel = [=]( range & config ) {
+
+        // make the pacxx launch information available under the names of the CUDA built-in variables
+        struct Vec3D {int x; int y; int z; };
+        const Vec3D blockDim = { config.get_block_size(0), config.get_block_size(1), config.get_block_size(2) };
+        const Vec3D blockIdx = { config.get_block(0), config.get_block(1), config.get_block(2) };
+        const Vec3D threadIdx = { config.get_local(0), config.get_local(1), config.get_local(2) };
+
+        {{ code|indent(8) }}
+    };
+
+    // NOTE(review): presumably has to match the block size the kernel code was generated with - confirm
+    size_t blockSize[] = {64, 8, 1};
+
+    KernelConfiguration config( { division_round_up(domainSize.x - 2, blockSize[0]),
+                                  division_round_up(domainSize.y - 2, blockSize[1]),
+                                  division_round_up(domainSize.z -2, blockSize[2]) },
+                                { blockSize[0],
+                                  blockSize[1],
+                                  blockSize[2] });
+
+    // warm up
+    for( int64_t i = 0; i < 10; ++i ) {
+        exec.launch(pacxxKernel, config);
+    }
+    exec.synchronize();
+
+    auto start = std::chrono::high_resolution_clock::now();
+    for( int64_t i = 0; i < iterations; ++i ) {
+        exec.launch(pacxxKernel, config);
+    }
+    exec.synchronize();
+    auto duration = std::chrono::high_resolution_clock::now() - start;
+
+    // only output on success: total time of all timed iterations in seconds, parsed by benchmark.py
+    auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(duration);
+    std::cout << ns.count() * 1e-9 << std::endl;
+
+    return 0;
+}
-- 
GitLab