Commit 93ebe096 authored by Martin Bauer

New Time Loop - makes small block scenarios faster

- the new time loop caches each kernel function together with its argument dict
  -> the inner loop then only calls the compiled C functions with the cached kwargs
parent b5f4dc95
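
The mechanism in this commit: a data handling builds the kernel argument dicts once (per block, in the parallel case), the TimeLoop stores them next to the compiled kernel, and the inner loop reduces to plain C-function calls with the cached kwargs. A minimal usage sketch based on the interfaces added below; `dh` (a data handling) and `stream_collide_kernel` (a compiled kernel) are assumed to exist, and the import path is an assumption:

from pystencils.timeloop import TimeLoop  # import path assumed

time_loop = TimeLoop(steps=2)  # number of time steps fused into one inner iteration

# resolve the kernel's field arguments once and store (kernel, kwargs) pairs
time_loop.add_call(stream_collide_kernel, dh.get_kernel_kwargs(stream_collide_kernel))

time_loop.run(1000)  # inner loop only replays the cached calls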
@@ -211,6 +211,23 @@ class BoundaryHandling:
                self._boundary_object_to_boundary_info[b_obj].kernel(**kwargs)

    def add_fixed_steps(self, fixed_loop, **kwargs):
        if self._dirty:
            self.prepare()

        for b in self._data_handling.iterate(gpu=self._target == 'gpu'):
            for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items():
                arguments = kwargs.copy()
                arguments[self._field_name] = b[self._field_name]
                arguments['indexField'] = idx_arr
                data_used_in_kernel = (p.field_name
                                       for p in self._boundary_object_to_boundary_info[b_obj].kernel.parameters
                                       if p.is_field_ptr_argument and p.field_name not in arguments)
                arguments.update({name: b[name] for name in data_used_in_kernel if name not in arguments})

                kernel = self._boundary_object_to_boundary_info[b_obj].kernel
                fixed_loop.add_call(kernel, arguments)

    def geometry_to_vtk(self, file_name='geometry', boundaries='all', ghost_layers=False):
        """
        Writes a VTK field where each cell with the given boundary is marked with 1, other cells are 0
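
add_fixed_steps mirrors the argument setup of the existing boundary call above, but instead of executing the kernels it registers one (kernel, argument dict) pair per boundary object and block with the fixed loop. A sketch of the calling side, with `bh` (a BoundaryHandling) and `time_loop` assumed to exist:

bh.add_fixed_steps(time_loop)  # one cached call per (boundary object, block)
time_loop.run(100)             # boundary kernels now run inside the cached inner loop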
@@ -50,7 +50,6 @@ import platform
import shutil
import textwrap
import numpy as np
import functools
import subprocess
from appdirs import user_config_dir, user_cache_dir
from collections import OrderedDict
import numpy as np
import warnings
from pystencils import Field
from pystencils.datahandling.datahandling_interface import DataHandling
from pystencils.datahandling.blockiteration import sliced_block_iteration, block_iteration
from pystencils.utils import DotDict
# noinspection PyPep8Naming
import waLBerla as wlb
import warnings
class ParallelDataHandling(DataHandling):
@@ -16,14 +16,15 @@ class ParallelDataHandling(DataHandling):
"""
Creates data handling based on walberla block storage
:param blocks: walberla block storage
:param default_ghost_layers: nr of ghost layers used if not specified in add() method
:param default_layout: layout used if no layout is given to add() method
:param dim: dimension of scenario,
walberla always uses three dimensions, so if dim=2 the extend of the
z coordinate of blocks has to be 1
:param default_target: either 'cpu' or 'gpu' . If set to 'gpu' for each array also a GPU version is allocated
if not overwritten in add_array, and synchronization functions are for the GPU by default
Args:
blocks: walberla block storage
default_ghost_layers: nr of ghost layers used if not specified in add() method
default_layout: layout used if no layout is given to add() method
dim: dimension of scenario,
walberla always uses three dimensions, so if dim=2 the extend of the
z coordinate of blocks has to be 1
default_target: either 'cpu' or 'gpu' . If set to 'gpu' for each array also a GPU version is allocated
if not overwritten in add_array, and synchronization functions are for the GPU by default
"""
super(ParallelDataHandling, self).__init__()
assert dim in (2, 3)
@@ -135,8 +136,6 @@ class ParallelDataHandling(DataHandling):
        self._field_name_to_cpu_data_name[name] = name
        if gpu:
            self._field_name_to_gpu_data_name[name] = self.GPU_DATA_PREFIX + name

        self._rebuild_data_cache()
        return self.fields[name]

    def has_data(self, name):
@@ -156,15 +155,8 @@ class ParallelDataHandling(DataHandling):
    def swap(self, name1, name2, gpu=False):
        if gpu:
            for d in self._data_cache_gpu:
                d[name1], d[name2] = d[name2], d[name1]
            name1 = self.GPU_DATA_PREFIX + name1
            name2 = self.GPU_DATA_PREFIX + name2
        else:
            for d in self._data_cache_cpu:
                d[name1], d[name2] = d[name2], d[name1]
        for block in self.blocks:
            block[name1].swapDataPointers(block[name2])
@@ -222,31 +214,31 @@ class ParallelDataHandling(DataHandling):
            arr = arr[:, :, 0]
        return arr

    def _rebuild_data_cache(self):
        self._data_cache_cpu = []
        self._data_cache_gpu = []
        elements = [(self._data_cache_cpu, wlb.field.toArray, self._field_name_to_cpu_data_name)]
        if self._field_name_to_gpu_data_name:
            elements.append((self._data_cache_gpu, wlb.cuda.toGpuArray, self._field_name_to_gpu_data_name))

        for cache, to_array, name_to_data_name in elements:
            for block in self.blocks:
                field_args = {}
                for field_name, data_name in name_to_data_name.items():
                    field = self.fields[field_name]
                    arr = to_array(block[data_name], withGhostLayers=[True, True, self.dim == 3])
                    arr = self._normalize_arr_shape(arr, field.index_dimensions)
                    field_args[field_name] = arr
                cache.append(field_args)

    def run_kernel(self, kernel_function, **kwargs):
        for arg_dict in self.get_kernel_kwargs(kernel_function, **kwargs):
            kernel_function(**arg_dict)

    def get_kernel_kwargs(self, kernel_function, **kwargs):
        if kernel_function.ast.backend == 'gpucuda':
            for d in self._data_cache_gpu:
                kernel_function(**d, **kwargs)
            name_map = self._field_name_to_gpu_data_name
            to_array = wlb.cuda.toGpuArray
        else:
            for d in self._data_cache_cpu:
                kernel_function(**d, **kwargs)
            name_map = self._field_name_to_cpu_data_name
            to_array = wlb.field.toArray
        data_used_in_kernel = [(name_map[p.field_name], self.fields[p.field_name])
                               for p in kernel_function.parameters if
                               p.is_field_ptr_argument and p.field_name not in kwargs]
        result = []
        for block in self.blocks:
            field_args = {}
            for data_name, f in data_used_in_kernel:
                arr = to_array(block[data_name], withGhostLayers=[True, True, self.dim == 3])
                arr = self._normalize_arr_shape(arr, f.index_dimensions)
                field_args[f.name] = arr
            field_args.update(kwargs)
            result.append(field_args)
        return result

    def to_cpu(self, name):
        if name in self._custom_data_transfer_functions:
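
With this change, run_kernel is a thin wrapper around get_kernel_kwargs: it re-resolves the waLBerla block data into arrays on every call, while a TimeLoop caches the returned dicts once and replays them. A small sketch of the equivalence; `dh` and `my_kernel` are assumed to exist and `omega` is a hypothetical scalar kernel parameter:

dh.run_kernel(my_kernel, omega=1.9)            # resolves block data on every call

for arg_dict in dh.get_kernel_kwargs(my_kernel, omega=1.9):
    my_kernel(**arg_dict)                      # same effect, but the dicts can be cached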
@@ -211,6 +211,12 @@ class SerialDataHandling(DataHandling):
        arrays = self.gpu_arrays if kernel_function.ast.backend == 'gpucuda' else self.cpu_arrays
        kernel_function(**arrays, **kwargs)

    def get_kernel_kwargs(self, kernel_function, **kwargs):
        result = {}
        result.update(self.gpu_arrays if kernel_function.ast.backend == 'gpucuda' else self.cpu_arrays)
        result.update(kwargs)
        return [result]

    def to_cpu(self, name):
        if name in self._custom_data_transfer_functions:
            transfer_func = self._custom_data_transfer_functions[name][1]
import time
from pystencils.integer_functions import modulo_ceil


class TimeLoop:
    def __init__(self):
        self._preRunFunctions = []
        self._postRunFunctions = []
        self._timeStepFunctions = []
        self._functionNames = []
    def __init__(self, steps=2):
        self._call_data = []
        self._fixed_steps = steps
        self._pre_run_functions = []
        self._post_run_functions = []
        self._single_step_functions = []
        self.time_steps_run = 0

    def add_step(self, step_obj):
        if hasattr(step_obj, 'pre_run'):
            self.add_pre_run_function(step_obj.pre_run)
        if hasattr(step_obj, 'post_run'):
            self.add_post_run_function(step_obj.post_run)
        self.add(step_obj.time_step, step_obj.name)

    def add(self, time_step_function, name=None):
        if name is None:
            name = str(time_step_function)
        self._timeStepFunctions.append(time_step_function)
        self._functionNames.append(name)

    def add_kernel(self, data_handling, kernel_func, name=None):
        self.add(lambda: data_handling.run_kernel(kernel_func), name)

    @property
    def fixed_steps(self):
        return self._fixed_steps

    def add_pre_run_function(self, f):
        self._preRunFunctions.append(f)
        self._pre_run_functions.append(f)

    def add_post_run_function(self, f):
        self._postRunFunctions.append(f)
        self._post_run_functions.append(f)

    def add_single_step_function(self, f):
        self._single_step_functions.append(f)

    def add_call(self, functor, argument_list):
        if hasattr(functor, 'kernel'):
            functor = functor.kernel
        if not isinstance(argument_list, list):
            argument_list = [argument_list]

        for argument_dict in argument_list:
            self._call_data.append((functor, argument_dict))

    def pre_run(self):
        for f in self._pre_run_functions:
            f()

    def post_run(self):
        for f in self._post_run_functions:
            f()

    def run(self, time_steps=1):
        self.pre_run()
        fixed_steps = self._fixed_steps
        call_data = self._call_data
        main_iterations, rest_iterations = divmod(time_steps, fixed_steps)

        try:
            for i in range(time_steps):
                self.time_step()
            for _ in range(main_iterations):
                for func, kwargs in call_data:
                    func(**kwargs)
                self.time_steps_run += fixed_steps
            for _ in range(rest_iterations):
                for func in self._single_step_functions:
                    func()
                self.time_steps_run += 1
        except KeyboardInterrupt:
            pass
        self.post_run()

    def benchmark_run(self, time_steps=0, init_time_steps=0):
        init_time_steps_rounded = modulo_ceil(init_time_steps, self._fixed_steps)
        time_steps_rounded = modulo_ceil(time_steps, self._fixed_steps)

        self.pre_run()
        for i in range(init_time_steps):
            self.time_step()
        for i in range(init_time_steps_rounded // self._fixed_steps):
            for func, kwargs in self._call_data:
                func(**kwargs)
        self.time_steps_run += init_time_steps_rounded

        start = time.perf_counter()
        for i in range(time_steps):
            self.time_step()
        for i in range(time_steps_rounded // self._fixed_steps):
            for func, kwargs in self._call_data:
                func(**kwargs)
        end = time.perf_counter()
        self.time_steps_run += time_steps_rounded
        self.post_run()

        time_for_one_iteration = (end - start) / time_steps
@@ -61,10 +86,12 @@ class TimeLoop:
        self.pre_run()
        start = time.perf_counter()
        while time.perf_counter() < start + seconds:
            self.time_step()
            iterations += 1
            for func, kwargs in self._call_data:
                func(**kwargs)
            iterations += self._fixed_steps
        end = time.perf_counter()
        self.post_run()

        self.time_steps_run += iterations
        return iterations, end - start

    def benchmark(self, time_for_benchmark=5, init_time_steps=2, number_of_time_steps_for_estimation='auto'):
@@ -88,16 +115,3 @@ class TimeLoop:
        time_steps = int(time_for_benchmark / duration_of_time_step)
        time_steps = max(time_steps, 4)
        return self.benchmark_run(time_steps, init_time_steps)

    def pre_run(self):
        for f in self._preRunFunctions:
            f()

    def post_run(self):
        for f in self._postRunFunctions:
            f()

    def time_step(self):
        for f in self._timeStepFunctions:
            f()
        self.time_steps_run += 1
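
The fixed-step design is visible in run() and benchmark_run(): the requested step count is split into fused passes over the cached call list (divmod in run(), modulo_ceil rounding in benchmark_run()), and only the remainder falls back to the registered single-step functions. A short sketch with fixed_steps=2, reusing the hypothetical `dh` and `stream_collide_kernel` from above; the return value of benchmark_run is not shown in this hunk and is assumed to be the time per step:

time_loop = TimeLoop(steps=2)
time_loop.add_call(stream_collide_kernel, dh.get_kernel_kwargs(stream_collide_kernel))

# divmod(103, 2) -> 51 fused passes over the cached calls, plus one remainder
# step that only runs the registered single-step functions (none here)
time_loop.run(103)
print(time_loop.time_steps_run)  # 103

# benchmark_run rounds both counts up to multiples of fixed_steps via modulo_ceil,
# e.g. 3 warm-up steps become 4; the return value is assumed here to be the time per step
time_per_step = time_loop.benchmark_run(time_steps=100, init_time_steps=3)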