Commit b4b0cb4b authored by Christoph Schwarzmeier

Merge branch 'UpdatePhaseField' into 'master'

Update phase field

See merge request !477
parents 3455bf3e 40e80f3e
Pipeline #33777 passed with stages in 565 minutes and 16 seconds
@@ -3,30 +3,43 @@ import waLBerla as wlb
import pandas as pd
from waLBerla.tools.sqlitedb import sequenceValuesToScalars
+from waLBerla.tools.config import block_decomposition
-import sys
+from math import prod


+def domain_block_size_ok(block_size, total_mem, gls=1, q_phase=15, q_hydro=27, size_per_value=8):
+    """Checks if a single block of given size fits into GPU memory"""
+    cells = prod(b + 2 * gls for b in block_size)
+    # each PDF field is stored twice (source + temporary): 2 * q_phase + 2 * q_hydro values,
+    # plus 3 velocity components and 2 values for the phase field and the temporary phase field
+    values_per_cell = 2 * q_phase + 2 * q_hydro + 3 + 2
+    needed_memory = values_per_cell * cells * size_per_value
+    return needed_memory < total_mem


class Scenario:
-    def __init__(self, timeStepStrategy, overlappingWidth, cuda_block_size):
+    def __init__(self, time_step_strategy, cuda_block_size, cells_per_block=(256, 256, 256),
+                 cuda_enabled_mpi=False):
        # output frequencies
        self.vtkWriteFrequency = 0

        # simulation parameters
        self.timesteps = 101
-        edge_size = 32
-        self.cells = (edge_size, edge_size, edge_size)
-        self.blocks = (1, 1, 1)
+        self.cells_per_block = cells_per_block
+        self.blocks = block_decomposition(wlb.mpi.numProcesses())
        self.periodic = (1, 1, 1)
-        self.size = (self.cells[0] * self.blocks[0],
-                     self.cells[1] * self.blocks[1],
-                     self.cells[2] * self.blocks[2])
+        self.size = (self.cells_per_block[0] * self.blocks[0],
+                     self.cells_per_block[1] * self.blocks[1],
+                     self.cells_per_block[2] * self.blocks[2])

-        self.timeStepStrategy = timeStepStrategy
-        self.overlappingWidth = overlappingWidth
+        self.timeStepStrategy = time_step_strategy
        self.cuda_block_size = cuda_block_size
        self.warmupSteps = 10
+        self.cudaEnabledMpi = cuda_enabled_mpi

        # bubble parameters
        self.bubbleRadius = min(self.size) // 4
        self.bubbleMidPoint = (self.size[0] / 2, self.size[1] / 2, self.size[2] / 2)
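Note: the memory estimate behind domain_block_size_ok can be checked by hand. The following standalone sketch (assuming the defaults above: one ghost layer, q_phase=15, q_hydro=27, 8-byte values) shows why the default 256³ block is already borderline:

from math import prod

block_size = (256, 256, 256)
cells = prod(b + 2 for b in block_size)     # 258**3 = 17,173,512 cells incl. ghost layers
values_per_cell = 2 * 15 + 2 * 27 + 3 + 2   # 89 values per cell
needed = values_per_cell * cells * 8        # ~12.2e9 bytes
print(needed / 2 ** 30)                     # ~11.4 GiB -> does not fit on an 8 GiB card

With the 8 GiB default used below, benchmark() would therefore log and skip this block size.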
@@ -41,19 +54,18 @@ class Scenario:
        return {
            'DomainSetup': {
                'blocks': self.blocks,
-                'cellsPerBlock': self.cells,
+                'cellsPerBlock': self.cells_per_block,
                'periodic': self.periodic,
            },
            'Parameters': {
                'timesteps': self.timesteps,
                'vtkWriteFrequency': self.vtkWriteFrequency,
-                'useGui': 0,
                'remainingTimeLoggerFrequency': -1,
                'timeStepStrategy': self.timeStepStrategy,
-                'overlappingWidth': self.overlappingWidth,
                'gpuBlockSize': self.cuda_block_size,
                'warmupSteps': self.warmupSteps,
                'scenario': self.scenario,
+                'cudaEnabledMpi': self.cudaEnabledMpi
            },
            'Boundaries_GPU': {
                'Border': []
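The tuple-valued entries above (blocks, cellsPerBlock, gpuBlockSize) cannot be written into a flat CSV or SQLite row directly; flattening them is what the sequenceValuesToScalars import is for. A minimal stand-in illustrating the idea (the waLBerla helper's exact key naming may differ; this function is a hypothetical sketch):

def sequence_values_to_scalars(data):
    # expand sequence values into indexed scalar columns, in place
    for key in list(data.keys()):
        if isinstance(data[key], (list, tuple)):
            for i, v in enumerate(data[key]):
                data[f'{key}_{i}'] = v
            del data[key]

row = {'cellsPerBlock': (256, 256, 256), 'timesteps': 101}
sequence_values_to_scalars(row)
# row == {'timesteps': 101, 'cellsPerBlock_0': 256, 'cellsPerBlock_1': 256, 'cellsPerBlock_2': 256}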
@@ -86,43 +98,46 @@ class Scenario:
        df.to_csv(self.csv_file, index=False, mode='a', header=False)


-def overlap_benchmark():
+def benchmark():
    scenarios = wlb.ScenarioManager()
-    overlappingWidths = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1),
-                         (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
-                         (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)]
-    cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
-                   (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
-                   (32, 4, 1), (64, 4, 1), (128, 4, 1),
-                   (32, 4, 2), (64, 4, 2), (128, 2, 2),
-                   (32, 8, 1), (64, 8, 1), (64, 4, 2),
-                   (32, 16, 1), (16, 16, 1), (16, 16, 2)]
-    # no overlap
-    scenarios.add(Scenario(timeStepStrategy='normal', overlappingWidth=(1, 1, 1), cuda_block_size=(16, 16, 1)))
-    # overlap
-    for overlap_strategy in ['overlap']:
-        for overlappingWidth in overlappingWidths:
-            for cuda_block in cuda_blocks:
-                scenario = Scenario(timeStepStrategy=overlap_strategy,
-                                    overlappingWidth=overlappingWidth,
-                                    cuda_block_size=cuda_block)
-                scenarios.add(scenario)
+    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
+    gpu_mem = gpu_mem_gb * (2 ** 30)
+    block_size = (256, 256, 256)
+    if not domain_block_size_ok(block_size, gpu_mem):
+        wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
+    else:
+        scenarios.add(Scenario(time_step_strategy='normal', cuda_block_size=(256, 1, 1), cells_per_block=block_size))


def kernel_benchmark():
    scenarios = wlb.ScenarioManager()
-    # overlap
-    # for overlap_strategy in ['phase_only', 'hydro_only', 'kernel_only']:
-    for overlap_strategy in ['overlap']:
-        scenario = Scenario(timeStepStrategy=overlap_strategy,
-                            overlappingWidth=(1, 1, 1),
-                            cuda_block_size=(128, 1, 1))
-        scenarios.add(scenario)
+    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
+    gpu_mem = gpu_mem_gb * (2 ** 30)
+
+    block_sizes = [(i, i, i) for i in (32, 64, 128, 256, 320, 384, 448, 512)]
+    cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1),
+                   (32, 2, 1), (64, 2, 1), (128, 2, 1),
+                   (32, 4, 1), (64, 4, 1),
+                   (32, 4, 2),
+                   (32, 8, 1),
+                   (16, 16, 1)]
+    for time_step_strategy in ['phase_only', 'hydro_only', 'kernel_only', 'normal']:
+        for cuda_block in cuda_blocks:
+            for block_size in block_sizes:
+                if not domain_block_size_ok(block_size, gpu_mem):
+                    wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
+                    continue
+                scenario = Scenario(time_step_strategy=time_step_strategy,
+                                    cuda_block_size=cuda_block,
+                                    cells_per_block=block_size)
+                scenarios.add(scenario)


-# overlap_benchmark()
+# benchmark()
kernel_benchmark()
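Assuming the 8 GiB default, the memory check prunes this sweep considerably: at 89 values of 8 bytes per cell, only the 32³, 64³ and 128³ blocks fit, so 4 strategies x 12 CUDA block shapes x 3 surviving sizes = 144 scenarios remain. A small standalone sketch of that bookkeeping:

from math import prod

gpu_mem = 8 * 2 ** 30  # GPU_MEMORY_GB default
sizes = [(i, i, i) for i in (32, 64, 128, 256, 320, 384, 448, 512)]
fitting = [s for s in sizes if 89 * 8 * prod(b + 2 for b in s) < gpu_mem]
print(fitting)               # [(32, 32, 32), (64, 64, 64), (128, 128, 128)]
print(4 * 12 * len(fitting))  # 144 scenarios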
@@ -3,57 +3,55 @@ import waLBerla as wlb
import pandas as pd
from waLBerla.tools.sqlitedb import sequenceValuesToScalars
+from waLBerla.tools.config import block_decomposition
-import sys


class Scenario:
-    def __init__(self, timeStepStrategy, overlappingWidth):
+    def __init__(self, time_step_strategy, cells_per_block=(256, 256, 256),
+                 cuda_enabled_mpi=False):
        # output frequencies
-        self.vtkWriteFrequency = -1
+        self.vtkWriteFrequency = 0

        # simulation parameters
-        self.timesteps = 500
-        edge_size = 50
-        self.cells = (edge_size, edge_size, edge_size)
-        self.blocks = (1, 1, 1)
+        self.timesteps = 101
+        self.cells_per_block = cells_per_block
+        self.blocks = block_decomposition(wlb.mpi.numProcesses())
        self.periodic = (1, 1, 1)
-        self.size = (self.cells[0] * self.blocks[0],
-                     self.cells[1] * self.blocks[1],
-                     self.cells[2] * self.blocks[2])
+        self.size = (self.cells_per_block[0] * self.blocks[0],
+                     self.cells_per_block[1] * self.blocks[1],
+                     self.cells_per_block[2] * self.blocks[2])

-        self.timeStepStrategy = timeStepStrategy
-        self.overlappingWidth = overlappingWidth
-        self.cuda_block_size = (-1, -1, -1)
+        self.timeStepStrategy = time_step_strategy
        self.warmupSteps = 10
+        self.cudaEnabledMpi = cuda_enabled_mpi

        # bubble parameters
        self.bubbleRadius = min(self.size) // 4
        self.bubbleMidPoint = (self.size[0] / 2, self.size[1] / 2, self.size[2] / 2)
        self.scenario = 1  # 1 rising bubble, 2 RTI
        self.config_dict = self.config()

-        self.csv_file = "benchmark_cpu.csv"
+        self.csv_file = "benchmark.csv"

    @wlb.member_callback
    def config(self):
        return {
            'DomainSetup': {
                'blocks': self.blocks,
-                'cellsPerBlock': self.cells,
+                'cellsPerBlock': self.cells_per_block,
                'periodic': self.periodic,
            },
            'Parameters': {
                'timesteps': self.timesteps,
                'vtkWriteFrequency': self.vtkWriteFrequency,
-                'useGui': 0,
-                'remainingTimeLoggerFrequency': 10.0,
+                'remainingTimeLoggerFrequency': -1,
                'timeStepStrategy': self.timeStepStrategy,
-                'overlappingWidth': self.overlappingWidth,
-                'gpuBlockSize': self.cuda_block_size,
                'warmupSteps': self.warmupSteps,
-                'scenario': self.scenario,
+                'scenario': 1,
+                'cudaEnabledMpi': self.cudaEnabledMpi
            },
            'Boundaries_GPU': {
                'Border': []
@@ -86,31 +84,23 @@ class Scenario:
        df.to_csv(self.csv_file, index=False, mode='a', header=False)


-def overlap_benchmark():
+def benchmark():
    scenarios = wlb.ScenarioManager()
-    overlappingWidths = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1),
-                         (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
-                         (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32),
-                         (64, 32, 32), (64, 64, 32), (64, 64, 64)]
-    # no overlap
-    scenarios.add(Scenario(timeStepStrategy='normal', overlappingWidth=(1, 1, 1)))
-    # overlap
-    for overlap_strategy in ['overlap']:
-        for overlappingWidth in overlappingWidths:
-            scenario = Scenario(timeStepStrategy=overlap_strategy,
-                                overlappingWidth=overlappingWidth)
-            scenarios.add(scenario)
+    block_size = (64, 64, 64)
+    scenarios.add(Scenario(time_step_strategy='normal', cells_per_block=block_size))


def kernel_benchmark():
    scenarios = wlb.ScenarioManager()
-    # overlap
-    scenario = Scenario(timeStepStrategy='overlap',
-                        overlappingWidth=(8, 8, 8))
-    scenarios.add(scenario)
+    block_sizes = [(i, i, i) for i in (8, 16, 32, 64, 128)]
+    for time_step_strategy in ['phase_only', 'hydro_only', 'kernel_only', 'normal']:
+        for block_size in block_sizes:
+            scenario = Scenario(time_step_strategy=time_step_strategy,
+                                cells_per_block=block_size)
+            scenarios.add(scenario)


-# overlap_benchmark()
+# benchmark()
kernel_benchmark()
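Both scripts now size the block grid from the MPI process count via block_decomposition, so the same script scales from a single process (a 1x1x1 grid) to large jobs. The helper factors the process count into a near-uniform 3D grid; a simplified reimplementation of the idea (illustrative only, not the library's exact algorithm):

def block_decomposition_sketch(processes):
    # split a process count into three factors, keeping them as even as possible
    result = [1, 1, 1]
    remaining = processes
    for divisor in (2, 3, 5, 7, 11, 13):
        while remaining % divisor == 0:
            result[result.index(min(result))] *= divisor
            remaining //= divisor
    result[result.index(min(result))] *= remaining  # leftover prime factor, if any
    return tuple(sorted(result))

print(block_decomposition_sketch(8))   # (2, 2, 2)
print(block_decomposition_sketch(16))  # (2, 2, 4)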
@@ -37,8 +37,6 @@
#include "timeloop/SweepTimeloop.h"
-#include <blockforest/communication/UniformBufferedScheme.h>
#include "InitializerFunctions.h"

//////////////////////////////
@@ -48,40 +46,19 @@
#if defined(WALBERLA_BUILD_WITH_CUDA)
# include "cuda/AddGPUFieldToStorage.h"
+# include "cuda/DeviceSelectMPI.h"
# include "cuda/HostFieldAllocator.h"
# include "cuda/ParallelStreams.h"
-# include "cuda/communication/GPUPackInfo.h"
+# include "cuda/communication/MemcpyPackInfo.h"
# include "cuda/communication/UniformGPUScheme.h"
-
-# include "GenDefines.h"
-# include "PackInfo_phase_field.h"
-# include "PackInfo_phase_field_distributions.h"
-# include "PackInfo_velocity_based_distributions.h"
-# include "hydro_LB_step.h"
-# include "initialize_phase_field_distributions.h"
-# include "initialize_velocity_based_distributions.h"
-# include "phase_field_LB_step.h"
-#else
-# include "GenDefines.h"
-# include "PackInfo_phase_field.h"
-# include "PackInfo_phase_field_distributions.h"
-# include "PackInfo_velocity_based_distributions.h"
-# include "hydro_LB_step.h"
-# include "initialize_phase_field_distributions.h"
-# include "initialize_velocity_based_distributions.h"
-# include "phase_field_LB_step.h"
-# include <blockforest/communication/UniformBufferedScheme.h>
#endif

+#include "GenDefines.h"

using namespace walberla;

using PdfField_phase_T = GhostLayerField< real_t, Stencil_phase_T::Size >;
using PdfField_hydro_T = GhostLayerField< real_t, Stencil_hydro_T::Size >;
using VelocityField_T = GhostLayerField< real_t, Stencil_hydro_T::Dimension >;
using PhaseField_T = GhostLayerField< real_t, 1 >;
using flag_t = walberla::uint8_t;
using FlagField_T = FlagField< flag_t >;

#if defined(WALBERLA_BUILD_WITH_CUDA)
typedef cuda::GPUField< real_t > GPUField;
@@ -111,9 +88,7 @@ int main(int argc, char** argv)
      const real_t remainingTimeLoggerFrequency =
         parameters.getParameter< real_t >("remainingTimeLoggerFrequency", 3.0);
      const uint_t scenario = parameters.getParameter< uint_t >("scenario", uint_c(1));
-      Vector3< int > overlappingWidth =
-         parameters.getParameter< Vector3< int > >("overlappingWidth", Vector3< int >(1, 1, 1));
      const uint_t warmupSteps = parameters.getParameter< uint_t >("warmupSteps", uint_t(2));

#if defined(WALBERLA_BUILD_WITH_CUDA)
      // CPU fields
@@ -165,24 +140,21 @@ int main(int argc, char** argv)
   pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field_gpu, vel_field_gpu);
   pystencils::phase_field_LB_step phase_field_LB_step(
-      lb_phase_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0], gpuBlockSize[1],
-      Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
+      lb_phase_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
   pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0],
-                                           gpuBlockSize[1],
-                                           Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
+                                           gpuBlockSize[1], gpuBlockSize[2]);
#else
   pystencils::initialize_phase_field_distributions init_h(lb_phase_field, phase_field, vel_field);
   pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field, vel_field);
-   pystencils::phase_field_LB_step phase_field_LB_step(
-      lb_phase_field, phase_field, vel_field, Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
-   pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field, phase_field, vel_field,
-                                           Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
+   pystencils::phase_field_LB_step phase_field_LB_step(lb_phase_field, phase_field, vel_field);
+   pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field, phase_field, vel_field);
#endif

   // add communication
#if defined(WALBERLA_BUILD_WITH_CUDA)
+   const bool cudaEnabledMpi = parameters.getParameter< bool >("cudaEnabledMpi", false);
   auto Comm_velocity_based_distributions =
-      make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, 0);
+      make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
   auto generatedPackInfo_velocity_based_distributions =
      make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
   Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
@@ -190,13 +162,12 @@ int main(int argc, char** argv)
   Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_phase_field);

   auto Comm_phase_field_distributions =
-      make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, 0);
+      make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
   auto generatedPackInfo_phase_field_distributions =
      make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field_gpu);
   Comm_phase_field_distributions->addPackInfo(generatedPackInfo_phase_field_distributions);
#else
   blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_velocity_based_distributions(blocks);
   auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field);
@@ -244,29 +215,15 @@ int main(int argc, char** argv)
   auto timeLoop = make_shared< SweepTimeloop >(blocks->getBlockStorage(), timesteps);
#if defined(WALBERLA_BUILD_WITH_CUDA)
   auto normalTimeStep = [&]() {
+      Comm_velocity_based_distributions->startCommunication(defaultStream);
      for (auto& block : *blocks)
-      {
-         Comm_phase_field_distributions->communicate(nullptr);
-         phase_field_LB_step(&block);
+         phase_field_LB_step(&block, defaultStream);
+      Comm_velocity_based_distributions->wait(defaultStream);

-         Comm_velocity_based_distributions->communicate(nullptr);
-         hydro_LB_step(&block);
-      }
-   };
-   auto simpleOverlapTimeStep = [&]() {
      Comm_phase_field_distributions->startCommunication(defaultStream);
      for (auto& block : *blocks)
-         phase_field_LB_step.inner(&block, defaultStream);
+         hydro_LB_step(&block, defaultStream);
      Comm_phase_field_distributions->wait(defaultStream);
-      for (auto& block : *blocks)
-         phase_field_LB_step.outer(&block, defaultStream);
-      Comm_velocity_based_distributions->startCommunication(defaultStream);
-      for (auto& block : *blocks)
-         hydro_LB_step.inner(&block, defaultStream);
-      Comm_velocity_based_distributions->wait(defaultStream);
-      for (auto& block : *blocks)
-         hydro_LB_step.outer(&block, defaultStream);
   };

   auto phase_only = [&]() {
      for (auto& block : *blocks)
@@ -284,29 +241,15 @@ int main(int argc, char** argv)
   };
#else
   auto normalTimeStep = [&]() {
-      for (auto& block : *blocks)
-      {
-         Comm_phase_field_distributions();
-         phase_field_LB_step(&block);
-         Comm_velocity_based_distributions();
-         hydro_LB_step(&block);
-      }
-   };
-   auto simpleOverlapTimeStep = [&]() {
-      Comm_phase_field_distributions.startCommunication();
-      for (auto& block : *blocks)
-         phase_field_LB_step.inner(&block);
-      Comm_phase_field_distributions.wait();
-      for (auto& block : *blocks)
-         phase_field_LB_step.outer(&block);
-      Comm_velocity_based_distributions.startCommunication();
-      for (auto& block : *blocks)
-         hydro_LB_step.inner(&block);
-      Comm_velocity_based_distributions.wait();
-      for (auto& block : *blocks)
-         hydro_LB_step.outer(&block);
+      Comm_velocity_based_distributions.startCommunication();
+      for (auto& block : *blocks)
+         phase_field_LB_step(&block);
+      Comm_velocity_based_distributions.wait();
+      Comm_phase_field_distributions.startCommunication();
+      for (auto& block : *blocks)
+         hydro_LB_step(&block);
+      Comm_phase_field_distributions.wait();
   };

   auto phase_only = [&]() {
      for (auto& block : *blocks)
@@ -324,12 +267,7 @@ int main(int argc, char** argv)
   };
#endif
   std::function< void() > timeStep;
-   if (timeStepStrategy == "overlap")
-   {
-      timeStep = std::function< void() >(simpleOverlapTimeStep);
-      WALBERLA_LOG_INFO_ON_ROOT("overlapping timestep")
-   }
-   else if (timeStepStrategy == "phase_only")
+   if (timeStepStrategy == "phase_only")
   {
      timeStep = std::function< void() >(phase_only);
      WALBERLA_LOG_INFO_ON_ROOT("started only phasefield step without communication for benchmarking")
@@ -347,7 +285,7 @@ int main(int argc, char** argv)
   else
   {
      timeStep = std::function< void() >(normalTimeStep);
-      WALBERLA_LOG_INFO_ON_ROOT("normal timestep with no overlapping")
+      WALBERLA_LOG_INFO_ON_ROOT("normal timestep with overlapping")
   }

   timeLoop->add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step");
@@ -364,10 +302,8 @@ int main(int argc, char** argv)
      auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
                                                      "simulation_step", false, true, true, false, 0);
#if defined(WALBERLA_BUILD_WITH_CUDA)
-      vtkOutput->addBeforeFunction([&]() {
-         cuda::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu);
-         // cuda::fieldCpy<VelocityField_T, GPUField>( blocks, vel_field, vel_field_gpu );
-      });
+      vtkOutput->addBeforeFunction(
+         [&]() { cuda::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu); });
#endif
      auto phaseWriter = make_shared< field::VTKWriter< PhaseField_T > >(phase_field, "phase");
      vtkOutput->addCellDataWriter(phaseWriter);
@@ -402,6 +338,11 @@ int main(int argc, char** argv)
         if (pythonCallbackResults.isCallable())
         {
            pythonCallbackResults.data().exposeValue("mlupsPerProcess", mlupsPerProcess);
+            pythonCallbackResults.data().exposeValue("stencil_phase", StencilNamePhase);
+            pythonCallbackResults.data().exposeValue("stencil_hydro", StencilNameHydro);
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+            pythonCallbackResults.data().exposeValue("cuda_enabled_mpi", cudaEnabledMpi);
+#endif
            // Call Python function to report results
            pythonCallbackResults();
         }
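On the Python side, the values exposed here arrive as keyword arguments of the scenario's results_callback. A sketch of how such a callback typically merges them with the scenario parameters and appends one CSV row, following the df.to_csv pattern already used above (shown as it would sit inside the Scenario class; assumes import os, pandas as pd and the sequenceValuesToScalars helper):

    @wlb.member_callback
    def results_callback(self, **kwargs):
        data = {}
        data.update(self.config_dict['Parameters'])
        data.update(self.config_dict['DomainSetup'])
        data.update(kwargs)  # e.g. mlupsPerProcess, stencil_phase, stencil_hydro, cuda_enabled_mpi
        sequenceValuesToScalars(data)

        df = pd.DataFrame.from_records([data])
        if not os.path.isfile(self.csv_file):
            df.to_csv(self.csv_file, index=False)
        else:
            df.to_csv(self.csv_file, index=False, mode='a', header=False)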
import os
import waLBerla as wlb
import csv
from waLBerla.tools.sqlitedb import sequenceValuesToScalars
import sys
class Scenario:
def __init__(self, timeStepStrategy, overlappingWidth, cuda_block_size):
# output frequencies
self.vtkWriteFrequency = 0
# simulation parameters
self.timesteps = 201
edge_size = 100
self.cells = (edge_size, edge_size, edge_size)
self.blocks = (1, 1, 1)
self.periodic = (1, 1, 1)
self.size = (self.cells[0] * self.blocks[0],
self.cells[1] * self.blocks[1],
self.cells[2] * self.blocks[2])
self.timeStepStrategy = timeStepStrategy
self.overlappingWidth = overlappingWidth
self.cuda_block_size = cuda_block_size
self.warmupSteps = 20
# bubble parameters
self.bubbleRadius = min(self.size) // 4
self.bubbleMidPoint = (self.size[0] / 2, self.size[1] / 2, self.size[2] / 2)
self.scenario = 1 # 1 rising bubble, 2 RTI
self.config_dict = self.config()
self.csv_file = "benchmark_piz_daint.csv"
@wlb.member_callback
def config(self):
return {
'DomainSetup': {
'blocks': self.blocks,
'cellsPerBlock': self.cells,
'periodic': self.periodic,
},
'Parameters': {
'timesteps': self.timesteps,
'vtkWriteFrequency': self.vtkWriteFrequency,
'useGui': 0,
'remainingTimeLoggerFrequency': -1,
'timeStepStrategy': self.timeStepStrategy,
'overlappingWidth': self.overlappingWidth,
'gpuBlockSize': self.cuda_block_size,
'warmupSteps': self.warmupSteps,
'scenario': self.scenario,
},
'Boundaries_GPU': {
'Border': []
},
'Boundaries_CPU': {
'Border': []
},
'Bubble': {
'bubbleMidPoint': self.bubbleMidPoint,
'bubbleRadius': self.bubbleRadius,
},
}
@wlb.member_callback
def results_callback(self, **kwargs):
data = {}