From 1b78979c1930b8076c153151bc5d037e07d1ec85 Mon Sep 17 00:00:00 2001
From: Martin Bauer <martin.bauer@fau.de>
Date: Fri, 4 Oct 2019 15:53:10 +0200
Subject: [PATCH] Generated UniformGrid Benchmark - two field version

---
 .../UniformGridGenerated/CMakeLists.txt       |  6 +-
 .../UniformGridGenerated/InitShearVelocity.h  | 33 +++++++++
 .../UniformGridGenerated/UniformGrid.prm      | 26 +++++++
 .../UniformGridGenerated.cpp                  | 74 +++++++------------
 .../UniformGridGenerated.py                   | 44 ++++++-----
 5 files changed, 115 insertions(+), 68 deletions(-)
 create mode 100644 apps/benchmarks/UniformGridGenerated/InitShearVelocity.h
 create mode 100644 apps/benchmarks/UniformGridGenerated/UniformGrid.prm

diff --git a/apps/benchmarks/UniformGridGenerated/CMakeLists.txt b/apps/benchmarks/UniformGridGenerated/CMakeLists.txt
index def7d93f6..1e8028d39 100644
--- a/apps/benchmarks/UniformGridGenerated/CMakeLists.txt
+++ b/apps/benchmarks/UniformGridGenerated/CMakeLists.txt
@@ -1,6 +1,8 @@
+waLBerla_link_files_to_builddir( "*.prm" )
+
+
 waLBerla_python_file_generates(UniformGridGenerated.py
-        UniformGridGenerated_LatticeModel.cpp
-        UniformGridGenerated_Defines.h)
+        GenLbKernel.cpp GenMacroGetter.cpp GenMacroSetter.cpp GenPackInfo.cpp GenDefines.h)
 
 
 foreach(config trt )
diff --git a/apps/benchmarks/UniformGridGenerated/InitShearVelocity.h b/apps/benchmarks/UniformGridGenerated/InitShearVelocity.h
new file mode 100644
index 000000000..2aed66b1a
--- /dev/null
+++ b/apps/benchmarks/UniformGridGenerated/InitShearVelocity.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include "core/math/Random.h"
+#include "domain_decomposition/StructuredBlockStorage.h"
+#include "field/GhostLayerField.h"
+#include "field/iterators/IteratorMacros.h"
+
+
+namespace walberla {
+
+
+/**
+ * Initializes a three-component velocity field with a perturbed shear profile.
+ *
+ * For every cell of every block (ghost layers included) the components are set to
+ *   - component 0: +xMagnitude if the global z index is >= zMax/2 of the domain
+ *                  cell bounding box, -xMagnitude otherwise
+ *   - component 1: zero
+ *   - component 2: xMagnitude * math::realRandom(-fluctuationMagnitude, fluctuationMagnitude)
+ *
+ * math::seedRandomGenerator(0) is called first, so every call starts from the same seed.
+ *
+ * \param blocks                block storage that holds the velocity field
+ * \param velFieldID            block data ID of a GhostLayerField<real_t, 3>
+ * \param xMagnitude            absolute value of the x velocity in both halves
+ * \param fluctuationMagnitude  bound passed to math::realRandom for the z perturbation
+ */
+inline void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID velFieldID,
+                              const real_t xMagnitude=0.005, const real_t fluctuationMagnitude=0.05 )
+{
+    math::seedRandomGenerator(0);
+    const auto halfZ = blocks->getDomainCellBB().zMax() / 2;
+    for( auto & block: *blocks)
+    {
+        auto velField = block.getData<GhostLayerField<real_t, 3> >( velFieldID );
+        WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velField,
+            Cell globalCell;
+            blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
+            const real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude);
+
+            // upper half of the domain moves in +x, lower half in -x
+            velField->get(x, y, z, 0) = ( globalCell[2] >= halfZ ) ? xMagnitude : -xMagnitude;
+            velField->get(x, y, z, 1) = real_t(0);
+            velField->get(x, y, z, 2) = randomReal;
+        );
+    }
+}
+
+
+} // namespace walberla
diff --git a/apps/benchmarks/UniformGridGenerated/UniformGrid.prm b/apps/benchmarks/UniformGridGenerated/UniformGrid.prm
new file mode 100644
index 000000000..622515557
--- /dev/null
+++ b/apps/benchmarks/UniformGridGenerated/UniformGrid.prm
@@ -0,0 +1,26 @@
+DomainSetup
+{
+   blocks        <  1,    1,   1 >;
+   cellsPerBlock <  64, 64, 128 >;
+   periodic      <  1,    1,   1 >;
+}
+
+Parameters
+{
+
+   timesteps       200;   // time steps of one performance measurement
+   warmupSteps     10;    // number of steps to run before measurement starts
+   outerIterations 15;    // how many measurements to conduct
+
+   vtkWriteFrequency 200;   // write a VTK file every n'th step, if zero VTK output is disabled
+   cudaEnabledMPI false;    // switch on if you have a CUDA-enabled MPI implementation
+
+   timeStepStrategy kernelOnly;   // can be: noOverlap, simpleOverlap, complexOverlap, kernelOnly
+   innerOuterSplit < 8, 1, 1>;    // slice-thickness that 'outer'-kernels process when overlapping
+
+   remainingTimeLoggerFrequency 0;   // interval in seconds to log the estimated remaining time
+
+   omega 1.8;
+   shearVelocityMagnitude 0.08;   // x-velocity magnitude of the initial shear flow
+   useGui 0;
+}
diff --git a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp
index f48b0f755..69db6180b 100644
--- a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp
+++ b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp
@@ -6,52 +6,26 @@
 #include "python_coupling/DictWrapper.h"
 #include "blockforest/Initialization.h"
 #include "field/vtk/VTKWriter.h"
+#include "field/AddToStorage.h"
 #include "field/communication/PackInfo.h"
 #include "blockforest/communication/UniformBufferedScheme.h"
 #include "timeloop/all.h"
 #include "core/timing/TimingPool.h"
 #include "core/timing/RemainingTimeLogger.h"
 #include "domain_decomposition/SharedSweep.h"
-#include "lbm/communication/PdfFieldPackInfo.h"
-#include "lbm/field/AddToStorage.h"
-#include "lbm/vtk/VTKOutput.h"
-#include "lbm/gui/Connection.h"
-#include "lbm/vtk/Velocity.h"
 #include "gui/Gui.h"
+#include "InitShearVelocity.h"
 
-#include "UniformGridGenerated_LatticeModel.h"
-#include "UniformGridGenerated_Defines.h"
-
+#include "GenDefines.h"
+#include "GenPackInfo.h"
+#include "GenLbKernel.h"
+#include "GenMacroGetter.h"
+#include "GenMacroSetter.h"
 
 using namespace walberla;
 
-typedef lbm::UniformGridGenerated_LatticeModel LatticeModel_T;
-typedef LatticeModel_T::Stencil                Stencil_T;
-typedef LatticeModel_T::CommunicationStencil   CommunicationStencil_T;
-typedef lbm::PdfField< LatticeModel_T >        PdfField_T;
-
-
-void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID pdfFieldId,
-                       const real_t xMagnitude=0.1, const real_t fluctuationMagnitude=0.05 )
-{
-    math::seedRandomGenerator(0);
-    auto halfZ = blocks->getDomainCellBB().zMax() / 2;
-    for( auto & block: *blocks)
-    {
-        auto pdfField = block.getData<PdfField_T>( pdfFieldId );
-        WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(pdfField,
-            Cell globalCell;
-            blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
-            real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude);
-
-            if( globalCell[2] >= halfZ ) {
-                pdfField->setDensityAndVelocity(x, y, z, Vector3<real_t>(xMagnitude, 0, randomReal), real_t(1.0));
-            } else {
-                pdfField->setDensityAndVelocity(x, y, z, Vector3<real_t>(-xMagnitude, 0, randomReal), real_t(1.0));
-            }
-        );
-    }
-}
+using PdfField_T = GhostLayerField< real_t, Stencil_T::Q >;
+using VelocityField_T = GhostLayerField< real_t, 3 >;
 
 
 int main( int argc, char **argv )
@@ -72,22 +46,25 @@ int main( int argc, char **argv )
       const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal");
       const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
       const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 ));
-      const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", false);
+      const real_t shearVelocityMagnitude = parameters.getParameter<real_t>("shearVelocityMagnitude", 0.08);
 
       // Creating fields
-      LatticeModel_T latticeModel = LatticeModel_T( omega );
-      BlockDataID pdfFieldId = lbm::addPdfFieldToStorage( blocks, "pdf field", latticeModel);
+      BlockDataID pdfFieldId = field::addToStorage< PdfField_T >( blocks, "pdfs", real_t( std::nan("") ), field::fzyx );
+      BlockDataID velFieldId = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx );
 
-      if( initShearFlow ) {
-          initShearVelocity(blocks, pdfFieldId);
-      }
+      pystencils::GenMacroSetter setterKernel(pdfFieldId, velFieldId);
+      pystencils::GenMacroGetter getterKernel(pdfFieldId, velFieldId);
+
+      initShearVelocity(blocks, velFieldId, shearVelocityMagnitude);
+      for( auto & b : *blocks)
+          setterKernel(&b);
 
       SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps );
-      blockforest::communication::UniformBufferedScheme< CommunicationStencil_T > communication( blocks );
-      communication.addPackInfo( make_shared< lbm::PdfFieldPackInfo< LatticeModel_T > >( pdfFieldId ) );
+      blockforest::communication::UniformBufferedScheme< Stencil_T > communication( blocks );
+      communication.addPackInfo( make_shared< pystencils::GenPackInfo >( pdfFieldId ) );
 
       timeLoop.add() << BeforeFunction( communication, "communication" )
-                     << Sweep( LatticeModel_T::Sweep( pdfFieldId ), "LB stream & collide" );
+                     << Sweep( pystencils::GenLbKernel(pdfFieldId, omega), "LB stream & collide" );
 
       int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 );
       int outerIterations = parameters.getParameter<int>( "outerIterations", 1 );
@@ -106,8 +83,12 @@ int main( int argc, char **argv )
       {
           auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
                                                            "simulation_step", false, true, true, false, 0 );
-          auto velWriter = make_shared< lbm::VelocityVTKWriter<LatticeModel_T> >(pdfFieldId, "vel");
-          vtkOutput->addCellDataWriter(velWriter);
+          auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >( velFieldId, "vel" );
+          vtkOutput->addCellDataWriter( velWriter );
+          vtkOutput->addBeforeFunction( [&]()
+                                        { for( auto & b : *blocks)
+                                            getterKernel(&b);
+                                        } );
           timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
       }
 
@@ -116,7 +97,6 @@ int main( int argc, char **argv )
       if( useGui )
       {
           GUI gui( timeLoop, blocks, argc, argv);
-          lbm::connectToGui<LatticeModel_T>(gui);
           gui.run();
       }
       else
diff --git a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py
index 8f7bf791c..7027cf7ee 100644
--- a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py
+++ b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py
@@ -1,9 +1,9 @@
 import sympy as sp
 import pystencils as ps
-from lbmpy.creationfunctions import create_lb_collision_rule
+from lbmpy.creationfunctions import create_lb_update_rule
 from lbmpy.fieldaccess import StreamPullTwoFieldsAccessor
-from lbmpy_walberla import generate_lattice_model
-from pystencils_walberla import CodeGeneration
+from pystencils_walberla import CodeGeneration, generate_pack_info_from_kernel, generate_sweep
+from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
 
 omega = sp.symbols("omega")
 omega_fill = sp.symbols("omega_:10")
@@ -81,8 +81,9 @@ with CodeGeneration() as ctx:
         'field_name': 'pdfs',
         'temporary_field_name': 'pdfs_tmp',
         'kernel_type': accessor,
-        'optimization': {'cse_global': True,
-                         'cse_pdfs': False}
+        'optimization': {'cse_global': False,
+                         'cse_pdfs': True,
+                         'split': True}
     }
     config_name = ctx.config
     noopt = False
@@ -94,6 +95,8 @@ with CodeGeneration() as ctx:
         d3q27 = True
         config_name = config_name[:-len("_d3q27")]
 
+    if config_name == '':
+        config_name = 'trt'
     options = options_dict[config_name]
     options.update(common_options)
     options = options.copy()
@@ -109,20 +112,22 @@ with CodeGeneration() as ctx:
     pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx')
     options['optimization']['symbolic_field'] = pdfs
 
-    vp = [
-        ('double', 'omega_0'),
-        ('double', 'omega_1'),
-        ('double', 'omega_2'),
-        ('double', 'omega_3'),
-        ('double', 'omega_4'),
-        ('double', 'omega_5'),
-        ('double', 'omega_6'),
-        ('int32_t', 'cudaBlockSize0'),
-        ('int32_t', 'cudaBlockSize1'),
-    ]
-    update_rule = create_lb_collision_rule(**options)
-    generate_lattice_model(ctx, 'UniformGridGenerated_LatticeModel', update_rule)
+    update_rule = create_lb_update_rule(**options)
+    vec = {'nontemporal': True, 'assume_aligned': True, 'assume_inner_stride_one': True}
 
+    # Sweeps
+    generate_sweep(ctx, 'GenLbKernel', update_rule, field_swaps=[('pdfs', 'pdfs_tmp')])
+    setter_assignments = macroscopic_values_setter(update_rule.method, velocity=velocity_field.center_vector,
+                                                   pdfs=pdfs.center_vector, density=1)
+    getter_assignments = macroscopic_values_getter(update_rule.method, velocity=velocity_field.center_vector,
+                                                   pdfs=pdfs.center_vector,  density=None)
+    generate_sweep(ctx, 'GenMacroSetter', setter_assignments)
+    generate_sweep(ctx, 'GenMacroGetter', getter_assignments)
+
+    # Communication
+    generate_pack_info_from_kernel(ctx, 'GenPackInfo', update_rule, cpu_vectorize_info={'instruction_set': None})
+
+    # Info Header
     infoHeaderParams = {
         'stencil': stencil_str,
         'q': q,
@@ -130,4 +135,5 @@ with CodeGeneration() as ctx:
         'cse_global': int(options['optimization']['cse_global']),
         'cse_pdfs': int(options['optimization']['cse_pdfs']),
     }
-    ctx.write_file("UniformGridGenerated_Defines.h", info_header.format(**infoHeaderParams))
+    ctx.write_file("GenDefines.h", info_header.format(**infoHeaderParams))
+
-- 
GitLab