From c933bf255788551f28740370ec0b1e1653896e90 Mon Sep 17 00:00:00 2001
From: Martin Bauer <martin.bauer@fau.de>
Date: Sat, 5 Oct 2019 10:50:32 +0200
Subject: [PATCH] Corrections in UniformGridGenerated Benchmark

---
 .../UniformGridGenerated/CMakeLists.txt       |  5 +-
 .../UniformGridGenerated/UniformGrid.prm      | 16 ++---
 .../UniformGridGenerated.cpp                  | 64 +++++++++++++++----
 .../UniformGridGenerated.py                   | 41 +++++++-----
 4 files changed, 86 insertions(+), 40 deletions(-)

diff --git a/apps/benchmarks/UniformGridGenerated/CMakeLists.txt b/apps/benchmarks/UniformGridGenerated/CMakeLists.txt
index 1e8028d39..836baeefa 100644
--- a/apps/benchmarks/UniformGridGenerated/CMakeLists.txt
+++ b/apps/benchmarks/UniformGridGenerated/CMakeLists.txt
@@ -2,7 +2,10 @@ waLBerla_link_files_to_builddir( "*.prm" )
 
 
 waLBerla_python_file_generates(UniformGridGenerated.py
-        GenLbKernel.cpp GenMacroGetter.cpp GenMacroSetter.cpp GenPackInfo.cpp GenDefines.h)
+        GenMacroGetter.cpp GenMacroSetter.cpp
+        GenPackInfo.cpp GenPackInfoAAPush.cpp GenPackInfoAAPull.cpp
+        GenLbKernel.cpp GenLbKernelAAEven.cpp GenLbKernelAAOdd.cpp
+        GenDefines.h)
 
 
 foreach(config trt )
diff --git a/apps/benchmarks/UniformGridGenerated/UniformGrid.prm b/apps/benchmarks/UniformGridGenerated/UniformGrid.prm
index 622515557..ae22ab6b5 100644
--- a/apps/benchmarks/UniformGridGenerated/UniformGrid.prm
+++ b/apps/benchmarks/UniformGridGenerated/UniformGrid.prm
@@ -1,26 +1,22 @@
 DomainSetup
 {
    blocks        <  1,    1,   1 >;
-   cellsPerBlock <  64, 64, 128 >;
+   cellsPerBlock <  256, 128, 128 >;
    periodic      <  1,    1,   1 >;
 }
 
 Parameters 
 {
 
-	timesteps       200;   // time steps of one performance measurement
-	warmupSteps     10;    // number of steps to run before measurement starts
-    outerIterations 15;      // how many measurements to conduct
+	timesteps       400;   // time steps of one performance measurement
+	warmupSteps     1;    // number of steps to run before measurement starts
+    outerIterations 1;      // how many measurements to conduct
 
-	vtkWriteFrequency 200;             // write a VTK file every n'th step, if zero VTK output is disabled
+	vtkWriteFrequency 0;           // write a VTK file every n'th step, if zero VTK output is disabled
 	cudaEnabledMPI false;            // switch on if you have a CUDA-enabled MPI implementation
-
-	timeStepStrategy kernelOnly;    // can be: noOverlap, simpleOverlap, complexOverlap, kernelOnly
-	innerOuterSplit < 8, 1, 1>;     // slice-thickness that 'outer'-kernels process when overlapping
-
+	timeStepMode aaKernelOnly;                 // can be: twoField, twoFieldKernelOnly, aa, aaKernelOnly
 	remainingTimeLoggerFrequency 0;  // interval in seconds to log the estimated remaining time
 
 	omega 1.8;
-	initShearFlow 1;
 	useGui 0;
 }
diff --git a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp
index 69db6180b..8ccfa107d 100644
--- a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp
+++ b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp
@@ -1,13 +1,11 @@
 #include "core/Environment.h"
 #include "core/logging/Initialization.h"
-#include "core/math/Random.h"
 #include "python_coupling/CreateConfig.h"
 #include "python_coupling/PythonCallback.h"
 #include "python_coupling/DictWrapper.h"
 #include "blockforest/Initialization.h"
 #include "field/vtk/VTKWriter.h"
 #include "field/AddToStorage.h"
-#include "field/communication/PackInfo.h"
 #include "blockforest/communication/UniformBufferedScheme.h"
 #include "timeloop/all.h"
 #include "core/timing/TimingPool.h"
@@ -17,11 +15,18 @@
 #include "InitShearVelocity.h"
 
 #include "GenDefines.h"
-#include "GenPackInfo.h"
-#include "GenLbKernel.h"
 #include "GenMacroGetter.h"
 #include "GenMacroSetter.h"
 
+#include "GenLbKernel.h"
+#include "GenLbKernelAAEven.h"
+#include "GenLbKernelAAOdd.h"
+
+#include "GenPackInfo.h"
+#include "GenPackInfoAAPush.h"
+#include "GenPackInfoAAPull.h"
+
+
 using namespace walberla;
 
 using PdfField_T = GhostLayerField< real_t, Stencil_T::Q >;
@@ -43,13 +48,14 @@ int main( int argc, char **argv )
       Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t>  >( "cellsPerBlock" );
       // Reading parameters
       auto parameters = config->getOneBlock( "Parameters" );
-      const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal");
+      const std::string timeStepMode = parameters.getParameter<std::string>( "timeStepMode", "twoField");
       const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
-      const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 ));
+            uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 60 ));
       const real_t shearVelocityMagnitude = parameters.getParameter<real_t>("shearVelocityMagnitude", 0.08);
 
       // Creating fields
-      BlockDataID pdfFieldId = field::addToStorage< PdfField_T >( blocks, "pdfs", real_t( std::nan("") ), field::fzyx );
+      //BlockDataID pdfFieldId = field::addToStorage< PdfField_T >( blocks, "pdfs", real_t( std::nan("") ), field::fzyx );
+      BlockDataID pdfFieldId = field::addToStorage< PdfField_T >( blocks, "pdfs", 0.0, field::fzyx );
       BlockDataID velFieldId = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx );
 
       pystencils::GenMacroSetter setterKernel(pdfFieldId, velFieldId);
@@ -59,12 +65,38 @@ int main( int argc, char **argv )
       for( auto & b : *blocks)
           setterKernel(&b);
 
-      SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps );
-      blockforest::communication::UniformBufferedScheme< Stencil_T > communication( blocks );
-      communication.addPackInfo( make_shared< pystencils::GenPackInfo >( pdfFieldId ) );
+      blockforest::communication::UniformBufferedScheme< Stencil_T > twoFieldComm(blocks );
+      twoFieldComm.addPackInfo(make_shared< pystencils::GenPackInfo >(pdfFieldId ) );
+
+      blockforest::communication::UniformBufferedScheme< Stencil_T > aaPullComm(blocks);
+      aaPullComm.addPackInfo(make_shared< pystencils::GenPackInfoAAPull>(pdfFieldId));
+
+      blockforest::communication::UniformBufferedScheme< Stencil_T > aaPushComm(blocks);
+      aaPushComm.addPackInfo(make_shared< pystencils::GenPackInfoAAPush>(pdfFieldId));
+
+      SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps / 2 );
+      if( timeStepMode == "twoField")
+      {
+          timeLoop.add() << BeforeFunction(twoFieldComm, "communication" )
+                         << Sweep( pystencils::GenLbKernel(pdfFieldId), "LB stream & collide1" );
+          timeLoop.add() << BeforeFunction(twoFieldComm, "communication" )
+                         << Sweep( pystencils::GenLbKernel(pdfFieldId), "LB stream & collide2" );
+
+      } else if ( timeStepMode == "twoFieldKernelOnly") {
+          timeLoop.add() << Sweep( pystencils::GenLbKernel(pdfFieldId), "LB stream & collide1" );
+          timeLoop.add() << Sweep( pystencils::GenLbKernel(pdfFieldId), "LB stream & collide2" );
+      } else if ( timeStepMode == "aa") {
+          timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven(pdfFieldId), "AA Even" );
+          timeLoop.add() << BeforeFunction( aaPullComm )
+                         << Sweep( pystencils::GenLbKernelAAOdd(pdfFieldId), "AA Odd")
+                         << AfterFunction( aaPushComm );
+      } else if ( timeStepMode == "aaKernelOnly") {
+          timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven(pdfFieldId), "AA Even" );
+          timeLoop.add() << Sweep( pystencils::GenLbKernelAAOdd(pdfFieldId), "AA Odd");
+      } else {
+          WALBERLA_ABORT("Invalid value for timeStepMode ");
+      }
 
-      timeLoop.add() << BeforeFunction( communication, "communication" )
-                     << Sweep( pystencils::GenLbKernel(pdfFieldId, omega), "LB stream & collide" );
 
       int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 );
       int outerIterations = parameters.getParameter<int>( "outerIterations", 1 );
@@ -108,6 +140,14 @@ int main( int argc, char **argv )
               WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
               simTimer.start();
               timeLoop.run();
+              /*
+              pystencils::GenLbKernelAAEven k1(pdfFieldId, omega);
+              pystencils::GenLbKernelAAOdd k2(pdfFieldId, omega);
+              for(int t=0; t < timesteps / 2; ++t)
+              { for( auto & b : *blocks) {
+                k1(&b);
+                k2(&b);
+              }}*/
               simTimer.end();
               WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
               auto time = simTimer.last();
diff --git a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py
index 7027cf7ee..2afbd8d18 100644
--- a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py
+++ b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py
@@ -4,8 +4,9 @@ from lbmpy.creationfunctions import create_lb_update_rule
 from lbmpy.fieldaccess import StreamPullTwoFieldsAccessor
 from pystencils_walberla import CodeGeneration, generate_pack_info_from_kernel, generate_sweep
 from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
+from lbmpy.fieldaccess import AAEvenTimeStepAccessor, AAOddTimeStepAccessor
 
-omega = sp.symbols("omega")
+omega = 1.6#sp.symbols("omega")
 omega_fill = sp.symbols("omega_:10")
 
 options_dict = {
@@ -18,6 +19,7 @@ options_dict = {
     'trt': {
         'method': 'trt',
         'stencil': 'D3Q19',
+        'compressible': False,
         'relaxation_rate': omega,
     },
     'mrt': {
@@ -74,16 +76,12 @@ const bool infoCsePdfs = {cse_pdfs};
 
 
 with CodeGeneration() as ctx:
-    accessor = StreamPullTwoFieldsAccessor()
-    assert not accessor.is_inplace, "This app does not work for inplace accessors"
-
     common_options = {
         'field_name': 'pdfs',
         'temporary_field_name': 'pdfs_tmp',
-        'kernel_type': accessor,
         'optimization': {'cse_global': False,
-                         'cse_pdfs': True,
-                         'split': True}
+                         'cse_pdfs': False,
+                         'split': False}
     }
     config_name = ctx.config
     noopt = False
@@ -101,9 +99,6 @@ with CodeGeneration() as ctx:
     options.update(common_options)
     options = options.copy()
 
-    if noopt:
-        options['optimization']['cse_global'] = False
-        options['optimization']['cse_pdfs'] = False
     if d3q27:
         options['stencil'] = 'D3Q27'
 
@@ -112,20 +107,32 @@ with CodeGeneration() as ctx:
     pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx')
     options['optimization']['symbolic_field'] = pdfs
 
-    update_rule = create_lb_update_rule(**options)
-    vec = {'nontemporal': True, 'assume_aligned': True, 'assume_inner_stride_one': True}
+    update_rule_two_field = create_lb_update_rule(**options)
+    update_rule_aa_even = create_lb_update_rule(kernel_type=AAEvenTimeStepAccessor(), **options)
+    options['optimization']['split'] = True
+    update_rule_aa_odd = create_lb_update_rule(kernel_type=AAOddTimeStepAccessor(), **options)
+
+    vec = {'nontemporal': False, 'assume_aligned': True, 'assume_inner_stride_one': True}
 
     # Sweeps
-    generate_sweep(ctx, 'GenLbKernel', update_rule, field_swaps=[('pdfs', 'pdfs_tmp')])
-    setter_assignments = macroscopic_values_setter(update_rule.method, velocity=velocity_field.center_vector,
+    generate_sweep(ctx, 'GenLbKernel', update_rule_two_field, field_swaps=[('pdfs', 'pdfs_tmp')])
+    generate_sweep(ctx, 'GenLbKernelAAEven', update_rule_aa_even, cpu_vectorize_info={'assume_aligned': True}, cpu_openmp=6, ghost_layers=1)
+    generate_sweep(ctx, 'GenLbKernelAAOdd', update_rule_aa_odd, cpu_vectorize_info={'assume_aligned': True}, cpu_openmp=6, ghost_layers=1)
+
+    setter_assignments = macroscopic_values_setter(update_rule_two_field.method, velocity=velocity_field.center_vector,
                                                    pdfs=pdfs.center_vector, density=1)
-    getter_assignments = macroscopic_values_getter(update_rule.method, velocity=velocity_field.center_vector,
-                                                   pdfs=pdfs.center_vector,  density=None)
+    getter_assignments = macroscopic_values_getter(update_rule_two_field.method, velocity=velocity_field.center_vector,
+                                                   pdfs=pdfs.center_vector, density=None)
     generate_sweep(ctx, 'GenMacroSetter', setter_assignments)
     generate_sweep(ctx, 'GenMacroGetter', getter_assignments)
 
     # Communication
-    generate_pack_info_from_kernel(ctx, 'GenPackInfo', update_rule, cpu_vectorize_info={'instruction_set': None})
+    generate_pack_info_from_kernel(ctx, 'GenPackInfo', update_rule_two_field,
+                                   cpu_vectorize_info={'instruction_set': None})
+    generate_pack_info_from_kernel(ctx, 'GenPackInfoAAPull', update_rule_aa_odd, kind='pull',
+                                   cpu_vectorize_info={'instruction_set': None})
+    generate_pack_info_from_kernel(ctx, 'GenPackInfoAAPush', update_rule_aa_odd, kind='push',
+                                   cpu_vectorize_info={'instruction_set': None})
 
     # Info Header
     infoHeaderParams = {
-- 
GitLab