//======================================================================================================================
//
//  This file is part of waLBerla. waLBerla is free software: you can 
//  redistribute it and/or modify it under the terms of the GNU General Public
//  License as published by the Free Software Foundation, either version 3 of 
//  the License, or (at your option) any later version.
//  
//  waLBerla is distributed in the hope that it will be useful, but WITHOUT 
//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
//  for more details.
//  
//  You should have received a copy of the GNU General Public License along
//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file UniformGrid.cpp
//! \author Florian Schornbaum <florian.schornbaum@fau.de>
//
//======================================================================================================================

#include "blockforest/Initialization.h"
#include "blockforest/SetupBlockForest.h"
#include "blockforest/StructuredBlockForest.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "blockforest/communication/UniformDirectScheme.h"
#include "blockforest/loadbalancing/Cartesian.h"

#include "boundary/BoundaryHandling.h"

#include "core/Abort.h"
#include "core/DataTypes.h"
#include "core/cell/CellInterval.h"
#include "core/config/Config.h"
#include "core/debug/Debug.h"
#include "core/debug/TestSubsystem.h"
#include "core/logging/Logging.h"
#include "core/math/IntegerFactorization.h"
#include "core/math/Vector3.h"
#include "core/mpi/Environment.h"
#include "core/mpi/MPIManager.h"
#include "core/mpi/MPIWrapper.h"
#include "core/mpi/Reduce.h"
#include "core/timing/TimingPool.h"

#include "domain_decomposition/SharedSweep.h"

#include "field/AddToStorage.h"
#include "field/FlagField.h"
#include "field/FlagUID.h"
#include "field/communication/PackInfo.h"
#include "field/communication/UniformMPIDatatypeInfo.h"
#include "field/iterators/FieldIterator.h"
#include "field/vtk/FlagFieldCellFilter.h"
#include "field/vtk/VTKWriter.h"

#include "lbm/BlockForestEvaluation.h"
#include "lbm/PerformanceEvaluation.h"
#include "lbm/boundary/NoSlip.h"
#include "lbm/boundary/SimpleUBB.h"
#include "lbm/communication/PdfFieldPackInfo.h"
#include "lbm/communication/PdfFieldMPIDatatypeInfo.h"
#include "lbm/field/AddToStorage.h"
#include "lbm/field/PdfField.h"
#include "lbm/lattice_model/CollisionModel.h"
#include "lbm/lattice_model/D3Q19.h"
#include "lbm/sweeps/CellwiseSweep.h"
#include "lbm/sweeps/SplitPureSweep.h"
#include "lbm/sweeps/SplitSweep.h"
#include "lbm/sweeps/SweepWrappers.h"
#include "lbm/vtk/Density.h"
#include "lbm/vtk/Velocity.h"

#include "postprocessing/sqlite/SQLite.h"

#include "stencil/D3Q19.h"
#include "stencil/D3Q27.h"

#include "timeloop/SweepTimeloop.h"

#include "vtk/BlockCellDataWriter.h"
#include "vtk/Initialization.h"
#include "vtk/VTKOutput.h"

#include <cstdlib>
#include <iostream>
#include <memory>



namespace uniform_grid {

///////////
// USING //
///////////

using namespace walberla;
using walberla::uint_t;
using walberla::real_t;

//////////////
// TYPEDEFS //
//////////////

typedef lbm::D3Q19< lbm::collision_model::SRT,           false > D3Q19_SRT_INCOMP;
typedef lbm::D3Q19< lbm::collision_model::SRT,           true  > D3Q19_SRT_COMP;
typedef lbm::D3Q19< lbm::collision_model::TRT,           false > D3Q19_TRT_INCOMP;
typedef lbm::D3Q19< lbm::collision_model::TRT,           true  > D3Q19_TRT_COMP;
typedef lbm::D3Q19< lbm::collision_model::D3Q19MRT,      false > D3Q19_MRT_INCOMP;
typedef lbm::D3Q27< lbm::collision_model::D3Q27Cumulant, true  > D3Q27_CUMULANT_COMP;


template< typename LatticeModel_T >
struct Types
{
   using Stencil_T = typename LatticeModel_T::Stencil;
   using CommunicationStencil_T = typename LatticeModel_T::CommunicationStencil;
   using PdfField_T = lbm::PdfField< LatticeModel_T >;
};

using flag_t = walberla::uint8_t;
using FlagField_T = FlagField<flag_t>;

const uint_t FieldGhostLayers = 1;

///////////
// FLAGS //
///////////

const FlagUID  Fluid_Flag( "fluid" );
const FlagUID    UBB_Flag( "velocity bounce back" );
const FlagUID NoSlip_Flag( "no slip" );






/////////////////////
// OUTPUT HELPERS  //
/////////////////////

template< typename LatticeModel_T, class Enable = void >
struct StencilString;

template< typename LatticeModel_T >
struct StencilString< LatticeModel_T, typename boost::enable_if_c< boost::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value >::type >
{
   static const char * str() { return "D3Q19"; }
};


template< typename LatticeModel_T >
struct StencilString< LatticeModel_T, typename boost::enable_if_c< boost::is_same< typename LatticeModel_T::Stencil, stencil::D3Q27 >::value >::type >
{
  static const char * str() { return "D3Q27"; }
};

template< typename LatticeModel_T, class Enable = void >
struct CollisionModelString;

template< typename LatticeModel_T >
struct CollisionModelString< LatticeModel_T, typename boost::enable_if_c< boost::is_same< typename LatticeModel_T::CollisionModel::tag,
                                                                                          lbm::collision_model::SRT_tag >::value >::type >
{
   static const char * str() { return "SRT"; }
};

template< typename LatticeModel_T >
struct CollisionModelString< LatticeModel_T, typename boost::enable_if_c< boost::is_same< typename LatticeModel_T::CollisionModel::tag,
                                                                                          lbm::collision_model::TRT_tag >::value >::type >
{
   static const char * str() { return "TRT"; }
};

template< typename LatticeModel_T >
struct CollisionModelString< LatticeModel_T, typename boost::enable_if_c< boost::is_same< typename LatticeModel_T::CollisionModel::tag,
                                                                                          lbm::collision_model::MRT_tag >::value >::type >
{
   static const char * str() { return "MRT"; }
};

template< typename LatticeModel_T >
struct CollisionModelString< LatticeModel_T, typename boost::enable_if_c< boost::is_same< typename LatticeModel_T::CollisionModel::tag,
                                                                                          lbm::collision_model::Cumulant_tag >::value >::type >
{
  static const char * str() { return "Cumulant"; }
};





/////////////
// CONFIG  //
/////////////

static inline void getCells( const Config::BlockHandle & configBlock, uint_t & xCells, uint_t & yCells, uint_t & zCells )
{
   xCells = configBlock.getParameter< uint_t >( "xCells", 10 );
   yCells = configBlock.getParameter< uint_t >( "yCells", 10 );
   zCells = configBlock.getParameter< uint_t >( "zCells", 10 );
}

static inline void getCellsAndProcesses( const Config::BlockHandle & configBlock,
                                         const uint_t numberOfProcesses, const uint_t blocksPerProcess,
                                         uint_t & xCells, uint_t & yCells, uint_t & zCells,
                                         uint_t & xProcesses, uint_t & yProcesses, uint_t & zProcesses,
                                         uint_t & xBlocks, uint_t & yBlocks, uint_t & zBlocks )
{
   getCells( configBlock, xCells, yCells, zCells );

   std::vector< real_t > weighting;
   weighting.push_back( configBlock.getParameter< real_t >( "xWeight", real_t(1) ) / real_c(xCells) );
   weighting.push_back( configBlock.getParameter< real_t >( "yWeight", real_t(1) ) / real_c(yCells) );
   weighting.push_back( configBlock.getParameter< real_t >( "zWeight", real_t(1) ) / real_c(zCells) );

   std::vector< uint_t > processes = math::getFactors( numberOfProcesses, 3, weighting );

   WALBERLA_CHECK_EQUAL( processes.size(), 3 );
   WALBERLA_CHECK_EQUAL( processes[0] * processes[1] * processes[2], numberOfProcesses );

   xProcesses = processes[0];
   yProcesses = processes[1];
   zProcesses = processes[2];

   Vector3< uint_t > blocksPerProcess3D = math::getFactors3D( blocksPerProcess );

   xBlocks = processes[0] * blocksPerProcess3D[0];
   yBlocks = processes[1] * blocksPerProcess3D[1];
   zBlocks = processes[2] * blocksPerProcess3D[2];
}






///////////////////////////
// BLOCK STRUCTURE SETUP //
///////////////////////////

void createSetupBlockForest( blockforest::SetupBlockForest & sforest, const Config::BlockHandle & configBlock, const uint_t numberOfProcesses, const uint_t blocksPerProcess )
{
   uint_t numberOfXCellsPerBlock, numberOfYCellsPerBlock, numberOfZCellsPerBlock;
   uint_t numberOfXProcesses, numberOfYProcesses, numberOfZProcesses;
   uint_t numberOfXBlocks, numberOfYBlocks, numberOfZBlocks;

   getCellsAndProcesses( configBlock, numberOfProcesses, blocksPerProcess,
                         numberOfXCellsPerBlock, numberOfYCellsPerBlock, numberOfZCellsPerBlock,
                         numberOfXProcesses, numberOfYProcesses, numberOfZProcesses,
                         numberOfXBlocks, numberOfYBlocks, numberOfZBlocks );

   sforest.addWorkloadMemorySUIDAssignmentFunction( blockforest::uniformWorkloadAndMemoryAssignment );

   sforest.init( AABB( real_t(0), real_t(0), real_t(0), real_c( numberOfXBlocks * numberOfXCellsPerBlock ),
                                                        real_c( numberOfYBlocks * numberOfYCellsPerBlock ),
                                                        real_c( numberOfZBlocks * numberOfZCellsPerBlock ) ),
                                                        numberOfXBlocks, numberOfYBlocks, numberOfZBlocks, false, false, false );

   if( MPIManager::instance()->numProcesses() > 1 )
   {
      shared_ptr< std::vector<uint_t> > processIdMap;

      WALBERLA_MPI_SECTION()
      {
         if ( MPIManager::instance()->isCartesianCommValid() )
         {
            MPIManager::instance()->createCartesianComm(numberOfXProcesses, numberOfYProcesses, numberOfZProcesses, false, false, false);

            processIdMap = make_shared<std::vector<uint_t> >(numberOfXProcesses * numberOfYProcesses * numberOfZProcesses);

            for (uint_t z = 0; z != numberOfZProcesses; ++z) {
               for (uint_t y = 0; y != numberOfYProcesses; ++y) {
                  for (uint_t x = 0; x != numberOfXProcesses; ++x)
                  {
                     (*processIdMap)[z * numberOfXProcesses * numberOfYProcesses + y * numberOfXProcesses + x] =
                           uint_c(MPIManager::instance()->cartesianRank(x, y, z));
                  }
               }
            }
         }
         else {
            WALBERLA_LOG_WARNING_ON_ROOT( "Your version of OpenMPI contains a bug. See waLBerla issue #73 for more "
                                          "information. As a workaround, MPI_COMM_WORLD instead of a "
                                          "Cartesian MPI communicator is used." );
            MPIManager::instance()->useWorldComm();
         }
      }
      else
         MPIManager::instance()->useWorldComm();

      sforest.balanceLoad( blockforest::CartesianDistribution( numberOfXProcesses, numberOfYProcesses, numberOfZProcesses, processIdMap.get() ),
                           numberOfXProcesses * numberOfYProcesses * numberOfZProcesses );
   }
   else
   {
      MPIManager::instance()->useWorldComm();

      sforest.balanceLoad( blockforest::CartesianDistribution( numberOfXProcesses, numberOfYProcesses, numberOfZProcesses, nullptr ),
                           numberOfXProcesses * numberOfYProcesses * numberOfZProcesses, real_t(0), 0, true );
   }

   WALBERLA_LOG_INFO_ON_ROOT( "SetupBlockForest created successfully:\n" << sforest );
}



shared_ptr< blockforest::StructuredBlockForest > createStructuredBlockForest( const Config::BlockHandle & configBlock )
{
   uint_t numberOfXCellsPerBlock, numberOfYCellsPerBlock, numberOfZCellsPerBlock;
   getCells( configBlock, numberOfXCellsPerBlock, numberOfYCellsPerBlock, numberOfZCellsPerBlock );

   const uint_t blocksPerProcess = configBlock.getParameter< uint_t >( "blocksPerProcess", uint_t( 1 ) );

   if( configBlock.isDefined( "sbffile" ) )
   {
      std::string sbffile = configBlock.getParameter< std::string >( "sbffile" );

      WALBERLA_LOG_INFO_ON_ROOT( "Creating the block structure: loading from file \'" << sbffile << "\' ..." );

      return blockforest::createUniformBlockGrid( sbffile, numberOfXCellsPerBlock, numberOfYCellsPerBlock, numberOfZCellsPerBlock, false );
   }

   WALBERLA_LOG_INFO_ON_ROOT( "Creating the block structure ..." );

   blockforest::SetupBlockForest sforest;
   createSetupBlockForest( sforest, configBlock, uint_c( MPIManager::instance()->numProcesses() ), blocksPerProcess );

   auto bf = std::make_shared< blockforest::BlockForest >( uint_c( MPIManager::instance()->rank() ), sforest, false );

   auto sbf = std::make_shared< blockforest::StructuredBlockForest >( bf, numberOfXCellsPerBlock,
                                                                                                            numberOfYCellsPerBlock,
                                                                                                            numberOfZCellsPerBlock );
   sbf->createCellBoundingBoxes();

   return sbf;
}






///////////////////////
// BOUNDARY HANDLING //
///////////////////////

template< typename LatticeModel_T >
class MyBoundaryHandling
{
public:

   typedef lbm::NoSlip< LatticeModel_T, flag_t >    NoSlip_T;
   typedef lbm::SimpleUBB< LatticeModel_T, flag_t > UBB_T;

   typedef boost::tuples::tuple< NoSlip_T, UBB_T > BoundaryConditions_T;

   typedef BoundaryHandling< FlagField_T, typename Types<LatticeModel_T>::Stencil_T, BoundaryConditions_T > BoundaryHandling_T;



   MyBoundaryHandling( const BlockDataID & flagField, const BlockDataID & pdfField, const real_t velocity ) :
      flagField_( flagField ), pdfField_( pdfField ), velocity_( velocity ) {}

   BoundaryHandling_T * operator()( IBlock* const block, const StructuredBlockStorage* const storage ) const;

private:

   const BlockDataID flagField_;
   const BlockDataID  pdfField_;

   const real_t velocity_;

}; // class MyBoundaryHandling

template< typename LatticeModel_T >
typename MyBoundaryHandling<LatticeModel_T>::BoundaryHandling_T *
MyBoundaryHandling<LatticeModel_T>::operator()( IBlock * const block, const StructuredBlockStorage * const storage ) const
{
   using PdfField_T = typename Types< LatticeModel_T >::PdfField_T;

   WALBERLA_ASSERT_NOT_NULLPTR( block );
   WALBERLA_ASSERT_NOT_NULLPTR( storage );

   FlagField_T * flagField = block->getData< FlagField_T >( flagField_ );
   PdfField_T *   pdfField = block->getData< PdfField_T > (  pdfField_ );

   const auto fluid = flagField->flagExists( Fluid_Flag ) ? flagField->getFlag( Fluid_Flag ) : flagField->registerFlag( Fluid_Flag );

   BoundaryHandling_T * handling = new BoundaryHandling_T( "boundary handling", flagField, fluid,
         boost::tuples::make_tuple( NoSlip_T( "no slip", NoSlip_Flag, pdfField ),
                                       UBB_T( "velocity bounce back", UBB_Flag, pdfField, velocity_, real_c(0), real_c(0) ) ) );

   CellInterval domainBB = storage->getDomainCellBB();
   storage->transformGlobalToBlockLocalCellInterval( domainBB, *block );

   // no slip WEST
   CellInterval west( domainBB.xMin(), domainBB.yMin(), domainBB.zMin(), domainBB.xMin(), domainBB.yMax(), domainBB.zMax() );
   handling->forceBoundary( NoSlip_Flag, west );

   // no slip EAST
   CellInterval east( domainBB.xMax(), domainBB.yMin(), domainBB.zMin(), domainBB.xMax(), domainBB.yMax(), domainBB.zMax() );
   handling->forceBoundary( NoSlip_Flag, east );

   // no slip SOUTH
   CellInterval south( domainBB.xMin(), domainBB.yMin(), domainBB.zMin(), domainBB.xMax(), domainBB.yMin(), domainBB.zMax() );
   handling->forceBoundary( NoSlip_Flag, south );

   // no slip NORTH
   CellInterval north( domainBB.xMin(), domainBB.yMax(), domainBB.zMin(), domainBB.xMax(), domainBB.yMax(), domainBB.zMax() );
   handling->forceBoundary( NoSlip_Flag, north );

   // no slip BOTTOM
   CellInterval bottom( domainBB.xMin(), domainBB.yMin(), domainBB.zMin(), domainBB.xMax(), domainBB.yMax(), domainBB.zMin() );
   handling->forceBoundary( NoSlip_Flag, bottom );

   // velocity bounce back TOP
   CellInterval top( domainBB.xMin(), domainBB.yMin(), domainBB.zMax(), domainBB.xMax(), domainBB.yMax(), domainBB.zMax() );
   handling->forceBoundary( UBB_Flag, top );

   handling->fillWithDomain( domainBB );

   return handling;
}






/////////
// VTK //
/////////

template< typename LatticeModel_T >
class MyVTKOutput {

public:

   MyVTKOutput( const ConstBlockDataID & pdfField, const ConstBlockDataID & flagField,
                const vtk::VTKOutput::BeforeFunction& pdfGhostLayerSync ) :
      pdfField_( pdfField ), flagField_( flagField ), pdfGhostLayerSync_( pdfGhostLayerSync ) {}

   void operator()( std::vector< shared_ptr<vtk::BlockCellDataWriterInterface> > & writers,
                    std::map< std::string, vtk::VTKOutput::CellFilter > &          filters,
                    std::map< std::string, vtk::VTKOutput::BeforeFunction > &      beforeFunctions );

private:

   const ConstBlockDataID pdfField_;
   const ConstBlockDataID flagField_;

   vtk::VTKOutput::BeforeFunction pdfGhostLayerSync_;

}; // class MyVTKOutput

template< typename LatticeModel_T >
void MyVTKOutput<LatticeModel_T>::operator()( std::vector< shared_ptr<vtk::BlockCellDataWriterInterface> > & writers,
                                              std::map< std::string, vtk::VTKOutput::CellFilter > &          filters,
                                              std::map< std::string, vtk::VTKOutput::BeforeFunction > &      beforeFunctions )
{
   // block data writers

   writers.push_back( make_shared< lbm::VelocityVTKWriter<LatticeModel_T> >( pdfField_, "VelocityFromPDF" ) );
   writers.push_back( make_shared< lbm::DensityVTKWriter<LatticeModel_T> >( pdfField_, "DensityFromPDF" ) );

   writers.push_back( make_shared< field::VTKWriter< FlagField_T > >( flagField_, "FlagField" ) );

   // cell filters

   field::FlagFieldCellFilter<FlagField_T> fluidFilter( flagField_ );
   fluidFilter.addFlag( Fluid_Flag );
   filters[ "FluidFilter" ] = fluidFilter;

   field::FlagFieldCellFilter<FlagField_T> obstacleFilter( flagField_ );
   obstacleFilter.addFlag( NoSlip_Flag );
   obstacleFilter.addFlag(    UBB_Flag );
   filters[ "ObstacleFilter" ] = obstacleFilter;

   // before functions

   beforeFunctions[ "PDFGhostLayerSync" ] = pdfGhostLayerSync_;
}






////////////////////
// THE SIMULATION //
////////////////////



template< typename LatticeModel_T, class Enable = void >
struct AddLB
{
   using PdfField = typename Types< LatticeModel_T >::PdfField_T;
   using CommunicationStencil = typename Types< LatticeModel_T >::CommunicationStencil_T;

   static void add( shared_ptr< blockforest::StructuredBlockForest > & blocks, SweepTimeloop & timeloop,
                    const BlockDataID & pdfFieldId, const BlockDataID & flagFieldId, const BlockDataID & boundaryHandlingId,
                    const bool split, const bool pure, const bool fullComm, const bool fused, const bool directComm )
   {
      // setup of the LB communication for synchronizing the pdf field between neighboring blocks

      std::function< void () > commFunction;
      if( directComm )
      {
         if( fullComm )
         {
            blockforest::communication::UniformDirectScheme< stencil::D3Q27 > comm( blocks );
            auto mpiDatatypeInfo = make_shared<field::communication::UniformMPIDatatypeInfo< PdfField > >( pdfFieldId );
            comm.addDataToCommunicate( mpiDatatypeInfo );
            commFunction = comm;
         }
         else
         {
            blockforest::communication::UniformDirectScheme< CommunicationStencil > comm( blocks );
            auto mpiDatatypeInfo = make_shared<lbm::communication::PdfFieldMPIDatatypeInfo< PdfField > >( pdfFieldId );
            comm.addDataToCommunicate( mpiDatatypeInfo );
            commFunction = comm;
         }
      }
      else
      {
         if( fullComm )
         {
            blockforest::communication::UniformBufferedScheme< stencil::D3Q27 > scheme( blocks );
            scheme.addPackInfo( make_shared< field::communication::PackInfo< PdfField > >( pdfFieldId ) );
            commFunction = scheme;
         }
         else
         {
            blockforest::communication::UniformBufferedScheme< CommunicationStencil > scheme( blocks );
            scheme.addPackInfo( make_shared< lbm::PdfFieldPackInfo< LatticeModel_T > >( pdfFieldId ) );
            commFunction = scheme;
         }
      }

      if( fused )
      {
         timeloop.add() << BeforeFunction( commFunction, "LB communication" )
                        << Sweep( MyBoundaryHandling<LatticeModel_T>::BoundaryHandling_T::getBlockSweep( boundaryHandlingId ), "LB boundary sweep" );

         if( split )
         {
            if( pure )
               timeloop.add() << Sweep( lbm::SplitPureSweep< LatticeModel_T >( pdfFieldId ), "split pure LB sweep (stream & collide)" );
            else
               timeloop.add() << Sweep( lbm::SplitSweep< LatticeModel_T, FlagField_T >( pdfFieldId, flagFieldId, Fluid_Flag ), "split LB sweep (stream & collide)" );
         }
         else
            timeloop.add() << Sweep( makeSharedSweep( lbm::makeCellwiseSweep< LatticeModel_T, FlagField_T >( pdfFieldId, flagFieldId, Fluid_Flag ) ), "cell-wise LB sweep (stream & collide)" );
      }
      else
      {
         if( split )
         {
            if( pure )
            {
               using Sweep_T = lbm::SplitPureSweep< LatticeModel_T >;
               auto sweep = make_shared< Sweep_T >( pdfFieldId );

               timeloop.add() << Sweep( lbm::CollideSweep< Sweep_T >( sweep ), "split pure LB sweep (collide)" );
               timeloop.add() << BeforeFunction( commFunction, "LB communication" )
                              << Sweep( MyBoundaryHandling<LatticeModel_T>::BoundaryHandling_T::getBlockSweep( boundaryHandlingId ), "LB boundary sweep" );
               timeloop.add() << Sweep( lbm::StreamSweep< Sweep_T >( sweep ), "split pure LB sweep (stream)" );
            }
            else
            {
               typedef lbm::SplitSweep< LatticeModel_T, FlagField_T > Sweep_T;
               auto sweep = make_shared< Sweep_T >( pdfFieldId, flagFieldId, Fluid_Flag );

               timeloop.add() << Sweep( lbm::CollideSweep< Sweep_T >( sweep ), "split LB sweep (collide)" );
               timeloop.add() << BeforeFunction( commFunction, "LB communication" )
                              << Sweep( MyBoundaryHandling<LatticeModel_T>::BoundaryHandling_T::getBlockSweep( boundaryHandlingId ), "LB boundary sweep" );
               timeloop.add() << Sweep( lbm::StreamSweep< Sweep_T >( sweep ), "split LB sweep (stream)" );
            }
         }
         else
         {
            auto sweep = lbm::makeCellwiseSweep< LatticeModel_T, FlagField_T >( pdfFieldId, flagFieldId, Fluid_Flag );

            timeloop.add() << Sweep( lbm::makeCollideSweep( sweep ), "cell-wise LB sweep (collide)" );
            timeloop.add() << BeforeFunction( commFunction, "LB communication" )
                           << Sweep( MyBoundaryHandling<LatticeModel_T>::BoundaryHandling_T::getBlockSweep( boundaryHandlingId ), "LB boundary sweep" );
            timeloop.add() << Sweep( lbm::makeStreamSweep( sweep ), "cell-wise LB sweep (stream)" );
         }
      }
   }
};

template< typename LatticeModel_T  >
struct AddLB< LatticeModel_T, typename boost::enable_if_c< boost::mpl::or_<
                                                                           boost::is_same< typename LatticeModel_T::CollisionModel::tag,
                                                                                           lbm::collision_model::MRT_tag >,
                                                                           boost::is_same< typename LatticeModel_T::CollisionModel::tag,
                                                                                           lbm::collision_model::Cumulant_tag >
                                                                          >::value >::type >
{
   using PdfField = typename Types< LatticeModel_T >::PdfField_T;
   using CommunicationStencil = typename Types< LatticeModel_T >::CommunicationStencil_T;

   static void add( shared_ptr< blockforest::StructuredBlockForest > & blocks, SweepTimeloop & timeloop,
                    const BlockDataID & pdfFieldId, const BlockDataID & flagFieldId, const BlockDataID & boundaryHandlingId,
                    const bool /*split*/, const bool /*pure*/, const bool fullComm, const bool fused, const bool directComm )
   {
      // setup of the LB communication for synchronizing the pdf field between neighboring blocks

      std::function< void () > commFunction;
      if( directComm )
      {
         
         if( fullComm )
         {
            blockforest::communication::UniformDirectScheme< stencil::D3Q27 > comm( blocks );
            auto mpiDatatypeInfo = make_shared<field::communication::UniformMPIDatatypeInfo< PdfField > >( pdfFieldId );
            comm.addDataToCommunicate( mpiDatatypeInfo );
            commFunction = comm;
         }
         else
         {
            blockforest::communication::UniformDirectScheme< CommunicationStencil > comm( blocks );
            auto mpiDatatypeInfo = make_shared<lbm::communication::PdfFieldMPIDatatypeInfo< PdfField > >( pdfFieldId );
            comm.addDataToCommunicate( mpiDatatypeInfo );
            commFunction = comm;
         }
      }
      else
      {
         if( fullComm )
         {
             blockforest::communication::UniformBufferedScheme< stencil::D3Q27 > scheme( blocks );
             scheme.addPackInfo( make_shared< field::communication::PackInfo< PdfField > >( pdfFieldId ) );
             commFunction = scheme;
         }
         else
         {
            blockforest::communication::UniformBufferedScheme< CommunicationStencil > scheme( blocks );
            scheme.addPackInfo( make_shared< lbm::PdfFieldPackInfo< LatticeModel_T > >( pdfFieldId ) );
            commFunction = scheme;
         }
      }

      if( fused )
      {
         timeloop.add() << BeforeFunction( commFunction, "LB communication" )
                        << Sweep( MyBoundaryHandling<LatticeModel_T>::BoundaryHandling_T::getBlockSweep( boundaryHandlingId ), "LB boundary sweep" );
         timeloop.add() << Sweep( makeSharedSweep( lbm::makeCellwiseSweep< LatticeModel_T, FlagField_T >( pdfFieldId, flagFieldId, Fluid_Flag ) ), "cell-wise LB sweep (stream & collide)" );
      }
      else
      {
         auto sweep = lbm::makeCellwiseSweep< LatticeModel_T, FlagField_T >( pdfFieldId, flagFieldId, Fluid_Flag );

         timeloop.add() << Sweep( lbm::makeCollideSweep( sweep ), "cell-wise LB sweep (collide)" );
         timeloop.add() << BeforeFunction( commFunction, "LB communication" )
                        << Sweep( MyBoundaryHandling<LatticeModel_T>::BoundaryHandling_T::getBlockSweep( boundaryHandlingId ), "LB boundary sweep" );
         timeloop.add() << Sweep( lbm::makeStreamSweep( sweep ), "cell-wise LB sweep (stream)" );
      }
   }
};



template< typename LatticeModel_T >
void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeModel,
          const bool split, const bool pure, const bool fzyx, const bool fullComm, const bool fused, const bool directComm )
{
   using PdfField = typename Types<LatticeModel_T>::PdfField_T;

   Config::BlockHandle configBlock = config->getBlock( "UniformGrid" );

   // creating the block structure

   auto blocks = createStructuredBlockForest( configBlock );

   // add pdf field to blocks

   BlockDataID pdfFieldId = fzyx ? lbm::addPdfFieldToStorage( blocks, "pdf field (fzyx)", latticeModel,
                                                              Vector3< real_t >( real_c(0), real_c(0), real_c(0) ), real_t(1),
                                                              FieldGhostLayers, field::fzyx ) :
                                   lbm::addPdfFieldToStorage( blocks, "pdf field (zyxf)", latticeModel,
                                                              Vector3< real_t >( real_c(0), real_c(0), real_c(0) ), real_t(1),
                                                              FieldGhostLayers, field::zyxf );

   // add flag field to blocks

   BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" );

   // add LB boundary handling to blocks

   const real_t velocity = configBlock.getParameter< real_t >( "velocity", real_t(0.05) );

   BlockDataID boundaryHandlingId = blocks->template addStructuredBlockData< typename MyBoundaryHandling< LatticeModel_T >::BoundaryHandling_T >(
            MyBoundaryHandling< LatticeModel_T >( flagFieldId, pdfFieldId, velocity ), "boundary handling" );

   // creating the time loop

   const uint_t outerTimeSteps = configBlock.getParameter< uint_t >( "outerTimeSteps", uint_c(1 ) );
   const uint_t innerTimeSteps = configBlock.getParameter< uint_t >( "innerTimeSteps", uint_c(10) );

   SweepTimeloop timeloop( blocks->getBlockStorage(), outerTimeSteps * innerTimeSteps );

   // VTK

   blockforest::communication::UniformBufferedScheme< stencil::D3Q27 > pdfGhostLayerSync( blocks );
   pdfGhostLayerSync.addPackInfo( make_shared< field::communication::PackInfo< PdfField > >( pdfFieldId ) );

   MyVTKOutput< LatticeModel_T > myVTKOutput( pdfFieldId, flagFieldId, pdfGhostLayerSync );

   std::map< std::string, vtk::SelectableOutputFunction > vtkOutputFunctions;
   vtk::initializeVTKOutput( vtkOutputFunctions, myVTKOutput, blocks, config );

   for( auto output = vtkOutputFunctions.begin(); output != vtkOutputFunctions.end(); ++output )
      timeloop.addFuncBeforeTimeStep( output->second.outputFunction, std::string("VTK: ") + output->first,
                                      output->second.requiredGlobalStates, output->second.incompatibleGlobalStates );

   // add LB kernel, boundary handling, and communication to time loop

   AddLB< LatticeModel_T >::add( blocks, timeloop, pdfFieldId, flagFieldId, boundaryHandlingId, split, pure, fullComm, fused, directComm );

   // logging right before the benchmark starts

   lbm::BlockForestEvaluation< FlagField_T > blockForest( blocks, flagFieldId, Fluid_Flag );
   blockForest.logInfoOnRoot();

   WALBERLA_LOG_INFO_ON_ROOT( "Benchmark parameters:"
                              "\n- collision model:                 " << CollisionModelString< LatticeModel_T >::str() <<
                              "\n- stencil:                         " << StencilString< LatticeModel_T >::str() <<
                              "\n- compressible:                    " << ( LatticeModel_T::compressible ? "yes" : "no" ) <<
                              "\n- fused (stream & collide) kernel: " << ( fused ? "yes" : "no" ) <<
                              "\n- split (collision) kernel:        " << ( split ? "yes" : "no" ) <<
                              "\n- pure kernel:                     " << ( pure ? "yes (collision is also performed within obstacle cells)" : "no" ) <<
                              "\n- data layout:                     " << ( fzyx ? "fzyx (structure of arrays [SoA])" : "zyxf (array of structures [AoS])" ) <<
                              "\n- communication:                   " << ( fullComm ? "full synchronization" : "direction-aware optimizations" ) <<
                              "\n- direct communication:            " << ( directComm ? "enabled" : "disabled" ) );

   // run the benchmark

   lbm::PerformanceEvaluation< FlagField_T > performance( blocks, flagFieldId, Fluid_Flag );

   for( uint_t outerRun = 0; outerRun < outerTimeSteps; ++outerRun )
   {
      WcTimingPool timeloopTiming;

      WALBERLA_MPI_WORLD_BARRIER();
      WcTimer timer;
      timer.start();

      for( uint_t innerRun = 0; innerRun < innerTimeSteps; ++innerRun )
         timeloop.singleStep( timeloopTiming );

      timer.end();

      double time = timer.max();
      mpi::reduceInplace( time, mpi::MAX );

      const auto reducedTimeloopTiming = timeloopTiming.getReduced();

      WALBERLA_LOG_RESULT_ON_ROOT( "Time loop timing:\n" << *reducedTimeloopTiming );

      performance.logResultOnRoot( innerTimeSteps, time );

      WALBERLA_ROOT_SECTION()
      {
         // logging in SQL database

         if( configBlock.getParameter< bool >( "logToSqlDB", true ) )
         {
            const std::string sqlFile = configBlock.getParameter< std::string >( "sqlFile", "performance.sqlite" );

            std::map< std::string, int >         integerProperties;
            std::map< std::string, double >      realProperties;
            std::map< std::string, std::string > stringProperties;

            performance.getResultsForSQLOnRoot( integerProperties, realProperties, stringProperties, innerTimeSteps, time );
            blockForest.getResultsForSQLOnRoot( integerProperties, realProperties, stringProperties );

            stringProperties[ "collisionModel" ]    = CollisionModelString< LatticeModel_T >::str();
            stringProperties[ "stencil" ]           = StencilString< LatticeModel_T >::str();
            stringProperties[ "compressible" ]      = ( LatticeModel_T::compressible ? "yes" : "no" );
            stringProperties[ "fusedKernel" ]       = ( fused ? "yes" : "no" );
            stringProperties[ "splitKernel" ]       = ( split ? "yes" : "no" );
            stringProperties[ "pureKernel" ]        = ( pure ? "yes" : "no" );
            stringProperties[ "dataLayout" ]        = ( fzyx ? "fzyx" : "zyxf" );
            stringProperties[ "fullCommunication" ] = ( fullComm ? "yes" : "no" );
            stringProperties[ "directComm"]         = ( directComm ? "yes" : "no" );

            auto runId = postprocessing::storeRunInSqliteDB( sqlFile, integerProperties, stringProperties, realProperties );
            postprocessing::storeTimingPoolInSqliteDB( sqlFile, runId, *reducedTimeloopTiming, "Timeloop" );
         }
      }
   }

   // logging once again at the end of the simulation, identical to logging at the beginning :-)

   blockForest.logInfoOnRoot();

   WALBERLA_LOG_INFO_ON_ROOT( "Benchmark parameters:"
                              "\n- collision model:                 " << CollisionModelString< LatticeModel_T >::str() <<
                              "\n- stencil:                         " << StencilString< LatticeModel_T >::str() <<
                              "\n- compressible:                    " << ( LatticeModel_T::compressible ? "yes" : "no" ) <<
                              "\n- fused (stream & collide) kernel: " << ( fused ? "yes" : "no" ) <<
                              "\n- split (collision) kernel:        " << ( split ? "yes" : "no" ) <<
                              "\n- pure kernel:                     " << ( pure ? "yes (collision is also performed within obstacle cells)" : "no" ) <<
                              "\n- data layout:                     " << ( fzyx ? "fzyx (structure of arrays [SoA])" : "zyxf (array of structures [AoS])" ) <<
                              "\n- communication:                   " << ( fullComm ? "full synchronization" : "direction-aware optimizations" ) <<
                              "\n- direct communication:            " << ( directComm ? "enabled" : "disabled" ) );

}


//////////
// MAIN //
//////////

enum CM { CMSRT, CMTRT, CMMRT, CMCUM };

int main( int argc, char **argv )
{
   mpi::Environment env( argc, argv );

   if( argc < 2 )
   {
      WALBERLA_ROOT_SECTION()
      {
         std::cout << "Usage: " << argv[0] << " path-to-configuration-file [--trt | --mrt] [--comp] [--not-split] [--not-pure] [--zyxf] [--full-comm] [--not-fused] [--direct-comm]\n"
                      "\n"
                      "By default, SRT is selected as collision model, a communication with direction-aware optimizations is chosen, and an\n"
                      "incompressible, split, pure LB kernel is executed on a PDF field with layout 'fzyx' (= structure of arrays [SoA]).\n"
                      "\n"
                      "Optional arguments:\n"
                      " --trt:         collision model = TRT\n"
                      " --mrt:         collision model = MRT\n"
                      " --cumulant     collision model = cumulant\n"
                      " --comp:        LB kernel is switched from incompressible to compressible\n"
                      " --not-split:   LB kernel NOT split by PDF direction but executed cell by cell\n"
                      " --not-pure:    LB kernel is only executed in fluid cells, not in obstacle/boundary cells.\n"
                      "                Will be automatically selected for non-split LB kernels.\n"
                      " --zyxf:        data layout switched to 'zyxf' (array of structures [AoS])\n"
                      "                Probably the best layout for non-split kernels.\n"
                      " --full-comm:   A full synchronization of neighboring blocks is performed instead of using a communication\n"
                      "                that uses direction-aware optimizations.\n"
                      " --not-fused:   Selects separate LB kernels for collision and streaming.\n"
                      "                By default, a 'fused' stream & collide kernel is used.\n"
                      " --direct-comm: Enables bufferless direct communication\n"
                      "\n"
                      "Please note: For small/very small blocks (i.e., blocks with only few cells), the best performance may be achieved with\n"
                      "             basic (non-split), incompressible LB kernels combined with an array of structures ('zyxf') data layout!" << std::endl;
      }
      return EXIT_SUCCESS;
   }

   logging::Logging::printHeaderOnStream();
   //WALBERLA_ROOT_SECTION() { logging::Logging::instance()->setLogLevel( logging::Logging::PROGRESS ); }

#ifdef _OPENMP
   if( std::getenv( "OMP_NUM_THREADS" ) == nullptr )
      WALBERLA_ABORT( "If you are using a version of the benchmark that was compiled with OpenMP you have to "
                      "specify the environment variable \'OMP_NUM_THREADS\' accordingly!" );
#endif

   // open configuration file

   shared_ptr< Config > config = make_shared< Config >();
   config->readParameterFile( argv[1] );

   Config::BlockHandle configBlock = config->getBlock( "UniformGrid" );

   if( !configBlock )
      WALBERLA_ABORT( "You have to specify a \"UniformGrid\" block in the configuration file!" );

   // In case 'sbffile' and 'processes' are specified in the configuration file:
   // -> just create the block structure and save it to file for later use

   if( configBlock.isDefined( "sbffile" ) && configBlock.isDefined( "processes" ) )
   {
      std::string  sbffile           = configBlock.getParameter< std::string >( "sbffile" );
      const uint_t numberOfProcesses = configBlock.getParameter< uint_t >( "processes" );
      const uint_t blocksPerProcess  = configBlock.getParameter< uint_t >( "blocksPerProcess", uint_t( 1 ) );

      std::ostringstream infoString;
      infoString << "You have selected the option of just creating the block structure (= domain decomposition) and saving the result to file\n"
                    "by specifying the output file name \'" << sbffile << "\' AND providing a targeted number of processes ("
                 << numberOfProcesses << ").\n";

      if( MPIManager::instance()->numProcesses() > 1 )
         WALBERLA_ABORT( infoString.str() << "In this mode you need to start " << argv[0] << " with just one process!" );

      WALBERLA_LOG_INFO_ON_ROOT( infoString.str() << "Creating the block structure ..." );

      blockforest::SetupBlockForest sforest;
      createSetupBlockForest( sforest, configBlock, numberOfProcesses, blocksPerProcess );

      sforest.saveToFile( sbffile.c_str() );

      logging::Logging::printFooterOnStream();
      return EXIT_SUCCESS;
   }

   // reading optional parameters from passed arguments

   CM collisionModel = CMSRT;
   bool compressible = false;
   bool split        = true;
   bool pure         = true;
   bool fzyx         = true;
   bool fullComm     = false;
   bool fused        = true;
   bool directComm   = false;

   for( int i = 2; i < argc; ++i )
   {
      if( std::strcmp( argv[i], "--trt" )         == 0 ) collisionModel = CMTRT;
      if( std::strcmp( argv[i], "--mrt" )         == 0 ) collisionModel = CMMRT;
      if( std::strcmp( argv[i], "--cumulant" )    == 0 ) collisionModel = CMCUM;
      if( std::strcmp( argv[i], "--comp" )        == 0 ) compressible   = true;
      if( std::strcmp( argv[i], "--not-split" )   == 0 ) split          = false;
      if( std::strcmp( argv[i], "--not-pure" )    == 0 ) pure           = false;
      if( std::strcmp( argv[i], "--zyxf" )        == 0 ) fzyx           = false;
      if( std::strcmp( argv[i], "--full-comm" )   == 0 ) fullComm       = true;
      if( std::strcmp( argv[i], "--not-fused" )   == 0 ) fused          = false;
      if( std::strcmp( argv[i], "--direct-comm" ) == 0 ) directComm     = true;
   }

   if( pure && !split )
   {
      WALBERLA_LOG_WARNING_ON_ROOT( "You called the benchmark with \"--not-split\" but without \"--not-pure\".\n"
                                    "\"Pure\" kernels are only available for \"split\" kernels! Setting \"pure\" to false ..." );
      pure = pure && split; // pure only works in combination with split
   }

   if( collisionModel == CMMRT && ( compressible || split || pure ) )
   {
      WALBERLA_LOG_WARNING_ON_ROOT( "Option \"--comp\" is not available for MRT! Also, MRT requires options \"--not-split\" and \"--not-pure\"!\n"
                                    "Setting \"compressible\", \"split\", and \"pure\" to false ..." );
      compressible = false;
      split        = false;
      pure         = false;
   }
   if( collisionModel == CMCUM && ( compressible || split || pure ) )
   {
      WALBERLA_LOG_WARNING_ON_ROOT( "Option \"--comp\" has to be set for Cumulant! Also, Cumulant requires options \"--not-split\" and \"--not-pure\"!\n"
                                            "Setting \"compressible\", to true, \"split\", and \"pure\" to false ..." );
      compressible = true;
      split        = false;
      pure         = false;
   }

   WALBERLA_NON_MPI_SECTION()
   {
      if( directComm )
      {
         WALBERLA_LOG_WARNING_ON_ROOT( "Direct (bufferless) communication is not available when building with MPI disabled.\n"
                                       "Switching to buffered comm..." );
      }
      directComm = false;
   }

   const real_t omega = configBlock.getParameter< real_t >( "omega", real_t(1.4) ); // on the coarsest grid!

   // executing benchmark

   if( collisionModel == CMSRT ) // SRT
   {
      if( compressible )
      {
         D3Q19_SRT_COMP latticeModel = D3Q19_SRT_COMP( lbm::collision_model::SRT( omega ) );
         run( config, latticeModel, split, pure, fzyx, fullComm, fused, directComm );
      }
      else
      {
         D3Q19_SRT_INCOMP latticeModel = D3Q19_SRT_INCOMP( lbm::collision_model::SRT( omega ) );
         run( config, latticeModel, split, pure, fzyx, fullComm, fused, directComm );
      }
   }
   else if( collisionModel == CMTRT ) // TRT
   {
      if( compressible )
      {
         D3Q19_TRT_COMP latticeModel = D3Q19_TRT_COMP( lbm::collision_model::TRT::constructWithMagicNumber( omega ) );
         run( config, latticeModel, split, pure, fzyx, fullComm, fused, directComm );
      }
      else
      {
         D3Q19_TRT_INCOMP latticeModel = D3Q19_TRT_INCOMP( lbm::collision_model::TRT::constructWithMagicNumber( omega ) );
         run( config, latticeModel, split, pure, fzyx, fullComm, fused, directComm );
      }
   }
   else if( collisionModel == CMMRT ) // MRT
   {
      D3Q19_MRT_INCOMP latticeModel = D3Q19_MRT_INCOMP( lbm::collision_model::D3Q19MRT::constructTRTWithMagicNumber( omega ) );
      run( config, latticeModel, split, pure, fzyx, fullComm, fused, directComm );
   }
   else  // Cumulant
   {
      D3Q27_CUMULANT_COMP latticeModel = D3Q27_CUMULANT_COMP( lbm::collision_model::D3Q27Cumulant(omega) );
      run( config, latticeModel, split, pure, fzyx, fullComm, fused, directComm );
   }

   logging::Logging::printFooterOnStream();
   return EXIT_SUCCESS;
}


} // namespace uniform_grid

int main( int argc, char ** argv )
{
	return uniform_grid::main( argc, argv );
}