#pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "hyteg/ginkgo/GinkgoCommunication.hpp" #include "hyteg/ginkgo/GinkgoDirichletHandling.hpp" #include "hyteg/ginkgo/GinkgoSparseMatrixProxy.hpp" #include "hyteg/ginkgo/GinkgoVectorProxy.hpp" #include "hyteg/solvers/Solver.hpp" namespace hyteg { void gatherIndices( std::vector< int32_t >& velocityIndices, std::vector< int32_t >& pressureIndices, const PrimitiveStorage& storage, uint_t level, const P2P1TaylorHoodFunction< int32_t >& numerator ) { for ( auto dof : FunctionIterator< vertexdof::VertexDoFFunction< int32_t > >( numerator.uvw[0].getVertexDoFFunction(), level ) ) { velocityIndices.push_back( dof.value() ); } for ( auto dof : FunctionIterator< EdgeDoFFunction< int32_t > >( numerator.uvw[0].getEdgeDoFFunction(), level ) ) { velocityIndices.push_back( dof.value() ); } for ( auto dof : FunctionIterator< vertexdof::VertexDoFFunction< int32_t > >( numerator.uvw[1].getVertexDoFFunction(), level ) ) { velocityIndices.push_back( dof.value() ); } for ( auto dof : FunctionIterator< EdgeDoFFunction< int32_t > >( numerator.uvw[1].getEdgeDoFFunction(), level ) ) { velocityIndices.push_back( dof.value() ); } if ( storage.hasGlobalCells() ) { for ( auto dof : FunctionIterator< vertexdof::VertexDoFFunction< int32_t > >( numerator.uvw[2].getVertexDoFFunction(), level ) ) { velocityIndices.push_back( dof.value() ); } for ( auto dof : FunctionIterator< EdgeDoFFunction< int32_t > >( numerator.uvw[2].getEdgeDoFFunction(), level ) ) { velocityIndices.push_back( dof.value() ); } } for ( auto dof : FunctionIterator< vertexdof::VertexDoFFunction< int32_t > >( numerator.p, level ) ) { pressureIndices.push_back( dof.value() ); } } template < typename OperatorType > class GinkgoBlockSolver : public Solver< OperatorType > { using BlockPreconditioner_T = typename OperatorType::BlockPreconditioner_T; public: using FunctionType = typename OperatorType::srcType; using valueType = typename FunctionType::valueType; using indexType = int32_t; using mtx = gko::distributed::Matrix< valueType, int32_t >; using csr = gko::matrix::Csr< valueType, int32_t >; using vec = gko::distributed::Vector< valueType, int32_t >; using dense = gko::matrix::Dense< valueType >; explicit GinkgoBlockSolver( const std::shared_ptr< PrimitiveStorage >& storage, const uint_t& level, std::shared_ptr< const gko::Executor > exec, const uint_t& velocityPreconditionerType = 1, const uint_t& pressurePreconditionerType = 1 ) : storage_( storage ) , solver_exec_( exec ) , host_exec_( exec->get_master() ) , comm_( gko::mpi::communicator::create( storage->getSplitCommunicatorByPrimitiveDistribution() ) ) , blockPreconditioner_( storage, level, level ) , num_( "numerator", storage, level, level ) , velocityPreconditionerType_( velocityPreconditionerType ) , pressurePreconditionerType_( pressurePreconditionerType ) , printInfo_( true ) {} void solve( const OperatorType& A, const typename OperatorType::srcType& x, const typename OperatorType::dstType& b, const walberla::uint_t level ) override { const auto& timingTree = x.getStorage()->getTimingTree(); timingTree->start( "Ginkgo CG Solver" ); const auto num_local_dofs = numberOfLocalDoFs< typename FunctionType::Tag >( *storage_, level ); const auto num_global_dofs = numberOfGlobalDoFs< typename FunctionType::Tag >( *storage_, level, comm_->get() ); if ( !part_ ) { auto [start, end] = local_range( num_local_dofs, comm_ ); part_ = gko::share( gko::distributed::Partition<>::build_from_local_range( host_exec_, start, end, comm_ ) ); } // maybe called in parallel, thus need to keep it for empty processes num_.copyBoundaryConditionFromFunction( x ); num_.enumerate( level ); auto x_gko = vec::create( host_exec_, comm_, part_, gko::dim< 2 >{ num_global_dofs, 1 }, gko::dim< 2 >{ num_local_dofs, 1 } ); auto b_gko = vec::create( host_exec_, comm_, part_, gko::dim< 2 >{ num_global_dofs, 1 }, gko::dim< 2 >{ num_local_dofs, 1 } ); hyteg::petsc::createVectorFromFunction( x, num_, std::make_shared< GinkgoVectorProxy >( x_gko.get(), gko::dim< 2 >{ num_global_dofs, 1 }, part_ ), level, All ); hyteg::petsc::createVectorFromFunction( b, num_, std::make_shared< GinkgoVectorProxy >( b_gko.get(), gko::dim< 2 >{ num_global_dofs, 1 }, part_ ), level, All ); if(!host_monolithic_matrix_ || reassembleMatrix_) { this->setup_ginkgo(A, x, level, b_gko.get()); } auto rhs = dir_handler_->get_rhs( b_gko.get(), x_gko.get() ); auto x0 = dir_handler_->get_initial_guess( x_gko.get() ); timingTree->start( "Ginkgo CG Solver Gather vectors" ); auto global_rhs = dense::create( host_exec_ ); auto global_x0 = dense::create( host_exec_ ); rhs->convert_to( gko::lend( global_rhs ) ); x0->convert_to( gko::lend( global_x0 ) ); timingTree->stop( "Ginkgo CG Solver Gather vectors" ); auto log = gko::share( gko::log::Convergence< valueType >::create( solver_exec_ ) ); if ( monolithic_matrix_->get_size() ) { auto factory = gko::clone( solver_->get_stop_criterion_factory() ); factory->add_logger( log ); solver_->set_stop_criterion_factory( gko::share( factory ) ); timingTree->start( "Ginkgo CG Solver Apply" ); if ( comm_->size() > 1 ) { auto permuted_global_rhs = gko::as< dense >( global_rhs->row_permute( &perm_ ) ); auto permuted_global_x0 = gko::as< dense >( global_x0->row_permute( &perm_ ) ); solver_->apply( gko::lend( permuted_global_rhs ), gko::lend( permuted_global_x0 ) ); permuted_global_x0->inverse_row_permute( &perm_, global_x0.get() ); } else { solver_->apply( gko::lend( global_rhs ), gko::lend( global_x0 ) ); } timingTree->stop( "Ginkgo CG Solver Apply" ); } WALBERLA_MPI_BARRIER(); // only necessary to get correct timings timingTree->start( "Ginkgo CG Solver Scatter vectors" ); gather_idxs_ = compute_gather_idxs( part_ ); scatter_global_vector( global_x0.get(), x0, gather_idxs_, comm_ ); timingTree->stop( "Ginkgo CG Solver Scatter vectors" ); dir_handler_->update_solution( x0 ); hyteg::petsc::createFunctionFromVector( x, num_, std::make_shared< GinkgoVectorProxy >( x0, gko::dim< 2 >{ num_global_dofs, 1 }, part_ ), level, All ); if ( printInfo_ && comm_->rank() == 0 ) { WALBERLA_LOG_INFO_ON_ROOT( "[Ginkgo CG]" << ( !log->has_converged() ? " NOT " : " " ) << "converged after " << log->get_num_iterations() << " iterations, residual norm: " << solver_exec_->copy_val_to_host( gko::as< gko::matrix::Dense< valueType > >( log->get_residual_norm() )->get_const_values() ) ); } timingTree->stop( "Ginkgo CG Solver" ); } void setPrintInfo( bool printInfo ) { printInfo_ = printInfo; } void setReassembleMatrix( bool reassembleMatrix ) { reassembleMatrix_ = reassembleMatrix; } private: void setup_ginkgo( const OperatorType& A, const typename OperatorType::srcType& x, const walberla::uint_t level, const vec* b_gko){ const auto& timingTree = x.getStorage()->getTimingTree(); const auto num_global_dofs = b_gko->get_size()[0]; std::vector< int32_t > bcIndices; hyteg::petsc::applyDirichletBC( num_, bcIndices, level ); std::sort( std::begin( bcIndices ), std::end( bcIndices ) ); timingTree->start( "Ginkgo CG Solver Set-up distributed matrix" ); host_monolithic_matrix_ = gko::share( mtx::create( host_exec_, comm_ ) ); dir_handler_ = std::make_unique< ZeroRowsDirichletHandler >( bcIndices, b_gko, host_monolithic_matrix_, true ); { auto proxy = std::make_shared< GinkgoSparseMatrixProxy< mtx > >( host_monolithic_matrix_.get(), gko::dim< 2 >{ num_global_dofs, num_global_dofs }, part_ ); hyteg::petsc::createMatrix( A, num_, num_, proxy, level, All ); proxy->finalize(); dir_handler_->update_matrix(); } timingTree->stop( "Ginkgo CG Solver Set-up distributed matrix" ); timingTree->start( "Ginkgo CG Solver Set-up distributed preconditioner" ); auto monolithic_preconditioner_ = gko::share( mtx::create( host_exec_, comm_ ) ); { auto proxy = std::make_shared< GinkgoSparseMatrixProxy< mtx > >( monolithic_preconditioner_.get(), gko::dim< 2 >{ num_global_dofs, num_global_dofs }, part_ ); hyteg::petsc::createMatrix( blockPreconditioner_, num_, num_, proxy, level, All ); proxy->finalize(); auto dir_handler_p = std::make_unique< ZeroRowsDirichletHandler >( bcIndices, b_gko, monolithic_preconditioner_, true ); dir_handler_p->update_matrix(); } timingTree->stop( "Ginkgo CG Solver Set-up distributed preconditioner" ); std::vector< int32_t > vIndices; std::vector< int32_t > pIndices; timingTree->start( "Ginkgo CG Solver Gather index sets" ); gatherIndices( vIndices, pIndices, *storage_, level, num_ ); std::sort( vIndices.begin(), vIndices.end() ); std::sort( pIndices.begin(), pIndices.end() ); timingTree->stop( "Ginkgo CG Solver Gather index sets" ); timingTree->start( "Ginkgo CG Solver Set-up permutation" ); std::vector< int32_t > perm_vec; std::vector< int32_t > recv_sizes( comm_->size() ); std::vector< int32_t > recv_offsets( comm_->size() + 1 ); auto local_size = static_cast< int32_t >( vIndices.size() ); gko::mpi::gather( &local_size, 1, recv_sizes.data(), 1, 0, comm_ ); std::partial_sum( recv_sizes.begin(), recv_sizes.end(), recv_offsets.begin() + 1 ); auto global_v_size = static_cast< gko::size_type >( recv_offsets.back() ); perm_vec.resize( global_v_size ); gko::mpi::gather( vIndices.data(), local_size, perm_vec.data(), recv_sizes.data(), recv_offsets.data(), 0, comm_ ); local_size = static_cast< int32_t >( pIndices.size() ); gko::mpi::gather( &local_size, 1, recv_sizes.data(), 1, 0, comm_ ); std::partial_sum( recv_sizes.begin(), recv_sizes.end(), recv_offsets.begin() + 1 ); auto global_p_size = static_cast< gko::size_type >( recv_offsets.back() ); perm_vec.resize( global_v_size + global_p_size ); gko::mpi::gather( pIndices.data(), local_size, perm_vec.data() + global_v_size, recv_sizes.data(), recv_offsets.data(), 0, comm_ ); perm_ = gko::Array< gko::int32 >{ solver_exec_, perm_vec.begin(), perm_vec.end() }; timingTree->stop( "Ginkgo CG Solver Set-up permutation" ); auto [p_v_min, p_v_max] = std::minmax_element( std::begin( vIndices ), std::end( vIndices ) ); auto [p_p_min, p_p_max] = std::minmax_element( std::begin( pIndices ), std::end( pIndices ) ); gko::span v_span( *p_v_min, *p_v_max + 1 ); gko::span p_span( *p_p_min, *p_p_max + 1 ); if ( p_span.begin != v_span.end ) { WALBERLA_ABORT( "Indices are NOT blocked: v" << v_span << ", p" << p_span ) } timingTree->start( "Ginkgo CG Solver Gather matrix" ); monolithic_matrix_ = gko::share( csr::create( solver_exec_ ) ); gko::as< mtx >( host_monolithic_matrix_ )->convert_to( monolithic_matrix_.get() ); timingTree->stop( "Ginkgo CG Solver Gather matrix" ); if ( comm_->size() > 1 && monolithic_matrix_->get_size() ) { timingTree->start( "Ginkgo CG Solver Permute matrix" ); monolithic_matrix_ = gko::as< csr >( monolithic_matrix_->permute( &perm_ ) ); timingTree->stop( "Ginkgo CG Solver Permute matrix" ); } { timingTree->start( "Ginkgo CG Solver Gather preconditioner" ); auto device_monolithic_pre = gko::share( csr::create( solver_exec_ ) ); monolithic_preconditioner_->convert_to( device_monolithic_pre.get() ); timingTree->stop( "Ginkgo CG Solver Gather preconditioner" ); if ( device_monolithic_pre->get_size() && device_monolithic_pre->get_size() ) { gko::span block_v_span{ 0, global_v_size }; gko::span block_p_span{ global_v_size, global_v_size + global_p_size }; timingTree->start( "Ginkgo CG Solver Permute preconditioner" ); device_monolithic_pre = gko::as< csr >( device_monolithic_pre->permute( &perm_ ) ); timingTree->stop( "Ginkgo CG Solver Permute preconditioner" ); std::shared_ptr< gko::LinOp > b_pre_v; std::shared_ptr< gko::LinOp > b_pre_p; timingTree->start( "Ginkgo CG Solver Build block preconditioner" ); auto jac_gen = gko::share( gko::preconditioner::Jacobi< valueType >::build().with_max_block_size( 1u ).on( solver_exec_ ) ); if ( velocityPreconditionerType_ == 0 ) { b_pre_v = gko::share( gko::matrix::Identity< valueType >::create( solver_exec_, gko::dim< 2 >( block_v_span.length(), block_v_span.length() ) ) ); } else if ( velocityPreconditionerType_ == 1 ) { b_pre_v = gko::share( jac_gen->generate( gko::share( device_monolithic_pre->create_submatrix( block_v_span, block_v_span ) ) ) ); } else if ( velocityPreconditionerType_ == 2 ) { using lower_isai = gko::preconditioner::Isai< gko::preconditioner::isai_type::general, valueType, indexType >; using upper_isai = gko::preconditioner::Isai< gko::preconditioner::isai_type::general, valueType, indexType >; auto ilu_gen = gko::share( gko::preconditioner::Ilu< lower_isai, upper_isai >::build() //.with_factorization_factory( gko::factorization::Ilu< valueType >::build().on( solver_exec_ ) ) // doesn't make a lot of difference .on( solver_exec_ ) ); auto smoother_gen = gko::share( gko::solver::Ir< valueType >::build() .with_solver( ilu_gen ) .with_relaxation_factor( 0.9 ) .with_criteria( gko::stop::Iteration::build().with_max_iters( 2u ).on( solver_exec_ ) ) .on( solver_exec_ ) ); auto mg_level_gen = gko::multigrid::AmgxPgm< valueType >::build().with_deterministic( true ).on( solver_exec_ ); auto coarsest_gen = gko::share( gko::solver::Ir< valueType >::build() .with_solver( jac_gen ) .with_relaxation_factor( 0.9 ) .with_criteria( gko::stop::Iteration::build().with_max_iters( 4u ).on( solver_exec_ ) ) .on( solver_exec_ ) ); b_pre_v = gko::share( gko::solver::Multigrid::build() .with_max_levels( 9u ) .with_min_coarse_rows( 500u ) .with_pre_smoother( smoother_gen ) .with_post_uses_pre( true ) .with_mg_level( gko::share( mg_level_gen ) ) .with_coarsest_solver( coarsest_gen ) .with_zero_guess( true ) .with_criteria( gko::stop::Iteration::build().with_max_iters( 1u ).on( solver_exec_ ) ) .on( solver_exec_ ) ->generate( gko::share( device_monolithic_pre->create_submatrix( block_v_span, block_v_span ) ) ) ); if ( printInfo_ ) { auto amg = gko::as< gko::solver::Multigrid >( b_pre_v ); WALBERLA_LOG_INFO_ON_ROOT( "[Ginkgo AMGX] Number of levels: " << amg->get_mg_level_list().size() + 1 ) for ( int i = 0; i < amg->get_mg_level_list().size(); ++i ) { WALBERLA_LOG_INFO_ON_ROOT( "[Ginkgo AMGX] Level " << amg->get_mg_level_list().size() - i << " of size " << amg->get_mg_level_list()[i]->get_fine_op()->get_size() ); } WALBERLA_LOG_INFO_ON_ROOT( "[Ginkgo AMGX] Level 0 of size " << amg->get_coarsest_solver()->get_size() ); } } else { WALBERLA_ABORT( "Invalid velocity preconditioner for Ginkgo block prec BiCGStab solver." ); } if ( pressurePreconditionerType_ == 0 ) { b_pre_p = gko::share( gko::matrix::Identity< valueType >::create( solver_exec_, gko::dim< 2 >( block_p_span.length(), block_p_span.length() ) ) ); } else if ( pressurePreconditionerType_ == 1 ) { b_pre_p = gko::share( jac_gen->generate( gko::share( device_monolithic_pre->create_submatrix( block_p_span, block_p_span ) ) ) ); } else { WALBERLA_ABORT( "Invalid pressure preconditioner for Ginkgo block prec BiCGStab solver." ); } auto b_pre_vp = gko::share( gko::matrix::Zero< valueType >::create( solver_exec_, gko::dim< 2 >{ global_v_size, global_p_size } ) ); auto b_pre_pv = gko::share( gko::matrix::Zero< valueType >::create( solver_exec_, gko::dim< 2 >{ global_p_size, global_v_size } ) ); block_preconditioner_ = gko::share( gko::matrix::BlockMatrix::create( solver_exec_, gko::dim< 2 >{ global_v_size + global_p_size, global_v_size + global_p_size }, std::vector< std::vector< std::shared_ptr< gko::LinOp > > >{ { b_pre_v, b_pre_vp }, { b_pre_pv, b_pre_p } }, std::vector< gko::span >{ block_v_span, block_p_span } ) ); timingTree->stop( "Ginkgo CG Solver Build block preconditioner" ); } WALBERLA_MPI_BARRIER(); } if ( monolithic_matrix_->get_size() ) { solver_ = gko::solver::Gmres< valueType >::build() .with_criteria( gko::share( gko::stop::ResidualNorm<>::build() .with_baseline( gko::stop::mode::initial_resnorm ) .with_reduction_factor( 1e-30 ) .on( solver_exec_ ) ), gko::share( gko::stop::ResidualNorm<>::build() .with_baseline( gko::stop::mode::absolute ) .with_reduction_factor( 1e-12 ) .on( solver_exec_ ) ), gko::share( gko::stop::Iteration::build().with_max_iters( 15000 ).on( solver_exec_ ) ) ) .with_generated_preconditioner( block_preconditioner_ ) .on( solver_exec_ ) ->generate( monolithic_matrix_ ); } } std::shared_ptr< PrimitiveStorage > storage_; std::shared_ptr< const gko::Executor > host_exec_; std::shared_ptr< const gko::Executor > solver_exec_; std::shared_ptr< gko::mpi::communicator > comm_; std::shared_ptr< gko::distributed::Partition< int32_t > > part_; std::unique_ptr< gko::solver::Gmres< valueType > > solver_; BlockPreconditioner_T blockPreconditioner_; typename OperatorType::srcType::template FunctionType< int > num_; std::shared_ptr< gko::LinOp > block_preconditioner_; std::shared_ptr< csr > monolithic_matrix_; std::shared_ptr< mtx > host_monolithic_matrix_; gko::Array< indexType > perm_; std::unique_ptr< ZeroRowsDirichletHandler > dir_handler_; std::vector< gko::Array< gko::distributed::global_index_type > > gather_idxs_; uint_t velocityPreconditionerType_; uint_t pressurePreconditionerType_; bool reassembleMatrix_ = false; bool printInfo_ = false; }; } // namespace hyteg