//======================================================================================================================
//
//  This file is part of waLBerla. waLBerla is free software: you can
//  redistribute it and/or modify it under the terms of the GNU General Public
//  License as published by the Free Software Foundation, either version 3 of
//  the License, or (at your option) any later version.
//
//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
//  for more details.
//
//  You should have received a copy of the GNU General Public License along
//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file UniformGPUScheme.impl.h
//! \ingroup cuda
//! \author Martin Bauer <martin.bauer@fau.de>
//
//======================================================================================================================

#include "cuda/ParallelStreams.h"

namespace walberla {
namespace cuda {
namespace communication {


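   //  Typical usage (sketch only; "blocks", "stream", the tag value and the generated pack info
   //  type are placeholders, not names from this file):
   //
   //     UniformGPUScheme< stencil::D3Q19 > comm( blocks, /*sendDirectlyFromGPU=*/ false, 942 );
   //     comm.addPackInfo( make_shared< SomeGeneratedPackInfo >( gpuFieldId ) );
   //
   //     comm.startCommunication( stream );   // pack ghost layers + start non-blocking sends
   //     // ... run kernels that do not touch the ghost layers ...
   //     comm.wait( stream );                 // receive + unpack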
   template<typename Stencil>
   UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf,
                                                bool sendDirectlyFromGPU,
                                                const int tag )
        : blockForest_( bf ),
          setupBeforeNextCommunication_( true ),
          communicationInProgress_( false ),
          sendFromGPU_( sendDirectlyFromGPU ),
          bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ),
          bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ),
          parallelSectionManager_( -1 )
   {}


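   //  Packs the ghost layer data of all registered pack infos into per-rank GPU send buffers,
   //  optionally stages it in host (CPU) buffers, and starts the non-blocking MPI sends.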
   template<typename Stencil>
   void UniformGPUScheme<Stencil>::startCommunication( cudaStream_t stream )
   {
      WALBERLA_ASSERT( !communicationInProgress_ );
      auto forest = blockForest_.lock();

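      // Rebuild the communication pattern if pack infos were added or the block forest
      // changed since the last setup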
      auto currentBlockForestStamp = forest->getBlockForest().getModificationStamp();
      if( setupBeforeNextCommunication_ || currentBlockForestStamp != forestModificationStamp_ )
         setupCommunication();

      // Schedule Receives
      if( sendFromGPU_ )
         bufferSystemGPU_.scheduleReceives();
      else
         bufferSystemCPU_.scheduleReceives();


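      // When staging through the CPU, the GPU send buffers only serve as packing scratch
      // space and are cleared before every communication step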
      if( !sendFromGPU_ )
         for( auto it : headers_ )
            bufferSystemGPU_.sendBuffer( it.first ).clear();

      // Start filling send buffers
      {
         auto parallelSection = parallelSectionManager_.parallelSection( stream );
         for( auto &iBlock : *forest )
         {
            auto block = dynamic_cast< Block * >( &iBlock );
            for( auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir )
            {
               const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex( *dir );
               if( block->getNeighborhoodSectionSize( neighborIdx ) == uint_t( 0 ))
                  continue;
               auto nProcess = mpi::MPIRank( block->getNeighborProcess( neighborIdx, uint_t( 0 )));

               for( auto &pi : packInfos_ )
               {
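                  // Pack this (direction, block) chunk on the stream provided by the parallel
                  // section; in staging mode the packed data is also copied to the CPU buffer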
                  parallelSection.run([&](auto s) {
                     auto size = pi->size( *dir, block );
                     auto gpuDataPtr = bufferSystemGPU_.sendBuffer( nProcess ).advanceNoResize( size );
                     WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr );
                     pi->pack( *dir, gpuDataPtr, block, s );

                     if( !sendFromGPU_ )
                     {
                        auto cpuDataPtr = bufferSystemCPU_.sendBuffer( nProcess ).advanceNoResize( size );
                        WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr );
                        WALBERLA_CUDA_CHECK( cudaMemcpyAsync( cpuDataPtr, gpuDataPtr, size, cudaMemcpyDeviceToHost, s ));
                     }
                  });
               }
            }
         }
      }

      // wait for packing to finish
      WALBERLA_CUDA_CHECK( cudaStreamSynchronize( stream ));

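      // Start the non-blocking MPI sends, either directly from GPU memory (requires a
      // CUDA-aware MPI implementation) or from the staged host buffers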
      if( sendFromGPU_ )
         bufferSystemGPU_.sendAll();
      else
         bufferSystemCPU_.sendAll();

      communicationInProgress_ = true;
   }


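   //  Waits for all incoming messages and unpacks them; when staging through the CPU, the
   //  received data is first copied back to the GPU asynchronously before unpacking.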
   template<typename Stencil>
   void UniformGPUScheme<Stencil>::wait( cudaStream_t stream )
   {
      WALBERLA_ASSERT( communicationInProgress_ );

      auto forest = blockForest_.lock();

      if( sendFromGPU_ )
      {
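         // GPU-direct path: unpack each received message straight from the GPU receive buffer,
         // in the (block, direction) order recorded during setupCommunication()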
         auto parallelSection = parallelSectionManager_.parallelSection( stream );
         for( auto recvInfo = bufferSystemGPU_.begin(); recvInfo != bufferSystemGPU_.end(); ++recvInfo )
         {
            recvInfo.buffer().clear();
            for( auto &header : headers_[recvInfo.rank()] )
            {
               auto block = dynamic_cast< Block * >( forest->getBlock( header.blockId ));

               for( auto &pi : packInfos_ )
               {
                  auto size = pi->size( header.dir, block );
                  auto gpuDataPtr = recvInfo.buffer().advanceNoResize( size );
                  WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr );
                  parallelSection.run([&](auto s) {
                     pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s );
                  });
               }
            }
         }
      }
      else
      {
         auto parallelSection = parallelSectionManager_.parallelSection( stream );
         for( auto recvInfo = bufferSystemCPU_.begin(); recvInfo != bufferSystemCPU_.end(); ++recvInfo )
         {
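            // Staging path: reuse this rank's GPU send buffer as device-side scratch space,
            // copy the received data host-to-device and unpack it from there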
            auto &gpuBuffer = bufferSystemGPU_.sendBuffer( recvInfo.rank());

            recvInfo.buffer().clear();
            gpuBuffer.clear();
            for( auto &header : headers_[recvInfo.rank()] ) {
               auto block = dynamic_cast< Block * >( forest->getBlock( header.blockId ));

               for( auto &pi : packInfos_ )
               {
                  auto size = pi->size( header.dir, block );
                  auto cpuDataPtr = recvInfo.buffer().advanceNoResize( size );
                  auto gpuDataPtr = gpuBuffer.advanceNoResize( size );
                  WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr );
                  WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr );

                  parallelSection.run([&](auto s) {
                     WALBERLA_CUDA_CHECK( cudaMemcpyAsync( gpuDataPtr, cpuDataPtr, size,
                                                           cudaMemcpyHostToDevice, s ));
                     pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s );
                  });
               }
            }
         }
      }

      communicationInProgress_ = false;
   }


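   //  Determines the communication pattern: how many bytes each neighbor rank exchanges and in
   //  which (block, direction) order, then sizes the send and receive buffers accordingly.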
   template<typename Stencil>
   void UniformGPUScheme<Stencil>::setupCommunication()
   {
      auto forest = blockForest_.lock();

      headers_.clear();

      std::map<mpi::MPIRank, mpi::MPISize> receiverInfo; // how many bytes to send to each neighbor

      mpi::BufferSystem headerExchangeBs( mpi::MPIManager::instance()->comm(), 123 );

      for( auto &iBlock : *forest ) {
         auto block = dynamic_cast< Block * >( &iBlock );

         for( auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir ) {
            // skip if block has no neighbors in this direction
            const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex( *dir );
            if( block->getNeighborhoodSectionSize( neighborIdx ) == uint_t( 0 ))
               continue;

            WALBERLA_ASSERT( block->neighborhoodSectionHasEquallySizedBlock( neighborIdx ),
                             "Works for uniform setups only" );
            WALBERLA_ASSERT_EQUAL( block->getNeighborhoodSectionSize( neighborIdx ), uint_t( 1 ),
                                   "Works for uniform setups only" );

            const BlockID &nBlockId = block->getNeighborId( neighborIdx, uint_t( 0 ));
            auto nProcess = mpi::MPIRank( block->getNeighborProcess( neighborIdx, uint_t( 0 )));

            for( auto &pi : packInfos_ )
               receiverInfo[nProcess] += mpi::MPISize( pi->size( *dir, block ));

            auto &headerBuffer = headerExchangeBs.sendBuffer( nProcess );
            nBlockId.toBuffer( headerBuffer );
            headerBuffer << *dir;
         }
      }

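      // Exchange the (blockId, direction) headers so every receiver knows in which order the
      // packed data arrives in its buffers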
      headerExchangeBs.setReceiverInfoFromSendBufferState( false, true );
      headerExchangeBs.sendAll();
      for( auto recvIter = headerExchangeBs.begin(); recvIter != headerExchangeBs.end(); ++recvIter ) {
         auto &headerVector = headers_[recvIter.rank()];
         auto &buffer = recvIter.buffer();
         while ( buffer.size()) {
            Header header;
            header.blockId.fromBuffer( buffer );
            buffer >> header.dir;
            headerVector.push_back( header );
         }
      }

      bufferSystemCPU_.setReceiverInfo( receiverInfo );
      bufferSystemGPU_.setReceiverInfo( receiverInfo );

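      // Allocate each send buffer once at its final size so that packing can later use
      // advanceNoResize() without reallocating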
      for( auto it : receiverInfo ) {
         bufferSystemCPU_.sendBuffer( it.first ).resize( size_t(it.second) );
         bufferSystemGPU_.sendBuffer( it.first ).resize( size_t(it.second) );
      }

      forestModificationStamp_ = forest->getBlockForest().getModificationStamp();
      setupBeforeNextCommunication_ = false;
   }


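   //  Registers an additional pack info; the communication pattern is rebuilt on the next
   //  startCommunication() call.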
   template<typename Stencil>
   void UniformGPUScheme<Stencil>::addPackInfo( const shared_ptr<GeneratedGPUPackInfo> &pi )
   {
      WALBERLA_ASSERT( !communicationInProgress_, "Cannot add pack info while communication is in progress" );
      packInfos_.push_back( pi );
      setupBeforeNextCommunication_ = true;
   }


} // namespace communication
} // namespace cuda
} // namespace walberla