diff --git a/CMakeLists.txt b/CMakeLists.txt index b5a05f3adba4b31936f07b7a450c0a23a2237b99..5a1148ba73865f34c409c7246a576e7639d98cb0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,6 +81,8 @@ option ( WALBERLA_BUILD_WITH_PYTHON_LBM "Include LBM module into python modu option ( WALBERLA_BUILD_WITH_LIKWID_MARKERS "Compile in markers for likwid-perfctr" ) +option ( WALBERLA_BUILD_WITH_CUDA "Enable CUDA support" ) + option ( WALBERLA_BUILD_WITH_FASTMATH "Fast math" ) @@ -1013,6 +1015,45 @@ endif() +############################################################################################################################ +## +## CUDA +## +############################################################################################################################ +if ( WALBERLA_BUILD_WITH_CUDA ) + # set ( BUILD_SHARED_LIBS ON ) + set ( CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON ) + set ( CUDA_PROPAGATE_HOST_FLAGS OFF CACHE BOOL "" ) + + if ( (NOT DEFINED CUDA_HOST_COMPILER) AND (${CMAKE_C_COMPILER} MATCHES "ccache") ) + string ( STRIP "${CMAKE_C_COMPILER_ARG1}" stripped_compiler_string ) + find_program ( CUDA_HOST_COMPILER ${stripped_compiler_string} ) + endif () + + find_package ( CUDA REQUIRED ) + + if ( CUDA_FOUND ) + include_directories ( ${CUDA_INCLUDE_DIRS} ) + list ( APPEND SERVICE_LIBS ${CUDA_LIBRARIES} ) + + if ( NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=" ) + list ( APPEND CUDA_NVCC_FLAGS "-std=c++11" ) + endif () + + # Bug with gcc5 and cuda7.5: + #list( APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES -D__STRICT_ANSI__") + + # NOTICE: exisiting cuda flags are overwritten + #set ( CUDA_NVCC_FLAGS "--compiler-bindir=/usr/bin/g++-4.3" ) + #set ( CUDA_NVCC_FLAGS "-arch sm_20" ) + else() + set ( WALBERLA_BUILD_WITH_CUDA FALSE ) + endif ( ) +endif ( ) +############################################################################################################################ + + + ############################################################################################################################ ## ## Testing Coverage diff --git a/apps/tutorials/CMakeLists.txt b/apps/tutorials/CMakeLists.txt index 91dae24cd697064fe40b1490b45f7a863f4fffab..1a6e321a629c82de13c878c1a58ca2ccff45fbda 100644 --- a/apps/tutorials/CMakeLists.txt +++ b/apps/tutorials/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(basics) +add_subdirectory(cuda) add_subdirectory(lbm) add_subdirectory(pde) add_subdirectory(pe) diff --git a/apps/tutorials/cuda/01_GameOfLife_cuda.cpp b/apps/tutorials/cuda/01_GameOfLife_cuda.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6f8da219715bd1e5b74601b2fc683b59c1600346 --- /dev/null +++ b/apps/tutorials/cuda/01_GameOfLife_cuda.cpp @@ -0,0 +1,134 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). 
If not, see <http://www.gnu.org/licenses/>. +// +//! \file 03_GameOfLife.cpp +//! \author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#include "01_GameOfLife_kernels.h" +#include "cuda/HostFieldAllocator.h" +#include "blockforest/Initialization.h" +#include "blockforest/communication/UniformDirectScheme.h" + +#include "core/Environment.h" + +#include "cuda/HostFieldAllocator.h" +#include "cuda/FieldCopy.h" +#include "cuda/GPUField.h" +#include "cuda/Kernel.h" +#include "cuda/AddGPUFieldToStorage.h" + +#include "field/AddToStorage.h" +#include "field/communication/UniformMPIDatatypeInfo.h" + +#include "geometry/initializer/ScalarFieldFromGrayScaleImage.h" +#include "geometry/structured/GrayScaleImage.h" + +#include "gui/Gui.h" + +#include "stencil/D2Q9.h" + +#include "timeloop/SweepTimeloop.h" + + +using namespace walberla; + +typedef GhostLayerField<double,1> ScalarField; +typedef cuda::GPUField<double> GPUField; + + +ScalarField * createField( IBlock* const block, StructuredBlockStorage* const storage ) +{ + return new ScalarField ( + storage->getNumberOfXCells( *block ), // number of cells in x direction per block + storage->getNumberOfYCells( *block ), // number of cells in y direction per block + storage->getNumberOfZCells( *block ), // number of cells in z direction per block + 1, // one ghost layer + real_t(0), // initial value + field::fzyx, // layout + make_shared<cuda::HostFieldAllocator<double> >() // allocator for host pinned memory + ); +} + +class GameOfLifeSweepCUDA +{ + public: + GameOfLifeSweepCUDA( BlockDataID gpuFieldSrcID, BlockDataID gpuFieldDstID ) + : gpuFieldSrcID_( gpuFieldSrcID ), gpuFieldDstID_( gpuFieldDstID ) + { + } + void operator() ( IBlock * block ) + { + auto srcCudaField = block->getData< cuda::GPUField<real_t> > ( gpuFieldSrcID_ ); + auto dstCudaField = block->getData< cuda::GPUField<real_t> > ( gpuFieldDstID_ ); + + auto myKernel = cuda::make_kernel( &gameOfLifeKernel ); + myKernel.addFieldIndexingParam( cuda::FieldIndexing<double>::xyz( *srcCudaField ) ); + myKernel.addFieldIndexingParam( cuda::FieldIndexing<double>::xyz( *dstCudaField ) ); + myKernel(); + + srcCudaField->swapDataPointers( dstCudaField ); + } + private: + BlockDataID gpuFieldSrcID_; + BlockDataID gpuFieldDstID_; +}; + + +int main( int argc, char ** argv ) +{ + walberla::Environment env( argc, argv ); + + geometry::GrayScaleImage image ("GosperGliderGun.png"); + + // Create blocks + shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid ( + uint_t(1) , uint_t(2), uint_t(1), // number of blocks in x,y,z direction + image.size( uint_t(0) ), image.size( uint_t(1) ) / uint_t(2), uint_t(1), // how many cells per block (x,y,z) + real_t(1), // dx: length of one cell in physical coordinates + false, // one block per process - "false" means all blocks to one process + false, false, false ); // no periodicity + + + BlockDataID cpuFieldID = blocks->addStructuredBlockData<ScalarField>( &createField, "CPU Field" ); + + // Initializing the field from an image + using geometry::initializer::ScalarFieldFromGrayScaleImage; + ScalarFieldFromGrayScaleImage fieldInitializer ( *blocks, cpuFieldID ) ; + fieldInitializer.init( image, uint_t(2), false ); + + BlockDataID gpuFieldSrcID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" ); + BlockDataID gpuFieldDstID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field 
Dst" ); + + typedef blockforest::communication::UniformDirectScheme<stencil::D2Q9 > CommScheme; + CommScheme communication( blocks ); + communication.addDataToCommunicate( make_shared<field::communication::UniformMPIDatatypeInfo<GPUField> > (gpuFieldSrcID) ); + + // Create Timeloop + const uint_t numberOfTimesteps = uint_t(10); // number of timesteps for non-gui runs + SweepTimeloop timeloop ( blocks, numberOfTimesteps ); + + // Registering the sweep + timeloop.add() << BeforeFunction( communication, "Communication" ) + << Sweep( GameOfLifeSweepCUDA(gpuFieldSrcID, gpuFieldDstID ), "GameOfLifeSweep" ); + + timeloop.add() << Sweep( cuda::fieldCpyFunctor<ScalarField, GPUField >(cpuFieldID, gpuFieldDstID) ); + + GUI gui ( timeloop, blocks, argc, argv ); + gui.run(); + + return 0; +} diff --git a/apps/tutorials/cuda/01_GameOfLife_cuda.dox b/apps/tutorials/cuda/01_GameOfLife_cuda.dox new file mode 100644 index 0000000000000000000000000000000000000000..4e654d4c09e7fbb1f8b38f63662e02ad360e2ae2 --- /dev/null +++ b/apps/tutorials/cuda/01_GameOfLife_cuda.dox @@ -0,0 +1,139 @@ +namespace walberla{ + +/** +\page tutorial_cuda01 Tutorial - CUDA 1: Game of Life on GPU + + +\image html tutorial_cuda01_nvidia_titan.png + +> _Note:_ This tutorial required a CUDA aware MPI library. +> If you get a SEGFAULT when executing this tutorial, make sure that your MPI library was built with +> CUDA support! For instructions how to build OpenMPI with CUDA see this [page](https://www.open-mpi.org/faq/?category=building#build-cuda). + +\section cuda01_fields Creating Fields + +To run a simulation on a NVIDIA graphics card, we have to allocate data on the GPU and +write a CUDA kernel that operates on this data. In this tutorial we first allocate a field on the GPU +and learn about functionality to transfer data between CPU and GPU fields. + +Since initialization and output routines are usually not time critical, they are implemented +for CPU fields only. In waLBerla we set up the complete simulation using +CPU fields, copy the initialized fields over to the GPU, do the complete computation there, and, in the +end, copy everything back to do the output from the CPU field. +So only the time critical kernels have to be written in CUDA. + +Thus the setup code of the GPU GameOfLife program is very similar to its CPU version, which was implemented +in a previous tutorial ( \ref tutorial_basics_03 ). +One difference is, that fields which are often transfered from/to the GPU should be allocated with +a different field allocator: cuda::HostFieldAllocator . This allocator uses cudaHostAlloc() instead of "new" , +such that the memory is marked "pinned", which means that it is always held in RAM and cannot be swapped out to disk. +Data transfer from pinned memory is faster than from normal memory. The usage of this allocator is not +mandatory, the data transfer functions work (slightly slower) also with normally allocated fields. 
+
+\code
+ScalarField * createField( IBlock* const block, StructuredBlockStorage* const storage )
+{
+   return new ScalarField (
+            storage->getNumberOfXCells( *block ),            // number of cells in x direction per block
+            storage->getNumberOfYCells( *block ),            // number of cells in y direction per block
+            storage->getNumberOfZCells( *block ),            // number of cells in z direction per block
+            1,                                               // one ghost layer
+            real_t(0),                                       // initial value
+            field::fzyx,                                     // layout
+            make_shared<cuda::HostFieldAllocator<double> >() // allocator for host pinned memory
+   );
+}
+\endcode
+
+Now we initialize the CPU field just like in the previous tutorial \ref tutorial_basics_03 .
+Then two GPU fields are created: a "source" and a "destination" field. The helper function
+cuda::addGPUFieldToStorage() creates a cuda::GPUField of the same size and layout as the given
+CPU field:
+\code
+BlockDataID gpuFieldSrcID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" );
+BlockDataID gpuFieldDstID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Dst" );
+\endcode
+The contents of the new GPU fields are initialized with the contents of the given CPU field.
+
+
+
+\section cuda01_kernels Writing and calling CUDA kernels
+
+For a basic understanding of the CUDA support in waLBerla, please read \ref cudaPage first.
+
+After reading this page you should know what a FieldAccessor is and how to call CUDA kernels from
+.cpp files. We can now start writing a CUDA kernel for the Game of Life algorithm, which we place
+in a separate file with the ".cu" extension.
+The build system then automatically detects that this file has to be compiled with the CUDA C++ compiler.
+
+The kernel gets two field accessors as arguments, one for the source and one for the destination field.
+Both accessors have to be configured using the CUDA variables blockIdx and threadIdx, so that afterwards
+the get() and getNeighbor() functions of the accessor class work correctly.
+\code
+__global__ void gameOfLifeKernel( cuda::FieldAccessor<double> src, cuda::FieldAccessor<double> dst )
+{
+   src.set( blockIdx, threadIdx );
+   dst.set( blockIdx, threadIdx );
+   int liveNeighbors = 0;
+   if ( src.getNeighbor(  1, 0,0 ) > 0.5 ) ++liveNeighbors;
+   if ( src.getNeighbor( -1, 0,0 ) > 0.5 ) ++liveNeighbors;
+   // normal Game of Life algorithm ....
+   // ...
+}
+\endcode
+
+To call this kernel we write a thin wrapper sweep that only gets the GPU fields out of the block storage
+and passes them to the CUDA kernel. We use the cuda::Kernel class from waLBerla here, so that we can write this
+sweep in a normal .cpp file; the launch configuration that results from cuda::FieldIndexing<double>::xyz() is
+sketched below, after the sweep code.
+Here are the contents of this sweep:
+\code
+auto srcCudaField = block->getData< cuda::GPUField<real_t> > ( gpuFieldSrcID_ );
+auto dstCudaField = block->getData< cuda::GPUField<real_t> > ( gpuFieldDstID_ );
+
+auto myKernel = cuda::make_kernel( &gameOfLifeKernel );
+myKernel.addFieldIndexingParam( cuda::FieldIndexing<double>::xyz( *srcCudaField ) );
+myKernel.addFieldIndexingParam( cuda::FieldIndexing<double>::xyz( *dstCudaField ) );
+myKernel();
+
+srcCudaField->swapDataPointers( dstCudaField );
+\endcode
+
+All the computations are done on the GPU. The CPU field is not updated automatically! It was only used
+during the setup phase.
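+
+For the fzyx layout used in this tutorial, cuda::FieldIndexing<double>::xyz() assigns one thread to every cell of an
+x-line and one CUDA block to every (y,z) combination (see cuda/FieldIndexing.cpp in this patch). The following sketch
+shows roughly the launch that the cuda::Kernel wrapper performs with this information; it is an illustration only, not
+code from the tutorial sources, and the names xSize, ySize, zSize, srcAccess and dstAccess are placeholders for the
+inner field sizes and the two FieldAccessor objects:
+\code
+dim3 blockDim( xSize, 1, 1 );      // one thread per cell in x direction
+dim3 gridDim ( ySize, zSize, 1 );  // one block per (y,z) line of the field
+gameOfLifeKernel<<< gridDim, blockDim >>>( srcAccess, dstAccess );
+\endcode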
+
+To see if our kernel works, we copy the contents back to the CPU field after every timestep:
+\code
+timeloop.add() << Sweep( cuda::fieldCpyFunctor<ScalarField, GPUField >(cpuFieldID, gpuFieldDstID) );
+\endcode
+Of course this makes no sense for real simulations, since the transfer time is much higher than the
+time saved by doing the computation on the GPU. For production runs, one would usually transfer the
+field back only every n-th timestep and write e.g. a VTK frame; a sketch of this is given at the end of this page.
+
+
+\section cuda01_comm Communication
+
+In waLBerla there are two types of communication: _buffered_ and _direct_ communication.
+Buffered communication first collects all data in a buffer and sends only one message per communication step and neighbor.
+The direct communication strategy, which is based on MPI datatypes, uses no intermediate buffers and therefore has to send
+more messages than buffered communication. For details see \ref walberla_communication .
+
+In the tutorials up to now, only the buffered approach was used. In this tutorial, we switch to the direct communication strategy,
+because then we can use the CUDA support of the MPI library to communicate directly from/to GPU memory.
+
+The usage of the two communication schemes is very similar. Instead of creating a blockforest::communication::UniformBufferedScheme
+we create a blockforest::communication::UniformDirectScheme.
+Then we register a field::communication::UniformMPIDatatypeInfo instead of the field::communication::PackInfo.
+
+\code
+typedef blockforest::communication::UniformDirectScheme<stencil::D2Q9 > CommScheme;
+CommScheme communication( blocks );
+communication.addDataToCommunicate( make_shared<field::communication::UniformMPIDatatypeInfo<GPUField> > (gpuFieldSrcID) );
+\endcode
+
+This scheme also supports heterogeneous simulations, i.e. using a CPU field on
+some processes and a GPU field on other processes.
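+
+As a closing remark, here is a sketch of how the copy-back could be restricted to every n-th timestep for a
+production run, as mentioned above. This functor is not part of the tutorial sources: its name is made up for
+illustration and the registration via timeloop.addFuncAfterTimeStep() is an assumption; only cuda::fieldCpy()
+is taken from this patch.
+\code
+// hypothetical helper (illustration only): copies the GPU field back to the CPU field
+// every "interval" calls, e.g. right before VTK output is written
+class CopyBackEveryNthStep
+{
+public:
+   CopyBackEveryNthStep( const shared_ptr< StructuredBlockStorage > & blocks,
+                         BlockDataID cpuFieldID, BlockDataID gpuFieldID, uint_t interval )
+      : blocks_( blocks ), cpuFieldID_( cpuFieldID ), gpuFieldID_( gpuFieldID ),
+        interval_( interval ), step_( 0 ) {}
+
+   void operator()()
+   {
+      if ( step_++ % interval_ == 0 )
+         cuda::fieldCpy< ScalarField, GPUField >( blocks_, cpuFieldID_, gpuFieldID_ ); // device -> host copy
+   }
+
+private:
+   shared_ptr< StructuredBlockStorage > blocks_;
+   BlockDataID cpuFieldID_;
+   BlockDataID gpuFieldID_;
+   uint_t interval_;
+   uint_t step_;
+};
+
+// possible registration (assumption): timeloop.addFuncAfterTimeStep( CopyBackEveryNthStep( blocks, cpuFieldID, gpuFieldDstID, 100 ), "Copy to CPU" );
+\endcode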
+ +*/ + + +} diff --git a/apps/tutorials/cuda/01_GameOfLife_kernels.cu b/apps/tutorials/cuda/01_GameOfLife_kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..399f705c82e29d3d62b6f2c2a6db7ffaee6639ff --- /dev/null +++ b/apps/tutorials/cuda/01_GameOfLife_kernels.cu @@ -0,0 +1,40 @@ +#include "../cuda/01_GameOfLife_kernels.h" + +#include <iostream> + + + +namespace walberla { + + +__global__ void gameOfLifeKernel( cuda::FieldAccessor<double> src, cuda::FieldAccessor<double> dst ) +{ + src.set( blockIdx, threadIdx ); + dst.set( blockIdx, threadIdx ); + + // Count number of living neighbors + int liveNeighbors = 0; + if ( src.getNeighbor( 1, 0,0 ) > 0.5 ) ++liveNeighbors; + if ( src.getNeighbor( -1, 0,0 ) > 0.5 ) ++liveNeighbors; + if ( src.getNeighbor( 0,+1,0 ) > 0.5 ) ++liveNeighbors; + if ( src.getNeighbor( 0,-1,0 ) > 0.5 ) ++liveNeighbors; + + if ( src.getNeighbor( -1, -1, 0 ) > 0.5 ) ++liveNeighbors; + if ( src.getNeighbor( -1, +1, 0 ) > 0.5 ) ++liveNeighbors; + if ( src.getNeighbor( +1, -1,0 ) > 0.5 ) ++liveNeighbors; + if ( src.getNeighbor( +1, +1,0 ) > 0.5 ) ++liveNeighbors; + + + // cell dies because of under- or over-population + if ( liveNeighbors < 2 || liveNeighbors > 3 ) + dst.get() = 0.0; + else if ( liveNeighbors == 3 ) // cell comes alive + dst.get() = 1.0; + else + dst.get() = src.get(); +} + + + + +} // namespace walberla diff --git a/apps/tutorials/cuda/01_GameOfLife_kernels.h b/apps/tutorials/cuda/01_GameOfLife_kernels.h new file mode 100644 index 0000000000000000000000000000000000000000..e0e30c85a17f815085ee5b008159043307becd71 --- /dev/null +++ b/apps/tutorials/cuda/01_GameOfLife_kernels.h @@ -0,0 +1,12 @@ +#include <iostream> + +#include "cuda/FieldIndexing.h" + + +namespace walberla { + + +__global__ void gameOfLifeKernel( cuda::FieldAccessor<double> src, cuda::FieldAccessor<double> dst ); + + +} // namespace walberla diff --git a/apps/tutorials/cuda/CMakeLists.txt b/apps/tutorials/cuda/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..efa4d2a554d84d69d4606594c4c2a50d4f65cf8e --- /dev/null +++ b/apps/tutorials/cuda/CMakeLists.txt @@ -0,0 +1,7 @@ +waLBerla_link_files_to_builddir( *.prm ) +waLBerla_link_files_to_builddir( *.png ) + +waLBerla_add_executable ( NAME 01_GameOfLife_cuda + FILES 01_GameOfLife_cuda.cpp 01_GameOfLife_kernels.cu + DEPENDS blockforest core cuda field lbm geometry timeloop gui ) + \ No newline at end of file diff --git a/apps/tutorials/cuda/GosperGliderGun.png b/apps/tutorials/cuda/GosperGliderGun.png new file mode 100644 index 0000000000000000000000000000000000000000..f70d7036c6ae62d1f8cc2d1ea198fe90797b8747 Binary files /dev/null and b/apps/tutorials/cuda/GosperGliderGun.png differ diff --git a/cmake/waLBerlaFunctions.cmake b/cmake/waLBerlaFunctions.cmake index 2bd75e087b8d2cea4da319e389b49a0c90957faa..cf594b58fda76ea7303b07cadab6fffc00eac5a9 100644 --- a/cmake/waLBerlaFunctions.cmake +++ b/cmake/waLBerlaFunctions.cmake @@ -84,13 +84,17 @@ function ( waLBerla_add_module ) set( hasSourceFiles FALSE ) foreach ( sourceFile ${sourceFiles} ) - if ( ${sourceFile} MATCHES "\\.(c|cpp)" ) + if ( ${sourceFile} MATCHES "\\.(c|cpp|cu)" ) set( hasSourceFiles TRUE ) endif( ) endforeach( ) - if ( hasSourceFiles ) - add_library( ${moduleLibraryName} STATIC ${sourceFiles} ${otherFiles} ) + if ( hasSourceFiles ) + if ( CUDA_FOUND ) + cuda_add_library( ${moduleLibraryName} STATIC ${sourceFiles} ${otherFiles} ) + else() + add_library( ${moduleLibraryName} STATIC ${sourceFiles} ${otherFiles} ) + endif( 
CUDA_FOUND ) else( ) add_custom_target( ${moduleLibraryName} SOURCES ${sourceFiles} ${otherFiles} ) # dummy IDE target endif( ) @@ -194,7 +198,13 @@ function ( waLBerla_add_executable ) endif ( ) endif() - add_executable( ${ARG_NAME} ${sourceFiles} ) + if ( CUDA_FOUND ) + cuda_add_executable( ${ARG_NAME} ${sourceFiles} ) + else() + add_executable( ${ARG_NAME} ${sourceFiles} ) + endif() + + #add_executable( ${ARG_NAME} ${sourceFiles} ) target_link_modules ( ${ARG_NAME} ${ARG_DEPENDS} ) target_link_libraries( ${ARG_NAME} ${SERVICE_LIBS} ) diff --git a/src/cuda/AddGPUFieldToStorage.h b/src/cuda/AddGPUFieldToStorage.h new file mode 100644 index 0000000000000000000000000000000000000000..67968b93e2534c8750e6c8600985c90cafeee65f --- /dev/null +++ b/src/cuda/AddGPUFieldToStorage.h @@ -0,0 +1,72 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file AddGPUFieldToStorage.h +//! \ingroup cuda +//! \author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "GPUField.h" + +#include "domain_decomposition/StructuredBlockStorage.h" + +#include <boost/bind.hpp> + + +namespace walberla { +namespace cuda { + + + + //******************************************************************************************************************* + /*! Adds a cuda::GPUField to a StructuredBlockStorage + * + * - Similar to walberla::field::addToStorage() functions + * - created field is uninitialized + */ + //******************************************************************************************************************* + template< typename GPUField_T> + BlockDataID addGPUFieldToStorage(const shared_ptr< StructuredBlockStorage >& bs, + const std::string & identifier, + uint_t fSize, + const Layout layout = fzyx, + uint_t nrOfGhostLayers = 1 ); + + + + //******************************************************************************************************************* + /*! 
Adds a cuda::GPUField to a StructuredBlockStorage using data from a CPU field + * + * - adds a GPU field to a StructuredBlockStorage using a CPU field + * - sizes, number of ghostlayers and layout are the same as the CPU field + * - GPU field is initialized with the data currently stored in the CPU field + * @tparam Field_T type of the CPU field, the created GPUField will be of type cuda::GPUField<Field_T::value_type> + */ + //******************************************************************************************************************* + template< typename Field_T> + BlockDataID addGPUFieldToStorage( const shared_ptr< StructuredBlockStorage > & bs, + ConstBlockDataID cpuFieldID, + const std::string & identifier ); + + + +} // namespace cuda +} // namespace walberla + + +#include "AddGPUFieldToStorage.impl.h" diff --git a/src/cuda/AddGPUFieldToStorage.impl.h b/src/cuda/AddGPUFieldToStorage.impl.h new file mode 100644 index 0000000000000000000000000000000000000000..dfde01a84f2dd4cda9dff4ab19ac52e6ab967c9d --- /dev/null +++ b/src/cuda/AddGPUFieldToStorage.impl.h @@ -0,0 +1,91 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file AddGPUFieldToStorage.impl.h +//! \ingroup cuda +//! 
\author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#pragma once + + +namespace walberla { +namespace cuda { + + + namespace internal + { + template< typename GPUField_T> + GPUField_T * createGPUField( const IBlock * const block, + const StructuredBlockStorage * const bs, + uint_t ghostLayers, + uint_t fSize, + const field::Layout & layout ) + { + return new GPUField_T( bs->getNumberOfXCells( *block ), + bs->getNumberOfYCells( *block ), + bs->getNumberOfZCells( *block ), + fSize, ghostLayers, layout ); + } + + template< typename Field_T> + GPUField< typename Field_T::value_type> * + createGPUFieldFromCPUField( const IBlock * const block, + const StructuredBlockStorage * const, + ConstBlockDataID cpuFieldID + ) + { + typedef GPUField< typename Field_T::value_type> GPUField_T; + + const Field_T * f = block->getData<Field_T>( cpuFieldID ); + auto gpuField = new GPUField_T( f->xSize(), f->ySize(), f->zSize(), f->fSize(), + f->nrOfGhostLayers(), f->layout() ); + + cuda::fieldCpy( *gpuField, *f ); + + return gpuField; + } + + } + + + template< typename GPUField_T> + BlockDataID addGPUFieldToStorage(const shared_ptr< StructuredBlockStorage >& bs, + const std::string & identifier, + uint_t fSize, + const Layout layout, + uint_t nrOfGhostLayers ) + { + auto func = boost::bind ( internal::createGPUField<GPUField_T>, _1, _2, nrOfGhostLayers, fSize, layout ); + return bs->addStructuredBlockData< GPUField_T >( func, identifier ); + } + + + template< typename Field_T> + BlockDataID addGPUFieldToStorage( const shared_ptr< StructuredBlockStorage > & bs, + ConstBlockDataID cpuFieldID, + const std::string & identifier ) + { + auto func = boost::bind ( internal::createGPUFieldFromCPUField<Field_T>, _1, _2, cpuFieldID ); + return bs->addStructuredBlockData< GPUField<typename Field_T::value_type> >( func, identifier ); + } + + + +} // namespace cuda +} // namespace walberla + + diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..6959180c2cc2c2df40633188487a001d0ff1c380 --- /dev/null +++ b/src/cuda/CMakeLists.txt @@ -0,0 +1,9 @@ +################################################################################################### +# +# Module cuda +# +################################################################################################### + +waLBerla_add_module( DEPENDS core domain_decomposition field stencil BUILD_ONLY_IF_FOUND CUDA ) + +################################################################################################### \ No newline at end of file diff --git a/src/cuda/ErrorChecking.h b/src/cuda/ErrorChecking.h new file mode 100644 index 0000000000000000000000000000000000000000..49b2167445db23614e744020c76283d0fc0a72d4 --- /dev/null +++ b/src/cuda/ErrorChecking.h @@ -0,0 +1,53 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file ErrorChecking.h +//! \ingroup cuda +//! \author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "core/Abort.h" + +#include <sstream> +#include <cuda_runtime.h> + + +namespace walberla { +namespace cuda { + + +#define WALBERLA_CUDA_CHECK(ans) { ::walberla::cuda::checkForError((ans), __FILE__, __LINE__); } + + + +inline void checkForError( cudaError_t code, const std::string & callerPath, const int line ) +{ + if(code != cudaSuccess) + { + std::stringstream ss; + ss << "CUDA Error: " << cudaGetErrorString( code ); + Abort::instance()->abort( ss.str(), callerPath, line ); + } +} + + + +} // namespace cuda +} // namespace walberla + + diff --git a/src/cuda/FieldAccessor.h b/src/cuda/FieldAccessor.h new file mode 100644 index 0000000000000000000000000000000000000000..bf45831bdf55c2536281959e5c16aaa2b3748144 --- /dev/null +++ b/src/cuda/FieldAccessor.h @@ -0,0 +1,112 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file SimpleFieldAccessor.h +//! \ingroup cuda +//! 
\author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include <cuda_runtime.h> +#include "core/DataTypes.h" + + +namespace walberla { +namespace cuda { + + + + template<typename T> + class FieldAccessor + { + public: + enum IndexingScheme { FZYX, FZY, FZ, F, + ZYXF, ZYX, ZY, Z + }; + + FieldAccessor( char * ptr, + uint32_t xOffset, + uint32_t yOffset, + uint32_t zOffset, + uint32_t fOffset, + IndexingScheme indexingScheme ) + : ptr_(ptr), xOffset_(xOffset), yOffset_(yOffset), zOffset_(zOffset), + fOffset_(fOffset), indexingScheme_(indexingScheme ) + {} + + __device__ void set( uint3 blockIdx, uint3 threadIdx ) + { + switch ( indexingScheme_) + { + case FZYX: ptr_ += blockIdx.z * fOffset_ + blockIdx.y * zOffset_ + blockIdx.x * yOffset_ + threadIdx.x * xOffset_; break; + case FZY : ptr_ += blockIdx.y * fOffset_ + blockIdx.x * zOffset_ + threadIdx.x * yOffset_; break; + case FZ : ptr_ += blockIdx.x * fOffset_ + threadIdx.x * zOffset_; break; + case F : ptr_ += threadIdx.x * fOffset_; break; + + case ZYXF: ptr_ += blockIdx.z * zOffset_ + blockIdx.y * yOffset_ + blockIdx.x * xOffset_ + threadIdx.x * fOffset_; break; + case ZYX : ptr_ += blockIdx.y * zOffset_ + blockIdx.x * yOffset_ + threadIdx.x * xOffset_; break; + case ZY : ptr_ += blockIdx.x * zOffset_ + threadIdx.x * yOffset_; break; + case Z : ptr_ += threadIdx.x * zOffset_; break; + } + } + + + __device__ unsigned int getLinearIndex( uint3 blockIdx, uint3 threadIdx, uint3 gridDim, uint3 blockDim ) + { + return threadIdx.x + + blockIdx.x * blockDim.x + + blockIdx.y * blockDim.x * gridDim.x + + blockIdx.z * blockDim.x * gridDim.x * gridDim.y ; + } + + + __device__ T & get() { return * (T*)(ptr_); } + __device__ T & get( int f) { return * (T*)(ptr_ + f * fOffset_); } + + + __device__ T & getNeighbor( int cx, int cy, int cz ) const + { + return * (T*)( ptr_ + cx * (int)(xOffset_) + + cy * (int)(yOffset_) + + cz * (int)(zOffset_) ); + } + + __device__ T & getNeighbor( int cx, int cy, int cz, int cf ) + { + return * (T*)( ptr_ + cx * (int)(xOffset_) + + cy * (int)(yOffset_) + + cz * (int)(zOffset_) + + cf * (int)(fOffset_) ); + } + + + protected: + char * ptr_; + uint32_t xOffset_; + uint32_t yOffset_; + uint32_t zOffset_; + uint32_t fOffset_; + IndexingScheme indexingScheme_; + }; + + + + +} // namespace cuda +} // namespace walberla + + diff --git a/src/cuda/FieldAccessorXYZ.h b/src/cuda/FieldAccessorXYZ.h new file mode 100644 index 0000000000000000000000000000000000000000..4e43dd1996600d1f24d3249104ef289eb9d8cfde --- /dev/null +++ b/src/cuda/FieldAccessorXYZ.h @@ -0,0 +1,79 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! 
\file FieldAccessorXYZ.h +//! \ingroup cuda +//! \author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#pragma once + + +#include <cuda_runtime.h> + +#include "core/DataTypes.h" + +namespace walberla { +namespace cuda { + + + template<typename T> + class FieldAccessorXYZ + { + public: + FieldAccessorXYZ( char * ptr, size_t xOffset, size_t yOffset, size_t zOffset, size_t fOffset ) + : ptr_(ptr), xOffset_(xOffset), yOffset_(yOffset), zOffset_(zOffset), fOffset_(fOffset) + {} + + __device__ void set( uint3 blockIdx, uint3 threadIdx ) + { + ptr_ += threadIdx.x * xOffset_ + + blockIdx.x * yOffset_ + + blockIdx.y * zOffset_ ; + } + + __device__ T & get() { return * (T*)(ptr_); } + __device__ T & get( int f) { return * (T*)(ptr_ + f * fOffset_); } + + + __device__ T & getNeighbor( int cx, int cy, int cz ) const + { + return * (T*)( ptr_ + cx * (int)(xOffset_) + + cy * (int)(yOffset_) + + cz * (int)(zOffset_) ); + } + + __device__ T & getNeighbor( int cx, int cy, int cz, int cf ) + { + return * (T*)( ptr_ + cx * (int)(xOffset_) + + cy * (int)(yOffset_) + + cz * (int)(zOffset_) + + cf * (int)(fOffset_) ); + } + + protected: + char * ptr_; + size_t xOffset_; + size_t yOffset_; + size_t zOffset_; + size_t fOffset_; + }; + + +} // namespace cuda +} // namespace walberla + + diff --git a/src/cuda/FieldCopy.h b/src/cuda/FieldCopy.h new file mode 100644 index 0000000000000000000000000000000000000000..e51f262522926490de80b618f8e4a9e93a95121b --- /dev/null +++ b/src/cuda/FieldCopy.h @@ -0,0 +1,209 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file FieldCopy.h +//! \ingroup cuda +//! 
\author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "ErrorChecking.h" +#include "GPUField.h" + +#include "domain_decomposition/StructuredBlockStorage.h" +#include "field/Field.h" +#include "field/GhostLayerField.h" + +#include "core/Abort.h" +#include "core/logging/Logging.h" + +#include <cuda_runtime.h> + +namespace walberla { +namespace cuda { + + + template<typename DstType, typename SrcType> + void fieldCpy( const shared_ptr< StructuredBlockStorage > & blocks, BlockDataID dstID, ConstBlockDataID srcID ) + { + for ( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + { + DstType * dst = blockIt->getData<DstType>( dstID ); + const SrcType * src = blockIt->getData<SrcType>( srcID ); + fieldCpy( *dst, *src ); + } + } + + template<typename DstType, typename SrcType> + boost::function<void()> fieldCpyFunctor( const shared_ptr< StructuredBlockStorage > & blocks, + BlockDataID dstID, ConstBlockDataID srcID ) + { + return boost::bind( fieldCpy<DstType,SrcType>, blocks, dstID, srcID ); + } + + + + template<typename DstType, typename SrcType> + void fieldCpySweepFunction( BlockDataID dstID, ConstBlockDataID srcID, IBlock * block ) + { + DstType * dst = block->getData<DstType>( dstID ); + const SrcType * src = block->getData<SrcType>( srcID ); + fieldCpy( *dst, *src ); + } + + template<typename DstType, typename SrcType> + boost::function<void(IBlock*)> fieldCpyFunctor( BlockDataID dstID, ConstBlockDataID srcID ) + { + return boost::bind( fieldCpySweepFunction<DstType,SrcType>, dstID, srcID, _1 ); + } + + + + + + template<typename T, uint_t fs> + void fieldCpy( cuda::GPUField<T> & dst, const field::Field<T,fs> & src ); + + + + template<typename T, uint_t fs> + void fieldCpy( field::Field<T,fs> & dst, const cuda::GPUField<T> & src ); + + + + + //=================================================================================================================== + // + // Implementation + // + //=================================================================================================================== + + + + + template<typename T, uint_t fs> + void fieldCpy( cuda::GPUField<T> & dst, const field::Field<T,fs> & src ) + { + cudaMemcpy3DParms p; + memset( &p, 0, sizeof(p) ); + + + if ( dst.layout() != src.layout() ) { + WALBERLA_ABORT( "Cannot copy fields with different layout" ); + } + + bool canCopy = ( src.layout() == fzyx && + dst.fAllocSize() == src.fAllocSize() && + dst.zAllocSize() == src.zAllocSize() && + dst.yAllocSize() == src.yAllocSize() ) + || + ( src.layout() == zyxf && + dst.zAllocSize() == src.zAllocSize() && + dst.yAllocSize() == src.yAllocSize() && + dst.xAllocSize() == src.xAllocSize() ); + + if ( !canCopy ) { + WALBERLA_ABORT("Field have to have the same size "); + } + + if ( dst.layout() == fzyx ) + { + p.srcPtr = make_cudaPitchedPtr( (void*)(src.data()), // pointer + sizeof(T) * src.xAllocSize(), // pitch + src.xAllocSize(), // inner dimension size + src.yAllocSize() ); // next outer dimension size + + p.extent.width = src.xAllocSize() * sizeof(T); + p.extent.height = src.yAllocSize(); + p.extent.depth = src.zAllocSize() * src.fAllocSize(); + } + else + { + p.srcPtr = make_cudaPitchedPtr( (void*)(src.data()), // pointer + sizeof(T) * src.fAllocSize(), // pitch + src.fAllocSize(), // inner dimension size + src.xAllocSize() ); // next outer dimension size + + p.extent.width = src.fAllocSize() * sizeof(T); + 
p.extent.height = src.xAllocSize(); + p.extent.depth = src.yAllocSize() * src.zAllocSize(); + } + + p.dstPtr = dst.pitchedPtr(); + p.kind = cudaMemcpyHostToDevice; + WALBERLA_CUDA_CHECK( cudaMemcpy3D( &p ) ); + } + + + + template<typename T, uint_t fs> + void fieldCpy( field::Field<T,fs> & dst, const cuda::GPUField<T> & src ) + { + cudaMemcpy3DParms p; + memset( &p, 0, sizeof(p) ); + + if ( dst.layout() != src.layout() ) { + WALBERLA_ABORT( "Cannot copy fields with different layout" ); + } + + bool canCopy = ( src.layout() == fzyx && + dst.fAllocSize() == src.fAllocSize() && + dst.zAllocSize() == src.zAllocSize() && + dst.yAllocSize() == src.yAllocSize() ) + || + ( src.layout() == zyxf && + dst.zAllocSize() == src.zAllocSize() && + dst.yAllocSize() == src.yAllocSize() && + dst.xAllocSize() == src.xAllocSize() ); + + if ( !canCopy ) { + WALBERLA_ABORT("Field have to have the same size "); + } + + if ( dst.layout() == fzyx ) + { + p.dstPtr = make_cudaPitchedPtr( (void*)(dst.data()), // pointer + sizeof(T) * dst.xAllocSize(), // pitch + dst.xAllocSize(), // inner dimension size + dst.yAllocSize() ); // next outer dimension size + + p.extent.width = dst.xAllocSize() * sizeof(T); + p.extent.height = dst.yAllocSize(); + p.extent.depth = dst.zAllocSize() * dst.fAllocSize(); + } + else + { + p.dstPtr = make_cudaPitchedPtr( (void*)(dst.data()), // pointer + sizeof(T) * dst.fAllocSize(), // pitch + dst.fAllocSize(), // inner dimension size + dst.xAllocSize() ); // next outer dimension size + + p.extent.width = dst.fAllocSize() * sizeof(T); + p.extent.height = dst.xAllocSize(); + p.extent.depth = dst.yAllocSize() * dst.zAllocSize(); + } + + p.srcPtr = src.pitchedPtr(); + p.kind = cudaMemcpyDeviceToHost; + WALBERLA_CUDA_CHECK( cudaMemcpy3D( &p ) ); + + } + +} // namespace cuda +} // namespace walberla + + diff --git a/src/cuda/FieldIndexing.cpp b/src/cuda/FieldIndexing.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a61e50a0d965d2528e46541623b702eb4b677c70 --- /dev/null +++ b/src/cuda/FieldIndexing.cpp @@ -0,0 +1,225 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file SimpleFieldIndexing.cpp +//! \ingroup cuda +//! 
\author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#include "FieldIndexing.h" +#include "GPUTypesExplicitInstantiation.h" +#include "GPUField.h" + +#include "core/cell/CellInterval.h" +#include "core/debug/Debug.h" +#include "core/logging/Logging.h" +#include "field/Layout.h" + +#include <cuda_runtime.h> + +#include <limits> +#include <cmath> + +namespace walberla { +namespace cuda { + +template< typename T> +FieldIndexing<T>::FieldIndexing ( const GPUField<T> & field, + uint3 _blockDim, uint3 _gridDim, + const FieldAccessor<T> _gpuAccess ) + : field_( field ), + blockDim_( _blockDim ), + gridDim_( _gridDim ), + gpuAccess_( _gpuAccess ) +{ + WALBERLA_DEBUG_SECTION() + { + cudaDeviceProp prop; + int count; + cudaGetDeviceCount(&count); + int threadsPerBlock = std::numeric_limits<int>::max(); + for (int i = 0; i < count; i++) { + cudaGetDeviceProperties(&prop, i); + threadsPerBlock = std::min( prop.maxThreadsPerBlock, threadsPerBlock ); + } + WALBERLA_ASSERT_LESS( int_c( blockDim_.x ), threadsPerBlock, + "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " << + "than the maximal thread count per CUDA block." ); + } +} + + +template< typename T> +void shiftCoordinatesWhileFastestCoordHasSizeOne( typename FieldAccessor<T>::IndexingScheme & indexing, dim3 & gridDim, dim3 & blockDim ) +{ + bool runLoop = true; + while( blockDim.x == 1 && runLoop ) + { + blockDim.x = gridDim.x; + gridDim.x = gridDim.y; + gridDim.y = gridDim.z; + gridDim.z = 1; + + + switch ( indexing ) { + case FieldAccessor<T>::FZYX: indexing = FieldAccessor<T>::FZY; break; + case FieldAccessor<T>::FZY : indexing = FieldAccessor<T>::FZ ; break; + case FieldAccessor<T>::FZ : indexing = FieldAccessor<T>::F ; break; + + case FieldAccessor<T>::ZYXF: indexing = FieldAccessor<T>::ZYX ; break; + case FieldAccessor<T>::ZYX : indexing = FieldAccessor<T>::ZY ; break; + case FieldAccessor<T>::ZY : indexing = FieldAccessor<T>::Z ; break; + + // iteration goes only over a single element - stop the loop + case FieldAccessor<T>::Z: runLoop = false; break; + case FieldAccessor<T>::F: runLoop = false; break; + } + } +} + +template< typename T> +FieldIndexing<T> FieldIndexing<T>::interval ( const GPUField<T> & f, const CellInterval & ci, int fBegin, int fEnd ) +{ + unsigned int xOffset, yOffset, zOffset, fOffset; + + if ( f.layout() == field::zyxf ) + { + fOffset = sizeof(T); + xOffset = uint32_c( f.pitchedPtr().pitch ); + yOffset = xOffset * uint32_c( f.xAllocSize() ); + zOffset = yOffset * uint32_c( f.yAllocSize() ); + } + else + { + xOffset = sizeof(T); + yOffset = uint32_c( f.pitchedPtr().pitch ); + zOffset = yOffset * uint32_c( f.yAllocSize() ); + fOffset = zOffset * uint32_c( f.zAllocSize() ); + } + char * data = (char*)f.pitchedPtr().ptr; + + // Jump over ghost cells to first inner cell + cell_idx_t gl = cell_idx_c( f.nrOfGhostLayers() ); + data += ( ci.xMin() + gl )* int_c(xOffset) + + ( ci.yMin() + gl )* int_c(yOffset) + + ( ci.zMin() + gl )* int_c(zOffset); + + + dim3 gridDim; + dim3 blockDim; + typename FieldAccessor<T>::IndexingScheme firstCoord; + if ( f.layout() == fzyx ) + { + firstCoord = FieldAccessor<T>::FZYX; + blockDim = dim3 ( (unsigned int)ci.xSize(), 1, 1 ); + gridDim = dim3 ( (unsigned int)ci.ySize(), (unsigned int)ci.zSize(), (unsigned int)( fEnd - fBegin) ); + } + else + { + firstCoord = FieldAccessor<T>::ZYXF; + blockDim = dim3 ( (unsigned int)(fEnd - fBegin), 1, 1 
); + gridDim = dim3 ( (unsigned int)ci.xSize(), (unsigned int)ci.ySize(), (unsigned int)ci.zSize() ); + } + + shiftCoordinatesWhileFastestCoordHasSizeOne<T>( firstCoord, gridDim, blockDim ); + + return FieldIndexing<T> ( f, blockDim, gridDim, + FieldAccessor<T>( data, xOffset, yOffset, zOffset, fOffset, firstCoord ) ); +} + + +template< typename T> +FieldIndexing<T> FieldIndexing<T>::xyz ( const GPUField<T> & f ) +{ + CellInterval ci ( 0,0,0, + cell_idx_c( f.xSize() ) - 1, + cell_idx_c( f.ySize() ) - 1, + cell_idx_c( f.zSize() ) - 1 ); + + return interval( f, ci, 0,1 ); +} + + +template< typename T> +FieldIndexing<T> FieldIndexing<T>::withGhostLayerXYZ( const GPUField<T> & f, uint_t numGhostLayers ) +{ + cell_idx_t gl = std::min( cell_idx_c(numGhostLayers), cell_idx_c( f.nrOfGhostLayers() ) ); + CellInterval ci ( -gl,-gl,-gl, + cell_idx_c( f.xSize() ) + gl - 1, + cell_idx_c( f.ySize() ) + gl - 1, + cell_idx_c( f.zSize() ) + gl - 1 ); + + return interval( f, ci, 0, 1 ); +} + +template< typename T> +FieldIndexing<T> FieldIndexing<T>::ghostLayerOnlyXYZ( const GPUField<T> & f, uint_t thickness, + stencil::Direction dir, bool fullSlice ) +{ + CellInterval ci; + f.getGhostRegion( dir, ci, cell_idx_c(thickness), fullSlice ); + return interval( f, ci, 0, 1 ); +} + +template< typename T> +FieldIndexing<T> FieldIndexing<T>::sliceBeforeGhostLayerXYZ( const GPUField<T> & f, uint_t thickness, + stencil::Direction dir, bool fullSlice ) +{ + CellInterval ci; + f.getSliceBeforeGhostLayer( dir, ci, cell_idx_c(thickness), fullSlice ); + return interval( f, ci, 0, 1 ); +} + +template< typename T> +FieldIndexing<T> FieldIndexing<T>::allInner ( const GPUField<T> & f ) +{ + CellInterval ci ( 0,0,0, + cell_idx_c( f.xSize() ) - 1, + cell_idx_c( f.ySize() ) - 1, + cell_idx_c( f.zSize() ) - 1 ); + + return interval( f, ci, 0, cell_idx_c( f.fSize() ) ); +} + +template< typename T> +FieldIndexing<T> FieldIndexing<T>::allWithGhostLayer ( const GPUField<T> & f ) +{ + cell_idx_t gl = cell_idx_c( f.nrOfGhostLayers() ); + CellInterval ci ( -gl,-gl,-gl, + cell_idx_c( f.xSize() ) + gl - 1, + cell_idx_c( f.ySize() ) + gl - 1, + cell_idx_c( f.zSize() ) + gl - 1 ); + + return interval( f, ci, 0, cell_idx_c( f.fSize() ) ); + +} + +template< typename T> +FieldIndexing<T> FieldIndexing<T>::all ( const GPUField<T> & f, const cell::CellInterval & ci ) +{ + return interval( f, ci, 0, cell_idx_c( f.fSize() ) ); +} + + + + +GPU_CLASS_TEMPLATE_INSTANTIATION( FieldIndexing ) + + +} // namespace cuda +} // namespace walberla + + diff --git a/src/cuda/FieldIndexing.h b/src/cuda/FieldIndexing.h new file mode 100644 index 0000000000000000000000000000000000000000..437b03453d8770d36d37e6b6f7cbe2572b062d0b --- /dev/null +++ b/src/cuda/FieldIndexing.h @@ -0,0 +1,92 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). 
If not, see <http://www.gnu.org/licenses/>. +// +//! \file SimpleFieldIndexing.h +//! \ingroup cuda +//! \author Martin Bauer <martin.bauer@fau.de> +//! \brief Indexing Scheme that executes all elements of inner coordinate within on thread block +// +//====================================================================================================================== + +#pragma once + +#include "FieldAccessor.h" + +#include "stencil/Directions.h" +#include <cuda_runtime.h> + +namespace walberla { namespace cell { class CellInterval; } } + +namespace walberla { +namespace cuda { + + // Forward Declarations + template< typename T> class GPUField; + + template<typename T> + class FieldIndexing + { + public: + + //** Kernel call ****************************************************************************************** + /*! \name Kernel call */ + //@{ + uint3 blockDim() const { return blockDim_; } + uint3 gridDim () const { return gridDim_; } + + const FieldAccessor<T> & gpuAccess() const { return gpuAccess_; } + //@} + //**************************************************************************************************************** + + + + + //** Creation ********************************************************************************************* + /*! \name Creation */ + //@{ + static FieldIndexing<T> interval ( const GPUField<T> & f, + const cell::CellInterval & ci, + int fBegin=0, int fEnd=1 ); + + + static FieldIndexing<T> xyz ( const GPUField<T> & f ); + static FieldIndexing<T> withGhostLayerXYZ ( const GPUField<T> & f, uint_t numGhostLayers ); + static FieldIndexing<T> ghostLayerOnlyXYZ ( const GPUField<T> & f, uint_t thickness, + stencil::Direction dir, bool fullSlice = false ); + static FieldIndexing<T> sliceBeforeGhostLayerXYZ( const GPUField<T> & f, uint_t thickness, + stencil::Direction dir, bool fullSlice = false ); + + static FieldIndexing<T> allInner ( const GPUField<T> & f ); + static FieldIndexing<T> allWithGhostLayer ( const GPUField<T> & f ); + static FieldIndexing<T> all ( const GPUField<T> & f, const cell::CellInterval & ci ); + //@} + //**************************************************************************************************************** + + protected: + FieldIndexing ( const GPUField<T> & field, + uint3 _blockDim, uint3 _gridDim, + const FieldAccessor<T> _gpuAccess ); + + const GPUField<T> & field_; + uint3 blockDim_; + uint3 gridDim_; + FieldAccessor<T> gpuAccess_; + }; + + +} // namespace cuda +} // namespace walberla + + diff --git a/src/cuda/FieldIndexingXYZ.cpp b/src/cuda/FieldIndexingXYZ.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e1bb2cbb2ebb6913bb24969e773eb1f3b60c9f0c --- /dev/null +++ b/src/cuda/FieldIndexingXYZ.cpp @@ -0,0 +1,123 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). 
If not, see <http://www.gnu.org/licenses/>. +// +//! \file FieldIndexingXYZ.cpp +//! \ingroup cuda +//! \author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#include "FieldIndexingXYZ.h" +#include "GPUTypesExplicitInstantiation.h" +#include "GPUField.h" + +#include "core/cell/CellInterval.h" +#include "core/debug/Debug.h" + +namespace walberla { +namespace cuda { + + +template< typename T> +FieldIndexingXYZ<T>::FieldIndexingXYZ ( const GPUField<T> & field, + uint3 _blockDim, uint3 _gridDim, + const FieldAccessorXYZ<T> _gpuAccess ) + : field_( field ), + blockDim_( _blockDim ), + gridDim_( _gridDim ), + gpuAccess_( _gpuAccess ) +{ + WALBERLA_DEBUG_SECTION() + { + cudaDeviceProp prop; + int count; + cudaGetDeviceCount(&count); + int threadsPerBlock = std::numeric_limits<int>::max(); + for (int i = 0; i < count; i++) { + cudaGetDeviceProperties(&prop, i); + threadsPerBlock = std::min( prop.maxThreadsPerBlock, threadsPerBlock ); + } + WALBERLA_ASSERT_LESS( int_c( blockDim_.x ), threadsPerBlock, + "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " << + "than the maximal thread count per CUDA block." ); + } +} + +template< typename T> +FieldIndexingXYZ<T> FieldIndexingXYZ<T>::interval ( const GPUField<T> & f, const CellInterval & ci ) +{ + size_t xOffset, yOffset, zOffset, fOffset; + + if ( f.layout() == field::zyxf ) + { + fOffset = sizeof(T); + xOffset = f.pitchedPtr().pitch; + yOffset = xOffset * f.xSize(); + zOffset = yOffset * f.ySize(); + } + else + { + xOffset = sizeof(T); + yOffset = f.pitchedPtr().pitch; + zOffset = yOffset * f.ySize(); + fOffset = zOffset * f.zSize(); + } + char * data = (char*)f.pitchedPtr().ptr; + + // Jump over ghost cells to first inner cell + cell_idx_t gl = cell_idx_c( f.nrOfGhostLayers() ); + data += ( ci.xMin() + gl )* int_c(xOffset) + + ( ci.yMin() + gl )* int_c(yOffset) + + ( ci.zMin() + gl )* int_c(zOffset); + + dim3 gridDim ( (unsigned int)ci.xSize(), 1, 1 ); + dim3 blockDim( (unsigned int)ci.ySize(), (unsigned int)ci.zSize(), 1 ); + return FieldIndexingXYZ<T> ( f, gridDim, blockDim, + FieldAccessorXYZ<T>( data, xOffset, yOffset, zOffset, fOffset ) ); +} + + +template< typename T> +FieldIndexingXYZ<T> FieldIndexingXYZ<T>::xyz ( const GPUField<T> & f ) +{ + CellInterval ci ( 0,0,0, + cell_idx_c( f.xSize() ) - 1, + cell_idx_c( f.ySize() ) - 1, + cell_idx_c( f.zSize() ) - 1 ); + + return interval( f, ci ); +} + + +template< typename T> +FieldIndexingXYZ<T> FieldIndexingXYZ<T>::withGhostLayerXYZ( const GPUField<T> & f, uint_t numGhostLayers ) +{ + cell_idx_t gl = std::min( cell_idx_c(numGhostLayers), cell_idx_c( f.nrOfGhostLayers() ) ); + CellInterval ci ( -gl,-gl,-gl, + cell_idx_c( f.xSize() ) + gl - 1, + cell_idx_c( f.ySize() ) + gl - 1, + cell_idx_c( f.zSize() ) + gl - 1 ); + + return interval( f, ci ); +} + + +GPU_CLASS_TEMPLATE_INSTANTIATION( FieldIndexingXYZ ) + + +} // namespace cuda +} // namespace walberla + + diff --git a/src/cuda/FieldIndexingXYZ.h b/src/cuda/FieldIndexingXYZ.h new file mode 100644 index 0000000000000000000000000000000000000000..19d3f98c4a71e09d670324d553834ca56075f6e1 --- /dev/null +++ b/src/cuda/FieldIndexingXYZ.h @@ -0,0 +1,79 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file FieldIndexingXYZ.h +//! \ingroup cuda +//! \author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "FieldAccessorXYZ.h" +#include <cuda_runtime.h> + + +namespace walberla { namespace cell { class CellInterval; } } + + +namespace walberla { +namespace cuda { + +// Forward Declarations +template< typename T> class GPUField; + + + template<typename T> + class FieldIndexingXYZ + { + public: + + //** Kernel call ****************************************************************************************** + /*! \name Kernel call */ + //@{ + uint3 blockDim() const { return blockDim_; } + uint3 gridDim () const { return gridDim_; } + + const FieldAccessorXYZ<T> & gpuAccess() const { return gpuAccess_; } + //@} + //**************************************************************************************************************** + + + //** Creation ********************************************************************************************* + /*! \name Creation */ + //@{ + + static FieldIndexingXYZ<T> interval ( const GPUField<T> & f, const cell::CellInterval & ci ); + + + static FieldIndexingXYZ<T> xyz ( const GPUField<T> & f ); + static FieldIndexingXYZ<T> withGhostLayerXYZ ( const GPUField<T> & f, uint_t numGhostLayers ); + //@} + //**************************************************************************************************************** + + protected: + FieldIndexingXYZ<T> ( const GPUField<T> & field, uint3 _blockDim, uint3 _gridDim, const FieldAccessorXYZ<T> _gpuAccess ); + + const GPUField<T> & field_; + uint3 blockDim_; + uint3 gridDim_; + FieldAccessorXYZ<T> gpuAccess_; + }; + + +} // namespace cuda +} // namespace walberla + + diff --git a/src/cuda/GPUField.cpp b/src/cuda/GPUField.cpp new file mode 100644 index 0000000000000000000000000000000000000000..326e5c0b7e62398a918eb6cd1f8d6bab1d4759b2 --- /dev/null +++ b/src/cuda/GPUField.cpp @@ -0,0 +1,187 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file GPUField.cpp +//! 
\ingroup cuda +//! \author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#include "GPUField.h" +#include "ErrorChecking.h" +#include "GPUTypesExplicitInstantiation.h" + +#include "core/logging/Logging.h" + +namespace walberla { +namespace cuda { + + +template<typename T> +GPUField<T>::GPUField( uint_t _xSize, uint_t _ySize, uint_t _zSize, uint_t _fSize, + uint_t _nrOfGhostLayers, const Layout & _layout ) + : nrOfGhostLayers_( _nrOfGhostLayers ), + xSize_( _xSize), ySize_( _ySize ), zSize_( _zSize ), fSize_( _fSize ), + layout_( _layout ) +{ + cudaExtent extent; + if ( layout_ == zyxf ) + { + extent.width = _fSize * sizeof(T); + extent.height = (_xSize + 2 * _nrOfGhostLayers ); + extent.depth = (_ySize + 2 * _nrOfGhostLayers ) * ( _zSize + 2 * _nrOfGhostLayers ); + } + else + { + extent.width = (_xSize + 2 * _nrOfGhostLayers ) * sizeof(T); + extent.height = (_ySize + 2 * _nrOfGhostLayers ); + extent.depth = (_zSize + 2 * _nrOfGhostLayers ) * _fSize; + } + + WALBERLA_CUDA_CHECK ( cudaMalloc3D ( &pitchedPtr_, extent ) ); +} + + +template<typename T> +GPUField<T>::~GPUField() +{ + cudaFree( pitchedPtr_.ptr ); +} + + +template<typename T> +void GPUField<T>::getGhostRegion(stencil::Direction d, CellInterval & ci, + cell_idx_t thickness, bool fullSlice ) const +{ + const cell_idx_t sizeArr [] = { cell_idx_c( xSize() ), + cell_idx_c( ySize() ), + cell_idx_c( zSize() )}; + + WALBERLA_ASSERT_GREATER( thickness, 0 ); + WALBERLA_ASSERT_LESS_EQUAL( uint_c(thickness), nrOfGhostLayers() ); + const cell_idx_t ghosts = cell_idx_c ( thickness ); + + cell_idx_t fullSliceInc = fullSlice ? cell_idx_c( nrOfGhostLayers() ) : 0; + + for(unsigned int dim = 0; dim< 3; ++dim) + switch ( stencil::c[dim][d] ) + { + case -1: ci.min()[dim] = -ghosts; ci.max()[dim] = 0 - 1; break; + case 0: ci.min()[dim] = -fullSliceInc; ci.max()[dim] = sizeArr[dim]+fullSliceInc - 1; break; + case 1: ci.min()[dim] = sizeArr[dim]; ci.max()[dim] = sizeArr[dim]+ghosts - 1; break; + } +} + + +template<typename T> +void GPUField<T>::getSliceBeforeGhostLayer(stencil::Direction d, CellInterval & ci, + cell_idx_t thickness, bool fullSlice ) const +{ + WALBERLA_ASSERT_GREATER( thickness, 0 ); + + const cell_idx_t sizeArr [] = { cell_idx_c( xSize() ), + cell_idx_c( ySize() ), + cell_idx_c( zSize() )}; + + cell_idx_t fullSliceInc = fullSlice ? cell_idx_c( nrOfGhostLayers() ) : 0; + for(unsigned int dim = 0; dim< 3; ++dim) + switch ( stencil::c[dim][d] ) + { + case -1: ci.min()[dim] = 0; ci.max()[dim] = thickness - 1; break; + case 0: ci.min()[dim] = -fullSliceInc; ci.max()[dim] = sizeArr[dim] +fullSliceInc- 1; break; + case 1: ci.min()[dim] = sizeArr[dim]-thickness; ci.max()[dim] = sizeArr[dim] - 1; break; + } +} + +//******************************************************************************************************************* +/*! Implementation of equality operator, GPUField are equal if identical +* +* operator== is required in order to store GPUFields as block data. +* Implementing a correct equality check would require to call a kernel, +* which should be done manually. +* Therefore only identical GPUFields are considered equal. 
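+*
+* A short illustration of this semantics (the fields a and b below are hypothetical, equally sized fields used only for illustration):
+* \code
+* GPUField<double> a( 16, 16, 16, 1, 1 );
+* GPUField<double> b( 16, 16, 16, 1, 1 );
+* // a == a is true  (same device memory)
+* // a == b is false, although all sizes match, because the fields own different device memory
+* \endcode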
+*/ +//******************************************************************************************************************* +template<typename T> +bool GPUField<T>::operator==( const GPUField & o ) const +{ + return pitchedPtr_.ptr == o.pitchedPtr_.ptr && + nrOfGhostLayers_ == o.nrOfGhostLayers_ && + xSize_ == o.xSize_ && + ySize_ == o.ySize_ && + zSize_ == o.zSize_ && + fSize_ == o.fSize_ && + layout_ == o.layout_ ; +} + + + +template<typename T> +uint_t GPUField<T>::xAllocSize() const +{ + if ( layout_ == field::fzyx ) + { + // allocation size is stored in pitched pointer + // pitched pointer stores the amount of padded region in bytes + // but we have to return the size in #elements + WALBERLA_ASSERT_EQUAL( pitchedPtr_.pitch % sizeof(T), 0 ); + return pitchedPtr_.pitch / sizeof(T); + } + return xSize_ + 2 * nrOfGhostLayers_; +} + +template<typename T> +uint_t GPUField<T>::yAllocSize() const +{ + return ySize_ + 2 * nrOfGhostLayers_; +} + +template<typename T> +uint_t GPUField<T>::zAllocSize() const +{ + return zSize_ + 2 * nrOfGhostLayers_; +} + +template<typename T> +uint_t GPUField<T>::fAllocSize() const +{ + if ( layout_ == field::zyxf ) + { + WALBERLA_ASSERT_EQUAL( pitchedPtr_.pitch % sizeof(T), 0 ); + return pitchedPtr_.pitch / sizeof(T); + } + return fSize_; +} + +template<typename T> +void GPUField<T>::swapDataPointers( GPUField<T> & other ) +{ + WALBERLA_ASSERT_EQUAL( xAllocSize(), other.xAllocSize() ); + WALBERLA_ASSERT_EQUAL( yAllocSize(), other.yAllocSize() ); + WALBERLA_ASSERT_EQUAL( zAllocSize(), other.zAllocSize() ); + WALBERLA_ASSERT_EQUAL( fAllocSize(), other.fAllocSize() ); + WALBERLA_ASSERT_EQUAL( layout(), other.layout() ); + std::swap( pitchedPtr_, other.pitchedPtr_ ); +} + + + +GPU_CLASS_TEMPLATE_INSTANTIATION( GPUField ) + + +} // namespace cuda +} // namespace walberla + + diff --git a/src/cuda/GPUField.h b/src/cuda/GPUField.h new file mode 100755 index 0000000000000000000000000000000000000000..52cc002b3d98f9cc196b9ddc8d12fc9f9f2c7214 --- /dev/null +++ b/src/cuda/GPUField.h @@ -0,0 +1,122 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file GPUField.h +//! \ingroup moduleName +//! \author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" +#include "core/cell/CellInterval.h" +#include "field/Layout.h" +#include "stencil/Directions.h" + +#include <cuda_runtime.h> + + + +namespace walberla { +namespace cuda { + + using field::Layout; + using field::fzyx; + using field::zyxf; + + + //******************************************************************************************************************* + /*! 
GhostLayerField stored on a CUDA GPU + * + * Basically a wrapper around a CUDA device pointer together with size information about the field + * i.e. sizes in x,y,z,f directions and number of ghost layers. + * + * Internally represented by a cudaPitchedPtr which is allocated with cudaMalloc3D to take padding of the + * innermost coordinate into account. + * + * Supports Array-of-Structures (AoS,zyxf) layout and Structure-of-Arrays (SoA, fzyx) layout, in a similiar way + * to field::Field + * + * To work with the GPUField look at the cuda::fieldCpy functions to transfer a field::Field to a cuda::GPUField + * and vice versa. + * When writing CUDA kernels for GPUFields have a look at the FieldIndexing and FieldAccessor concepts. + * These simplify the "iteration" i.e. indexing of cells in GPUFields. + */ + //******************************************************************************************************************* + template<typename T> + class GPUField + { + public: + typedef T value_type; + + GPUField( uint_t _xSize, uint_t _ySize, uint_t _zSize, uint_t _fSize, + uint_t _nrOfGhostLayers, const Layout & _layout = zyxf ); + + ~GPUField(); + + Layout layout() const { return layout_; } + + cudaPitchedPtr pitchedPtr() const { return pitchedPtr_; } + + + inline uint_t xSize() const { return xSize_; } + inline uint_t ySize() const { return ySize_; } + inline uint_t zSize() const { return zSize_; } + inline uint_t fSize() const { return fSize_; } + + cell_idx_t xOff() const { return cell_idx_c( nrOfGhostLayers_ ); } + cell_idx_t yOff() const { return cell_idx_c( nrOfGhostLayers_ ); } + cell_idx_t zOff() const { return cell_idx_c( nrOfGhostLayers_ ); } + + + uint_t xAllocSize() const; + uint_t yAllocSize() const; + uint_t zAllocSize() const; + uint_t fAllocSize() const; + + + void swapDataPointers( GPUField<T> & other ); + void swapDataPointers( GPUField<T> * other ) { swapDataPointers( *other); } + + + inline uint_t nrOfGhostLayers() const { return nrOfGhostLayers_; } + + bool operator==( const GPUField & other ) const; + + void getGhostRegion( stencil::Direction d, CellInterval & ci, + cell_idx_t thickness, bool fullSlice ) const; + void getSliceBeforeGhostLayer(stencil::Direction d, CellInterval & ci, + cell_idx_t thickness, bool fullSlice ) const; + + + void * data() { return pitchedPtr_.ptr; } + const void * data() const { return pitchedPtr_.ptr; } + + protected: + cudaPitchedPtr pitchedPtr_; + uint_t nrOfGhostLayers_; + uint_t xSize_; + uint_t ySize_; + uint_t zSize_; + uint_t fSize_; + Layout layout_; + }; + + +} // namespace cuda +} // namespace walberla + + diff --git a/src/cuda/GPUTypesExplicitInstantiation.h b/src/cuda/GPUTypesExplicitInstantiation.h new file mode 100644 index 0000000000000000000000000000000000000000..bdc4b58461e748db9133228970498ac5cab422fe --- /dev/null +++ b/src/cuda/GPUTypesExplicitInstantiation.h @@ -0,0 +1,8 @@ +#define GPU_CLASS_TEMPLATE_INSTANTIATION(ClassName)\ + template class ClassName< double >;\ + template class ClassName< float >;\ + template class ClassName< int >;\ + template class ClassName< uint8_t >;\ + template class ClassName< uint16_t >; + + diff --git a/src/cuda/HostFieldAllocator.h b/src/cuda/HostFieldAllocator.h new file mode 100644 index 0000000000000000000000000000000000000000..0c08bfadbbc0629dccd307477ddc932590a1e751 --- /dev/null +++ b/src/cuda/HostFieldAllocator.h @@ -0,0 +1,81 @@ +//====================================================================================================================== +// +// This file is part of 
waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file CudaHostFieldAllocator.h +//! \ingroup cuda +//! \author Martin Bauer <martin.bauer@fau.de> +//! \brief Allocator that allocates a CPU! field using cudaHostAlloc +// +//====================================================================================================================== + +#pragma once + +#include "ErrorChecking.h" +#include "field/allocation/FieldAllocator.h" + +#include <cuda_runtime.h> + + +namespace walberla { +namespace cuda { + + + //******************************************************************************************************************* + /*! + * Allocator that allocates a CPU! field using cudaHostAlloc without padding + * + * Uses cudaHostAlloc for the allocation - which allocates page-locked memory that is faster to transfer to the GPU + * This allocator should be used for CPU fields that are often transfered to GPU and back + * + * \ingroup cuda + * + */ + //******************************************************************************************************************* + template<typename T, unsigned int cudaHostAllocFlags = cudaHostAllocDefault> + class HostFieldAllocator : public field::FieldAllocator<T> + { + public: + virtual ~HostFieldAllocator() {} + + virtual T * allocateMemory ( uint_t size0, uint_t size1, uint_t size2, uint_t size3, + uint_t & allocSize1, uint_t & allocSize2, uint_t & allocSize3 ) + { + allocSize1=size1; + allocSize2=size2; + allocSize3=size3; + void * result; + WALBERLA_CUDA_CHECK( cudaHostAlloc( &result, size0*size1*size2*size3*sizeof(T), cudaHostAllocFlags ) ); + return (T*)(result); + } + + virtual T * allocateMemory ( uint_t size ) + { + T* result; + cudaHostAlloc( &result, size*sizeof(T), cudaHostAllocFlags ); + return result; + } + + virtual void deallocate(T *& values) { + WALBERLA_CUDA_CHECK( cudaFreeHost( values ) ); + } + }; + + + + +} // namespace cuda +} // namespace walberla + + diff --git a/src/cuda/Kernel.h b/src/cuda/Kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..3b6acf0a1fe2bb7956110236e2b68d07dcb5e4b3 --- /dev/null +++ b/src/cuda/Kernel.h @@ -0,0 +1,294 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. 
+// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file Kernel.h +//! \ingroup cuda +//! \author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "core/Abort.h" +#include "core/debug/Debug.h" + +#include "ErrorChecking.h" + +#include <cuda_runtime.h> +#include <boost/type_traits.hpp> +#include <vector> + + + +namespace walberla { +namespace cuda { + + + //******************************************************************************************************************* + /*! Wrapper class around a CUDA kernel, to call kernels also from code not compiled with nvcc + * + * Example: + * \code + // Declaration of kernel, implementation has to be in a file compiled with nvcc + void kernel_func ( double * inputData, int size ); + + auto kernel = make_kernel( kernel_func ); + kernel.addParam<double*> ( argument1 ); + kernel.addParam<int> ( 20 ); + kernel.configure( dim3( 3,3,3), dim3( 4,4,4) ); + kernel(); + // this code is equivalent to: + kernel_func<<< dim3( 3,3,3), dim3( 4,4,4) >> ( argument1, 20 ); + * \endcode + * + * Why use this strange wrapper class instead of the nice kernel call syntax "<<<griddim, blockdim >>>" ?? + * - This syntax is nice but has to be compiled with nvcc, which does not (yet) understand C++11 + * - C++11 features are used all over the place in waLBerla code + * - all *.cu files and headers included in *.cu files have to be "C++11 free" + * - thus there should be as few code as possible in *.cu files + * + * Drawbacks of this class compared to kernel call syntax: + * Type checking of parameters can only be done at runtime (is done only in Debug mode!). + * Consider the following example: + * \code + // Declaration of kernel, implementation has to be in a file compiled with nvcc + void kernel_func ( double * inputData, int size ); + + auto kernel = make_kernel( kernel_func ); + kernel.addParam<float*> ( argument1 ); + kernel.addParam<unsigned int> ( 40 ); + kernel.configure( dim3( 3,3,3), dim3( 4,4,4) ); + kernel(); + // this code is equivalent to: + kernel_func<<< dim3( 3,3,3), dim3( 4,4,4) >> ( argument1, 20 ); + * \endcode + * The parameter types of the kernel and the parameters added at the cuda::Kernel class do not match. + * This is only detected when the code is run and was compiled in DEBUG mode! + * + * + * Advantages of this class compared to kernel call syntax: Integrates nicely with waLBerlas field indexing and + * accessor concepts: + * \code + void kernel_func( cuda::SimpleFieldAccessor<double> f ); + + auto myKernel = cuda::make_kernel( &kernel_double ); + myKernel.addFieldIndexingParam( cuda::SimpleFieldIndexing<double>::xyz( gpuField ) ); + myKernel(); + * \endcode + * When using at least one FieldIndexingParameter configure() does not have to be called, since the thread and grid + * setup is done by the indexing scheme. If two FieldIndexingParameters are passed, the two indexing schemes have to + * be consistent. 
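+   *
+   * A combined sketch of both parameter kinds (the kernel name scale_kernel and the variable gpuField are
+   * assumptions used for illustration only):
+   * \code
+     // declared in a file compiled with nvcc:
+     void scale_kernel( cuda::FieldAccessor<double> f, double factor );
+
+     auto kernel = cuda::make_kernel( &scale_kernel );
+     // the indexing parameter provides gridDim/blockDim, so configure() need not be called
+     kernel.addFieldIndexingParam( cuda::FieldIndexing<double>::xyz( gpuField ) );
+     // further plain parameters are added in the order of the kernel signature
+     kernel.addParam<double>( 0.5 );
+     kernel();
+   * \endcode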
+ */ + //******************************************************************************************************************* + template<typename FuncPtr> + class Kernel + { + public: + Kernel( FuncPtr funcPtr ); + + template<typename T> void addParam( const T & param ); + template<typename T> void addFieldIndexingParam( const T & indexing ); + + + void configure( dim3 gridDim, dim3 blockDim ); + void operator() () const; + + + protected: + template<typename T> size_t determineNextOffset(); + + + //** Members ********************************************************************************************** + /*! \name Members */ + //@{ + FuncPtr funcPtr_; + + bool configured_; + dim3 gridDim_; + dim3 blockDim_; + + struct ParamInfo { + std::vector<char> data; + size_t offset; + }; + std::vector< ParamInfo > params_; + //@} + //**************************************************************************************************************** + + + //** Type checking of parameters ********************************************************************************** + /*! \name Type checking of parameters */ + //@{ + typedef typename boost::remove_pointer<FuncPtr>::type FuncType; + + #define CHECK_PARAMETER_FUNC( Number ) \ + template<typename T> \ + bool checkParameter##Number( typename boost::enable_if_c< (boost::function_traits<FuncType>::arity >= Number ), T >::type * = 0 ) { \ + return boost::is_same< T, typename boost::function_traits<FuncType>::arg##Number##_type >::value; \ + } \ + template<typename T> \ + bool checkParameter##Number( typename boost::disable_if_c< (boost::function_traits<FuncType>::arity >= Number ),T >::type * = 0 ) { \ + return false; \ + } + + CHECK_PARAMETER_FUNC(1) + CHECK_PARAMETER_FUNC(2) + CHECK_PARAMETER_FUNC(3) + CHECK_PARAMETER_FUNC(4) + CHECK_PARAMETER_FUNC(5) + CHECK_PARAMETER_FUNC(6) + CHECK_PARAMETER_FUNC(7) + CHECK_PARAMETER_FUNC(8) + + template<typename T> bool checkParameter( uint_t n ); + //@} + //**************************************************************************************************************** + }; + + + template<typename FuncPtr> + Kernel<FuncPtr> make_kernel( FuncPtr funcPtr ) { + return Kernel<FuncPtr> ( funcPtr ); + } + + + + + + + + //=================================================================================================================== + // + // Implementation + // + //=================================================================================================================== + + template<typename FP> + Kernel<FP>::Kernel( FP funcPtr ) + : funcPtr_ ( funcPtr ), + configured_( false ) + {} + + template<typename FP> + template<typename T> + void Kernel<FP>::addParam( const T & param ) + { + ParamInfo paramInfo; + paramInfo.data.resize( sizeof(T) ); + std::memcpy ( &(paramInfo.data[0]), ¶m, sizeof(T) ); + paramInfo.offset = determineNextOffset<T>(); + + WALBERLA_ASSERT( checkParameter<T>( params_.size() +1 ), + "cuda::Kernel type mismatch of parameter " << params_.size() +1 ); + + params_.push_back( paramInfo ); + } + + + template<typename FP> + template<typename Indexing> + void Kernel<FP>::addFieldIndexingParam( const Indexing & indexing ) + { + configure( indexing.gridDim(), indexing.blockDim() ); + addParam( indexing.gpuAccess() ); + } + + template<typename FP> + void Kernel<FP>::configure( dim3 gridDim, dim3 blockDim ) + { + if ( ! 
configured_ ) + { + gridDim_ = gridDim; + blockDim_ = blockDim; + configured_ = true; + } + else + { + if ( gridDim.x != gridDim_.x || gridDim.y != gridDim_.y || gridDim.z != gridDim_.z || + blockDim.x != blockDim_.x || blockDim.y != blockDim_.y || blockDim.z != blockDim_.z ) + { + WALBERLA_ABORT( "Error when configuring cuda::Kernel: Inconsistent setup. " ); + } + } + } + + template<typename FP> + void Kernel<FP>::operator() () const + { + // check for correct number of parameter calls + + if ( params_.size() != boost::function_traits<FuncType>::arity ) { + WALBERLA_ABORT( "Error when calling cuda::Kernel - Wrong number of arguments. " << + "Expected " << boost::function_traits<FuncType>::arity << ", received " << params_.size() ); + } + + // set the number of blocks and threads, + WALBERLA_CUDA_CHECK( cudaConfigureCall( gridDim_, blockDim_ ) ); //TODO extend class to support streams + + // register all parameters + for( auto paramIt = params_.begin(); paramIt != params_.end(); ++paramIt ) { + const void * ptr = &(paramIt->data[0]); + WALBERLA_CUDA_CHECK( cudaSetupArgument( ptr, paramIt->data.size(), paramIt->offset ) ); + } + + // .. and launch the kernel + static_assert( sizeof(void *) == sizeof(void (*)(void)), + "object pointer and function pointer sizes must be equal" ); + // dirty casting trick to circumvent compiler warning + // essentially the next two lines are: cudaLaunch( funcPtr_ ); + void *q = (void*) &funcPtr_; + WALBERLA_CUDA_CHECK( cudaLaunch( (const char*) ( *static_cast<void **>(q) )) ); + } + + + template<typename FP> + template<typename T> + bool Kernel<FP>::checkParameter( uint_t n ) + { + switch (n) { + case 1: return checkParameter1<T>(); + case 2: return checkParameter2<T>(); + case 3: return checkParameter3<T>(); + case 4: return checkParameter4<T>(); + case 5: return checkParameter5<T>(); + case 6: return checkParameter6<T>(); + case 7: return checkParameter7<T>(); + case 8: return checkParameter8<T>(); + default: + WALBERLA_ABORT("Too many parameters passed to kernel"); + } + return false; + } + + + template<typename FP> + template<typename T> + size_t Kernel<FP>::determineNextOffset() + { + size_t currentOffset = 0; + if ( !params_.empty() ) + currentOffset = params_.back().offset + params_.back().data.size(); + + size_t alignment = __alignof( T ); + return (currentOffset + alignment-1) & ~(alignment-1); + } + + + + +} // namespace cuda +} // namespace walberla + + diff --git a/src/cuda/doc/cuda.dox b/src/cuda/doc/cuda.dox new file mode 100644 index 0000000000000000000000000000000000000000..96652834d67ca2a6caa342855d77981f52d7f214 --- /dev/null +++ b/src/cuda/doc/cuda.dox @@ -0,0 +1,80 @@ + +namespace walberla{ +/*! 
+
+\page cudaPage Overview of waLBerla CUDA support
+
+\brief waLBerla CUDA concepts
+
+
+\section cudaField Fields on GPU
+
+
+\subsection cudaFieldOverview Creating GPU fields and copying them between host and device
+
+   \code
+   // create a CPU field and a GPU field of same size and with same layout
+   GhostLayerField<double,4> h_f ( 16,20,30, 1, 42.0, field::fzyx );
+   cuda::GPUField<double>    d_f ( 16,20,30, 4, 1, field::fzyx );
+
+   cuda::fieldCpy( d_f, h_f ); // copy from host to device
+   some_kernel_wrapper( d_f ); // run some kernel
+   cuda::fieldCpy( h_f, d_f ); // copy field data back to host
+   \endcode
+
+   Similarities and differences of CPU and GPU fields:
+   - cuda::GPUField corresponds to field::GhostLayerField
+   - fSize is a template parameter for CPU fields and a normal constructor parameter for GPUFields
+   - CPU field iterators correspond to FieldAccessors (see next section)
+
+\subsection cudaFieldAccess Writing CUDA kernels operating on GPUFields
+
+   \image html cuda/doc/fieldAccess.png "Accessing fields in CUDA kernels"
+
+   When writing a kernel that operates on a field, the first task is to distribute the data to CUDA threads and blocks.
+   We need a function \f$ (blockIdx, threadIdx) \rightarrow (x,y,z) \f$ or \f$ (blockIdx, threadIdx) \rightarrow (x,y,z,f) \f$.
+   The optimal mapping depends on many parameters: for example the layout of the field, the extents of each coordinate,
+   hardware parameters like the warp size, etc.
+   Therefore this indexing function is abstracted. A few indexing strategies are already implemented and can be
+   replaced by custom strategies.
+   An indexing strategy consists of two classes: a somewhat complex Indexing class, which manages the
+   indexing on the host side, and a lightweight Accessor class, which is passed to the CUDA kernel.
+
+   An indexing scheme is very similar to the iterator concept: it defines the bounds of the iteration, which is not necessarily the
+   complete field but can also be a certain sub-block, for example the ghost layer in a certain direction.
+
+
+   Let's start by writing a simple kernel that doubles all values stored in a field:
+   \code
+   #include "cuda/FieldAccessor.h"
+
+   __global__ void kernel_double( cuda::FieldAccessor<double> f )
+   {
+      f.set( blockIdx, threadIdx );
+      f.get() *= 2.0;
+   }
+   \endcode
+   We do not have to care about indexing; the cuda::FieldAccessor takes care of that. This is therefore a generic kernel that operates
+   on double fields. Using the cuda::FieldAccessor, the current and neighboring values can be accessed and manipulated.
+
+   This kernel can be called like this:
+   \code
+   cuda::FieldIndexing<double> indexing = cuda::FieldIndexing<double>::sliceBeforeGhostLayerXYZ( field, 1, stencil::E, true );
+   kernel_double<<< indexing.gridDim(), indexing.blockDim() >>> ( indexing.gpuAccess() );
+   \endcode
+   In the example above we iterate only over a slice of the field. Of course we can also iterate over the complete field;
+   the Indexing classes provide various static member functions that create such iteration patterns.
+   An Indexing class encapsulates the information on how to launch the kernel (blockDim and gridDim) and holds the Accessor object that
+   is passed to the kernel.
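+
+   To run the same kernel on the complete interior of the field, a whole-field indexing can be created instead.
+   The following is a minimal sketch; it assumes a cuda::GPUField<double> named field, analogous to the slice example above:
+   \code
+   cuda::FieldIndexing<double> wholeField = cuda::FieldIndexing<double>::xyz( field );
+   kernel_double<<< wholeField.gridDim(), wholeField.blockDim() >>> ( wholeField.gpuAccess() );
+   \endcode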
+ + Two indexing strategies are currently provided: + - cuda::FieldIndexing and cuda::FieldAccessor (general, but slow ) + - cuda::FieldIndexingXYZ and cuda::FieldAccessorXYZ ( optimized for cell based iterating over bigger chunks, for fields where xSize bigger than warpSize ) + + \section cudaKernelWrapper Calling CUDA kernels from CPP files + \copydoc cuda::Kernel + + + +*/ +} diff --git a/src/cuda/doc/drawing.svg b/src/cuda/doc/drawing.svg new file mode 100644 index 0000000000000000000000000000000000000000..4e356d3f301c16035e3c87dbbb7674d6af2459e6 --- /dev/null +++ b/src/cuda/doc/drawing.svg @@ -0,0 +1,285 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Created with Inkscape (http://www.inkscape.org/) --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="744.09448819" + height="1052.3622047" + id="svg2" + version="1.1" + inkscape:version="0.48.4 r9939" + sodipodi:docname="drawing.svg"> + <defs + id="defs4"> + <marker + inkscape:stockid="Arrow1Lend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow1Lend" + style="overflow:visible;"> + <path + id="path3859" + d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z " + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;" + transform="scale(0.8) rotate(180) translate(12.5,0)" /> + </marker> + <marker + inkscape:stockid="Arrow1Lstart" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow1Lstart" + style="overflow:visible"> + <path + id="path3856" + d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z " + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt" + transform="scale(0.8) translate(12.5,0)" /> + </marker> + <marker + inkscape:stockid="Arrow1Lend" + orient="auto" + refY="0" + refX="0" + id="Arrow1Lend-7" + style="overflow:visible"> + <path + inkscape:connector-curvature="0" + id="path3859-2" + d="M 0,0 5,-5 -12.5,0 5,5 0,0 z" + style="fill-rule:evenodd;stroke:#000000;stroke-width:1pt" + transform="matrix(-0.8,0,0,-0.8,-10,0)" /> + </marker> + </defs> + <sodipodi:namedview + id="base" + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1.0" + inkscape:pageopacity="0.0" + inkscape:pageshadow="2" + inkscape:zoom="1.4" + inkscape:cx="310.32854" + inkscape:cy="651.81828" + inkscape:document-units="px" + inkscape:current-layer="layer1" + showgrid="true" + inkscape:window-width="1600" + inkscape:window-height="1180" + inkscape:window-x="2558" + inkscape:window-y="-3" + inkscape:window-maximized="1"> + <inkscape:grid + type="xygrid" + id="grid2987" /> + </sodipodi:namedview> + <metadata + id="metadata7"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <g + inkscape:label="Layer 1" + inkscape:groupmode="layer" + id="layer1"> + <rect + style="fill:#88bfff;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + id="rect3757" + width="240" + height="60" + x="40" + y="53.076469" + ry="10" /> + <text + xml:space="preserve" + 
style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans" + x="50" + y="92.362183" + id="text3759" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan3761" + x="50" + y="92.362183" + style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Monospace;-inkscape-font-specification:Monospace">GhostLayerField</tspan></text> + <rect + style="fill:#88bfff;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + id="rect3757-9" + width="240" + height="60" + x="40" + y="182.36218" + ry="10" /> + <text + xml:space="preserve" + style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans" + x="50" + y="222.36218" + id="text3759-2" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan3761-6" + x="50" + y="222.36218" + style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Monospace;-inkscape-font-specification:Monospace">cuda::GPUField</tspan></text> + <rect + style="fill:#c7ffea;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.71999997px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + id="rect3757-0" + width="166.62854" + height="40.114277" + x="277.37146" + y="126.6479" + ry="7.1999998" /> + <text + xml:space="preserve" + style="font-size:28.79999924px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans" + x="300" + y="152.36218" + id="text3759-7" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan3761-1" + x="300" + y="152.36218" + style="font-size:17.27999878px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Monospace;-inkscape-font-specification:Monospace">FieldCopy.h</tspan></text> + <path + style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow1Lstart);marker-end:url(#Arrow1Lend)" + d="m 150,182.36218 0,-70" + id="path3850" + inkscape:connector-curvature="0" /> + <text + xml:space="preserve" + style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans" + x="160" + y="152.36218" + id="text4482" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4484" + x="160" + y="152.36218" + style="font-size:11px">cpu-gpu transfer</tspan></text> + <rect + style="fill:#ffe693;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + id="rect3757-9-2" + width="240" + height="60" + x="42.142857" + y="293.07648" + ry="10" /> + <text + xml:space="preserve" + style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans" + x="90" + y="332.36218" + id="text3759-2-2" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan3761-6-3" + x="90" + y="332.36218" + 
style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Monospace;-inkscape-font-specification:Monospace">*Indexing</tspan></text> + <text + xml:space="preserve" + style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans" + x="293.57144" + y="316.64789" + id="text4482-4" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4484-2" + x="293.57144" + y="316.64789" + style="font-size:11px">represents iteration over a field </tspan><tspan + sodipodi:role="line" + x="293.57144" + y="330.39789" + style="font-size:11px" + id="tspan4550"> i.e. complete field, only slice, only ghost layer..</tspan></text> + <path + style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow1Lend);stroke-miterlimit:4;stroke-dasharray:1,1;stroke-dashoffset:0" + d="m 150,292.36218 0,-50" + id="path4552" + inkscape:connector-curvature="0" /> + <text + xml:space="preserve" + style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans" + x="81.428574" + y="275.21933" + id="text4482-6" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4484-26" + x="81.428574" + y="275.21933" + style="font-size:11px"><<< can be created with >>></tspan></text> + <rect + style="fill:#ffe693;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + id="rect3757-9-2-7" + width="240" + height="60" + x="42.14286" + y="403.07648" + ry="10" /> + <text + xml:space="preserve" + style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans" + x="90" + y="442.36218" + id="text3759-2-2-5" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan3761-6-3-3" + x="90" + y="442.36218" + style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Monospace;-inkscape-font-specification:Monospace">*Accessor</tspan></text> + <path + style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:1, 1;stroke-dashoffset:0;marker-end:url(#Arrow1Lend)" + d="m 150,402.36218 0,-50" + id="path4552-9" + inkscape:connector-curvature="0" /> + <text + xml:space="preserve" + style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans" + x="81.428574" + y="385.21933" + id="text4482-6-8" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4484-26-6" + x="81.428574" + y="385.21933" + style="font-size:11px"><<< can be created with >>></tspan></text> + <text + xml:space="preserve" + style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans" + x="290" + y="432.36218" + id="text4482-4-0" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + x="290" + y="432.36218" + style="font-size:11px" + id="tspan4550-2">class to be passed to GPU kernel</tspan><tspan + sodipodi:role="line" + x="290" + y="446.11218" + style="font-size:11px" + id="tspan5222">can 
access a single value or cell , supports neighbor access</tspan></text> + </g> +</svg> diff --git a/src/cuda/doc/fieldAccess.png b/src/cuda/doc/fieldAccess.png new file mode 100644 index 0000000000000000000000000000000000000000..d6dc372b737cea71bb0e4128f11a8bf72edafab3 Binary files /dev/null and b/src/cuda/doc/fieldAccess.png differ diff --git a/src/cuda/ideasForCommunication.txt b/src/cuda/ideasForCommunication.txt new file mode 100644 index 0000000000000000000000000000000000000000..44f43cc5c616e74fd9b4b283e8457dce49c5760b --- /dev/null +++ b/src/cuda/ideasForCommunication.txt @@ -0,0 +1,40 @@ + +HybridUniformNeighborScheme scheme ( blockstorage, blockSelectorForGPU); + +# Add field that is communicated +scheme.addPackInfo( FieldPackInfo(cpuFieldID), GPUFieldPackInfo(gpuFieldID) ); +# possibly add further fields which are sent in same message +scheme.addPackInfo( .... ) + + +GPUFieldPackInfo Interface + + MemberVariable: the GPUField + - pack ( direction, device_ptr, offset ) # -> calls packing kernel that copies the slice before ghost layer to the device_ptr buffer + - getConstantByteSizeForDirection( d ) # returns the required buffer size to pack for a neighbor in the given direction + - unpack (direction, device_ptr, offset ) + + +# Pseudo code for scheme.startCommunication +startCommunication: + 1) Pack everything into buffers + For all blocks + For all neighbors of current block + if currentBlock=gpu: + if neighbor is localcpu + -> call localToCPU + if neighbor is localgpu + -> + if neighbor is somewhere else + -> call pack on gpu + else currentBlock=cpu: + similar to above + + +How to send on GPU? + - use blocking send on GPU and different streams for all blocks + - use ISend ... how to manage the exchange of MPI_Request and the wait? + + + + + diff --git a/src/field/Field.h b/src/field/Field.h index 38c402dc500ffadcf4b54094693e6457c6e7c8b2..c7785e80ef0006e8cb2703751c2f6f8870579dfe 100644 --- a/src/field/Field.h +++ b/src/field/Field.h @@ -181,17 +181,6 @@ namespace field { //**************************************************************************************************************** - //** Low Level Functions ********************************************************************************** - /*! \name Low Level Functions - only use when absolutely necessary */ - //@{ - T * data(); - const T * data() const; - - shared_ptr< FieldAllocator<T> > getAllocator() const; - //@} - //**************************************************************************************************************** - - //** Equality Checks ********************************************************************************************* /*! \name Equality Checks */ //@{ @@ -297,6 +286,17 @@ namespace field { //**************************************************************************************************************** + //** Pointer to internal memory - use with care! ********************************************************** + /*! \name Pointer to internal memory - use with care! 
*/ + //@{ + T * data() { return values_; } + const T * data() const { return values_; } + T * dataInner() { return valuesWithOffset_; } + const T * dataInner() const { return valuesWithOffset_; } + + shared_ptr< FieldAllocator<T> > getAllocator() const; + //@} + //**************************************************************************************************************** //** Static cached end iterators ********************************************************************************* diff --git a/src/field/Field.impl.h b/src/field/Field.impl.h index 2feeefaf0723ca6484179fad4ac08fb6260ce662..39b9b227d76ddb481ceed93bc32feebdef124cf4 100644 --- a/src/field/Field.impl.h +++ b/src/field/Field.impl.h @@ -1136,32 +1136,6 @@ namespace field { //=================================================================================================================== - //******************************************************************************************************************* - /*! Returns a pointer to the memory region where field data is stored - * - * For additional information about memory layout, have a look at the xOff() and xStride() functions - * If you store a pointer to the internal field data, you have to make sure the data is not freed by the field class. - * See documentation of getAllocator() for this - */ - //******************************************************************************************************************* - template<typename T, uint_t fSize_ > - T * Field<T,fSize_>::data() - { - return values_; - } - - //******************************************************************************************************************* - /*! Returns a pointer to the memory region where field data is stored - * - * see documentation of nonconst version - */ - //******************************************************************************************************************* - template<typename T, uint_t fSize_ > - const T * Field<T,fSize_>::data() const - { - return values_; - } - //******************************************************************************************************************* /*! Returns internal data allocator * diff --git a/src/field/Layout.h b/src/field/Layout.h new file mode 100644 index 0000000000000000000000000000000000000000..ed78ba0e9d00b9e82f64766006aa9fcbff486c5a --- /dev/null +++ b/src/field/Layout.h @@ -0,0 +1,43 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file Layout.h +//! \ingroup field +//! 
\author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#pragma once + + +namespace walberla { +namespace field { + + + /** + * \brief Layout for field ( + * \ingroup field + */ + enum Layout { + fzyx = 0, //!< Value-sorted data layout (f should be outermost loop) + zyxf = 1 //!< Cell-sorted data layout, (f should be innermost loop) + }; + + + +} // namespace field +} // namespace walberla + + diff --git a/src/field/communication/MPIDatatypes.h b/src/field/communication/MPIDatatypes.h index 0aaff22d46ccdb2e651ea3c7d84ba347ca55c473..3e1fe5f88a78169a2b2cc2a3c48d00861faeaffb 100644 --- a/src/field/communication/MPIDatatypes.h +++ b/src/field/communication/MPIDatatypes.h @@ -65,8 +65,8 @@ namespace communication { * \returns The MPI datatype */ //====================================================================================================================== -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatype( const Field< T, fSize > & field ); +template<typename Field_T> +MPI_Datatype mpiDatatype( const Field_T & field ); @@ -92,8 +92,8 @@ MPI_Datatype mpiDatatype( const Field< T, fSize > & field ); * \returns The MPI datatype */ //====================================================================================================================== -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeSlice( const Field< T, fSize > & field, +template<typename Field_T> +MPI_Datatype mpiDatatypeSlice( const Field_T & field, const cell_idx_t xBeg, const cell_idx_t yBeg, const cell_idx_t zBeg, const cell_idx_t fBeg, const cell_idx_t xEnd, const cell_idx_t yEnd, const cell_idx_t zEnd, const cell_idx_t fEnd ); @@ -115,8 +115,8 @@ MPI_Datatype mpiDatatypeSlice( const Field< T, fSize > & field, * \returns The MPI datatype */ //====================================================================================================================== -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeSliceXYZ( const Field< T, fSize > & field, const CellInterval & interval, cell_idx_t f = 0 ); +template<typename Field_T> +MPI_Datatype mpiDatatypeSliceXYZ( const Field_T & field, const CellInterval & interval, cell_idx_t f = 0 ); @@ -137,8 +137,8 @@ MPI_Datatype mpiDatatypeSliceXYZ( const Field< T, fSize > & field, const CellInt * \returns The MPI datatype */ //====================================================================================================================== -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeSliceXYZ( const Field< T, fSize > & field, const CellInterval & interval, const cell_idx_t fBeg, const cell_idx_t fEnd ); +template<typename Field_T> +MPI_Datatype mpiDatatypeSliceXYZ( const Field_T & field, const CellInterval & interval, const cell_idx_t fBeg, const cell_idx_t fEnd ); @@ -158,8 +158,8 @@ MPI_Datatype mpiDatatypeSliceXYZ( const Field< T, fSize > & field, const CellInt * \returns The MPI datatype */ //====================================================================================================================== -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeSliceXYZ( const Field< T, fSize > & field, const CellInterval & interval, const std::set<cell_idx_t> & fs ); +template<typename Field_T> +MPI_Datatype mpiDatatypeSliceXYZ( const Field_T & field, const CellInterval & interval, const std::set<cell_idx_t> & fs ); @@ -176,8 +176,8 @@ MPI_Datatype mpiDatatypeSliceXYZ( const Field< T, fSize > & 
field, const CellInt * \returns The MPI datatype */ //====================================================================================================================== -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeWithGhostLayer( const GhostLayerField< T, fSize > & field ); +template<typename GhostLayerField_T> +MPI_Datatype mpiDatatypeWithGhostLayer( const GhostLayerField_T & field ); @@ -196,8 +196,8 @@ MPI_Datatype mpiDatatypeWithGhostLayer( const GhostLayerField< T, fSize > & fiel * \returns The MPI datatype */ //====================================================================================================================== -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeWithGhostLayer( const GhostLayerField< T, fSize > & field, const uint_t numGhostLayers ); +template<typename GhostLayerField_T> +MPI_Datatype mpiDatatypeWithGhostLayer( const GhostLayerField_T & field, const uint_t numGhostLayers ); @@ -217,8 +217,8 @@ MPI_Datatype mpiDatatypeWithGhostLayer( const GhostLayerField< T, fSize > & fiel * \returns The MPI datatype */ //====================================================================================================================== -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeGhostLayerOnly( const GhostLayerField< T, fSize > & field, const stencil::Direction dir, const bool fullSlice = false ); +template<typename GhostLayerField_T> +MPI_Datatype mpiDatatypeGhostLayerOnly( const GhostLayerField_T & field, const stencil::Direction dir, const bool fullSlice = false ); @@ -238,8 +238,8 @@ MPI_Datatype mpiDatatypeGhostLayerOnly( const GhostLayerField< T, fSize > & fiel * \returns The MPI datatype */ //====================================================================================================================== -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeGhostLayerOnly( const GhostLayerField< T, fSize > & field, const uint_t thickness, const stencil::Direction dir, const bool fullSlice = false ); +template<typename GhostLayerField_T> +MPI_Datatype mpiDatatypeGhostLayerOnly( const GhostLayerField_T & field, const uint_t thickness, const stencil::Direction dir, const bool fullSlice = false ); @@ -259,8 +259,8 @@ MPI_Datatype mpiDatatypeGhostLayerOnly( const GhostLayerField< T, fSize > & fiel * \returns The MPI datatype */ //====================================================================================================================== -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeGhostLayerOnlyXYZ( const GhostLayerField< T, fSize > & field, const stencil::Direction dir, const bool fullSlice = false, const cell_idx_t f = 0 ); +template<typename GhostLayerField_T> +MPI_Datatype mpiDatatypeGhostLayerOnlyXYZ( const GhostLayerField_T & field, const stencil::Direction dir, const bool fullSlice = false, const cell_idx_t f = 0 ); //====================================================================================================================== @@ -280,8 +280,8 @@ MPI_Datatype mpiDatatypeGhostLayerOnlyXYZ( const GhostLayerField< T, fSize > & f * \returns The MPI datatype */ //====================================================================================================================== -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeGhostLayerOnlyXYZ( const GhostLayerField< T, fSize > & field, const stencil::Direction dir, const bool fullSlice, const cell_idx_t fBeg, const cell_idx_t fEnd ); +template<typename GhostLayerField_T> +MPI_Datatype 
mpiDatatypeGhostLayerOnlyXYZ( const GhostLayerField_T & field, const stencil::Direction dir, const bool fullSlice, const cell_idx_t fBeg, const cell_idx_t fEnd ); //====================================================================================================================== @@ -300,8 +300,8 @@ MPI_Datatype mpiDatatypeGhostLayerOnlyXYZ( const GhostLayerField< T, fSize > & f * \returns The MPI datatype */ //====================================================================================================================== -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeGhostLayerOnlyXYZ( const GhostLayerField< T, fSize > & field, const stencil::Direction dir, const bool fullSlice, const std::set<cell_idx_t> & fs ); +template<typename GhostLayerField_T> +MPI_Datatype mpiDatatypeGhostLayerOnlyXYZ( const GhostLayerField_T & field, const stencil::Direction dir, const bool fullSlice, const std::set<cell_idx_t> & fs ); //====================================================================================================================== @@ -320,8 +320,8 @@ MPI_Datatype mpiDatatypeGhostLayerOnlyXYZ( const GhostLayerField< T, fSize > & f * \returns The MPI datatype */ //====================================================================================================================== -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeSliceBeforeGhostlayer( const GhostLayerField< T, fSize > & field, const stencil::Direction dir, const uint_t thickness = 1, const bool fullSlice = false ); +template<typename GhostLayerField_T> +MPI_Datatype mpiDatatypeSliceBeforeGhostlayer( const GhostLayerField_T & field, const stencil::Direction dir, const uint_t thickness = 1, const bool fullSlice = false ); //====================================================================================================================== @@ -341,8 +341,8 @@ MPI_Datatype mpiDatatypeSliceBeforeGhostlayer( const GhostLayerField< T, fSize > * \returns The MPI datatype */ //====================================================================================================================== -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeSliceBeforeGhostlayerXYZ( const GhostLayerField< T, fSize > & field, const stencil::Direction dir, const uint_t thickness = 1, const cell_idx_t f = 0, const bool fullSlice = false ); +template<typename GhostLayerField_T> +MPI_Datatype mpiDatatypeSliceBeforeGhostlayerXYZ( const GhostLayerField_T & field, const stencil::Direction dir, const uint_t thickness = 1, const cell_idx_t f = 0, const bool fullSlice = false ); //====================================================================================================================== @@ -363,8 +363,8 @@ MPI_Datatype mpiDatatypeSliceBeforeGhostlayerXYZ( const GhostLayerField< T, fSiz * \returns The MPI datatype */ //====================================================================================================================== -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeSliceBeforeGhostlayerXYZ( const GhostLayerField< T, fSize > & field, const stencil::Direction dir, const uint_t thickness, const cell_idx_t fBeg, const cell_idx_t fEnd, const bool fullSlice ); +template<typename GhostLayerField_T> +MPI_Datatype mpiDatatypeSliceBeforeGhostlayerXYZ( const GhostLayerField_T & field, const stencil::Direction dir, const uint_t thickness, const cell_idx_t fBeg, const cell_idx_t fEnd, const bool fullSlice ); 
//====================================================================================================================== @@ -384,12 +384,12 @@ MPI_Datatype mpiDatatypeSliceBeforeGhostlayerXYZ( const GhostLayerField< T, fSiz * \returns The MPI datatype */ //====================================================================================================================== -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeSliceBeforeGhostlayerXYZ( const GhostLayerField< T, fSize > & field, const stencil::Direction dir, const uint_t thickness, const std::set<cell_idx_t> & fs, const bool fullSlice ); +template<typename GhostLayerField_T> +MPI_Datatype mpiDatatypeSliceBeforeGhostlayerXYZ( const GhostLayerField_T & field, const stencil::Direction dir, const uint_t thickness, const std::set<cell_idx_t> & fs, const bool fullSlice ); } // namespace communication } // namespace field } // namespace walberla -#include "MPIDatatypes.impl.h" \ No newline at end of file +#include "MPIDatatypes.impl.h" diff --git a/src/field/communication/MPIDatatypes.impl.h b/src/field/communication/MPIDatatypes.impl.h index 3a801a6717b01b8075ae5d42867fd2aeef329cc9..1bb4bf3a6bf5c8ff35d335dc20d2365c04de931e 100644 --- a/src/field/communication/MPIDatatypes.impl.h +++ b/src/field/communication/MPIDatatypes.impl.h @@ -28,11 +28,12 @@ namespace field { namespace communication { -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeSlice( const Field< T, fSize > & field, +template<typename Field_T> +MPI_Datatype mpiDatatypeSlice( const Field_T & field, const cell_idx_t xBeg, const cell_idx_t yBeg, const cell_idx_t zBeg, const cell_idx_t fBeg, const cell_idx_t xEnd, const cell_idx_t yEnd, const cell_idx_t zEnd, const cell_idx_t fEnd ) { + typedef typename Field_T::value_type T; int sizes[4]; int subsizes[4]; int starts[4]; @@ -101,8 +102,8 @@ MPI_Datatype mpiDatatypeSlice( const Field< T, fSize > & field, -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatype( const Field< T, fSize > & field ) +template<typename Field_T> +MPI_Datatype mpiDatatype( const Field_T & field ) { return mpiDatatypeSlice( field, cell_idx_t( 0 ), cell_idx_t( 0 ), cell_idx_t( 0 ), cell_idx_t( 0 ), @@ -111,8 +112,8 @@ MPI_Datatype mpiDatatype( const Field< T, fSize > & field ) } -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeSliceXYZ( const Field< T, fSize > & field, const CellInterval & interval, cell_idx_t f /*= 0*/ ) +template<typename Field_T> +MPI_Datatype mpiDatatypeSliceXYZ( const Field_T & field, const CellInterval & interval, cell_idx_t f /*= 0*/ ) { return mpiDatatypeSlice( field, interval.xMin(), interval.yMin(), interval.zMin(), f, @@ -120,8 +121,8 @@ MPI_Datatype mpiDatatypeSliceXYZ( const Field< T, fSize > & field, const CellInt } -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeSliceXYZ( const Field< T, fSize > & field, const CellInterval & interval, const cell_idx_t fBeg, const cell_idx_t fEnd ) +template<typename Field_T> +MPI_Datatype mpiDatatypeSliceXYZ( const Field_T & field, const CellInterval & interval, const cell_idx_t fBeg, const cell_idx_t fEnd ) { return mpiDatatypeSlice( field, interval.xMin(), interval.yMin(), interval.zMin(), fBeg, @@ -129,9 +130,11 @@ MPI_Datatype mpiDatatypeSliceXYZ( const Field< T, fSize > & field, const CellInt } -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeSliceXYZ( const Field< T, fSize > & field, const CellInterval & interval, const std::set<cell_idx_t> & fs ) +template<typename Field_T> +MPI_Datatype mpiDatatypeSliceXYZ( const 
Field_T & field, const CellInterval & interval, const std::set<cell_idx_t> & fs ) { + typedef typename Field_T::value_type T; + MPI_Datatype newType = MPI_DATATYPE_NULL; int sizes[3]; @@ -206,14 +209,14 @@ MPI_Datatype mpiDatatypeSliceXYZ( const Field< T, fSize > & field, const CellInt } -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeWithGhostLayer( const GhostLayerField< T, fSize > & field ) +template<typename GhostLayerField_T> +MPI_Datatype mpiDatatypeWithGhostLayer( const GhostLayerField_T & field ) { return mpiDatatypeWithGhostLayer( field, field.nrOfGhostLayers() ); } -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeWithGhostLayer( const GhostLayerField< T, fSize > & field, const uint_t numGhostLayers ) +template<typename GhostLayerField_T> +MPI_Datatype mpiDatatypeWithGhostLayer( const GhostLayerField_T & field, const uint_t numGhostLayers ) { const cell_idx_t xBeg = - cell_idx_c( numGhostLayers ); const cell_idx_t yBeg = - cell_idx_c( numGhostLayers ); @@ -231,14 +234,14 @@ MPI_Datatype mpiDatatypeWithGhostLayer( const GhostLayerField< T, fSize > & fiel } -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeGhostLayerOnly( const GhostLayerField< T, fSize > & field, const stencil::Direction dir, const bool fullSlice /*= false*/ ) +template<typename GhostLayerField_T> +MPI_Datatype mpiDatatypeGhostLayerOnly( const GhostLayerField_T & field, const stencil::Direction dir, const bool fullSlice /*= false*/ ) { return mpiDatatypeGhostLayerOnly( field, field.nrOfGhostLayers(), dir, fullSlice ); } -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeGhostLayerOnly( const GhostLayerField< T, fSize > & field, const uint_t thickness, const stencil::Direction dir, const bool fullSlice /*= false*/ ) +template<typename GhostLayerField_T> +MPI_Datatype mpiDatatypeGhostLayerOnly( const GhostLayerField_T & field, const uint_t thickness, const stencil::Direction dir, const bool fullSlice /*= false*/ ) { CellInterval ci; field.getGhostRegion( dir, ci, cell_idx_c( thickness ), fullSlice ); @@ -250,8 +253,8 @@ MPI_Datatype mpiDatatypeGhostLayerOnly( const GhostLayerField< T, fSize > & fiel } -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeGhostLayerOnlyXYZ( const GhostLayerField< T, fSize > & field, const stencil::Direction dir, const bool fullSlice /*= false*/, const cell_idx_t f /*= 0*/ ) +template<typename GhostLayerField_T> +MPI_Datatype mpiDatatypeGhostLayerOnlyXYZ( const GhostLayerField_T & field, const stencil::Direction dir, const bool fullSlice /*= false*/, const cell_idx_t f /*= 0*/ ) { CellInterval ci; field.getGhostRegion( dir, ci, cell_idx_c( field.nrOfGhostLayers() ), fullSlice ); @@ -259,8 +262,8 @@ MPI_Datatype mpiDatatypeGhostLayerOnlyXYZ( const GhostLayerField< T, fSize > & f return mpiDatatypeSliceXYZ( field, ci, f ); } -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeGhostLayerOnlyXYZ( const GhostLayerField< T, fSize > & field, const stencil::Direction dir, const bool fullSlice, const cell_idx_t fBeg, const cell_idx_t fEnd ) +template<typename GhostLayerField_T> +MPI_Datatype mpiDatatypeGhostLayerOnlyXYZ( const GhostLayerField_T & field, const stencil::Direction dir, const bool fullSlice, const cell_idx_t fBeg, const cell_idx_t fEnd ) { CellInterval ci; field.getGhostRegion( dir, ci, cell_idx_c( field.nrOfGhostLayers() ), fullSlice ); @@ -268,8 +271,8 @@ MPI_Datatype mpiDatatypeGhostLayerOnlyXYZ( const GhostLayerField< T, fSize > & f return mpiDatatypeSliceXYZ( field, ci, fBeg, fEnd ); } -template<typename T, uint_t 
fSize> -MPI_Datatype mpiDatatypeGhostLayerOnlyXYZ( const GhostLayerField< T, fSize > & field, const stencil::Direction dir, const bool fullSlice, const std::set<cell_idx_t> & fs ) +template<typename GhostLayerField_T> +MPI_Datatype mpiDatatypeGhostLayerOnlyXYZ( const GhostLayerField_T & field, const stencil::Direction dir, const bool fullSlice, const std::set<cell_idx_t> & fs ) { CellInterval ci; field.getGhostRegion( dir, ci, cell_idx_c( field.nrOfGhostLayers() ), fullSlice ); @@ -277,8 +280,8 @@ MPI_Datatype mpiDatatypeGhostLayerOnlyXYZ( const GhostLayerField< T, fSize > & f return mpiDatatypeSliceXYZ( field, ci, fs ); } -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeSliceBeforeGhostlayer( const GhostLayerField< T, fSize > & field, const stencil::Direction dir, const uint_t thickness /*= 1*/, const bool fullSlice /*= false*/ ) +template<typename GhostLayerField_T> +MPI_Datatype mpiDatatypeSliceBeforeGhostlayer( const GhostLayerField_T & field, const stencil::Direction dir, const uint_t thickness /*= 1*/, const bool fullSlice /*= false*/ ) { CellInterval ci; field.getSliceBeforeGhostLayer( dir, ci, cell_idx_c( thickness ), fullSlice ); @@ -289,8 +292,8 @@ MPI_Datatype mpiDatatypeSliceBeforeGhostlayer( const GhostLayerField< T, fSize > return mpiDatatypeSliceXYZ( field, ci, fBeg, fEnd ); } -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeSliceBeforeGhostlayerXYZ( const GhostLayerField< T, fSize > & field, const stencil::Direction dir, const uint_t thickness /*= 1*/, const cell_idx_t f /*= 0*/, const bool fullSlice /*= false*/ ) +template<typename GhostLayerField_T> +MPI_Datatype mpiDatatypeSliceBeforeGhostlayerXYZ( const GhostLayerField_T & field, const stencil::Direction dir, const uint_t thickness /*= 1*/, const cell_idx_t f /*= 0*/, const bool fullSlice /*= false*/ ) { CellInterval ci; field.getSliceBeforeGhostLayer( dir, ci, cell_idx_c( thickness ), fullSlice ); @@ -298,8 +301,8 @@ MPI_Datatype mpiDatatypeSliceBeforeGhostlayerXYZ( const GhostLayerField< T, fSiz return mpiDatatypeSliceXYZ( field, ci, f ); } -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeSliceBeforeGhostlayerXYZ( const GhostLayerField< T, fSize > & field, const stencil::Direction dir, const uint_t thickness, const cell_idx_t fBeg, const cell_idx_t fEnd, const bool fullSlice ) +template<typename GhostLayerField_T> +MPI_Datatype mpiDatatypeSliceBeforeGhostlayerXYZ( const GhostLayerField_T & field, const stencil::Direction dir, const uint_t thickness, const cell_idx_t fBeg, const cell_idx_t fEnd, const bool fullSlice ) { CellInterval ci; field.getSliceBeforeGhostLayer( dir, ci, cell_idx_c( thickness ), fullSlice ); @@ -307,8 +310,8 @@ MPI_Datatype mpiDatatypeSliceBeforeGhostlayerXYZ( const GhostLayerField< T, fSiz return mpiDatatypeSliceXYZ( field, ci, fBeg, fEnd ); } -template<typename T, uint_t fSize> -MPI_Datatype mpiDatatypeSliceBeforeGhostlayerXYZ( const GhostLayerField< T, fSize > & field, const stencil::Direction dir, const uint_t thickness, const std::set<cell_idx_t> & fs, const bool fullSlice ) +template<typename GhostLayerField_T> +MPI_Datatype mpiDatatypeSliceBeforeGhostlayerXYZ( const GhostLayerField_T & field, const stencil::Direction dir, const uint_t thickness, const std::set<cell_idx_t> & fs, const bool fullSlice ) { CellInterval ci; field.getSliceBeforeGhostLayer( dir, ci, cell_idx_c( thickness ), fullSlice ); @@ -319,4 +322,4 @@ MPI_Datatype mpiDatatypeSliceBeforeGhostlayerXYZ( const GhostLayerField< T, fSiz } // namespace communication } // namespace field -} // 
namespace walberla \ No newline at end of file +} // namespace walberla diff --git a/src/field/iterators/FieldIterator.h b/src/field/iterators/FieldIterator.h index 6a831fb54deda9f200218b039162cb61d0f18bee..a1c328e7ad0195e1f839cbbd0cb9435ffdae56cd 100644 --- a/src/field/iterators/FieldIterator.h +++ b/src/field/iterators/FieldIterator.h @@ -26,6 +26,7 @@ #include "core/DataTypes.h" #include "core/cell/Cell.h" #include "core/debug/Debug.h" +#include "field/Layout.h" #include "stencil/Directions.h" @@ -39,16 +40,6 @@ namespace walberla { namespace field { - /** - * \brief Layout for field ( - * \ingroup field - */ - enum Layout { - fzyx = 0, //!< Value-sorted data layout (f should be outermost loop) - zyxf = 1 //!< Cell-sorted data layout, (f should be innermost loop) - }; - - template<typename T, uint_t fSize_> class Field; // forward for friend declaration diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 00c3bf12b7a8ea299197b10f1b50186383814ed2..55b690d6c46f9f3d5423bff092bb74b477f3b90e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -5,6 +5,7 @@ include_directories( ${walberla_BINARY_DIR}/src ) # for generated headers add_subdirectory( blockforest ) add_subdirectory( boundary ) add_subdirectory( core ) +add_subdirectory( cuda ) add_subdirectory( domain_decomposition ) add_subdirectory( fft ) add_subdirectory( field ) diff --git a/tests/cuda/CMakeLists.txt b/tests/cuda/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..89d914803497c7f907f7338eeea19efaba01769f --- /dev/null +++ b/tests/cuda/CMakeLists.txt @@ -0,0 +1,15 @@ +################################################################################################### +# +# Tests for cuda +# +################################################################################################### + +waLBerla_compile_test( FILES FieldTransferTest ) +waLBerla_execute_test( NAME FieldTransferTest ) + +waLBerla_compile_test( FILES SimpleKernelTest.cpp Kernels.cu DEPENDS blockforest timeloop gui ) +waLBerla_execute_test( NAME SimpleKernelTest ) + +waLBerla_compile_test( FILES CudaMPI DEPENDS blockforest timeloop gui ) +waLBerla_execute_test( NAME CudaMPI ) + diff --git a/tests/cuda/CudaMPI.cpp b/tests/cuda/CudaMPI.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ffcb892601289e954d96f59899d26837a5df7f86 --- /dev/null +++ b/tests/cuda/CudaMPI.cpp @@ -0,0 +1,144 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file CudaMPI.h +//! 
\author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + + +#include "blockforest/Initialization.h" + + +#include "core/debug/TestSubsystem.h" +#include "core/Environment.h" +#include "core/logging/Logging.h" +#include "core/mpi/Datatype.h" + +#include "cuda/GPUField.h" + +#include "field/communication/MPIDatatypes.h" +#include "field/AddToStorage.h" +#include "timeloop/SweepTimeloop.h" + +#include "gui/Gui.h" + + +using namespace walberla; + + +void fullFieldTransfer() +{ + Field<double,4> h_f1 ( 3, 4, 2, 42.0, field::fzyx ); + Field<double,4> h_f2 ( 3, 4, 2, 27.0, field::fzyx ); + + cuda::GPUField<double> d_f ( 3, 4, 2, 4, 8.0, field::fzyx ); + + + // Transfer h_f1 from CPU to GPU d_f + + auto h_f1_datatype = mpi::Datatype ( field::communication::mpiDatatype( h_f1 ) ); + auto h_f2_datatype = mpi::Datatype ( field::communication::mpiDatatype( h_f2 ) ); + auto d_f_datatype = mpi::Datatype ( field::communication::mpiDatatype( d_f ) ); + + WALBERLA_LOG_DEVEL("ISend"); + MPI_Request request1; + MPI_Isend( h_f1.data(), 1, h_f1_datatype, 0, 0, MPI_COMM_WORLD, &request1 ); + + WALBERLA_LOG_DEVEL("IRecv"); + MPI_Request request2; + MPI_Irecv( d_f.data(), 1, d_f_datatype, 0, 0, MPI_COMM_WORLD, &request2 ); + + MPI_Wait( &request1, MPI_STATUS_IGNORE ); + MPI_Wait( &request2, MPI_STATUS_IGNORE ); + + // Transfer GPU field d_f back to CPU into h_f2 + + MPI_Request request3; + WALBERLA_LOG_DEVEL("ISend"); + MPI_Isend( d_f.data(), 1, d_f_datatype, 0, 0, MPI_COMM_WORLD , &request3 ); + + MPI_Request request4; + WALBERLA_LOG_DEVEL("IRecv"); + MPI_Irecv( h_f2.data(), 1, h_f2_datatype, 0, 0, MPI_COMM_WORLD, &request4 ); + + MPI_Wait( &request3, MPI_STATUS_IGNORE ); + MPI_Wait( &request4, MPI_STATUS_IGNORE ); + + WALBERLA_CHECK_EQUAL( h_f1, h_f2 ); +} + + +void blockStorageAndGui( int argc, char ** argv ) +{ + shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid( + uint_c(1) , uint_c(1), uint_c(1), // number of blocks in x,y,z direction + uint_c(5) , uint_c(7), uint_c(3), // number of blocks in x,y,z direction + real_c(1), // dx: length of one cell in physical coordinates + false, // one block per process? 
- "false" means all blocks to one process + true, true, true ); // no periodicity + + typedef GhostLayerField<real_t,1> ScalarField; + BlockDataID cpuFieldID1 = field::addToStorage<ScalarField>( blocks, "CPUField 1", real_c(42), field::fzyx, uint_c(1) ); + BlockDataID cpuFieldID2 = field::addToStorage<ScalarField>( blocks, "CPUField 2", real_c(0), field::fzyx, uint_c(1) ); + + typedef cuda::GPUField<real_t> GPUField; + BlockDataID gpuFieldID = blocks->addStructuredBlockData< GPUField >( + [&] ( IBlock * block, StructuredBlockStorage * const s ) { + return new GPUField( s->getNumberOfXCells(*block), + s->getNumberOfYCells(*block), + s->getNumberOfZCells(*block), + 1 , 1.0); + }, + "GPU Field" ); + + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + { + // get the field stored on the current block + ScalarField * h_f1 = blockIt->getData<ScalarField>( cpuFieldID1 ); + ScalarField * h_f2 = blockIt->getData<ScalarField>( cpuFieldID2 ); + GPUField * d_f = blockIt->getData<GPUField> ( gpuFieldID ); + + auto h_f1_datatype = mpi::Datatype ( field::communication::mpiDatatypeSliceBeforeGhostlayer( *h_f1, stencil::W, 1, true ) ); + auto h_f2_datatype = mpi::Datatype ( field::communication::mpiDatatypeSliceBeforeGhostlayer( *h_f2, stencil::W, 1, true ) ); + auto d_f_datatype = mpi::Datatype ( field::communication::mpiDatatypeSliceBeforeGhostlayer( *d_f , stencil::W, 1, true ) ); + + MPI_Sendrecv( const_cast<double *>( h_f1->data() ), 1, h_f1_datatype, 0, 0, + d_f->data(), 1, d_f_datatype , 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE ); + + MPI_Sendrecv( d_f->data(), 1, d_f_datatype, 0, 0, + h_f2->data(), 1, h_f2_datatype, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE ); + + + } + + SweepTimeloop timeloop( blocks, 4 ); + GUI gui( timeloop, blocks, argc, argv ); + gui.run(); + +} + + +int main( int argc, char ** argv ) +{ + debug::enterTestMode(); + walberla::Environment walberlaEnv( argc, argv ); + + fullFieldTransfer(); + //blockStorageAndGui(argc, argv); + + + return 0; +} diff --git a/tests/cuda/FieldTransferTest.cpp b/tests/cuda/FieldTransferTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e37768d9d2d1dd5a2fd188d057bc493f514d1f7a --- /dev/null +++ b/tests/cuda/FieldTransferTest.cpp @@ -0,0 +1,65 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file FieldTransferTest.h +//! 
\author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + + +#include "core/debug/TestSubsystem.h" +#include "core/Environment.h" + +#include "field/Field.h" + +#include "cuda/GPUField.h" +#include "cuda/FieldCopy.h" + + +using namespace walberla; + +void simpleTransfer() +{ + Field<double,4> h_f1 ( 16, 20, 30, 42.0, field::fzyx ); + Field<double,4> h_f2 ( 16, 20, 30, 0.0, field::fzyx ); + + + cuda::GPUField<double> d_f ( 16,20,30,4,0, field::fzyx ); + + WALBERLA_CHECK_EQUAL( h_f1.xSize() ,d_f.xSize() ); + WALBERLA_CHECK_EQUAL( h_f1.ySize() ,d_f.ySize() ); + WALBERLA_CHECK_EQUAL( h_f1.zSize() ,d_f.zSize() ); + WALBERLA_CHECK_EQUAL( h_f1.fSize() ,d_f.fSize() ); + WALBERLA_CHECK_EQUAL( h_f1.layout() ,d_f.layout() ); + + + cuda::fieldCpy( d_f, h_f1 ); + cuda::fieldCpy( h_f2, d_f ); + + WALBERLA_CHECK_EQUAL( h_f1, h_f2 ); +} + + + + +int main( int argc, char ** argv ) +{ + debug::enterTestMode(); + walberla::Environment walberlaEnv( argc, argv ); + + simpleTransfer(); + + return 0; +} diff --git a/tests/cuda/Kernels.cu b/tests/cuda/Kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..fb422811365c6c46e0a426df2a0876f10eb95062 --- /dev/null +++ b/tests/cuda/Kernels.cu @@ -0,0 +1,33 @@ + +#include <iostream> + +#include "cuda/FieldAccessor.h" +#include "cuda/FieldIndexing.h" + +namespace walberla { + + +namespace cuda { + template<typename T> + class GPUField; +} + +__global__ void kernel_double( cuda::FieldAccessor<double> f ) +{ + f.set( blockIdx, threadIdx ); + f.get() *= 2.0; +} + +void kernel_double_field( const cuda::GPUField<double> & field ) +{ + using namespace std; + cuda::FieldIndexing<double> iter = cuda::FieldIndexing<double>::sliceBeforeGhostLayerXYZ( field, 1, stencil::E, true ); + std::cout << "Kernel call dims " + << iter.blockDim().x << "," + << iter.gridDim().x << "," + << iter.gridDim().y << "," + << iter.gridDim().z << endl; + kernel_double<<< iter.gridDim(), iter.blockDim() >>> ( iter.gpuAccess() ); +} + +} // namespace walberla diff --git a/tests/cuda/SimpleKernelTest.cpp b/tests/cuda/SimpleKernelTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8313947ed6c672a2aa3c92b641b37b8eced9515b --- /dev/null +++ b/tests/cuda/SimpleKernelTest.cpp @@ -0,0 +1,115 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file FieldTransferTest.h +//! 
\author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#include "cuda/FieldIndexing.h" +#include "blockforest/Initialization.h" + +#include "core/debug/TestSubsystem.h" +#include "core/Environment.h" + +#include "field/GhostLayerField.h" + +#include "cuda/GPUField.h" +#include "cuda/FieldCopy.h" +#include "cuda/Kernel.h" +#include "gui/Gui.h" +#include "timeloop/SweepTimeloop.h" + +using namespace walberla; + +namespace walberla{ +void kernel_double_field( const cuda::GPUField<double> & field ); + +void kernel_double( cuda::FieldAccessor<double> f ); +} + + + +GhostLayerField<real_t,1> * createCPUField( IBlock* const block, StructuredBlockStorage* const storage ) +{ + return new GhostLayerField<real_t,1> ( + storage->getNumberOfXCells( *block ), // number of cells in x direction + storage->getNumberOfYCells( *block ), // number of cells in y direction + storage->getNumberOfZCells( *block ), // number of cells in z direction + 1, // number of ghost layers + real_t(1), // initial value + field::fzyx); +} + +cuda::GPUField<real_t> * createGPUField( IBlock* const block, StructuredBlockStorage* const storage ) +{ + return new cuda::GPUField<real_t> ( + storage->getNumberOfXCells( *block ), // number of cells in x direction + storage->getNumberOfYCells( *block ), // number of cells in y direction + storage->getNumberOfZCells( *block ), // number of cells in z direction + 1, // fSize + 1, // number of ghost layers + field::fzyx ); +} + + +int main( int argc, char ** argv ) +{ + walberla::Environment env( argc, argv ); + debug::enterTestMode(); + + shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid ( + uint_t(1), uint_t(1), uint_t(1), // number of blocks in x,y,z direction + uint_t(14), uint_t(14), uint_t(14), // how many cells per block (x,y,z) + real_c(0.5), // dx: length of one cell in physical coordinates + false, // one block per process - "false" means all blocks to one process + false, false, false ); // no periodicity + + + + BlockDataID cpuFieldID = blocks->addStructuredBlockData< GhostLayerField<real_t,1> > ( &createCPUField, "CPUField" ); + + + BlockDataID gpuFieldID = blocks->addStructuredBlockData< cuda::GPUField<real_t> > ( &createGPUField, "GPUField" ); + + for ( auto blockIterator = blocks->begin(); blockIterator != blocks->end(); ++blockIterator ) + { + IBlock & currentBlock = *blockIterator; + + // get the field stored on the current block + auto cpuField = currentBlock.getData< GhostLayerField<real_t,1> > ( cpuFieldID ); + auto gpuField = currentBlock.getData< cuda::GPUField<real_t> > ( gpuFieldID ); + + cuda::fieldCpy( *gpuField, *cpuField ); + + auto myKernel = cuda::make_kernel( &kernel_double ); + auto indexing = cuda::FieldIndexing<double>::sliceBeforeGhostLayerXYZ( *gpuField, 1, stencil::W, true ); + myKernel.addFieldIndexingParam(indexing); + myKernel(); + + cuda::fieldCpy( *cpuField, *gpuField ); + + WALBERLA_ASSERT_EQUAL( cpuField->get(0,0,0), 2 ); + } + + + //SweepTimeloop timeloop ( blocks, uint_t(1) ); + //timeloop.run(); + //GUI gui ( timeloop, blocks, argc, argv ); + //gui.run(); + + + return 0; +}
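For reference, the kernel-call pattern shared by Kernels.cu and SimpleKernelTest.cpp generalizes to other per-cell operations: a `.cu` file defines a `__global__` kernel taking a `cuda::FieldAccessor`, a host-side launcher derives grid and block dimensions from a `cuda::FieldIndexing`, and the `.cpp` test only sees the launcher (or wraps the kernel via `cuda::make_kernel`). A small sketch of an additional kernel written against the same FieldAccessor/FieldIndexing calls already used above; the names kernel_setToOne / kernel_setToOne_field are hypothetical:

// ------------------------------------------------------------------------------------------------
// Sketch (not part of the patch): an additional .cu kernel following the Kernels.cu pattern.
// Only FieldAccessor/FieldIndexing calls already used in this patch are assumed.
// ------------------------------------------------------------------------------------------------
#include "cuda/FieldAccessor.h"
#include "cuda/FieldIndexing.h"
#include "cuda/GPUField.h"
#include "stencil/Directions.h"

namespace walberla {

__global__ void kernel_setToOne( cuda::FieldAccessor<double> f )
{
   f.set( blockIdx, threadIdx );   // position the accessor on the cell handled by this thread
   f.get() = 1.0;                  // overwrite the cell value
}

void kernel_setToOne_field( const cuda::GPUField<double> & field )
{
   // Same indexing as in Kernels.cu: the slice one cell before the ghost layer, here in direction W
   auto indexing = cuda::FieldIndexing<double>::sliceBeforeGhostLayerXYZ( field, 1, stencil::W, true );
   kernel_setToOne<<< indexing.gridDim(), indexing.blockDim() >>>( indexing.gpuAccess() );
}

} // namespace walberla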