Compare revisions
Commits on Source (2)
Showing with 1628 additions and 4 deletions
@@ -81,6 +81,8 @@ option ( WALBERLA_BUILD_WITH_PYTHON_LBM "Include LBM module into python modu
option ( WALBERLA_BUILD_WITH_LIKWID_MARKERS "Compile in markers for likwid-perfctr" )
+option ( WALBERLA_BUILD_WITH_CUDA "Enable CUDA support" )
option ( WALBERLA_BUILD_WITH_FASTMATH "Fast math" )
@@ -993,6 +995,45 @@ endif()
############################################################################################################################
##
## CUDA
##
############################################################################################################################
if ( WALBERLA_BUILD_WITH_CUDA )
# set ( BUILD_SHARED_LIBS ON )
set ( CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON )
set ( CUDA_PROPAGATE_HOST_FLAGS OFF CACHE BOOL "" )
if ( (NOT DEFINED CUDA_HOST_COMPILER) AND (${CMAKE_C_COMPILER} MATCHES "ccache") )
string ( STRIP "${CMAKE_C_COMPILER_ARG1}" stripped_compiler_string )
find_program ( CUDA_HOST_COMPILER ${stripped_compiler_string} )
endif ()
find_package ( CUDA REQUIRED )
if ( CUDA_FOUND )
include_directories ( ${CUDA_INCLUDE_DIRS} )
list ( APPEND SERVICE_LIBS ${CUDA_LIBRARIES} )
if ( NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=" )
list ( APPEND CUDA_NVCC_FLAGS "-std=c++11" )
endif ()
# Bug with gcc5 and cuda7.5:
#list( APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES -D__STRICT_ANSI__")
# NOTICE: existing cuda flags are overwritten
#set ( CUDA_NVCC_FLAGS "--compiler-bindir=/usr/bin/g++-4.3" )
#set ( CUDA_NVCC_FLAGS "-arch sm_20" )
else()
set ( WALBERLA_BUILD_WITH_CUDA FALSE )
endif ( )
endif ( )
############################################################################################################################
############################################################################################################################
##
## Testing Coverage
......
add_subdirectory(basics)
add_subdirectory(cuda)
add_subdirectory(lbm)
add_subdirectory(pde)
\ No newline at end of file
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file 01_GameOfLife_cuda.cpp
//! \author Martin Bauer <martin.bauer@fau.de>
//
//======================================================================================================================
#include "01_GameOfLife_kernels.h"
#include "cuda/HostFieldAllocator.h"
#include "blockforest/Initialization.h"
#include "blockforest/communication/UniformDirectScheme.h"
#include "core/Environment.h"
#include "cuda/HostFieldAllocator.h"
#include "cuda/FieldCopy.h"
#include "cuda/GPUField.h"
#include "cuda/Kernel.h"
#include "cuda/AddGPUFieldToStorage.h"
#include "field/AddToStorage.h"
#include "field/communication/UniformMPIDatatypeInfo.h"
#include "geometry/initializer/ScalarFieldFromGrayScaleImage.h"
#include "geometry/structured/GrayScaleImage.h"
#include "gui/Gui.h"
#include "stencil/D2Q9.h"
#include "timeloop/SweepTimeloop.h"
using namespace walberla;
typedef GhostLayerField<double,1> ScalarField;
typedef cuda::GPUField<double> GPUField;
ScalarField * createField( IBlock* const block, StructuredBlockStorage* const storage )
{
return new ScalarField (
storage->getNumberOfXCells( *block ), // number of cells in x direction per block
storage->getNumberOfYCells( *block ), // number of cells in y direction per block
storage->getNumberOfZCells( *block ), // number of cells in z direction per block
1, // one ghost layer
real_t(0), // initial value
field::fzyx, // layout
make_shared<cuda::HostFieldAllocator<double> >() // allocator for host pinned memory
);
}
class GameOfLifeSweepCUDA
{
public:
GameOfLifeSweepCUDA( BlockDataID gpuFieldSrcID, BlockDataID gpuFieldDstID )
: gpuFieldSrcID_( gpuFieldSrcID ), gpuFieldDstID_( gpuFieldDstID )
{
}
void operator() ( IBlock * block )
{
auto srcCudaField = block->getData< cuda::GPUField<real_t> > ( gpuFieldSrcID_ );
auto dstCudaField = block->getData< cuda::GPUField<real_t> > ( gpuFieldDstID_ );
auto myKernel = cuda::make_kernel( &gameOfLifeKernel );
myKernel.addFieldIndexingParam( cuda::FieldIndexing<double>::xyz( *srcCudaField ) );
myKernel.addFieldIndexingParam( cuda::FieldIndexing<double>::xyz( *dstCudaField ) );
myKernel();
srcCudaField->swapDataPointers( dstCudaField );
}
private:
BlockDataID gpuFieldSrcID_;
BlockDataID gpuFieldDstID_;
};
int main( int argc, char ** argv )
{
walberla::Environment env( argc, argv );
geometry::GrayScaleImage image ("GosperGliderGun.png");
// Create blocks
shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid (
uint_t(1) , uint_t(2), uint_t(1), // number of blocks in x,y,z direction
image.size( uint_t(0) ), image.size( uint_t(1) ) / uint_t(2), uint_t(1), // how many cells per block (x,y,z)
real_t(1), // dx: length of one cell in physical coordinates
false, // one block per process - "false" means all blocks to one process
false, false, false ); // no periodicity
BlockDataID cpuFieldID = blocks->addStructuredBlockData<ScalarField>( &createField, "CPU Field" );
// Initializing the field from an image
using geometry::initializer::ScalarFieldFromGrayScaleImage;
ScalarFieldFromGrayScaleImage fieldInitializer ( *blocks, cpuFieldID ) ;
fieldInitializer.init( image, uint_t(2), false );
BlockDataID gpuFieldSrcID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" );
BlockDataID gpuFieldDstID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Dst" );
typedef blockforest::communication::UniformDirectScheme<stencil::D2Q9 > CommScheme;
CommScheme communication( blocks );
communication.addDataToCommunicate( make_shared<field::communication::UniformMPIDatatypeInfo<GPUField> > (gpuFieldSrcID) );
// Create Timeloop
const uint_t numberOfTimesteps = uint_t(10); // number of timesteps for non-gui runs
SweepTimeloop timeloop ( blocks, numberOfTimesteps );
// Registering the sweep
timeloop.add() << BeforeFunction( communication, "Communication" )
<< Sweep( GameOfLifeSweepCUDA(gpuFieldSrcID, gpuFieldDstID ), "GameOfLifeSweep" );
timeloop.add() << Sweep( cuda::fieldCpyFunctor<ScalarField, GPUField >(cpuFieldID, gpuFieldDstID) );
GUI gui ( timeloop, blocks, argc, argv );
gui.run();
return 0;
}
namespace walberla{
/**
\page tutorial_cuda01 Tutorial - CUDA 1: Game of Life on GPU
\image html tutorial_cuda01_nvidia_titan.png
> _Note:_ This tutorial requires a CUDA-aware MPI library.
> If you get a SEGFAULT when executing this tutorial, make sure that your MPI library was built with
> CUDA support! For instructions on how to build OpenMPI with CUDA, see this [page](https://www.open-mpi.org/faq/?category=building#build-cuda).
\section cuda01_fields Creating Fields
To run a simulation on an NVIDIA graphics card, we have to allocate data on the GPU and
write a CUDA kernel that operates on this data. In this tutorial we first allocate a field on the GPU
and learn about functionality to transfer data between CPU and GPU fields.
Since initialization and output routines are usually not time critical, they are implemented
for CPU fields only. In waLBerla we set up the complete simulation using
CPU fields, copy the initialized fields over to the GPU, do the complete computation there, and, in the
end, copy everything back to do the output from the CPU field.
So only the time critical kernels have to be written in CUDA.
Thus the setup code of the GPU GameOfLife program is very similar to its CPU version, which was implemented
in a previous tutorial ( \ref tutorial_basics_03 ).
One difference is that fields which are frequently transferred from/to the GPU should be allocated with
a different field allocator: cuda::HostFieldAllocator. This allocator uses cudaHostAlloc() instead of "new",
so that the memory is marked "pinned": it is always held in RAM and cannot be swapped out to disk.
Data transfer from pinned memory is faster than from normal memory. Using this allocator is not
mandatory; the data transfer functions also work (slightly slower) with normally allocated fields.
\code
ScalarField * createField( IBlock* const block, StructuredBlockStorage* const storage )
{
return new ScalarField (
storage->getNumberOfXCells( *block ), // number of cells in x direction per block
storage->getNumberOfYCells( *block ), // number of cells in y direction per block
storage->getNumberOfZCells( *block ), // number of cells in z direction per block
1, // one ghost layer
real_t(0), // initial value
field::fzyx, // layout
make_shared<cuda::HostFieldAllocator<double> >() // allocator for host pinned memory
);
}
\endcode
Now we initialize the CPU field just like in the previous tutorial ( \ref tutorial_basics_03 ).
Then two GPU fields are created: a "source" and a "destination" field. The helper function
cuda::addGPUFieldToStorage() creates a cuda::GPUField of the same size and layout as the given
CPU field:
\code
BlockDataID gpuFieldSrcID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" );
BlockDataID gpuFieldDstID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Dst" );
\endcode
The contents of the new GPU fields are initialized with the contents of the given CPU field.
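If the GPU fields need to be refreshed later, e.g. after the CPU field was re-initialized, the
cuda::fieldCpy() function introduced in this commit can be used directly; a minimal sketch, with the IDs from above:
\code
// copy the CPU field into the GPU source field on all blocks
cuda::fieldCpy< GPUField, ScalarField >( blocks, gpuFieldSrcID, cpuFieldID );
\endcode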
\section cuda01_kernels Writing and calling CUDA kernels
For a basic understanding of the CUDA support in waLBerla please read \ref cudaPage first.
After reading this page you should know what a FieldAccessor is and how to call CUDA kernels from
.cpp files. So now we can start writing
a CUDA kernel for the Game of Life algorithm. We place it in a separate file with the ".cu" extension.
The build system then automatically detects that this file should be compiled with the CUDA C++ compiler.
The kernel gets two field accessors as arguments, one for the source and one for the destination field.
Both accessors have to be configured using the CUDA variables blockIdx and threadIdx, such that afterwards
the get() and getNeighbor() functions of the accessor class can work correctly.
\code
__global__ void gameOfLifeKernel( cuda::FieldAccessor<double> src, cuda::FieldAccessor<double> dst )
{
src.set( blockIdx, threadIdx );
dst.set( blockIdx, threadIdx );
int liveNeighbors = 0;
if ( src.getNeighbor( 1, 0,0 ) > 0.5 ) ++liveNeighbors;
if ( src.getNeighbor( -1, 0,0 ) > 0.5 ) ++liveNeighbors;
// normal Game of Life algorithm ....
// ...
}
\endcode
To call this kernel we write a thin wrapper sweep which only gets the GPU fields out of the block storage
and passes them to the CUDA kernel. We use the cuda::Kernel class from waLBerla here, so that we can write this
sweep in a normal .cpp file.
Here are the contents of this sweep:
\code
auto srcCudaField = block->getData< cuda::GPUField<real_t> > ( gpuFieldSrcID_ );
auto dstCudaField = block->getData< cuda::GPUField<real_t> > ( gpuFieldDstID_ );
auto myKernel = cuda::make_kernel( &gameOfLifeKernel );
myKernel.addFieldIndexingParam( cuda::FieldIndexing<double>::xyz( *srcCudaField ) );
myKernel.addFieldIndexingParam( cuda::FieldIndexing<double>::xyz( *dstCudaField ) );
myKernel();
srcCudaField->swapDataPointers( dstCudaField );
\endcode
All the computations are done on the GPU. The CPU field is not updated automatically! It was only used
during setup.
To see if our kernel works, we copy the contents back to the CPU field after every timestep:
\code
timeloop.add() << Sweep( cuda::fieldCpyFunctor<ScalarField, GPUField >(cpuFieldID, gpuFieldDstID) );
\endcode
Of course this makes no sense for real simulations, since the transfer time is much higher than the
time saved by doing the computation on the GPU. For production runs, one would usually transfer the
field back only every n-th timestep and e.g. write a VTK frame.
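A minimal sketch of such a setup, assuming the timeloop's singleStep() interface and omitting the
actual VTK output:
\code
const uint_t outputInterval = uint_t(100);
for( uint_t t = uint_t(0); t < numberOfTimesteps; ++t )
{
   timeloop.singleStep(); // advance one step, all data stays on the GPU
   if( (t+1) % outputInterval == uint_t(0) )
   {
      // a single transfer instead of one per timestep
      cuda::fieldCpy< ScalarField, GPUField >( blocks, cpuFieldID, gpuFieldDstID );
      // ... write a VTK frame from the CPU field here ...
   }
}
\endcode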
\section cuda01_comm Communication
In waLBerla there are two types of communication: _buffered_ and _direct_ communication.
While buffered communication first collects all data in a buffer and sends only one message per communication step and neighbor,
the direct communication strategy, which is based on MPI datatypes, uses no intermediate buffers and therefore has to send
more messages than buffered communication. For details see \ref walberla_communication .
In the tutorials up to now, only the buffered approach was used. In this tutorial, we switch to the direct communication strategy
because it allows us to use the CUDA support of the MPI library to communicate directly from/to GPU memory.
The usage of the two different communication schemes is very similar. Instead of creating a blockforest::communication::UniformBufferedScheme
we create a blockforest::communication::UniformDirectScheme.
Then we register a field::communication::UniformMPIDatatypeInfo instead of the field::communication::PackInfo.
\code
typedef blockforest::communication::UniformDirectScheme<stencil::D2Q9 > CommScheme;
CommScheme communication( blocks );
communication.addDataToCommunicate( make_shared<field::communication::UniformMPIDatatypeInfo<GPUField> > (gpuFieldSrcID) );
\endcode
This scheme also supports heterogeneous simulations, i.e. using a CPU field on
some processes and a GPU field on other processes.
*/
}
#include "../cuda/01_GameOfLife_kernels.h"
#include <iostream>
namespace walberla {
__global__ void gameOfLifeKernel( cuda::FieldAccessor<double> src, cuda::FieldAccessor<double> dst )
{
src.set( blockIdx, threadIdx );
dst.set( blockIdx, threadIdx );
// Count number of living neighbors
int liveNeighbors = 0;
if ( src.getNeighbor( 1, 0,0 ) > 0.5 ) ++liveNeighbors;
if ( src.getNeighbor( -1, 0,0 ) > 0.5 ) ++liveNeighbors;
if ( src.getNeighbor( 0,+1,0 ) > 0.5 ) ++liveNeighbors;
if ( src.getNeighbor( 0,-1,0 ) > 0.5 ) ++liveNeighbors;
if ( src.getNeighbor( -1, -1, 0 ) > 0.5 ) ++liveNeighbors;
if ( src.getNeighbor( -1, +1, 0 ) > 0.5 ) ++liveNeighbors;
if ( src.getNeighbor( +1, -1,0 ) > 0.5 ) ++liveNeighbors;
if ( src.getNeighbor( +1, +1,0 ) > 0.5 ) ++liveNeighbors;
// cell dies because of under- or over-population
if ( liveNeighbors < 2 || liveNeighbors > 3 )
dst.get() = 0.0;
else if ( liveNeighbors == 3 ) // cell comes alive
dst.get() = 1.0;
else
dst.get() = src.get();
}
} // namespace walberla
#include <iostream>
#include "cuda/FieldIndexing.h"
namespace walberla {
__global__ void gameOfLifeKernel( cuda::FieldAccessor<double> src, cuda::FieldAccessor<double> dst );
} // namespace walberla
waLBerla_link_files_to_builddir( *.prm )
waLBerla_link_files_to_builddir( *.png )
waLBerla_add_executable ( NAME 01_GameOfLife_cuda
FILES 01_GameOfLife_cuda.cpp 01_GameOfLife_kernels.cu
DEPENDS blockforest core cuda field lbm geometry timeloop gui )
\ No newline at end of file
apps/tutorials/cuda/GosperGliderGun.png (binary file, 228 B)
@@ -84,13 +84,17 @@ function ( waLBerla_add_module )
   set( hasSourceFiles FALSE )
   foreach ( sourceFile ${sourceFiles} )
-      if ( ${sourceFile} MATCHES "\\.(c|cpp)" )
+      if ( ${sourceFile} MATCHES "\\.(c|cpp|cu)" )
         set( hasSourceFiles TRUE )
      endif( )
   endforeach( )
   if ( hasSourceFiles )
-      add_library( ${moduleLibraryName} STATIC ${sourceFiles} ${otherFiles} )
+      if ( CUDA_FOUND )
+         cuda_add_library( ${moduleLibraryName} STATIC ${sourceFiles} ${otherFiles} )
+      else()
+         add_library( ${moduleLibraryName} STATIC ${sourceFiles} ${otherFiles} )
+      endif( CUDA_FOUND )
   else( )
      add_custom_target( ${moduleLibraryName} SOURCES ${sourceFiles} ${otherFiles} ) # dummy IDE target
   endif( )
@@ -194,7 +198,13 @@ function ( waLBerla_add_executable )
      endif ( )
   endif()
-   add_executable( ${ARG_NAME} ${sourceFiles} )
+   if ( CUDA_FOUND )
+      cuda_add_executable( ${ARG_NAME} ${sourceFiles} )
+   else()
+      add_executable( ${ARG_NAME} ${sourceFiles} )
+   endif()
+   #add_executable( ${ARG_NAME} ${sourceFiles} )
   target_link_modules ( ${ARG_NAME} ${ARG_DEPENDS} )
   target_link_libraries( ${ARG_NAME} ${SERVICE_LIBS} )
......
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file AddGPUFieldToStorage.h
//! \ingroup cuda
//! \author Martin Bauer <martin.bauer@fau.de>
//
//======================================================================================================================
#pragma once
#include "GPUField.h"
#include "domain_decomposition/StructuredBlockStorage.h"
#include <boost/bind.hpp>
namespace walberla {
namespace cuda {
//*******************************************************************************************************************
/*! Adds a cuda::GPUField to a StructuredBlockStorage
*
* - Similar to walberla::field::addToStorage() functions
* - created field is uninitialized
*/
//*******************************************************************************************************************
template< typename GPUField_T>
BlockDataID addGPUFieldToStorage(const shared_ptr< StructuredBlockStorage >& bs,
const std::string & identifier,
uint_t fSize,
const Layout layout = fzyx,
uint_t nrOfGhostLayers = 1,
bool usePitchedMem = true );
//*******************************************************************************************************************
/*! Adds a cuda::GPUField to a StructuredBlockStorage using data from a CPU field
*
* - adds a GPU field to a StructuredBlockStorage using a CPU field
* - sizes, number of ghostlayers and layout are the same as the CPU field
* - GPU field is initialized with the data currently stored in the CPU field
* @tparam Field_T type of the CPU field, the created GPUField will be of type cuda::GPUField<Field_T::value_type>
*/
//*******************************************************************************************************************
template< typename Field_T>
BlockDataID addGPUFieldToStorage( const shared_ptr< StructuredBlockStorage > & bs,
ConstBlockDataID cpuFieldID,
const std::string & identifier,
bool usePitchedMem = true );
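// Usage sketch (hypothetical identifiers, assuming double-valued fields):
//
//   // GPU-only field, created uninitialized with one ghost layer:
//   BlockDataID gpuOnly = addGPUFieldToStorage< GPUField<double> >( bs, "gpu field", uint_t(1) );
//
//   // GPU field mirroring an existing CPU field, including its current contents:
//   BlockDataID gpuCopy = addGPUFieldToStorage< GhostLayerField<double,1> >( bs, cpuFieldID, "gpu copy" );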
} // namespace cuda
} // namespace walberla
#include "AddGPUFieldToStorage.impl.h"
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file AddGPUFieldToStorage.impl.h
//! \ingroup cuda
//! \author Martin Bauer <martin.bauer@fau.de>
//
//======================================================================================================================
#pragma once
namespace walberla {
namespace cuda {
namespace internal
{
template< typename GPUField_T>
GPUField_T * createGPUField( const IBlock * const block,
const StructuredBlockStorage * const bs,
uint_t ghostLayers,
uint_t fSize,
const field::Layout & layout,
bool usePitchedMem )
{
return new GPUField_T( bs->getNumberOfXCells( *block ),
bs->getNumberOfYCells( *block ),
bs->getNumberOfZCells( *block ),
fSize, ghostLayers, layout, usePitchedMem );
}
template< typename Field_T>
GPUField< typename Field_T::value_type> *
createGPUFieldFromCPUField( const IBlock * const block,
const StructuredBlockStorage * const,
ConstBlockDataID cpuFieldID,
bool usePitchedMem
)
{
typedef GPUField< typename Field_T::value_type> GPUField_T;
const Field_T * f = block->getData<Field_T>( cpuFieldID );
auto gpuField = new GPUField_T( f->xSize(), f->ySize(), f->zSize(), f->fSize(),
f->nrOfGhostLayers(), f->layout(), usePitchedMem );
cuda::fieldCpy( *gpuField, *f );
return gpuField;
}
}
template< typename GPUField_T>
BlockDataID addGPUFieldToStorage(const shared_ptr< StructuredBlockStorage >& bs,
const std::string & identifier,
uint_t fSize,
const Layout layout,
uint_t nrOfGhostLayers,
bool usePitchedMem )
{
auto func = boost::bind ( internal::createGPUField<GPUField_T>, _1, _2, nrOfGhostLayers, fSize, layout, usePitchedMem );
return bs->addStructuredBlockData< GPUField_T >( func, identifier );
}
template< typename Field_T>
BlockDataID addGPUFieldToStorage( const shared_ptr< StructuredBlockStorage > & bs,
ConstBlockDataID cpuFieldID,
const std::string & identifier,
bool usePitchedMem )
{
auto func = boost::bind ( internal::createGPUFieldFromCPUField<Field_T>, _1, _2, cpuFieldID, usePitchedMem );
return bs->addStructuredBlockData< GPUField<typename Field_T::value_type> >( func, identifier );
}
} // namespace cuda
} // namespace walberla
###################################################################################################
#
# Module cuda
#
###################################################################################################
waLBerla_add_module( DEPENDS core communication domain_decomposition field stencil BUILD_ONLY_IF_FOUND CUDA )
###################################################################################################
\ No newline at end of file
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file ErrorChecking.h
//! \ingroup cuda
//! \author Martin Bauer <martin.bauer@fau.de>
//
//======================================================================================================================
#pragma once
#include "core/Abort.h"
#include <sstream>
#include <cuda_runtime.h>
namespace walberla {
namespace cuda {
#define WALBERLA_CUDA_CHECK(ans) { ::walberla::cuda::checkForError((ans), __FILE__, __LINE__); }
inline void checkForError( cudaError_t code, const std::string & callerPath, const int line )
{
if(code != cudaSuccess)
{
std::stringstream ss;
ss << "CUDA Error: " << cudaGetErrorString( code );
Abort::instance()->abort( ss.str(), callerPath, line );
}
}
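// Usage sketch: wrap every CUDA runtime call in the macro so that a failure
// aborts with the CUDA error string plus file and line information, e.g.
//
//   void * devPtr = 0;
//   WALBERLA_CUDA_CHECK( cudaMalloc( &devPtr, 1024 ) );
//   WALBERLA_CUDA_CHECK( cudaFree( devPtr ) );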
} // namespace cuda
} // namespace walberla
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file FieldAccessor.h
//! \ingroup cuda
//! \author Martin Bauer <martin.bauer@fau.de>
//
//======================================================================================================================
#pragma once
#include <cuda_runtime.h>
#include "core/DataTypes.h"
namespace walberla {
namespace cuda {
template<typename T>
class FieldAccessor
{
public:
enum IndexingScheme { FZYX, FZY, FZ, F,
ZYXF, ZYX, ZY, Z
};
FieldAccessor( char * ptr,
uint_t xOffset,
uint_t yOffset,
uint_t zOffset,
uint_t fOffset,
IndexingScheme indexingScheme )
: ptr_(ptr), xOffset_(xOffset), yOffset_(yOffset), zOffset_(zOffset),
fOffset_(fOffset), indexingScheme_(indexingScheme )
{}
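   // set() positions the accessor for the current thread; it assumes the launch
   // configuration produced by FieldIndexing<T> (innermost coordinate in
   // threadIdx.x, the remaining coordinates in blockIdx.x/y/z), where leading
   // size-one coordinates may have been shifted out of the indexing scheme.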
__device__ void set( uint3 blockIdx, uint3 threadIdx )
{
switch ( indexingScheme_)
{
case FZYX: ptr_ += blockIdx.z * fOffset_ + blockIdx.y * zOffset_ + blockIdx.x * yOffset_ + threadIdx.x * xOffset_; break;
case FZY : ptr_ += blockIdx.y * fOffset_ + blockIdx.x * zOffset_ + threadIdx.x * yOffset_; break;
case FZ : ptr_ += blockIdx.x * fOffset_ + threadIdx.x * zOffset_; break;
case F : ptr_ += threadIdx.x * fOffset_; break;
case ZYXF: ptr_ += blockIdx.z * zOffset_ + blockIdx.y * yOffset_ + blockIdx.x * xOffset_ + threadIdx.x * fOffset_; break;
case ZYX : ptr_ += blockIdx.y * zOffset_ + blockIdx.x * yOffset_ + threadIdx.x * xOffset_; break;
case ZY : ptr_ += blockIdx.x * zOffset_ + threadIdx.x * yOffset_; break;
case Z : ptr_ += threadIdx.x * zOffset_; break;
}
}
__device__ uint_t getLinearIndex( uint3 blockIdx, uint3 threadIdx, uint3 gridDim, uint3 blockDim )
{
return threadIdx.x +
blockIdx.x * blockDim.x +
blockIdx.y * blockDim.x * gridDim.x +
blockIdx.z * blockDim.x * gridDim.x * gridDim.y ;
}
// This is always true for this specific field indexing class.
__device__ __forceinline__ bool isValidPosition() { return true; }
__device__ T & get() { return * (T*)(ptr_); }
__device__ T & get( int f) { return * (T*)(ptr_ + f * fOffset_); }
__device__ T & getNeighbor( int cx, int cy, int cz ) const
{
return * (T*)( ptr_ + cx * xOffset_ +
cy * yOffset_ +
cz * zOffset_ );
}
__device__ T & getNeighbor( int cx, int cy, int cz, int cf )
{
return * (T*)( ptr_ + cx * xOffset_ +
cy * yOffset_ +
cz * zOffset_ +
cf * fOffset_ );
}
protected:
char * ptr_;
uint_t xOffset_;
uint_t yOffset_;
uint_t zOffset_;
uint_t fOffset_;
IndexingScheme indexingScheme_;
};
} // namespace cuda
} // namespace walberla
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file FieldAccessor3D.h
//! \ingroup cuda
//! \author Paulo Carvalho <prcjunior@inf.ufpr.br>
//
//======================================================================================================================
#pragma once
#include "core/DataTypes.h"
#include <cuda_runtime.h>
namespace walberla {
namespace cuda {
template<typename T>
class FieldAccessor3D
{
public:
FieldAccessor3D( char * ptr,
uint_t xOffset,
uint_t yOffset,
uint_t zOffset,
uint_t fOffset,
const dim3 & idxDim,
const dim3 & blockDim )
: ptr_( ptr ), xOffset_( xOffset ), yOffset_( yOffset ), zOffset_( zOffset ), fOffset_( fOffset ),
idxDim_( idxDim ), blockDim_( blockDim ), isValidPosition_( false )
{}
__device__ __forceinline__ void set( const uint3& blockIdx, const uint3& threadIdx )
{
uint_t x = blockIdx.x * blockDim_.x + threadIdx.x;
uint_t y = blockIdx.y * blockDim_.y + threadIdx.y;
uint_t z = blockIdx.z * blockDim_.z + threadIdx.z;
if ( x < idxDim_.x && y < idxDim_.y && z < idxDim_.z )
{
ptr_ += x * xOffset_ + y * yOffset_ + z * zOffset_;
isValidPosition_ = true;
}
}
__device__ __forceinline__ bool isValidPosition() { return isValidPosition_; }
__device__ __forceinline__ T & get() { return * (T*)(ptr_); }
__device__ __forceinline__ T & get( int f ) { return * (T*)(ptr_ + f * fOffset_); }
__device__ __forceinline__ T & getNeighbor( int cx, int cy, int cz ) const
{
return * (T*)( ptr_ + cx * xOffset_ +
cy * yOffset_ +
cz * zOffset_ );
}
__device__ __forceinline__ T & getNeighbor( int cx, int cy, int cz, int cf )
{
return * (T*)( ptr_ + cx * xOffset_ +
cy * yOffset_ +
cz * zOffset_ +
cf * fOffset_ );
}
protected:
char * ptr_;
uint_t xOffset_;
uint_t yOffset_;
uint_t zOffset_;
uint_t fOffset_;
dim3 idxDim_;
dim3 blockDim_;
bool isValidPosition_;
};
} // namespace cuda
} // namespace walberla
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file FieldAccessorXYZ.h
//! \ingroup cuda
//! \author Martin Bauer <martin.bauer@fau.de>
//
//======================================================================================================================
#pragma once
#include <cuda_runtime.h>
#include "core/DataTypes.h"
namespace walberla {
namespace cuda {
template<typename T>
class FieldAccessorXYZ
{
public:
FieldAccessorXYZ( char * ptr, size_t xOffset, size_t yOffset, size_t zOffset, size_t fOffset )
: ptr_(ptr), xOffset_(xOffset), yOffset_(yOffset), zOffset_(zOffset), fOffset_(fOffset)
{}
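   // set() assumes a launch configuration with one thread per cell in x:
   // threadIdx.x iterates over x, blockIdx.x over y and blockIdx.y over z.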
__device__ void set( uint3 blockIdx, uint3 threadIdx )
{
ptr_ += threadIdx.x * xOffset_ +
blockIdx.x * yOffset_ +
blockIdx.y * zOffset_ ;
}
__device__ T & get() { return * (T*)(ptr_); }
__device__ T & get( int f) { return * (T*)(ptr_ + f * fOffset_); }
__device__ T & getNeighbor( int cx, int cy, int cz ) const
{
return * (T*)( ptr_ + cx * (int)(xOffset_) +
cy * (int)(yOffset_) +
cz * (int)(zOffset_) );
}
__device__ T & getNeighbor( int cx, int cy, int cz, int cf )
{
return * (T*)( ptr_ + cx * (int)(xOffset_) +
cy * (int)(yOffset_) +
cz * (int)(zOffset_) +
cf * (int)(fOffset_) );
}
protected:
char * ptr_;
size_t xOffset_;
size_t yOffset_;
size_t zOffset_;
size_t fOffset_;
};
} // namespace cuda
} // namespace walberla
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file FieldCopy.h
//! \ingroup cuda
//! \author Martin Bauer <martin.bauer@fau.de>
//
//======================================================================================================================
#pragma once
#include "ErrorChecking.h"
#include "GPUField.h"
#include "domain_decomposition/StructuredBlockStorage.h"
#include "field/Field.h"
#include "field/GhostLayerField.h"
#include "core/Abort.h"
#include "core/logging/Logging.h"
#include <cuda_runtime.h>
namespace walberla {
namespace cuda {
template<typename DstType, typename SrcType>
void fieldCpy( const shared_ptr< StructuredBlockStorage > & blocks, BlockDataID dstID, ConstBlockDataID srcID )
{
for ( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
{
DstType * dst = blockIt->getData<DstType>( dstID );
const SrcType * src = blockIt->getData<SrcType>( srcID );
fieldCpy( *dst, *src );
}
}
template<typename DstType, typename SrcType>
boost::function<void()> fieldCpyFunctor( const shared_ptr< StructuredBlockStorage > & blocks,
BlockDataID dstID, ConstBlockDataID srcID )
{
return boost::bind( fieldCpy<DstType,SrcType>, blocks, dstID, srcID );
}
template<typename DstType, typename SrcType>
void fieldCpySweepFunction( BlockDataID dstID, ConstBlockDataID srcID, IBlock * block )
{
DstType * dst = block->getData<DstType>( dstID );
const SrcType * src = block->getData<SrcType>( srcID );
fieldCpy( *dst, *src );
}
template<typename DstType, typename SrcType>
boost::function<void(IBlock*)> fieldCpyFunctor( BlockDataID dstID, ConstBlockDataID srcID )
{
return boost::bind( fieldCpySweepFunction<DstType,SrcType>, dstID, srcID, _1 );
}
template<typename T, uint_t fs>
void fieldCpy( cuda::GPUField<T> & dst, const field::Field<T,fs> & src );
template<typename T, uint_t fs>
void fieldCpy( field::Field<T,fs> & dst, const cuda::GPUField<T> & src );
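// Usage sketch (hypothetical IDs): the functor variants are convenient for registering
// copy operations, e.g. as a timeloop sweep that copies a GPU field back to a CPU field:
//
//   timeloop.add() << Sweep( fieldCpyFunctor< field::Field<double,1>, GPUField<double> >( cpuID, gpuID ) );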
//===================================================================================================================
//
// Implementation
//
//===================================================================================================================
template<typename T, uint_t fs>
void fieldCpy( cuda::GPUField<T> & dst, const field::Field<T,fs> & src )
{
cudaMemcpy3DParms p;
memset( &p, 0, sizeof(p) );
if ( dst.layout() != src.layout() ) {
WALBERLA_ABORT( "Cannot copy fields with different layout" );
}
bool canCopy = ( src.layout() == fzyx &&
dst.fAllocSize() == src.fAllocSize() &&
dst.zAllocSize() == src.zAllocSize() &&
dst.yAllocSize() == src.yAllocSize() &&
dst.xSize() == src.xSize() )
||
( src.layout() == zyxf &&
dst.zAllocSize() == src.zAllocSize() &&
dst.yAllocSize() == src.yAllocSize() &&
dst.xAllocSize() == src.xAllocSize() &&
dst.fSize() == src.fSize() );
if ( !canCopy ) {
WALBERLA_ABORT("Field have to have the same size ");
}
if ( dst.layout() == fzyx )
{
p.srcPtr = make_cudaPitchedPtr( (void*)(src.data()), // pointer
sizeof(T) * src.xAllocSize(), // pitch
src.xAllocSize(), // inner dimension size
src.yAllocSize() ); // next outer dimension size
p.extent.width = std::min( dst.xAllocSize(), src.xAllocSize() ) * sizeof(T);
p.extent.height = dst.yAllocSize();
p.extent.depth = dst.zAllocSize() * dst.fAllocSize();
}
else
{
p.srcPtr = make_cudaPitchedPtr( (void*)(src.data()), // pointer
sizeof(T) * src.fAllocSize(), // pitch
src.fAllocSize(), // inner dimension size
src.xAllocSize() ); // next outer dimension size
p.extent.width = std::min( dst.fAllocSize(), src.fAllocSize() ) * sizeof(T);
p.extent.height = dst.xAllocSize();
p.extent.depth = dst.yAllocSize() * dst.zAllocSize();
}
p.dstPtr = dst.pitchedPtr();
p.kind = cudaMemcpyHostToDevice;
WALBERLA_CUDA_CHECK( cudaMemcpy3D( &p ) );
}
template<typename T, uint_t fs>
void fieldCpy( field::Field<T,fs> & dst, const cuda::GPUField<T> & src )
{
cudaMemcpy3DParms p;
memset( &p, 0, sizeof(p) );
if ( dst.layout() != src.layout() ) {
WALBERLA_ABORT( "Cannot copy fields with different layout" );
}
bool canCopy = ( src.layout() == fzyx &&
dst.fAllocSize() == src.fAllocSize() &&
dst.zAllocSize() == src.zAllocSize() &&
dst.yAllocSize() == src.yAllocSize() &&
dst.xSize() == src.xSize() )
||
( src.layout() == zyxf &&
dst.zAllocSize() == src.zAllocSize() &&
dst.yAllocSize() == src.yAllocSize() &&
dst.xAllocSize() == src.xAllocSize() &&
dst.fSize() == src.fSize() );
if ( !canCopy ) {
WALBERLA_ABORT("Field have to have the same size ");
}
if ( dst.layout() == fzyx )
{
p.dstPtr = make_cudaPitchedPtr( (void*)(dst.data()), // pointer
sizeof(T) * dst.xAllocSize(), // pitch
dst.xAllocSize(), // inner dimension size
dst.yAllocSize() ); // next outer dimension size
p.extent.width = std::min( dst.xAllocSize(), src.xAllocSize() ) * sizeof(T);
p.extent.height = dst.yAllocSize();
p.extent.depth = dst.zAllocSize() * dst.fAllocSize();
}
else
{
p.dstPtr = make_cudaPitchedPtr( (void*)(dst.data()), // pointer
sizeof(T) * dst.fAllocSize(), // pitch
dst.fAllocSize(), // inner dimension size
dst.xAllocSize() ); // next outer dimension size
p.extent.width = std::min( dst.fAllocSize(), src.fAllocSize() ) * sizeof(T);
p.extent.height = dst.xAllocSize();
p.extent.depth = dst.yAllocSize() * dst.zAllocSize();
}
p.srcPtr = src.pitchedPtr();
p.kind = cudaMemcpyDeviceToHost;
WALBERLA_CUDA_CHECK( cudaMemcpy3D( &p ) );
}
} // namespace cuda
} // namespace walberla
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file FieldIndexing.cpp
//! \ingroup cuda
//! \author Martin Bauer <martin.bauer@fau.de>
//
//======================================================================================================================
#include "FieldIndexing.h"
#include "GPUTypesExplicitInstantiation.h"
#include "GPUField.h"
#include "core/cell/CellInterval.h"
#include "core/debug/Debug.h"
#include "core/logging/Logging.h"
#include "field/Layout.h"
#include <cuda_runtime.h>
#include <limits>
#include <cmath>
namespace walberla {
namespace cuda {
template< typename T>
FieldIndexing<T>::FieldIndexing ( const GPUField<T> & field,
dim3 _blockDim, dim3 _gridDim,
const FieldAccessor<T> _gpuAccess )
: field_( field ),
blockDim_( _blockDim ),
gridDim_( _gridDim ),
gpuAccess_( _gpuAccess )
{
WALBERLA_DEBUG_SECTION()
{
cudaDeviceProp prop;
int count;
cudaGetDeviceCount(&count);
int threadsPerBlock = std::numeric_limits<int>::max();
for (int i = 0; i < count; i++) {
cudaGetDeviceProperties(&prop, i);
threadsPerBlock = std::min( prop.maxThreadsPerBlock, threadsPerBlock );
}
WALBERLA_ASSERT_LESS_MSG( int_c( blockDim_.x ), threadsPerBlock,
"InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " <<
"than the maximal thread count per CUDA block." );
}
}
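// If the innermost (fastest) coordinate has extent one, every thread block would
// contain only a single thread. The loop below therefore promotes the next-outer
// coordinate to the thread index and drops the leading coordinate from the
// indexing scheme, until the fastest coordinate is larger than one or only a
// single element remains.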
template< typename T>
void shiftCoordinatesWhileFastestCoordHasSizeOne( typename FieldAccessor<T>::IndexingScheme & indexing, dim3 & gridDim, dim3 & blockDim )
{
bool runLoop = true;
while( blockDim.x == 1 && runLoop )
{
blockDim.x = gridDim.x;
gridDim.x = gridDim.y;
gridDim.y = gridDim.z;
gridDim.z = 1;
switch ( indexing ) {
case FieldAccessor<T>::FZYX: indexing = FieldAccessor<T>::FZY; break;
case FieldAccessor<T>::FZY : indexing = FieldAccessor<T>::FZ ; break;
case FieldAccessor<T>::FZ : indexing = FieldAccessor<T>::F ; break;
case FieldAccessor<T>::ZYXF: indexing = FieldAccessor<T>::ZYX ; break;
case FieldAccessor<T>::ZYX : indexing = FieldAccessor<T>::ZY ; break;
case FieldAccessor<T>::ZY : indexing = FieldAccessor<T>::Z ; break;
// iteration goes only over a single element - stop the loop
case FieldAccessor<T>::Z: runLoop = false; break;
case FieldAccessor<T>::F: runLoop = false; break;
}
}
}
template< typename T>
FieldIndexing<T> FieldIndexing<T>::interval ( const GPUField<T> & f, const CellInterval & ci, int fBegin, int fEnd )
{
uint_t xOffset, yOffset, zOffset, fOffset;
if ( f.layout() == field::zyxf )
{
fOffset = sizeof(T);
xOffset = f.pitchedPtr().pitch;
yOffset = xOffset * f.xAllocSize();
zOffset = yOffset * f.yAllocSize();
}
else
{
xOffset = sizeof(T);
yOffset = f.pitchedPtr().pitch;
zOffset = yOffset * f.yAllocSize();
fOffset = zOffset * f.zAllocSize();
}
char * data = (char*)f.pitchedPtr().ptr;
// Jump over ghost cells to first inner cell
cell_idx_t gl = cell_idx_c( f.nrOfGhostLayers() );
data += ( ci.xMin() + gl )* xOffset +
( ci.yMin() + gl )* yOffset +
( ci.zMin() + gl )* zOffset;
dim3 gridDim;
dim3 blockDim;
typename FieldAccessor<T>::IndexingScheme firstCoord;
if ( f.layout() == fzyx )
{
firstCoord = FieldAccessor<T>::FZYX;
blockDim = dim3 ( (unsigned int)ci.xSize(), 1, 1 );
gridDim = dim3 ( (unsigned int)ci.ySize(), (unsigned int)ci.zSize(), (unsigned int)( fEnd - fBegin) );
}
else
{
firstCoord = FieldAccessor<T>::ZYXF;
blockDim = dim3 ( (unsigned int)(fEnd - fBegin), 1, 1 );
gridDim = dim3 ( (unsigned int)ci.xSize(), (unsigned int)ci.ySize(), (unsigned int)ci.zSize() );
}
shiftCoordinatesWhileFastestCoordHasSizeOne<T>( firstCoord, gridDim, blockDim );
return FieldIndexing<T> ( f, blockDim, gridDim,
FieldAccessor<T>( data, xOffset, yOffset, zOffset, fOffset, firstCoord ) );
}
template< typename T>
FieldIndexing<T> FieldIndexing<T>::xyz ( const GPUField<T> & f )
{
CellInterval ci ( 0,0,0,
cell_idx_c( f.xSize() ) - 1,
cell_idx_c( f.ySize() ) - 1,
cell_idx_c( f.zSize() ) - 1 );
return interval( f, ci, 0,1 );
}
template< typename T>
FieldIndexing<T> FieldIndexing<T>::withGhostLayerXYZ( const GPUField<T> & f, uint_t numGhostLayers )
{
cell_idx_t gl = std::min( cell_idx_c(numGhostLayers), cell_idx_c( f.nrOfGhostLayers() ) );
CellInterval ci ( -gl,-gl,-gl,
cell_idx_c( f.xSize() ) + gl - 1,
cell_idx_c( f.ySize() ) + gl - 1,
cell_idx_c( f.zSize() ) + gl - 1 );
return interval( f, ci, 0, 1 );
}
template< typename T>
FieldIndexing<T> FieldIndexing<T>::ghostLayerOnlyXYZ( const GPUField<T> & f, uint_t thickness,
stencil::Direction dir, bool fullSlice )
{
CellInterval ci;
f.getGhostRegion( dir, ci, cell_idx_c(thickness), fullSlice );
return interval( f, ci, 0, 1 );
}
template< typename T>
FieldIndexing<T> FieldIndexing<T>::sliceBeforeGhostLayerXYZ( const GPUField<T> & f, uint_t thickness,
stencil::Direction dir, bool fullSlice )
{
CellInterval ci;
f.getSliceBeforeGhostLayer( dir, ci, cell_idx_c(thickness), fullSlice );
return interval( f, ci, 0, 1 );
}
template< typename T>
FieldIndexing<T> FieldIndexing<T>::sliceXYZ( const GPUField<T> & f, cell_idx_t distance, uint_t thickness,
stencil::Direction dir, bool fullSlice )
{
CellInterval ci;
f.getSlice( dir, ci, distance, cell_idx_c(thickness), fullSlice );
return interval( f, ci );
}
template< typename T>
FieldIndexing<T> FieldIndexing<T>::allInner ( const GPUField<T> & f )
{
CellInterval ci ( 0,0,0,
cell_idx_c( f.xSize() ) - 1,
cell_idx_c( f.ySize() ) - 1,
cell_idx_c( f.zSize() ) - 1 );
return interval( f, ci, 0, cell_idx_c( f.fSize() ) );
}
template< typename T>
FieldIndexing<T> FieldIndexing<T>::allWithGhostLayer ( const GPUField<T> & f )
{
cell_idx_t gl = cell_idx_c( f.nrOfGhostLayers() );
CellInterval ci ( -gl,-gl,-gl,
cell_idx_c( f.xSize() ) + gl - 1,
cell_idx_c( f.ySize() ) + gl - 1,
cell_idx_c( f.zSize() ) + gl - 1 );
return interval( f, ci, 0, cell_idx_c( f.fSize() ) );
}
template< typename T>
FieldIndexing<T> FieldIndexing<T>::all ( const GPUField<T> & f, const cell::CellInterval & ci )
{
return interval( f, ci, 0, cell_idx_c( f.fSize() ) );
}
GPU_CLASS_TEMPLATE_INSTANTIATION( FieldIndexing )
} // namespace cuda
} // namespace walberla
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file FieldIndexing.h
//! \ingroup cuda
//! \author Martin Bauer <martin.bauer@fau.de>
//! \brief Indexing scheme that executes all elements of the inner coordinate within one thread block
//
//======================================================================================================================
#pragma once
#include "FieldAccessor.h"
#include "stencil/Directions.h"
#include <cuda_runtime.h>
namespace walberla { namespace cell { class CellInterval; } }
namespace walberla {
namespace cuda {
// Forward Declarations
template< typename T> class GPUField;
template<typename T>
class FieldIndexing
{
public:
//** Kernel call ******************************************************************************************
/*! \name Kernel call */
//@{
dim3 blockDim() const { return blockDim_; }
dim3 gridDim () const { return gridDim_; }
const FieldAccessor<T> & gpuAccess() const { return gpuAccess_; }
//@}
//****************************************************************************************************************
//** Creation *********************************************************************************************
/*! \name Creation */
//@{
static FieldIndexing<T> interval ( const GPUField<T> & f,
const cell::CellInterval & ci,
int fBegin=0, int fEnd=1 );
static FieldIndexing<T> xyz ( const GPUField<T> & f );
static FieldIndexing<T> withGhostLayerXYZ ( const GPUField<T> & f, uint_t numGhostLayers );
static FieldIndexing<T> ghostLayerOnlyXYZ ( const GPUField<T> & f, uint_t thickness,
stencil::Direction dir, bool fullSlice = false );
static FieldIndexing<T> sliceBeforeGhostLayerXYZ( const GPUField<T> & f, uint_t thickness,
stencil::Direction dir, bool fullSlice = false );
static FieldIndexing<T> sliceXYZ ( const GPUField<T> & f, cell_idx_t distance, uint_t thickness,
stencil::Direction dir, bool fullSlice = false );
static FieldIndexing<T> allInner ( const GPUField<T> & f );
static FieldIndexing<T> allWithGhostLayer ( const GPUField<T> & f );
static FieldIndexing<T> all ( const GPUField<T> & f, const cell::CellInterval & ci );
//@}
//****************************************************************************************************************
protected:
FieldIndexing ( const GPUField<T> & field,
dim3 _blockDim, dim3 _gridDim,
const FieldAccessor<T> _gpuAccess );
const GPUField<T> & field_;
dim3 blockDim_;
dim3 gridDim_;
FieldAccessor<T> gpuAccess_;
};
} // namespace cuda
} // namespace walberla
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file FieldIndexing3D.cpp
//! \ingroup cuda
//! \author Paulo Carvalho <prcjunior@inf.ufpr.br>
//
//======================================================================================================================
#include "FieldIndexing3D.h"
#include "GPUTypesExplicitInstantiation.h"
#include "GPUField.h"
#include "core/cell/CellInterval.h"
#include "core/debug/Debug.h"
#include "core/logging/Logging.h"
#include "field/Layout.h"
#include <cuda_runtime.h>
#include <limits>
#include <cmath>
#define BLOCK_SIZE_X uint_t( 32 )
#define BLOCK_SIZE_Y uint_t( 2 )
#define BLOCK_SIZE_Z uint_t( 2 )
namespace walberla {
namespace cuda {
// Returns ( a % b != 0 ) ? ( a / b + 1 ) : ( a / b )
inline unsigned int iDivUp( unsigned int a, unsigned int b ) { return ( a + b - 1 ) / b; }
dim3 FieldIndexing3DBase::preferredBlockDim_( BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z );
template< typename T>
FieldIndexing3D<T>::FieldIndexing3D( const GPUField<T> & field,
const dim3 & _blockDim, const dim3 & _gridDim,
const FieldAccessor3D<T> & _gpuAccess )
: field_( field ),
blockDim_( _blockDim ),
gridDim_( _gridDim ),
gpuAccess_( _gpuAccess )
{
WALBERLA_DEBUG_SECTION()
{
cudaDeviceProp prop;
int count;
cudaGetDeviceCount(&count);
int threadsPerBlock = std::numeric_limits<int>::max();
for (int i = 0; i < count; i++) {
cudaGetDeviceProperties(&prop, i);
threadsPerBlock = std::min( prop.maxThreadsPerBlock, threadsPerBlock );
}
WALBERLA_ASSERT_LESS_MSG( int_c( blockDim_.x ), threadsPerBlock,
"InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " <<
"than the maximal thread count per CUDA block." );
}
}
template< typename T>
FieldIndexing3D<T> FieldIndexing3D<T>::interval( const GPUField<T> & f, const CellInterval & ci )
{
uint_t xOffset, yOffset, zOffset, fOffset;
WALBERLA_ASSERT( f.layout() == field::fzyx );
xOffset = sizeof(T);
yOffset = f.pitchedPtr().pitch;
zOffset = yOffset * f.yAllocSize();
fOffset = zOffset * f.zAllocSize();
char * data = (char*)f.pitchedPtr().ptr;
// position data according to ci
cell_idx_t gl = cell_idx_c( f.nrOfGhostLayers() );
data += ( ci.xMin() + gl ) * xOffset +
( ci.yMin() + gl ) * yOffset +
( ci.zMin() + gl ) * zOffset;
dim3 idxDim( (unsigned int)ci.xSize(), (unsigned int)ci.ySize(), (unsigned int)ci.zSize() );
unsigned int bx = std::min( preferredBlockDim_.x, idxDim.x );
unsigned int by = std::min( preferredBlockDim_.y, idxDim.y );
unsigned int bz = std::min( preferredBlockDim_.z, idxDim.z );
dim3 gridDim( iDivUp( idxDim.x, bx ),
iDivUp( idxDim.y, by ),
iDivUp( idxDim.z, bz ) );
dim3 blockDim( bx, by, bz );
return FieldIndexing3D<T>( f, blockDim, gridDim,
FieldAccessor3D<T>( data, xOffset, yOffset, zOffset, fOffset,
idxDim, blockDim ) );
}
template< typename T>
FieldIndexing3D<T> FieldIndexing3D<T>::xyz ( const GPUField<T> & f )
{
CellInterval ci( 0,0,0,
cell_idx_c( f.xSize() ) - 1,
cell_idx_c( f.ySize() ) - 1,
cell_idx_c( f.zSize() ) - 1 );
return interval( f, ci );
}
template< typename T>
FieldIndexing3D<T> FieldIndexing3D<T>::withGhostLayerXYZ( const GPUField<T> & f, uint_t numGhostLayers )
{
cell_idx_t gl = std::min( cell_idx_c( numGhostLayers ), cell_idx_c( f.nrOfGhostLayers() ) );
CellInterval ci( -gl,-gl,-gl,
cell_idx_c( f.xSize() ) + gl - 1,
cell_idx_c( f.ySize() ) + gl - 1,
cell_idx_c( f.zSize() ) + gl - 1 );
return interval( f, ci );
}
template< typename T>
FieldIndexing3D<T> FieldIndexing3D<T>::ghostLayerOnlyXYZ( const GPUField<T> & f, uint_t thickness,
stencil::Direction dir, bool fullSlice )
{
CellInterval ci;
f.getGhostRegion( dir, ci, cell_idx_c(thickness), fullSlice );
return interval( f, ci );
}
template< typename T>
FieldIndexing3D<T> FieldIndexing3D<T>::sliceBeforeGhostLayerXYZ( const GPUField<T> & f, uint_t thickness,
stencil::Direction dir, bool fullSlice )
{
CellInterval ci;
f.getSliceBeforeGhostLayer( dir, ci, cell_idx_c(thickness), fullSlice );
return interval( f, ci );
}
template< typename T>
FieldIndexing3D<T> FieldIndexing3D<T>::sliceXYZ( const GPUField<T> & f, cell_idx_t distance, uint_t thickness,
stencil::Direction dir, bool fullSlice )
{
CellInterval ci;
f.getSlice( dir, ci, distance, cell_idx_c(thickness), fullSlice );
return interval( f, ci );
}
template< typename T>
FieldIndexing3D<T> FieldIndexing3D<T>::intervalXYZ( const GPUField<T> & f, const cell::CellInterval & ci )
{
return interval( f, ci );
}
GPU_CLASS_TEMPLATE_INSTANTIATION( FieldIndexing3D )
} // namespace cuda
} // namespace walberla