diff --git a/CMakeLists.txt b/CMakeLists.txt
index c50270da0f7a63452e26c1e7c117a9e08238ade0..c4f09268258745d87e72ecc4fc2cd370904f7694 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1108,6 +1108,12 @@ if ( WALBERLA_BUILD_WITH_CUDA )
 
         list( APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
 
+        # Search the CUDA toolkit directory for the NVIDIA Tools Extension (NVTX) library.
+        find_library( NVTX_LIBRARY
+                      NAMES         nvToolsExt
+                      PATHS         ${CUDA_TOOLKIT_ROOT_DIR}
+                      PATH_SUFFIXES lib lib64 )
+
+        # Only when the library was found: add it to SERVICE_LIBS and set the flag that
+        # waLBerlaDefinitions.in.h turns into the WALBERLA_CUDA_NVTX_AVAILABLE define.
+        if( NVTX_LIBRARY )
+            list( APPEND SERVICE_LIBS ${NVTX_LIBRARY} )
+            set ( WALBERLA_CUDA_NVTX_AVAILABLE 1 )
+        endif()
+
         if ( NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=" AND NOT WALBERLA_CXX_COMPILER_IS_MSVC )
             list ( APPEND CUDA_NVCC_FLAGS "-std=c++14" )
         endif ()
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
index e8c9299c47e4e949fbb06983cceef7e6a4109361..db5e25972442ba7fb2384d71b81dc42fe661aa8f 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
@@ -18,6 +18,7 @@
 #include "cuda/HostFieldAllocator.h"
 #include "cuda/communication/GPUPackInfo.h"
 #include "cuda/ParallelStreams.h"
+#include "cuda/NVTX.h"
 #include "core/timing/TimingPool.h"
 #include "core/timing/RemainingTimeLogger.h"
 #include "cuda/AddGPUFieldToStorage.h"
@@ -115,6 +116,7 @@ int main( int argc, char **argv )
 
          innerOuterSection.run([&]( auto innerStream )
          {
+            cuda::nameStream(innerStream, "inner stream"); // attach a name to the stream handed to this lambda (no-op without NVTX)
             for( auto &block: *blocks )
             {
                if(!disableBoundaries)
@@ -129,6 +131,7 @@ int main( int argc, char **argv )
 
          innerOuterSection.run([&]( auto outerStream )
          {
+            cuda::nameStream(outerStream, "outer stream"); // distinct label, so this stream is not confused with the inner one
             gpuComm( outerStream );
 
             for( auto &block: *blocks )
diff --git a/src/cuda/NVTX.h b/src/cuda/NVTX.h
new file mode 100644
index 0000000000000000000000000000000000000000..3943581afcb7d56076bfac683be6e7aa049f7038
--- /dev/null
+++ b/src/cuda/NVTX.h
@@ -0,0 +1,97 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file NVTX.h
+//! \ingroup cuda
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "core/DataTypes.h"
+
+#include <cstring>
+#include <string>
+
+#ifdef WALBERLA_CUDA_NVTX_AVAILABLE
+#include <nvToolsExt.h>
+#include <nvToolsExtCuda.h>
+#include <nvToolsExtCudaRt.h>
+
+namespace walberla{
+namespace cuda {
+
+//! Emits a single NVTX marker event through nvtxMarkEx.
+//!
+//! \param name   ASCII message attached to the marker
+//! \param color  RGB part of the marker color, the alpha byte is always set to 0xFF
+inline void nvtxMarker(const std::string& name, const uint32_t color=0xaaaaaa)
+{
+    nvtxEventAttributes_t eventAttrib;
+    // clear all attributes first, only the fields assigned below carry information
+    std::memset(&eventAttrib, 0, NVTX_EVENT_ATTRIB_STRUCT_SIZE);
+    eventAttrib.version = NVTX_VERSION;
+    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+    eventAttrib.colorType = NVTX_COLOR_ARGB;
+    eventAttrib.color = 0xFF000000 | color;
+    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
+    // the pointer only has to stay valid for the call below, 'name' outlives it
+    eventAttrib.message.ascii = name.c_str();
+    nvtxMarkEx(&eventAttrib);
+}
+
+//! Assigns a name to a CUDA stream by forwarding to nvtxNameCudaStreamA.
+//!
+//! \param stream  stream that receives the name
+//! \param name    name handed to NVTX as ASCII string
+inline void nameStream(const cudaStream_t & stream, const std::string & name)
+{
+    const char * const label = name.c_str();
+    nvtxNameCudaStreamA(stream, label);
+}
+
+//! Scope guard for an NVTX range: the constructor pushes a range with nvtxRangePushEx,
+//! the destructor pops it again with nvtxRangePop.
+class NvtxRange
+{
+public:
+    //! \param name   ASCII message attached to the range
+    //! \param color  RGB part of the range color, the alpha byte is always set to 0xFF
+    NvtxRange(const std::string & name, const uint32_t color=0xaaaaaa)
+    {
+        // The attributes are needed for the push call only. They are kept local because they
+        // hold a pointer into 'name', which may no longer be valid after the constructor returns.
+        nvtxEventAttributes_t eventAttrib;
+        std::memset(&eventAttrib, 0, NVTX_EVENT_ATTRIB_STRUCT_SIZE);
+        eventAttrib.version = NVTX_VERSION;
+        eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+        eventAttrib.colorType = NVTX_COLOR_ARGB;
+        eventAttrib.color = 0xFF000000 | color;
+        eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
+        eventAttrib.message.ascii = name.c_str();
+        nvtxRangePushEx(&eventAttrib);
+    }
+    ~NvtxRange()
+    {
+        nvtxRangePop();
+    }
+
+    // Every object stands for exactly one push/pop pair. A copy would pop a range it never pushed.
+    NvtxRange(const NvtxRange &) = delete;
+    NvtxRange & operator=(const NvtxRange &) = delete;
+};
+
+
+} // namespace cuda
+} // namespace walberla
+
+
+
+
+#else
+namespace walberla{
+namespace cuda {
+
+// No-op replacements, compiled when WALBERLA_CUDA_NVTX_AVAILABLE is not defined, so that
+// code using these functions builds without NVTX. Parameter names are commented out to
+// avoid unused-parameter warnings.
+// NOTE(review): nothing visible in this branch declares cudaStream_t - presumably the
+// including file provides the CUDA runtime header first; confirm.
+
+//! Does nothing, see nameStream in the NVTX branch.
+inline void nameStream(const cudaStream_t & /*stream*/, const std::string & /*name*/) {}
+
+//! Does nothing, see nvtxMarker in the NVTX branch.
+inline void nvtxMarker(const std::string& /*name*/, const uint32_t /*color*/ = 0xaaaaaa) {}
+
+//! Empty stand-in for the NVTX range guard, construction has no effect.
+class NvtxRange
+{
+public:
+    NvtxRange(const std::string & /*name*/, const uint32_t /*color*/ = 0xaaaaaa) {}
+};
+
+} // namespace cuda
+} // namespace walberla
+
+
+#endif
\ No newline at end of file
diff --git a/src/waLBerlaDefinitions.in.h b/src/waLBerlaDefinitions.in.h
index 82f38181f2388727934ded2e08f302e34369b7e1..0d3ed2a75744723af5b9bc1ecd5553252a18504f 100644
--- a/src/waLBerlaDefinitions.in.h
+++ b/src/waLBerlaDefinitions.in.h
@@ -32,6 +32,7 @@
 #cmakedefine WALBERLA_BUILD_WITH_OPENMESH
 
 #cmakedefine WALBERLA_BUILD_WITH_CUDA
+#cmakedefine WALBERLA_CUDA_NVTX_AVAILABLE
 
 #cmakedefine WALBERLA_BUILD_WITH_CODEGEN