diff --git a/python/waLBerla_tests/test_cuda_comm.py b/python/waLBerla_tests/test_cuda_comm.py
new file mode 100644
index 0000000000000000000000000000000000000000..c94f2cb6ac6686e7aaad012448f4f8a696ba4e59
--- /dev/null
+++ b/python/waLBerla_tests/test_cuda_comm.py
@@ -0,0 +1,25 @@
+from waLBerla import *
+import numpy as np
+import pycuda.autoinit
+from pycuda.gpuarray import *
+from pycuda import *
+from pystencils.field import createNumpyArrayWithLayout, getLayoutOfArray
+
+blocks = createUniformBlockGrid( cells=(1,1,1), periodic=(1,1,1) )
+cuda.addGpuFieldToStorage(blocks, "gpuField", float, fSize=1, ghostLayers=1, layout=field.fzyx, usePitchedMem=False)
+# Fetch the array object for 'gpuField' on blocks[0] (presumably a pycuda GPUArray -- TODO confirm).
+gpuArr = cuda.toGpuArray(blocks[0]['gpuField'])
+# Build a host array with the same shape and layout, zero everywhere except entry [1,1,1,0], and copy it into gpuArr.
+testField = createNumpyArrayWithLayout(gpuArr.shape, getLayoutOfArray(gpuArr))
+testField[...] = 0
+testField[1,1,1,0] = 1
+gpuArr.set(testField)
+# Create a buffered communication scheme (the "D3Q27" argument is presumably the stencil name) and register 'gpuField' with it.
+scheme = createUniformBufferedScheme(blocks, "D3Q27")
+scheme.addDataToCommunicate( cuda.createPackInfo(blocks, "gpuField") )
+# Invoke the scheme once.
+scheme()
+# Fetch the array object again now that the scheme has run.
+gpuArr = cuda.toGpuArray(blocks[0]['gpuField'])
+# Expect every entry to be 1 now, not only [1,1,1,0] (presumably the periodic exchange filled the ghost layers -- verify).
+assert(np.allclose(np.ones([3,3,3,1]), gpuArr.get()))
diff --git a/src/cuda/FieldIndexing3D.h b/src/cuda/FieldIndexing3D.h
index c6637ec6cef201b4c4ad94f56f0898e03484763e..ba93f83c6148de01e22c8d8284a441228c98e8be 100644
--- a/src/cuda/FieldIndexing3D.h
+++ b/src/cuda/FieldIndexing3D.h
@@ -103,4 +103,4 @@ namespace cuda {
 } // namespace walberla
 
 
-#include "FieldIndexing.impl.h"
\ No newline at end of file
+#include "FieldIndexing3D.impl.h"
\ No newline at end of file
diff --git a/tests/cuda/CMakeLists.txt b/tests/cuda/CMakeLists.txt
index 74769cbbc2458c61dc44da91605b976904adc927..62e1c45c501d58967725551e5c39bf7b8086fb9e 100644
--- a/tests/cuda/CMakeLists.txt
+++ b/tests/cuda/CMakeLists.txt
@@ -7,18 +7,20 @@
 waLBerla_compile_test( FILES communication/GPUPackInfoTest.cpp DEPENDS blockforest )
 waLBerla_execute_test( NAME  GPUPackInfoTest )
 
-waLBerla_compile_test( FILES communication/CommTest )
-waLBerla_execute_test( NAME  CommTest )
-
 waLBerla_compile_test( FILES FieldTransferTest )
 waLBerla_execute_test( NAME  FieldTransferTest )
 
 waLBerla_compile_test( FILES SimpleKernelTest.cpp Kernels.cu DEPENDS blockforest timeloop gui )
 waLBerla_execute_test( NAME  SimpleKernelTest )
 
-waLBerla_compile_test( FILES CudaMPI DEPENDS blockforest timeloop gui )
-waLBerla_execute_test( NAME  CudaMPI )
-
 waLBerla_compile_test( FILES FieldIndexing3DTest.cpp FieldIndexing3DTest.cu )
 waLBerla_execute_test( NAME  FieldIndexing3DTest )
 
+
+
+# The following tests only work with a CUDA-enabled MPI; they are compiled, but their execution is commented out
+waLBerla_compile_test( FILES communication/CommTest )
+#waLBerla_execute_test( NAME  CommTest PROCESSES 2)
+
+waLBerla_compile_test( FILES CudaMPI DEPENDS blockforest timeloop gui )
+#waLBerla_execute_test( NAME  CudaMPI )
diff --git a/tests/cuda/communication/CommTest.cpp b/tests/cuda/communication/CommTest.cpp
index 0b7378060ad58c8813efa3d3a1e2b88d4a72ee01..ada3c766f8a3854e5ae4ec2e9e761704c1bc75a6 100644
--- a/tests/cuda/communication/CommTest.cpp
+++ b/tests/cuda/communication/CommTest.cpp
@@ -228,6 +228,8 @@ int main( int argc, char ** argv )
    debug::enterTestMode();
    walberla::Environment walberlaEnv( argc, argv );
 
+	WALBERLA_CHECK_EQUAL(MPIManager::instance()->numProcesses(), 2);
+
    hostToHost();
    hostToDevice();
    deviceToHost();