diff --git a/__init__.py b/__init__.py
index d7ede400e193784e4c0b0c4c8ab542737f56ecb6..c44788a53a65a4459bccdbcd5df2583ee6ad835c 100644
--- a/__init__.py
+++ b/__init__.py
@@ -13,6 +13,7 @@ from .kernel_decorator import kernel
 from .stencils import visualize_stencil_expression
 from . import fd
 
+
 __all__ = ['Field', 'FieldType', 'fields',
            'TypedSymbol',
            'make_slice',
diff --git a/backends/cbackend.py b/backends/cbackend.py
index a11ef1305356f9897f0a9b944cb371874f239b74..83dfc48ff65aa88bee5abe2be26154f978c9dc01 100644
--- a/backends/cbackend.py
+++ b/backends/cbackend.py
@@ -14,10 +14,10 @@ from pystencils.astnodes import Node, KernelFunction
 from pystencils.data_types import create_type, PointerType, get_type_of_expression, VectorType, cast_func, \
     vector_memory_access
 
-__all__ = ['generate_c', 'CustomCppCode', 'PrintNode', 'get_headers', 'CustomSympyPrinter']
+__all__ = ['generate_c', 'CustomCodeNode', 'PrintNode', 'get_headers', 'CustomSympyPrinter']
 
 
-def generate_c(ast_node: Node, signature_only: bool = False) -> str:
+def generate_c(ast_node: Node, signature_only: bool = False, dialect='c') -> str:
     """Prints an abstract syntax tree node as C or CUDA code.
 
     This function does not need to distinguish between C, C++ or CUDA code, it just prints 'C-like' code as encoded
@@ -27,12 +27,13 @@ def generate_c(ast_node: Node, signature_only: bool = False) -> str:
     Args:
         ast_node:
         signature_only:
-
+        dialect: 'c' or 'cuda'
     Returns:
         C-like code for the ast node and its descendants
     """
     printer = CBackend(signature_only=signature_only,
-                       vector_instruction_set=ast_node.instruction_set)
+                       vector_instruction_set=ast_node.instruction_set,
+                       dialect=dialect)
     return printer(ast_node)
 
 
@@ -55,16 +56,15 @@ def get_headers(ast_node: Node) -> Set[str]:
 # --------------------------------------- Backend Specific Nodes -------------------------------------------------------
 
 
-class CustomCppCode(Node):
+class CustomCodeNode(Node):
     def __init__(self, code, symbols_read, symbols_defined, parent=None):
-        super(CustomCppCode, self).__init__(parent=parent)
+        super(CustomCodeNode, self).__init__(parent=parent)
         self._code = "\n" + code
         self._symbolsRead = set(symbols_read)
         self._symbolsDefined = set(symbols_defined)
         self.headers = []
 
-    @property
-    def code(self):
+    def get_code(self, dialect, vector_instruction_set):
         return self._code
 
     @property
@@ -80,7 +80,7 @@ class CustomCppCode(Node):
         return self.symbols_defined - self._symbolsRead
 
 
-class PrintNode(CustomCppCode):
+class PrintNode(CustomCodeNode):
     # noinspection SpellCheckingInspection
     def __init__(self, symbol_to_print):
         code = '\nstd::cout << "%s  =  " << %s << std::endl; \n' % (symbol_to_print.name, symbol_to_print.name)
@@ -95,7 +95,7 @@ class PrintNode(CustomCppCode):
 class CBackend:
 
     def __init__(self, sympy_printer=None,
-                 signature_only=False, vector_instruction_set=None):
+                 signature_only=False, vector_instruction_set=None, dialect='c'):
         if sympy_printer is None:
             if vector_instruction_set is not None:
                 self.sympy_printer = VectorizedCustomSympyPrinter(vector_instruction_set)
@@ -104,13 +104,14 @@ class CBackend:
         else:
             self.sympy_printer = sympy_printer
 
-        self._vectorInstructionSet = vector_instruction_set
+        self._vector_instruction_set = vector_instruction_set
         self._indent = "   "
+        self._dialect = dialect
         self._signatureOnly = signature_only
 
     def __call__(self, node):
         prev_is = VectorType.instruction_set
-        VectorType.instruction_set = self._vectorInstructionSet
+        VectorType.instruction_set = self._vector_instruction_set
         result = str(self._print(node))
         VectorType.instruction_set = prev_is
         return result
@@ -120,7 +121,6 @@ class CBackend:
             method_name = "_print_" + cls.__name__
             if hasattr(self, method_name):
                 return getattr(self, method_name)(node)
-
         raise NotImplementedError("CBackend does not support node of type " + str(type(node)))
 
     def _print_KernelFunction(self, node):
@@ -170,8 +170,8 @@ class CBackend:
                 else:
                     rhs = node.rhs
 
-                return self._vectorInstructionSet[instr].format("&" + self.sympy_printer.doprint(node.lhs.args[0]),
-                                                                self.sympy_printer.doprint(rhs)) + ';'
+                return self._vector_instruction_set[instr].format("&" + self.sympy_printer.doprint(node.lhs.args[0]),
+                                                                  self.sympy_printer.doprint(rhs)) + ';'
             else:
                 return "%s = %s;" % (self.sympy_printer.doprint(node.lhs), self.sympy_printer.doprint(node.rhs))
 
@@ -191,9 +191,8 @@ class CBackend:
         align = 64
         return "free(%s - %d);" % (self.sympy_printer.doprint(node.symbol.name), node.offset(align))
 
-    @staticmethod
-    def _print_CustomCppCode(node):
-        return node.code
+    def _print_CustomCodeNode(self, node):
+        return node.get_code(self._dialect, self._vector_instruction_set)
 
     def _print_Conditional(self, node):
         condition_expr = self.sympy_printer.doprint(node.condition_expr)
diff --git a/boundaries/boundaryhandling.py b/boundaries/boundaryhandling.py
index 69c992c3b6d5a1ec32425496625b2e08fd3339d2..384c10d4ff30b0ad983fac2dabd28a72df54a979 100644
--- a/boundaries/boundaryhandling.py
+++ b/boundaries/boundaryhandling.py
@@ -2,7 +2,7 @@ import numpy as np
 import sympy as sp
 from pystencils.assignment import Assignment
 from pystencils import Field, TypedSymbol, create_indexed_kernel
-from pystencils.backends.cbackend import CustomCppCode
+from pystencils.backends.cbackend import CustomCodeNode
 from pystencils.boundaries.createindexlist import numpy_data_type_for_boundary_object, create_boundary_index_array
 from pystencils.cache import memorycache
 from pystencils.data_types import create_type
@@ -379,7 +379,7 @@ class BoundaryDataSetter:
         return self.index_array[item]
 
 
-class BoundaryOffsetInfo(CustomCppCode):
+class BoundaryOffsetInfo(CustomCodeNode):
 
     # --------------------------- Functions to be used by boundaries --------------------------
 
diff --git a/cpu/cpujit.py b/cpu/cpujit.py
index d5c38629be18bf239d1be5d4c8d56edd620e3fdc..56cd2cc3a728e36a2aca585599fcc7066b226051 100644
--- a/cpu/cpujit.py
+++ b/cpu/cpujit.py
@@ -61,6 +61,7 @@ from sysconfig import get_paths
 from pystencils import FieldType
 from pystencils.backends.cbackend import generate_c, get_headers
 from pystencils.utils import file_handle_for_atomic_write, atomic_file_write
+from pystencils.include import get_pystencils_include_path
 
 
 def make_python_function(kernel_function_node):
@@ -463,7 +464,7 @@ class KernelWrapper:
 
 def compile_module(code, code_hash, base_dir):
     compiler_config = get_compiler_config()
-    extra_flags = ['-I' + get_paths()['include']]
+    extra_flags = ['-I' + get_paths()['include'], '-I' + get_pystencils_include_path()]
 
     if compiler_config['os'].lower() == 'windows':
         function_prefix = '__declspec(dllexport)'
@@ -510,7 +511,7 @@ def compile_module(code, code_hash, base_dir):
 
 def compile_and_load(ast):
     cache_config = get_cache_config()
-    code_hash_str = "mod_" + hashlib.sha256(generate_c(ast).encode()).hexdigest()
+    code_hash_str = "mod_" + hashlib.sha256(generate_c(ast, dialect='c').encode()).hexdigest()
     code = ExtensionModuleCode(module_name=code_hash_str)
     code.add_function(ast, ast.function_name)
 
diff --git a/gpucuda/cudajit.py b/gpucuda/cudajit.py
index 8383813a41f65c0db60940ac03dc999e2eb8853e..ee40bd0c5ae81130151fa9c0b0e083a335ef6d3c 100644
--- a/gpucuda/cudajit.py
+++ b/gpucuda/cudajit.py
@@ -1,8 +1,9 @@
 import numpy as np
-from pystencils.backends.cbackend import generate_c
+from pystencils.backends.cbackend import generate_c, get_headers
 from pystencils.kernelparameters import FieldPointerSymbol
 from pystencils.data_types import StructType
 from pystencils.field import FieldType
+from pystencils.include import get_pystencils_include_path
 
 
 def make_python_function(kernel_function_node, argument_dict=None):
@@ -25,12 +26,15 @@ def make_python_function(kernel_function_node, argument_dict=None):
     if argument_dict is None:
         argument_dict = {}
 
-    code = "#include <cstdint>\n"
+    header_list = ['<stdint.h>'] + list(get_headers(kernel_function_node))
+    includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
+
+    code = includes + "\n"
     code += "#define FUNC_PREFIX __global__\n"
     code += "#define RESTRICT __restrict__\n\n"
-    code += str(generate_c(kernel_function_node))
-
-    mod = SourceModule(code, options=["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"])
+    code += str(generate_c(kernel_function_node, dialect='cuda'))
+    mod = SourceModule(code, options=["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"],
+                       include_dirs=[get_pystencils_include_path()])
     func = mod.get_function(kernel_function_node.function_name)
 
     parameters = kernel_function_node.get_parameters()
diff --git a/include/__init__.py b/include/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03a19735db12d1de30f46104f85c0cb9f7f6da20
--- /dev/null
+++ b/include/__init__.py
@@ -0,0 +1,5 @@
+import os
+
+
+def get_pystencils_include_path():
+    return os.path.dirname(os.path.realpath(__file__))
diff --git a/include/philox_rand.h b/include/philox_rand.h
new file mode 100644
index 0000000000000000000000000000000000000000..bbf1c1ac0ed015539743e4f027ed875209d08f61
--- /dev/null
+++ b/include/philox_rand.h
@@ -0,0 +1,104 @@
+#include <cstdint>
+
+#ifndef __CUDA_ARCH__
+#define QUALIFIERS inline
+#else
+#define QUALIFIERS static __forceinline__ __device__
+#endif
+
+#define PHILOX_W32_0   (0x9E3779B9)
+#define PHILOX_W32_1   (0xBB67AE85)
+#define PHILOX_M4x32_0 (0xD2511F53)
+#define PHILOX_M4x32_1 (0xCD9E8D57)
+#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
+#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
+
+typedef std::uint32_t uint32;
+typedef std::uint64_t uint64;
+
+
+QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
+{
+#ifndef __CUDA_ARCH__
+    // host code
+    uint64 product = ((uint64)a) * ((uint64)b);
+    *hip = product >> 32;
+    return (uint32)product;
+#else
+    // device code
+    *hip = __umulhi(a,b);
+    return a*b;
+#endif
+}
+
+QUALIFIERS void _philox4x32round(uint32* ctr, uint32* key)
+{
+    uint32 hi0;
+    uint32 hi1;
+    uint32 lo0 = mulhilo32(PHILOX_M4x32_0, ctr[0], &hi0);
+    uint32 lo1 = mulhilo32(PHILOX_M4x32_1, ctr[2], &hi1);
+
+    ctr[0] = hi1^ctr[1]^key[0];
+    ctr[1] = lo1;
+    ctr[2] = hi0^ctr[3]^key[1];
+    ctr[3] = lo0;
+}
+
+QUALIFIERS void _philox4x32bumpkey(uint32* key)
+{
+    key[0] += PHILOX_W32_0;
+    key[1] += PHILOX_W32_1;
+}
+
+QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y)
+{
+    unsigned long long z = (unsigned long long)x ^
+        ((unsigned long long)y << (53 - 32));
+    return z * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0);
+}
+
+
+QUALIFIERS void philox_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
+                               uint32 key0, uint32 key1, double & rnd1, double & rnd2)
+{
+    uint32 key[2] = {key0, key1};
+    uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
+    _philox4x32round(ctr, key);                           // 1
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 2
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 3
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 4
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 5
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 6
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 7
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 8
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 9
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 10
+
+    rnd1 = _uniform_double_hq(ctr[0], ctr[1]);
+    rnd2 = _uniform_double_hq(ctr[2], ctr[3]);
+}
+
+
+
+QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
+                              uint32 key0, uint32 key1,
+                              float & rnd1, float & rnd2, float & rnd3, float & rnd4)
+{
+    uint32 key[2] = {key0, key1};
+    uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
+    _philox4x32round(ctr, key);                           // 1
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 2
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 3
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 4
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 5
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 6
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 7
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 8
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 9
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 10
+
+    rnd1 = ctr[0] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
+    rnd2 = ctr[1] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
+    rnd3 = ctr[2] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
+    rnd4 = ctr[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
+}
\ No newline at end of file
diff --git a/kerncraft_coupling/generate_benchmark.py b/kerncraft_coupling/generate_benchmark.py
index ed0e65999efd15a08e4f95de46913640033f1644..2c27f3b0e7e5412fb7cdda248cf1072b8e835ef7 100644
--- a/kerncraft_coupling/generate_benchmark.py
+++ b/kerncraft_coupling/generate_benchmark.py
@@ -100,7 +100,7 @@ def generate_benchmark(ast, likwid=False):
 
     args = {
         'likwid': likwid,
-        'kernel_code': generate_c(ast),
+        'kernel_code': generate_c(ast, dialect='c'),
         'kernelName': ast.function_name,
         'fields': fields,
         'constants': constants,
diff --git a/random.py b/random.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb3f7e29dde17e74ed4cf6548ccd16875cd5624c
--- /dev/null
+++ b/random.py
@@ -0,0 +1,111 @@
+import sympy as sp
+import numpy as np
+from pystencils import TypedSymbol
+from pystencils.astnodes import LoopOverCoordinate
+from pystencils.backends.cbackend import CustomCodeNode
+
+philox_two_doubles_call = """
+{result_symbols[0].dtype} {result_symbols[0].name};
+{result_symbols[1].dtype} {result_symbols[1].name};
+philox_double2({parameters}, {result_symbols[0].name}, {result_symbols[1].name});
+"""
+
+philox_four_floats_call = """
+{result_symbols[0].dtype} {result_symbols[0].name};
+{result_symbols[1].dtype} {result_symbols[1].name};
+{result_symbols[2].dtype} {result_symbols[2].name};
+{result_symbols[3].dtype} {result_symbols[3].name};
+philox_float4({parameters}, 
+              {result_symbols[0].name}, {result_symbols[1].name}, {result_symbols[2].name}, {result_symbols[3].name});
+
+"""
+
+
+class PhiloxTwoDoubles(CustomCodeNode):
+
+    def __init__(self, dim, time_step=TypedSymbol("time_step", np.uint32), keys=(0, 0)):
+        self.result_symbols = tuple(TypedSymbol(sp.Dummy().name, np.float64) for _ in range(2))
+
+        symbols_read = [s for s in keys if isinstance(s, sp.Symbol)]
+        super().__init__("", symbols_read=symbols_read, symbols_defined=self.result_symbols)
+        self._time_step = time_step
+        self.headers = ['"philox_rand.h"']
+        self.keys = list(keys)
+        self._args = (time_step, *sp.sympify(keys))
+        self._dim = dim
+
+    @property
+    def args(self):
+        return self._args
+
+    @property
+    def undefined_symbols(self):
+        result = {a for a in self.args if isinstance(a, sp.Symbol)}
+        loop_counters = [LoopOverCoordinate.get_loop_counter_symbol(i)
+                         for i in range(self._dim)]
+        result.update(loop_counters)
+        return result
+
+    def get_code(self, dialect, vector_instruction_set):
+        parameters = [self._time_step] + [LoopOverCoordinate.get_loop_counter_symbol(i)
+                                          for i in range(self._dim)] + self.keys
+
+        while len(parameters) < 6:
+            parameters.append(0)
+        parameters = parameters[:6]
+
+        assert len(parameters) == 6
+
+        if dialect == 'cuda' or (dialect == 'c' and vector_instruction_set is None):
+            return philox_two_doubles_call.format(parameters=', '.join(str(p) for p in parameters),
+                                                  result_symbols=self.result_symbols)
+        else:
+            raise NotImplementedError("Not yet implemented for this backend")
+
+    def __repr__(self):
+        return "{}, {} <- PhiloxRNG".format(*self.result_symbols)
+
+
+class PhiloxFourFloats(CustomCodeNode):
+
+    def __init__(self, dim, time_step=TypedSymbol("time_step", np.uint32), keys=(0, 0)):
+        self.result_symbols = tuple(TypedSymbol(sp.Dummy().name, np.float32) for _ in range(4))
+        symbols_read = [s for s in keys if isinstance(s, sp.Symbol)]
+
+        super().__init__("", symbols_read=symbols_read, symbols_defined=self.result_symbols)
+        self._time_step = time_step
+        self.headers = ['"philox_rand.h"']
+        self.keys = list(keys)
+        self._args = (time_step, *sp.sympify(keys))
+        self._dim = dim
+
+    @property
+    def args(self):
+        return self._args
+
+    @property
+    def undefined_symbols(self):
+        result = {a for a in self.args if isinstance(a, sp.Symbol)}
+        loop_counters = [LoopOverCoordinate.get_loop_counter_symbol(i)
+                         for i in range(self._dim)]
+        result.update(loop_counters)
+        return result
+
+    def get_code(self, dialect, vector_instruction_set):
+        parameters = [self._time_step] + [LoopOverCoordinate.get_loop_counter_symbol(i)
+                                          for i in range(self._dim)] + self.keys
+
+        while len(parameters) < 6:
+            parameters.append(0)
+        parameters = parameters[:6]
+
+        assert len(parameters) == 6
+
+        if dialect == 'cuda' or (dialect == 'c' and vector_instruction_set is None):
+            return philox_four_floats_call.format(parameters=', '.join(str(p) for p in parameters),
+                                                  result_symbols=self.result_symbols)
+        else:
+            raise NotImplementedError("Not yet implemented for this backend")
+
+    def __repr__(self):
+        return "{}, {}, {}, {} <- PhiloxRNG".format(*self.result_symbols)