diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py
index 00fe25c787035ca00ce4fb9a890046786ff4463e..99ce2d6fbfbb31f87fdc90aec22fa203386935fe 100644
--- a/pystencils/cpu/cpujit.py
+++ b/pystencils/cpu/cpujit.py
@@ -134,11 +134,22 @@ def create_folder(path, is_file):
         pass
 
 
+def get_llc_command():
+    """Try to get executable for llvm's IR compiler llc
+
+    We try if one of the following is in PATH: llc, llc-10, llc-9, llc-8, llc-7, llc-6
+    """
+    candidates = ['llc', 'llc-10', 'llc-9', 'llc-8', 'llc-7', 'llc-6']
+    found_executables = (e for e in candidates if shutil.which(e))
+    return next(found_executables, None)
+
+
 def read_config():
     if platform.system().lower() == 'linux':
         default_compiler_config = OrderedDict([
             ('os', 'linux'),
             ('command', 'g++'),
+            ('llc_command', get_llc_command() or 'llc'),
             ('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'),
             ('restrict_qualifier', '__restrict__')
         ])
@@ -146,6 +157,7 @@ def read_config():
         default_compiler_config = OrderedDict([
             ('os', 'windows'),
             ('msvc_version', 'latest'),
+            ('llc_command', get_llc_command() or 'llc'),
             ('arch', 'x64'),
             ('flags', '/Ox /fp:fast /openmp /arch:avx'),
             ('restrict_qualifier', '__restrict')
@@ -154,6 +166,7 @@ def read_config():
         default_compiler_config = OrderedDict([
             ('os', 'darwin'),
             ('command', 'clang++'),
+            ('llc_command', get_llc_command() or 'llc'),
             ('flags', '-Ofast -DNDEBUG -fPIC -march=native -Xclang -fopenmp -std=c++11'),
             ('restrict_qualifier', '__restrict__')
         ])
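The new `llc_command` entry falls back to `get_llc_command() or 'llc'`, so a versioned binary such as `llc-10` is found automatically when plain `llc` is not on PATH. A minimal sketch of pinning a specific binary through the user config instead; the config file location and the `compiler` section name are assumptions, not part of this patch:

    # Sketch only: pin a specific llc in the pystencils user config.
    # The path below is an assumed typical Linux location, adjust as needed.
    import json
    from pathlib import Path

    config_path = Path.home() / ".config" / "pystencils" / "config.json"
    config = json.loads(config_path.read_text()) if config_path.exists() else {}
    config.setdefault("compiler", {})["llc_command"] = "/usr/lib/llvm-10/bin/llc"
    config_path.parent.mkdir(parents=True, exist_ok=True)
    config_path.write_text(json.dumps(config, indent=2))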
diff --git a/pystencils/data_types.py b/pystencils/data_types.py
index faa8867188b36ae3c3f355b09b2dc41d99a98086..fc7f8a4f0e916922297a7314d432627503ba8464 100644
--- a/pystencils/data_types.py
+++ b/pystencils/data_types.py
@@ -331,7 +331,7 @@ def ctypes_from_llvm(data_type):
         raise NotImplementedError('Data type %s of %s is not supported yet' % (type(data_type), data_type))
 
 
-def to_llvm_type(data_type):
+def to_llvm_type(data_type, nvvm_target=False):
     """
     Transforms a given type into ctypes
     :param data_type: Subclass of Type
@@ -340,7 +340,7 @@ def to_llvm_type(data_type):
     if not ir:
         raise _ir_importerror
     if isinstance(data_type, PointerType):
-        return to_llvm_type(data_type.base_type).as_pointer()
+        return to_llvm_type(data_type.base_type).as_pointer(1 if nvvm_target else 0)
     else:
         return to_llvm_type.map[data_type.numpy_dtype]
 
diff --git a/pystencils/gpucuda/indexing.py b/pystencils/gpucuda/indexing.py
index 4c8701b2599dd9b4570c7d7f8875e0e6b58bef36..eb212119ac8505ac5bf4db3112ef645b8e793f80 100644
--- a/pystencils/gpucuda/indexing.py
+++ b/pystencils/gpucuda/indexing.py
@@ -24,10 +24,10 @@ class ThreadIndexingSymbol(TypedSymbol):
     __xnew_cached_ = staticmethod(cacheit(__new_stage2__))
 
 
-BLOCK_IDX = [ThreadIndexingSymbol("blockIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
-THREAD_IDX = [ThreadIndexingSymbol("threadIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
-BLOCK_DIM = [ThreadIndexingSymbol("blockDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
-GRID_DIM = [ThreadIndexingSymbol("gridDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
+BLOCK_IDX = [ThreadIndexingSymbol("blockIdx." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
+THREAD_IDX = [ThreadIndexingSymbol("threadIdx." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
+BLOCK_DIM = [ThreadIndexingSymbol("blockDim." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
+GRID_DIM = [ThreadIndexingSymbol("gridDim." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
 
 
 class AbstractIndexing(abc.ABC):
diff --git a/pystencils/llvm/kernelcreation.py b/pystencils/llvm/kernelcreation.py
index 38ac7fe6be818729eae3a935a4c14003f066e849..57e5b73876b643196f5529d05df41a11a1ef01e5 100644
--- a/pystencils/llvm/kernelcreation.py
+++ b/pystencils/llvm/kernelcreation.py
@@ -3,7 +3,7 @@ from pystencils.transformations import insert_casts
 
 
 def create_kernel(assignments, function_name="kernel", type_info=None, split_groups=(),
-                  iteration_slice=None, ghost_layers=None):
+                  iteration_slice=None, ghost_layers=None, target='cpu'):
     """
     Creates an abstract syntax tree for a kernel function, by taking a list of update rules.
 
@@ -25,9 +25,21 @@ def create_kernel(assignments, function_name="kernel", type_info=None, split_gro
 
     :return: :class:`pystencils.ast.KernelFunction` node
     """
-    from pystencils.cpu import create_kernel
-    code = create_kernel(assignments, function_name, type_info, split_groups, iteration_slice, ghost_layers)
+    if target == 'cpu':
+        from pystencils.cpu import create_kernel
+        code = create_kernel(assignments, function_name, type_info, split_groups, iteration_slice, ghost_layers)
+        code._backend = 'llvm'
+    elif target == 'gpu':
+        from pystencils.gpucuda.kernelcreation import create_cuda_kernel
+        code = create_cuda_kernel(assignments,
+                                  function_name,
+                                  type_info,
+                                  iteration_slice=iteration_slice,
+                                  ghost_layers=ghost_layers)
+        code._backend = 'llvm_gpu'
+    else:
+        raise NotImplementedError()
     code.body = insert_casts(code.body)
     code._compile_function = make_python_function
-    code._backend = 'llvm'
+
     return code
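Usage mirrors the CPU path except for the `target` argument. A minimal sketch, assuming pycuda is available; `update`, `dst_gpu` and `src_gpu` are placeholders:

    # Sketch of the new GPU path ('update' is a placeholder Assignment,
    # dst_gpu/src_gpu are pycuda GPUArrays created elsewhere).
    from pystencils.llvm import create_kernel
    from pystencils.llvm.llvmjit import generate_and_jit

    ast = create_kernel([update], target='gpu')  # tags the AST with _backend = 'llvm_gpu'
    kernel = generate_and_jit(ast)               # compiles to PTX and loads it via pycuda
    kernel('kernel', dst_gpu, src_gpu)           # launched with block/grid from ast.indexing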
diff --git a/pystencils/llvm/llvm.py b/pystencils/llvm/llvm.py
index cf2207c0831f0d365033dc25ee4c3c701af9940e..1d5223e9509f274e002d755fa19a1a56f9ec016e 100644
--- a/pystencils/llvm/llvm.py
+++ b/pystencils/llvm/llvm.py
@@ -1,6 +1,7 @@
 import functools
 
 import llvmlite.ir as ir
+import llvmlite.llvmpy.core as lc
 import sympy as sp
 from sympy import Indexed, S
 from sympy.printing.printer import Printer
@@ -12,13 +13,39 @@ from pystencils.data_types import (
 from pystencils.llvm.control_flow import Loop
 
 
-def generate_llvm(ast_node, module=None, builder=None):
+# From Numba
+def set_cuda_kernel(lfunc):
+    from llvmlite.llvmpy.core import MetaData, MetaDataString, Constant, Type
+
+    m = lfunc.module
+
+    ops = lfunc, MetaDataString.get(m, "kernel"), Constant.int(Type.int(), 1)
+    md = MetaData.get(m, ops)
+
+    nmd = m.get_or_insert_named_metadata('nvvm.annotations')
+    nmd.add(md)
+
+    # set nvvm ir version
+    i32 = ir.IntType(32)
+    md_ver = m.add_metadata([i32(1), i32(2), i32(2), i32(0)])
+    m.add_named_metadata('nvvmir.version', md_ver)
+
+
+# From Numba
+def _call_sreg(builder, name):
+    module = builder.module
+    fnty = lc.Type.function(lc.Type.int(), ())
+    fn = module.get_or_insert_function(fnty, name=name)
+    return builder.call(fn, ())
+
+
+def generate_llvm(ast_node, module=None, builder=None, target='cpu'):
     """Prints the ast as llvm code."""
     if module is None:
-        module = ir.Module()
+        module = lc.Module()
     if builder is None:
         builder = ir.IRBuilder()
-    printer = LLVMPrinter(module, builder)
+    printer = LLVMPrinter(module, builder, target=target)
     return printer._print(ast_node)
 
 
@@ -26,7 +53,7 @@ def generate_llvm(ast_node, module=None, builder=None):
 class LLVMPrinter(Printer):
     """Convert expressions to LLVM IR"""
 
-    def __init__(self, module, builder, fn=None, *args, **kwargs):
+    def __init__(self, module, builder, fn=None, target='cpu', *args, **kwargs):
         self.func_arg_map = kwargs.pop("func_arg_map", {})
         super(LLVMPrinter, self).__init__(*args, **kwargs)
         self.fp_type = ir.DoubleType()
@@ -39,6 +66,7 @@ class LLVMPrinter(Printer):
         self.fn = fn
         self.ext_fn = {}  # keep track of wrappers to external functions
         self.tmp_var = {}
+        self.target = target
 
     def _add_tmp_var(self, name, value):
         self.tmp_var[name] = value
@@ -163,7 +191,7 @@ class LLVMPrinter(Printer):
         parameter_type = []
         parameters = func.get_parameters()
         for parameter in parameters:
-            parameter_type.append(to_llvm_type(parameter.symbol.dtype))
+            parameter_type.append(to_llvm_type(parameter.symbol.dtype, nvvm_target=self.target == 'gpu'))
         func_type = ir.FunctionType(return_type, tuple(parameter_type))
         name = func.function_name
         fn = ir.Function(self.module, func_type, name)
@@ -181,6 +209,9 @@ class LLVMPrinter(Printer):
         self._print(func.body)
         self.builder.ret_void()
         self.fn = fn
+        if self.target == 'gpu':
+            set_cuda_kernel(fn)
+
         return fn
 
     def _print_Block(self, block):
@@ -217,8 +248,10 @@ class LLVMPrinter(Printer):
 
         # (From, to)
         decision = {
+            (create_composite_type_from_string("int32"),
+             create_composite_type_from_string("int64")): functools.partial(self.builder.zext, node, self.integer),
             (create_composite_type_from_string("int16"),
-             create_composite_type_from_string("int64")): lambda: ir.Constant(self.integer, node),
+             create_composite_type_from_string("int64")): functools.partial(self.builder.zext, node, self.integer),
             (create_composite_type_from_string("int"),
              create_composite_type_from_string("double")): functools.partial(self.builder.sitofp, node, self.fp_type),
             (create_composite_type_from_string("int16"),
@@ -295,13 +328,21 @@ class LLVMPrinter(Printer):
                     self.builder.branch(after_block)
                     self.builder.position_at_end(false_block)
 
-            phi = self.builder.phi(to_llvm_type(get_type_of_expression(piece)))
+            phi = self.builder.phi(to_llvm_type(get_type_of_expression(piece), nvvm_target=self.target == 'gpu'))
             for (val, block) in phi_data:
                 phi.add_incoming(val, block)
             return phi
 
-    # Should have a list of math library functions to validate this.
-    # TODO function calls to libs
+    def _print_Conditional(self, node):
+        cond = self._print(node.condition_expr)
+        with self.builder.if_else(cond) as (then, otherwise):
+            with then:
+                self._print(node.true_block)       # emit instructions for when the predicate is true
+            with otherwise:
+                self._print(node.false_block)       # emit instructions for when the predicate is false
+
+        # No return value: a conditional only emits instructions
+
     def _print_Function(self, expr):
         name = expr.func.__name__
         e0 = self._print(expr.args[0])
@@ -320,3 +361,19 @@ class LLVMPrinter(Printer):
             mro = "None"
         raise TypeError("Unsupported type for LLVM JIT conversion: Expression:\"%s\", Type:\"%s\", MRO:%s"
                         % (expr, type(expr), mro))
+
+    # from: https://llvm.org/docs/NVPTXUsage.html#nvptx-intrinsics
+    INDEXING_FUNCTION_MAPPING = {
+        'blockIdx': 'llvm.nvvm.read.ptx.sreg.ctaid',
+        'threadIdx': 'llvm.nvvm.read.ptx.sreg.tid',
+        'blockDim': 'llvm.nvvm.read.ptx.sreg.ntid',
+        'gridDim': 'llvm.nvvm.read.ptx.sreg.nctaid'
+    }
+
+    def _print_ThreadIndexingSymbol(self, node):
+        symbol_name: str = node.name
+        function_name, dimension = tuple(symbol_name.split("."))
+        function_name = self.INDEXING_FUNCTION_MAPPING[function_name]
+        name = f"{function_name}.{dimension}"
+
+        return self.builder.zext(_call_sreg(self.builder, name), self.integer)
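`_print_ThreadIndexingSymbol` maps e.g. `threadIdx.x` to `llvm.nvvm.read.ptx.sreg.tid.x`; each of these intrinsics returns an i32, which is then zero-extended to the printer's 64-bit integer type. A standalone llvmlite sketch of the IR this emits (using plain `llvmlite.ir` rather than the `llvmlite.llvmpy.core` compatibility layer used in the patch):

    # Standalone sketch: declare and call a thread-index intrinsic with llvmlite.
    import llvmlite.ir as ir

    module = ir.Module()
    fn = ir.Function(module, ir.FunctionType(ir.IntType(64), ()), name="example")
    builder = ir.IRBuilder(fn.append_basic_block())

    sreg_ty = ir.FunctionType(ir.IntType(32), ())
    tid_x = ir.Function(module, sreg_ty, name="llvm.nvvm.read.ptx.sreg.tid.x")
    builder.ret(builder.zext(builder.call(tid_x, ()), ir.IntType(64)))
    print(module)  # shows the declaration, the call and the zext in textual IR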
diff --git a/pystencils/llvm/llvmjit.py b/pystencils/llvm/llvmjit.py
index 6dc40151e4495d5fa67f9c6081a442a558b1d09a..0baec5452f80e6d8ca3c1b1d760ef23e510dcb0e 100644
--- a/pystencils/llvm/llvmjit.py
+++ b/pystencils/llvm/llvmjit.py
@@ -1,4 +1,8 @@
 import ctypes as ct
+import subprocess
+from functools import partial
+from itertools import chain
+from os.path import exists, join
 
 import llvmlite.binding as llvm
 import llvmlite.ir as ir
@@ -98,11 +102,12 @@ def make_python_function_incomplete_params(kernel_function_node, argument_dict,
 
 
 def generate_and_jit(ast):
-    gen = generate_llvm(ast)
+    target = 'gpu' if ast._backend == 'llvm_gpu' else 'cpu'
+    gen = generate_llvm(ast, target=target)
     if isinstance(gen, ir.Module):
-        return compile_llvm(gen)
+        return compile_llvm(gen, target, ast)
     else:
-        return compile_llvm(gen.module)
+        return compile_llvm(gen.module, target, ast)
 
 
 def make_python_function(ast, argument_dict={}, func=None):
@@ -117,8 +122,8 @@ def make_python_function(ast, argument_dict={}, func=None):
     return lambda: func(*args)
 
 
-def compile_llvm(module):
-    jit = Jit()
+def compile_llvm(module, target='cpu', ast=None):
+    jit = CudaJit(ast) if target == "gpu" else Jit()
     jit.parse(module)
     jit.optimize()
     jit.compile()
@@ -224,3 +229,95 @@ class Jit(object):
         fptr = self.fptr[name]
         fptr.jit = self
         return fptr
+
+
+# The following code is adapted, more or less, from Numba
+class CudaJit(Jit):
+
+    CUDA_TRIPLE = {32: 'nvptx-nvidia-cuda',
+                   64: 'nvptx64-nvidia-cuda'}
+    MACHINE_BITS = tuple.__itemsize__ * 8
+    data_layout = {
+        32: ('e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-'
+             'f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64'),
+        64: ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-'
+             'f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64')}
+
+    default_data_layout = data_layout[MACHINE_BITS]
+
+    def __init__(self, ast):
+        # super().__init__()
+
+        # self.target = llvm.Target.from_triple(self.CUDA_TRIPLE[self.MACHINE_BITS])
+        self._data_layout = self.data_layout[self.MACHINE_BITS]
+        # self._target_data = llvm.create_target_data(self._data_layout)
+        self.indexing = ast.indexing
+
+    def optimize(self):
+        pmb = llvm.create_pass_manager_builder()
+        pmb.opt_level = 2
+        pmb.disable_unit_at_a_time = False
+        pmb.loop_vectorize = False
+        pmb.slp_vectorize = False
+        # TODO possible to pass for functions
+        pm = llvm.create_module_pass_manager()
+        pm.add_instruction_combining_pass()
+        pm.add_function_attrs_pass()
+        pm.add_constant_merge_pass()
+        pm.add_licm_pass()
+        pmb.populate(pm)
+        pm.run(self.llvmmod)
+        pm.run(self.llvmmod)
+
+    def write_ll(self, file):
+        with open(file, 'w') as f:
+            f.write(str(self.llvmmod))
+
+    def parse(self, module):
+
+        llvmmod = module
+        llvmmod.triple = self.CUDA_TRIPLE[self.MACHINE_BITS]
+        llvmmod.data_layout = self.default_data_layout
+        llvmmod.verify()
+        llvmmod.name = 'module'
+
+        self._llvmmod = llvm.parse_assembly(str(llvmmod))
+
+    def compile(self):
+        from pystencils.cpu.cpujit import get_cache_config, get_compiler_config, get_llc_command
+        import hashlib
+        compiler_cache = get_cache_config()['object_cache']
+        ir_file = join(compiler_cache, hashlib.md5(str(self._llvmmod).encode()).hexdigest() + '.ll')
+        ptx_file = ir_file.replace('.ll', '.ptx')
+        try:
+            from pycuda.driver import Context
+            arch = "sm_%d%d" % Context.get_device().compute_capability()
+        except Exception:
+            arch = "sm_35"
+
+        if not exists(ptx_file):
+            self.write_ll(ir_file)
+            if 'llc_command' in get_compiler_config():
+                llc_command = get_compiler_config()['llc_command']
+            else:
+                llc_command = get_llc_command() or 'llc'
+
+            subprocess.check_call([llc_command, '-mcpu=' + arch, ir_file, '-o', ptx_file])
+
+        # cubin_file = ir_file.replace('.ll', '.cubin')
+        # if not exists(cubin_file):
+            # subprocess.check_call(['ptxas', '--gpu-name', arch, ptx_file, '-o', cubin_file])
+        import pycuda.driver
+
+        cuda_module = pycuda.driver.module_from_file(ptx_file)  # also works: cubin_file
+        self.cuda_module = cuda_module
+
+    def __call__(self, func, *args, **kwargs):
+        shape = [a.shape for a in chain(args, kwargs.values()) if hasattr(a, 'shape')][0]
+        block_and_thread_numbers = self.indexing.call_parameters(shape)
+        block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
+        block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
+        self.cuda_module.get_function(func)(*args, **kwargs, **block_and_thread_numbers)
+
+    def get_function_ptr(self, name):
+        return partial(self.__call__, name)
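End to end, `CudaJit.compile` writes the module to a `.ll` file, shells out to `llc` to produce PTX, and loads the result with pycuda; `__call__` then launches with the block/grid sizes computed by the AST's indexing object. A rough hand-driven sketch of the same pipeline, assuming `kernel.ll` already exists and that the kernel takes two pointer arguments (all names and sizes are placeholders):

    # Hand-driven sketch of the compile-and-launch path (names/sizes are placeholders).
    import subprocess
    import numpy as np
    import pycuda.autoinit  # noqa: F401  (creates a CUDA context)
    import pycuda.driver as drv
    from pycuda.gpuarray import to_gpu

    subprocess.check_call(['llc', '-mcpu=sm_35', 'kernel.ll', '-o', 'kernel.ptx'])
    func = drv.module_from_file('kernel.ptx').get_function('kernel')

    dst, src = to_gpu(np.zeros((30, 20))), to_gpu(np.random.rand(30, 20))
    func(dst, src, block=(16, 16, 1), grid=(2, 2, 1))  # placeholder launch configuration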
diff --git a/pystencils_tests/test_jacobi_llvm.py b/pystencils_tests/test_jacobi_llvm.py
index 2965cf245c6bf34e001d1ac67ad5b350c1219247..13b22aa4b0235bb98b4789e3b9266bf672985111 100644
--- a/pystencils_tests/test_jacobi_llvm.py
+++ b/pystencils_tests/test_jacobi_llvm.py
@@ -1,6 +1,8 @@
 import numpy as np
+import pytest
 
-from pystencils import Assignment, Field
+from pystencils import Assignment, Field, show_code
+from pystencils.cpu.cpujit import get_llc_command
 from pystencils.llvm import create_kernel, make_python_function
 from pystencils.llvm.llvmjit import generate_and_jit
 
@@ -30,6 +32,39 @@ def test_jacobi_fixed_field_size():
     np.testing.assert_almost_equal(error, 0.0)
 
 
+@pytest.mark.skipif(not get_llc_command(), reason="Test requires llc in $PATH")
+def test_jacobi_fixed_field_size_gpu():
+    size = (30, 20)
+
+    import pycuda.autoinit  # noqa
+    from pycuda.gpuarray import to_gpu
+
+    src_field_llvm = np.random.rand(*size)
+    src_field_py = np.copy(src_field_llvm)
+    dst_field_llvm = np.zeros(size)
+    dst_field_py = np.zeros(size)
+
+    f = Field.create_from_numpy_array("f", src_field_py)
+    d = Field.create_from_numpy_array("d", dst_field_py)
+
+    src_field_llvm = to_gpu(src_field_llvm)
+    dst_field_llvm = to_gpu(dst_field_llvm)
+
+    jacobi = Assignment(d[0, 0], (f[1, 0] + f[-1, 0] + f[0, 1] + f[0, -1]) / 4)
+    ast = create_kernel([jacobi], target='gpu')
+    print(show_code(ast))
+
+    for x in range(1, size[0] - 1):
+        for y in range(1, size[1] - 1):
+            dst_field_py[x, y] = 0.25 * (src_field_py[x - 1, y] + src_field_py[x + 1, y] +
+                                         src_field_py[x, y - 1] + src_field_py[x, y + 1])
+
+    jit = generate_and_jit(ast)
+    jit('kernel', dst_field_llvm, src_field_llvm)
+    error = np.sum(np.abs(dst_field_py - dst_field_llvm.get()))
+    np.testing.assert_almost_equal(error, 0.0)
+
+
 def test_jacobi_variable_field_size():
     size = (3, 3, 3)
     f = Field.create_generic("f", 3)
@@ -52,3 +87,7 @@ def test_jacobi_variable_field_size():
     kernel()
     error = np.sum(np.abs(dst_field_py - dst_field_llvm))
     np.testing.assert_almost_equal(error, 0.0)
+
+
+if __name__ == "__main__":
+    test_jacobi_fixed_field_size_gpu()