diff --git a/python/pystencils_walberla/codegen.py b/python/pystencils_walberla/codegen.py
index 896a795cc2670179ca417d3ce05f88d05abb1cfe..44de5c3fb878cf1b34c7c311ce1cdc1da8daae8a 100644
--- a/python/pystencils_walberla/codegen.py
+++ b/python/pystencils_walberla/codegen.py
@@ -73,33 +73,22 @@ def generate_sweep(generation_context, class_name, assignments,
     env = Environment(loader=PackageLoader('pystencils_walberla'), undefined=StrictUndefined)
     add_pystencils_filters_to_jinja_env(env)
 
-    if inner_outer_split is False:
-        jinja_context = {
-            'kernel': KernelInfo(ast, temporary_fields, field_swaps, varying_parameters),
-            'namespace': namespace,
-            'class_name': class_name,
-            'target': create_kernel_params.get("target", "cpu"),
-            'headers': get_headers(ast),
-            'ghost_layers_to_include': ghost_layers_to_include
-        }
-        header = env.get_template("Sweep.tmpl.h").render(**jinja_context)
-        source = env.get_template("Sweep.tmpl.cpp").render(**jinja_context)
-    else:
-        main_kernel_info = KernelInfo(ast, temporary_fields, field_swaps, varying_parameters)
-        representative_field = {p.field_name for p in main_kernel_info.parameters if p.is_field_parameter}
-        representative_field = sorted(representative_field)[0]
-
-        jinja_context = {
-            'kernel': main_kernel_info,
-            'namespace': namespace,
-            'class_name': class_name,
-            'target': create_kernel_params.get("target", "cpu"),
-            'field': representative_field,
-            'headers': get_headers(ast),
-            'ghost_layers_to_include': 0
-        }
-        header = env.get_template("SweepInnerOuter.tmpl.h").render(**jinja_context)
-        source = env.get_template("SweepInnerOuter.tmpl.cpp").render(**jinja_context)
+    main_kernel_info = KernelInfo(ast, temporary_fields, field_swaps, varying_parameters)
+    representative_field = {p.field_name for p in main_kernel_info.parameters if p.is_field_parameter}
+    representative_field = sorted(representative_field)[0]
+
+    jinja_context = {
+        'kernel': main_kernel_info,
+        'namespace': namespace,
+        'class_name': class_name,
+        'target': create_kernel_params.get("target", "cpu"),
+        'field': representative_field,
+        'headers': get_headers(ast),
+        'ghost_layers_to_include': ghost_layers_to_include,
+        'inner_outer_split': inner_outer_split
+    }
+    header = env.get_template("Sweep.tmpl.h").render(**jinja_context)
+    source = env.get_template("Sweep.tmpl.cpp").render(**jinja_context)
 
     source_extension = "cpp" if create_kernel_params.get("target", "cpu") == "cpu" else "cu"
     generation_context.write_file("{}.h".format(class_name), header)
diff --git a/python/pystencils_walberla/templates/Sweep.tmpl.cpp b/python/pystencils_walberla/templates/Sweep.tmpl.cpp
index b26d9c6db71a12d941c329a3897b707067ae4892..9711be8cde6d7a94475c8172cb858c03631f60d4 100644
--- a/python/pystencils_walberla/templates/Sweep.tmpl.cpp
+++ b/python/pystencils_walberla/templates/Sweep.tmpl.cpp
@@ -83,6 +83,69 @@ void {{class_name}}::runOnCellInterval( const shared_ptr<StructuredBlockStorage>
     {{kernel|generate_swaps|indent(4)}}
 }
 
+{%if inner_outer_split%}
+void {{class_name}}::inner( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream{% endif %} )
+{
+    {{kernel|generate_block_data_to_field_extraction|indent(4)}}
+
+    CellInterval inner = {{field}}->xyzSize();
+    inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2]));
+
+    {{kernel|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True)|indent(4) }}
+    {{kernel|generate_call(stream='stream', cell_interval='inner')|indent(4)}}
+}
+
+
+void {{class_name}}::outer( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream {% endif %} )
+{
+    {{kernel|generate_block_data_to_field_extraction|indent(4)}}
+
+    if( layers_.size() == 0 )
+    {
+        CellInterval ci;
+
+        {{field}}->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false);
+        layers_.push_back(ci);
+        {{field}}->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false);
+        layers_.push_back(ci);
+
+        {{field}}->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false);
+        ci.expand(Cell(0, 0, -outerWidth_[2]));
+        layers_.push_back(ci);
+        {{field}}->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false);
+        ci.expand(Cell(0, 0, -outerWidth_[2]));
+        layers_.push_back(ci);
+
+        {{field}}->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false);
+        ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2]));
+        layers_.push_back(ci);
+        {{field}}->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false);
+        ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2]));
+        layers_.push_back(ci);
+    }
+
+    {%if target is equalto 'gpu'%}
+    {
+        auto parallelSection_ = parallelStreams_.parallelSection( stream );
+        for( auto & ci: layers_ )
+        {
+            parallelSection_.run([&]( auto s ) {
+                {{kernel|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True)|indent(4) }}
+                {{kernel|generate_call(stream='s', cell_interval='ci')|indent(16)}}
+            });
+        }
+    }
+    {% else %}
+    for( auto & ci: layers_ )
+    {
+        {{kernel|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True)|indent(8) }}
+        {{kernel|generate_call(cell_interval='ci')|indent(8)}}
+    }
+    {% endif %}
+
+    {{kernel|generate_swaps|indent(4)}}
+}
+{% endif %}
 
 } // namespace {{namespace}}
 } // namespace walberla
diff --git a/python/pystencils_walberla/templates/Sweep.tmpl.h b/python/pystencils_walberla/templates/Sweep.tmpl.h
index b4db0f35bd6fc9ad5046c83bd99eca004291cc6f..9f0711a1e807553b2ca9cea2fa5dc5790c9c91af 100644
--- a/python/pystencils_walberla/templates/Sweep.tmpl.h
+++ b/python/pystencils_walberla/templates/Sweep.tmpl.h
@@ -24,6 +24,9 @@
 #include "field/GhostLayerField.h"
 {%- elif target is equalto 'gpu' -%}
 #include "cuda/GPUField.h"
+{% if inner_outer_split -%}
+#include "cuda/ParallelStreams.h"
+{%- endif %}
 {%- endif %}
 #include "field/SwapableCompare.h"
 #include "domain_decomposition/BlockDataID.h"
@@ -42,6 +45,7 @@
 #if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
 #   pragma GCC diagnostic push
 #   pragma GCC diagnostic ignored "-Wunused-parameter"
+#   pragma GCC diagnostic ignored "-Wreorder"
 #endif
 
 namespace walberla {
@@ -51,8 +55,8 @@ namespace {{namespace}} {
 class {{class_name}}
 {
 public:
-    {{class_name}}( {{kernel|generate_constructor_parameters}})
-        : {{ kernel|generate_constructor_initializer_list }}
+    {{class_name}}( {{kernel|generate_constructor_parameters}} {%if inner_outer_split%}, const Cell & outerWidth=Cell(1, 1, 1){% endif %})
+        : {{ kernel|generate_constructor_initializer_list }}{%if inner_outer_split%}{% if kernel|generate_constructor_initializer_list|length %},{% endif %} outerWidth_(outerWidth){% endif %}
     {};
 
     {{ kernel| generate_destructor(class_name) |indent(4) }}
@@ -79,7 +83,29 @@ public:
         };
     }
 
+{% if inner_outer_split %}
+
+    void inner( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream = nullptr{% endif %} );
+    void outer( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream = nullptr{% endif %} );
+
+    void setOuterPriority(int priority ) {
+       {%if target is equalto 'gpu'%}
+       parallelStreams_.setStreamPriority(priority);
+       {%endif%}
+    }
+    {{kernel|generate_members|indent(4)}}
+
+private:
+    {%if target is equalto 'gpu'%}
+    cuda::ParallelStreams parallelStreams_;
+    {% endif %}
+
+    Cell outerWidth_;
+    std::vector<CellInterval> layers_;
+
+{%- else -%}
     {{ kernel|generate_members|indent(4) }}
+{% endif -%}
 
 };
 
diff --git a/python/pystencils_walberla/templates/SweepInnerOuter.tmpl.cpp b/python/pystencils_walberla/templates/SweepInnerOuter.tmpl.cpp
deleted file mode 100644
index 9f032a9ed79d54bf9c742fae64cf8a612bce2371..0000000000000000000000000000000000000000
--- a/python/pystencils_walberla/templates/SweepInnerOuter.tmpl.cpp
+++ /dev/null
@@ -1,149 +0,0 @@
-//======================================================================================================================
-//
-//  This file is part of waLBerla. waLBerla is free software: you can
-//  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of
-//  the License, or (at your option) any later version.
-//
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-//  for more details.
-//
-//  You should have received a copy of the GNU General Public License along
-//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
-//
-//! \\file {{class_name}}.cpp
-//! \\ingroup lbm
-//! \\author lbmpy
-//======================================================================================================================
-
-#include <cmath>
-
-#include "core/DataTypes.h"
-#include "core/Macros.h"
-#include "{{class_name}}.h"
-{% for header in headers %}
-#include {{header}}
-{% endfor %}
-
-
-{% if target is equalto 'cpu' -%}
-#define FUNC_PREFIX
-{%- elif target is equalto 'gpu' -%}
-#define FUNC_PREFIX __global__
-{%- endif %}
-
-#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
-#   pragma GCC diagnostic push
-#   pragma GCC diagnostic ignored "-Wfloat-equal"
-#   pragma GCC diagnostic ignored "-Wshadow"
-#   pragma GCC diagnostic ignored "-Wconversion"
-#endif
-
-using namespace std;
-
-namespace walberla {
-namespace {{namespace}} {
-
-{{kernel|generate_definition(target)}}
-
-void {{class_name}}::operator() ( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream{% endif %} )
-{
-    {{kernel|generate_block_data_to_field_extraction|indent(4)}}
-    {{kernel|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True)|indent(4) }}
-    {{kernel|generate_call(stream='stream')|indent(4)}}
-    {{kernel|generate_swaps|indent(4)}}
-}
-
-
-void {{class_name}}::runOnCellInterval( const shared_ptr<StructuredBlockStorage> & blocks,
-                                        const CellInterval & globalCellInterval,
-                                        cell_idx_t ghostLayers,
-                                        IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream{% endif %} )
-{
-    CellInterval ci = globalCellInterval;
-    CellInterval blockBB = blocks->getBlockCellBB( *block);
-    blockBB.expand( ghostLayers );
-    ci.intersect( blockBB );
-    blocks->transformGlobalToBlockLocalCellInterval( ci, *block );
-    if( ci.empty() )
-        return;
-
-    {{kernel|generate_block_data_to_field_extraction|indent(4)}}
-    {{kernel|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True)|indent(4) }}
-    {{kernel|generate_call(stream='stream', cell_interval='ci')|indent(4)}}
-    {{kernel|generate_swaps|indent(4)}}
-}
-
-
-void {{class_name}}::inner( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream{% endif %} )
-{
-    {{kernel|generate_block_data_to_field_extraction|indent(4)}}
-
-    CellInterval inner = {{field}}->xyzSize();
-    inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2]));
-
-    {{kernel|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True)|indent(4) }}
-    {{kernel|generate_call(stream='stream', cell_interval='inner')|indent(4)}}
-}
-
-
-void {{class_name}}::outer( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream {% endif %} )
-{
-    {{kernel|generate_block_data_to_field_extraction|indent(4)}}
-
-    if( layers_.size() == 0 )
-    {
-        CellInterval ci;
-
-        {{field}}->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false);
-        layers_.push_back(ci);
-        {{field}}->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false);
-        layers_.push_back(ci);
-
-        {{field}}->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false);
-        ci.expand(Cell(0, 0, -outerWidth_[2]));
-        layers_.push_back(ci);
-        {{field}}->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false);
-        ci.expand(Cell(0, 0, -outerWidth_[2]));
-        layers_.push_back(ci);
-
-        {{field}}->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false);
-        ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2]));
-        layers_.push_back(ci);
-        {{field}}->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false);
-        ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2]));
-        layers_.push_back(ci);
-    }
-
-    {%if target is equalto 'gpu'%}
-    {
-        auto parallelSection_ = parallelStreams_.parallelSection( stream );
-        for( auto & ci: layers_ )
-        {
-            parallelSection_.run([&]( auto s ) {
-                {{kernel|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True)|indent(4) }}
-                {{kernel|generate_call(stream='s', cell_interval='ci')|indent(16)}}
-            });
-        }
-    }
-    {% else %}
-    for( auto & ci: layers_ )
-    {
-        {{kernel|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True)|indent(8) }}
-        {{kernel|generate_call(cell_interval='ci')|indent(8)}}
-    }
-    {% endif %}
-
-    {{kernel|generate_swaps|indent(4)}}
-}
-
-
-} // namespace {{namespace}}
-} // namespace walberla
-
-
-#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
-#   pragma GCC diagnostic pop
-#endif
diff --git a/python/pystencils_walberla/templates/SweepInnerOuter.tmpl.h b/python/pystencils_walberla/templates/SweepInnerOuter.tmpl.h
deleted file mode 100644
index 6f6094d0e08dfe00e7873cfb204f636c81c0a04a..0000000000000000000000000000000000000000
--- a/python/pystencils_walberla/templates/SweepInnerOuter.tmpl.h
+++ /dev/null
@@ -1,113 +0,0 @@
-//======================================================================================================================
-//
-//  This file is part of waLBerla. waLBerla is free software: you can
-//  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of
-//  the License, or (at your option) any later version.
-//
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-//  for more details.
-//
-//  You should have received a copy of the GNU General Public License along
-//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
-//
-//! \\file {{class_name}}.h
-//! \\author pystencils
-//======================================================================================================================
-
-#pragma once
-#include "core/DataTypes.h"
-
-{% if target is equalto 'cpu' -%}
-#include "field/GhostLayerField.h"
-{%- elif target is equalto 'gpu' -%}
-#include "cuda/GPUField.h"
-#include "cuda/ParallelStreams.h"
-{%- endif %}
-#include "field/SwapableCompare.h"
-#include "domain_decomposition/BlockDataID.h"
-#include "domain_decomposition/IBlock.h"
-#include "domain_decomposition/StructuredBlockStorage.h"
-#include <set>
-
-#ifdef __GNUC__
-#define RESTRICT __restrict__
-#elif _MSC_VER
-#define RESTRICT __restrict
-#else
-#define RESTRICT
-#endif
-
-#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
-#   pragma GCC diagnostic push
-#   pragma GCC diagnostic ignored "-Wunused-parameter"
-#   pragma GCC diagnostic ignored "-Wreorder"
-#endif
-
-namespace walberla {
-namespace {{namespace}} {
-
-
-class {{class_name}}
-{
-public:
-    {{class_name}}( {{kernel|generate_constructor_parameters}}, const Cell & outerWidth=Cell(1, 1, 1))
-        : {{ kernel|generate_constructor_initializer_list }}{% if kernel|generate_constructor_initializer_list|length %},{% endif %} outerWidth_(outerWidth)
-    {};
-
-    {{ kernel| generate_destructor(class_name) |indent(4) }}
-
-
-    void operator() ( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream = nullptr{% endif %} );
-
-    void runOnCellInterval(const shared_ptr<StructuredBlockStorage> & blocks,
-                           const CellInterval & globalCellInterval, cell_idx_t ghostLayers, IBlock * block
-                           {%if target is equalto 'gpu'%} , cudaStream_t stream = nullptr{% endif %});
-
-
-
-    static std::function<void (IBlock*)> getSweep(const shared_ptr<{{class_name}}> & kernel) {
-        return [kernel](IBlock * b) { (*kernel)(b); };
-    }
-
-    static std::function<void (IBlock*{%if target is equalto 'gpu'%} , cudaStream_t {% endif %})>
-            getSweepOnCellInterval(const shared_ptr<{{class_name}}> & kernel,
-                                   const shared_ptr<StructuredBlockStorage> & blocks,
-                                   const CellInterval & globalCellInterval,
-                                   cell_idx_t ghostLayers=1 )
-    {
-        return [kernel, blocks, globalCellInterval, ghostLayers] (IBlock * b{%if target is equalto 'gpu'%} , cudaStream_t stream = nullptr{% endif %}) {
-            kernel->runOnCellInterval(blocks, globalCellInterval, ghostLayers, b{%if target is equalto 'gpu'%}, stream {% endif %});
-        };
-    }
-
-
-    void inner( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream = nullptr{% endif %} );
-    void outer( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream = nullptr{% endif %} );
-
-    void setOuterPriority(int priority ) {
-        {%if target is equalto 'gpu'%}
-        parallelStreams_.setStreamPriority(priority);
-        {%endif%}
-    }
-    {{kernel|generate_members|indent(4)}}
-
-private:
-    {%if target is equalto 'gpu'%}
-    cuda::ParallelStreams parallelStreams_;
-    {% endif %}
-
-    Cell outerWidth_;
-    std::vector<CellInterval> layers_;
-};
-
-
-} // namespace {{namespace}}
-} // namespace walberla
-
-
-#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
-#   pragma GCC diagnostic pop
-#endif