diff --git a/alignedarray.py b/alignedarray.py
index 65807042ed6cccf6ec62751c118b818a972bfdfa..02a4f359956b109253e41a2c98a4c9020f4cd861 100644
--- a/alignedarray.py
+++ b/alignedarray.py
@@ -4,19 +4,22 @@ import numpy as np
 def aligned_empty(shape, byte_alignment=32, dtype=np.float64, byte_offset=0, order='C', align_inner_coordinate=True):
     """
     Creates an aligned empty numpy array
-    :param shape: size of the array
-    :param byte_alignment: alignment in bytes, for the start address of the array holds (a % byte_alignment) == 0
-    :param dtype: numpy data type
-    :param byte_offset: offset in bytes for position that should be aligned i.e. (a+byte_offset) % byte_alignment == 0
-                       typically used to align first inner cell instead of ghost layer
-    :param order: storage linearization order
-    :param align_inner_coordinate: if True, the start of the innermost coordinate lines are aligned as well
-    :return:
+
+    Args:
+        shape: size of the array
+        byte_alignment: alignment in bytes; for the start address a of the array, (a % byte_alignment) == 0 holds
+        dtype: numpy data type
+        byte_offset: offset in bytes of the position that should be aligned, i.e. (a + byte_offset) % byte_alignment == 0;
+                    typically used to align the first inner cell instead of the ghost layer
+        order: storage linearization order
+        align_inner_coordinate: if True, the start of the innermost coordinate lines are aligned as well
     """
     if (not align_inner_coordinate) or (not hasattr(shape, '__len__')):
         size = np.prod(shape)
         d = np.dtype(dtype)
-        tmp = np.empty(size * d.itemsize + byte_alignment, dtype=np.uint8)
+        # allocate 2 * byte_alignment instead of 1 * byte_alignment extra bytes to leave slack at the end,
+        # so that vectorized loops can access up to vector_width elements further and don't require a tail loop
+        tmp = np.empty(size * d.itemsize + 2 * byte_alignment, dtype=np.uint8)
         address = tmp.__array_interface__['data'][0]
         offset = (byte_alignment - (address + byte_offset) % byte_alignment) % byte_alignment
         return tmp[offset:offset + size * d.itemsize].view(dtype=d).reshape(shape, order=order)
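
A minimal usage sketch of the padded, aligned allocation (illustration only, not part of the patch;
the import path pystencils.alignedarray and the concrete sizes/alignments are assumptions):

    import numpy as np
    from pystencils.alignedarray import aligned_empty

    # 17 float64 elements whose start address is 32-byte aligned
    arr = aligned_empty(17, byte_alignment=32, dtype=np.float64)
    assert arr.__array_interface__['data'][0] % 32 == 0

    # byte_offset shifts the aligned position, e.g. so that the first inner cell
    # (one element past a ghost layer) is the aligned one instead of the array start
    arr2 = aligned_empty(17, byte_alignment=32, dtype=np.float64, byte_offset=8)
    assert (arr2.__array_interface__['data'][0] + 8) % 32 == 0

Since the returned view starts at most (byte_alignment - 1) bytes into the buffer, the 2 * byte_alignment
of extra space leaves at least byte_alignment bytes of slack after the last element, enough for one
vector-width overrun as long as the vector width in bytes does not exceed byte_alignment.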
diff --git a/cpu/vectorization.py b/cpu/vectorization.py
index ecc4ebc1a6a0bcd09198d0223add9b4ccbaf62fb..8f8829061050467d0af0a1acf2e62d2da91de9ff 100644
--- a/cpu/vectorization.py
+++ b/cpu/vectorization.py
@@ -2,7 +2,7 @@ import sympy as sp
 import warnings
 from typing import Union, Container
 from pystencils.backends.simd_instruction_sets import get_vector_instruction_set
-from pystencils.integer_functions import modulo_floor
+from pystencils.integer_functions import modulo_floor, modulo_ceil
 from pystencils.sympyextensions import fast_subs
 from pystencils.data_types import TypedSymbol, VectorType, get_type_of_expression, vector_memory_access, cast_func, \
     collate_types, PointerType
@@ -13,7 +13,7 @@ from pystencils.field import Field
 
 def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
               assume_aligned: bool = False, nontemporal: Union[bool, Container[Union[str, Field]]] = False,
-              assume_inner_stride_one: bool = False):
+              assume_inner_stride_one: bool = False, assume_sufficient_line_padding: bool = True):
     """Explicit vectorization using SIMD vectorization via intrinsics.
 
     Args:
@@ -30,6 +30,11 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
                                  the inner loop stride is a runtime variable and thus might not be always 1.
                                  If this parameter is set to true, the inner stride is assumed to be always one.
                                  This has to be ensured at runtime!
+        assume_sufficient_line_padding: if True and assume_inner_stride_one is set, no tail loop is created;
+                                        instead the loop is extended by at most (vector_width - 1) elements.
+                                        This assumes that there is enough padding with dummy data at the end of
+                                        each line. Depending on the access pattern, additional padding may be
+                                        required at the end of the array.
     """
     all_fields = kernel_ast.fields_accessed
     if nontemporal is None or nontemporal is False:
@@ -51,11 +56,13 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
     vector_width = vector_is['width']
     kernel_ast.instruction_set = vector_is
 
-    vectorize_inner_loops_and_adapt_load_stores(kernel_ast, vector_width, assume_aligned, nontemporal)
+    vectorize_inner_loops_and_adapt_load_stores(kernel_ast, vector_width, assume_aligned,
+                                                nontemporal, assume_sufficient_line_padding)
     insert_vector_casts(kernel_ast)
 
 
-def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_aligned, nontemporal_fields):
+def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_aligned, nontemporal_fields,
+                                                assume_sufficient_line_padding):
     """Goes over all innermost loops, changes increment to vector width and replaces field accesses by vector type."""
     all_loops = filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment)
     inner_loops = [n for n in all_loops if n.is_innermost_loop]
@@ -65,10 +72,15 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a
         loop_range = loop_node.stop - loop_node.start
 
         # cut off the loop tail that is not a multiple of the vector width, or round the range up to it
-        cutting_point = modulo_floor(loop_range, vector_width) + loop_node.start
-        loop_nodes = cut_loop(loop_node, [cutting_point])
-        assert len(loop_nodes) in (1, 2)  # 2 for main and tail loop, 1 if loop range divisible by vector width
-        loop_node = loop_nodes[0]
+        if assume_aligned and assume_sufficient_line_padding:
+            loop_range = loop_node.stop - loop_node.start
+            new_stop = loop_node.start + modulo_ceil(loop_range, vector_width)
+            loop_node.stop = new_stop
+        else:
+            cutting_point = modulo_floor(loop_range, vector_width) + loop_node.start
+            loop_nodes = cut_loop(loop_node, [cutting_point])
+            assert len(loop_nodes) in (1, 2)  # 2 for main and tail loop, 1 if loop range divisible by vector width
+            loop_node = loop_nodes[0]
         
         # Find all array accesses (indexed) that depend on the loop counter as offset
         loop_counter_symbol = ast.LoopOverCoordinate.get_loop_counter_symbol(loop_node.coordinate_to_loop_over)
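
The effect of the new branch, sketched with plain integers (round_up is a hypothetical Python
stand-in for the symbolic modulo_ceil; the concrete loop bounds are made up):

    def round_up(value, divisor):
        # next multiple of divisor that is >= value, mirroring modulo_ceil
        return value if value % divisor == 0 else (value // divisor + 1) * divisor

    start, stop, vector_width = 1, 62, 8     # e.g. one ghost layer on each side
    loop_range = stop - start                # 61 iterations
    new_stop = start + round_up(loop_range, vector_width)
    assert new_stop == 65                    # 3 extra iterations run on the line padding
    assert (new_stop - start) % vector_width == 0

Instead of a vectorized main loop plus a scalar tail loop, a single vectorized loop overruns into
the padding that aligned_empty (or the caller) guarantees at the end of each line.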
diff --git a/integer_functions.py b/integer_functions.py
index db8358ff2b23a88e94287ec313eecd9cce41f427..c60a5fb2c9bf1bc5a81ace3d95eb5c6293a0bc46 100644
--- a/integer_functions.py
+++ b/integer_functions.py
@@ -43,7 +43,7 @@ class modulo_floor(sp.Function):
 
 # noinspection PyPep8Naming
 class modulo_ceil(sp.Function):
-    """Returns the next smaller integer divisible by given divisor.
+    """Returns the next bigger integer divisible by given divisor.
 
     Examples:
         >>> modulo_ceil(9, 4)
@@ -68,5 +68,5 @@ class modulo_ceil(sp.Function):
     def to_c(self, print_func):
         dtype = collate_types((get_type_of_expression(self.args[0]), get_type_of_expression(self.args[1])))
         assert dtype.is_int()
-        code = "({0}) % ({1}) == 0 ? {0} : (({dtype})(({0}) / ({1}))+1) * ({1})"
+        code = "(({0}) % ({1}) == 0 ? {0} : (({dtype})(({0}) / ({1}))+1) * ({1}))"
         return code.format(print_func(self.args[0]), print_func(self.args[1]), dtype=dtype)
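
Why the added outer parentheses matter: the emitted ternary is usually substituted into a larger
expression, and without them the surrounding operators bind into the condition. A hypothetical
illustration (int64_t stands in for the collated dtype):

    old = "({0}) % ({1}) == 0 ? {0} : ((int64_t)(({0}) / ({1}))+1) * ({1})"
    new = "(({0}) % ({1}) == 0 ? {0} : ((int64_t)(({0}) / ({1}))+1) * ({1}))"

    # embedded in a loop bound such as start + modulo_ceil(end - start, 4):
    print("start + " + old.format("end - start", "4"))
    print("start + " + new.format("end - start", "4"))
    # the first output parses in C as ((start + (end - start) % 4) == 0) ? ... : ...,
    # i.e. 'start +' leaks into the condition; the second keeps the rounded value as one operand.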