diff --git a/backends/cbackend.py b/backends/cbackend.py
index b80170168bb76189cdb1d2008280e61b99eb8cb3..9becd107d649c5a01b215d52a5f6dc53ecd3a22a 100644
--- a/backends/cbackend.py
+++ b/backends/cbackend.py
@@ -176,7 +176,7 @@ class CBackend:
                 return "%s = %s;" % (self.sympy_printer.doprint(node.lhs), self.sympy_printer.doprint(node.rhs))
 
     def _print_TemporaryMemoryAllocation(self, node):
-        align = 128
+        align = 64
         np_dtype = node.symbol.dtype.base_type.numpy_dtype
         required_size = np_dtype.itemsize * node.size + align
         size = modulo_ceil(required_size, align)
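For context, the allocation printer pads the requested buffer by one alignment unit (so an aligned start offset always fits) and rounds the total up because C11 `aligned_alloc` requires the size to be a multiple of the alignment. A minimal sketch of that arithmetic with the new 64-byte alignment; the element count and the `modulo_ceil` reimplementation are illustrative, not taken from this diff:

```python
# Sketch of the size computation above with align = 64.
# modulo_ceil is assumed to round its first argument up to the next
# multiple of the second; num_elements is a hypothetical node.size.
def modulo_ceil(value, divisor):
    return value if value % divisor == 0 else (value // divisor + 1) * divisor

align = 64                                        # new alignment in bytes
itemsize = 8                                      # e.g. numpy float64
num_elements = 1000                               # hypothetical temporary buffer size
required_size = itemsize * num_elements + align   # extra bytes leave room for an aligned offset
size = modulo_ceil(required_size, align)          # aligned_alloc needs size % alignment == 0
assert size == 8064 and size % align == 0
```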
@@ -188,7 +188,7 @@ class CBackend:
                            align=align)
 
     def _print_TemporaryMemoryFree(self, node):
-        align = 128
+        align = 64
         return "free(%s - %d);" % (self.sympy_printer.doprint(node.symbol.name), node.offset(align))
 
     @staticmethod
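The free printer undoes the pointer offset that the allocation printer added, and both derive that offset from the same `align` constant, which is why the two hunks have to change together. An illustrative pairing of the emitted C statements follows; the symbol name, offset value, and allocation template are paraphrased (only the `free` template appears verbatim above):

```python
# Illustrative pairing of the two printers' output; `offset` stands in for
# node.offset(align) (an element offset), and the allocation line is paraphrased.
align = 64
name, offset = "_tmp_buffer", 2   # hypothetical symbol name and element offset

allocation = "double * %s = (double *) aligned_alloc(%d, UINT64_C(8064)) + %d;" % (name, align, offset)
release = "free(%s - %d);" % (name, offset)   # same template as _print_TemporaryMemoryFree above

print(allocation)
print(release)
```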
diff --git a/datahandling/parallel_datahandling.py b/datahandling/parallel_datahandling.py
index 70caacb3c55ec16368756f2281df1a95bfdfb63f..6bdce83a61bf5476936ee504627d3d075777f44b 100644
--- a/datahandling/parallel_datahandling.py
+++ b/datahandling/parallel_datahandling.py
@@ -274,13 +274,13 @@ class ParallelDataHandling(DataHandling):
         for name in self._custom_data_transfer_functions.keys():
             self.to_gpu(name)
 
-    def synchronization_function_cpu(self, names, stencil=None, buffered=True, **_):
-        return self.synchronization_function(names, stencil, 'cpu', buffered, )
+    def synchronization_function_cpu(self, names, stencil=None, buffered=True, stencil_restricted=False, **_):
+        return self.synchronization_function(names, stencil, 'cpu', buffered, stencil_restricted)
 
-    def synchronization_function_gpu(self, names, stencil=None, buffered=True, **_):
-        return self.synchronization_function(names, stencil, 'gpu', buffered)
+    def synchronization_function_gpu(self, names, stencil=None, buffered=True, stencil_restricted=False, **_):
+        return self.synchronization_function(names, stencil, 'gpu', buffered, stencil_restricted)
 
-    def synchronization_function(self, names, stencil=None, target='cpu', buffered=True):
+    def synchronization_function(self, names, stencil=None, target='cpu', buffered=True, stencil_restricted=False):
         if target is None:
             target = self.default_target
 
@@ -293,6 +293,8 @@ class ParallelDataHandling(DataHandling):
         create_scheme = wlb.createUniformBufferedScheme if buffered else wlb.createUniformDirectScheme
         if target == 'cpu':
             create_packing = wlb.field.createPackInfo if buffered else wlb.field.createMPIDatatypeInfo
+            if not buffered and stencil_restricted:
+                create_packing = wlb.field.createStencilRestrictedPackInfo
         else:
             assert target == 'gpu'
             create_packing = wlb.cuda.createPackInfo if buffered else wlb.cuda.createMPIDatatypeInfo
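To spell out when the new flag actually matters: it only swaps the pack info on the unbuffered CPU path; the buffered CPU path and both GPU paths are unchanged. A standalone restatement of the selection logic above, with strings standing in for the waLBerla factory callables:

```python
# Restates the branch structure of synchronization_function for clarity only.
def select_packing(target, buffered, stencil_restricted):
    if target == 'cpu':
        if buffered:
            return 'wlb.field.createPackInfo'
        return ('wlb.field.createStencilRestrictedPackInfo' if stencil_restricted
                else 'wlb.field.createMPIDatatypeInfo')
    assert target == 'gpu'   # stencil_restricted is ignored on the GPU path
    return 'wlb.cuda.createPackInfo' if buffered else 'wlb.cuda.createMPIDatatypeInfo'

assert select_packing('cpu', False, True) == 'wlb.field.createStencilRestrictedPackInfo'
assert select_packing('cpu', True, True) == 'wlb.field.createPackInfo'   # no effect when buffered
```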