diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py index ab4f8f84e628577d0b2343744e74954b541d4ce0..4ac13a0656d3771a67f51ef627386986d5059428 100644 --- a/pystencils/cpu/cpujit.py +++ b/pystencils/cpu/cpujit.py @@ -59,6 +59,7 @@ from appdirs import user_cache_dir, user_config_dir from pystencils import FieldType from pystencils.backends.cbackend import generate_c, get_headers +from pystencils.data_types import cast_func, VectorType from pystencils.include import get_pystencils_include_path from pystencils.kernel_wrapper import KernelWrapper from pystencils.utils import atomic_file_write, file_handle_for_atomic_write, recursive_dict_update @@ -266,7 +267,6 @@ type_mapping = { np.complex128: (('PyComplex_RealAsDouble', 'PyComplex_ImagAsDouble'), 'ComplexDouble'), } - template_extract_scalar = """ PyObject * obj_{name} = PyDict_GetItemString(kwargs, "{name}"); if( obj_{name} == NULL) {{ PyErr_SetString(PyExc_TypeError, "Keyword argument '{name}' missing"); return NULL; }}; @@ -357,7 +357,7 @@ def equal_size_check(fields): return template_size_check.format(cond=cond) -def create_function_boilerplate_code(parameter_info, name, insert_checks=True): +def create_function_boilerplate_code(parameter_info, name, ast_node, insert_checks=True): pre_call_code = "" parameters = [] post_call_code = "" @@ -375,6 +375,25 @@ def create_function_boilerplate_code(parameter_info, name, insert_checks=True): np_dtype = field.dtype.numpy_dtype item_size = np_dtype.itemsize + aligned = False + if ast_node.assignments: + aligned = any([a.lhs.args[2] for a in ast_node.assignments + if hasattr(a, 'lhs') and isinstance(a.lhs, cast_func) + and hasattr(a.lhs, 'dtype') and isinstance(a.lhs.dtype, VectorType)]) + + if ast_node.instruction_set and aligned: + byte_width = ast_node.instruction_set['width'] * item_size + offset = max(max(ast_node.ghost_layers)) * item_size + offset_cond = f"(((uintptr_t) buffer_{field.name}.buf) + {offset}) % {byte_width} == 0" + + message = str(offset) + ". This is probably due to a different number of ghost_layers chosen for " \ + "the arrays and the kernel creation. If the number of ghost layers for " \ + "the kernel creation is not specified it will choose a suitable value " \ + "automatically. This value might not " \ + "be compatible with the allocated arrays." + pre_call_code += template_check_array.format(cond=offset_cond, what="offset", name=field.name, + expected=message) + if (np_dtype.isbuiltin and FieldType.is_generic(field) and not np.issubdtype(field.dtype.numpy_dtype, np.complexfloating)): dtype_cond = "buffer_{name}.format[0] == '{format}'".format(name=field.name, @@ -418,7 +437,7 @@ def create_function_boilerplate_code(parameter_info, name, insert_checks=True): extract_function_imag=extract_function[1], target_type=target_type, real_type="float" if target_type == "ComplexFloat" - else "double", + else "double", name=param.symbol.name) else: pre_call_code += template_extract_scalar.format(extract_function=extract_function, @@ -481,12 +500,16 @@ class ExtensionModuleCode: self._ast_nodes = [] self._function_names = [] self._custom_backend = custom_backend + self._code_string = str() + self._code_hash = None def add_function(self, ast, name=None): self._ast_nodes.append(ast) self._function_names.append(name if name is not None else ast.function_name) - def write_to_file(self, restrict_qualifier, function_prefix, file): + def create_code_string(self, restrict_qualifier, function_prefix): + self._code_string = str() + headers = {'<math.h>', '<stdint.h>'} for ast in self._ast_nodes: headers.update(get_headers(ast)) @@ -495,19 +518,29 @@ class ExtensionModuleCode: header_list.insert(0, '"Python.h"') includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list]) - print(includes, file=file) - print("\n", file=file) - print(f"#define RESTRICT {restrict_qualifier}", file=file) - print(f"#define FUNC_PREFIX {function_prefix}", file=file) - print("\n", file=file) + self._code_string += includes + self._code_string += "\n" + self._code_string += f"#define RESTRICT {restrict_qualifier} \n" + self._code_string += f"#define FUNC_PREFIX {function_prefix}" + self._code_string += "\n" for ast, name in zip(self._ast_nodes, self._function_names): old_name = ast.function_name ast.function_name = "kernel_" + name - print(generate_c(ast, custom_backend=self._custom_backend), file=file) - print(create_function_boilerplate_code(ast.get_parameters(), name), file=file) + self._code_string += generate_c(ast, custom_backend=self._custom_backend) + self._code_string += create_function_boilerplate_code(ast.get_parameters(), name, ast) ast.function_name = old_name - print(create_module_boilerplate_code(self.module_name, self._function_names), file=file) + + self._code_hash = "mod_" + hashlib.sha256(self._code_string.encode()).hexdigest() + self._code_string += create_module_boilerplate_code(self._code_hash, self._function_names) + + def get_hash_of_code(self): + assert self._code_string, "The code must be generated first" + return self._code_hash + + def write_to_file(self, file): + assert self._code_string, "The code must be generated first" + print(self._code_string, file=file) def compile_module(code, code_hash, base_dir): @@ -515,12 +548,10 @@ def compile_module(code, code_hash, base_dir): extra_flags = ['-I' + get_paths()['include'], '-I' + get_pystencils_include_path()] if compiler_config['os'].lower() == 'windows': - function_prefix = '__declspec(dllexport)' lib_suffix = '.pyd' object_suffix = '.obj' windows = True else: - function_prefix = '' lib_suffix = '.so' object_suffix = '.o' windows = False @@ -531,7 +562,7 @@ def compile_module(code, code_hash, base_dir): if not os.path.exists(object_file): with file_handle_for_atomic_write(src_file) as f: - code.write_to_file(compiler_config['restrict_qualifier'], function_prefix, f) + code.write_to_file(f) if windows: compile_cmd = ['cl.exe', '/c', '/EHsc'] + compiler_config['flags'].split() @@ -564,11 +595,16 @@ def compile_module(code, code_hash, base_dir): def compile_and_load(ast, custom_backend=None): cache_config = get_cache_config() - code_hash_str = "mod_" + hashlib.sha256(generate_c(ast, dialect='c', - custom_backend=custom_backend).encode()).hexdigest() - code = ExtensionModuleCode(module_name=code_hash_str, custom_backend=custom_backend) + + compiler_config = get_compiler_config() + function_prefix = '__declspec(dllexport)' if compiler_config['os'].lower() == 'windows' else '' + + code = ExtensionModuleCode(custom_backend=custom_backend) code.add_function(ast, ast.function_name) + code.create_code_string(compiler_config['restrict_qualifier'], function_prefix) + code_hash_str = code.get_hash_of_code() + if cache_config['object_cache'] is False: with TemporaryDirectory() as base_dir: lib_file = compile_module(code, code_hash_str, base_dir) diff --git a/pystencils_tests/test_conditional_vec.py b/pystencils_tests/test_conditional_vec.py index 5f22a230d501f1df26d605beba6fad5d1d251971..0ef55bdf4016f6ada53270f68c05e0c3f0210ea7 100644 --- a/pystencils_tests/test_conditional_vec.py +++ b/pystencils_tests/test_conditional_vec.py @@ -13,7 +13,7 @@ from pystencils.cpu.vectorization import vec_all, vec_any def test_vec_any(): data_arr = np.zeros((15, 15)) - data_arr[3:9, 2:7] = 1.0 + data_arr[3:9, 1] = 1.0 data = ps.fields("data: double[2D]", data=data_arr) c = [ @@ -22,11 +22,15 @@ def test_vec_any(): ps.Assignment(data.center(), 2.0) ])) ] + instruction_set = get_supported_instruction_sets()[-1] ast = ps.create_kernel(c, target='cpu', - cpu_vectorize_info={'instruction_set': get_supported_instruction_sets()[-1]}) + cpu_vectorize_info={'instruction_set': instruction_set}) kernel = ast.compile() kernel(data=data_arr) - np.testing.assert_equal(data_arr[3:9, 0:8], 2.0) + + width = ast.instruction_set['width'] + + np.testing.assert_equal(data_arr[3:9, 0:width], 2.0) @pytest.mark.skipif(not get_supported_instruction_sets(), reason='cannot detect CPU instruction set') diff --git a/pystencils_tests/test_vectorization_specific.py b/pystencils_tests/test_vectorization_specific.py index 9d9d4ac50ff2160ad2823a01cf448a4f17205fe6..4476e5bf4f68b82056ecb3242237b1c3ce879191 100644 --- a/pystencils_tests/test_vectorization_specific.py +++ b/pystencils_tests/test_vectorization_specific.py @@ -4,7 +4,8 @@ import numpy as np import sympy as sp import pystencils as ps -from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets +from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set +from pystencils.data_types import cast_func, VectorType supported_instruction_sets = get_supported_instruction_sets() if get_supported_instruction_sets() else [] @@ -49,3 +50,30 @@ def test_vectorized_abs(instruction_set, dtype): dst = np.zeros_like(arr) func(g=dst, f=arr) np.testing.assert_equal(np.sum(dst[1:-1, 1:-1]), 2 ** 2 * 2 ** 3) + + +@pytest.mark.parametrize('dtype', ('float', 'double')) +@pytest.mark.parametrize('instruction_set', supported_instruction_sets) +@pytest.mark.parametrize('gl_field, gl_kernel', [(1, 0), (0, 1), (1, 1)]) +def test_alignment_and_correct_ghost_layers(gl_field, gl_kernel, instruction_set, dtype): + itemsize = 8 if dtype == 'double' else 4 + alignment = get_vector_instruction_set(dtype, instruction_set)['width'] * itemsize + dtype = np.float64 if dtype == 'double' else np.float32 + + domain_size = (128, 128) + dh = ps.create_data_handling(domain_size, periodicity=(True, True), default_target='cpu') + src = dh.add_array("src", values_per_cell=1, dtype=dtype, ghost_layers=gl_field, alignment=alignment) + dh.fill(src.name, 1.0, ghost_layers=True) + dst = dh.add_array("dst", values_per_cell=1, dtype=dtype, ghost_layers=gl_field, alignment=alignment) + dh.fill(dst.name, 1.0, ghost_layers=True) + + update_rule = ps.Assignment(dst[0, 0], src[0, 0]) + opt = {'instruction_set': instruction_set, 'assume_aligned': True, + 'nontemporal': True, 'assume_inner_stride_one': True} + ast = ps.create_kernel(update_rule, target=dh.default_target, cpu_vectorize_info=opt, ghost_layers=gl_kernel) + kernel = ast.compile() + if gl_kernel != gl_field: + with pytest.raises(ValueError): + dh.run_kernel(kernel) + else: + dh.run_kernel(kernel) \ No newline at end of file