diff --git a/CMakeLists.txt b/CMakeLists.txt index bdaf0ea9ac8c7531815f8f27bad3943eec566191..f3ff04c9e922cf9fbc6035f3353a09ba85205a25 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -357,12 +357,26 @@ endif() # architecture optimization if( WALBERLA_OPTIMIZE_FOR_LOCALHOST ) if( WALBERLA_CXX_COMPILER_IS_GNU OR WALBERLA_CXX_COMPILER_IS_INTEL OR WALBERLA_CXX_COMPILER_IS_CLANG ) - add_flag ( CMAKE_CXX_FLAGS "-march=native" ) - add_flag ( CMAKE_C_FLAGS "-march=native" ) + if( CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64" ) + # no -march=native available on this compiler, but there is currently only one such processor + else() + add_flag ( CMAKE_CXX_FLAGS "-march=native" ) + add_flag ( CMAKE_C_FLAGS "-march=native" ) + endif() + if( WALBERLA_CXX_COMPILER_IS_INTEL ) add_flag ( CMAKE_CXX_FLAGS "-xhost" ) add_flag ( CMAKE_C_FLAGS "-xhost" ) endif() + + if( EXISTS "/proc/sys/abi/sve_default_vector_length" ) + # the kernel reports the default SVE vector length in bytes (followed by a newline), whereas -msve-vector-bits expects bits + file( READ "/proc/sys/abi/sve_default_vector_length" SVE_LENGTH_BYTES ) + string( STRIP "${SVE_LENGTH_BYTES}" SVE_LENGTH_BYTES ) + math( EXPR SVE_LENGTH "${SVE_LENGTH_BYTES} * 8" ) + add_flag ( CMAKE_CXX_FLAGS "-msve-vector-bits=${SVE_LENGTH}" ) + add_flag ( CMAKE_C_FLAGS "-msve-vector-bits=${SVE_LENGTH}" ) + endif() endif() endif() diff --git a/python/pystencils_walberla/codegen.py b/python/pystencils_walberla/codegen.py index 44de5c3fb878cf1b34c7c311ce1cdc1da8daae8a..f1a84798f2af17c254febf5e4c327964db824d91 100644 --- a/python/pystencils_walberla/codegen.py +++ b/python/pystencils_walberla/codegen.py @@ -61,6 +61,8 @@ def generate_sweep(generation_context, class_name, assignments, else: ast = create_staggered_kernel(assignments, **create_kernel_params) ast.assumed_inner_stride_one = create_kernel_params['cpu_vectorize_info']['assume_inner_stride_one'] + ast.nontemporal = create_kernel_params['cpu_vectorize_info']['nontemporal'] + ast.openmp = create_kernel_params['cpu_openmp'] def to_name(f): return f.name if isinstance(f, Field) else f diff --git a/python/pystencils_walberla/jinja_filters.py 
b/python/pystencils_walberla/jinja_filters.py index 7158f81685b5382f7516299af574fd0d374be47a..ad6d317878ed6ce650ad81c7223f6d744cba79a5 100644 --- a/python/pystencils_walberla/jinja_filters.py +++ b/python/pystencils_walberla/jinja_filters.py @@ -238,37 +238,50 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non if param.is_field_pointer: field = param.fields[0] if field.field_type == FieldType.BUFFER: - kernel_call_lines.append("%s %s = %s;" % (param.symbol.dtype, param.symbol.name, param.field_name)) + kernel_call_lines.append(f"{param.symbol.dtype} {param.symbol.name} = {param.field_name};") else: coordinates = get_start_coordinates(field) - actual_gls = "int_c(%s->nrOfGhostLayers())" % (param.field_name, ) + actual_gls = f"int_c({param.field_name}->nrOfGhostLayers())" coord_set = set(coordinates) coord_set = sorted(coord_set, key=lambda e: str(e)) for c in coord_set: - kernel_call_lines.append("WALBERLA_ASSERT_GREATER_EQUAL(%s, -%s);" % - (c, actual_gls)) + kernel_call_lines.append(f"WALBERLA_ASSERT_GREATER_EQUAL({c}, -{actual_gls});") while len(coordinates) < 4: coordinates.append(0) coordinates = tuple(coordinates) - kernel_call_lines.append("%s %s = %s->dataAt(%s, %s, %s, %s);" % - ((param.symbol.dtype, param.symbol.name, param.field_name) + coordinates)) + kernel_call_lines.append(f"{param.symbol.dtype} {param.symbol.name} = {param.field_name}->dataAt" + f"({coordinates[0]}, {coordinates[1]}, {coordinates[2]}, {coordinates[3]});") if ast.assumed_inner_stride_one and field.index_dimensions > 0: - kernel_call_lines.append("WALBERLA_ASSERT_EQUAL(%s->layout(), field::fzyx);" % (param.field_name,)) + kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL({param.field_name}->layout(), field::fzyx);") + if ast.instruction_set and ast.assumed_inner_stride_one: + if ast.nontemporal and ast.openmp and 'cachelineZero' in ast.instruction_set: + kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL((uintptr_t) {field.name}->dataAt(0, 0, 0, 0) %" + 
f"{ast.instruction_set['cachelineSize']}, 0);") + else: + kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL((uintptr_t) {field.name}->dataAt(0, 0, 0, 0) %" + f"{ast.instruction_set['bytes']}, 0);") elif param.is_field_stride: casted_stride = get_field_stride(param) type_str = param.symbol.dtype.base_name - kernel_call_lines.append("const %s %s = %s;" % (type_str, param.symbol.name, casted_stride)) + kernel_call_lines.append(f"const {type_str} {param.symbol.name} = {casted_stride};") elif param.is_field_shape: coord = param.symbol.coordinate field = param.fields[0] type_str = param.symbol.dtype.base_name - shape = "%s(%s)" % (type_str, get_end_coordinates(field)[coord]) + shape = f"{type_str}({get_end_coordinates(field)[coord]})" assert coord < 3 - max_value = "%s->%sSizeWithGhostLayer()" % (field.name, ('x', 'y', 'z')[coord]) - kernel_call_lines.append("WALBERLA_ASSERT_GREATER_EQUAL(%s, %s);" % (max_value, shape)) - kernel_call_lines.append("const %s %s = %s;" % (type_str, param.symbol.name, shape)) + max_value = f"{field.name}->{('x', 'y', 'z')[coord]}SizeWithGhostLayer()" + kernel_call_lines.append(f"WALBERLA_ASSERT_GREATER_EQUAL({max_value}, {shape});") + kernel_call_lines.append(f"const {type_str} {param.symbol.name} = {shape};") if ast.assumed_inner_stride_one and field.index_dimensions > 0: - kernel_call_lines.append("WALBERLA_ASSERT_EQUAL(%s->layout(), field::fzyx);" % (field.name,)) + kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL({field.name}->layout(), field::fzyx);") + if ast.instruction_set and ast.assumed_inner_stride_one: + if ast.nontemporal and ast.openmp and 'cachelineZero' in ast.instruction_set: + kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL((uintptr_t) {field.name}->dataAt(0, 0, 0, 0) %" + f"{ast.instruction_set['cachelineSize']}, 0);") + else: + kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL((uintptr_t) {field.name}->dataAt(0, 0, 0, 0) %" + f"{ast.instruction_set['bytes']}, 0);") call_parameters = ", ".join([p.symbol.name for p in 
ast_params]) diff --git a/src/field/Field.impl.h b/src/field/Field.impl.h index a70bdf813d071293c4c20674ac307a4221db66c5..53e689c1186aa3643b6adda22684863e5da9be6d 100644 --- a/src/field/Field.impl.h +++ b/src/field/Field.impl.h @@ -316,7 +316,13 @@ namespace field { // Automatically select allocator if none was given if ( alloc == nullptr ) { -#ifdef __BIGGEST_ALIGNMENT__ +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0 + const uint_t alignment = __ARM_FEATURE_SVE_BITS/8; +#elif defined(__ARM_FEATURE_SVE) + const uint_t alignment = 64; +#elif defined(__ARM_NEON) + const uint_t alignment = 16; +#elif defined(__BIGGEST_ALIGNMENT__) const uint_t alignment = __BIGGEST_ALIGNMENT__; #elif defined(__AVX512F__) const uint_t alignment = 64;