Invalid Code Generator Output when applying OpenMP and vectorization to the same reduction loop
(At the time of writing, this error never occurs in practice, since #127 (closed) prevents OpenMP pragmas from being inserted in this situation.)
This is the would-be IR of a 1D add-reduction kernel with OpenMP and AVX-512 vectorization:
{
    w_local: float64 = [0.0: float64];
    {
        w_local__1: __m512d = _mm512_set_pd([0.0: float64], [0.0: float64], [0.0: float64], [0.0: float64], [0.0: float64], [0.0: float64], [0.0: float64], [0.0: float64]);
        __ctr_0_simd_stop: const int64 = _size_x_0 - [7: int64];
        __ctr_0_simd_step: const int64 = [8: int64];
        /* BUG: the clause lists w_local, which the loop body never updates;
           the actual accumulator w_local__1 is implicitly shared. */
        #pragma omp parallel for schedule(static) reduction(+: w_local)
        for(ctr_0: int64 = [0: int64]; ctr_0 < __ctr_0_simd_stop; ctr_0 += __ctr_0_simd_step)
        {
            w_local__1 = _mm512_add_pd(w_local__1, _mm512_loadu_pd(&_data_x[ctr_0]));
        }
        w_local = _mm512_horizontal_add_pd(w_local, w_local__1);
        /* First index not covered by the SIMD loop. */
        __ctr_0_trailing_start: const int64 = __ctr_0_simd_stop > [0: int64] ? ((__ctr_0_simd_stop - [1: int64]) / __ctr_0_simd_step + [1: int64]) * __ctr_0_simd_step : [0: int64];
        #pragma omp parallel for schedule(static) reduction(+: w_local)
        for(ctr_0__1: int64 = __ctr_0_trailing_start; ctr_0__1 < _size_x_0; ctr_0__1 += [1: int64])
        {
            w_local = w_local + _data_x[ctr_0__1];
        }
    }
    w[[0: int64]] = w[[0: int64]] + w_local;
}
The OpenMP directive above the vectorized loop lists reduction(+: w_local), but that variable is never updated in the loop body. Its vectorized cousin, w_local__1, is not registered with OpenMP and is therefore implicitly shared, which can lead to race conditions on concurrent writes. A built-in reduction() clause cannot be applied to a variable of vector type, so the clause cannot simply be redirected to w_local__1 instead.
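OpenMP's built-in reduction operators only accept variables of scalar arithmetic type; user-defined reductions (declare reduction, OpenMP 4.0) are one conceivable way out. The following is a minimal hand-written sketch of what safe output could look like under that approach, assuming the target compilers accept a user-defined reduction over __m512d; all names are illustrative, and nothing like this is currently emitted by the generator:

#include <immintrin.h>

/* Hypothetical user-defined reduction over the vector type itself:
   each thread then gets a private, zero-initialized copy of the
   __m512d accumulator, and OpenMP combines the copies at the end. */
#pragma omp declare reduction(vec_add : __m512d : \
        omp_out = _mm512_add_pd(omp_out, omp_in)) \
    initializer(omp_priv = _mm512_setzero_pd())

double sum_1d(const double *restrict x, long long n)
{
    __m512d acc = _mm512_setzero_pd();
    const long long stop = n - 7;

    /* The vector accumulator itself carries the reduction. */
    #pragma omp parallel for schedule(static) reduction(vec_add: acc)
    for (long long i = 0; i < stop; i += 8)
        acc = _mm512_add_pd(acc, _mm512_loadu_pd(&x[i]));

    /* Horizontal add of the eight lanes into the scalar result. */
    double w = _mm512_reduce_add_pd(acc);

    /* Scalar trailing loop, starting at the first index the SIMD
       loop did not cover. */
    for (long long i = stop > 0 ? ((stop - 1) / 8 + 1) * 8 : 0; i < n; ++i)
        w += x[i];

    return w;
}

Alternatively, the generator could keep only the scalar in the reduction clause and fold the vector lanes into it inside the parallel construct, which is essentially the shape of the valid nested case below.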
This is only an issue when the same loop is both OpenMP-instrumented and vectorized. In a larger loop nest, where the outer loop is parallelized and the inner loop is vectorized, the generated code is valid:
void kernel (double * RESTRICT const _data_f, double * RESTRICT const _data_u, const int64 _size_f_0, const int64 _size_f_1, const int64 _stride_f_0, const int64 _stride_u_0, const float64 h, double * const residual_acc)
{
    residual_acc_local: float64 = 0.0;
    #pragma omp parallel for schedule(static) reduction(+: residual_acc_local)
    for(ctr_0: int64 = 1; ctr_0 < _size_f_0 - 1; ctr_0 += 1)
    {
        {
            /* Per-thread vector accumulator: declared inside the parallel
               loop body, hence private. */
            residual_acc_local__1: __m512d = _mm512_set_pd(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
            __ctr_1_simd_stop: const int64 = _size_f_1 - 1 - 7;
            __ctr_1_simd_step: const int64 = 8;
            for(ctr_1: int64 = 1; ctr_1 < __ctr_1_simd_stop; ctr_1 += __ctr_1_simd_step)
            {
                residual_acc_local__1 = _mm512_add_pd(residual_acc_local__1, _mm512_add_pd(_mm512_mul_pd(_mm512_set1_pd(-1.0), _mm512_loadu_pd(&_data_f[ctr_0 * _stride_f_0 + ctr_1])), _mm512_mul_pd(_mm512_div_pd(_mm512_set1_pd(1.0), _mm512_mul_pd(_mm512_set1_pd(h), _mm512_set1_pd(h))), _mm512_add_pd(_mm512_sub_pd(_mm512_sub_pd(_mm512_sub_pd(_mm512_mul_pd(_mm512_set1_pd(-1.0), _mm512_loadu_pd(&_data_u[(ctr_0 + 1) * _stride_u_0 + ctr_1])), _mm512_loadu_pd(&_data_u[ctr_0 * _stride_u_0 + (ctr_1 + 1)])), _mm512_loadu_pd(&_data_u[ctr_0 * _stride_u_0 + (ctr_1 + -1)])), _mm512_loadu_pd(&_data_u[(ctr_0 + -1) * _stride_u_0 + ctr_1])), _mm512_mul_pd(_mm512_set1_pd(4.0), _mm512_loadu_pd(&_data_u[ctr_0 * _stride_u_0 + ctr_1]))))));
            }
            /* The horizontal add feeds the scalar that is named in the
               reduction clause. */
            residual_acc_local = _mm512_horizontal_add_pd(residual_acc_local, residual_acc_local__1);
            __ctr_1_trailing_start: const int64 = __ctr_1_simd_stop > 1 ? ((__ctr_1_simd_stop - 1 - 1) / __ctr_1_simd_step + 1) * __ctr_1_simd_step + 1 : 1;
            for(ctr_1__1: int64 = __ctr_1_trailing_start; ctr_1__1 < _size_f_1 - 1; ctr_1__1 += 1)
            {
                residual_acc_local = residual_acc_local + (-1.0 * _data_f[ctr_0 * _stride_f_0 + ctr_1__1] + 1.0 / (h * h) * (-1.0 * _data_u[(ctr_0 + 1) * _stride_u_0 + ctr_1__1] - _data_u[ctr_0 * _stride_u_0 + (ctr_1__1 + 1)] - _data_u[ctr_0 * _stride_u_0 + (ctr_1__1 + -1)] - _data_u[(ctr_0 + -1) * _stride_u_0 + ctr_1__1] + 4.0 * _data_u[ctr_0 * _stride_u_0 + ctr_1__1]));
            }
        }
    }
    residual_acc[0] = residual_acc[0] + residual_acc_local;
}
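For clarity, the property that makes this valid can be distilled into plain C roughly as follows (again a hand-written sketch, not generator output; the function name and 2D layout are illustrative):

#include <immintrin.h>

/* Only the scalar appears in the reduction clause; the vector
   accumulator is declared inside the parallel loop body and is
   therefore private to each thread. */
double residual_sum(const double *restrict a, long long rows, long long cols)
{
    double total = 0.0;

    #pragma omp parallel for schedule(static) reduction(+: total)
    for (long long r = 0; r < rows; ++r) {
        __m512d vacc = _mm512_setzero_pd(); /* per-thread, re-declared each iteration */
        const long long stop = cols - 7;

        for (long long c = 0; c < stop; c += 8)
            vacc = _mm512_add_pd(vacc, _mm512_loadu_pd(&a[r * cols + c]));

        /* Folding the lanes into the scalar is covered by the
           reduction clause, so no race occurs. */
        total += _mm512_reduce_add_pd(vacc);

        /* Scalar remainder of the row. */
        for (long long c = stop > 0 ? ((stop - 1) / 8 + 1) * 8 : 0; c < cols; ++c)
            total += a[r * cols + c];
    }
    return total;
}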