[cfe-dev] [OpenMP] Redundant store inside reduction loop body

Tue Dec 8 12:28:18 PST 2020

Hey,

I've encountered a peculiar code generation issue in a parallel-for-reduction.
Inside the per-thread reduction loop, I see a store to a stack slot that happens
*every iteration*, clobbering the value written by the previous one. The address
of that stack slot is later taken to pass to the inter-thread
reduction, but the store
is *not* hoisted outside the loop, while I'd expect it to happen just
once outside it.

Attached below are the code example that triggers this, and an annotated x86-64
assembly snippet from the outlined OMP function. The code was compiled using
clang++-12 Ubuntu focal unstable branch, using the command-line:
clang++-12 -fopenmp -O3 main.cpp -o main

I'm wondering whether this is some sort of bug, and in which component.

Regards,
~Itay

// main.cpp
#include <cstdio>
#include <memory>

double compute_dot_product(size_t n, double *xv, double *yv)
{
  double local = 0.0;
  #pragma omp parallel for reduction (+:local)
  for (size_t i = 0; i < n; i++) local += xv[i] * yv[i];
  return local;
}

int main(int argc, char **argv)
{
  constexpr size_t n = 0x1000;
  auto xv = std::make_unique<double[]>(n);
  auto yv = std::make_unique<double[]>(n);

  double result = compute_dot_product(n, xv.get(), yv.get());
  printf("result = %e\n", result);
  return 0;
}

// Disassembly excerpt from objdump -d --no-show-raw-insn main,
function <...>omp_outlined<...>
4012f0: movsd (%rdx,%rcx,8),%xmm1 ; <--- Non-unrolled loop head
4012f5: mulsd (%rsi,%rcx,8),%xmm1
4012fa: addsd %xmm1,%xmm0
4012fe: movsd %xmm0,(%rsp) ; <--- Un-hoisted store every loop iteration
401303: add $0x1,%rcx
401307: add $0xffffffffffffffff,%rax
40130b: jne 4012f0 <.omp_outlined.+0xc0>  ; <--- Non-unrolled loop latch
40130d: cmp $0x3,%rbp
401311: jb 40138b <.omp_outlined.+0x15b> ; <--- x4-unrolled loop guard
401313: sub %rcx,%rdi ; <--- x4-unrolled loop preheader
401316: lea (%rsi,%rcx,8),%rsi
40131a: add $0x18,%rsi
40131e: lea (%rdx,%rcx,8),%rcx
401322: add $0x18,%rcx
401326: mov $0xffffffffffffffff,%rdx
40132d: nopl (%rax)
401330: movsd -0x10(%rcx,%rdx,8),%xmm1 ; <--- x4-unrolled loop head
401336: mulsd -0x10(%rsi,%rdx,8),%xmm1
40133c: addsd %xmm0,%xmm1
401340: movsd %xmm1,(%rsp) ; <--- Weird store #1
401345: movsd -0x8(%rcx,%rdx,8),%xmm0
40134b: mulsd -0x8(%rsi,%rdx,8),%xmm0
401351: addsd %xmm1,%xmm0
401355: movsd %xmm0,(%rsp)  ; <--- Weird store #2
40135a: movsd (%rcx,%rdx,8),%xmm1
40135f: mulsd (%rsi,%rdx,8),%xmm1
401364: addsd %xmm0,%xmm1
401368: movsd %xmm1,(%rsp)  ; <--- Weird store #3
40136d: movsd 0x8(%rcx,%rdx,8),%xmm0
401373: mulsd 0x8(%rsi,%rdx,8),%xmm0
401379: addsd %xmm1,%xmm0
40137d: movsd %xmm0,(%rsp)  ; <--- Weird store #4
401382: add $0x4,%rdx
401386: cmp %rdx,%rdi
401389: jne 401330 <.omp_outlined.+0x100> ; <--- x4-unrolled loop latch
40138b: mov $0x402028,%edi
401390: mov %r14d,%esi
401393: callq 401040 <__kmpc_for_static_fini at plt>
401398: mov %rsp,%rax ; <--- Load address of the stack slot to pass to
reduction logic
40139b: mov %rax,0x20(%rsp)