[llvm-bugs] [Bug 48475] New: Redundant stores inside reduction loop bodies

Thu Dec 10 12:48:11 PST 2020

https://bugs.llvm.org/show_bug.cgi?id=48475

            Bug ID: 48475
           Summary: Redundant stores inside reduction loop bodies
           Product: OpenMP
           Version: unspecified
          Hardware: PC
                OS: Windows NT
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Clang Compiler Support
          Assignee: unassignedclangbugs at nondot.org
          Reporter: itay.bookstein at nextsilicon.com
                CC: llvm-bugs at lists.llvm.org

The following sample generates code where the per-thread reduction loop
performs a store to a stack slot *every iteration*, rather than once after the
entire loop. The address of the stack slot is later taken to pass to the
inter-thread reduction, which is probably what causes this optimization
opportunity to elude the escape analysis.

The code was compiled using clang++-12 Ubuntu focal unstable branch, using the
command-line:
clang++-12 -fopenmp -O3 main.cpp -o main

// main.cpp
#include <cstdio>
#include <memory>

double compute_dot_product(size_t n, double *xv, double *yv)
{
  double local = 0.0;
  #pragma omp parallel for reduction (+:local)
  for (size_t i = 0; i < n; i++) local += xv[i] * yv[i];
  return local;
}

int main(int argc, char **argv)
{
  constexpr size_t n = 0x1000;
  auto xv = std::make_unique<double[]>(n);
  auto yv = std::make_unique<double[]>(n);

  double result = compute_dot_product(n, xv.get(), yv.get());
  printf("result = %e\n", result);
  return 0;
}

// Disassembly excerpt from objdump -d --no-show-raw-insn main,
function <...>omp_outlined<...>
4012f0: movsd (%rdx,%rcx,8),%xmm1 ; <--- Non-unrolled loop head
4012f5: mulsd (%rsi,%rcx,8),%xmm1
4012fa: addsd %xmm1,%xmm0
4012fe: movsd %xmm0,(%rsp) ; <--- Un-hoisted store every loop iteration
401303: add $0x1,%rcx
401307: add $0xffffffffffffffff,%rax
40130b: jne 4012f0 <.omp_outlined.+0xc0>  ; <--- Non-unrolled loop latch
40130d: cmp $0x3,%rbp
401311: jb 40138b <.omp_outlined.+0x15b> ; <--- x4-unrolled loop guard
401313: sub %rcx,%rdi ; <--- x4-unrolled loop preheader
401316: lea (%rsi,%rcx,8),%rsi
40131a: add $0x18,%rsi
40131e: lea (%rdx,%rcx,8),%rcx
401322: add $0x18,%rcx
401326: mov $0xffffffffffffffff,%rdx
40132d: nopl (%rax)
401330: movsd -0x10(%rcx,%rdx,8),%xmm1 ; <--- x4-unrolled loop head
401336: mulsd -0x10(%rsi,%rdx,8),%xmm1
40133c: addsd %xmm0,%xmm1
401340: movsd %xmm1,(%rsp) ; <--- Un-hoisted store every loop iteration
401345: movsd -0x8(%rcx,%rdx,8),%xmm0
40134b: mulsd -0x8(%rsi,%rdx,8),%xmm0
401351: addsd %xmm1,%xmm0
401355: movsd %xmm0,(%rsp)  ; <--- Un-hoisted store every loop iteration
40135a: movsd (%rcx,%rdx,8),%xmm1
40135f: mulsd (%rsi,%rdx,8),%xmm1
401364: addsd %xmm0,%xmm1
401368: movsd %xmm1,(%rsp)  ; <--- Un-hoisted store every loop iteration
40136d: movsd 0x8(%rcx,%rdx,8),%xmm0
401373: mulsd 0x8(%rsi,%rdx,8),%xmm0
401379: addsd %xmm1,%xmm0
40137d: movsd %xmm0,(%rsp)  ; <--- Un-hoisted store every loop iteration
401382: add $0x4,%rdx
401386: cmp %rdx,%rdi
401389: jne 401330 <.omp_outlined.+0x100> ; <--- x4-unrolled loop latch
40138b: mov $0x402028,%edi
401390: mov %r14d,%esi
401393: callq 401040 <__kmpc_for_static_fini at plt>
401398: mov %rsp,%rax ; <--- Load address of the stack slot to pass to
reduction logic
40139b: mov %rax,0x20(%rsp)

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20201210/1a09d435/attachment.html>