[llvm-bugs] [Bug 48475] New: Redundant stores inside reduction loop bodies
via llvm-bugs
llvm-bugs at lists.llvm.org
Thu Dec 10 12:48:11 PST 2020
https://bugs.llvm.org/show_bug.cgi?id=48475
Bug ID: 48475
Summary: Redundant stores inside reduction loop bodies
Product: OpenMP
Version: unspecified
Hardware: PC
OS: Windows NT
Status: NEW
Severity: enhancement
Priority: P
Component: Clang Compiler Support
Assignee: unassignedclangbugs at nondot.org
Reporter: itay.bookstein at nextsilicon.com
CC: llvm-bugs at lists.llvm.org
The following sample generates code where the per-thread reduction loop
performs a store to a stack slot *every iteration*, rather than once after the
entire loop. The address of the stack slot is later taken to pass to the
inter-thread reduction, which is probably what causes this optimization
opportunity to elude the escape analysis.
The code was compiled using clang++-12 Ubuntu focal unstable branch, using the
command-line:
clang++-12 -fopenmp -O3 main.cpp -o main
// main.cpp
#include <cstdio>
#include <memory>
// Returns the sum of xv[i] * yv[i] for i in [0, n).
// The accumulation runs under an OpenMP `parallel for` with a `+` reduction
// on `local`; this is the reduction loop that the report is about.
double compute_dot_product(size_t n, double *xv, double *yv)
{
// Reduction variable named in the pragma below; starts at 0.0.
double local = 0.0;
#pragma omp parallel for reduction (+:local)
for (size_t i = 0; i < n; i++) local += xv[i] * yv[i];
return local;
}
// Driver for the reproducer: creates two arrays of n = 0x1000 doubles, passes
// them to compute_dot_product, and prints the returned value with "%e".
// argc and argv are not used.
int main(int argc, char **argv)
{
constexpr size_t n = 0x1000;
// Arrays are owned by std::unique_ptr; compute_dot_product receives the raw
// pointers obtained from get().
auto xv = std::make_unique<double[]>(n);
auto yv = std::make_unique<double[]>(n);
double result = compute_dot_product(n, xv.get(), yv.get());
printf("result = %e\n", result);
return 0;
}
// Disassembly excerpt from objdump -d --no-show-raw-insn main, function <...>omp_outlined<...>
4012f0: movsd (%rdx,%rcx,8),%xmm1 ; <--- Non-unrolled loop head
4012f5: mulsd (%rsi,%rcx,8),%xmm1
4012fa: addsd %xmm1,%xmm0
4012fe: movsd %xmm0,(%rsp) ; <--- Un-hoisted store every loop iteration
401303: add $0x1,%rcx
401307: add $0xffffffffffffffff,%rax
40130b: jne 4012f0 <.omp_outlined.+0xc0> ; <--- Non-unrolled loop latch
40130d: cmp $0x3,%rbp
401311: jb 40138b <.omp_outlined.+0x15b> ; <--- x4-unrolled loop guard
401313: sub %rcx,%rdi ; <--- x4-unrolled loop preheader
401316: lea (%rsi,%rcx,8),%rsi
40131a: add $0x18,%rsi
40131e: lea (%rdx,%rcx,8),%rcx
401322: add $0x18,%rcx
401326: mov $0xffffffffffffffff,%rdx
40132d: nopl (%rax)
401330: movsd -0x10(%rcx,%rdx,8),%xmm1 ; <--- x4-unrolled loop head
401336: mulsd -0x10(%rsi,%rdx,8),%xmm1
40133c: addsd %xmm0,%xmm1
401340: movsd %xmm1,(%rsp) ; <--- Un-hoisted store every loop iteration
401345: movsd -0x8(%rcx,%rdx,8),%xmm0
40134b: mulsd -0x8(%rsi,%rdx,8),%xmm0
401351: addsd %xmm1,%xmm0
401355: movsd %xmm0,(%rsp) ; <--- Un-hoisted store every loop iteration
40135a: movsd (%rcx,%rdx,8),%xmm1
40135f: mulsd (%rsi,%rdx,8),%xmm1
401364: addsd %xmm0,%xmm1
401368: movsd %xmm1,(%rsp) ; <--- Un-hoisted store every loop iteration
40136d: movsd 0x8(%rcx,%rdx,8),%xmm0
401373: mulsd 0x8(%rsi,%rdx,8),%xmm0
401379: addsd %xmm1,%xmm0
40137d: movsd %xmm0,(%rsp) ; <--- Un-hoisted store every loop iteration
401382: add $0x4,%rdx
401386: cmp %rdx,%rdi
401389: jne 401330 <.omp_outlined.+0x100> ; <--- x4-unrolled loop latch
40138b: mov $0x402028,%edi
401390: mov %r14d,%esi
401393: callq 401040 <__kmpc_for_static_fini at plt>
401398: mov %rsp,%rax ; <--- Load address of the stack slot to pass to reduction logic
40139b: mov %rax,0x20(%rsp)
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20201210/1a09d435/attachment.html>
More information about the llvm-bugs
mailing list