<!doctype html>
<html lang="en">

    <head>

      <!-- Declare the encoding first so the parser never has to guess it. -->
      <meta charset="utf-8">

      <!-- Title built from the bug ID and summary shown in the table below. -->
      <title>[Bug 48475] Redundant stores inside reduction loop bodies</title>

      <!-- Relative URLs in this message resolve against the bug tracker. -->
      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - Redundant stores inside reduction loop bodies"

   href="https://bugs.llvm.org/show_bug.cgi?id=48475">48475</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>Redundant stores inside reduction loop bodies

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>OpenMP

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>unspecified

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Windows NT

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>enhancement

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Clang Compiler Support

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedclangbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>itay.bookstein@nextsilicon.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr></table>

      <p>

        <div>

        <pre>The following sample generates code where the per-thread reduction loop

performs a store to a stack slot *every iteration*, rather than once after the

entire loop. The address of the stack slot is later taken to pass to the

inter-thread reduction, which is probably what causes this optimization

opportunity to elude the escape analysis.

The code was compiled using clang++-12 Ubuntu focal unstable branch, using the

command-line:

clang++-12 -fopenmp -O3 main.cpp -o main

// main.cpp

#include &lt;cstdio&gt;

#include &lt;memory&gt;

double compute_dot_product(size_t n, double *xv, double *yv)

{

  double local = 0.0;

  #pragma omp parallel for reduction (+:local)

  for (size_t i = 0; i &lt; n; i++) local += xv[i] * yv[i];

  return local;

}

int main(int argc, char **argv)

{

  constexpr size_t n = 0x1000;

  auto xv = std::make_unique&lt;double[]&gt;(n);

  auto yv = std::make_unique&lt;double[]&gt;(n);

  double result = compute_dot_product(n, xv.get(), yv.get());

  printf("result = %e\n", result);

  return 0;

}

// Disassembly excerpt from objdump -d --no-show-raw-insn main,

function &lt;...&gt;omp_outlined&lt;...&gt;

4012f0: movsd (%rdx,%rcx,8),%xmm1 ; <--- Non-unrolled loop head

4012f5: mulsd (%rsi,%rcx,8),%xmm1

4012fa: addsd %xmm1,%xmm0

4012fe: movsd %xmm0,(%rsp) ; <--- Un-hoisted store every loop iteration

401303: add $0x1,%rcx

401307: add $0xffffffffffffffff,%rax

40130b: jne 4012f0 &lt;.omp_outlined.+0xc0&gt;  ; &lt;--- Non-unrolled loop latch

40130d: cmp $0x3,%rbp

401311: jb 40138b &lt;.omp_outlined.+0x15b&gt; ; &lt;--- x4-unrolled loop guard

401313: sub %rcx,%rdi ; <--- x4-unrolled loop preheader

401316: lea (%rsi,%rcx,8),%rsi

40131a: add $0x18,%rsi

40131e: lea (%rdx,%rcx,8),%rcx

401322: add $0x18,%rcx

401326: mov $0xffffffffffffffff,%rdx

40132d: nopl (%rax)

401330: movsd -0x10(%rcx,%rdx,8),%xmm1 ; <--- x4-unrolled loop head

401336: mulsd -0x10(%rsi,%rdx,8),%xmm1

40133c: addsd %xmm0,%xmm1

401340: movsd %xmm1,(%rsp) ; <--- Un-hoisted store every loop iteration

401345: movsd -0x8(%rcx,%rdx,8),%xmm0

40134b: mulsd -0x8(%rsi,%rdx,8),%xmm0

401351: addsd %xmm1,%xmm0

401355: movsd %xmm0,(%rsp)  ; <--- Un-hoisted store every loop iteration

40135a: movsd (%rcx,%rdx,8),%xmm1

40135f: mulsd (%rsi,%rdx,8),%xmm1

401364: addsd %xmm0,%xmm1

401368: movsd %xmm1,(%rsp)  ; <--- Un-hoisted store every loop iteration

40136d: movsd 0x8(%rcx,%rdx,8),%xmm0

401373: mulsd 0x8(%rsi,%rdx,8),%xmm0

401379: addsd %xmm1,%xmm0

40137d: movsd %xmm0,(%rsp)  ; <--- Un-hoisted store every loop iteration

401382: add $0x4,%rdx

401386: cmp %rdx,%rdi

401389: jne 401330 &lt;.omp_outlined.+0x100&gt; ; &lt;--- x4-unrolled loop latch

40138b: mov $0x402028,%edi

401390: mov %r14d,%esi

401393: callq 401040 &lt;__kmpc_for_static_fini@plt&gt;

401398: mov %rsp,%rax ; <--- Load address of the stack slot to pass to

reduction logic

40139b: mov %rax,0x20(%rsp)</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>