<html>
<head>
<base href="https://bugs.llvm.org/">
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW - The number of SIMD loads increases unnecessarily"
href="https://bugs.llvm.org/show_bug.cgi?id=47558">47558</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>The number of SIMD loads increases unnecessarily
</td>
</tr>
<tr>
<th>Product</th>
<td>libraries
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>Linux
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>enhancement
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>Scalar Optimizations
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>kazu@google.com
</td>
</tr>
<tr>
<th>CC</th>
<td>llvm-bugs@lists.llvm.org
</td>
</tr></table>
<p>
<div>
<pre>This patch:
<a href="https://github.com/llvm/llvm-project/commit/8fb055932c085da21f3b721995a06f42006744bd">https://github.com/llvm/llvm-project/commit/8fb055932c085da21f3b721995a06f42006744bd</a>
increases the number of memory loads in certain cases.
Consider:
target triple = "x86_64-unknown-linux-gnu"
declare dso_local float* @getscaleptr() #0
define void @foo(<2 x float>* nonnull %resultptr, <2 x float>* nonnull %opptr)
{
%scaleptr = call nonnull align 16 dereferenceable(64) float* @getscaleptr()
%op = load <2 x float>, <2 x float>* %opptr, align 4
%scale = load float, float* %scaleptr, align 16
%op0 = extractelement <2 x float> %op, i32 0
%product0 = fmul float %op0, %scale
%result0 = insertelement <2 x float> undef, float %product0, i32 0
%op1 = extractelement <2 x float> %op, i32 1
%product1 = fmul float %op1, %scale
%result1 = insertelement <2 x float> %result0, float %product1, i32 1
store <2 x float> %result1, <2 x float>* %resultptr, align 8
ret void
}
This testcase multiplies a <2 x float> value by a scalar float value
and stores the result back to memory.
Compile like so:
$ clang -O2 -msse4.2 -S bug.ll -o bug.s
Comparing the output with and without the patch, I get the following assembly diff:
pushq %r14
pushq %rbx
pushq %rax
movq %rsi, %rbx
movq %rdi, %r14
callq getscaleptr
movsd (%rbx), %xmm0 # xmm0 = mem[0],zero
- movss (%rax), %xmm1 # xmm1 = mem[0],zero,zero,zero
- movsldup %xmm1, %xmm1 # xmm1 = xmm1[0,0,2,2]
+ movaps (%rax), %xmm1
+ insertps $16, (%rax), %xmm1 # xmm1 = xmm1[0],mem[0],xmm1[2,3]
mulps %xmm0, %xmm1
movlps %xmm1, (%r14)
addq $8, %rsp
popq %rbx
popq %r14
retq
Note that the patch replaces the movss/movsldup pair with movaps/insertps,
and the insertps reads again from the same address as the movaps, so the
scale value is now loaded twice.
Here is the "IR Dump After Optimize scalar/vector ops".
Without the patch:
%scaleptr = tail call nonnull align 16 dereferenceable(64) float* @getscaleptr()
%op = load <2 x float>, <2 x float>* %opptr, align 4
%scale = load float, float* %scaleptr, align 16
%1 = insertelement <2 x float> undef, float %scale, i32 0
%2 = insertelement <2 x float> %1, float %scale, i32 1
%3 = fmul <2 x float> %op, %2
%4 = extractelement <2 x float> %3, i32 0
%result0 = insertelement <2 x float> undef, float %4, i32 0
%5 = extractelement <2 x float> %3, i32 1
%result1 = insertelement <2 x float> %result0, float %5, i32 1
store <2 x float> %result1, <2 x float>* %resultptr, align 8
ret void
With the patch:
%scaleptr = tail call nonnull align 16 dereferenceable(64) float* @getscaleptr()
%op = load <2 x float>, <2 x float>* %opptr, align 4
%1 = bitcast float* %scaleptr to <4 x float>*
%2 = load <4 x float>, <4 x float>* %1, align 16
%3 = shufflevector <4 x float> %2, <4 x float> undef, <2 x i32> <i32 0, i32 1>
%scale = load float, float* %scaleptr, align 16
%4 = insertelement <2 x float> %3, float %scale, i32 1
%5 = fmul <2 x float> %op, %4
%6 = extractelement <2 x float> %5, i32 0
%result0 = insertelement <2 x float> undef, float %6, i32 0
%7 = extractelement <2 x float> %5, i32 1
%result1 = insertelement <2 x float> %result0, float %7, i32 1
store <2 x float> %result1, <2 x float>* %resultptr, align 8
ret void
Notice the three loads with the patch: the <2 x float> load of %op, the new
<4 x float> load through the bitcast pointer, and the now-redundant scalar
load of %scale.
Here is the final LLVM IR.
Without the patch:
%scaleptr = tail call nonnull align 16 dereferenceable(64) float* @getscaleptr()
%op = load <2 x float>, <2 x float>* %opptr, align 4
%scale = load float, float* %scaleptr, align 16
%1 = insertelement <2 x float> undef, float %scale, i32 0
%2 = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
%3 = fmul <2 x float> %op, %2
store <2 x float> %3, <2 x float>* %resultptr, align 8
ret void
With the patch:
%scaleptr = tail call nonnull align 16 dereferenceable(64) float* @getscaleptr()
%op = load <2 x float>, <2 x float>* %opptr, align 4
%1 = bitcast float* %scaleptr to <4 x float>*
%2 = load <4 x float>, <4 x float>* %1, align 16
%3 = shufflevector <4 x float> %2, <4 x float> undef, <2 x i32> <i32 0, i32 undef>
%scale = load float, float* %scaleptr, align 16
%4 = insertelement <2 x float> %3, float %scale, i32 1
%5 = fmul <2 x float> %op, %4
store <2 x float> %5, <2 x float>* %resultptr, align 8
ret void</pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>