<html>

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - LLVM produces more shuffles than necessary"

   href="https://bugs.llvm.org/show_bug.cgi?id=50168">50168</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>LLVM produces more shuffles than necessary

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Windows NT

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>enhancement

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Backend: X86

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>david.bolvansky@gmail.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>craig.topper@gmail.com, llvm-bugs@lists.llvm.org, llvm-dev@redking.me.uk, pengfei.wang@intel.com, spatel+llvm@rotateright.com

          </td>

        </tr></table>

      <p>

        <div>

        <pre>From Reddit:

"Clang successfully vectorizes the whole routine, though it uses more shuffles

than necessary. "

#include &lt;stdint.h&gt;

void filter(float *__restrict dst, const float *__restrict src, const float

*__restrict filter) {

    float pipeline[16] {};

    for(int i=0; i&lt;1024; ++i) {

        float x = *src++;

        for(int j=0; j&lt;16; ++j)

            pipeline[j] += filter[j] * x;

        *dst++ = pipeline[0];

        for(int j=0; j&lt;15; ++j)

            pipeline[j] = pipeline[j+1];

        pipeline[15] = 0;

    }

}

Trunk -O3

.LCPI0_0:

  .long 0x00000000 # float 0

filter(float*, float const*, float const*): # @filter(float*, float const*,

float const*)

  movss xmm0, dword ptr [rdx] # xmm0 = mem[0],zero,zero,zero

  movss dword ptr [rsp - 4], xmm0 # 4-byte Spill

  movups xmm10, xmmword ptr [rdx + 20]

  movups xmm11, xmmword ptr [rdx + 4]

  movups xmm12, xmmword ptr [rdx + 36]

  movss xmm13, dword ptr [rdx + 52] # xmm13 = mem[0],zero,zero,zero

  movss xmm14, dword ptr [rdx + 56] # xmm14 = mem[0],zero,zero,zero

  movss xmm15, dword ptr [rdx + 60] # xmm15 = mem[0],zero,zero,zero

  xorps xmm7, xmm7

  xor eax, eax

  xorps xmm1, xmm1

  xorps xmm0, xmm0

  xorps xmm5, xmm5

  xorps xmm9, xmm9

  xorps xmm3, xmm3

.LBB0_1: # =>This Inner Loop Header: Depth=1

  movss xmm6, dword ptr [rsi + 4*rax] # xmm6 = mem[0],zero,zero,zero

  movaps xmm8, xmm6

  mulss xmm8, dword ptr [rsp - 4] # 4-byte Folded Reload

  addss xmm8, xmm7

  movaps xmm4, xmm6

  shufps xmm4, xmm6, 0 # xmm4 = xmm4[0,0],xmm6[0,0]

  movaps xmm2, xmm9

  shufps xmm2, xmm7, 48 # xmm2 = xmm2[0,0],xmm7[3,0]

  shufps xmm7, xmm2, 41 # xmm7 = xmm7[1,2],xmm2[2,0]

  movaps xmm2, xmm4

  mulps xmm2, xmm11

  addps xmm7, xmm2

  movaps xmm2, xmm3

  shufps xmm2, xmm9, 48 # xmm2 = xmm2[0,0],xmm9[3,0]

  shufps xmm9, xmm2, 41 # xmm9 = xmm9[1,2],xmm2[2,0]

  movaps xmm2, xmm4

  mulps xmm2, xmm10

  addps xmm9, xmm2

  shufps xmm1, xmm3, 244 # xmm1 = xmm1[0,1],xmm3[3,3]

  shufps xmm3, xmm1, 41 # xmm3 = xmm3[1,2],xmm1[2,0]

  mulps xmm4, xmm12

  addps xmm3, xmm4

  movaps xmm1, xmm6

  mulss xmm1, xmm13

  addss xmm1, xmm0

  movaps xmm0, xmm6

  mulss xmm0, xmm14

  addss xmm0, xmm5

  movss dword ptr [rdi + 4*rax], xmm8

  mulss xmm6, xmm15

  addss xmm6, dword ptr [rip + .LCPI0_0]

  add rax, 1

  movaps xmm5, xmm6

  cmp eax, 1024

  jne .LBB0_1

  ret

<a href="https://gcc.godbolt.org/z/ofz8qde9e">https://gcc.godbolt.org/z/ofz8qde9e</a>

LLVM 12+ started to produce more shuffles.

LLVM 11 -O3

filter(float*, float const*, float const*): # @filter(float*, float const*,

float const*)

  xorps xmm2, xmm2

  movaps xmmword ptr [rsp - 24], xmm2

  movaps xmmword ptr [rsp - 40], xmm2

  movaps xmmword ptr [rsp - 56], xmm2

  movaps xmmword ptr [rsp - 72], xmm2

  xorps xmm8, xmm8

  xor eax, eax

  xorps xmm0, xmm0

  xorps xmm3, xmm3

  xorps xmm4, xmm4

.LBB0_1: # =>This Inner Loop Header: Depth=1

  movss xmm6, dword ptr [rsi + 4*rax] # xmm6 = mem[0],zero,zero,zero

  movss xmm5, dword ptr [rdx] # xmm5 = mem[0],zero,zero,zero

  mulss xmm5, xmm6

  addss xmm5, xmm0

  movups xmm7, xmmword ptr [rdx + 8]

  movaps xmm1, xmm6

  shufps xmm1, xmm6, 0 # xmm1 = xmm1[0,0],xmm6[0,0]

  mulps xmm7, xmm1

  addps xmm7, xmm2

  movss xmm0, dword ptr [rdx + 4] # xmm0 = mem[0],zero,zero,zero

  mulss xmm0, xmm6

  movups xmmword ptr [rsp - 64], xmm7

  movups xmm2, xmmword ptr [rdx + 24]

  mulps xmm2, xmm1

  addps xmm2, xmm3

  movups xmmword ptr [rsp - 48], xmm2

  movups xmm2, xmmword ptr [rdx + 40]

  mulps xmm2, xmm1

  addps xmm2, xmm4

  movups xmmword ptr [rsp - 32], xmm2

  movss xmm1, dword ptr [rdx + 56] # xmm1 = mem[0],zero,zero,zero

  mulss xmm1, xmm6

  addss xmm1, dword ptr [rsp - 16]

  addss xmm0, xmm8

  movss dword ptr [rsp - 16], xmm1

  mulss xmm6, dword ptr [rdx + 60]

  addss xmm6, dword ptr [rsp - 12]

  movss dword ptr [rsp - 12], xmm6

  movss dword ptr [rdi + 4*rax], xmm5

  mov r8d, dword ptr [rsp - 64]

  movups xmm2, xmmword ptr [rsp - 60]

  movups xmmword ptr [rsp - 64], xmm2

  movups xmm3, xmmword ptr [rsp - 44]

  movups xmmword ptr [rsp - 48], xmm3

  mov ecx, dword ptr [rsp - 28]

  mov dword ptr [rsp - 32], ecx

  movaps xmm1, xmmword ptr [rsp - 24]

  movups xmmword ptr [rsp - 28], xmm1

  mov dword ptr [rsp - 12], 0

  movd xmm8, r8d

  movd xmm4, ecx

  shufps xmm4, xmm1, 0 # xmm4 = xmm4[0,0],xmm1[0,0]

  shufps xmm4, xmm1, 152 # xmm4 = xmm4[0,2],xmm1[1,2]

  add rax, 1

  cmp eax, 1024

  jne .LBB0_1

  ret

<a href="https://gcc.godbolt.org/z/4c3EKcxsa">https://gcc.godbolt.org/z/4c3EKcxsa</a></pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>