[llvm-bugs] [Bug 50168] New: LLVM produces more shuffles than necessary

Thu Apr 29 03:40:14 PDT 2021

https://bugs.llvm.org/show_bug.cgi?id=50168

            Bug ID: 50168
           Summary: LLVM produces more shuffles than necessary
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Windows NT
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Backend: X86
          Assignee: unassignedbugs at nondot.org
          Reporter: david.bolvansky at gmail.com
                CC: craig.topper at gmail.com, llvm-bugs at lists.llvm.org,
                    llvm-dev at redking.me.uk, pengfei.wang at intel.com,
                    spatel+llvm at rotateright.com

From Reddit:
"Clang successfully vectorizes the whole routine, though it uses more shuffles
than necessary."

#include <stdint.h>

// Reproducer: processes 1024 input samples through a 16-entry accumulator
// array ("pipeline").
//   dst    - receives one float per input sample (1024 writes).
//   src    - 1024 input floats, read sequentially.
//   filter - 16 coefficients, indexed 0..15; never written.
// All three pointers are __restrict-qualified.
void filter(float *__restrict dst, const float *__restrict src, const float
*__restrict filter) {
    // All 16 accumulators start at zero.
    float pipeline[16] {};

    for(int i=0; i<1024; ++i) {
        float x = *src++;

        // Add the current sample, scaled by each coefficient, into every slot.
        for(int j=0; j<16; ++j)
            pipeline[j] += filter[j] * x;

        // Slot 0 is this iteration's output.
        *dst++ = pipeline[0];

        // Shift every slot down by one position; this element-wise shift is
        // what the backend lowers to the shuffles discussed in this report.
        for(int j=0; j<15; ++j)
            pipeline[j] = pipeline[j+1];

        // The vacated last slot restarts from zero.
        pipeline[15] = 0;
    }
}

Trunk -O3

# NOTE(review): full-line comments in this listing are reviewer annotations,
# not compiler output; trailing comments are as emitted by the compiler.
#
# Register use visible in this listing (names refer to the C source above):
#   rdx              base of the 16 floats read once before the loop (filter[0..15])
#   rsi + 4*rax      one scalar load per iteration  (src[i])
#   rdi + 4*rax      one scalar store per iteration (dst[i])
#   rax              loop counter, 0 .. 1023
#   [rsp - 4]        spill slot holding filter[0] (below rsp; rsp is never
#                    adjusted -- presumably the SysV red zone, TODO confirm)
#   xmm11/xmm10/xmm12  filter[1..4] / filter[5..8] / filter[9..12]
#   xmm13/xmm14/xmm15  filter[13] / filter[14] / filter[15]
#   xmm7/xmm9/xmm3   loop-carried 4-lane accumulators; at the loop head they
#                    correspond to pipeline[0..3] / [4..7] / [8..11]
#   xmm1/xmm0/xmm5   loop-carried scalar accumulators; at the loop head they
#                    correspond to pipeline[12] / [13] / [14]
# pipeline[15] is always zero at the loop head, so it is not kept in a
# register; the constant below supplies it.
.LCPI0_0:
  .long 0x00000000 # float 0
filter(float*, float const*, float const*): # @filter(float*, float const*,
float const*)
  # Hoist all 16 coefficients out of the loop.
  movss xmm0, dword ptr [rdx] # xmm0 = mem[0],zero,zero,zero
  movss dword ptr [rsp - 4], xmm0 # 4-byte Spill
  movups xmm10, xmmword ptr [rdx + 20]
  movups xmm11, xmmword ptr [rdx + 4]
  movups xmm12, xmmword ptr [rdx + 36]
  movss xmm13, dword ptr [rdx + 52] # xmm13 = mem[0],zero,zero,zero
  movss xmm14, dword ptr [rdx + 56] # xmm14 = mem[0],zero,zero,zero
  movss xmm15, dword ptr [rdx + 60] # xmm15 = mem[0],zero,zero,zero
  # Zero the six loop-carried accumulators and the loop counter.
  xorps xmm7, xmm7
  xor eax, eax
  xorps xmm1, xmm1
  xorps xmm0, xmm0
  xorps xmm5, xmm5
  xorps xmm9, xmm9
  xorps xmm3, xmm3
.LBB0_1: # =>This Inner Loop Header: Depth=1
  # xmm6 = x (current sample); xmm8 = x*filter[0] + lane 0 of xmm7 = output.
  movss xmm6, dword ptr [rsi + 4*rax] # xmm6 = mem[0],zero,zero,zero
  movaps xmm8, xmm6
  mulss xmm8, dword ptr [rsp - 4] # 4-byte Folded Reload
  addss xmm8, xmm7
  # xmm4 = x broadcast to all four lanes (shuffle 1 of 7).
  movaps xmm4, xmm6
  shufps xmm4, xmm6, 0 # xmm4 = xmm4[0,0],xmm6[0,0]
  # Shuffle pair: the first builds a temporary holding xmm9[0] and xmm7[3],
  # the second leaves xmm7 = { xmm7[1], xmm7[2], xmm7[3], xmm9[0] }, i.e. a
  # one-lane shift that pulls in lane 0 of the next accumulator.
  movaps xmm2, xmm9
  shufps xmm2, xmm7, 48 # xmm2 = xmm2[0,0],xmm7[3,0]
  shufps xmm7, xmm2, 41 # xmm7 = xmm7[1,2],xmm2[2,0]
  # xmm7 += x * filter[1..4]
  movaps xmm2, xmm4
  mulps xmm2, xmm11
  addps xmm7, xmm2
  # Same two-shuffle shift for xmm9, pulling in lane 0 of xmm3 (xmm3 is still
  # unshifted at this point).
  movaps xmm2, xmm3
  shufps xmm2, xmm9, 48 # xmm2 = xmm2[0,0],xmm9[3,0]
  shufps xmm9, xmm2, 41 # xmm9 = xmm9[1,2],xmm2[2,0]
  # xmm9 += x * filter[5..8]
  movaps xmm2, xmm4
  mulps xmm2, xmm10
  addps xmm9, xmm2
  # Same shift for xmm3, pulling in lane 0 of xmm1; xmm1 itself is used as the
  # temporary here and is overwritten below.
  shufps xmm1, xmm3, 244 # xmm1 = xmm1[0,1],xmm3[3,3]
  shufps xmm3, xmm1, 41 # xmm3 = xmm3[1,2],xmm1[2,0]
  # xmm3 += x * filter[9..12]
  mulps xmm4, xmm12
  addps xmm3, xmm4
  # Scalar tail: each scalar accumulator takes x*coefficient plus the old
  # value of the next one.
  # xmm1 = x*filter[13] + old xmm0
  movaps xmm1, xmm6
  mulss xmm1, xmm13
  addss xmm1, xmm0
  # xmm0 = x*filter[14] + old xmm5
  movaps xmm0, xmm6
  mulss xmm0, xmm14
  addss xmm0, xmm5
  # Write this iteration's output.
  movss dword ptr [rdi + 4*rax], xmm8
  # xmm5 = x*filter[15] + 0.0 (the add of the zero constant is kept).
  mulss xmm6, xmm15
  addss xmm6, dword ptr [rip + .LCPI0_0]
  add rax, 1
  movaps xmm5, xmm6
  cmp eax, 1024
  jne .LBB0_1
  ret

https://gcc.godbolt.org/z/ofz8qde9e

LLVM 12 and later produce more shuffles than LLVM 11 did. For comparison:

LLVM 11 -O3
# NOTE(review): full-line comments in this listing are reviewer annotations,
# not compiler output; trailing comments are as emitted by the compiler.
#
# Layout visible in this listing (names refer to the C source above):
#   rdx              coefficient base; every coefficient is re-read from
#                    memory inside the loop (filter[k] at rdx + 4*k)
#   rsi + 4*rax      one scalar load per iteration  (src[i])
#   rdi + 4*rax      one scalar store per iteration (dst[i])
#   rax              loop counter, 0 .. 1023
#   [rsp-72, rsp-8)  64-byte stack array zeroed on entry; by offset it
#                    corresponds to pipeline[k] at rsp - 72 + 4*k
#   xmm0 / xmm8      loop-carried scalars, pipeline[0] / pipeline[1] at loop head
#   xmm2/xmm3/xmm4   loop-carried vectors, pipeline[2..5] / [6..9] / [10..13]
#   [rsp-16],[rsp-12]  pipeline[14] / pipeline[15], kept in memory
filter(float*, float const*, float const*): # @filter(float*, float const*,
float const*)
  # Zero the 64-byte stack array and the loop-carried registers.
  xorps xmm2, xmm2
  movaps xmmword ptr [rsp - 24], xmm2
  movaps xmmword ptr [rsp - 40], xmm2
  movaps xmmword ptr [rsp - 56], xmm2
  movaps xmmword ptr [rsp - 72], xmm2
  xorps xmm8, xmm8
  xor eax, eax
  xorps xmm0, xmm0
  xorps xmm3, xmm3
  xorps xmm4, xmm4
.LBB0_1: # =>This Inner Loop Header: Depth=1
  # xmm6 = x (current sample); xmm5 = filter[0]*x + xmm0 = output.
  movss xmm6, dword ptr [rsi + 4*rax] # xmm6 = mem[0],zero,zero,zero
  movss xmm5, dword ptr [rdx] # xmm5 = mem[0],zero,zero,zero
  mulss xmm5, xmm6
  addss xmm5, xmm0
  # xmm1 = x broadcast (shuffle 1 of 3); xmm7 = filter[2..5]*x + xmm2.
  movups xmm7, xmmword ptr [rdx + 8]
  movaps xmm1, xmm6
  shufps xmm1, xmm6, 0 # xmm1 = xmm1[0,0],xmm6[0,0]
  mulps xmm7, xmm1
  addps xmm7, xmm2
  # xmm0 = filter[1]*x; xmm8 is added further down.
  movss xmm0, dword ptr [rdx + 4] # xmm0 = mem[0],zero,zero,zero
  mulss xmm0, xmm6
  # Store the updated 4-float groups back to the stack array.
  movups xmmword ptr [rsp - 64], xmm7
  # filter[6..9]*x + xmm3
  movups xmm2, xmmword ptr [rdx + 24]
  mulps xmm2, xmm1
  addps xmm2, xmm3
  movups xmmword ptr [rsp - 48], xmm2
  # filter[10..13]*x + xmm4
  movups xmm2, xmmword ptr [rdx + 40]
  mulps xmm2, xmm1
  addps xmm2, xmm4
  movups xmmword ptr [rsp - 32], xmm2
  # Scalar updates of the two in-memory slots at rsp-16 and rsp-12.
  movss xmm1, dword ptr [rdx + 56] # xmm1 = mem[0],zero,zero,zero
  mulss xmm1, xmm6
  addss xmm1, dword ptr [rsp - 16]
  # xmm0 = filter[1]*x + xmm8; this is read as the output accumulator by the
  # next iteration.
  addss xmm0, xmm8
  movss dword ptr [rsp - 16], xmm1
  mulss xmm6, dword ptr [rdx + 60]
  addss xmm6, dword ptr [rsp - 12]
  movss dword ptr [rsp - 12], xmm6
  # Write this iteration's output.
  movss dword ptr [rdi + 4*rax], xmm5
  # Shift the stack array down by one float using overlapping unaligned
  # loads/stores (each load is 4 bytes above the matching store), instead of
  # register shuffles. r8d captures the float at rsp-64 before it is
  # overwritten.
  mov r8d, dword ptr [rsp - 64]
  movups xmm2, xmmword ptr [rsp - 60]
  movups xmmword ptr [rsp - 64], xmm2
  movups xmm3, xmmword ptr [rsp - 44]
  movups xmmword ptr [rsp - 48], xmm3
  mov ecx, dword ptr [rsp - 28]
  mov dword ptr [rsp - 32], ecx
  movaps xmm1, xmmword ptr [rsp - 24]
  movups xmmword ptr [rsp - 28], xmm1
  # Last slot restarts from zero.
  mov dword ptr [rsp - 12], 0
  # Rebuild the register copies: xmm8 from r8d, and
  # xmm4 = { ecx, xmm1[0], xmm1[1], xmm1[2] } via two shufps (shuffles 2 and 3).
  movd xmm8, r8d
  movd xmm4, ecx
  shufps xmm4, xmm1, 0 # xmm4 = xmm4[0,0],xmm1[0,0]
  shufps xmm4, xmm1, 152 # xmm4 = xmm4[0,2],xmm1[1,2]
  add rax, 1
  cmp eax, 1024
  jne .LBB0_1
  ret

https://gcc.godbolt.org/z/4c3EKcxsa

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20210429/aa0511bd/attachment.html>