[llvm-bugs] [Bug 37427] New: [X86] [AVX2] Suboptimal code for vector select fetching cond from bool[] array

Fri May 11 14:45:54 PDT 2018

https://bugs.llvm.org/show_bug.cgi?id=37427

            Bug ID: 37427
           Summary: [X86] [AVX2] Suboptimal code for vector select
                    fetching cond from bool[] array
           Product: libraries
           Version: 6.0
          Hardware: PC
                OS: Windows NT
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Backend: X86
          Assignee: unassignedbugs at nondot.org
          Reporter: fabiang at radgametools.com
                CC: llvm-bugs at lists.llvm.org

// Reproducer: for each i in [0, count), shifts arr[i] left in place by rot1
// when control[i] is true, otherwise by rot0.
// NOTE(review): rot0/rot1 are not range-checked here; a negative shift count
// or one >= the bit width of unsigned int would be undefined behavior --
// presumably callers pass in-range values; confirm before reusing this code.
void variable_shift_loop(unsigned int *arr, const bool *control, int count, int
rot0, int rot1)
{
    for (int i = 0; i < count; ++i)
    {
        // Per-element select between the two loop-invariant shift amounts.
        int rot = control[i] ? rot1 : rot0;
        arr[i] = (arr[i] << rot);
    }
}

Clang 6.0 targeting x86-64, "-O2 -mavx2".

The inner loop does:

  vpxor %xmm2, %xmm2, %xmm2
.LBB0_4: # =>This Inner Loop Header: Depth=1
  vpmovzxbw (%rsi,%rcx), %xmm3 # xmm3 =
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
  vpmovzxbw 8(%rsi,%rcx), %xmm4 # xmm4 =
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
  vpmovzxbw 16(%rsi,%rcx), %xmm5 # xmm5 =
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
  vpmovzxbw 24(%rsi,%rcx), %xmm6 # xmm6 =
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
  vpcmpeqw %xmm2, %xmm3, %xmm3
  vpmovzxwd %xmm3, %ymm3 # ymm3 =
xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
  vpslld $24, %ymm3, %ymm3
  vpcmpeqw %xmm2, %xmm4, %xmm4
  vpmovzxwd %xmm4, %ymm4 # ymm4 =
xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
  vpslld $24, %ymm4, %ymm4
  vpcmpeqw %xmm2, %xmm5, %xmm5
  vpmovzxwd %xmm5, %ymm5 # ymm5 =
xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
  vpslld $24, %ymm5, %ymm5
  vpcmpeqw %xmm2, %xmm6, %xmm6
  vpmovzxwd %xmm6, %ymm6 # ymm6 =
xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
  vpslld $24, %ymm6, %ymm6
  vblendvps %ymm3, %ymm0, %ymm1, %ymm3
  vblendvps %ymm4, %ymm0, %ymm1, %ymm4
  vblendvps %ymm5, %ymm0, %ymm1, %ymm5
  vblendvps %ymm6, %ymm0, %ymm1, %ymm6
  vmovdqu (%rdi,%rcx,4), %ymm7
  vpsllvd %ymm3, %ymm7, %ymm3
  vmovdqu 32(%rdi,%rcx,4), %ymm7
  vpsllvd %ymm4, %ymm7, %ymm4
  vmovdqu 64(%rdi,%rcx,4), %ymm7
  vpsllvd %ymm5, %ymm7, %ymm5
  vmovdqu 96(%rdi,%rcx,4), %ymm7
  vpsllvd %ymm6, %ymm7, %ymm6
  vmovdqu %ymm3, (%rdi,%rcx,4)
  vmovdqu %ymm4, 32(%rdi,%rcx,4)
  vmovdqu %ymm5, 64(%rdi,%rcx,4)
  vmovdqu %ymm6, 96(%rdi,%rcx,4)
  addq $32, %rcx
  cmpq %rcx, %rdx
  jne .LBB0_4

The intermediate stage of converting to words doesn't seem to be helping; it
just results in extra work.

This should just do 4x vpmovzxbd for the "control" loads, then 4x vpcmpeqd to
set up the blend masks, I think.

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20180511/c25944d8/attachment.html>