<html>

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - [X86] [AVX2] Suboptimal code for vector select fetching cond from bool[] array"

   href="https://bugs.llvm.org/show_bug.cgi?id=37427">37427</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>[X86] [AVX2] Suboptimal code for vector select fetching cond from bool[] array

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>6.0

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Windows NT

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>enhancement

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Backend: X86

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>fabiang@radgametools.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr></table>

      <p>

        <div>

        <pre>void variable_shift_loop(unsigned int *arr, const bool *control, int count, int rot0, int rot1)

{

    for (int i = 0; i &lt; count; ++i)

    {

        int rot = control[i] ? rot1 : rot0;

        arr[i] = (arr[i] &lt;&lt; rot);

    }

}

Clang 6.0 targeting x86-64, "-O2 -mavx2".

The inner loop does:

  vpxor %xmm2, %xmm2, %xmm2

.LBB0_4: # =>This Inner Loop Header: Depth=1

  vpmovzxbw (%rsi,%rcx), %xmm3 # xmm3 =

mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero

  vpmovzxbw 8(%rsi,%rcx), %xmm4 # xmm4 =

mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero

  vpmovzxbw 16(%rsi,%rcx), %xmm5 # xmm5 =

mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero

  vpmovzxbw 24(%rsi,%rcx), %xmm6 # xmm6 =

mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero

  vpcmpeqw %xmm2, %xmm3, %xmm3

  vpmovzxwd %xmm3, %ymm3 # ymm3 =

xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero

  vpslld $24, %ymm3, %ymm3

  vpcmpeqw %xmm2, %xmm4, %xmm4

  vpmovzxwd %xmm4, %ymm4 # ymm4 =

xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero

  vpslld $24, %ymm4, %ymm4

  vpcmpeqw %xmm2, %xmm5, %xmm5

  vpmovzxwd %xmm5, %ymm5 # ymm5 =

xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero

  vpslld $24, %ymm5, %ymm5

  vpcmpeqw %xmm2, %xmm6, %xmm6

  vpmovzxwd %xmm6, %ymm6 # ymm6 =

xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero

  vpslld $24, %ymm6, %ymm6

  vblendvps %ymm3, %ymm0, %ymm1, %ymm3

  vblendvps %ymm4, %ymm0, %ymm1, %ymm4

  vblendvps %ymm5, %ymm0, %ymm1, %ymm5

  vblendvps %ymm6, %ymm0, %ymm1, %ymm6

  vmovdqu (%rdi,%rcx,4), %ymm7

  vpsllvd %ymm3, %ymm7, %ymm3

  vmovdqu 32(%rdi,%rcx,4), %ymm7

  vpsllvd %ymm4, %ymm7, %ymm4

  vmovdqu 64(%rdi,%rcx,4), %ymm7

  vpsllvd %ymm5, %ymm7, %ymm5

  vmovdqu 96(%rdi,%rcx,4), %ymm7

  vpsllvd %ymm6, %ymm7, %ymm6

  vmovdqu %ymm3, (%rdi,%rcx,4)

  vmovdqu %ymm4, 32(%rdi,%rcx,4)

  vmovdqu %ymm5, 64(%rdi,%rcx,4)

  vmovdqu %ymm6, 96(%rdi,%rcx,4)

  addq $32, %rcx

  cmpq %rcx, %rdx

  jne .LBB0_4

The intermediate stage of converting to words doesn't seem to be helping, it

just results in extra work.

This should just do 4x vpmovzxbd for the "control" loads, then 4x vpcmpeqd to

set up the blend masks, I think.</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>