<html>

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - Generate inefficient code after canonicalize splat shuffle after cmp"

   href="https://bugs.llvm.org/show_bug.cgi?id=52500">52500</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>Generate inefficient code after canonicalize splat shuffle after cmp

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>enhancement

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Scalar Optimizations

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>ken1979.luo@gmail.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr></table>

      <p>

        <div>

        <pre>On X86-AVX512 the result of the vector compare instruction would be in %k

register, but there is no shuffle instruction for %k registers. Here is a test

case that regressed due to canonicalizing the splat shuffle after the cmp. It can be

reproduced with "llc -mcpu=skylake-avx512".

Before canonicalization, the code is as follows.

cat shufvXi32.ll

define <16 x i1> @shuffle(<16 x i1> %msk, i32 %in) {

entry:

  %insrt = insertelement <16 x i32> undef, i32 %in, i32 0

  %splat = shufflevector <16 x i32> %insrt, <16 x i32> poison, <16 x i32>

zeroinitializer

  %mul = mul &lt;16 x i32&gt; &lt;i32 789, i32 789, i32 789, i32 789, i32 789, i32 789,

i32 789, i32 789, i32 789, i32 789, i32 789, i32 789, i32 789, i32 789, i32

789, i32 789&gt;, %splat

  %cmp1 = icmp eq &lt;16 x i32&gt; %mul, &lt;i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,

i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0&gt;

  %and = and <16 x i1> %msk, %cmp1

  ret <16 x i1> %and

}

After canonicalization, the code is as follows.

opt -S &lt; shufvXi32.ll -instcombine -o shufvXi1.ll

define <16 x i1> @shuffle(<16 x i1> %msk, i32 %in) {

entry:

  %insrt = insertelement <16 x i32> undef, i32 %in, i32 0

  %0 = mul &lt;16 x i32&gt; %insrt, &lt;i32 789, i32 poison, i32 poison, i32 poison, i32

poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,

i32 poison, i32 poison, i32 poison, i32 poison, i32 poison&gt;

  %1 = icmp eq <16 x i32> %0, zeroinitializer

  %cmp1 = shufflevector <16 x i1> %1, <16 x i1> poison, <16 x i32>

zeroinitializer

  %and = and <16 x i1> %cmp1, %msk

  ret <16 x i1> %and

}

llc -mcpu=skylake-avx512 shufvXi32.ll

We got the assembly below.

# %bb.0:                                # %entry

        vpsllw  $7, %xmm0, %xmm0

        vpmovb2m        %xmm0, %k1

        vpbroadcastd    %edi, %zmm0

        vpmulld .LCPI0_0(%rip){1to16}, %zmm0, %zmm0

        vptestnmd       %zmm0, %zmm0, %k0 {%k1}

        vpmovm2b        %k0, %xmm0

        vzeroupper

        retq

llc -mcpu=skylake-avx512 shufvXi1.ll

We got the assembly below.

# %bb.0:                                # %entry

        vpsllw  $7, %xmm0, %xmm0

        vpxor   %xmm1, %xmm1, %xmm1

        vmovd   %edi, %xmm2

        movl    $789, %eax                      # imm = 0x315

        vmovd   %eax, %xmm3

        vpmulld %xmm3, %xmm2, %xmm2

        vptestnmd       %zmm2, %zmm2, %k0

        vpmovm2w        %k0, %ymm2

        vpbroadcastw    %xmm2, %ymm2

        vpmovw2m        %ymm2, %k1

        vpcmpgtb        %xmm0, %xmm1, %k0 {%k1}

        vpmovm2b        %k0, %xmm0

        vzeroupper

        retq

We can see that more instructions are generated for shufvXi1.ll.</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>