<html>

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - [X86] Suboptimal mask replication for masked interleaved load"

   href="https://bugs.llvm.org/show_bug.cgi?id=52337">52337</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>[X86] Suboptimal mask replication for masked interleaved load

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>enhancement

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Backend: X86

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>lebedev.ri@gmail.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>craig.topper@gmail.com, llvm-bugs@lists.llvm.org, llvm-dev@redking.me.uk, pengfei.wang@intel.com, spatel+llvm@rotateright.com

          </td>

        </tr></table>

      <p>

        <div>

        <pre>The IR for the masked interleaved load is something like this:

<a href="https://godbolt.org/z/6oYqGTfG4">https://godbolt.org/z/6oYqGTfG4</a>

define &lt;32 x i32&gt; @mask_i32_stride8_vf4(&lt;4 x i32&gt;* %in.vec, &lt;32 x i32&gt;*

%out.vec, &lt;32 x i32&gt;*%ptr) #0 {

  %src.vec = load &lt;4 x i32&gt;, &lt;4 x i32&gt;* %in.vec, align 32

  %src.vec.masked = and &lt;4 x i32&gt; %src.vec, &lt;i32 2147483648, i32 2147483648,

i32 2147483648, i32 2147483648&gt;

  %src.vec.signbitset = icmp slt &lt;4 x i32&gt; %src.vec, zeroinitializer

  %tgt.vec = shufflevector &lt;4 x i1&gt; %src.vec.signbitset, &lt;4 x i1&gt; poison, &lt;32 x

i32&gt; &lt;i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32

1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,

i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3&gt;

  %res = call &lt;32 x i32&gt; @llvm.masked.load (&lt;32 x i32&gt;*%ptr, i32 4, &lt;32 x i1&gt;

%tgt.vec, &lt;32 x i32&gt; undef)

  ret &lt;32 x i32&gt; %res

}

declare &lt;32 x i32&gt; @llvm.masked.load (&lt;32 x i32&gt;*, i32, &lt;32 x i1&gt;, &lt;32 x i32&gt;)

And that results in a rather impressive assembly:

        vpxor   xmm0, xmm0, xmm0

        vpcmpgtd        xmm1, xmm0, xmmword ptr [rdi]

        vpshufb xmm3, xmm1, xmmword ptr [rip + .LCPI0_0] # xmm3 =

xmm1[8,8,8,8,8,8,8,8,12,12,12,12,12,12,12,12]

        vpbroadcastb    xmm0, xmm1

        vpshufb xmm1, xmm1, xmmword ptr [rip + .LCPI0_1] # xmm1 =

xmm1[4,u,4,u,4,u,4,u,4,u,4,u,4,u,4,u]

        vpmovzxwd       ymm0, xmm0              # ymm0 =

xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero

        vpmovzxbw       xmm2, xmm3              # xmm2 =

xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero

        vpmovzxwd       ymm2, xmm2              # ymm2 =

xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero

        vpslld  ymm0, ymm0, 31

        vpmaskmovd      ymm0, ymm0, ymmword ptr [rdx]

        vpslld  ymm2, ymm2, 31

        vpmaskmovd      ymm2, ymm2, ymmword ptr [rdx + 64]

        vpmovzxwd       ymm1, xmm1              # ymm1 =

xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero

        vpslld  ymm1, ymm1, 31

        vpmaskmovd      ymm1, ymm1, ymmword ptr [rdx + 32]

        vpunpckhbw      xmm3, xmm3, xmm3        # xmm3 =

xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]

        vpmovzxwd       ymm3, xmm3              # ymm3 =

xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero

        vpslld  ymm3, ymm3, 31

        vpmaskmovd      ymm3, ymm3, ymmword ptr [rdx + 96]

I would think this may be better: <a href="https://godbolt.org/z/aWohrxsba">https://godbolt.org/z/aWohrxsba</a>

define &lt;32 x i32&gt; @mask_i32_stride8_vf4_good(&lt;4 x i32&gt;* %in.vec, &lt;32 x i32&gt;*

%out.vec, &lt;32 x i32&gt;*%ptr) #0 {

  %src.vec = load &lt;4 x i32&gt;, &lt;4 x i32&gt;* %in.vec, align 32

  %src.vec.signbitset = icmp slt &lt;4 x i32&gt; %src.vec, zeroinitializer

  %src.vec.signbitset1 = zext &lt;4 x i1&gt; %src.vec.signbitset to &lt;4 x i32&gt;

  %src.vec.signbitset2 = shl &lt;4 x i32&gt; %src.vec.signbitset1, &lt;i32 31, i32 31,

i32 31, i32 31&gt;

  %tgt.vec = shufflevector &lt;4 x i32&gt; %src.vec.signbitset2, &lt;4 x i32&gt; poison,

&lt;32 x i32&gt; &lt;i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32

1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2,

i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3&gt;

  %tgt.vec.signbitset = icmp slt &lt;32 x i32&gt; %tgt.vec, zeroinitializer

  %res = call &lt;32 x i32&gt; @llvm.masked.load (&lt;32 x i32&gt;*%ptr, i32 4, &lt;32 x i1&gt;

%tgt.vec.signbitset, &lt;32 x i32&gt; undef)

  ret &lt;32 x i32&gt; %res

}

declare &lt;32 x i32&gt; @llvm.masked.load (&lt;32 x i32&gt;*, i32, &lt;32 x i1&gt;, &lt;32 x i32&gt;)

mask_i32_stride8_vf4_good:              # @mask_i32_stride8_vf4_good

        vpxor   xmm0, xmm0, xmm0

        vpcmpgtd        xmm0, xmm0, xmmword ptr [rdi]

        vpslld  xmm0, xmm0, 31

        vpshufd xmm1, xmm0, 255                 # xmm1 = xmm0[3,3,3,3]

        vpbroadcastq    ymm3, xmm1

        vpshufd xmm1, xmm0, 250                 # xmm1 = xmm0[2,2,3,3]

        vpbroadcastq    ymm2, xmm1

        vpshufd xmm1, xmm0, 85                  # xmm1 = xmm0[1,1,1,1]

        vpbroadcastd    ymm0, xmm0

        vpbroadcastq    ymm1, xmm1

        vpmaskmovd      ymm3, ymm3, ymmword ptr [rdx + 96]

        vpmaskmovd      ymm0, ymm0, ymmword ptr [rdx]

        vpmaskmovd      ymm2, ymm2, ymmword ptr [rdx + 64]

        vpmaskmovd      ymm1, ymm1, ymmword ptr [rdx + 32]

        ret</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>