[llvm-bugs] [Bug 52337] New: [X86] Suboptimal mask replication for masked interleaved load

Thu Oct 28 15:29:00 PDT 2021

https://bugs.llvm.org/show_bug.cgi?id=52337

            Bug ID: 52337
           Summary: [X86] Suboptimal mask replication for masked
                    interleaved load
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Backend: X86
          Assignee: unassignedbugs at nondot.org
          Reporter: lebedev.ri at gmail.com
                CC: craig.topper at gmail.com, llvm-bugs at lists.llvm.org,
                    llvm-dev at redking.me.uk, pengfei.wang at intel.com,
                    spatel+llvm at rotateright.com

The IR for the masked interleaved load is something like this:
https://godbolt.org/z/6oYqGTfG4

define <32 x i32> @mask_i32_stride8_vf4(<4 x i32>* %in.vec, <32 x i32>*
%out.vec, <32 x i32>*%ptr) #0 {
  %src.vec = load <4 x i32>, <4 x i32>* %in.vec, align 32
  %src.vec.masked = and <4 x i32> %src.vec, <i32 2147483648, i32 2147483648,
i32 2147483648, i32 2147483648>
  %src.vec.signbitset = icmp slt <4 x i32> %src.vec, zeroinitializer
  %tgt.vec = shufflevector <4 x i1> %src.vec.signbitset, <4 x i1> poison, <32 x
i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32
1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %res = call <32 x i32> @llvm.masked.load (<32 x i32>*%ptr, i32 4, <32 x i1>
%tgt.vec, <32 x i32> undef)
  ret <32 x i32> %res
}
declare <32 x i32> @llvm.masked.load (<32 x i32>*, i32, <32 x i1>, <32 x i32>)

And that results in rather impressive assembly:

        vpxor   xmm0, xmm0, xmm0
        vpcmpgtd        xmm1, xmm0, xmmword ptr [rdi]
        vpshufb xmm3, xmm1, xmmword ptr [rip + .LCPI0_0] # xmm3 =
xmm1[8,8,8,8,8,8,8,8,12,12,12,12,12,12,12,12]
        vpbroadcastb    xmm0, xmm1
        vpshufb xmm1, xmm1, xmmword ptr [rip + .LCPI0_1] # xmm1 =
xmm1[4,u,4,u,4,u,4,u,4,u,4,u,4,u,4,u]
        vpmovzxwd       ymm0, xmm0              # ymm0 =
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
        vpmovzxbw       xmm2, xmm3              # xmm2 =
xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
        vpmovzxwd       ymm2, xmm2              # ymm2 =
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
        vpslld  ymm0, ymm0, 31
        vpmaskmovd      ymm0, ymm0, ymmword ptr [rdx]
        vpslld  ymm2, ymm2, 31
        vpmaskmovd      ymm2, ymm2, ymmword ptr [rdx + 64]
        vpmovzxwd       ymm1, xmm1              # ymm1 =
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
        vpslld  ymm1, ymm1, 31
        vpmaskmovd      ymm1, ymm1, ymmword ptr [rdx + 32]
        vpunpckhbw      xmm3, xmm3, xmm3        # xmm3 =
xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
        vpmovzxwd       ymm3, xmm3              # ymm3 =
xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
        vpslld  ymm3, ymm3, 31
        vpmaskmovd      ymm3, ymm3, ymmword ptr [rdx + 96]

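I would think the following may be better, where the sign-bit mask is first
materialized as <4 x i32>, replicated with an i32 shuffle, and only then
compared to form the <32 x i1> mask: https://godbolt.org/z/aWohrxsba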

define <32 x i32> @mask_i32_stride8_vf4_good(<4 x i32>* %in.vec, <32 x i32>*
%out.vec, <32 x i32>*%ptr) #0 {
  %src.vec = load <4 x i32>, <4 x i32>* %in.vec, align 32
  %src.vec.signbitset = icmp slt <4 x i32> %src.vec, zeroinitializer
  %src.vec.signbitset1 = zext <4 x i1> %src.vec.signbitset to <4 x i32>
  %src.vec.signbitset2 = shl <4 x i32> %src.vec.signbitset1, <i32 31, i32 31,
i32 31, i32 31>
  %tgt.vec = shufflevector <4 x i32> %src.vec.signbitset2, <4 x i32> poison,
<32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32
1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2,
i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %tgt.vec.signbitset = icmp slt <32 x i32> %tgt.vec, zeroinitializer
  %res = call <32 x i32> @llvm.masked.load (<32 x i32>*%ptr, i32 4, <32 x i1>
%tgt.vec.signbitset, <32 x i32> undef)
  ret <32 x i32> %res
}

declare <32 x i32> @llvm.masked.load (<32 x i32>*, i32, <32 x i1>, <32 x i32>)

mask_i32_stride8_vf4_good:              # @mask_i32_stride8_vf4_good
        vpxor   xmm0, xmm0, xmm0
        vpcmpgtd        xmm0, xmm0, xmmword ptr [rdi]
        vpslld  xmm0, xmm0, 31
        vpshufd xmm1, xmm0, 255                 # xmm1 = xmm0[3,3,3,3]
        vpbroadcastq    ymm3, xmm1
        vpshufd xmm1, xmm0, 250                 # xmm1 = xmm0[2,2,3,3]
        vpbroadcastq    ymm2, xmm1
        vpshufd xmm1, xmm0, 85                  # xmm1 = xmm0[1,1,1,1]
        vpbroadcastd    ymm0, xmm0
        vpbroadcastq    ymm1, xmm1
        vpmaskmovd      ymm3, ymm3, ymmword ptr [rdx + 96]
        vpmaskmovd      ymm0, ymm0, ymmword ptr [rdx]
        vpmaskmovd      ymm2, ymm2, ymmword ptr [rdx + 64]
        vpmaskmovd      ymm1, ymm1, ymmword ptr [rdx + 32]
        ret

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20211028/630f8f66/attachment.html>