[llvm-bugs] [Bug 52337] New: [X86] Suboptimal mask replication for masked interleaved load
via llvm-bugs
llvm-bugs at lists.llvm.org
Thu Oct 28 15:29:00 PDT 2021
https://bugs.llvm.org/show_bug.cgi?id=52337
Bug ID: 52337
Summary: [X86] Suboptimal mask replication for masked interleaved load
Product: libraries
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: enhancement
Priority: P
Component: Backend: X86
Assignee: unassignedbugs at nondot.org
Reporter: lebedev.ri at gmail.com
CC: craig.topper at gmail.com, llvm-bugs at lists.llvm.org,
llvm-dev at redking.me.uk, pengfei.wang at intel.com,
spatel+llvm at rotateright.com
The IR for the masked interleaved load is something like this:
https://godbolt.org/z/6oYqGTfG4
define <32 x i32> @mask_i32_stride8_vf4(<4 x i32>* %in.vec, <32 x i32>*
%out.vec, <32 x i32>*%ptr) #0 {
%src.vec = load <4 x i32>, <4 x i32>* %in.vec, align 32
%src.vec.masked = and <4 x i32> %src.vec, <i32 2147483648, i32 2147483648,
i32 2147483648, i32 2147483648>
%src.vec.signbitset = icmp slt <4 x i32> %src.vec, zeroinitializer
%tgt.vec = shufflevector <4 x i1> %src.vec.signbitset, <4 x i1> poison, <32 x
i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32
1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%res = call <32 x i32> @llvm.masked.load (<32 x i32>*%ptr, i32 4, <32 x i1>
%tgt.vec, <32 x i32> undef)
ret <32 x i32> %res
}
declare <32 x i32> @llvm.masked.load (<32 x i32>*, i32, <32 x i1>, <32 x i32>)
And that results in rather impressive assembly:
vpxor xmm0, xmm0, xmm0
vpcmpgtd xmm1, xmm0, xmmword ptr [rdi]
vpshufb xmm3, xmm1, xmmword ptr [rip + .LCPI0_0] # xmm3 = xmm1[8,8,8,8,8,8,8,8,12,12,12,12,12,12,12,12]
vpbroadcastb xmm0, xmm1
vpshufb xmm1, xmm1, xmmword ptr [rip + .LCPI0_1] # xmm1 = xmm1[4,u,4,u,4,u,4,u,4,u,4,u,4,u,4,u]
vpmovzxwd ymm0, xmm0 # ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
vpmovzxbw xmm2, xmm3 # xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
vpmovzxwd ymm2, xmm2 # ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
vpslld ymm0, ymm0, 31
vpmaskmovd ymm0, ymm0, ymmword ptr [rdx]
vpslld ymm2, ymm2, 31
vpmaskmovd ymm2, ymm2, ymmword ptr [rdx + 64]
vpmovzxwd ymm1, xmm1 # ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
vpslld ymm1, ymm1, 31
vpmaskmovd ymm1, ymm1, ymmword ptr [rdx + 32]
vpunpckhbw xmm3, xmm3, xmm3 # xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
vpmovzxwd ymm3, xmm3 # ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
vpslld ymm3, ymm3, 31
vpmaskmovd ymm3, ymm3, ymmword ptr [rdx + 96]
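I would think it may be better to replicate the mask as i32 elements with only the sign bit set, and to re-derive the i1 mask from the widened vector afterwards: https://godbolt.org/z/aWohrxsba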
define <32 x i32> @mask_i32_stride8_vf4_good(<4 x i32>* %in.vec, <32 x i32>*
%out.vec, <32 x i32>*%ptr) #0 {
%src.vec = load <4 x i32>, <4 x i32>* %in.vec, align 32
%src.vec.signbitset = icmp slt <4 x i32> %src.vec, zeroinitializer
%src.vec.signbitset1 = zext <4 x i1> %src.vec.signbitset to <4 x i32>
%src.vec.signbitset2 = shl <4 x i32> %src.vec.signbitset1, <i32 31, i32 31,
i32 31, i32 31>
%tgt.vec = shufflevector <4 x i32> %src.vec.signbitset2, <4 x i32> poison,
<32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32
1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2,
i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%tgt.vec.signbitset = icmp slt <32 x i32> %tgt.vec, zeroinitializer
%res = call <32 x i32> @llvm.masked.load (<32 x i32>*%ptr, i32 4, <32 x i1>
%tgt.vec.signbitset, <32 x i32> undef)
ret <32 x i32> %res
}
declare <32 x i32> @llvm.masked.load (<32 x i32>*, i32, <32 x i1>, <32 x i32>)
mask_i32_stride8_vf4_good: # @mask_i32_stride8_vf4_good
vpxor xmm0, xmm0, xmm0
vpcmpgtd xmm0, xmm0, xmmword ptr [rdi]
vpslld xmm0, xmm0, 31
vpshufd xmm1, xmm0, 255 # xmm1 = xmm0[3,3,3,3]
vpbroadcastq ymm3, xmm1
vpshufd xmm1, xmm0, 250 # xmm1 = xmm0[2,2,3,3]
vpbroadcastq ymm2, xmm1
vpshufd xmm1, xmm0, 85 # xmm1 = xmm0[1,1,1,1]
vpbroadcastd ymm0, xmm0
vpbroadcastq ymm1, xmm1
vpmaskmovd ymm3, ymm3, ymmword ptr [rdx + 96]
vpmaskmovd ymm0, ymm0, ymmword ptr [rdx]
vpmaskmovd ymm2, ymm2, ymmword ptr [rdx + 64]
vpmaskmovd ymm1, ymm1, ymmword ptr [rdx + 32]
ret
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20211028/630f8f66/attachment.html>
More information about the llvm-bugs mailing list