<html>
<head>
<base href="https://bugs.llvm.org/">
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW - [X86] Suboptimal mask replication for masked interleaved load"
href="https://bugs.llvm.org/show_bug.cgi?id=52337">52337</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>[X86] Suboptimal mask replication for masked interleaved load
</td>
</tr>
<tr>
<th>Product</th>
<td>libraries
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>Linux
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>enhancement
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>Backend: X86
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>lebedev.ri@gmail.com
</td>
</tr>
<tr>
<th>CC</th>
<td>craig.topper@gmail.com, llvm-bugs@lists.llvm.org, llvm-dev@redking.me.uk, pengfei.wang@intel.com, spatel+llvm@rotateright.com
</td>
</tr></table>
<p>
<div>
<pre>The IR for the masked interleaved load is something like this:
<a href="https://godbolt.org/z/6oYqGTfG4">https://godbolt.org/z/6oYqGTfG4</a>
define &lt;32 x i32&gt; @mask_i32_stride8_vf4(&lt;4 x i32&gt;* %in.vec, &lt;32 x i32&gt;*
%out.vec, &lt;32 x i32&gt;*%ptr) #0 {
%src.vec = load &lt;4 x i32&gt;, &lt;4 x i32&gt;* %in.vec, align 32
%src.vec.masked = and &lt;4 x i32&gt; %src.vec, &lt;i32 2147483648, i32 2147483648,
i32 2147483648, i32 2147483648&gt;
%src.vec.signbitset = icmp slt &lt;4 x i32&gt; %src.vec, zeroinitializer
%tgt.vec = shufflevector &lt;4 x i1&gt; %src.vec.signbitset, &lt;4 x i1&gt; poison, &lt;32 x
i32&gt; &lt;i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32
1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3&gt;
%res = call &lt;32 x i32&gt; @llvm.masked.load (&lt;32 x i32&gt;*%ptr, i32 4, &lt;32 x i1&gt;
%tgt.vec, &lt;32 x i32&gt; undef)
ret &lt;32 x i32&gt; %res
}
declare &lt;32 x i32&gt; @llvm.masked.load (&lt;32 x i32&gt;*, i32, &lt;32 x i1&gt;, &lt;32 x i32&gt;)
And that results in a rather impressive assembly:
vpxor xmm0, xmm0, xmm0
vpcmpgtd xmm1, xmm0, xmmword ptr [rdi]
vpshufb xmm3, xmm1, xmmword ptr [rip + .LCPI0_0] # xmm3 =
xmm1[8,8,8,8,8,8,8,8,12,12,12,12,12,12,12,12]
vpbroadcastb xmm0, xmm1
vpshufb xmm1, xmm1, xmmword ptr [rip + .LCPI0_1] # xmm1 =
xmm1[4,u,4,u,4,u,4,u,4,u,4,u,4,u,4,u]
vpmovzxwd ymm0, xmm0 # ymm0 =
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
vpmovzxbw xmm2, xmm3 # xmm2 =
xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
vpmovzxwd ymm2, xmm2 # ymm2 =
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
vpslld ymm0, ymm0, 31
vpmaskmovd ymm0, ymm0, ymmword ptr [rdx]
vpslld ymm2, ymm2, 31
vpmaskmovd ymm2, ymm2, ymmword ptr [rdx + 64]
vpmovzxwd ymm1, xmm1 # ymm1 =
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
vpslld ymm1, ymm1, 31
vpmaskmovd ymm1, ymm1, ymmword ptr [rdx + 32]
vpunpckhbw xmm3, xmm3, xmm3 # xmm3 =
xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
vpmovzxwd ymm3, xmm3 # ymm3 =
xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
vpslld ymm3, ymm3, 31
vpmaskmovd ymm3, ymm3, ymmword ptr [rdx + 96]
I would think this may be better: <a href="https://godbolt.org/z/aWohrxsba">https://godbolt.org/z/aWohrxsba</a>
define &lt;32 x i32&gt; @mask_i32_stride8_vf4_good(&lt;4 x i32&gt;* %in.vec, &lt;32 x i32&gt;*
%out.vec, &lt;32 x i32&gt;*%ptr) #0 {
%src.vec = load &lt;4 x i32&gt;, &lt;4 x i32&gt;* %in.vec, align 32
%src.vec.signbitset = icmp slt &lt;4 x i32&gt; %src.vec, zeroinitializer
%src.vec.signbitset1 = zext &lt;4 x i1&gt; %src.vec.signbitset to &lt;4 x i32&gt;
%src.vec.signbitset2 = shl &lt;4 x i32&gt; %src.vec.signbitset1, &lt;i32 31, i32 31,
i32 31, i32 31&gt;
%tgt.vec = shufflevector &lt;4 x i32&gt; %src.vec.signbitset2, &lt;4 x i32&gt; poison,
&lt;32 x i32&gt; &lt;i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32
1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2,
i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3&gt;
%tgt.vec.signbitset = icmp slt &lt;32 x i32&gt; %tgt.vec, zeroinitializer
%res = call &lt;32 x i32&gt; @llvm.masked.load (&lt;32 x i32&gt;*%ptr, i32 4, &lt;32 x i1&gt;
%tgt.vec.signbitset, &lt;32 x i32&gt; undef)
ret &lt;32 x i32&gt; %res
}
declare &lt;32 x i32&gt; @llvm.masked.load (&lt;32 x i32&gt;*, i32, &lt;32 x i1&gt;, &lt;32 x i32&gt;)
mask_i32_stride8_vf4_good: # @mask_i32_stride8_vf4_good
vpxor xmm0, xmm0, xmm0
vpcmpgtd xmm0, xmm0, xmmword ptr [rdi]
vpslld xmm0, xmm0, 31
vpshufd xmm1, xmm0, 255 # xmm1 = xmm0[3,3,3,3]
vpbroadcastq ymm3, xmm1
vpshufd xmm1, xmm0, 250 # xmm1 = xmm0[2,2,3,3]
vpbroadcastq ymm2, xmm1
vpshufd xmm1, xmm0, 85 # xmm1 = xmm0[1,1,1,1]
vpbroadcastd ymm0, xmm0
vpbroadcastq ymm1, xmm1
vpmaskmovd ymm3, ymm3, ymmword ptr [rdx + 96]
vpmaskmovd ymm0, ymm0, ymmword ptr [rdx]
vpmaskmovd ymm2, ymm2, ymmword ptr [rdx + 64]
vpmaskmovd ymm1, ymm1, ymmword ptr [rdx + 32]
ret</pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>