[llvm] [X86] visitSelect - widen select(cond,extract_subvector(shuffle(vec0)),vec1) if it will create a mask instruction (PR #115223)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 6 14:01:40 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
<details>
<summary>Changes</summary>
This patch extends the existing fold "select(mask, extract_subvector(shuffle(x)), zero) --> extract_subvector(select(insert_subvector(mask), shuffle(x), zero))", to also handle the non-zero case.
I've put in a restriction so that VPERMV3 shuffles (which take 3 vector operands) only use this fold with the zero select, since in most circumstances we are not selecting with either of the source vectors (the only case the mask instructions can match).
We should be able to generalize this in the future to work with other maskable instructions, but this is a good initial improvement.
Fixes #<!-- -->113400
---
Patch is 44.83 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115223.diff
2 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+5-4)
- (modified) llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll (+189-138)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c15517249eb819..c0938b8b08f00d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -46835,9 +46835,9 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
}
- // AVX512 - Extend select with zero to merge with target shuffle.
- // select(mask, extract_subvector(shuffle(x)), zero) -->
- // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
+ // AVX512 - Extend select to merge with target shuffle.
+ // select(mask, extract_subvector(shuffle(x)), y) -->
+ // extract_subvector(select(widen(mask), shuffle(x), widen(y)))
// TODO - support non target shuffles as well with canCombineAsMaskOperation.
if (Subtarget.hasAVX512() && CondVT.isVector() &&
CondVT.getVectorElementType() == MVT::i1) {
@@ -46847,7 +46847,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
isNullConstant(Op.getOperand(1)) &&
TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
Op.hasOneUse() && Op.getOperand(0).hasOneUse() &&
- ISD::isBuildVectorAllZeros(Alt.getNode());
+ (Op.getOperand(0).getOpcode() != X86ISD::VPERMV3 ||
+ ISD::isBuildVectorAllZeros(Alt.getNode()));
};
bool SelectableLHS = SelectableOp(LHS, RHS);
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 1b9e9b200a9e3c..ffbeeb19a4aebd 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -18,10 +18,11 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0:
; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [8,6,12,4,7,9,14,8]
-; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
@@ -47,10 +48,11 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1:
; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [4,12,9,4,14,15,12,14]
-; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
@@ -76,10 +78,11 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2:
; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [4,11,14,10,7,1,6,9]
-; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
@@ -116,10 +119,11 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) {
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask3:
; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [14,15,7,13,4,12,8,0]
-; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
@@ -403,10 +407,11 @@ define <16 x i16> @test_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) {
define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask3:
; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
-; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
-; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -473,10 +478,11 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask1:
; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,21,27,10,8,19,14,5]
-; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
@@ -502,10 +508,11 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask2:
; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [15,13,18,16,9,11,26,8]
-; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
@@ -542,10 +549,11 @@ define <8 x i16> @test_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec) {
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask3:
; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [17,0,23,10,1,8,7,30]
-; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
@@ -906,10 +914,11 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) {
define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask0:
; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [4,0,3,2]
-; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
@@ -935,10 +944,11 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i
define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask1:
; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,0,7,3]
-; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
@@ -1002,10 +1012,11 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) {
define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask3:
; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [5,3,2,5]
-; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
@@ -1188,10 +1199,11 @@ define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) {
define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask0:
; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,13,11,14,7,10,1,6]
-; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1215,10 +1227,11 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x
define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask1:
; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,0,15,3,2,3,6,8]
-; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1242,10 +1255,11 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x
define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask2:
; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,15,15,2,6,10,14,7]
-; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1279,10 +1293,11 @@ define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) {
define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask3:
; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [14,5,7,7,10,3,9,3]
-; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1317,10 +1332,11 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) {
define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0:
; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12]
-; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
@@ -1346,11 +1362,12 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x
define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask1:
; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [5,1,3,4]
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
@@ -1377,10 +1394,11 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x
define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask2:
; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,1,13,0]
-; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
@@ -1417,10 +1435,11 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) {
define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask3:
; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,0,0,13]
-; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
@@ -1782,9 +1801,10 @@ define <2 x i64> @test_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec) {
define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,2,3]
+; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,3]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
@@ -1809,9 +1829,10 @@ define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i
define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
+; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,2,3]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
@@ -1939,10 +1960,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i
define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [6,4,6,1]
-; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0
; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-FAST-NEXT: retq
;
; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
@@ -1983,10 +2005,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i
define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [6,3,6,3]
-; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0
; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-FAST-NEXT: retq
;
; C...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/115223
More information about the llvm-commits
mailing list