[llvm] cc21aa1 - [X86] lower1BitShuffle - fold permute(setcc(x, y)) -> setcc(permute(x),permute(y)) for 32/64-bit element vectors
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 10 04:36:07 PST 2024
Author: Simon Pilgrim
Date: 2024-01-10T12:35:50Z
New Revision: cc21aa1922b3d0c4fde52046d8d16d1048f8064e
URL: https://github.com/llvm/llvm-project/commit/cc21aa1922b3d0c4fde52046d8d16d1048f8064e
DIFF: https://github.com/llvm/llvm-project/commit/cc21aa1922b3d0c4fde52046d8d16d1048f8064e.diff
LOG: [X86] lower1BitShuffle - fold permute(setcc(x,y)) -> setcc(permute(x),permute(y)) for 32/64-bit element vectors
Noticed in #77459 - for wider element types, it's usually better to pre-shuffle the comparison arguments if we can, like we already do for broadcasts.
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/pr77459.ll
llvm/test/CodeGen/X86/vector-shuffle-v1.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6da137426c56f0..5f6f500e49dd2a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -17224,6 +17224,7 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
"Cannot lower 512-bit vectors w/o basic ISA!");
int NumElts = Mask.size();
+ int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
// Try to recognize shuffles that are just padding a subvector with zeros.
int SubvecElts = 0;
@@ -17289,17 +17290,18 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
Offset += NumElts; // Increment for next iteration.
}
- // If we're broadcasting a SETCC result, try to broadcast the ops instead.
+ // If we're performing an unary shuffle on a SETCC result, try to shuffle the
+ // ops instead.
// TODO: What other unary shuffles would benefit from this?
- if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC &&
- V1->hasOneUse()) {
+ if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
SDValue Op0 = V1.getOperand(0);
SDValue Op1 = V1.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
EVT OpVT = Op0.getValueType();
- return DAG.getSetCC(
- DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
- DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
+ if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
+ return DAG.getSetCC(
+ DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
+ DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
}
MVT ExtVT;
diff --git a/llvm/test/CodeGen/X86/pr77459.ll b/llvm/test/CodeGen/X86/pr77459.ll
index cf073e97137eb8..9c072e6f5e3fcf 100644
--- a/llvm/test/CodeGen/X86/pr77459.ll
+++ b/llvm/test/CodeGen/X86/pr77459.ll
@@ -42,10 +42,9 @@ define i4 @reverse_cmp_v4i1(<4 x i32> %a0, <4 x i32> %a1) {
;
; AVX512-LABEL: reverse_cmp_v4i1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
-; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX512-NEXT: vpmovd2m %xmm0, %k0
+; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
index 809d94b649fb48..6ef203999af6ed 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -9,8 +9,6 @@ define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
; AVX512F-LABEL: shuf2i1_1_0:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
@@ -21,19 +19,15 @@ define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
; AVX512VL-LABEL: shuf2i1_1_0:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z}
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf2i1_1_0:
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
-; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
-; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
@@ -86,10 +80,8 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
; AVX512F-LABEL: shuf4i1_3_2_10:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -98,21 +90,17 @@ define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
;
; AVX512VL-LABEL: shuf4i1_3_2_10:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} {z}
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1
; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
; VL_BW_DQ: # %bb.0:
-; VL_BW_DQ-NEXT: vpslld $31, %xmm0, %xmm0
-; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0
-; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; VL_BW_DQ-NEXT: vpslld $31, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
; VL_BW_DQ-NEXT: retq
@@ -123,11 +111,10 @@ define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) {
; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
+; AVX512F-NEXT: vpermq %zmm2, %zmm1, %zmm2
; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -136,12 +123,11 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
;
; AVX512VL-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
+; AVX512VL-NEXT: vpermq %zmm2, %zmm1, %zmm2
+; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512VL-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,1,0,3,7,7,0]
-; AVX512VL-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
@@ -149,11 +135,10 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
;
; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; VL_BW_DQ: # %bb.0:
+; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
+; VL_BW_DQ-NEXT: vpermq %zmm2, %zmm1, %zmm2
+; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
-; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
-; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [3,6,1,0,3,7,7,0]
-; VL_BW_DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
More information about the llvm-commits
mailing list