[llvm] [X86] fold AND(X,SEXT(SETCC())) -> SELECT(SETCC(),X,0) to support AVX512 predicated {k}{z} masks (PR #131788)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 18 04:47:25 PDT 2025
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/131788
We already do this for the equivalent ANDNP(SEXT(SETCC()),X) pattern.
Fixes #109272
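As a minimal illustration (hypothetical IR, not taken from the patch), a function that exposes the pattern:

  define <64 x i8> @and_sext_setcc(<64 x i8> %x, <64 x i8> %a) {
    %cmp = icmp eq <64 x i8> %a, zeroinitializer   ; SETCC producing <64 x i1>
    %sext = sext <64 x i1> %cmp to <64 x i8>       ; sign-extended mask vector
    %and = and <64 x i8> %x, %sext                 ; AND(X,SEXT(SETCC()))
    ret <64 x i8> %and
  }

With this fold, an AVX512BW target can lower the compare into a mask register and apply it as a single zero-masked move, roughly:

  vptestnmb %zmm1, %zmm1, %k1
  vmovdqu8 %zmm0, %zmm0 {%k1} {z}

instead of materializing the mask as a vector via vpmovm2b and ANDing it in with vpandq, as the updated tests below show.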
From 9b477a7bceaf56a915db625d7baaccb90c1c8ae2 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 18 Mar 2025 11:46:39 +0000
Subject: [PATCH] [X86] fold AND(X,SEXT(SETCC())) -> SELECT(SETCC(),X,0) to
support AVX512 predicated {k}{z} masks
We already do this for the equivalent ANDNP(SEXT(SETCC()),X) pattern.
Fixes #109272
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 18 +++++++++
llvm/test/CodeGen/X86/gfni-lzcnt.ll | 20 ++++------
llvm/test/CodeGen/X86/vector-lzcnt-512.ll | 40 ++++++++-----------
.../vector-shuffle-combining-avx512vbmi.ll | 18 +++------
4 files changed, 48 insertions(+), 48 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 49a8f62421f68..dc3f313462a43 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -51347,6 +51347,8 @@ static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG,
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
+ using namespace SDPatternMatch;
+
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
@@ -51481,6 +51483,22 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
}
}
+ // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask
+ // to make use of predicated selects.
+ // AND(X,SEXT(SETCC())) -> SELECT(SETCC(),X,0)
+ if (DCI.isAfterLegalizeDAG() && VT.isVector()) {
+ SDValue X, Y;
+ EVT CondVT = VT.changeVectorElementType(MVT::i1);
+ if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) &&
+ sd_match(N, m_And(m_Value(X),
+ m_OneUse(m_SExt(m_AllOf(
+ m_Value(Y), m_SpecificVT(CondVT),
+ m_SetCC(m_Value(), m_Value(), m_Value()))))))) {
+ return DAG.getSelect(dl, VT, Y, X,
+ getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl));
+ }
+ }
+
// Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
// avoids slow variable shift (moving shift amount to ECX etc.)
if (isOneConstant(N1) && N0->hasOneUse()) {
diff --git a/llvm/test/CodeGen/X86/gfni-lzcnt.ll b/llvm/test/CodeGen/X86/gfni-lzcnt.ll
index e84af84b36aa9..8e48950c32cd8 100644
--- a/llvm/test/CodeGen/X86/gfni-lzcnt.ll
+++ b/llvm/test/CodeGen/X86/gfni-lzcnt.ll
@@ -360,14 +360,12 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; GFNIAVX512BW-LABEL: testv64i8:
; GFNIAVX512BW: # %bb.0:
; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
+; GFNIAVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1
; GFNIAVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; GFNIAVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; GFNIAVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
-; GFNIAVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0
-; GFNIAVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
-; GFNIAVX512BW-NEXT: vpmovm2b %k0, %zmm1
-; GFNIAVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GFNIAVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1
+; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; GFNIAVX512BW-NEXT: retq
%out = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %in, i1 0)
ret <64 x i8> %out
@@ -494,14 +492,12 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; GFNIAVX512BW-LABEL: testv64i8u:
; GFNIAVX512BW: # %bb.0:
; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
+; GFNIAVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1
; GFNIAVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; GFNIAVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; GFNIAVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
-; GFNIAVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0
-; GFNIAVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
-; GFNIAVX512BW-NEXT: vpmovm2b %k0, %zmm1
-; GFNIAVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GFNIAVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1
+; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; GFNIAVX512BW-NEXT: retq
%out = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %in, i1 -1)
ret <64 x i8> %out
diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
index a722a5aee873b..d35a365508d54 100644
--- a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
+++ b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
@@ -369,14 +369,12 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm2
-; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
-; AVX512BW-NEXT: vpandq %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpaddb %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm3 {%k1} {z}
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
@@ -455,14 +453,12 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm2
-; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
-; AVX512BW-NEXT: vpandq %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpaddb %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm3 {%k1} {z}
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
@@ -561,14 +557,12 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
-; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: testv64i8:
@@ -651,14 +645,12 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
-; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: testv64i8u:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
index 220653e99addb..7d6ca16313583 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
@@ -149,10 +149,8 @@ define <64 x i8> @combine_permi2q_pshufb_as_permi2d_mask(<8 x i64> %a0, <8 x i64
define <64 x i8> @combine_vpermi2var_v64i8_with_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> %a2) {
; CHECK-LABEL: combine_vpermi2var_v64i8_with_mask:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermt2b %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vpmovb2m %zmm1, %k0
-; CHECK-NEXT: vpmovm2b %k0, %zmm1
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpmovb2m %zmm1, %k1
+; CHECK-NEXT: vpermt2b %zmm2, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: ret{{[l|q]}}
%perm = tail call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> %a2)
%cmp = icmp slt <64 x i8> %a1, zeroinitializer
@@ -177,19 +175,15 @@ define <64 x i8> @combine_vpermi2var_constant_v64i8_with_mask(<64 x i8> %a0) {
; X86-LABEL: combine_vpermi2var_constant_v64i8_with_mask:
; X86: # %bb.0:
; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
-; X86-NEXT: vpermt2b {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm1
-; X86-NEXT: vpmovb2m %zmm0, %k0
-; X86-NEXT: vpmovm2b %k0, %zmm0
-; X86-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; X86-NEXT: vpmovb2m %zmm0, %k1
+; X86-NEXT: vpermi2b {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_constant_v64i8_with_mask:
; X64: # %bb.0:
; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
-; X64-NEXT: vpermt2b {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
-; X64-NEXT: vpmovb2m %zmm0, %k0
-; X64-NEXT: vpmovm2b %k0, %zmm0
-; X64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; X64-NEXT: vpmovb2m %zmm0, %k1
+; X64-NEXT: vpermi2b {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
%perm = tail call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39, i8 40, i8 41, i8 42, i8 43, i8 44, i8 45, i8 46, i8 47, i8 48, i8 49, i8 50, i8 51, i8 52, i8 53, i8 54, i8 55, i8 56, i8 57, i8 58, i8 59, i8 60, i8 61, i8 62, i8 63>, <64 x i8> %a0, <64 x i8> <i8 64, i8 65, i8 66, i8 67, i8 68, i8 69, i8 70, i8 71, i8 72, i8 73, i8 74, i8 75, i8 76, i8 77, i8 78, i8 79, i8 80, i8 81, i8 82, i8 83, i8 84, i8 85, i8 86, i8 87, i8 88, i8 89, i8 90, i8 91, i8 92, i8 93, i8 94, i8 95, i8 96, i8 97, i8 98, i8 99, i8 100, i8 101, i8 102, i8 103, i8 104, i8 105, i8 106, i8 107, i8 108, i8 109, i8 110, i8 111, i8 112, i8 113, i8 114, i8 115, i8 116, i8 117, i8 118, i8 119, i8 120, i8 121, i8 122, i8 123, i8 124, i8 125, i8 126, i8 127>)
%cmp = icmp slt <64 x i8> %a0, zeroinitializer