[llvm] [X86] Move the AVX512 VSELECT(COND, 0, X) -> VSELECT(!COND, X, 0) fold to DAGToDAG (PR #145724)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 26 03:39:14 PDT 2025
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/145724
>From a9928055f4feb39f11c9227b25e8bb97ede2edc3 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 25 Jun 2025 16:44:45 +0100
Subject: [PATCH 1/5] [RFC][WIP][X86] Attempt to move the AVX512 VSELECT(COND,
0, X) -> VSELECT(!COND, X, 0) fold to DAGToDAG
Initial attempt to move the fold out of ISel to avoid the regressions identified in #145473.
It still doesn't handle predicate widening, which might not be very pretty...
---
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 59 ++++--
llvm/lib/Target/X86/X86ISelLowering.cpp | 32 ++--
.../test/CodeGen/X86/extract-vselect-setcc.ll | 3 +-
llvm/test/CodeGen/X86/psubus.ll | 4 +-
llvm/test/CodeGen/X86/var-permute-128.ll | 128 ++++++-------
llvm/test/CodeGen/X86/var-permute-256.ll | 170 +++++++++---------
6 files changed, 217 insertions(+), 179 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 32c7d2bfea6c2..768f033356959 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1139,24 +1139,51 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
break;
}
case ISD::VSELECT: {
- // Replace VSELECT with non-mask conditions with with BLENDV/VPTERNLOG.
- EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
- if (EleVT == MVT::i1)
- break;
-
- assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
- assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
- "We can't replace VSELECT with BLENDV in vXi16!");
+ SDValue Cond = N->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ EVT CondVT = Cond.getValueType();
+ EVT EleVT = CondVT.getVectorElementType();
SDValue R;
- if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
- EleVT.getSizeInBits()) {
- R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
- N->getOperand(0), N->getOperand(1), N->getOperand(2),
- CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
+
+ if (EleVT == MVT::i1) {
+ assert(Subtarget->hasAVX512() && "Expected AVX512 support!");
+ if (!ISD::isBuildVectorAllZeros(LHS.getNode()) ||
+ ISD::isBuildVectorAllZeros(RHS.getNode()))
+ break;
+ // If this an avx512 target we can improve the use of zero masking by
+ // swapping the operands and inverting the condition.
+ // vselect cond, op1, op2 = vselect not(cond), op2, op1
+ if (Cond.getOpcode() == ISD::SETCC &&
+ !ISD::isBuildVectorAllZeros(Cond.getOperand(0).getNode())) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ CC = ISD::getSetCCInverse(CC, Cond.getOperand(0).getValueType());
+ R = CurDAG->getSetCC(SDLoc(N), CondVT, Cond.getOperand(0),
+ Cond.getOperand(1), CC);
+ } else if (Cond.getOpcode() == X86ISD::CMPM &&
+ Cond.getConstantOperandVal(2) == 0) {
+ // FLIP FCMP EQ -> (U)NE
+ R = CurDAG->getNode(Cond.getOpcode(), SDLoc(N), CondVT,
+ Cond.getOperand(0), Cond.getOperand(1),
+ CurDAG->getTargetConstant(4, SDLoc(N), MVT::i8));
+ } else {
+ R = CurDAG->getNOT(SDLoc(N), Cond, CondVT);
+ }
+ R = CurDAG->getSelect(SDLoc(N), N->getValueType(0), R, RHS, LHS);
} else {
- R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
- N->getOperand(0), N->getOperand(1),
- N->getOperand(2));
+ // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
+ assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
+ assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
+ "We can't replace VSELECT with BLENDV in vXi16!");
+ if (Subtarget->hasVLX() &&
+ CurDAG->ComputeNumSignBits(Cond) == EleVT.getSizeInBits()) {
+ R = CurDAG->getNode(
+ X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0), Cond, LHS, RHS,
+ CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
+ } else {
+ R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
+ Cond, LHS, RHS);
+ }
}
--I;
CurDAG->ReplaceAllUsesWith(N, R.getNode());
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7c26dd6e2dc2f..da1f48ede5402 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5415,6 +5415,20 @@ static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
}
}
+ // Match not(insert_subvector(undef, setcc(), c))
+ // --> insert_subvector(undef, not(setcc()), c)
+ if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
+ V.getOperand(1).getOpcode() == ISD::SETCC &&
+ V.getValueType().getScalarType() == MVT::i1) {
+ SDValue Cond = V.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ CC = ISD::getSetCCInverse(CC, Cond.getOperand(0).getValueType());
+ SDValue NotSub = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
+ Cond.getOperand(0), Cond.getOperand(1), CC);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(V), V.getValueType(),
+ V.getOperand(0), NotSub, V.getOperand(2));
+ }
+
// Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
SmallVector<SDValue, 2> CatOps;
if (collectConcatOps(V.getNode(), CatOps, DAG)) {
@@ -48049,19 +48063,6 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
}
}
- // Check if the first operand is all zeros and Cond type is vXi1.
- // If this an avx512 target we can improve the use of zero masking by
- // swapping the operands and inverting the condition.
- if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
- Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
- ISD::isBuildVectorAllZeros(LHS.getNode()) &&
- !ISD::isBuildVectorAllZeros(RHS.getNode())) {
- // Invert the cond to not(cond) : xor(op,allones)=not(op)
- SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
- // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
- return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
- }
-
// Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
// get split by legalization.
if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
@@ -48125,11 +48126,14 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return V;
// select(~Cond, X, Y) -> select(Cond, Y, X)
- if (CondVT.getScalarType() != MVT::i1) {
+ if (CondVT.getScalarType() != MVT::i1 ||
+ (ISD::isBuildVectorAllZeros(LHS.getNode()) &&
+ !ISD::isBuildVectorAllZeros(RHS.getNode())))
if (SDValue CondNot = IsNOT(Cond, DAG))
return DAG.getNode(N->getOpcode(), DL, VT,
DAG.getBitcast(CondVT, CondNot), RHS, LHS);
+ if (CondVT.getScalarType() != MVT::i1) {
// select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
if (Cond.getOpcode() == X86ISD::PCMPEQ &&
Cond.getOperand(0).getOpcode() == ISD::AND &&
diff --git a/llvm/test/CodeGen/X86/extract-vselect-setcc.ll b/llvm/test/CodeGen/X86/extract-vselect-setcc.ll
index 96c8e773d5edd..1997323ed61a6 100644
--- a/llvm/test/CodeGen/X86/extract-vselect-setcc.ll
+++ b/llvm/test/CodeGen/X86/extract-vselect-setcc.ll
@@ -5,7 +5,8 @@ define void @PR117684(i1 %cond, <8 x float> %vec, ptr %ptr1, ptr %ptr2) #0 {
; CHECK-LABEL: PR117684:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpnltss %xmm1, %xmm0, %k1
+; CHECK-NEXT: vcmpltss %xmm1, %xmm0, %k0
+; CHECK-NEXT: knotb %k0, %k1
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
; CHECK-NEXT: vinsertf32x4 $0, %xmm0, %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index e10b360b35b56..cc3aee4feba2d 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -981,9 +981,9 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; AVX512-LABEL: test14:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT: vpmovdb %zmm1, %xmm3
; AVX512-NEXT: vpcmpnltud %zmm2, %zmm1, %k1
-; AVX512-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512-NEXT: vpsubb %xmm0, %xmm1, %xmm0 {%k1} {z}
+; AVX512-NEXT: vpsubb %xmm0, %xmm3, %xmm0 {%k1} {z}
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
vector.ph:
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index 7f4111e65cc17..067d4c569d276 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -170,25 +170,26 @@ define <2 x i64> @var_shuffle_zero_v2i64(<2 x i64> %v, <2 x i64> %indices) nounw
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm2 = [3,3]
; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512-NEXT: vpaddq %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vpblendmq %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512-NEXT: vpaddq %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vpermilpd %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpleuq %zmm2, %zmm1, %k1
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
-; AVX512VL-NEXT: vpaddq %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [3,3]
+; AVX512VL-NEXT: vpcmpnleuq %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpblendmq %xmm3, %xmm1, %xmm3 {%k1}
+; AVX512VL-NEXT: vpaddq %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpermilpd %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpcmpleuq %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3>
%or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices
@@ -355,24 +356,26 @@ define <4 x i32> @var_shuffle_zero_v4i32(<4 x i32> %v, <4 x i32> %indices) nounw
; AVX512-LABEL: var_shuffle_zero_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
-; AVX512-NEXT: vpermilps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
+; AVX512-NEXT: vpcmpnleud %zmm2, %zmm1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vpblendmd %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512-NEXT: vpermilps %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpleud %zmm2, %zmm1, %k1
+; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v4i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1}
-; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
+; AVX512VL-NEXT: vpcmpnleud %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpblendmd %xmm3, %xmm1, %xmm3 {%k1}
+; AVX512VL-NEXT: vpermilps %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpcmpleud %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3>
%or = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices
@@ -600,12 +603,12 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw
;
; AVX512VL-LABEL: var_shuffle_zero_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqu16 %xmm2, %xmm1 {%k1}
-; AVX512VL-NEXT: vpermw %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpcmpnleuw %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpcmpleuw %xmm2, %xmm1, %k2
+; AVX512VL-NEXT: vmovdqu16 %xmm3, %xmm1 {%k1}
+; AVX512VL-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k2} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <8 x i16> %indices, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
%or = select <8 x i1> %cmp, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %indices
@@ -923,12 +926,12 @@ define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounw
;
; AVX512VL-LABEL: var_shuffle_zero_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqu8 %xmm2, %xmm1 {%k1}
-; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpcmpnleub %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpcmpleub %xmm2, %xmm1, %k2
+; AVX512VL-NEXT: vmovdqu8 %xmm3, %xmm1 {%k1}
+; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 {%k2} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <16 x i8> %indices, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
%or = select <16 x i1> %cmp, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %indices
@@ -1139,25 +1142,25 @@ define <2 x double> @var_shuffle_zero_v2f64(<2 x double> %v, <2 x i64> %indices)
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm2 = [3,3]
; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512-NEXT: vpaddq %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovapd %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vpblendmq %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512-NEXT: vpaddq %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vpermilpd %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpleuq %zmm2, %zmm1, %k1
+; AVX512-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v2f64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
-; AVX512VL-NEXT: vpaddq %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovapd %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [3,3]
+; AVX512VL-NEXT: vpcmpnleuq %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpblendmq %xmm3, %xmm1, %xmm3 {%k1}
+; AVX512VL-NEXT: vpaddq %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpcmpleuq %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vpermilpd %xmm3, %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3>
%or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices
@@ -1324,24 +1327,25 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n
; AVX512-LABEL: var_shuffle_zero_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
-; AVX512-NEXT: vpermilps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
+; AVX512-NEXT: vpcmpnleud %zmm2, %zmm1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vpblendmd %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512-NEXT: vpermilps %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpleud %zmm2, %zmm1, %k1
+; AVX512-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v4f32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1}
-; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovaps %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
+; AVX512VL-NEXT: vpcmpnleud %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpcmpleud %xmm2, %xmm1, %k2
+; AVX512VL-NEXT: vmovdqa32 %xmm3, %xmm1 {%k1}
+; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0 {%k2} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3>
%or = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices
diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll
index 283c6a303a581..6a450e2665171 100644
--- a/llvm/test/CodeGen/X86/var-permute-256.ll
+++ b/llvm/test/CodeGen/X86/var-permute-256.ll
@@ -158,12 +158,12 @@ define <4 x i64> @var_shuffle_zero_v4i64(<4 x i64> %v, <4 x i64> %indices) nounw
;
; AVX512VL-LABEL: var_shuffle_zero_v4i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm1 {%k1}
-; AVX512VL-NEXT: vpermq %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,3,3,3]
+; AVX512VL-NEXT: vpcmpnleuq %ymm2, %ymm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT: vpcmpleuq %ymm2, %ymm1, %k2
+; AVX512VL-NEXT: vmovdqa64 %ymm3, %ymm1 {%k1}
+; AVX512VL-NEXT: vpermq %ymm0, %ymm1, %ymm0 {%k2} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <4 x i64> %indices, <i64 3, i64 3, i64 3, i64 3>
%or = select <4 x i1> %cmp, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> %indices
@@ -285,23 +285,24 @@ define <8 x i32> @var_shuffle_zero_v8i32(<8 x i32> %v, <8 x i32> %indices) nounw
; AVX512-LABEL: var_shuffle_zero_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
-; AVX512-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
-; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7]
+; AVX512-NEXT: vpcmpnleud %zmm2, %zmm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512-NEXT: vpblendmd %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512-NEXT: vpermd %ymm0, %ymm3, %ymm0
+; AVX512-NEXT: vpcmpleud %zmm2, %zmm1, %k1
+; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v8i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vmovdqa32 %ymm2, %ymm1 {%k1}
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpcmpnleud %ymm2, %ymm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT: vpcmpleud %ymm2, %ymm1, %k2
+; AVX512VL-NEXT: vmovdqa32 %ymm3, %ymm1 {%k1}
+; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 {%k2} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <8 x i32> %indices, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
%or = select <8 x i1> %cmp, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> %indices
@@ -554,18 +555,18 @@ define <16 x i16> @var_shuffle_zero_v16i16(<16 x i16> %v, <16 x i16> %indices) n
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpcmpnleuw %zmm2, %zmm1, %k1
-; AVX512BW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1}
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
-; AVX512BW-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
-; AVX512BW-NEXT: vpshufb %ymm1, %ymm2, %ymm2
+; AVX512BW-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512BW-NEXT: vpblendmw %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
+; AVX512BW-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,2,3]
+; AVX512BW-NEXT: vpshufb %ymm3, %ymm4, %ymm4
; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512BW-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; AVX512BW-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm0
+; AVX512BW-NEXT: vpcmpleuw %zmm2, %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
@@ -587,22 +588,22 @@ define <16 x i16> @var_shuffle_zero_v16i16(<16 x i16> %v, <16 x i16> %indices) n
;
; AVX512VLBW-LABEL: var_shuffle_zero_v16i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
-; AVX512VLBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VLBW-NEXT: vmovdqu16 %ymm2, %ymm1 {%k1}
-; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpcmpnleuw %ymm2, %ymm1, %k1
+; AVX512VLBW-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512VLBW-NEXT: vpcmpleuw %ymm2, %ymm1, %k2
+; AVX512VLBW-NEXT: vmovdqu16 %ymm3, %ymm1 {%k1}
+; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k2} {z}
; AVX512VLBW-NEXT: retq
;
; VLVBMI-LABEL: var_shuffle_zero_v16i16:
; VLVBMI: # %bb.0:
-; VLVBMI-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
-; VLVBMI-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; VLVBMI-NEXT: vmovdqu16 %ymm2, %ymm1 {%k1}
-; VLVBMI-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; VLVBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VLVBMI-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
+; VLVBMI-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; VLVBMI-NEXT: vpcmpnleuw %ymm2, %ymm1, %k1
+; VLVBMI-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; VLVBMI-NEXT: vpcmpleuw %ymm2, %ymm1, %k2
+; VLVBMI-NEXT: vmovdqu16 %ymm3, %ymm1 {%k1}
+; VLVBMI-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k2} {z}
; VLVBMI-NEXT: retq
%cmp = icmp ugt <16 x i16> %indices, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
%or = select <16 x i1> %cmp, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <16 x i16> %indices
@@ -902,16 +903,16 @@ define <32 x i8> @var_shuffle_zero_v32i8(<32 x i8> %v, <32 x i8> %indices) nounw
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX512BW-NEXT: vpcmpnleub %zmm2, %zmm1, %k1
-; AVX512BW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
-; AVX512BW-NEXT: vpshufb %ymm1, %ymm2, %ymm2
+; AVX512BW-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512BW-NEXT: vpblendmb %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,2,3]
+; AVX512BW-NEXT: vpshufb %ymm3, %ymm4, %ymm4
; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512BW-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; AVX512BW-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm0
+; AVX512BW-NEXT: vpcmpleub %zmm2, %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
@@ -931,27 +932,27 @@ define <32 x i8> @var_shuffle_zero_v32i8(<32 x i8> %v, <32 x i8> %indices) nounw
;
; AVX512VLBW-LABEL: var_shuffle_zero_v32i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
-; AVX512VLBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VLBW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
-; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
-; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm2, %ymm2
-; AVX512VLBW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k2
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512VLBW-NEXT: vpcmpnleub %ymm2, %ymm1, %k1
+; AVX512VLBW-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512VLBW-NEXT: vpblendmb %ymm3, %ymm1, %ymm3 {%k1}
+; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4
+; AVX512VLBW-NEXT: vpshufb %ymm3, %ymm4, %ymm4
+; AVX512VLBW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %k1
; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm0, %ymm2 {%k2}
-; AVX512VLBW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1}
-; AVX512VLBW-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512VLBW-NEXT: vpshufb %ymm3, %ymm0, %ymm4 {%k1}
+; AVX512VLBW-NEXT: vpcmpleub %ymm2, %ymm1, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %ymm4, %ymm0 {%k1} {z}
; AVX512VLBW-NEXT: retq
;
; VLVBMI-LABEL: var_shuffle_zero_v32i8:
; VLVBMI: # %bb.0:
-; VLVBMI-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
-; VLVBMI-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; VLVBMI-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
-; VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
-; VLVBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VLVBMI-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
+; VLVBMI-NEXT: vpbroadcastb {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; VLVBMI-NEXT: vpcmpnleub %ymm2, %ymm1, %k1
+; VLVBMI-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; VLVBMI-NEXT: vpcmpleub %ymm2, %ymm1, %k2
+; VLVBMI-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1}
+; VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 {%k2} {z}
; VLVBMI-NEXT: retq
%cmp = icmp ugt <32 x i8> %indices, <i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31>
%or = select <32 x i1> %cmp, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <32 x i8> %indices
@@ -1202,12 +1203,12 @@ define <4 x double> @var_shuffle_zero_v4f64(<4 x double> %v, <4 x i64> %indices)
;
; AVX512VL-LABEL: var_shuffle_zero_v4f64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm1 {%k1}
-; AVX512VL-NEXT: vpermq %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovapd %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,3,3,3]
+; AVX512VL-NEXT: vpcmpnleuq %ymm2, %ymm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT: vpcmpleuq %ymm2, %ymm1, %k2
+; AVX512VL-NEXT: vmovdqa64 %ymm3, %ymm1 {%k1}
+; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0 {%k2} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <4 x i64> %indices, <i64 3, i64 3, i64 3, i64 3>
%or = select <4 x i1> %cmp, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> %indices
@@ -1329,23 +1330,24 @@ define <8 x float> @var_shuffle_zero_v8f32(<8 x float> %v, <8 x i32> %indices) n
; AVX512-LABEL: var_shuffle_zero_v8f32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
-; AVX512-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
-; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7]
+; AVX512-NEXT: vpcmpnleud %zmm2, %zmm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512-NEXT: vpblendmd %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512-NEXT: vpermd %ymm0, %ymm3, %ymm0
+; AVX512-NEXT: vpcmpleud %zmm2, %zmm1, %k1
+; AVX512-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v8f32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vmovdqa32 %ymm2, %ymm1 {%k1}
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovaps %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpcmpnleud %ymm2, %ymm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT: vpcmpleud %ymm2, %ymm1, %k2
+; AVX512VL-NEXT: vmovdqa32 %ymm3, %ymm1 {%k1}
+; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 {%k2} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <8 x i32> %indices, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
%or = select <8 x i1> %cmp, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> %indices
>From b8aa447235fd88130474211142287ba07aaf938c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 26 Jun 2025 08:25:32 +0100
Subject: [PATCH 2/5] add comment to fold
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index da1f48ede5402..c3ef78c404267 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48126,6 +48126,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return V;
// select(~Cond, X, Y) -> select(Cond, Y, X)
+ // Limit vXi1 cases to AVX512 canonicalization of zero mask to the RHS.
if (CondVT.getScalarType() != MVT::i1 ||
(ISD::isBuildVectorAllZeros(LHS.getNode()) &&
!ISD::isBuildVectorAllZeros(RHS.getNode())))
>From 74b68130faa1c069eaab3e8bd0bb73655900276b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 26 Jun 2025 08:30:02 +0100
Subject: [PATCH 3/5] cleanup comment
---
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 768f033356959..d747cfbace989 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1151,9 +1151,9 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
if (!ISD::isBuildVectorAllZeros(LHS.getNode()) ||
ISD::isBuildVectorAllZeros(RHS.getNode()))
break;
- // If this an avx512 target we can improve the use of zero masking by
+ // If this is an avx512 target we can improve the use of zero masking by
// swapping the operands and inverting the condition.
- // vselect cond, op1, op2 = vselect not(cond), op2, op1
+ // vselect cond, zero, op = vselect not(cond), op, zero
if (Cond.getOpcode() == ISD::SETCC &&
!ISD::isBuildVectorAllZeros(Cond.getOperand(0).getNode())) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
>From b015ba0987b6573806312835f8385d38e754a10b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 26 Jun 2025 11:07:19 +0100
Subject: [PATCH 4/5] Move insert_subvector(undef, setcc(), c) handling into
DAGToDAG
---
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 61 +++++++++++--------
llvm/lib/Target/X86/X86ISelLowering.cpp | 58 +++++++-----------
llvm/test/CodeGen/X86/avx512-vbroadcast.ll | 6 +-
.../test/CodeGen/X86/extract-vselect-setcc.ll | 3 +-
llvm/test/CodeGen/X86/var-permute-256.ll | 16 ++---
5 files changed, 69 insertions(+), 75 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index d747cfbace989..23058257943f9 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1143,10 +1143,12 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
EVT CondVT = Cond.getValueType();
- EVT EleVT = CondVT.getVectorElementType();
+ EVT CondSVT = CondVT.getVectorElementType();
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
SDValue R;
- if (EleVT == MVT::i1) {
+ if (CondSVT == MVT::i1) {
assert(Subtarget->hasAVX512() && "Expected AVX512 support!");
if (!ISD::isBuildVectorAllZeros(LHS.getNode()) ||
ISD::isBuildVectorAllZeros(RHS.getNode()))
@@ -1154,35 +1156,44 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
// If this is an avx512 target we can improve the use of zero masking by
// swapping the operands and inverting the condition.
// vselect cond, zero, op = vselect not(cond), op, zero
- if (Cond.getOpcode() == ISD::SETCC &&
- !ISD::isBuildVectorAllZeros(Cond.getOperand(0).getNode())) {
- ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
- CC = ISD::getSetCCInverse(CC, Cond.getOperand(0).getValueType());
- R = CurDAG->getSetCC(SDLoc(N), CondVT, Cond.getOperand(0),
- Cond.getOperand(1), CC);
- } else if (Cond.getOpcode() == X86ISD::CMPM &&
- Cond.getConstantOperandVal(2) == 0) {
- // FLIP FCMP EQ -> (U)NE
- R = CurDAG->getNode(Cond.getOpcode(), SDLoc(N), CondVT,
- Cond.getOperand(0), Cond.getOperand(1),
- CurDAG->getTargetConstant(4, SDLoc(N), MVT::i8));
- } else {
- R = CurDAG->getNOT(SDLoc(N), Cond, CondVT);
- }
- R = CurDAG->getSelect(SDLoc(N), N->getValueType(0), R, RHS, LHS);
+ auto InverseCondition = [this](SDValue Cond, const SDLoc &DL) {
+ EVT CondVT = Cond.getValueType();
+ if (Cond.getOpcode() == ISD::SETCC &&
+ !ISD::isBuildVectorAllZeros(Cond.getOperand(0).getNode())) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ CC = ISD::getSetCCInverse(CC, Cond.getOperand(0).getValueType());
+ return CurDAG->getSetCC(DL, CondVT, Cond.getOperand(0),
+ Cond.getOperand(1), CC);
+ }
+ if (Cond.getOpcode() == X86ISD::CMPM ||
+ Cond.getOpcode() == X86ISD::FSETCCM) {
+ unsigned CC = Cond.getConstantOperandVal(2);
+ return CurDAG->getNode(
+ Cond.getOpcode(), DL, CondVT, Cond.getOperand(0),
+ Cond.getOperand(1),
+ CurDAG->getTargetConstant(CC ^ 4, DL, MVT::i8));
+ }
+ return CurDAG->getNOT(DL, Cond, CondVT);
+ };
+ if (Cond.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Cond.getOperand(0).isUndef())
+ R = CurDAG->getNode(
+ ISD::INSERT_SUBVECTOR, DL, CondVT, Cond.getOperand(0),
+ InverseCondition(Cond.getOperand(1), DL), Cond.getOperand(2));
+ else
+ R = InverseCondition(Cond, DL);
+ R = CurDAG->getSelect(DL, VT, R, RHS, LHS);
} else {
// Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
- assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
+ assert(VT.getVectorElementType() != MVT::i16 &&
"We can't replace VSELECT with BLENDV in vXi16!");
if (Subtarget->hasVLX() &&
- CurDAG->ComputeNumSignBits(Cond) == EleVT.getSizeInBits()) {
- R = CurDAG->getNode(
- X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0), Cond, LHS, RHS,
- CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
+ CurDAG->ComputeNumSignBits(Cond) == CondSVT.getSizeInBits()) {
+ R = CurDAG->getNode(X86ISD::VPTERNLOG, DL, VT, Cond, LHS, RHS,
+ CurDAG->getTargetConstant(0xCA, DL, MVT::i8));
} else {
- R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
- Cond, LHS, RHS);
+ R = CurDAG->getNode(X86ISD::BLENDV, DL, VT, Cond, LHS, RHS);
}
}
--I;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c3ef78c404267..7aaa6db074f0f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5415,20 +5415,6 @@ static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
}
}
- // Match not(insert_subvector(undef, setcc(), c))
- // --> insert_subvector(undef, not(setcc()), c)
- if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
- V.getOperand(1).getOpcode() == ISD::SETCC &&
- V.getValueType().getScalarType() == MVT::i1) {
- SDValue Cond = V.getOperand(1);
- ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
- CC = ISD::getSetCCInverse(CC, Cond.getOperand(0).getValueType());
- SDValue NotSub = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
- Cond.getOperand(0), Cond.getOperand(1), CC);
- return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(V), V.getValueType(),
- V.getOperand(0), NotSub, V.getOperand(2));
- }
-
// Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
SmallVector<SDValue, 2> CatOps;
if (collectConcatOps(V.getNode(), CatOps, DAG)) {
@@ -48134,29 +48120,27 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(N->getOpcode(), DL, VT,
DAG.getBitcast(CondVT, CondNot), RHS, LHS);
- if (CondVT.getScalarType() != MVT::i1) {
- // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
- if (Cond.getOpcode() == X86ISD::PCMPEQ &&
- Cond.getOperand(0).getOpcode() == ISD::AND &&
- ISD::isBuildVectorAllZeros(Cond.getOperand(1).getNode()) &&
- isConstantPowerOf2(Cond.getOperand(0).getOperand(1),
- Cond.getScalarValueSizeInBits(),
- /*AllowUndefs=*/true) &&
- Cond.hasOneUse()) {
- Cond = DAG.getNode(X86ISD::PCMPEQ, DL, CondVT, Cond.getOperand(0),
- Cond.getOperand(0).getOperand(1));
- return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
- }
-
- // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
- // signbit.
- if (Cond.getOpcode() == X86ISD::PCMPGT &&
- ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
- Cond.hasOneUse()) {
- Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
- DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
- return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
- }
+ // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
+ if (Cond.getOpcode() == X86ISD::PCMPEQ &&
+ Cond.getOperand(0).getOpcode() == ISD::AND &&
+ ISD::isBuildVectorAllZeros(Cond.getOperand(1).getNode()) &&
+ isConstantPowerOf2(Cond.getOperand(0).getOperand(1),
+ Cond.getScalarValueSizeInBits(),
+ /*AllowUndefs=*/true) &&
+ Cond.hasOneUse()) {
+ Cond = DAG.getNode(X86ISD::PCMPEQ, DL, CondVT, Cond.getOperand(0),
+ Cond.getOperand(0).getOperand(1));
+ return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
+ }
+
+ // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
+ // signbit.
+ if (Cond.getOpcode() == X86ISD::PCMPGT &&
+ ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
+ Cond.hasOneUse()) {
+ Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
+ DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
+ return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
}
// Try to optimize vXi1 selects if both operands are either all constants or
diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll
index 275df8c3675c0..4d4f86b0c3f82 100644
--- a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll
@@ -210,9 +210,9 @@ define <16 x i32> @test_vbroadcast(<16 x float> %a0) {
; ALL: # %bb.0: # %entry
; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT: vcmpunordps %zmm1, %zmm0, %k1
-; ALL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; ALL-NEXT: knotw %k1, %k1
-; ALL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; ALL-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
+; ALL-NEXT: vcmpordps %zmm1, %zmm0, %k1
+; ALL-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} {z}
; ALL-NEXT: retq
entry:
%0 = sext <16 x i1> zeroinitializer to <16 x i32>
diff --git a/llvm/test/CodeGen/X86/extract-vselect-setcc.ll b/llvm/test/CodeGen/X86/extract-vselect-setcc.ll
index 1997323ed61a6..96c8e773d5edd 100644
--- a/llvm/test/CodeGen/X86/extract-vselect-setcc.ll
+++ b/llvm/test/CodeGen/X86/extract-vselect-setcc.ll
@@ -5,8 +5,7 @@ define void @PR117684(i1 %cond, <8 x float> %vec, ptr %ptr1, ptr %ptr2) #0 {
; CHECK-LABEL: PR117684:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpltss %xmm1, %xmm0, %k0
-; CHECK-NEXT: knotb %k0, %k1
+; CHECK-NEXT: vcmpnltss %xmm1, %xmm0, %k1
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
; CHECK-NEXT: vinsertf32x4 $0, %xmm0, %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll
index 6a450e2665171..abc859d3f156d 100644
--- a/llvm/test/CodeGen/X86/var-permute-256.ll
+++ b/llvm/test/CodeGen/X86/var-permute-256.ll
@@ -148,11 +148,11 @@ define <4 x i64> @var_shuffle_zero_v4i64(<4 x i64> %v, <4 x i64> %indices) nounw
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,3,3,3]
+; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512-NEXT: vpblendmq %zmm3, %zmm1, %zmm3 {%k1}
; AVX512-NEXT: vpcmpleuq %zmm2, %zmm1, %k1
-; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k2
-; AVX512-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2}
-; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpermq %zmm0, %zmm3, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
@@ -1193,11 +1193,11 @@ define <4 x double> @var_shuffle_zero_v4f64(<4 x double> %v, <4 x i64> %indices)
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,3,3,3]
+; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512-NEXT: vpblendmq %zmm3, %zmm1, %zmm3 {%k1}
; AVX512-NEXT: vpcmpleuq %zmm2, %zmm1, %k1
-; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k2
-; AVX512-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2}
-; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpermpd %zmm0, %zmm3, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
>From 180b54f1be34a7ee08f7b6932a1cd7177ed73a36 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 26 Jun 2025 11:15:46 +0100
Subject: [PATCH 5/5] Add back hasOneUse condition limit now we have better
FSETCC handling
---
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 2 +-
llvm/test/CodeGen/X86/avx512-vbroadcast.ll | 6 +-
llvm/test/CodeGen/X86/var-permute-128.ll | 128 ++++++++--------
llvm/test/CodeGen/X86/var-permute-256.ll | 170 ++++++++++-----------
4 files changed, 150 insertions(+), 156 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 23058257943f9..c5b9a0a2ef057 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1150,7 +1150,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
if (CondSVT == MVT::i1) {
assert(Subtarget->hasAVX512() && "Expected AVX512 support!");
- if (!ISD::isBuildVectorAllZeros(LHS.getNode()) ||
+ if (!Cond->hasOneUse() || !ISD::isBuildVectorAllZeros(LHS.getNode()) ||
ISD::isBuildVectorAllZeros(RHS.getNode()))
break;
// If this is an avx512 target we can improve the use of zero masking by
diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll
index 4d4f86b0c3f82..c50418feff6fb 100644
--- a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll
@@ -210,9 +210,9 @@ define <16 x i32> @test_vbroadcast(<16 x float> %a0) {
; ALL: # %bb.0: # %entry
; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT: vcmpunordps %zmm1, %zmm0, %k1
-; ALL-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
-; ALL-NEXT: vcmpordps %zmm1, %zmm0, %k1
-; ALL-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} {z}
+; ALL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
+; ALL-NEXT: knotw %k1, %k1
+; ALL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; ALL-NEXT: retq
entry:
%0 = sext <16 x i1> zeroinitializer to <16 x i32>
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index 067d4c569d276..7f4111e65cc17 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -170,26 +170,25 @@ define <2 x i64> @var_shuffle_zero_v2i64(<2 x i64> %v, <2 x i64> %indices) nounw
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm2 = [3,3]
; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX512-NEXT: vpblendmq %zmm3, %zmm1, %zmm3 {%k1}
-; AVX512-NEXT: vpaddq %xmm3, %xmm3, %xmm3
-; AVX512-NEXT: vpermilpd %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpleuq %zmm2, %zmm1, %k1
-; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512-NEXT: vpaddq %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [3,3]
-; AVX512VL-NEXT: vpcmpnleuq %xmm2, %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpblendmq %xmm3, %xmm1, %xmm3 {%k1}
-; AVX512VL-NEXT: vpaddq %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpermilpd %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpleuq %xmm2, %xmm1, %k1
-; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
+; AVX512VL-NEXT: vpaddq %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3>
%or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices
@@ -356,26 +355,24 @@ define <4 x i32> @var_shuffle_zero_v4i32(<4 x i32> %v, <4 x i32> %indices) nounw
; AVX512-LABEL: var_shuffle_zero_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
-; AVX512-NEXT: vpcmpnleud %zmm2, %zmm1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX512-NEXT: vpblendmd %zmm3, %zmm1, %zmm3 {%k1}
-; AVX512-NEXT: vpermilps %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpleud %zmm2, %zmm1, %k1
-; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
+; AVX512-NEXT: vpermilps %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v4i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
-; AVX512VL-NEXT: vpcmpnleud %xmm2, %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpblendmd %xmm3, %xmm1, %xmm3 {%k1}
-; AVX512VL-NEXT: vpermilps %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpleud %xmm2, %xmm1, %k1
-; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1}
+; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3>
%or = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices
@@ -603,12 +600,12 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw
;
; AVX512VL-LABEL: var_shuffle_zero_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7]
-; AVX512VL-NEXT: vpcmpnleuw %xmm2, %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpcmpleuw %xmm2, %xmm1, %k2
-; AVX512VL-NEXT: vmovdqu16 %xmm3, %xmm1 {%k1}
-; AVX512VL-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k2} {z}
+; AVX512VL-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT: vmovdqu16 %xmm2, %xmm1 {%k1}
+; AVX512VL-NEXT: vpermw %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <8 x i16> %indices, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
%or = select <8 x i1> %cmp, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %indices
@@ -926,12 +923,12 @@ define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounw
;
; AVX512VL-LABEL: var_shuffle_zero_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT: vpcmpnleub %xmm2, %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpcmpleub %xmm2, %xmm1, %k2
-; AVX512VL-NEXT: vmovdqu8 %xmm3, %xmm1 {%k1}
-; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 {%k2} {z}
+; AVX512VL-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT: vmovdqu8 %xmm2, %xmm1 {%k1}
+; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <16 x i8> %indices, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
%or = select <16 x i1> %cmp, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %indices
@@ -1142,25 +1139,25 @@ define <2 x double> @var_shuffle_zero_v2f64(<2 x double> %v, <2 x i64> %indices)
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm2 = [3,3]
; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX512-NEXT: vpblendmq %zmm3, %zmm1, %zmm3 {%k1}
-; AVX512-NEXT: vpaddq %xmm3, %xmm3, %xmm3
-; AVX512-NEXT: vpermilpd %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpleuq %zmm2, %zmm1, %k1
-; AVX512-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512-NEXT: vpaddq %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vmovapd %zmm1, %zmm0 {%k1}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v2f64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [3,3]
-; AVX512VL-NEXT: vpcmpnleuq %xmm2, %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpblendmq %xmm3, %xmm1, %xmm3 {%k1}
-; AVX512VL-NEXT: vpaddq %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpcmpleuq %xmm2, %xmm1, %k1
-; AVX512VL-NEXT: vpermilpd %xmm3, %xmm0, %xmm0 {%k1} {z}
+; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
+; AVX512VL-NEXT: vpaddq %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vmovapd %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3>
%or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices
@@ -1327,25 +1324,24 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n
; AVX512-LABEL: var_shuffle_zero_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
-; AVX512-NEXT: vpcmpnleud %zmm2, %zmm1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX512-NEXT: vpblendmd %zmm3, %zmm1, %zmm3 {%k1}
-; AVX512-NEXT: vpermilps %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpleud %zmm2, %zmm1, %k1
-; AVX512-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
+; AVX512-NEXT: vpermilps %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v4f32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
-; AVX512VL-NEXT: vpcmpnleud %xmm2, %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpcmpleud %xmm2, %xmm1, %k2
-; AVX512VL-NEXT: vmovdqa32 %xmm3, %xmm1 {%k1}
-; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0 {%k2} {z}
+; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1}
+; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vmovaps %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3>
%or = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices
diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll
index abc859d3f156d..ab2ffdfd0ff2c 100644
--- a/llvm/test/CodeGen/X86/var-permute-256.ll
+++ b/llvm/test/CodeGen/X86/var-permute-256.ll
@@ -158,12 +158,12 @@ define <4 x i64> @var_shuffle_zero_v4i64(<4 x i64> %v, <4 x i64> %indices) nounw
;
; AVX512VL-LABEL: var_shuffle_zero_v4i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,3,3,3]
-; AVX512VL-NEXT: vpcmpnleuq %ymm2, %ymm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512VL-NEXT: vpcmpleuq %ymm2, %ymm1, %k2
-; AVX512VL-NEXT: vmovdqa64 %ymm3, %ymm1 {%k1}
-; AVX512VL-NEXT: vpermq %ymm0, %ymm1, %ymm0 {%k2} {z}
+; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm1 {%k1}
+; AVX512VL-NEXT: vpermq %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <4 x i64> %indices, <i64 3, i64 3, i64 3, i64 3>
%or = select <4 x i1> %cmp, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> %indices
@@ -285,24 +285,23 @@ define <8 x i32> @var_shuffle_zero_v8i32(<8 x i32> %v, <8 x i32> %indices) nounw
; AVX512-LABEL: var_shuffle_zero_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7]
-; AVX512-NEXT: vpcmpnleud %zmm2, %zmm1, %k1
-; AVX512-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512-NEXT: vpblendmd %zmm3, %zmm1, %zmm3 {%k1}
-; AVX512-NEXT: vpermd %ymm0, %ymm3, %ymm0
-; AVX512-NEXT: vpcmpleud %zmm2, %zmm1, %k1
-; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
+; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v8i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7]
-; AVX512VL-NEXT: vpcmpnleud %ymm2, %ymm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512VL-NEXT: vpcmpleud %ymm2, %ymm1, %k2
-; AVX512VL-NEXT: vmovdqa32 %ymm3, %ymm1 {%k1}
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 {%k2} {z}
+; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT: vmovdqa32 %ymm2, %ymm1 {%k1}
+; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <8 x i32> %indices, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
%or = select <8 x i1> %cmp, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> %indices
@@ -555,18 +554,18 @@ define <16 x i16> @var_shuffle_zero_v16i16(<16 x i16> %v, <16 x i16> %indices) n
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpcmpnleuw %zmm2, %zmm1, %k1
-; AVX512BW-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512BW-NEXT: vpblendmw %zmm3, %zmm1, %zmm3 {%k1}
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
-; AVX512BW-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,2,3]
-; AVX512BW-NEXT: vpshufb %ymm3, %ymm4, %ymm4
+; AVX512BW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
+; AVX512BW-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
+; AVX512BW-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
-; AVX512BW-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm0
-; AVX512BW-NEXT: vpcmpleuw %zmm2, %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
@@ -588,22 +587,22 @@ define <16 x i16> @var_shuffle_zero_v16i16(<16 x i16> %v, <16 x i16> %indices) n
;
; AVX512VLBW-LABEL: var_shuffle_zero_v16i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpcmpnleuw %ymm2, %ymm1, %k1
-; AVX512VLBW-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512VLBW-NEXT: vpcmpleuw %ymm2, %ymm1, %k2
-; AVX512VLBW-NEXT: vmovdqu16 %ymm3, %ymm1 {%k1}
-; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k2} {z}
+; AVX512VLBW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
+; AVX512VLBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vmovdqu16 %ymm2, %ymm1 {%k1}
+; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; VLVBMI-LABEL: var_shuffle_zero_v16i16:
; VLVBMI: # %bb.0:
-; VLVBMI-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; VLVBMI-NEXT: vpcmpnleuw %ymm2, %ymm1, %k1
-; VLVBMI-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; VLVBMI-NEXT: vpcmpleuw %ymm2, %ymm1, %k2
-; VLVBMI-NEXT: vmovdqu16 %ymm3, %ymm1 {%k1}
-; VLVBMI-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k2} {z}
+; VLVBMI-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
+; VLVBMI-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; VLVBMI-NEXT: vmovdqu16 %ymm2, %ymm1 {%k1}
+; VLVBMI-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; VLVBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VLVBMI-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
; VLVBMI-NEXT: retq
%cmp = icmp ugt <16 x i16> %indices, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
%or = select <16 x i1> %cmp, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <16 x i16> %indices
@@ -903,16 +902,16 @@ define <32 x i8> @var_shuffle_zero_v32i8(<32 x i8> %v, <32 x i8> %indices) nounw
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX512BW-NEXT: vpcmpnleub %zmm2, %zmm1, %k1
-; AVX512BW-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512BW-NEXT: vpblendmb %zmm3, %zmm1, %zmm3 {%k1}
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,2,3]
-; AVX512BW-NEXT: vpshufb %ymm3, %ymm4, %ymm4
+; AVX512BW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
+; AVX512BW-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
-; AVX512BW-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm0
-; AVX512BW-NEXT: vpcmpleub %zmm2, %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
@@ -932,27 +931,27 @@ define <32 x i8> @var_shuffle_zero_v32i8(<32 x i8> %v, <32 x i8> %indices) nounw
;
; AVX512VLBW-LABEL: var_shuffle_zero_v32i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT: vpcmpnleub %ymm2, %ymm1, %k1
-; AVX512VLBW-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512VLBW-NEXT: vpblendmb %ymm3, %ymm1, %ymm3 {%k1}
-; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4
-; AVX512VLBW-NEXT: vpshufb %ymm3, %ymm4, %ymm4
-; AVX512VLBW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %k1
+; AVX512VLBW-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
+; AVX512VLBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
+; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
+; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k2
; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX512VLBW-NEXT: vpshufb %ymm3, %ymm0, %ymm4 {%k1}
-; AVX512VLBW-NEXT: vpcmpleub %ymm2, %ymm1, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %ymm4, %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm0, %ymm2 {%k2}
+; AVX512VLBW-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1}
+; AVX512VLBW-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VLBW-NEXT: retq
;
; VLVBMI-LABEL: var_shuffle_zero_v32i8:
; VLVBMI: # %bb.0:
-; VLVBMI-NEXT: vpbroadcastb {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; VLVBMI-NEXT: vpcmpnleub %ymm2, %ymm1, %k1
-; VLVBMI-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; VLVBMI-NEXT: vpcmpleub %ymm2, %ymm1, %k2
-; VLVBMI-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1}
-; VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 {%k2} {z}
+; VLVBMI-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
+; VLVBMI-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; VLVBMI-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
+; VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; VLVBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VLVBMI-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
; VLVBMI-NEXT: retq
%cmp = icmp ugt <32 x i8> %indices, <i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31>
%or = select <32 x i1> %cmp, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <32 x i8> %indices
@@ -1203,12 +1202,12 @@ define <4 x double> @var_shuffle_zero_v4f64(<4 x double> %v, <4 x i64> %indices)
;
; AVX512VL-LABEL: var_shuffle_zero_v4f64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,3,3,3]
-; AVX512VL-NEXT: vpcmpnleuq %ymm2, %ymm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512VL-NEXT: vpcmpleuq %ymm2, %ymm1, %k2
-; AVX512VL-NEXT: vmovdqa64 %ymm3, %ymm1 {%k1}
-; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0 {%k2} {z}
+; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm1 {%k1}
+; AVX512VL-NEXT: vpermq %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vmovapd %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <4 x i64> %indices, <i64 3, i64 3, i64 3, i64 3>
%or = select <4 x i1> %cmp, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> %indices
@@ -1330,24 +1329,23 @@ define <8 x float> @var_shuffle_zero_v8f32(<8 x float> %v, <8 x i32> %indices) n
; AVX512-LABEL: var_shuffle_zero_v8f32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7]
-; AVX512-NEXT: vpcmpnleud %zmm2, %zmm1, %k1
-; AVX512-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512-NEXT: vpblendmd %zmm3, %zmm1, %zmm3 {%k1}
-; AVX512-NEXT: vpermd %ymm0, %ymm3, %ymm0
-; AVX512-NEXT: vpcmpleud %zmm2, %zmm1, %k1
-; AVX512-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
+; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1}
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v8f32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7]
-; AVX512VL-NEXT: vpcmpnleud %ymm2, %ymm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512VL-NEXT: vpcmpleud %ymm2, %ymm1, %k2
-; AVX512VL-NEXT: vmovdqa32 %ymm3, %ymm1 {%k1}
-; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 {%k2} {z}
+; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT: vmovdqa32 %ymm2, %ymm1 {%k1}
+; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vmovaps %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <8 x i32> %indices, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
%or = select <8 x i1> %cmp, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> %indices
More information about the llvm-commits mailing list