[llvm] [RFC][WIP][X86] Attempt to move the AVX512 VSELECT(COND, 0, X) -> VSELECT(!COND, X, 0) fold to DAGToDAG (PR #145724)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 26 03:07:39 PDT 2025
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/145724
>From a9928055f4feb39f11c9227b25e8bb97ede2edc3 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 25 Jun 2025 16:44:45 +0100
Subject: [PATCH 1/4] [RFC][WIP][X86] Attempt to move the AVX512 VSELECT(COND,
0, X) -> VSELECT(!COND, X, 0) fold to DAGToDAG
Initial attempt to move the fold out of lowering and into DAGToDAG preprocessing, to avoid the regressions identified in #145473.
It still doesn't handle predicate widening, which might not be very pretty.
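For reference, a minimal IR example (illustrative only, not taken from the patch) that exercises the fold. The zeroable operand starts on the LHS, so the fold inverts the predicate and swaps the operands, letting ISel use AVX512 zero masking directly:

  ; llc -mtriple=x86_64-- -mattr=+avx512vl
  define <4 x i32> @select_zero_lhs(<4 x i32> %x, <4 x i32> %y) {
    %cond = icmp ugt <4 x i32> %x, %y
    ; vselect cond, zero, y --> vselect (icmp ule x, y), y, zero
    %sel = select <4 x i1> %cond, <4 x i32> zeroinitializer, <4 x i32> %y
    ret <4 x i32> %sel
  }

The inverted form should select to a single compare-into-mask plus a zero-masked move (vpcmpleud + vmovdqa32 {z}), matching the pattern visible in the var-permute test diffs below.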
---
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 59 ++++--
llvm/lib/Target/X86/X86ISelLowering.cpp | 32 ++--
.../test/CodeGen/X86/extract-vselect-setcc.ll | 3 +-
llvm/test/CodeGen/X86/psubus.ll | 4 +-
llvm/test/CodeGen/X86/var-permute-128.ll | 128 ++++++-------
llvm/test/CodeGen/X86/var-permute-256.ll | 170 +++++++++---------
6 files changed, 217 insertions(+), 179 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 32c7d2bfea6c2..768f033356959 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1139,24 +1139,51 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
break;
}
case ISD::VSELECT: {
- // Replace VSELECT with non-mask conditions with with BLENDV/VPTERNLOG.
- EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
- if (EleVT == MVT::i1)
- break;
-
- assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
- assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
- "We can't replace VSELECT with BLENDV in vXi16!");
+ SDValue Cond = N->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ EVT CondVT = Cond.getValueType();
+ EVT EleVT = CondVT.getVectorElementType();
SDValue R;
- if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
- EleVT.getSizeInBits()) {
- R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
- N->getOperand(0), N->getOperand(1), N->getOperand(2),
- CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
+
+ if (EleVT == MVT::i1) {
+ assert(Subtarget->hasAVX512() && "Expected AVX512 support!");
+ if (!ISD::isBuildVectorAllZeros(LHS.getNode()) ||
+ ISD::isBuildVectorAllZeros(RHS.getNode()))
+ break;
+ // If this an avx512 target we can improve the use of zero masking by
+ // swapping the operands and inverting the condition.
+ // vselect cond, op1, op2 = vselect not(cond), op2, op1
+ if (Cond.getOpcode() == ISD::SETCC &&
+ !ISD::isBuildVectorAllZeros(Cond.getOperand(0).getNode())) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ CC = ISD::getSetCCInverse(CC, Cond.getOperand(0).getValueType());
+ R = CurDAG->getSetCC(SDLoc(N), CondVT, Cond.getOperand(0),
+ Cond.getOperand(1), CC);
+ } else if (Cond.getOpcode() == X86ISD::CMPM &&
+ Cond.getConstantOperandVal(2) == 0) {
+ // FLIP FCMP EQ -> (U)NE
+ R = CurDAG->getNode(Cond.getOpcode(), SDLoc(N), CondVT,
+ Cond.getOperand(0), Cond.getOperand(1),
+ CurDAG->getTargetConstant(4, SDLoc(N), MVT::i8));
+ } else {
+ R = CurDAG->getNOT(SDLoc(N), Cond, CondVT);
+ }
+ R = CurDAG->getSelect(SDLoc(N), N->getValueType(0), R, RHS, LHS);
} else {
- R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
- N->getOperand(0), N->getOperand(1),
- N->getOperand(2));
+ // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
+ assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
+ assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
+ "We can't replace VSELECT with BLENDV in vXi16!");
+ if (Subtarget->hasVLX() &&
+ CurDAG->ComputeNumSignBits(Cond) == EleVT.getSizeInBits()) {
+ R = CurDAG->getNode(
+ X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0), Cond, LHS, RHS,
+ CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
+ } else {
+ R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
+ Cond, LHS, RHS);
+ }
}
--I;
CurDAG->ReplaceAllUsesWith(N, R.getNode());
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7c26dd6e2dc2f..da1f48ede5402 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5415,6 +5415,20 @@ static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
}
}
+ // Match not(insert_subvector(undef, setcc(), c))
+ // --> insert_subvector(undef, not(setcc()), c)
+ if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
+ V.getOperand(1).getOpcode() == ISD::SETCC &&
+ V.getValueType().getScalarType() == MVT::i1) {
+ SDValue Cond = V.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ CC = ISD::getSetCCInverse(CC, Cond.getOperand(0).getValueType());
+ SDValue NotSub = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
+ Cond.getOperand(0), Cond.getOperand(1), CC);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(V), V.getValueType(),
+ V.getOperand(0), NotSub, V.getOperand(2));
+ }
+
// Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
SmallVector<SDValue, 2> CatOps;
if (collectConcatOps(V.getNode(), CatOps, DAG)) {
@@ -48049,19 +48063,6 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
}
}
- // Check if the first operand is all zeros and Cond type is vXi1.
- // If this an avx512 target we can improve the use of zero masking by
- // swapping the operands and inverting the condition.
- if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
- Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
- ISD::isBuildVectorAllZeros(LHS.getNode()) &&
- !ISD::isBuildVectorAllZeros(RHS.getNode())) {
- // Invert the cond to not(cond) : xor(op,allones)=not(op)
- SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
- // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
- return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
- }
-
// Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
// get split by legalization.
if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
@@ -48125,11 +48126,14 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return V;
// select(~Cond, X, Y) -> select(Cond, Y, X)
- if (CondVT.getScalarType() != MVT::i1) {
+ if (CondVT.getScalarType() != MVT::i1 ||
+ (ISD::isBuildVectorAllZeros(LHS.getNode()) &&
+ !ISD::isBuildVectorAllZeros(RHS.getNode())))
if (SDValue CondNot = IsNOT(Cond, DAG))
return DAG.getNode(N->getOpcode(), DL, VT,
DAG.getBitcast(CondVT, CondNot), RHS, LHS);
+ if (CondVT.getScalarType() != MVT::i1) {
// select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
if (Cond.getOpcode() == X86ISD::PCMPEQ &&
Cond.getOperand(0).getOpcode() == ISD::AND &&
diff --git a/llvm/test/CodeGen/X86/extract-vselect-setcc.ll b/llvm/test/CodeGen/X86/extract-vselect-setcc.ll
index 96c8e773d5edd..1997323ed61a6 100644
--- a/llvm/test/CodeGen/X86/extract-vselect-setcc.ll
+++ b/llvm/test/CodeGen/X86/extract-vselect-setcc.ll
@@ -5,7 +5,8 @@ define void @PR117684(i1 %cond, <8 x float> %vec, ptr %ptr1, ptr %ptr2) #0 {
; CHECK-LABEL: PR117684:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpnltss %xmm1, %xmm0, %k1
+; CHECK-NEXT: vcmpltss %xmm1, %xmm0, %k0
+; CHECK-NEXT: knotb %k0, %k1
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
; CHECK-NEXT: vinsertf32x4 $0, %xmm0, %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index e10b360b35b56..cc3aee4feba2d 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -981,9 +981,9 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; AVX512-LABEL: test14:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT: vpmovdb %zmm1, %xmm3
; AVX512-NEXT: vpcmpnltud %zmm2, %zmm1, %k1
-; AVX512-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512-NEXT: vpsubb %xmm0, %xmm1, %xmm0 {%k1} {z}
+; AVX512-NEXT: vpsubb %xmm0, %xmm3, %xmm0 {%k1} {z}
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
vector.ph:
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index 7f4111e65cc17..067d4c569d276 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -170,25 +170,26 @@ define <2 x i64> @var_shuffle_zero_v2i64(<2 x i64> %v, <2 x i64> %indices) nounw
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm2 = [3,3]
; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512-NEXT: vpaddq %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vpblendmq %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512-NEXT: vpaddq %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vpermilpd %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpleuq %zmm2, %zmm1, %k1
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
-; AVX512VL-NEXT: vpaddq %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [3,3]
+; AVX512VL-NEXT: vpcmpnleuq %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpblendmq %xmm3, %xmm1, %xmm3 {%k1}
+; AVX512VL-NEXT: vpaddq %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpermilpd %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpcmpleuq %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3>
%or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices
@@ -355,24 +356,26 @@ define <4 x i32> @var_shuffle_zero_v4i32(<4 x i32> %v, <4 x i32> %indices) nounw
; AVX512-LABEL: var_shuffle_zero_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
-; AVX512-NEXT: vpermilps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
+; AVX512-NEXT: vpcmpnleud %zmm2, %zmm1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vpblendmd %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512-NEXT: vpermilps %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpleud %zmm2, %zmm1, %k1
+; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v4i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1}
-; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
+; AVX512VL-NEXT: vpcmpnleud %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpblendmd %xmm3, %xmm1, %xmm3 {%k1}
+; AVX512VL-NEXT: vpermilps %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpcmpleud %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3>
%or = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices
@@ -600,12 +603,12 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw
;
; AVX512VL-LABEL: var_shuffle_zero_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqu16 %xmm2, %xmm1 {%k1}
-; AVX512VL-NEXT: vpermw %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpcmpnleuw %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpcmpleuw %xmm2, %xmm1, %k2
+; AVX512VL-NEXT: vmovdqu16 %xmm3, %xmm1 {%k1}
+; AVX512VL-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k2} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <8 x i16> %indices, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
%or = select <8 x i1> %cmp, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %indices
@@ -923,12 +926,12 @@ define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounw
;
; AVX512VL-LABEL: var_shuffle_zero_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqu8 %xmm2, %xmm1 {%k1}
-; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpcmpnleub %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpcmpleub %xmm2, %xmm1, %k2
+; AVX512VL-NEXT: vmovdqu8 %xmm3, %xmm1 {%k1}
+; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 {%k2} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <16 x i8> %indices, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
%or = select <16 x i1> %cmp, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %indices
@@ -1139,25 +1142,25 @@ define <2 x double> @var_shuffle_zero_v2f64(<2 x double> %v, <2 x i64> %indices)
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm2 = [3,3]
; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512-NEXT: vpaddq %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovapd %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vpblendmq %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512-NEXT: vpaddq %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vpermilpd %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpleuq %zmm2, %zmm1, %k1
+; AVX512-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v2f64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
-; AVX512VL-NEXT: vpaddq %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovapd %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [3,3]
+; AVX512VL-NEXT: vpcmpnleuq %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpblendmq %xmm3, %xmm1, %xmm3 {%k1}
+; AVX512VL-NEXT: vpaddq %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpcmpleuq %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vpermilpd %xmm3, %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3>
%or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices
@@ -1324,24 +1327,25 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n
; AVX512-LABEL: var_shuffle_zero_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
-; AVX512-NEXT: vpermilps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
+; AVX512-NEXT: vpcmpnleud %zmm2, %zmm1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vpblendmd %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512-NEXT: vpermilps %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpleud %zmm2, %zmm1, %k1
+; AVX512-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v4f32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1}
-; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovaps %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
+; AVX512VL-NEXT: vpcmpnleud %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpcmpleud %xmm2, %xmm1, %k2
+; AVX512VL-NEXT: vmovdqa32 %xmm3, %xmm1 {%k1}
+; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0 {%k2} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3>
%or = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices
diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll
index 283c6a303a581..6a450e2665171 100644
--- a/llvm/test/CodeGen/X86/var-permute-256.ll
+++ b/llvm/test/CodeGen/X86/var-permute-256.ll
@@ -158,12 +158,12 @@ define <4 x i64> @var_shuffle_zero_v4i64(<4 x i64> %v, <4 x i64> %indices) nounw
;
; AVX512VL-LABEL: var_shuffle_zero_v4i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm1 {%k1}
-; AVX512VL-NEXT: vpermq %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,3,3,3]
+; AVX512VL-NEXT: vpcmpnleuq %ymm2, %ymm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT: vpcmpleuq %ymm2, %ymm1, %k2
+; AVX512VL-NEXT: vmovdqa64 %ymm3, %ymm1 {%k1}
+; AVX512VL-NEXT: vpermq %ymm0, %ymm1, %ymm0 {%k2} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <4 x i64> %indices, <i64 3, i64 3, i64 3, i64 3>
%or = select <4 x i1> %cmp, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> %indices
@@ -285,23 +285,24 @@ define <8 x i32> @var_shuffle_zero_v8i32(<8 x i32> %v, <8 x i32> %indices) nounw
; AVX512-LABEL: var_shuffle_zero_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
-; AVX512-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
-; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7]
+; AVX512-NEXT: vpcmpnleud %zmm2, %zmm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512-NEXT: vpblendmd %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512-NEXT: vpermd %ymm0, %ymm3, %ymm0
+; AVX512-NEXT: vpcmpleud %zmm2, %zmm1, %k1
+; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v8i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vmovdqa32 %ymm2, %ymm1 {%k1}
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpcmpnleud %ymm2, %ymm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT: vpcmpleud %ymm2, %ymm1, %k2
+; AVX512VL-NEXT: vmovdqa32 %ymm3, %ymm1 {%k1}
+; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 {%k2} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <8 x i32> %indices, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
%or = select <8 x i1> %cmp, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> %indices
@@ -554,18 +555,18 @@ define <16 x i16> @var_shuffle_zero_v16i16(<16 x i16> %v, <16 x i16> %indices) n
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpcmpnleuw %zmm2, %zmm1, %k1
-; AVX512BW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1}
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
-; AVX512BW-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
-; AVX512BW-NEXT: vpshufb %ymm1, %ymm2, %ymm2
+; AVX512BW-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512BW-NEXT: vpblendmw %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
+; AVX512BW-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,2,3]
+; AVX512BW-NEXT: vpshufb %ymm3, %ymm4, %ymm4
; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512BW-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; AVX512BW-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm0
+; AVX512BW-NEXT: vpcmpleuw %zmm2, %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
@@ -587,22 +588,22 @@ define <16 x i16> @var_shuffle_zero_v16i16(<16 x i16> %v, <16 x i16> %indices) n
;
; AVX512VLBW-LABEL: var_shuffle_zero_v16i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
-; AVX512VLBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VLBW-NEXT: vmovdqu16 %ymm2, %ymm1 {%k1}
-; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpcmpnleuw %ymm2, %ymm1, %k1
+; AVX512VLBW-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512VLBW-NEXT: vpcmpleuw %ymm2, %ymm1, %k2
+; AVX512VLBW-NEXT: vmovdqu16 %ymm3, %ymm1 {%k1}
+; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k2} {z}
; AVX512VLBW-NEXT: retq
;
; VLVBMI-LABEL: var_shuffle_zero_v16i16:
; VLVBMI: # %bb.0:
-; VLVBMI-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
-; VLVBMI-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; VLVBMI-NEXT: vmovdqu16 %ymm2, %ymm1 {%k1}
-; VLVBMI-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; VLVBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VLVBMI-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
+; VLVBMI-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; VLVBMI-NEXT: vpcmpnleuw %ymm2, %ymm1, %k1
+; VLVBMI-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; VLVBMI-NEXT: vpcmpleuw %ymm2, %ymm1, %k2
+; VLVBMI-NEXT: vmovdqu16 %ymm3, %ymm1 {%k1}
+; VLVBMI-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k2} {z}
; VLVBMI-NEXT: retq
%cmp = icmp ugt <16 x i16> %indices, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
%or = select <16 x i1> %cmp, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <16 x i16> %indices
@@ -902,16 +903,16 @@ define <32 x i8> @var_shuffle_zero_v32i8(<32 x i8> %v, <32 x i8> %indices) nounw
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX512BW-NEXT: vpcmpnleub %zmm2, %zmm1, %k1
-; AVX512BW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
-; AVX512BW-NEXT: vpshufb %ymm1, %ymm2, %ymm2
+; AVX512BW-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512BW-NEXT: vpblendmb %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,2,3]
+; AVX512BW-NEXT: vpshufb %ymm3, %ymm4, %ymm4
; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512BW-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; AVX512BW-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm0
+; AVX512BW-NEXT: vpcmpleub %zmm2, %zmm1, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
@@ -931,27 +932,27 @@ define <32 x i8> @var_shuffle_zero_v32i8(<32 x i8> %v, <32 x i8> %indices) nounw
;
; AVX512VLBW-LABEL: var_shuffle_zero_v32i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
-; AVX512VLBW-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VLBW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
-; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
-; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm2, %ymm2
-; AVX512VLBW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k2
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512VLBW-NEXT: vpcmpnleub %ymm2, %ymm1, %k1
+; AVX512VLBW-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512VLBW-NEXT: vpblendmb %ymm3, %ymm1, %ymm3 {%k1}
+; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4
+; AVX512VLBW-NEXT: vpshufb %ymm3, %ymm4, %ymm4
+; AVX512VLBW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %k1
; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm0, %ymm2 {%k2}
-; AVX512VLBW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1}
-; AVX512VLBW-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512VLBW-NEXT: vpshufb %ymm3, %ymm0, %ymm4 {%k1}
+; AVX512VLBW-NEXT: vpcmpleub %ymm2, %ymm1, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %ymm4, %ymm0 {%k1} {z}
; AVX512VLBW-NEXT: retq
;
; VLVBMI-LABEL: var_shuffle_zero_v32i8:
; VLVBMI: # %bb.0:
-; VLVBMI-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
-; VLVBMI-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; VLVBMI-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
-; VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
-; VLVBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VLVBMI-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
+; VLVBMI-NEXT: vpbroadcastb {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; VLVBMI-NEXT: vpcmpnleub %ymm2, %ymm1, %k1
+; VLVBMI-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; VLVBMI-NEXT: vpcmpleub %ymm2, %ymm1, %k2
+; VLVBMI-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1}
+; VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 {%k2} {z}
; VLVBMI-NEXT: retq
%cmp = icmp ugt <32 x i8> %indices, <i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31>
%or = select <32 x i1> %cmp, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <32 x i8> %indices
@@ -1202,12 +1203,12 @@ define <4 x double> @var_shuffle_zero_v4f64(<4 x double> %v, <4 x i64> %indices)
;
; AVX512VL-LABEL: var_shuffle_zero_v4f64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vmovdqa64 %ymm2, %ymm1 {%k1}
-; AVX512VL-NEXT: vpermq %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovapd %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,3,3,3]
+; AVX512VL-NEXT: vpcmpnleuq %ymm2, %ymm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT: vpcmpleuq %ymm2, %ymm1, %k2
+; AVX512VL-NEXT: vmovdqa64 %ymm3, %ymm1 {%k1}
+; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0 {%k2} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <4 x i64> %indices, <i64 3, i64 3, i64 3, i64 3>
%or = select <4 x i1> %cmp, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> %indices
@@ -1329,23 +1330,24 @@ define <8 x float> @var_shuffle_zero_v8f32(<8 x float> %v, <8 x i32> %indices) n
; AVX512-LABEL: var_shuffle_zero_v8f32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
-; AVX512-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
-; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7]
+; AVX512-NEXT: vpcmpnleud %zmm2, %zmm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512-NEXT: vpblendmd %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512-NEXT: vpermd %ymm0, %ymm3, %ymm0
+; AVX512-NEXT: vpcmpleud %zmm2, %zmm1, %k1
+; AVX512-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v8f32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vmovdqa32 %ymm2, %ymm1 {%k1}
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovaps %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpcmpnleud %ymm2, %ymm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT: vpcmpleud %ymm2, %ymm1, %k2
+; AVX512VL-NEXT: vmovdqa32 %ymm3, %ymm1 {%k1}
+; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 {%k2} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <8 x i32> %indices, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
%or = select <8 x i1> %cmp, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> %indices
>From b8aa447235fd88130474211142287ba07aaf938c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 26 Jun 2025 08:25:32 +0100
Subject: [PATCH 2/4] add comment to fold
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index da1f48ede5402..c3ef78c404267 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48126,6 +48126,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return V;
// select(~Cond, X, Y) -> select(Cond, Y, X)
+ // Limit vXi1 cases to AVX512 canonicalization of zero mask to the RHS.
if (CondVT.getScalarType() != MVT::i1 ||
(ISD::isBuildVectorAllZeros(LHS.getNode()) &&
!ISD::isBuildVectorAllZeros(RHS.getNode())))
>From 74b68130faa1c069eaab3e8bd0bb73655900276b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 26 Jun 2025 08:30:02 +0100
Subject: [PATCH 3/4] cleanup comment
---
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 768f033356959..d747cfbace989 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1151,9 +1151,9 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
if (!ISD::isBuildVectorAllZeros(LHS.getNode()) ||
ISD::isBuildVectorAllZeros(RHS.getNode()))
break;
- // If this an avx512 target we can improve the use of zero masking by
+ // If this is an avx512 target we can improve the use of zero masking by
// swapping the operands and inverting the condition.
- // vselect cond, op1, op2 = vselect not(cond), op2, op1
+ // vselect cond, zero, op = vselect not(cond), op, zero
if (Cond.getOpcode() == ISD::SETCC &&
!ISD::isBuildVectorAllZeros(Cond.getOperand(0).getNode())) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
>From b015ba0987b6573806312835f8385d38e754a10b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 26 Jun 2025 11:07:19 +0100
Subject: [PATCH 4/4] Move insert_subvector(undef, setcc(), c) handling into
DAGToDAG
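This patch also generalizes the compare-immediate flip in the helper lambda from the FCMP EQ special case to any CMPM/FSETCCM immediate via CC ^ 4. As an illustrative trigger (a hypothetical example, not taken from the patch): an fcmp oeq reaches ISel as CMPM with immediate 0 (EQ_OQ), and XORing with 4 gives the inverse predicate 4 (NEQ_UQ), so the zeroable operand can move to the RHS without emitting a knot:

  ; llc -mtriple=x86_64-- -mattr=+avx512f
  define <8 x double> @fcmp_zero_lhs(<8 x double> %x, <8 x double> %y) {
    %cmp = fcmp oeq <8 x double> %x, %y
    ; vselect cmp, zero, y --> vselect (fcmp une x, y), y, zero
    %sel = select <8 x i1> %cmp, <8 x double> zeroinitializer, <8 x double> %y
    ret <8 x double> %sel
  }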
---
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 61 +++++++++++--------
llvm/lib/Target/X86/X86ISelLowering.cpp | 58 +++++++-----------
llvm/test/CodeGen/X86/avx512-vbroadcast.ll | 6 +-
.../test/CodeGen/X86/extract-vselect-setcc.ll | 3 +-
llvm/test/CodeGen/X86/var-permute-256.ll | 16 ++---
5 files changed, 69 insertions(+), 75 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index d747cfbace989..23058257943f9 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1143,10 +1143,12 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
EVT CondVT = Cond.getValueType();
- EVT EleVT = CondVT.getVectorElementType();
+ EVT CondSVT = CondVT.getVectorElementType();
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
SDValue R;
- if (EleVT == MVT::i1) {
+ if (CondSVT == MVT::i1) {
assert(Subtarget->hasAVX512() && "Expected AVX512 support!");
if (!ISD::isBuildVectorAllZeros(LHS.getNode()) ||
ISD::isBuildVectorAllZeros(RHS.getNode()))
@@ -1154,35 +1156,44 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
// If this is an avx512 target we can improve the use of zero masking by
// swapping the operands and inverting the condition.
// vselect cond, zero, op = vselect not(cond), op, zero
- if (Cond.getOpcode() == ISD::SETCC &&
- !ISD::isBuildVectorAllZeros(Cond.getOperand(0).getNode())) {
- ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
- CC = ISD::getSetCCInverse(CC, Cond.getOperand(0).getValueType());
- R = CurDAG->getSetCC(SDLoc(N), CondVT, Cond.getOperand(0),
- Cond.getOperand(1), CC);
- } else if (Cond.getOpcode() == X86ISD::CMPM &&
- Cond.getConstantOperandVal(2) == 0) {
- // FLIP FCMP EQ -> (U)NE
- R = CurDAG->getNode(Cond.getOpcode(), SDLoc(N), CondVT,
- Cond.getOperand(0), Cond.getOperand(1),
- CurDAG->getTargetConstant(4, SDLoc(N), MVT::i8));
- } else {
- R = CurDAG->getNOT(SDLoc(N), Cond, CondVT);
- }
- R = CurDAG->getSelect(SDLoc(N), N->getValueType(0), R, RHS, LHS);
+ auto InverseCondition = [this](SDValue Cond, const SDLoc &DL) {
+ EVT CondVT = Cond.getValueType();
+ if (Cond.getOpcode() == ISD::SETCC &&
+ !ISD::isBuildVectorAllZeros(Cond.getOperand(0).getNode())) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ CC = ISD::getSetCCInverse(CC, Cond.getOperand(0).getValueType());
+ return CurDAG->getSetCC(DL, CondVT, Cond.getOperand(0),
+ Cond.getOperand(1), CC);
+ }
+ if (Cond.getOpcode() == X86ISD::CMPM ||
+ Cond.getOpcode() == X86ISD::FSETCCM) {
+ unsigned CC = Cond.getConstantOperandVal(2);
+ return CurDAG->getNode(
+ Cond.getOpcode(), DL, CondVT, Cond.getOperand(0),
+ Cond.getOperand(1),
+ CurDAG->getTargetConstant(CC ^ 4, DL, MVT::i8));
+ }
+ return CurDAG->getNOT(DL, Cond, CondVT);
+ };
+ if (Cond.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Cond.getOperand(0).isUndef())
+ R = CurDAG->getNode(
+ ISD::INSERT_SUBVECTOR, DL, CondVT, Cond.getOperand(0),
+ InverseCondition(Cond.getOperand(1), DL), Cond.getOperand(2));
+ else
+ R = InverseCondition(Cond, DL);
+ R = CurDAG->getSelect(DL, VT, R, RHS, LHS);
} else {
// Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
- assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
+ assert(VT.getVectorElementType() != MVT::i16 &&
"We can't replace VSELECT with BLENDV in vXi16!");
if (Subtarget->hasVLX() &&
- CurDAG->ComputeNumSignBits(Cond) == EleVT.getSizeInBits()) {
- R = CurDAG->getNode(
- X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0), Cond, LHS, RHS,
- CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
+ CurDAG->ComputeNumSignBits(Cond) == CondSVT.getSizeInBits()) {
+ R = CurDAG->getNode(X86ISD::VPTERNLOG, DL, VT, Cond, LHS, RHS,
+ CurDAG->getTargetConstant(0xCA, DL, MVT::i8));
} else {
- R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
- Cond, LHS, RHS);
+ R = CurDAG->getNode(X86ISD::BLENDV, DL, VT, Cond, LHS, RHS);
}
}
--I;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c3ef78c404267..7aaa6db074f0f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5415,20 +5415,6 @@ static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
}
}
- // Match not(insert_subvector(undef, setcc(), c))
- // --> insert_subvector(undef, not(setcc()), c)
- if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
- V.getOperand(1).getOpcode() == ISD::SETCC &&
- V.getValueType().getScalarType() == MVT::i1) {
- SDValue Cond = V.getOperand(1);
- ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
- CC = ISD::getSetCCInverse(CC, Cond.getOperand(0).getValueType());
- SDValue NotSub = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
- Cond.getOperand(0), Cond.getOperand(1), CC);
- return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(V), V.getValueType(),
- V.getOperand(0), NotSub, V.getOperand(2));
- }
-
// Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
SmallVector<SDValue, 2> CatOps;
if (collectConcatOps(V.getNode(), CatOps, DAG)) {
@@ -48134,29 +48120,27 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(N->getOpcode(), DL, VT,
DAG.getBitcast(CondVT, CondNot), RHS, LHS);
- if (CondVT.getScalarType() != MVT::i1) {
- // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
- if (Cond.getOpcode() == X86ISD::PCMPEQ &&
- Cond.getOperand(0).getOpcode() == ISD::AND &&
- ISD::isBuildVectorAllZeros(Cond.getOperand(1).getNode()) &&
- isConstantPowerOf2(Cond.getOperand(0).getOperand(1),
- Cond.getScalarValueSizeInBits(),
- /*AllowUndefs=*/true) &&
- Cond.hasOneUse()) {
- Cond = DAG.getNode(X86ISD::PCMPEQ, DL, CondVT, Cond.getOperand(0),
- Cond.getOperand(0).getOperand(1));
- return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
- }
-
- // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
- // signbit.
- if (Cond.getOpcode() == X86ISD::PCMPGT &&
- ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
- Cond.hasOneUse()) {
- Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
- DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
- return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
- }
+ // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
+ if (Cond.getOpcode() == X86ISD::PCMPEQ &&
+ Cond.getOperand(0).getOpcode() == ISD::AND &&
+ ISD::isBuildVectorAllZeros(Cond.getOperand(1).getNode()) &&
+ isConstantPowerOf2(Cond.getOperand(0).getOperand(1),
+ Cond.getScalarValueSizeInBits(),
+ /*AllowUndefs=*/true) &&
+ Cond.hasOneUse()) {
+ Cond = DAG.getNode(X86ISD::PCMPEQ, DL, CondVT, Cond.getOperand(0),
+ Cond.getOperand(0).getOperand(1));
+ return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
+ }
+
+ // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
+ // signbit.
+ if (Cond.getOpcode() == X86ISD::PCMPGT &&
+ ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
+ Cond.hasOneUse()) {
+ Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
+ DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
+ return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
}
// Try to optimize vXi1 selects if both operands are either all constants or
diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll
index 275df8c3675c0..4d4f86b0c3f82 100644
--- a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll
@@ -210,9 +210,9 @@ define <16 x i32> @test_vbroadcast(<16 x float> %a0) {
; ALL: # %bb.0: # %entry
; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT: vcmpunordps %zmm1, %zmm0, %k1
-; ALL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; ALL-NEXT: knotw %k1, %k1
-; ALL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; ALL-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
+; ALL-NEXT: vcmpordps %zmm1, %zmm0, %k1
+; ALL-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} {z}
; ALL-NEXT: retq
entry:
%0 = sext <16 x i1> zeroinitializer to <16 x i32>
diff --git a/llvm/test/CodeGen/X86/extract-vselect-setcc.ll b/llvm/test/CodeGen/X86/extract-vselect-setcc.ll
index 1997323ed61a6..96c8e773d5edd 100644
--- a/llvm/test/CodeGen/X86/extract-vselect-setcc.ll
+++ b/llvm/test/CodeGen/X86/extract-vselect-setcc.ll
@@ -5,8 +5,7 @@ define void @PR117684(i1 %cond, <8 x float> %vec, ptr %ptr1, ptr %ptr2) #0 {
; CHECK-LABEL: PR117684:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpltss %xmm1, %xmm0, %k0
-; CHECK-NEXT: knotb %k0, %k1
+; CHECK-NEXT: vcmpnltss %xmm1, %xmm0, %k1
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
; CHECK-NEXT: vinsertf32x4 $0, %xmm0, %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll
index 6a450e2665171..abc859d3f156d 100644
--- a/llvm/test/CodeGen/X86/var-permute-256.ll
+++ b/llvm/test/CodeGen/X86/var-permute-256.ll
@@ -148,11 +148,11 @@ define <4 x i64> @var_shuffle_zero_v4i64(<4 x i64> %v, <4 x i64> %indices) nounw
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,3,3,3]
+; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512-NEXT: vpblendmq %zmm3, %zmm1, %zmm3 {%k1}
; AVX512-NEXT: vpcmpleuq %zmm2, %zmm1, %k1
-; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k2
-; AVX512-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2}
-; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpermq %zmm0, %zmm3, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;
@@ -1193,11 +1193,11 @@ define <4 x double> @var_shuffle_zero_v4f64(<4 x double> %v, <4 x i64> %indices)
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,3,3,3]
+; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512-NEXT: vpblendmq %zmm3, %zmm1, %zmm3 {%k1}
; AVX512-NEXT: vpcmpleuq %zmm2, %zmm1, %k1
-; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k2
-; AVX512-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2}
-; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpermpd %zmm0, %zmm3, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
;