[llvm] [X86] LowerMINMAX - use valuetracking to attempt to find a smaller type that can efficiently lower min/max ops (PR #174294)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sat Jan 3 13:56:07 PST 2026
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/174294
We currently use the generic expansions to custom lower integer min/max instructions, but if the operands have sufficient leading bits, SSE/AVX is always better off handling them directly at a smaller element type.
vXi64 cmp/min/max support is particularly weak, and the narrower the type the better legality we have - this approach seems to work well for x86, but I'm not sure if it's valid enough to try generically in this manner.
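For reference, a rough sketch of the kind of pattern this targets (adapted from the umax_v2i64_as_umax_v2i16 test updated below; the names are just illustrative): the shifts guarantee enough leading zero bits that the v2i64 umax can be bitcast and lowered as a v8i16 umax (pmaxuw/vpmaxuw) instead of going through the pcmpgtq/blendvpd expansion:

  declare <2 x i64> @llvm.umax.v2i64(<2 x i64>, <2 x i64>)

  define <2 x i64> @umax_v2i64_as_umax_v2i16(<2 x i64> %a0, <2 x i64> %a1) {
    %s0 = lshr <2 x i64> %a0, <i64 49, i64 49>  ; at most 15 significant bits per element
    %s1 = lshr <2 x i64> %a1, <i64 63, i64 63>  ; at most 1 significant bit per element
    %r = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %s0, <2 x i64> %s1)
    ret <2 x i64> %r
  }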
However, I added the signed/unsigned flip fold to expandIntMINMAX to further improve SSE2 codegen, similar to what we already attempt in DAGCombiner (which, with a bit more work, we might now be able to remove).
All that's missing is better ComputeNumSignBits handling for the vXi64 ashr expansion, which still misses a lot of cases when it is split across vXi32 types and shuffles.
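To illustrate the expandIntMINMAX change (this isn't one of the tests in the patch, just a sketch): on SSE2 only the signed v8i16 max (pmaxsw) is legal - pmaxuw needs SSE4.1 - so a v8i16 umax whose operands have known-zero sign bits can be flipped to smax and selected as pmaxsw:

  declare <8 x i16> @llvm.umax.v8i16(<8 x i16>, <8 x i16>)

  define <8 x i16> @umax_as_smax(<8 x i16> %a0, <8 x i16> %a1) {
    ; clearing the sign bit of every lane makes unsigned and signed max agree
    %x = lshr <8 x i16> %a0, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
    %y = lshr <8 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
    %r = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %x, <8 x i16> %y)
    ret <8 x i16> %r
  }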
Fixes #174169
From ef192ca13a1b7e6d9fe4f0a6a9c9c161ce3d7193 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Sat, 3 Jan 2026 21:54:00 +0000
Subject: [PATCH] [X86] LowerMINMAX - use valuetracking to attempt to find a
smaller type that can efficiently lower min/max ops
We currently use the generic expansions to custom lower integer min/max instructions, but if the operands have sufficient leading bits, SSE/AVX is always better off handling them directly at a smaller element type.
vXi64 cmp/min/max support is particularly weak, and the narrower the type the better legality we have - this approach seems to work well for x86, but I'm not sure if it's valid enough to try generically.
However, I added the signed/unsigned flip fold to expandIntMINMAX to further improve SSE2 codegen, similar to what we already attempt in DAGCombiner (which, with a bit more work, we might now be able to remove).
All that's missing is better ComputeNumSignBits handling for the vXi64 ashr expansion, which still misses a lot of cases when it is split across vXi32 types and shuffles.
Fixes #174169
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 22 +++++
llvm/lib/Target/X86/X86ISelLowering.cpp | 32 +++++++
llvm/test/CodeGen/X86/vector-pcmp.ll | 26 ++----
llvm/test/CodeGen/X86/vector-smax-range.ll | 12 +--
llvm/test/CodeGen/X86/vector-smin-range.ll | 12 +--
llvm/test/CodeGen/X86/vector-umax-range.ll | 91 +++++-------------
llvm/test/CodeGen/X86/vector-umin-range.ll | 93 ++++++-------------
7 files changed, 118 insertions(+), 170 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e3300000fa6f4..69c3455573918 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10799,6 +10799,28 @@ SDValue TargetLowering::expandIntMINMAX(SDNode *Node, SelectionDAG &DAG) const {
unsigned Opcode = Node->getOpcode();
SDLoc DL(Node);
+ // If both sign bits are zero, flip UMIN/UMAX <-> SMIN/SMAX if legal.
+ unsigned AltOpcode;
+ switch (Opcode) {
+ case ISD::SMIN:
+ AltOpcode = ISD::UMIN;
+ break;
+ case ISD::SMAX:
+ AltOpcode = ISD::UMAX;
+ break;
+ case ISD::UMIN:
+ AltOpcode = ISD::SMIN;
+ break;
+ case ISD::UMAX:
+ AltOpcode = ISD::SMAX;
+ break;
+ default:
+ llvm_unreachable("Unknown MINMAX opcode");
+ }
+ if (isOperationLegal(AltOpcode, VT) && DAG.SignBitIsZero(Op0) &&
+ DAG.SignBitIsZero(Op1))
+ return DAG.getNode(AltOpcode, DL, VT, Op0, Op1);
+
// umax(x,1) --> sub(x,cmpeq(x,0)) iff cmp result is allbits
if (Opcode == ISD::UMAX && llvm::isOneOrOneSplat(Op1, true) && BoolVT == VT &&
getBooleanContents(VT) == ZeroOrNegativeOneBooleanContent) {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fed4df707400f..602b026def45b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29546,6 +29546,10 @@ static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
+ unsigned SizeInBits = VT.getSizeInBits();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ bool IsMax = Op.getOpcode() == ISD::SMAX || Op.getOpcode() == ISD::UMAX;
+ bool IsSigned = Op.getOpcode() == ISD::SMAX || Op.getOpcode() == ISD::SMIN;
SDLoc DL(Op);
// For AVX1 cases, split to use legal ops.
@@ -29555,6 +29559,34 @@ static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
if (VT == MVT::v32i16 || VT == MVT::v64i8)
return splitVectorIntBinary(Op, DAG, DL);
+ // See if the vector elements have sufficient leading bits to allow a
+ // smaller minmax opcode to be used.
+ if (VT.isVector() && EltSizeInBits > 8) {
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ unsigned CLS = DAG.ComputeNumSignBits(N0);
+ if (CLS != 1)
+ CLS = std::min(CLS, DAG.ComputeNumSignBits(N1));
+ unsigned CLZ = DAG.computeKnownBits(N0).countMinLeadingZeros();
+ if (CLZ != 0)
+ CLZ = std::min(CLZ, DAG.computeKnownBits(N1).countMinLeadingZeros());
+ for (unsigned Bits = 8; Bits < EltSizeInBits; Bits += Bits) {
+ std::optional<unsigned> Opcode;
+ if (CLZ >= (EltSizeInBits - Bits)) {
+ Opcode = IsMax ? ISD::UMAX : ISD::UMIN;
+ } else if ((IsSigned ? CLS : CLZ) > (EltSizeInBits - Bits)) {
+ Opcode = IsMax ? ISD::SMAX : ISD::SMIN;
+ }
+ if (Opcode.has_value()) {
+ MVT ReducedSVT = MVT::getIntegerVT(Bits);
+ MVT ReducedVT = MVT::getVectorVT(ReducedSVT, SizeInBits / Bits);
+ return DAG.getBitcast(VT, DAG.getNode(*Opcode, DL, ReducedVT,
+ DAG.getBitcast(ReducedVT, N0),
+ DAG.getBitcast(ReducedVT, N1)));
+ }
+ }
+ }
+
// Default to expand.
return SDValue();
}
diff --git a/llvm/test/CodeGen/X86/vector-pcmp.ll b/llvm/test/CodeGen/X86/vector-pcmp.ll
index 30eb2279bda85..7e42c73f86315 100644
--- a/llvm/test/CodeGen/X86/vector-pcmp.ll
+++ b/llvm/test/CodeGen/X86/vector-pcmp.ll
@@ -1961,14 +1961,11 @@ define <4 x i64> @PR52504(<4 x i16> %t3) {
; SSE42-LABEL: PR52504:
; SSE42: # %bb.0:
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE42-NEXT: pmovsxwq %xmm1, %xmm2
-; SSE42-NEXT: pmovsxwq %xmm0, %xmm3
-; SSE42-NEXT: pxor %xmm1, %xmm1
-; SSE42-NEXT: pxor %xmm0, %xmm0
-; SSE42-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE42-NEXT: por %xmm3, %xmm0
-; SSE42-NEXT: pcmpgtq %xmm2, %xmm1
-; SSE42-NEXT: por %xmm2, %xmm1
+; SSE42-NEXT: pmovsxwq %xmm1, %xmm1
+; SSE42-NEXT: pmovsxwq %xmm0, %xmm0
+; SSE42-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE42-NEXT: pmaxsw %xmm2, %xmm0
+; SSE42-NEXT: pmaxsw %xmm2, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: PR52504:
@@ -1976,20 +1973,17 @@ define <4 x i64> @PR52504(<4 x i16> %t3) {
; AVX1-NEXT: vpmovsxwq %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3
-; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxsw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR52504:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: PR52504:
diff --git a/llvm/test/CodeGen/X86/vector-smax-range.ll b/llvm/test/CodeGen/X86/vector-smax-range.ll
index 56b41dad5049a..aa15fa25068af 100644
--- a/llvm/test/CodeGen/X86/vector-smax-range.ll
+++ b/llvm/test/CodeGen/X86/vector-smax-range.ll
@@ -11,11 +11,7 @@ define <4 x i32> @smax_v4i32_as_umax_v4i8(<4 x i32> %a0, <4 x i32> %a1) nounwind
; SSE2: # %bb.0:
; SSE2-NEXT: psrld $30, %xmm0
; SSE2-NEXT: psrld $29, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pmaxub %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: smax_v4i32_as_umax_v4i8:
@@ -42,11 +38,7 @@ define <4 x i32> @smax_v4i32_as_smax_v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwin
; SSE2: # %bb.0:
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pmaxsw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: smax_v4i32_as_smax_v4i16:
diff --git a/llvm/test/CodeGen/X86/vector-smin-range.ll b/llvm/test/CodeGen/X86/vector-smin-range.ll
index a3c58ea5b4517..ce68a030fc185 100644
--- a/llvm/test/CodeGen/X86/vector-smin-range.ll
+++ b/llvm/test/CodeGen/X86/vector-smin-range.ll
@@ -11,11 +11,7 @@ define <4 x i32> @smin_v4i32_as_umin_v4i8(<4 x i32> %a0, <4 x i32> %a1) nounwind
; SSE2: # %bb.0:
; SSE2-NEXT: psrld $30, %xmm0
; SSE2-NEXT: psrld $29, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pminub %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: smin_v4i32_as_umin_v4i8:
@@ -42,11 +38,7 @@ define <4 x i32> @smin_v4i32_as_smin_v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwin
; SSE2: # %bb.0:
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pminsw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: smin_v4i32_as_smin_v4i16:
diff --git a/llvm/test/CodeGen/X86/vector-umax-range.ll b/llvm/test/CodeGen/X86/vector-umax-range.ll
index c1a0ad2c29932..a0ff1de55fc3d 100644
--- a/llvm/test/CodeGen/X86/vector-umax-range.ll
+++ b/llvm/test/CodeGen/X86/vector-umax-range.ll
@@ -38,11 +38,7 @@ define <4 x i32> @umax_v4i32_as_umax_v4i8(<4 x i32> %a0, <4 x i32> %a1) nounwind
; SSE2: # %bb.0:
; SSE2-NEXT: psrld $30, %xmm0
; SSE2-NEXT: psrld $29, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pmaxub %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: umax_v4i32_as_umax_v4i8:
@@ -69,11 +65,8 @@ define <4 x i32> @umax_v4i32_as_umax_v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwin
; SSE2: # %bb.0:
; SSE2-NEXT: psrld $16, %xmm0
; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: psubusw %xmm0, %xmm1
+; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: umax_v4i32_as_umax_v4i16:
@@ -100,39 +93,28 @@ define <2 x i64> @umax_v2i64_as_umax_v2i16(<2 x i64> %a0, <2 x i64> %a1) nounwin
; SSE2: # %bb.0:
; SSE2-NEXT: psrlq $49, %xmm0
; SSE2-NEXT: psrlq $63, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pmaxsw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: umax_v2i64_as_umax_v2i16:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa %xmm0, %xmm2
-; SSE42-NEXT: psrlq $49, %xmm2
+; SSE42-NEXT: psrlq $49, %xmm0
; SSE42-NEXT: psrlq $63, %xmm1
-; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE42-NEXT: movapd %xmm1, %xmm0
+; SSE42-NEXT: pmaxuw %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: umax_v2i64_as_umax_v2i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlq $49, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: umax_v2i64_as_umax_v2i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlq $49, %xmm0, %xmm0
; AVX2-NEXT: vpsrlq $63, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: umax_v2i64_as_umax_v2i16:
@@ -162,7 +144,6 @@ define <2 x i64> @umax_v2i64_as_umax_v2i32(<2 x i64> %a0, <2 x i64> %a1) nounwin
; SSE2-NEXT: psrlq $43, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
@@ -170,29 +151,23 @@ define <2 x i64> @umax_v2i64_as_umax_v2i32(<2 x i64> %a0, <2 x i64> %a1) nounwin
;
; SSE42-LABEL: umax_v2i64_as_umax_v2i32:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa %xmm0, %xmm2
-; SSE42-NEXT: psrlq $33, %xmm2
+; SSE42-NEXT: psrlq $33, %xmm0
; SSE42-NEXT: psrlq $43, %xmm1
-; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE42-NEXT: movapd %xmm1, %xmm0
+; SSE42-NEXT: pmaxud %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: umax_v2i64_as_umax_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $43, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: umax_v2i64_as_umax_v2i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlq $33, %xmm0, %xmm0
; AVX2-NEXT: vpsrlq $43, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: umax_v2i64_as_umax_v2i32:
@@ -221,35 +196,20 @@ define <4 x i64> @umax_v4i64_as_umax_v4i16(<4 x i64> %a0) nounwind {
; SSE2-NEXT: psrlq $48, %xmm1
; SSE2-NEXT: psrlq $48, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65530,65530]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: psubusw %xmm0, %xmm3
+; SSE2-NEXT: paddw %xmm3, %xmm0
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: paddw %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE42-LABEL: umax_v4i64_as_umax_v4i16:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa %xmm0, %xmm2
-; SSE42-NEXT: psrlq $48, %xmm2
-; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [65530,65530]
-; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE42-NEXT: movdqa %xmm3, %xmm4
-; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm4
; SSE42-NEXT: psrlq $48, %xmm1
-; SSE42-NEXT: movdqa %xmm1, %xmm0
-; SSE42-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE42-NEXT: movapd %xmm4, %xmm0
-; SSE42-NEXT: movapd %xmm3, %xmm1
+; SSE42-NEXT: psrlq $48, %xmm0
+; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [65530,65530]
+; SSE42-NEXT: pmaxuw %xmm2, %xmm0
+; SSE42-NEXT: pmaxuw %xmm2, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: umax_v4i64_as_umax_v4i16:
@@ -258,10 +218,8 @@ define <4 x i64> @umax_v4i64_as_umax_v4i16(<4 x i64> %a0) nounwind {
; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm4
-; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -269,8 +227,7 @@ define <4 x i64> @umax_v4i64_as_umax_v4i16(<4 x i64> %a0) nounwind {
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlq $48, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [65530,65530,65530,65530]
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: umax_v4i64_as_umax_v4i16:
diff --git a/llvm/test/CodeGen/X86/vector-umin-range.ll b/llvm/test/CodeGen/X86/vector-umin-range.ll
index 6365695f5995a..0cf816cc35449 100644
--- a/llvm/test/CodeGen/X86/vector-umin-range.ll
+++ b/llvm/test/CodeGen/X86/vector-umin-range.ll
@@ -38,11 +38,7 @@ define <4 x i32> @umin_v4i32_as_umin_v4i8(<4 x i32> %a0, <4 x i32> %a1) nounwind
; SSE2: # %bb.0:
; SSE2-NEXT: psrld $30, %xmm0
; SSE2-NEXT: psrld $29, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pminub %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: umin_v4i32_as_umin_v4i8:
@@ -69,11 +65,9 @@ define <4 x i32> @umin_v4i32_as_umin_v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwin
; SSE2: # %bb.0:
; SSE2-NEXT: psrld $16, %xmm0
; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: umin_v4i32_as_umin_v4i16:
@@ -100,39 +94,28 @@ define <2 x i64> @umin_v2i64_as_umin_v2i16(<2 x i64> %a0, <2 x i64> %a1) nounwin
; SSE2: # %bb.0:
; SSE2-NEXT: psrlq $49, %xmm0
; SSE2-NEXT: psrlq $63, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pminsw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: umin_v2i64_as_umin_v2i16:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa %xmm0, %xmm2
-; SSE42-NEXT: psrlq $49, %xmm2
+; SSE42-NEXT: psrlq $49, %xmm0
; SSE42-NEXT: psrlq $63, %xmm1
-; SSE42-NEXT: movdqa %xmm1, %xmm0
-; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE42-NEXT: movapd %xmm1, %xmm0
+; SSE42-NEXT: pminuw %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: umin_v2i64_as_umin_v2i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlq $49, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: umin_v2i64_as_umin_v2i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlq $49, %xmm0, %xmm0
; AVX2-NEXT: vpsrlq $63, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: umin_v2i64_as_umin_v2i16:
@@ -162,7 +145,6 @@ define <2 x i64> @umin_v2i64_as_umin_v2i32(<2 x i64> %a0, <2 x i64> %a1) nounwin
; SSE2-NEXT: psrlq $43, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
@@ -170,29 +152,23 @@ define <2 x i64> @umin_v2i64_as_umin_v2i32(<2 x i64> %a0, <2 x i64> %a1) nounwin
;
; SSE42-LABEL: umin_v2i64_as_umin_v2i32:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa %xmm0, %xmm2
-; SSE42-NEXT: psrlq $33, %xmm2
+; SSE42-NEXT: psrlq $33, %xmm0
; SSE42-NEXT: psrlq $43, %xmm1
-; SSE42-NEXT: movdqa %xmm1, %xmm0
-; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE42-NEXT: movapd %xmm1, %xmm0
+; SSE42-NEXT: pminud %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: umin_v2i64_as_umin_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $43, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: umin_v2i64_as_umin_v2i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlq $33, %xmm0, %xmm0
; AVX2-NEXT: vpsrlq $43, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: umin_v2i64_as_umin_v2i32:
@@ -221,35 +197,21 @@ define <4 x i64> @umin_v4i64_as_umin_v4i16(<4 x i64> %a0) nounwind {
; SSE2-NEXT: psrlq $48, %xmm1
; SSE2-NEXT: psrlq $48, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65530,65530]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psubusw %xmm2, %xmm3
+; SSE2-NEXT: psubw %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psubusw %xmm2, %xmm3
+; SSE2-NEXT: psubw %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSE42-LABEL: umin_v4i64_as_umin_v4i16:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa %xmm0, %xmm2
-; SSE42-NEXT: psrlq $48, %xmm2
-; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [65530,65530]
-; SSE42-NEXT: movdqa %xmm3, %xmm0
-; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
-; SSE42-NEXT: movdqa %xmm3, %xmm4
-; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm4
; SSE42-NEXT: psrlq $48, %xmm1
-; SSE42-NEXT: movdqa %xmm3, %xmm0
-; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE42-NEXT: movapd %xmm4, %xmm0
-; SSE42-NEXT: movapd %xmm3, %xmm1
+; SSE42-NEXT: psrlq $48, %xmm0
+; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [65530,65530]
+; SSE42-NEXT: pminuw %xmm2, %xmm0
+; SSE42-NEXT: pminuw %xmm2, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: umin_v4i64_as_umin_v4i16:
@@ -258,10 +220,8 @@ define <4 x i64> @umin_v4i64_as_umin_v4i16(<4 x i64> %a0) nounwind {
; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm4
-; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpminuw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -269,8 +229,7 @@ define <4 x i64> @umin_v4i64_as_umin_v4i16(<4 x i64> %a0) nounwind {
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlq $48, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [65530,65530,65530,65530]
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: umin_v4i64_as_umin_v4i16: