[llvm] aa754f7 - [IR] llvm::createMinMaxOp - create integer min/max intrinsics instead of icmp/sel

Thu Apr 13 08:41:08 PDT 2023

Author: Simon Pilgrim
Date: 2023-04-13T16:40:43+01:00
New Revision: aa754f7e0fddcc6c4d6737c71baf357825b09a3e

URL: https://github.com/llvm/llvm-project/commit/aa754f7e0fddcc6c4d6737c71baf357825b09a3e
DIFF: https://github.com/llvm/llvm-project/commit/aa754f7e0fddcc6c4d6737c71baf357825b09a3e.diff

LOG: [IR] llvm::createMinMaxOp - create integer min/max intrinsics instead of icmp/sel

Based off D148215, when expanding a min/max reduction we should be creating min/max intrinsics directly instead of relying on instcombine to fold them back together.

This patch handles integer min/max cases. Hopefully we can add floating point support soon (at least for fastmath/nnan cases) - but we're missing some of the plumbing to pass the correct FMF to the intrinsic at the moment.

Differential Revision: https://reviews.llvm.org/D148221

Added: 
    

Modified: 
    llvm/include/llvm/Transforms/Utils/LoopUtils.h
    llvm/lib/Transforms/Utils/LoopUtils.cpp
    llvm/test/CodeGen/Generic/expand-experimental-reductions.ll
    llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
    llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
    llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
    llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 56adb7f72a18..8921b24bc4f5 100644

--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -353,6 +353,9 @@ bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
                         SinkAndHoistLICMFlags &LICMFlags,
                         OptimizationRemarkEmitter *ORE = nullptr);
 
+/// Returns the min/max intrinsic used when expanding a min/max reduction.
+Intrinsic::ID getMinMaxReductionIntrinsicOp(RecurKind RK);
+
 /// Returns the comparison predicate used when expanding a min/max reduction.
 CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK);
 

diff  --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 31872c15b033..90c9396e11bd 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -893,6 +893,25 @@ bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop,
   return true;
 }
 
+Intrinsic::ID llvm::getMinMaxReductionIntrinsicOp(RecurKind RK) {
+  switch (RK) {
+  default:
+    llvm_unreachable("Unknown min/max recurrence kind");
+  case RecurKind::UMin:
+    return Intrinsic::umin;
+  case RecurKind::UMax:
+    return Intrinsic::umax;
+  case RecurKind::SMin:
+    return Intrinsic::smin;
+  case RecurKind::SMax:
+    return Intrinsic::smax;
+  case RecurKind::FMin:
+    return Intrinsic::minnum;
+  case RecurKind::FMax:
+    return Intrinsic::maxnum;
+  }
+}
+
 CmpInst::Predicate llvm::getMinMaxReductionPredicate(RecurKind RK) {
   switch (RK) {
   default:
@@ -923,6 +942,13 @@ Value *llvm::createSelectCmpOp(IRBuilderBase &Builder, Value *StartVal,
 
 Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
                             Value *Right) {
+  Type *Ty = Left->getType();
+  if (Ty->isIntOrIntVectorTy()) {
+    // TODO: Add float minnum/maxnum support when FMF nnan is set.
+    Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK);
+    return Builder.CreateIntrinsic(Ty, Id, {Left, Right}, nullptr,
+                                   "rdx.minmax");
+  }
   CmpInst::Predicate Pred = getMinMaxReductionPredicate(RK);
   Value *Cmp = Builder.CreateCmp(Pred, Left, Right, "rdx.minmax.cmp");
   Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");

diff  --git a/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll b/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll
index 0202193bdd5b..91c905574f65 100644
--- a/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll
+++ b/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll
@@ -225,9 +225,8 @@ define i64 @smax_i64(<2 x i64> %vec) {
 ; CHECK-LABEL: @smax_i64(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> poison, <2 x i32> <i32 1, i32 undef>
-; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <2 x i64> [[VEC]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]]
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT:    [[RDX_MINMAX:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]])
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX]], i32 0
 ; CHECK-NEXT:    ret i64 [[TMP0]]
 ;
 entry:
@@ -239,9 +238,8 @@ define i64 @smin_i64(<2 x i64> %vec) {
 ; CHECK-LABEL: @smin_i64(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> poison, <2 x i32> <i32 1, i32 undef>
-; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp slt <2 x i64> [[VEC]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]]
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT:    [[RDX_MINMAX:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]])
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX]], i32 0
 ; CHECK-NEXT:    ret i64 [[TMP0]]
 ;
 entry:
@@ -253,9 +251,8 @@ define i64 @umax_i64(<2 x i64> %vec) {
 ; CHECK-LABEL: @umax_i64(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> poison, <2 x i32> <i32 1, i32 undef>
-; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp ugt <2 x i64> [[VEC]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]]
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT:    [[RDX_MINMAX:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]])
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX]], i32 0
 ; CHECK-NEXT:    ret i64 [[TMP0]]
 ;
 entry:
@@ -267,9 +264,8 @@ define i64 @umin_i64(<2 x i64> %vec) {
 ; CHECK-LABEL: @umin_i64(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> poison, <2 x i32> <i32 1, i32 undef>
-; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp ult <2 x i64> [[VEC]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]]
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT:    [[RDX_MINMAX:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]])
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX]], i32 0
 ; CHECK-NEXT:    ret i64 [[TMP0]]
 ;
 entry:

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
index fb8c7521906a..b6691ed6ecfa 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
@@ -137,9 +137,8 @@ define i32 @smin(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
 ; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[ICMP1]], <vscale x 8 x i32> %[[LOAD1]]
 ; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[ICMP2]], <vscale x 8 x i32> %[[LOAD2]]
 ; CHECK: middle.block:
-; CHECK: %[[ICMP:.*]] = icmp slt <vscale x 8 x i32> %[[SEL1]], %[[SEL2]]
-; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[ICMP]], <vscale x 8 x i32> %[[SEL1]], <vscale x 8 x i32> %[[SEL2]]
-; CHECK-NEXT: call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32>  %[[SEL]])
+; CHECK: %[[RDX:.*]] = call <vscale x 8 x i32> @llvm.smin.nxv8i32(<vscale x 8 x i32> %[[SEL1]], <vscale x 8 x i32> %[[SEL2]])
+; CHECK-NEXT: call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32>  %[[RDX]])
 entry:
   br label %for.body
 
@@ -171,9 +170,8 @@ define i32 @umax(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
 ; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[ICMP1]], <vscale x 8 x i32> %[[LOAD1]]
 ; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[ICMP2]], <vscale x 8 x i32> %[[LOAD2]]
 ; CHECK: middle.block:
-; CHECK: %[[ICMP:.*]] = icmp ugt <vscale x 8 x i32> %[[SEL1]], %[[SEL2]]
-; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[ICMP]], <vscale x 8 x i32> %[[SEL1]], <vscale x 8 x i32> %[[SEL2]]
-; CHECK-NEXT: call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32>  %[[SEL]])
+; CHECK: %[[RDX:.*]] = call <vscale x 8 x i32> @llvm.umax.nxv8i32(<vscale x 8 x i32> %[[SEL1]], <vscale x 8 x i32> %[[SEL2]])
+; CHECK-NEXT: call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32>  %[[RDX]])
 entry:
   br label %for.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
index c553977a8362..fb4b7d11851d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
@@ -141,9 +141,8 @@ define i32 @smin(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
 ; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[ICMP1]], <vscale x 8 x i32> %[[LOAD1]]
 ; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[ICMP2]], <vscale x 8 x i32> %[[LOAD2]]
 ; CHECK: middle.block:
-; CHECK: %[[ICMP:.*]] = icmp slt <vscale x 8 x i32> %[[SEL1]], %[[SEL2]]
-; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[ICMP]], <vscale x 8 x i32> %[[SEL1]], <vscale x 8 x i32> %[[SEL2]]
-; CHECK-NEXT: call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32>  %[[SEL]])
+; CHECK: %[[RDX:.*]] = call <vscale x 8 x i32> @llvm.smin.nxv8i32(<vscale x 8 x i32> %[[SEL1]], <vscale x 8 x i32> %[[SEL2]])
+; CHECK-NEXT: call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32>  %[[RDX]])
 entry:
   br label %for.body
 
@@ -175,9 +174,8 @@ define i32 @umax(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
 ; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[ICMP1]], <vscale x 8 x i32> %[[LOAD1]]
 ; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[ICMP2]], <vscale x 8 x i32> %[[LOAD2]]
 ; CHECK: middle.block:
-; CHECK: %[[ICMP:.*]] = icmp ugt <vscale x 8 x i32> %[[SEL1]], %[[SEL2]]
-; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[ICMP]], <vscale x 8 x i32> %[[SEL1]], <vscale x 8 x i32> %[[SEL2]]
-; CHECK-NEXT: call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32>  %[[SEL]])
+; CHECK: %[[RDX:.*]] = call <vscale x 8 x i32> @llvm.umax.nxv8i32(<vscale x 8 x i32> %[[SEL1]], <vscale x 8 x i32> %[[SEL2]])
+; CHECK-NEXT: call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32>  %[[RDX]])
 entry:
   br label %for.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 59eaac895b19..bf5b2e901612 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -265,9 +265,8 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
-; UNROLL-NO-IC-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp slt <4 x i32> [[TMP17]], [[TMP18]]
-; UNROLL-NO-IC-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP17]], <4 x i32> [[TMP18]]
-; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[RDX_MINMAX_SELECT]])
+; UNROLL-NO-IC-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP17]], <4 x i32> [[TMP18]])
+; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[RDX_MINMAX]])
 ; UNROLL-NO-IC-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i32 3
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -337,17 +336,16 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
 ; UNROLL-NO-VF-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
-; UNROLL-NO-VF-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp slt i32 [[TMP15]], [[TMP16]]
-; UNROLL-NO-VF-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select i1 [[RDX_MINMAX_CMP]], i32 [[TMP15]], i32 [[TMP16]]
+; UNROLL-NO-VF-NEXT:    [[RDX_MINMAX:%.*]] = call i32 @llvm.smin.i32(i32 [[TMP15]], i32 [[TMP16]])
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
 ; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[DOTPRE]], [[FOR_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
-; UNROLL-NO-VF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ poison, [[FOR_PREHEADER]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ poison, [[FOR_PREHEADER]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.cond.cleanup.loopexit:
-; UNROLL-NO-VF-NEXT:    [[MINMAX_0_COND_LCSSA:%.*]] = phi i32 [ [[MINMAX_0_COND:%.*]], [[SCALAR_BODY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-VF-NEXT:    [[MINMAX_0_COND_LCSSA:%.*]] = phi i32 [ [[MINMAX_0_COND:%.*]], [[SCALAR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; UNROLL-NO-VF:       for.cond.cleanup:
 ; UNROLL-NO-VF-NEXT:    [[MINMAX_0_LCSSA:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[MINMAX_0_COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]

diff  --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
index c2266435bbc5..8948225cf1d7 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
@@ -139,12 +139,10 @@ define i32 @smin_v4i32(ptr %p) #0 {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp slt <4 x i32> [[TMP1]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP1]], <4 x i32> [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[RDX_MINMAX_CMP4:%.*]] = icmp slt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF3]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT5:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP4]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF3]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT5]], i32 0
+; CHECK-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[RDX_SHUF]])
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX2:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[RDX_MINMAX]], <4 x i32> [[RDX_SHUF3]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[RDX_MINMAX2]], i32 0
 ; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
 entry:
@@ -192,12 +190,10 @@ define i32 @umax_v4i32(ptr %p) #0 {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp ugt <4 x i32> [[TMP1]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP1]], <4 x i32> [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[RDX_MINMAX_CMP4:%.*]] = icmp ugt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF3]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT5:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP4]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF3]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT5]], i32 0
+; CHECK-NEXT:    [[RDX_MINMAX:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[RDX_SHUF]])
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX2:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[RDX_MINMAX]], <4 x i32> [[RDX_SHUF3]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[RDX_MINMAX2]], i32 0
 ; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
 entry: