[llvm] r313409 - [SLP] Revert r312791 and other necessary commits, except for TTI and

Fri Sep 15 15:23:27 PDT 2017

Author: chandlerc
Date: Fri Sep 15 15:23:27 2017
New Revision: 313409

URL: http://llvm.org/viewvc/llvm-project?rev=313409&view=rev
Log:
[SLP] Revert r312791 and other necessary commits, except for TTI and
CostModel.

The original patch added support for horizontal min/max reductions to
the SLP vectorizer.

This patch causes LLVM to miscompile fairly simple signed min
reductions. I have attached a test progrom to http://llvm.org/PR34635
that shows the behavior change after this patch. We found this in a test
for the open source Eigen library, but also in other code.

Unfortunately, the revert is moderately challenging. It required
reverting:
r313042: [SLP] Test with multiple uses of conditional op and wrong parent.
r312853: [SLP] Fix buildbots, NFC.
r312793: [SLP] Fix the warning about paths not returning the value, NFC.
r312791: [SLP] Support for horizontal min/max reduction.

And even then, I had to completely skip reverting the changes to TTI and
CostModel because r312832 rewrote so much of this code. Plus, the cost
modeling changes aren implicated in the miscompile, so they should be
fine and will just not be used until this gets re-introduced.

Modified:
    llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll

Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=313409&r1=313408&r2=313409&view=diff
==============================================================================

--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Fri Sep 15 15:23:27 2017
@@ -4653,17 +4653,11 @@ class HorizontalReduction {
   // Use map vector to make stable output.
   MapVector<Instruction *, Value *> ExtraArgs;
 
-  /// Kind of the reduction data.
-  enum ReductionKind {
-    RK_None,       /// Not a reduction.
-    RK_Arithmetic, /// Binary reduction data.
-    RK_Min,        /// Minimum reduction data.
-    RK_UMin,       /// Unsigned minimum reduction data.
-    RK_Max,        /// Maximum reduction data.
-    RK_UMax,       /// Unsigned maximum reduction data.
-  };
   /// Contains info about operation, like its opcode, left and right operands.
-  class OperationData {
+  struct OperationData {
+    /// true if the operation is a reduced value, false if reduction operation.
+    bool IsReducedValue = false;
+
     /// Opcode of the instruction.
     unsigned Opcode = 0;
 
@@ -4672,21 +4666,12 @@ class HorizontalReduction {
 
     /// Right operand of the reduction operation.
     Value *RHS = nullptr;
-    /// Kind of the reduction operation.
-    ReductionKind Kind = RK_None;
-    /// True if float point min/max reduction has no NaNs.
-    bool NoNaN = false;
 
     /// Checks if the reduction operation can be vectorized.
     bool isVectorizable() const {
       return LHS && RHS &&
-             // We currently only support adds && min/max reductions.
-             ((Kind == RK_Arithmetic &&
-               (Opcode == Instruction::Add || Opcode == Instruction::FAdd)) ||
-              ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
-               (Kind == RK_Min || Kind == RK_Max)) ||
-              (Opcode == Instruction::ICmp &&
-               (Kind == RK_UMin || Kind == RK_UMax)));
+             // We currently only support adds.
+             (Opcode == Instruction::Add || Opcode == Instruction::FAdd);
     }
 
   public:
@@ -4694,92 +4679,43 @@ class HorizontalReduction {
 
     /// Construction for reduced values. They are identified by opcode only and
     /// don't have associated LHS/RHS values.
-    explicit OperationData(Value *V) : Kind(RK_None) {
+    explicit OperationData(Value *V) : IsReducedValue(true) {
       if (auto *I = dyn_cast<Instruction>(V))
         Opcode = I->getOpcode();
     }
 
-    /// Constructor for reduction operations with opcode and its left and
+    /// Constructor for binary reduction operations with opcode and its left and
     /// right operands.
-    OperationData(unsigned Opcode, Value *LHS, Value *RHS, ReductionKind Kind,
-                  bool NoNaN = false)
-        : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind), NoNaN(NoNaN) {
-      assert(Kind != RK_None && "One of the reduction operations is expected.");
-    }
+    OperationData(unsigned Opcode, Value *LHS, Value *RHS)
+        : Opcode(Opcode), LHS(LHS), RHS(RHS) {}
+
     explicit operator bool() const { return Opcode; }
 
     /// Get the index of the first operand.
     unsigned getFirstOperandIndex() const {
       assert(!!*this && "The opcode is not set.");
-      switch (Kind) {
-      case RK_Min:
-      case RK_UMin:
-      case RK_Max:
-      case RK_UMax:
-        return 1;
-      case RK_Arithmetic:
-      case RK_None:
-        break;
-      }
       return 0;
     }
 
     /// Total number of operands in the reduction operation.
     unsigned getNumberOfOperands() const {
-      assert(Kind != RK_None && !!*this && LHS && RHS &&
+      assert(!IsReducedValue && !!*this && LHS && RHS &&
              "Expected reduction operation.");
-      switch (Kind) {
-      case RK_Arithmetic:
-        return 2;
-      case RK_Min:
-      case RK_UMin:
-      case RK_Max:
-      case RK_UMax:
-        return 3;
-      case RK_None:
-        break;
-      }
-      llvm_unreachable("Reduction kind is not set");
+      return 2;
     }
 
     /// Expected number of uses for reduction operations/reduced values.
     unsigned getRequiredNumberOfUses() const {
-      assert(Kind != RK_None && !!*this && LHS && RHS &&
+      assert(!IsReducedValue && !!*this && LHS && RHS &&
              "Expected reduction operation.");
-      switch (Kind) {
-      case RK_Arithmetic:
-        return 1;
-      case RK_Min:
-      case RK_UMin:
-      case RK_Max:
-      case RK_UMax:
-        return 2;
-      case RK_None:
-        break;
-      }
-      llvm_unreachable("Reduction kind is not set");
+      return 1;
     }
 
     /// Checks if instruction is associative and can be vectorized.
     bool isAssociative(Instruction *I) const {
-      assert(Kind != RK_None && *this && LHS && RHS &&
+      assert(!IsReducedValue && *this && LHS && RHS &&
              "Expected reduction operation.");
-      switch (Kind) {
-      case RK_Arithmetic:
-        return I->isAssociative();
-      case RK_Min:
-      case RK_Max:
-        return Opcode == Instruction::ICmp ||
-               cast<Instruction>(I->getOperand(0))->hasUnsafeAlgebra();
-      case RK_UMin:
-      case RK_UMax:
-        assert(Opcode == Instruction::ICmp &&
-               "Only integer compare operation is expected.");
-        return true;
-      case RK_None:
-        break;
-      }
-      llvm_unreachable("Reduction kind is not set");
+      return I->isAssociative();
     }
 
     /// Checks if the reduction operation can be vectorized.
@@ -4790,17 +4726,18 @@ class HorizontalReduction {
     /// Checks if two operation data are both a reduction op or both a reduced
     /// value.
     bool operator==(const OperationData &OD) {
-      assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) &&
+      assert(((IsReducedValue != OD.IsReducedValue) ||
+              ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) &&
              "One of the comparing operations is incorrect.");
-      return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode);
+      return this == &OD ||
+             (IsReducedValue == OD.IsReducedValue && Opcode == OD.Opcode);
     }
     bool operator!=(const OperationData &OD) { return !(*this == OD); }
     void clear() {
+      IsReducedValue = false;
       Opcode = 0;
       LHS = nullptr;
       RHS = nullptr;
-      Kind = RK_None;
-      NoNaN = false;
     }
 
     /// Get the opcode of the reduction operation.
@@ -4809,81 +4746,16 @@ class HorizontalReduction {
       return Opcode;
     }
 
-    /// Get kind of reduction data.
-    ReductionKind getKind() const { return Kind; }
     Value *getLHS() const { return LHS; }
     Value *getRHS() const { return RHS; }
-    Type *getConditionType() const {
-      switch (Kind) {
-      case RK_Arithmetic:
-        return nullptr;
-      case RK_Min:
-      case RK_Max:
-      case RK_UMin:
-      case RK_UMax:
-        return CmpInst::makeCmpResultType(LHS->getType());
-      case RK_None:
-        break;
-      }
-      llvm_unreachable("Reduction kind is not set");
-    }
 
     /// Creates reduction operation with the current opcode.
     Value *createOp(IRBuilder<> &Builder, const Twine &Name = "") const {
-      assert(isVectorizable() &&
-             "Expected add|fadd or min/max reduction operation.");
-      Value *Cmp;
-      switch (Kind) {
-      case RK_Arithmetic:
-        return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS,
-                                   Name);
-      case RK_Min:
-        Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS)
-                                          : Builder.CreateFCmpOLT(LHS, RHS);
-        break;
-      case RK_Max:
-        Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS)
-                                          : Builder.CreateFCmpOGT(LHS, RHS);
-        break;
-      case RK_UMin:
-        assert(Opcode == Instruction::ICmp && "Expected integer types.");
-        Cmp = Builder.CreateICmpULT(LHS, RHS);
-        break;
-      case RK_UMax:
-        assert(Opcode == Instruction::ICmp && "Expected integer types.");
-        Cmp = Builder.CreateICmpUGT(LHS, RHS);
-        break;
-      case RK_None:
-        llvm_unreachable("Unknown reduction operation.");
-      }
-      return Builder.CreateSelect(Cmp, LHS, RHS, Name);
-    }
-    TargetTransformInfo::ReductionFlags getFlags() const {
-      TargetTransformInfo::ReductionFlags Flags;
-      Flags.NoNaN = NoNaN;
-      switch (Kind) {
-      case RK_Arithmetic:
-        break;
-      case RK_Min:
-        Flags.IsSigned = Opcode == Instruction::ICmp;
-        Flags.IsMaxOp = false;
-        break;
-      case RK_Max:
-        Flags.IsSigned = Opcode == Instruction::ICmp;
-        Flags.IsMaxOp = true;
-        break;
-      case RK_UMin:
-        Flags.IsSigned = false;
-        Flags.IsMaxOp = false;
-        break;
-      case RK_UMax:
-        Flags.IsSigned = false;
-        Flags.IsMaxOp = true;
-        break;
-      case RK_None:
-        llvm_unreachable("Reduction kind is not set");
-      }
-      return Flags;
+      assert(!IsReducedValue &&
+             (Opcode == Instruction::FAdd || Opcode == Instruction::Add) &&
+             "Expected add|fadd reduction operation.");
+      return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS,
+                                 Name);
     }
   };
 
@@ -4925,32 +4797,8 @@ class HorizontalReduction {
 
     Value *LHS;
     Value *RHS;
-    if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V)) {
-      return OperationData(cast<BinaryOperator>(V)->getOpcode(), LHS, RHS,
-                           RK_Arithmetic);
-    }
-    if (auto *Select = dyn_cast<SelectInst>(V)) {
-      // Look for a min/max pattern.
-      if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
-        return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
-      } else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
-        return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
-      } else if (m_OrdFMin(m_Value(LHS), m_Value(RHS)).match(Select) ||
-                 m_UnordFMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
-        return OperationData(
-            Instruction::FCmp, LHS, RHS, RK_Min,
-            cast<Instruction>(Select->getCondition())->hasNoNaNs());
-      } else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
-        return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
-      } else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
-        return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
-      } else if (m_OrdFMax(m_Value(LHS), m_Value(RHS)).match(Select) ||
-                 m_UnordFMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
-        return OperationData(
-            Instruction::FCmp, LHS, RHS, RK_Max,
-            cast<Instruction>(Select->getCondition())->hasNoNaNs());
-      }
-    }
+    if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V))
+      return OperationData(cast<BinaryOperator>(V)->getOpcode(), LHS, RHS);
     return OperationData(V);
   }
 
@@ -5143,9 +4991,8 @@ public:
       if (VectorizedTree) {
         Builder.SetCurrentDebugLocation(Loc);
         OperationData VectReductionData(ReductionData.getOpcode(),
-                                        VectorizedTree, ReducedSubTree,
-                                        ReductionData.getKind());
-        VectorizedTree = VectReductionData.createOp(Builder, "op.rdx");
+                                        VectorizedTree, ReducedSubTree);
+        VectorizedTree = VectReductionData.createOp(Builder, "bin.rdx");
         propagateIRFlags(VectorizedTree, ReductionOps);
       } else
         VectorizedTree = ReducedSubTree;
@@ -5159,8 +5006,7 @@ public:
         auto *I = cast<Instruction>(ReducedVals[i]);
         Builder.SetCurrentDebugLocation(I->getDebugLoc());
         OperationData VectReductionData(ReductionData.getOpcode(),
-                                        VectorizedTree, I,
-                                        ReductionData.getKind());
+                                        VectorizedTree, I);
         VectorizedTree = VectReductionData.createOp(Builder);
         propagateIRFlags(VectorizedTree, ReductionOps);
       }
@@ -5171,9 +5017,8 @@ public:
         for (auto *I : Pair.second) {
           Builder.SetCurrentDebugLocation(I->getDebugLoc());
           OperationData VectReductionData(ReductionData.getOpcode(),
-                                          VectorizedTree, Pair.first,
-                                          ReductionData.getKind());
-          VectorizedTree = VectReductionData.createOp(Builder, "op.extra");
+                                          VectorizedTree, Pair.first);
+          VectorizedTree = VectReductionData.createOp(Builder, "bin.extra");
           propagateIRFlags(VectorizedTree, I);
         }
       }
@@ -5194,58 +5039,19 @@ private:
     Type *ScalarTy = FirstReducedVal->getType();
     Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
 
-    int PairwiseRdxCost;
-    int SplittingRdxCost;
-    switch (ReductionData.getKind()) {
-    case RK_Arithmetic:
-      PairwiseRdxCost =
-          TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
-                                          /*IsPairwiseForm=*/true);
-      SplittingRdxCost =
-          TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
-                                          /*IsPairwiseForm=*/false);
-      break;
-    case RK_Min:
-    case RK_Max:
-    case RK_UMin:
-    case RK_UMax: {
-      Type *VecCondTy = CmpInst::makeCmpResultType(VecTy);
-      bool IsUnsigned = ReductionData.getKind() == RK_UMin ||
-                        ReductionData.getKind() == RK_UMax;
-      PairwiseRdxCost =
-          TTI->getMinMaxReductionCost(VecTy, VecCondTy,
-                                      /*IsPairwiseForm=*/true, IsUnsigned);
-      SplittingRdxCost =
-          TTI->getMinMaxReductionCost(VecTy, VecCondTy,
-                                      /*IsPairwiseForm=*/false, IsUnsigned);
-      break;
-    }
-    case RK_None:
-      llvm_unreachable("Expected arithmetic or min/max reduction operation");
-    }
+    int PairwiseRdxCost =
+        TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
+                                        /*IsPairwiseForm=*/true);
+    int SplittingRdxCost =
+        TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
+                                        /*IsPairwiseForm=*/false);
 
     IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
     int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
 
-    int ScalarReduxCost;
-    switch (ReductionData.getKind()) {
-    case RK_Arithmetic:
-      ScalarReduxCost =
-          TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
-      break;
-    case RK_Min:
-    case RK_Max:
-    case RK_UMin:
-    case RK_UMax:
-      ScalarReduxCost =
-          TTI->getCmpSelInstrCost(ReductionData.getOpcode(), ScalarTy) +
-          TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
-                                  CmpInst::makeCmpResultType(ScalarTy));
-      break;
-    case RK_None:
-      llvm_unreachable("Expected arithmetic or min/max reduction operation");
-    }
-    ScalarReduxCost *= (ReduxWidth - 1);
+    int ScalarReduxCost =
+        (ReduxWidth - 1) *
+        TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
 
     DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
                  << " for reduction that starts with " << *FirstReducedVal
@@ -5267,7 +5073,7 @@ private:
     if (!IsPairwiseReduction)
       return createSimpleTargetReduction(
           Builder, TTI, ReductionData.getOpcode(), VectorizedValue,
-          ReductionData.getFlags(), RedOps);
+          TargetTransformInfo::ReductionFlags(), RedOps);
 
     Value *TmpVec = VectorizedValue;
     for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
@@ -5282,8 +5088,8 @@ private:
           TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
           "rdx.shuf.r");
       OperationData VectReductionData(ReductionData.getOpcode(), LeftShuf,
-                                      RightShuf, ReductionData.getKind());
-      TmpVec = VectReductionData.createOp(Builder, "op.rdx");
+                                      RightShuf);
+      TmpVec = VectReductionData.createOp(Builder, "bin.rdx");
       propagateIRFlags(TmpVec, RedOps);
     }
 
@@ -5444,11 +5250,9 @@ static bool tryToVectorizeHorReductionOr
     auto *Inst = dyn_cast<Instruction>(V);
     if (!Inst)
       continue;
-    auto *BI = dyn_cast<BinaryOperator>(Inst);
-    auto *SI = dyn_cast<SelectInst>(Inst);
-    if (BI || SI) {
+    if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
       HorizontalReduction HorRdx;
-      if (HorRdx.matchAssociativeReduction(P, Inst)) {
+      if (HorRdx.matchAssociativeReduction(P, BI)) {
         if (HorRdx.tryToReduce(R, TTI)) {
           Res = true;
           // Set P to nullptr to avoid re-analysis of phi node in
@@ -5457,7 +5261,7 @@ static bool tryToVectorizeHorReductionOr
           continue;
         }
       }
-      if (P && BI) {
+      if (P) {
         Inst = dyn_cast<Instruction>(BI->getOperand(0));
         if (Inst == P)
           Inst = dyn_cast<Instruction>(BI->getOperand(1));

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-list.ll?rev=313409&r1=313408&r2=313409&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-list.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-list.ll Fri Sep 15 15:23:27 2017
@@ -117,11 +117,11 @@ define float @bazz() {
 ; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]]
-; CHECK-NEXT:    [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]]
+; CHECK-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; CHECK-NEXT:    [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV6]]
 ; CHECK-NEXT:    [[ADD19_3:%.*]] = fadd fast float undef, [[ADD19_2]]
-; CHECK-NEXT:    store float [[OP_EXTRA5]], float* @res, align 4
-; CHECK-NEXT:    ret float [[OP_EXTRA5]]
+; CHECK-NEXT:    store float [[BIN_EXTRA5]], float* @res, align 4
+; CHECK-NEXT:    ret float [[BIN_EXTRA5]]
 ;
 ; THRESHOLD-LABEL: @bazz(
 ; THRESHOLD-NEXT:  entry:
@@ -148,11 +148,11 @@ define float @bazz() {
 ; THRESHOLD-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; THRESHOLD-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; THRESHOLD-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
-; THRESHOLD-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]]
-; THRESHOLD-NEXT:    [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]]
+; THRESHOLD-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; THRESHOLD-NEXT:    [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV6]]
 ; THRESHOLD-NEXT:    [[ADD19_3:%.*]] = fadd fast float undef, [[ADD19_2]]
-; THRESHOLD-NEXT:    store float [[OP_EXTRA5]], float* @res, align 4
-; THRESHOLD-NEXT:    ret float [[OP_EXTRA5]]
+; THRESHOLD-NEXT:    store float [[BIN_EXTRA5]], float* @res, align 4
+; THRESHOLD-NEXT:    ret float [[BIN_EXTRA5]]
 ;
 entry:
   %0 = load i32, i32* @n, align 4
@@ -327,53 +327,47 @@ entry:
 define float @bar() {
 ; CHECK-LABEL: @bar(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
 ; CHECK-NEXT:    [[CMP4:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float undef, float undef
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
-; CHECK-NEXT:    [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[TMP5]]
-; CHECK-NEXT:    [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float undef
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
-; CHECK-NEXT:    [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[TMP6]]
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP2]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP2]], <4 x float> [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0
-; CHECK-NEXT:    [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float undef
-; CHECK-NEXT:    store float [[TMP7]], float* @res, align 4
-; CHECK-NEXT:    ret float [[TMP7]]
+; CHECK-NEXT:    [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float [[TMP3]], float [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
+; CHECK-NEXT:    [[MUL3_1:%.*]] = fmul fast float [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[MUL3_1]]
+; CHECK-NEXT:    [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float [[MUL3_1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
+; CHECK-NEXT:    [[MUL3_2:%.*]] = fmul fast float [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[MUL3_2]]
+; CHECK-NEXT:    [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float [[MUL3_2]]
+; CHECK-NEXT:    store float [[MAX_0_MUL3_2]], float* @res, align 4
+; CHECK-NEXT:    ret float [[MAX_0_MUL3_2]]
 ;
 ; THRESHOLD-LABEL: @bar(
 ; THRESHOLD-NEXT:  entry:
-; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
-; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]]
-; THRESHOLD-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
 ; THRESHOLD-NEXT:    [[CMP4:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
-; THRESHOLD-NEXT:    [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float undef, float undef
-; THRESHOLD-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
-; THRESHOLD-NEXT:    [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[TMP5]]
-; THRESHOLD-NEXT:    [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float undef
-; THRESHOLD-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
-; THRESHOLD-NEXT:    [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[TMP6]]
-; THRESHOLD-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; THRESHOLD-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP2]], [[RDX_SHUF]]
-; THRESHOLD-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP2]], <4 x float> [[RDX_SHUF]]
-; THRESHOLD-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; THRESHOLD-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; THRESHOLD-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]]
-; THRESHOLD-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0
-; THRESHOLD-NEXT:    [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float undef
-; THRESHOLD-NEXT:    store float [[TMP7]], float* @res, align 4
-; THRESHOLD-NEXT:    ret float [[TMP7]]
+; THRESHOLD-NEXT:    [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float [[TMP3]], float [[TMP4]]
+; THRESHOLD-NEXT:    [[TMP5:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
+; THRESHOLD-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
+; THRESHOLD-NEXT:    [[MUL3_1:%.*]] = fmul fast float [[TMP6]], [[TMP5]]
+; THRESHOLD-NEXT:    [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[MUL3_1]]
+; THRESHOLD-NEXT:    [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float [[MUL3_1]]
+; THRESHOLD-NEXT:    [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
+; THRESHOLD-NEXT:    [[TMP8:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
+; THRESHOLD-NEXT:    [[MUL3_2:%.*]] = fmul fast float [[TMP8]], [[TMP7]]
+; THRESHOLD-NEXT:    [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[MUL3_2]]
+; THRESHOLD-NEXT:    [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float [[MUL3_2]]
+; THRESHOLD-NEXT:    store float [[MAX_0_MUL3_2]], float* @res, align 4
+; THRESHOLD-NEXT:    ret float [[MAX_0_MUL3_2]]
 ;
 entry:
   %0 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
@@ -518,9 +512,9 @@ define float @f(float* nocapture readonl
 ; CHECK-NEXT:    [[RDX_SHUF15:%.*]] = shufflevector <16 x float> [[BIN_RDX14]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX16:%.*]] = fadd fast <16 x float> [[BIN_RDX14]], [[RDX_SHUF15]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x float> [[BIN_RDX16]], i32 0
-; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[BIN_RDX17:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    [[ADD_47:%.*]] = fadd fast float undef, [[ADD_46]]
-; CHECK-NEXT:    ret float [[OP_RDX]]
+; CHECK-NEXT:    ret float [[BIN_RDX17]]
 ;
 ; THRESHOLD-LABEL: @f(
 ; THRESHOLD-NEXT:  entry:
@@ -641,9 +635,9 @@ define float @f(float* nocapture readonl
 ; THRESHOLD-NEXT:    [[RDX_SHUF15:%.*]] = shufflevector <16 x float> [[BIN_RDX14]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; THRESHOLD-NEXT:    [[BIN_RDX16:%.*]] = fadd fast <16 x float> [[BIN_RDX14]], [[RDX_SHUF15]]
 ; THRESHOLD-NEXT:    [[TMP5:%.*]] = extractelement <16 x float> [[BIN_RDX16]], i32 0
-; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
+; THRESHOLD-NEXT:    [[BIN_RDX17:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
 ; THRESHOLD-NEXT:    [[ADD_47:%.*]] = fadd fast float undef, [[ADD_46]]
-; THRESHOLD-NEXT:    ret float [[OP_RDX]]
+; THRESHOLD-NEXT:    ret float [[BIN_RDX17]]
 ;
   entry:
   %0 = load float, float* %x, align 4
@@ -871,9 +865,9 @@ define float @f1(float* nocapture readon
 ; CHECK-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]]
+; CHECK-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]]
 ; CHECK-NEXT:    [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]]
-; CHECK-NEXT:    ret float [[OP_EXTRA]]
+; CHECK-NEXT:    ret float [[BIN_EXTRA]]
 ;
 ; THRESHOLD-LABEL: @f1(
 ; THRESHOLD-NEXT:  entry:
@@ -954,9 +948,9 @@ define float @f1(float* nocapture readon
 ; THRESHOLD-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; THRESHOLD-NEXT:    [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]]
 ; THRESHOLD-NEXT:    [[TMP2:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
-; THRESHOLD-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]]
+; THRESHOLD-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]]
 ; THRESHOLD-NEXT:    [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]]
-; THRESHOLD-NEXT:    ret float [[OP_EXTRA]]
+; THRESHOLD-NEXT:    ret float [[BIN_EXTRA]]
 ;
   entry:
   %rem = srem i32 %a, %b
@@ -1144,14 +1138,14 @@ define float @loadadd31(float* nocapture
 ; CHECK-NEXT:    [[RDX_SHUF11:%.*]] = shufflevector <8 x float> [[BIN_RDX10]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX12:%.*]] = fadd fast <8 x float> [[BIN_RDX10]], [[RDX_SHUF11]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[BIN_RDX12]], i32 0
-; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[RDX_SHUF13:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX14:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF13]]
-; CHECK-NEXT:    [[RDX_SHUF15:%.*]] = shufflevector <4 x float> [[BIN_RDX14]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX16:%.*]] = fadd fast <4 x float> [[BIN_RDX14]], [[RDX_SHUF15]]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX16]], i32 0
-; CHECK-NEXT:    [[OP_RDX17:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast float [[OP_RDX17]], [[TMP1]]
+; CHECK-NEXT:    [[BIN_RDX13:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[RDX_SHUF14:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX15:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF14]]
+; CHECK-NEXT:    [[RDX_SHUF16:%.*]] = shufflevector <4 x float> [[BIN_RDX15]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX17:%.*]] = fadd fast <4 x float> [[BIN_RDX15]], [[RDX_SHUF16]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX17]], i32 0
+; CHECK-NEXT:    [[BIN_RDX18:%.*]] = fadd fast float [[BIN_RDX13]], [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast float [[BIN_RDX18]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]]
 ; CHECK-NEXT:    [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]]
 ; CHECK-NEXT:    ret float [[TMP12]]
@@ -1240,14 +1234,14 @@ define float @loadadd31(float* nocapture
 ; THRESHOLD-NEXT:    [[RDX_SHUF11:%.*]] = shufflevector <8 x float> [[BIN_RDX10]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; THRESHOLD-NEXT:    [[BIN_RDX12:%.*]] = fadd fast <8 x float> [[BIN_RDX10]], [[RDX_SHUF11]]
 ; THRESHOLD-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[BIN_RDX12]], i32 0
-; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
-; THRESHOLD-NEXT:    [[RDX_SHUF13:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; THRESHOLD-NEXT:    [[BIN_RDX14:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF13]]
-; THRESHOLD-NEXT:    [[RDX_SHUF15:%.*]] = shufflevector <4 x float> [[BIN_RDX14]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; THRESHOLD-NEXT:    [[BIN_RDX16:%.*]] = fadd fast <4 x float> [[BIN_RDX14]], [[RDX_SHUF15]]
-; THRESHOLD-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX16]], i32 0
-; THRESHOLD-NEXT:    [[OP_RDX17:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]]
-; THRESHOLD-NEXT:    [[TMP11:%.*]] = fadd fast float [[OP_RDX17]], [[TMP1]]
+; THRESHOLD-NEXT:    [[BIN_RDX13:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
+; THRESHOLD-NEXT:    [[RDX_SHUF14:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX15:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF14]]
+; THRESHOLD-NEXT:    [[RDX_SHUF16:%.*]] = shufflevector <4 x float> [[BIN_RDX15]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX17:%.*]] = fadd fast <4 x float> [[BIN_RDX15]], [[RDX_SHUF16]]
+; THRESHOLD-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX17]], i32 0
+; THRESHOLD-NEXT:    [[BIN_RDX18:%.*]] = fadd fast float [[BIN_RDX13]], [[TMP10]]
+; THRESHOLD-NEXT:    [[TMP11:%.*]] = fadd fast float [[BIN_RDX18]], [[TMP1]]
 ; THRESHOLD-NEXT:    [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]]
 ; THRESHOLD-NEXT:    [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]]
 ; THRESHOLD-NEXT:    ret float [[TMP12]]
@@ -1375,10 +1369,10 @@ define float @extra_args(float* nocaptur
 ; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
-; CHECK-NEXT:    [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
+; CHECK-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; CHECK-NEXT:    [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]]
 ; CHECK-NEXT:    [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
-; CHECK-NEXT:    ret float [[OP_EXTRA5]]
+; CHECK-NEXT:    ret float [[BIN_EXTRA5]]
 ;
 ; THRESHOLD-LABEL: @extra_args(
 ; THRESHOLD-NEXT:  entry:
@@ -1409,10 +1403,10 @@ define float @extra_args(float* nocaptur
 ; THRESHOLD-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; THRESHOLD-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; THRESHOLD-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
-; THRESHOLD-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
-; THRESHOLD-NEXT:    [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
+; THRESHOLD-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; THRESHOLD-NEXT:    [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]]
 ; THRESHOLD-NEXT:    [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
-; THRESHOLD-NEXT:    ret float [[OP_EXTRA5]]
+; THRESHOLD-NEXT:    ret float [[BIN_EXTRA5]]
 ;
   entry:
   %mul = mul nsw i32 %b, %a
@@ -1477,12 +1471,12 @@ define float @extra_args_same_several_ti
 ; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
-; CHECK-NEXT:    [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00
-; CHECK-NEXT:    [[OP_EXTRA6:%.*]] = fadd fast float [[OP_EXTRA5]], 5.000000e+00
-; CHECK-NEXT:    [[OP_EXTRA7:%.*]] = fadd fast float [[OP_EXTRA6]], [[CONV]]
+; CHECK-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; CHECK-NEXT:    [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], 5.000000e+00
+; CHECK-NEXT:    [[BIN_EXTRA6:%.*]] = fadd fast float [[BIN_EXTRA5]], 5.000000e+00
+; CHECK-NEXT:    [[BIN_EXTRA7:%.*]] = fadd fast float [[BIN_EXTRA6]], [[CONV]]
 ; CHECK-NEXT:    [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
-; CHECK-NEXT:    ret float [[OP_EXTRA7]]
+; CHECK-NEXT:    ret float [[BIN_EXTRA7]]
 ;
 ; THRESHOLD-LABEL: @extra_args_same_several_times(
 ; THRESHOLD-NEXT:  entry:
@@ -1515,12 +1509,12 @@ define float @extra_args_same_several_ti
 ; THRESHOLD-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; THRESHOLD-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; THRESHOLD-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
-; THRESHOLD-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
-; THRESHOLD-NEXT:    [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00
-; THRESHOLD-NEXT:    [[OP_EXTRA6:%.*]] = fadd fast float [[OP_EXTRA5]], 5.000000e+00
-; THRESHOLD-NEXT:    [[OP_EXTRA7:%.*]] = fadd fast float [[OP_EXTRA6]], [[CONV]]
+; THRESHOLD-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; THRESHOLD-NEXT:    [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], 5.000000e+00
+; THRESHOLD-NEXT:    [[BIN_EXTRA6:%.*]] = fadd fast float [[BIN_EXTRA5]], 5.000000e+00
+; THRESHOLD-NEXT:    [[BIN_EXTRA7:%.*]] = fadd fast float [[BIN_EXTRA6]], [[CONV]]
 ; THRESHOLD-NEXT:    [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
-; THRESHOLD-NEXT:    ret float [[OP_EXTRA7]]
+; THRESHOLD-NEXT:    ret float [[BIN_EXTRA7]]
 ;
   entry:
   %mul = mul nsw i32 %b, %a
@@ -1587,10 +1581,10 @@ define float @extra_args_no_replace(floa
 ; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
-; CHECK-NEXT:    [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
+; CHECK-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; CHECK-NEXT:    [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]]
 ; CHECK-NEXT:    [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
-; CHECK-NEXT:    ret float [[OP_EXTRA5]]
+; CHECK-NEXT:    ret float [[BIN_EXTRA5]]
 ;
 ; THRESHOLD-LABEL: @extra_args_no_replace(
 ; THRESHOLD-NEXT:  entry:
@@ -1623,10 +1617,10 @@ define float @extra_args_no_replace(floa
 ; THRESHOLD-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; THRESHOLD-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; THRESHOLD-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
-; THRESHOLD-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
-; THRESHOLD-NEXT:    [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
+; THRESHOLD-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; THRESHOLD-NEXT:    [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]]
 ; THRESHOLD-NEXT:    [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
-; THRESHOLD-NEXT:    ret float [[OP_EXTRA5]]
+; THRESHOLD-NEXT:    ret float [[BIN_EXTRA5]]
 ;
   entry:
   %mul = mul nsw i32 %b, %a
@@ -1685,10 +1679,10 @@ define i32 @wobble(i32 %arg, i32 %bar) {
 ; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]]
-; CHECK-NEXT:    [[OP_EXTRA3:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP9]]
+; CHECK-NEXT:    [[BIN_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]]
+; CHECK-NEXT:    [[BIN_EXTRA3:%.*]] = add nsw i32 [[BIN_EXTRA]], [[TMP9]]
 ; CHECK-NEXT:    [[R5:%.*]] = add nsw i32 [[R4]], undef
-; CHECK-NEXT:    ret i32 [[OP_EXTRA3]]
+; CHECK-NEXT:    ret i32 [[BIN_EXTRA3]]
 ;
 ; THRESHOLD-LABEL: @wobble(
 ; THRESHOLD-NEXT:  bb:
@@ -1713,10 +1707,10 @@ define i32 @wobble(i32 %arg, i32 %bar) {
 ; THRESHOLD-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; THRESHOLD-NEXT:    [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
 ; THRESHOLD-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
-; THRESHOLD-NEXT:    [[OP_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]]
-; THRESHOLD-NEXT:    [[OP_EXTRA3:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP9]]
+; THRESHOLD-NEXT:    [[BIN_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]]
+; THRESHOLD-NEXT:    [[BIN_EXTRA3:%.*]] = add nsw i32 [[BIN_EXTRA]], [[TMP9]]
 ; THRESHOLD-NEXT:    [[R5:%.*]] = add nsw i32 [[R4]], undef
-; THRESHOLD-NEXT:    ret i32 [[OP_EXTRA3]]
+; THRESHOLD-NEXT:    ret i32 [[BIN_EXTRA3]]
 ;
   bb:
   %x1 = xor i32 %arg, %bar

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll?rev=313409&r1=313408&r2=313409&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll Fri Sep 15 15:23:27 2017
@@ -2,11 +2,10 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux -slp-vectorizer -S | FileCheck %s
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -slp-vectorizer -S | FileCheck %s --check-prefix=AVX
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -slp-vectorizer -S | FileCheck %s --check-prefix=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=skx -slp-vectorizer -S -slp-threshold=-100 | FileCheck %s --check-prefix=SKX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=skx -slp-vectorizer -S | FileCheck %s --check-prefix=SKX
 
 @arr = local_unnamed_addr global [32 x i32] zeroinitializer, align 16
 @arr1 = local_unnamed_addr global [32 x float] zeroinitializer, align 16
- at var = global i32 zeroinitializer, align 8
 
 define i32 @maxi8(i32) {
 ; CHECK-LABEL: @maxi8(
@@ -35,112 +34,79 @@ define i32 @maxi8(i32) {
 ; CHECK-NEXT:    ret i32 [[TMP23]]
 ;
 ; AVX-LABEL: @maxi8(
-; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16
-; AVX-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP2]], i32 0
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP2]], i32 1
-; AVX-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
-; AVX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2
-; AVX-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
-; AVX-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3
-; AVX-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
-; AVX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef
-; AVX-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4
-; AVX-NEXT:    [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
-; AVX-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef
-; AVX-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5
-; AVX-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]]
-; AVX-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef
-; AVX-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP2]], i32 6
-; AVX-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]]
-; AVX-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 undef
-; AVX-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP2]], i32 7
-; AVX-NEXT:    [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]]
-; AVX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i32> [[TMP2]], <8 x i32> [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <8 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <8 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> [[RDX_SHUF4]]
-; AVX-NEXT:    [[TMP24:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0
-; AVX-NEXT:    [[TMP25:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 undef
-; AVX-NEXT:    ret i32 [[TMP24]]
+; AVX-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+; AVX-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; AVX-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
+; AVX-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
+; AVX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
+; AVX-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
+; AVX-NEXT:    [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+; AVX-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
+; AVX-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
+; AVX-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+; AVX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; AVX-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
+; AVX-NEXT:    [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+; AVX-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
+; AVX-NEXT:    [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+; AVX-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
+; AVX-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
+; AVX-NEXT:    [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+; AVX-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
+; AVX-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
+; AVX-NEXT:    ret i32 [[TMP23]]
 ;
 ; AVX2-LABEL: @maxi8(
-; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16
-; AVX2-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP2]], i32 0
-; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP2]], i32 1
-; AVX2-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
-; AVX2-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2
-; AVX2-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
-; AVX2-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef
-; AVX2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3
-; AVX2-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
-; AVX2-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef
-; AVX2-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4
-; AVX2-NEXT:    [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
-; AVX2-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef
-; AVX2-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5
-; AVX2-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]]
-; AVX2-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef
-; AVX2-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP2]], i32 6
-; AVX2-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]]
-; AVX2-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 undef
-; AVX2-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP2]], i32 7
-; AVX2-NEXT:    [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]]
-; AVX2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i32> [[TMP2]], <8 x i32> [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <8 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <8 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> [[RDX_SHUF4]]
-; AVX2-NEXT:    [[TMP24:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0
-; AVX2-NEXT:    [[TMP25:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 undef
-; AVX2-NEXT:    ret i32 [[TMP24]]
+; AVX2-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+; AVX2-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+; AVX2-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; AVX2-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; AVX2-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
+; AVX2-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
+; AVX2-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
+; AVX2-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
+; AVX2-NEXT:    [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+; AVX2-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
+; AVX2-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
+; AVX2-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+; AVX2-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; AVX2-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
+; AVX2-NEXT:    [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+; AVX2-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
+; AVX2-NEXT:    [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+; AVX2-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
+; AVX2-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
+; AVX2-NEXT:    [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+; AVX2-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
+; AVX2-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
+; AVX2-NEXT:    ret i32 [[TMP23]]
 ;
 ; SKX-LABEL: @maxi8(
-; SKX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16
-; SKX-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP2]], i32 0
-; SKX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP2]], i32 1
-; SKX-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
-; SKX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef
-; SKX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2
-; SKX-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
-; SKX-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef
-; SKX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3
-; SKX-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
-; SKX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef
-; SKX-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4
-; SKX-NEXT:    [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
-; SKX-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef
-; SKX-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5
-; SKX-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]]
-; SKX-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef
-; SKX-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP2]], i32 6
-; SKX-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]]
-; SKX-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 undef
-; SKX-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP2]], i32 7
-; SKX-NEXT:    [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]]
-; SKX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i32> [[TMP2]], <8 x i32> [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <8 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <8 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> [[RDX_SHUF4]]
-; SKX-NEXT:    [[TMP24:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0
-; SKX-NEXT:    [[TMP25:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 undef
-; SKX-NEXT:    ret i32 [[TMP24]]
+; SKX-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+; SKX-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+; SKX-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; SKX-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; SKX-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
+; SKX-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
+; SKX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
+; SKX-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
+; SKX-NEXT:    [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+; SKX-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
+; SKX-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
+; SKX-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+; SKX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; SKX-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
+; SKX-NEXT:    [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+; SKX-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
+; SKX-NEXT:    [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+; SKX-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
+; SKX-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
+; SKX-NEXT:    [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+; SKX-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
+; SKX-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
+; SKX-NEXT:    ret i32 [[TMP23]]
 ;
   %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
   %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@@ -218,193 +184,151 @@ define i32 @maxi16(i32) {
 ; CHECK-NEXT:    ret i32 [[TMP47]]
 ;
 ; AVX-LABEL: @maxi16(
-; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16
-; AVX-NEXT:    [[TMP3:%.*]] = extractelement <16 x i32> [[TMP2]], i32 0
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <16 x i32> [[TMP2]], i32 1
-; AVX-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
-; AVX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <16 x i32> [[TMP2]], i32 2
-; AVX-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
-; AVX-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <16 x i32> [[TMP2]], i32 3
-; AVX-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
-; AVX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef
-; AVX-NEXT:    [[TMP13:%.*]] = extractelement <16 x i32> [[TMP2]], i32 4
-; AVX-NEXT:    [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
-; AVX-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef
-; AVX-NEXT:    [[TMP16:%.*]] = extractelement <16 x i32> [[TMP2]], i32 5
-; AVX-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]]
-; AVX-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef
-; AVX-NEXT:    [[TMP19:%.*]] = extractelement <16 x i32> [[TMP2]], i32 6
-; AVX-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]]
-; AVX-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 undef
-; AVX-NEXT:    [[TMP22:%.*]] = extractelement <16 x i32> [[TMP2]], i32 7
-; AVX-NEXT:    [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]]
-; AVX-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 undef
-; AVX-NEXT:    [[TMP25:%.*]] = extractelement <16 x i32> [[TMP2]], i32 8
-; AVX-NEXT:    [[TMP26:%.*]] = icmp sgt i32 [[TMP24]], [[TMP25]]
-; AVX-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP24]], i32 undef
-; AVX-NEXT:    [[TMP28:%.*]] = extractelement <16 x i32> [[TMP2]], i32 9
-; AVX-NEXT:    [[TMP29:%.*]] = icmp sgt i32 [[TMP27]], [[TMP28]]
-; AVX-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP27]], i32 undef
-; AVX-NEXT:    [[TMP31:%.*]] = extractelement <16 x i32> [[TMP2]], i32 10
-; AVX-NEXT:    [[TMP32:%.*]] = icmp sgt i32 [[TMP30]], [[TMP31]]
-; AVX-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP30]], i32 undef
-; AVX-NEXT:    [[TMP34:%.*]] = extractelement <16 x i32> [[TMP2]], i32 11
-; AVX-NEXT:    [[TMP35:%.*]] = icmp sgt i32 [[TMP33]], [[TMP34]]
-; AVX-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP33]], i32 undef
-; AVX-NEXT:    [[TMP37:%.*]] = extractelement <16 x i32> [[TMP2]], i32 12
-; AVX-NEXT:    [[TMP38:%.*]] = icmp sgt i32 [[TMP36]], [[TMP37]]
-; AVX-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[TMP36]], i32 undef
-; AVX-NEXT:    [[TMP40:%.*]] = extractelement <16 x i32> [[TMP2]], i32 13
-; AVX-NEXT:    [[TMP41:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]]
-; AVX-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], i32 [[TMP39]], i32 undef
-; AVX-NEXT:    [[TMP43:%.*]] = extractelement <16 x i32> [[TMP2]], i32 14
-; AVX-NEXT:    [[TMP44:%.*]] = icmp sgt i32 [[TMP42]], [[TMP43]]
-; AVX-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP42]], i32 undef
-; AVX-NEXT:    [[TMP46:%.*]] = extractelement <16 x i32> [[TMP2]], i32 15
-; AVX-NEXT:    [[TMP47:%.*]] = icmp sgt i32 [[TMP45]], [[TMP46]]
-; AVX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x i32> [[TMP2]], <16 x i32> [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT]], <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP2]], <16 x i32> [[RDX_MINMAX_SELECT]], <16 x i32> [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT3]], <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP5]], <16 x i32> [[RDX_MINMAX_SELECT3]], <16 x i32> [[RDX_SHUF4]]
-; AVX-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT6]], <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x i32> [[RDX_MINMAX_SELECT6]], <16 x i32> [[RDX_SHUF7]]
-; AVX-NEXT:    [[TMP48:%.*]] = extractelement <16 x i32> [[RDX_MINMAX_SELECT9]], i32 0
-; AVX-NEXT:    [[TMP49:%.*]] = select i1 [[TMP47]], i32 [[TMP45]], i32 undef
-; AVX-NEXT:    ret i32 [[TMP48]]
+; AVX-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+; AVX-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; AVX-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
+; AVX-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
+; AVX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
+; AVX-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
+; AVX-NEXT:    [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+; AVX-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
+; AVX-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
+; AVX-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+; AVX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; AVX-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
+; AVX-NEXT:    [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+; AVX-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
+; AVX-NEXT:    [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+; AVX-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
+; AVX-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
+; AVX-NEXT:    [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+; AVX-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
+; AVX-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
+; AVX-NEXT:    [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
+; AVX-NEXT:    [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]]
+; AVX-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]]
+; AVX-NEXT:    [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
+; AVX-NEXT:    [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]]
+; AVX-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]]
+; AVX-NEXT:    [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
+; AVX-NEXT:    [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]]
+; AVX-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]]
+; AVX-NEXT:    [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
+; AVX-NEXT:    [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]]
+; AVX-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]]
+; AVX-NEXT:    [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
+; AVX-NEXT:    [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
+; AVX-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]]
+; AVX-NEXT:    [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
+; AVX-NEXT:    [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
+; AVX-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]]
+; AVX-NEXT:    [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
+; AVX-NEXT:    [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
+; AVX-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]]
+; AVX-NEXT:    [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
+; AVX-NEXT:    [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
+; AVX-NEXT:    [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]]
+; AVX-NEXT:    ret i32 [[TMP47]]
 ;
 ; AVX2-LABEL: @maxi16(
-; AVX2-NEXT:    [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16
-; AVX2-NEXT:    [[TMP3:%.*]] = extractelement <16 x i32> [[TMP2]], i32 0
-; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <16 x i32> [[TMP2]], i32 1
-; AVX2-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
-; AVX2-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <16 x i32> [[TMP2]], i32 2
-; AVX2-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
-; AVX2-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef
-; AVX2-NEXT:    [[TMP10:%.*]] = extractelement <16 x i32> [[TMP2]], i32 3
-; AVX2-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
-; AVX2-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef
-; AVX2-NEXT:    [[TMP13:%.*]] = extractelement <16 x i32> [[TMP2]], i32 4
-; AVX2-NEXT:    [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
-; AVX2-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef
-; AVX2-NEXT:    [[TMP16:%.*]] = extractelement <16 x i32> [[TMP2]], i32 5
-; AVX2-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]]
-; AVX2-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef
-; AVX2-NEXT:    [[TMP19:%.*]] = extractelement <16 x i32> [[TMP2]], i32 6
-; AVX2-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]]
-; AVX2-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 undef
-; AVX2-NEXT:    [[TMP22:%.*]] = extractelement <16 x i32> [[TMP2]], i32 7
-; AVX2-NEXT:    [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]]
-; AVX2-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 undef
-; AVX2-NEXT:    [[TMP25:%.*]] = extractelement <16 x i32> [[TMP2]], i32 8
-; AVX2-NEXT:    [[TMP26:%.*]] = icmp sgt i32 [[TMP24]], [[TMP25]]
-; AVX2-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP24]], i32 undef
-; AVX2-NEXT:    [[TMP28:%.*]] = extractelement <16 x i32> [[TMP2]], i32 9
-; AVX2-NEXT:    [[TMP29:%.*]] = icmp sgt i32 [[TMP27]], [[TMP28]]
-; AVX2-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP27]], i32 undef
-; AVX2-NEXT:    [[TMP31:%.*]] = extractelement <16 x i32> [[TMP2]], i32 10
-; AVX2-NEXT:    [[TMP32:%.*]] = icmp sgt i32 [[TMP30]], [[TMP31]]
-; AVX2-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP30]], i32 undef
-; AVX2-NEXT:    [[TMP34:%.*]] = extractelement <16 x i32> [[TMP2]], i32 11
-; AVX2-NEXT:    [[TMP35:%.*]] = icmp sgt i32 [[TMP33]], [[TMP34]]
-; AVX2-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP33]], i32 undef
-; AVX2-NEXT:    [[TMP37:%.*]] = extractelement <16 x i32> [[TMP2]], i32 12
-; AVX2-NEXT:    [[TMP38:%.*]] = icmp sgt i32 [[TMP36]], [[TMP37]]
-; AVX2-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[TMP36]], i32 undef
-; AVX2-NEXT:    [[TMP40:%.*]] = extractelement <16 x i32> [[TMP2]], i32 13
-; AVX2-NEXT:    [[TMP41:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]]
-; AVX2-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], i32 [[TMP39]], i32 undef
-; AVX2-NEXT:    [[TMP43:%.*]] = extractelement <16 x i32> [[TMP2]], i32 14
-; AVX2-NEXT:    [[TMP44:%.*]] = icmp sgt i32 [[TMP42]], [[TMP43]]
-; AVX2-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP42]], i32 undef
-; AVX2-NEXT:    [[TMP46:%.*]] = extractelement <16 x i32> [[TMP2]], i32 15
-; AVX2-NEXT:    [[TMP47:%.*]] = icmp sgt i32 [[TMP45]], [[TMP46]]
-; AVX2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x i32> [[TMP2]], <16 x i32> [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT]], <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP2]], <16 x i32> [[RDX_MINMAX_SELECT]], <16 x i32> [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT3]], <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP5]], <16 x i32> [[RDX_MINMAX_SELECT3]], <16 x i32> [[RDX_SHUF4]]
-; AVX2-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT6]], <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x i32> [[RDX_MINMAX_SELECT6]], <16 x i32> [[RDX_SHUF7]]
-; AVX2-NEXT:    [[TMP48:%.*]] = extractelement <16 x i32> [[RDX_MINMAX_SELECT9]], i32 0
-; AVX2-NEXT:    [[TMP49:%.*]] = select i1 [[TMP47]], i32 [[TMP45]], i32 undef
-; AVX2-NEXT:    ret i32 [[TMP48]]
+; AVX2-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+; AVX2-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+; AVX2-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; AVX2-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; AVX2-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
+; AVX2-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
+; AVX2-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
+; AVX2-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
+; AVX2-NEXT:    [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+; AVX2-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
+; AVX2-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
+; AVX2-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+; AVX2-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; AVX2-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
+; AVX2-NEXT:    [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+; AVX2-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
+; AVX2-NEXT:    [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+; AVX2-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
+; AVX2-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
+; AVX2-NEXT:    [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+; AVX2-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
+; AVX2-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
+; AVX2-NEXT:    [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
+; AVX2-NEXT:    [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]]
+; AVX2-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]]
+; AVX2-NEXT:    [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
+; AVX2-NEXT:    [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]]
+; AVX2-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]]
+; AVX2-NEXT:    [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
+; AVX2-NEXT:    [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]]
+; AVX2-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]]
+; AVX2-NEXT:    [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
+; AVX2-NEXT:    [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]]
+; AVX2-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]]
+; AVX2-NEXT:    [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
+; AVX2-NEXT:    [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
+; AVX2-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]]
+; AVX2-NEXT:    [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
+; AVX2-NEXT:    [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
+; AVX2-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]]
+; AVX2-NEXT:    [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
+; AVX2-NEXT:    [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
+; AVX2-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]]
+; AVX2-NEXT:    [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
+; AVX2-NEXT:    [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
+; AVX2-NEXT:    [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]]
+; AVX2-NEXT:    ret i32 [[TMP47]]
 ;
 ; SKX-LABEL: @maxi16(
-; SKX-NEXT:    [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16
-; SKX-NEXT:    [[TMP3:%.*]] = extractelement <16 x i32> [[TMP2]], i32 0
-; SKX-NEXT:    [[TMP4:%.*]] = extractelement <16 x i32> [[TMP2]], i32 1
-; SKX-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
-; SKX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef
-; SKX-NEXT:    [[TMP7:%.*]] = extractelement <16 x i32> [[TMP2]], i32 2
-; SKX-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
-; SKX-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef
-; SKX-NEXT:    [[TMP10:%.*]] = extractelement <16 x i32> [[TMP2]], i32 3
-; SKX-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
-; SKX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef
-; SKX-NEXT:    [[TMP13:%.*]] = extractelement <16 x i32> [[TMP2]], i32 4
-; SKX-NEXT:    [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
-; SKX-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef
-; SKX-NEXT:    [[TMP16:%.*]] = extractelement <16 x i32> [[TMP2]], i32 5
-; SKX-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]]
-; SKX-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef
-; SKX-NEXT:    [[TMP19:%.*]] = extractelement <16 x i32> [[TMP2]], i32 6
-; SKX-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]]
-; SKX-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 undef
-; SKX-NEXT:    [[TMP22:%.*]] = extractelement <16 x i32> [[TMP2]], i32 7
-; SKX-NEXT:    [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]]
-; SKX-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 undef
-; SKX-NEXT:    [[TMP25:%.*]] = extractelement <16 x i32> [[TMP2]], i32 8
-; SKX-NEXT:    [[TMP26:%.*]] = icmp sgt i32 [[TMP24]], [[TMP25]]
-; SKX-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP24]], i32 undef
-; SKX-NEXT:    [[TMP28:%.*]] = extractelement <16 x i32> [[TMP2]], i32 9
-; SKX-NEXT:    [[TMP29:%.*]] = icmp sgt i32 [[TMP27]], [[TMP28]]
-; SKX-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP27]], i32 undef
-; SKX-NEXT:    [[TMP31:%.*]] = extractelement <16 x i32> [[TMP2]], i32 10
-; SKX-NEXT:    [[TMP32:%.*]] = icmp sgt i32 [[TMP30]], [[TMP31]]
-; SKX-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP30]], i32 undef
-; SKX-NEXT:    [[TMP34:%.*]] = extractelement <16 x i32> [[TMP2]], i32 11
-; SKX-NEXT:    [[TMP35:%.*]] = icmp sgt i32 [[TMP33]], [[TMP34]]
-; SKX-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP33]], i32 undef
-; SKX-NEXT:    [[TMP37:%.*]] = extractelement <16 x i32> [[TMP2]], i32 12
-; SKX-NEXT:    [[TMP38:%.*]] = icmp sgt i32 [[TMP36]], [[TMP37]]
-; SKX-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[TMP36]], i32 undef
-; SKX-NEXT:    [[TMP40:%.*]] = extractelement <16 x i32> [[TMP2]], i32 13
-; SKX-NEXT:    [[TMP41:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]]
-; SKX-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], i32 [[TMP39]], i32 undef
-; SKX-NEXT:    [[TMP43:%.*]] = extractelement <16 x i32> [[TMP2]], i32 14
-; SKX-NEXT:    [[TMP44:%.*]] = icmp sgt i32 [[TMP42]], [[TMP43]]
-; SKX-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP42]], i32 undef
-; SKX-NEXT:    [[TMP46:%.*]] = extractelement <16 x i32> [[TMP2]], i32 15
-; SKX-NEXT:    [[TMP47:%.*]] = icmp sgt i32 [[TMP45]], [[TMP46]]
-; SKX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x i32> [[TMP2]], <16 x i32> [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT]], <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP2]], <16 x i32> [[RDX_MINMAX_SELECT]], <16 x i32> [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT3]], <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP5]], <16 x i32> [[RDX_MINMAX_SELECT3]], <16 x i32> [[RDX_SHUF4]]
-; SKX-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT6]], <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x i32> [[RDX_MINMAX_SELECT6]], <16 x i32> [[RDX_SHUF7]]
-; SKX-NEXT:    [[TMP48:%.*]] = extractelement <16 x i32> [[RDX_MINMAX_SELECT9]], i32 0
-; SKX-NEXT:    [[TMP49:%.*]] = select i1 [[TMP47]], i32 [[TMP45]], i32 undef
-; SKX-NEXT:    ret i32 [[TMP48]]
+; SKX-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+; SKX-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+; SKX-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; SKX-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; SKX-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
+; SKX-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
+; SKX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
+; SKX-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
+; SKX-NEXT:    [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+; SKX-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
+; SKX-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
+; SKX-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+; SKX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; SKX-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
+; SKX-NEXT:    [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+; SKX-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
+; SKX-NEXT:    [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+; SKX-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
+; SKX-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
+; SKX-NEXT:    [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+; SKX-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
+; SKX-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
+; SKX-NEXT:    [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
+; SKX-NEXT:    [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]]
+; SKX-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]]
+; SKX-NEXT:    [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
+; SKX-NEXT:    [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]]
+; SKX-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]]
+; SKX-NEXT:    [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
+; SKX-NEXT:    [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]]
+; SKX-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]]
+; SKX-NEXT:    [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
+; SKX-NEXT:    [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]]
+; SKX-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]]
+; SKX-NEXT:    [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
+; SKX-NEXT:    [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
+; SKX-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]]
+; SKX-NEXT:    [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
+; SKX-NEXT:    [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
+; SKX-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]]
+; SKX-NEXT:    [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
+; SKX-NEXT:    [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
+; SKX-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]]
+; SKX-NEXT:    [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
+; SKX-NEXT:    [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
+; SKX-NEXT:    [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]]
+; SKX-NEXT:    ret i32 [[TMP47]]
 ;
   %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
   %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@@ -457,460 +381,392 @@ define i32 @maxi16(i32) {
 
 define i32 @maxi32(i32) {
 ; CHECK-LABEL: @maxi32(
-; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <32 x i32> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <32 x i32> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <32 x i32> [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <32 x i32> [[TMP2]], i32 3
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <32 x i32> [[TMP2]], i32 4
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <32 x i32> [[TMP2]], i32 5
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <32 x i32> [[TMP2]], i32 6
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]]
-; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 undef
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <32 x i32> [[TMP2]], i32 7
-; CHECK-NEXT:    [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]]
-; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 undef
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <32 x i32> [[TMP2]], i32 8
-; CHECK-NEXT:    [[TMP26:%.*]] = icmp sgt i32 [[TMP24]], [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP24]], i32 undef
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <32 x i32> [[TMP2]], i32 9
-; CHECK-NEXT:    [[TMP29:%.*]] = icmp sgt i32 [[TMP27]], [[TMP28]]
-; CHECK-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP27]], i32 undef
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <32 x i32> [[TMP2]], i32 10
-; CHECK-NEXT:    [[TMP32:%.*]] = icmp sgt i32 [[TMP30]], [[TMP31]]
-; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP30]], i32 undef
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <32 x i32> [[TMP2]], i32 11
-; CHECK-NEXT:    [[TMP35:%.*]] = icmp sgt i32 [[TMP33]], [[TMP34]]
-; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP33]], i32 undef
-; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <32 x i32> [[TMP2]], i32 12
-; CHECK-NEXT:    [[TMP38:%.*]] = icmp sgt i32 [[TMP36]], [[TMP37]]
-; CHECK-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[TMP36]], i32 undef
-; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <32 x i32> [[TMP2]], i32 13
-; CHECK-NEXT:    [[TMP41:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]]
-; CHECK-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], i32 [[TMP39]], i32 undef
-; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <32 x i32> [[TMP2]], i32 14
-; CHECK-NEXT:    [[TMP44:%.*]] = icmp sgt i32 [[TMP42]], [[TMP43]]
-; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP42]], i32 undef
-; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <32 x i32> [[TMP2]], i32 15
-; CHECK-NEXT:    [[TMP47:%.*]] = icmp sgt i32 [[TMP45]], [[TMP46]]
-; CHECK-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], i32 [[TMP45]], i32 undef
-; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <32 x i32> [[TMP2]], i32 16
-; CHECK-NEXT:    [[TMP50:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]]
-; CHECK-NEXT:    [[TMP51:%.*]] = select i1 [[TMP50]], i32 [[TMP48]], i32 undef
-; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <32 x i32> [[TMP2]], i32 17
-; CHECK-NEXT:    [[TMP53:%.*]] = icmp sgt i32 [[TMP51]], [[TMP52]]
-; CHECK-NEXT:    [[TMP54:%.*]] = select i1 [[TMP53]], i32 [[TMP51]], i32 undef
-; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <32 x i32> [[TMP2]], i32 18
-; CHECK-NEXT:    [[TMP56:%.*]] = icmp sgt i32 [[TMP54]], [[TMP55]]
-; CHECK-NEXT:    [[TMP57:%.*]] = select i1 [[TMP56]], i32 [[TMP54]], i32 undef
-; CHECK-NEXT:    [[TMP58:%.*]] = extractelement <32 x i32> [[TMP2]], i32 19
-; CHECK-NEXT:    [[TMP59:%.*]] = icmp sgt i32 [[TMP57]], [[TMP58]]
-; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 [[TMP57]], i32 undef
-; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <32 x i32> [[TMP2]], i32 20
-; CHECK-NEXT:    [[TMP62:%.*]] = icmp sgt i32 [[TMP60]], [[TMP61]]
-; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP62]], i32 [[TMP60]], i32 undef
-; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <32 x i32> [[TMP2]], i32 21
-; CHECK-NEXT:    [[TMP65:%.*]] = icmp sgt i32 [[TMP63]], [[TMP64]]
-; CHECK-NEXT:    [[TMP66:%.*]] = select i1 [[TMP65]], i32 [[TMP63]], i32 undef
-; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <32 x i32> [[TMP2]], i32 22
-; CHECK-NEXT:    [[TMP68:%.*]] = icmp sgt i32 [[TMP66]], [[TMP67]]
-; CHECK-NEXT:    [[TMP69:%.*]] = select i1 [[TMP68]], i32 [[TMP66]], i32 undef
-; CHECK-NEXT:    [[TMP70:%.*]] = extractelement <32 x i32> [[TMP2]], i32 23
-; CHECK-NEXT:    [[TMP71:%.*]] = icmp sgt i32 [[TMP69]], [[TMP70]]
-; CHECK-NEXT:    [[TMP72:%.*]] = select i1 [[TMP71]], i32 [[TMP69]], i32 undef
-; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <32 x i32> [[TMP2]], i32 24
-; CHECK-NEXT:    [[TMP74:%.*]] = icmp sgt i32 [[TMP72]], [[TMP73]]
-; CHECK-NEXT:    [[TMP75:%.*]] = select i1 [[TMP74]], i32 [[TMP72]], i32 undef
-; CHECK-NEXT:    [[TMP76:%.*]] = extractelement <32 x i32> [[TMP2]], i32 25
-; CHECK-NEXT:    [[TMP77:%.*]] = icmp sgt i32 [[TMP75]], [[TMP76]]
-; CHECK-NEXT:    [[TMP78:%.*]] = select i1 [[TMP77]], i32 [[TMP75]], i32 undef
-; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <32 x i32> [[TMP2]], i32 26
-; CHECK-NEXT:    [[TMP80:%.*]] = icmp sgt i32 [[TMP78]], [[TMP79]]
-; CHECK-NEXT:    [[TMP81:%.*]] = select i1 [[TMP80]], i32 [[TMP78]], i32 undef
-; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <32 x i32> [[TMP2]], i32 27
-; CHECK-NEXT:    [[TMP83:%.*]] = icmp sgt i32 [[TMP81]], [[TMP82]]
-; CHECK-NEXT:    [[TMP84:%.*]] = select i1 [[TMP83]], i32 [[TMP81]], i32 undef
-; CHECK-NEXT:    [[TMP85:%.*]] = extractelement <32 x i32> [[TMP2]], i32 28
-; CHECK-NEXT:    [[TMP86:%.*]] = icmp sgt i32 [[TMP84]], [[TMP85]]
-; CHECK-NEXT:    [[TMP87:%.*]] = select i1 [[TMP86]], i32 [[TMP84]], i32 undef
-; CHECK-NEXT:    [[TMP88:%.*]] = extractelement <32 x i32> [[TMP2]], i32 29
-; CHECK-NEXT:    [[TMP89:%.*]] = icmp sgt i32 [[TMP87]], [[TMP88]]
-; CHECK-NEXT:    [[TMP90:%.*]] = select i1 [[TMP89]], i32 [[TMP87]], i32 undef
-; CHECK-NEXT:    [[TMP91:%.*]] = extractelement <32 x i32> [[TMP2]], i32 30
-; CHECK-NEXT:    [[TMP92:%.*]] = icmp sgt i32 [[TMP90]], [[TMP91]]
-; CHECK-NEXT:    [[TMP93:%.*]] = select i1 [[TMP92]], i32 [[TMP90]], i32 undef
-; CHECK-NEXT:    [[TMP94:%.*]] = extractelement <32 x i32> [[TMP2]], i32 31
-; CHECK-NEXT:    [[TMP95:%.*]] = icmp sgt i32 [[TMP93]], [[TMP94]]
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT]], <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP2]], <32 x i32> [[RDX_MINMAX_SELECT]], <32 x i32> [[RDX_SHUF1]]
-; CHECK-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT3]], <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP5]], <32 x i32> [[RDX_MINMAX_SELECT3]], <32 x i32> [[RDX_SHUF4]]
-; CHECK-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT6]], <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP8]], <32 x i32> [[RDX_MINMAX_SELECT6]], <32 x i32> [[RDX_SHUF7]]
-; CHECK-NEXT:    [[RDX_SHUF10:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT9]], <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[RDX_MINMAX_CMP11:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT9]], [[RDX_SHUF10]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT12:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP11]], <32 x i32> [[RDX_MINMAX_SELECT9]], <32 x i32> [[RDX_SHUF10]]
-; CHECK-NEXT:    [[TMP96:%.*]] = extractelement <32 x i32> [[RDX_MINMAX_SELECT12]], i32 0
-; CHECK-NEXT:    [[TMP97:%.*]] = select i1 [[TMP95]], i32 [[TMP93]], i32 undef
-; CHECK-NEXT:    ret i32 [[TMP96]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
+; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
+; CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]]
+; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]]
+; CHECK-NEXT:    [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]]
+; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]]
+; CHECK-NEXT:    [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]]
+; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]]
+; CHECK-NEXT:    [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
+; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]]
+; CHECK-NEXT:    [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
+; CHECK-NEXT:    [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
+; CHECK-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]]
+; CHECK-NEXT:    [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]]
+; CHECK-NEXT:    [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
+; CHECK-NEXT:    [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]]
+; CHECK-NEXT:    [[TMP48:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 16), align 16
+; CHECK-NEXT:    [[TMP49:%.*]] = icmp sgt i32 [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP47]], i32 [[TMP48]]
+; CHECK-NEXT:    [[TMP51:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 17), align 4
+; CHECK-NEXT:    [[TMP52:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]]
+; CHECK-NEXT:    [[TMP53:%.*]] = select i1 [[TMP52]], i32 [[TMP50]], i32 [[TMP51]]
+; CHECK-NEXT:    [[TMP54:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 18), align 8
+; CHECK-NEXT:    [[TMP55:%.*]] = icmp sgt i32 [[TMP53]], [[TMP54]]
+; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP53]], i32 [[TMP54]]
+; CHECK-NEXT:    [[TMP57:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 19), align 4
+; CHECK-NEXT:    [[TMP58:%.*]] = icmp sgt i32 [[TMP56]], [[TMP57]]
+; CHECK-NEXT:    [[TMP59:%.*]] = select i1 [[TMP58]], i32 [[TMP56]], i32 [[TMP57]]
+; CHECK-NEXT:    [[TMP60:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 20), align 16
+; CHECK-NEXT:    [[TMP61:%.*]] = icmp sgt i32 [[TMP59]], [[TMP60]]
+; CHECK-NEXT:    [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP59]], i32 [[TMP60]]
+; CHECK-NEXT:    [[TMP63:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 21), align 4
+; CHECK-NEXT:    [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], [[TMP63]]
+; CHECK-NEXT:    [[TMP65:%.*]] = select i1 [[TMP64]], i32 [[TMP62]], i32 [[TMP63]]
+; CHECK-NEXT:    [[TMP66:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 22), align 8
+; CHECK-NEXT:    [[TMP67:%.*]] = icmp sgt i32 [[TMP65]], [[TMP66]]
+; CHECK-NEXT:    [[TMP68:%.*]] = select i1 [[TMP67]], i32 [[TMP65]], i32 [[TMP66]]
+; CHECK-NEXT:    [[TMP69:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 23), align 4
+; CHECK-NEXT:    [[TMP70:%.*]] = icmp sgt i32 [[TMP68]], [[TMP69]]
+; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP68]], i32 [[TMP69]]
+; CHECK-NEXT:    [[TMP72:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 24), align 16
+; CHECK-NEXT:    [[TMP73:%.*]] = icmp sgt i32 [[TMP71]], [[TMP72]]
+; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP73]], i32 [[TMP71]], i32 [[TMP72]]
+; CHECK-NEXT:    [[TMP75:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 25), align 4
+; CHECK-NEXT:    [[TMP76:%.*]] = icmp sgt i32 [[TMP74]], [[TMP75]]
+; CHECK-NEXT:    [[TMP77:%.*]] = select i1 [[TMP76]], i32 [[TMP74]], i32 [[TMP75]]
+; CHECK-NEXT:    [[TMP78:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 26), align 8
+; CHECK-NEXT:    [[TMP79:%.*]] = icmp sgt i32 [[TMP77]], [[TMP78]]
+; CHECK-NEXT:    [[TMP80:%.*]] = select i1 [[TMP79]], i32 [[TMP77]], i32 [[TMP78]]
+; CHECK-NEXT:    [[TMP81:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 27), align 4
+; CHECK-NEXT:    [[TMP82:%.*]] = icmp sgt i32 [[TMP80]], [[TMP81]]
+; CHECK-NEXT:    [[TMP83:%.*]] = select i1 [[TMP82]], i32 [[TMP80]], i32 [[TMP81]]
+; CHECK-NEXT:    [[TMP84:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 28), align 16
+; CHECK-NEXT:    [[TMP85:%.*]] = icmp sgt i32 [[TMP83]], [[TMP84]]
+; CHECK-NEXT:    [[TMP86:%.*]] = select i1 [[TMP85]], i32 [[TMP83]], i32 [[TMP84]]
+; CHECK-NEXT:    [[TMP87:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 29), align 4
+; CHECK-NEXT:    [[TMP88:%.*]] = icmp sgt i32 [[TMP86]], [[TMP87]]
+; CHECK-NEXT:    [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[TMP86]], i32 [[TMP87]]
+; CHECK-NEXT:    [[TMP90:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 30), align 8
+; CHECK-NEXT:    [[TMP91:%.*]] = icmp sgt i32 [[TMP89]], [[TMP90]]
+; CHECK-NEXT:    [[TMP92:%.*]] = select i1 [[TMP91]], i32 [[TMP89]], i32 [[TMP90]]
+; CHECK-NEXT:    [[TMP93:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 31), align 4
+; CHECK-NEXT:    [[TMP94:%.*]] = icmp sgt i32 [[TMP92]], [[TMP93]]
+; CHECK-NEXT:    [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP92]], i32 [[TMP93]]
+; CHECK-NEXT:    ret i32 [[TMP95]]
 ;
 ; AVX-LABEL: @maxi32(
-; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16
-; AVX-NEXT:    [[TMP3:%.*]] = extractelement <32 x i32> [[TMP2]], i32 0
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <32 x i32> [[TMP2]], i32 1
-; AVX-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
-; AVX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <32 x i32> [[TMP2]], i32 2
-; AVX-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
-; AVX-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <32 x i32> [[TMP2]], i32 3
-; AVX-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
-; AVX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef
-; AVX-NEXT:    [[TMP13:%.*]] = extractelement <32 x i32> [[TMP2]], i32 4
-; AVX-NEXT:    [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
-; AVX-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef
-; AVX-NEXT:    [[TMP16:%.*]] = extractelement <32 x i32> [[TMP2]], i32 5
-; AVX-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]]
-; AVX-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef
-; AVX-NEXT:    [[TMP19:%.*]] = extractelement <32 x i32> [[TMP2]], i32 6
-; AVX-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]]
-; AVX-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 undef
-; AVX-NEXT:    [[TMP22:%.*]] = extractelement <32 x i32> [[TMP2]], i32 7
-; AVX-NEXT:    [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]]
-; AVX-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 undef
-; AVX-NEXT:    [[TMP25:%.*]] = extractelement <32 x i32> [[TMP2]], i32 8
-; AVX-NEXT:    [[TMP26:%.*]] = icmp sgt i32 [[TMP24]], [[TMP25]]
-; AVX-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP24]], i32 undef
-; AVX-NEXT:    [[TMP28:%.*]] = extractelement <32 x i32> [[TMP2]], i32 9
-; AVX-NEXT:    [[TMP29:%.*]] = icmp sgt i32 [[TMP27]], [[TMP28]]
-; AVX-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP27]], i32 undef
-; AVX-NEXT:    [[TMP31:%.*]] = extractelement <32 x i32> [[TMP2]], i32 10
-; AVX-NEXT:    [[TMP32:%.*]] = icmp sgt i32 [[TMP30]], [[TMP31]]
-; AVX-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP30]], i32 undef
-; AVX-NEXT:    [[TMP34:%.*]] = extractelement <32 x i32> [[TMP2]], i32 11
-; AVX-NEXT:    [[TMP35:%.*]] = icmp sgt i32 [[TMP33]], [[TMP34]]
-; AVX-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP33]], i32 undef
-; AVX-NEXT:    [[TMP37:%.*]] = extractelement <32 x i32> [[TMP2]], i32 12
-; AVX-NEXT:    [[TMP38:%.*]] = icmp sgt i32 [[TMP36]], [[TMP37]]
-; AVX-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[TMP36]], i32 undef
-; AVX-NEXT:    [[TMP40:%.*]] = extractelement <32 x i32> [[TMP2]], i32 13
-; AVX-NEXT:    [[TMP41:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]]
-; AVX-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], i32 [[TMP39]], i32 undef
-; AVX-NEXT:    [[TMP43:%.*]] = extractelement <32 x i32> [[TMP2]], i32 14
-; AVX-NEXT:    [[TMP44:%.*]] = icmp sgt i32 [[TMP42]], [[TMP43]]
-; AVX-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP42]], i32 undef
-; AVX-NEXT:    [[TMP46:%.*]] = extractelement <32 x i32> [[TMP2]], i32 15
-; AVX-NEXT:    [[TMP47:%.*]] = icmp sgt i32 [[TMP45]], [[TMP46]]
-; AVX-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], i32 [[TMP45]], i32 undef
-; AVX-NEXT:    [[TMP49:%.*]] = extractelement <32 x i32> [[TMP2]], i32 16
-; AVX-NEXT:    [[TMP50:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]]
-; AVX-NEXT:    [[TMP51:%.*]] = select i1 [[TMP50]], i32 [[TMP48]], i32 undef
-; AVX-NEXT:    [[TMP52:%.*]] = extractelement <32 x i32> [[TMP2]], i32 17
-; AVX-NEXT:    [[TMP53:%.*]] = icmp sgt i32 [[TMP51]], [[TMP52]]
-; AVX-NEXT:    [[TMP54:%.*]] = select i1 [[TMP53]], i32 [[TMP51]], i32 undef
-; AVX-NEXT:    [[TMP55:%.*]] = extractelement <32 x i32> [[TMP2]], i32 18
-; AVX-NEXT:    [[TMP56:%.*]] = icmp sgt i32 [[TMP54]], [[TMP55]]
-; AVX-NEXT:    [[TMP57:%.*]] = select i1 [[TMP56]], i32 [[TMP54]], i32 undef
-; AVX-NEXT:    [[TMP58:%.*]] = extractelement <32 x i32> [[TMP2]], i32 19
-; AVX-NEXT:    [[TMP59:%.*]] = icmp sgt i32 [[TMP57]], [[TMP58]]
-; AVX-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 [[TMP57]], i32 undef
-; AVX-NEXT:    [[TMP61:%.*]] = extractelement <32 x i32> [[TMP2]], i32 20
-; AVX-NEXT:    [[TMP62:%.*]] = icmp sgt i32 [[TMP60]], [[TMP61]]
-; AVX-NEXT:    [[TMP63:%.*]] = select i1 [[TMP62]], i32 [[TMP60]], i32 undef
-; AVX-NEXT:    [[TMP64:%.*]] = extractelement <32 x i32> [[TMP2]], i32 21
-; AVX-NEXT:    [[TMP65:%.*]] = icmp sgt i32 [[TMP63]], [[TMP64]]
-; AVX-NEXT:    [[TMP66:%.*]] = select i1 [[TMP65]], i32 [[TMP63]], i32 undef
-; AVX-NEXT:    [[TMP67:%.*]] = extractelement <32 x i32> [[TMP2]], i32 22
-; AVX-NEXT:    [[TMP68:%.*]] = icmp sgt i32 [[TMP66]], [[TMP67]]
-; AVX-NEXT:    [[TMP69:%.*]] = select i1 [[TMP68]], i32 [[TMP66]], i32 undef
-; AVX-NEXT:    [[TMP70:%.*]] = extractelement <32 x i32> [[TMP2]], i32 23
-; AVX-NEXT:    [[TMP71:%.*]] = icmp sgt i32 [[TMP69]], [[TMP70]]
-; AVX-NEXT:    [[TMP72:%.*]] = select i1 [[TMP71]], i32 [[TMP69]], i32 undef
-; AVX-NEXT:    [[TMP73:%.*]] = extractelement <32 x i32> [[TMP2]], i32 24
-; AVX-NEXT:    [[TMP74:%.*]] = icmp sgt i32 [[TMP72]], [[TMP73]]
-; AVX-NEXT:    [[TMP75:%.*]] = select i1 [[TMP74]], i32 [[TMP72]], i32 undef
-; AVX-NEXT:    [[TMP76:%.*]] = extractelement <32 x i32> [[TMP2]], i32 25
-; AVX-NEXT:    [[TMP77:%.*]] = icmp sgt i32 [[TMP75]], [[TMP76]]
-; AVX-NEXT:    [[TMP78:%.*]] = select i1 [[TMP77]], i32 [[TMP75]], i32 undef
-; AVX-NEXT:    [[TMP79:%.*]] = extractelement <32 x i32> [[TMP2]], i32 26
-; AVX-NEXT:    [[TMP80:%.*]] = icmp sgt i32 [[TMP78]], [[TMP79]]
-; AVX-NEXT:    [[TMP81:%.*]] = select i1 [[TMP80]], i32 [[TMP78]], i32 undef
-; AVX-NEXT:    [[TMP82:%.*]] = extractelement <32 x i32> [[TMP2]], i32 27
-; AVX-NEXT:    [[TMP83:%.*]] = icmp sgt i32 [[TMP81]], [[TMP82]]
-; AVX-NEXT:    [[TMP84:%.*]] = select i1 [[TMP83]], i32 [[TMP81]], i32 undef
-; AVX-NEXT:    [[TMP85:%.*]] = extractelement <32 x i32> [[TMP2]], i32 28
-; AVX-NEXT:    [[TMP86:%.*]] = icmp sgt i32 [[TMP84]], [[TMP85]]
-; AVX-NEXT:    [[TMP87:%.*]] = select i1 [[TMP86]], i32 [[TMP84]], i32 undef
-; AVX-NEXT:    [[TMP88:%.*]] = extractelement <32 x i32> [[TMP2]], i32 29
-; AVX-NEXT:    [[TMP89:%.*]] = icmp sgt i32 [[TMP87]], [[TMP88]]
-; AVX-NEXT:    [[TMP90:%.*]] = select i1 [[TMP89]], i32 [[TMP87]], i32 undef
-; AVX-NEXT:    [[TMP91:%.*]] = extractelement <32 x i32> [[TMP2]], i32 30
-; AVX-NEXT:    [[TMP92:%.*]] = icmp sgt i32 [[TMP90]], [[TMP91]]
-; AVX-NEXT:    [[TMP93:%.*]] = select i1 [[TMP92]], i32 [[TMP90]], i32 undef
-; AVX-NEXT:    [[TMP94:%.*]] = extractelement <32 x i32> [[TMP2]], i32 31
-; AVX-NEXT:    [[TMP95:%.*]] = icmp sgt i32 [[TMP93]], [[TMP94]]
-; AVX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT]], <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP2]], <32 x i32> [[RDX_MINMAX_SELECT]], <32 x i32> [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT3]], <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP5]], <32 x i32> [[RDX_MINMAX_SELECT3]], <32 x i32> [[RDX_SHUF4]]
-; AVX-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT6]], <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP8]], <32 x i32> [[RDX_MINMAX_SELECT6]], <32 x i32> [[RDX_SHUF7]]
-; AVX-NEXT:    [[RDX_SHUF10:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT9]], <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP11:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT9]], [[RDX_SHUF10]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT12:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP11]], <32 x i32> [[RDX_MINMAX_SELECT9]], <32 x i32> [[RDX_SHUF10]]
-; AVX-NEXT:    [[TMP96:%.*]] = extractelement <32 x i32> [[RDX_MINMAX_SELECT12]], i32 0
-; AVX-NEXT:    [[TMP97:%.*]] = select i1 [[TMP95]], i32 [[TMP93]], i32 undef
-; AVX-NEXT:    ret i32 [[TMP96]]
+; AVX-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+; AVX-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; AVX-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
+; AVX-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
+; AVX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
+; AVX-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
+; AVX-NEXT:    [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+; AVX-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
+; AVX-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
+; AVX-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+; AVX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; AVX-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
+; AVX-NEXT:    [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+; AVX-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
+; AVX-NEXT:    [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+; AVX-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
+; AVX-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
+; AVX-NEXT:    [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+; AVX-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
+; AVX-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
+; AVX-NEXT:    [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
+; AVX-NEXT:    [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]]
+; AVX-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]]
+; AVX-NEXT:    [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
+; AVX-NEXT:    [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]]
+; AVX-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]]
+; AVX-NEXT:    [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
+; AVX-NEXT:    [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]]
+; AVX-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]]
+; AVX-NEXT:    [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
+; AVX-NEXT:    [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]]
+; AVX-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]]
+; AVX-NEXT:    [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
+; AVX-NEXT:    [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
+; AVX-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]]
+; AVX-NEXT:    [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
+; AVX-NEXT:    [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
+; AVX-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]]
+; AVX-NEXT:    [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
+; AVX-NEXT:    [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
+; AVX-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]]
+; AVX-NEXT:    [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
+; AVX-NEXT:    [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
+; AVX-NEXT:    [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]]
+; AVX-NEXT:    [[TMP48:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 16), align 16
+; AVX-NEXT:    [[TMP49:%.*]] = icmp sgt i32 [[TMP47]], [[TMP48]]
+; AVX-NEXT:    [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP47]], i32 [[TMP48]]
+; AVX-NEXT:    [[TMP51:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 17), align 4
+; AVX-NEXT:    [[TMP52:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]]
+; AVX-NEXT:    [[TMP53:%.*]] = select i1 [[TMP52]], i32 [[TMP50]], i32 [[TMP51]]
+; AVX-NEXT:    [[TMP54:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 18), align 8
+; AVX-NEXT:    [[TMP55:%.*]] = icmp sgt i32 [[TMP53]], [[TMP54]]
+; AVX-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP53]], i32 [[TMP54]]
+; AVX-NEXT:    [[TMP57:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 19), align 4
+; AVX-NEXT:    [[TMP58:%.*]] = icmp sgt i32 [[TMP56]], [[TMP57]]
+; AVX-NEXT:    [[TMP59:%.*]] = select i1 [[TMP58]], i32 [[TMP56]], i32 [[TMP57]]
+; AVX-NEXT:    [[TMP60:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 20), align 16
+; AVX-NEXT:    [[TMP61:%.*]] = icmp sgt i32 [[TMP59]], [[TMP60]]
+; AVX-NEXT:    [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP59]], i32 [[TMP60]]
+; AVX-NEXT:    [[TMP63:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 21), align 4
+; AVX-NEXT:    [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], [[TMP63]]
+; AVX-NEXT:    [[TMP65:%.*]] = select i1 [[TMP64]], i32 [[TMP62]], i32 [[TMP63]]
+; AVX-NEXT:    [[TMP66:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 22), align 8
+; AVX-NEXT:    [[TMP67:%.*]] = icmp sgt i32 [[TMP65]], [[TMP66]]
+; AVX-NEXT:    [[TMP68:%.*]] = select i1 [[TMP67]], i32 [[TMP65]], i32 [[TMP66]]
+; AVX-NEXT:    [[TMP69:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 23), align 4
+; AVX-NEXT:    [[TMP70:%.*]] = icmp sgt i32 [[TMP68]], [[TMP69]]
+; AVX-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP68]], i32 [[TMP69]]
+; AVX-NEXT:    [[TMP72:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 24), align 16
+; AVX-NEXT:    [[TMP73:%.*]] = icmp sgt i32 [[TMP71]], [[TMP72]]
+; AVX-NEXT:    [[TMP74:%.*]] = select i1 [[TMP73]], i32 [[TMP71]], i32 [[TMP72]]
+; AVX-NEXT:    [[TMP75:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 25), align 4
+; AVX-NEXT:    [[TMP76:%.*]] = icmp sgt i32 [[TMP74]], [[TMP75]]
+; AVX-NEXT:    [[TMP77:%.*]] = select i1 [[TMP76]], i32 [[TMP74]], i32 [[TMP75]]
+; AVX-NEXT:    [[TMP78:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 26), align 8
+; AVX-NEXT:    [[TMP79:%.*]] = icmp sgt i32 [[TMP77]], [[TMP78]]
+; AVX-NEXT:    [[TMP80:%.*]] = select i1 [[TMP79]], i32 [[TMP77]], i32 [[TMP78]]
+; AVX-NEXT:    [[TMP81:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 27), align 4
+; AVX-NEXT:    [[TMP82:%.*]] = icmp sgt i32 [[TMP80]], [[TMP81]]
+; AVX-NEXT:    [[TMP83:%.*]] = select i1 [[TMP82]], i32 [[TMP80]], i32 [[TMP81]]
+; AVX-NEXT:    [[TMP84:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 28), align 16
+; AVX-NEXT:    [[TMP85:%.*]] = icmp sgt i32 [[TMP83]], [[TMP84]]
+; AVX-NEXT:    [[TMP86:%.*]] = select i1 [[TMP85]], i32 [[TMP83]], i32 [[TMP84]]
+; AVX-NEXT:    [[TMP87:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 29), align 4
+; AVX-NEXT:    [[TMP88:%.*]] = icmp sgt i32 [[TMP86]], [[TMP87]]
+; AVX-NEXT:    [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[TMP86]], i32 [[TMP87]]
+; AVX-NEXT:    [[TMP90:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 30), align 8
+; AVX-NEXT:    [[TMP91:%.*]] = icmp sgt i32 [[TMP89]], [[TMP90]]
+; AVX-NEXT:    [[TMP92:%.*]] = select i1 [[TMP91]], i32 [[TMP89]], i32 [[TMP90]]
+; AVX-NEXT:    [[TMP93:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 31), align 4
+; AVX-NEXT:    [[TMP94:%.*]] = icmp sgt i32 [[TMP92]], [[TMP93]]
+; AVX-NEXT:    [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP92]], i32 [[TMP93]]
+; AVX-NEXT:    ret i32 [[TMP95]]
 ;
 ; AVX2-LABEL: @maxi32(
-; AVX2-NEXT:    [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16
-; AVX2-NEXT:    [[TMP3:%.*]] = extractelement <32 x i32> [[TMP2]], i32 0
-; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <32 x i32> [[TMP2]], i32 1
-; AVX2-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
-; AVX2-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <32 x i32> [[TMP2]], i32 2
-; AVX2-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
-; AVX2-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef
-; AVX2-NEXT:    [[TMP10:%.*]] = extractelement <32 x i32> [[TMP2]], i32 3
-; AVX2-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
-; AVX2-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef
-; AVX2-NEXT:    [[TMP13:%.*]] = extractelement <32 x i32> [[TMP2]], i32 4
-; AVX2-NEXT:    [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
-; AVX2-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef
-; AVX2-NEXT:    [[TMP16:%.*]] = extractelement <32 x i32> [[TMP2]], i32 5
-; AVX2-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]]
-; AVX2-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef
-; AVX2-NEXT:    [[TMP19:%.*]] = extractelement <32 x i32> [[TMP2]], i32 6
-; AVX2-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]]
-; AVX2-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 undef
-; AVX2-NEXT:    [[TMP22:%.*]] = extractelement <32 x i32> [[TMP2]], i32 7
-; AVX2-NEXT:    [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]]
-; AVX2-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 undef
-; AVX2-NEXT:    [[TMP25:%.*]] = extractelement <32 x i32> [[TMP2]], i32 8
-; AVX2-NEXT:    [[TMP26:%.*]] = icmp sgt i32 [[TMP24]], [[TMP25]]
-; AVX2-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP24]], i32 undef
-; AVX2-NEXT:    [[TMP28:%.*]] = extractelement <32 x i32> [[TMP2]], i32 9
-; AVX2-NEXT:    [[TMP29:%.*]] = icmp sgt i32 [[TMP27]], [[TMP28]]
-; AVX2-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP27]], i32 undef
-; AVX2-NEXT:    [[TMP31:%.*]] = extractelement <32 x i32> [[TMP2]], i32 10
-; AVX2-NEXT:    [[TMP32:%.*]] = icmp sgt i32 [[TMP30]], [[TMP31]]
-; AVX2-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP30]], i32 undef
-; AVX2-NEXT:    [[TMP34:%.*]] = extractelement <32 x i32> [[TMP2]], i32 11
-; AVX2-NEXT:    [[TMP35:%.*]] = icmp sgt i32 [[TMP33]], [[TMP34]]
-; AVX2-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP33]], i32 undef
-; AVX2-NEXT:    [[TMP37:%.*]] = extractelement <32 x i32> [[TMP2]], i32 12
-; AVX2-NEXT:    [[TMP38:%.*]] = icmp sgt i32 [[TMP36]], [[TMP37]]
-; AVX2-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[TMP36]], i32 undef
-; AVX2-NEXT:    [[TMP40:%.*]] = extractelement <32 x i32> [[TMP2]], i32 13
-; AVX2-NEXT:    [[TMP41:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]]
-; AVX2-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], i32 [[TMP39]], i32 undef
-; AVX2-NEXT:    [[TMP43:%.*]] = extractelement <32 x i32> [[TMP2]], i32 14
-; AVX2-NEXT:    [[TMP44:%.*]] = icmp sgt i32 [[TMP42]], [[TMP43]]
-; AVX2-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP42]], i32 undef
-; AVX2-NEXT:    [[TMP46:%.*]] = extractelement <32 x i32> [[TMP2]], i32 15
-; AVX2-NEXT:    [[TMP47:%.*]] = icmp sgt i32 [[TMP45]], [[TMP46]]
-; AVX2-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], i32 [[TMP45]], i32 undef
-; AVX2-NEXT:    [[TMP49:%.*]] = extractelement <32 x i32> [[TMP2]], i32 16
-; AVX2-NEXT:    [[TMP50:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]]
-; AVX2-NEXT:    [[TMP51:%.*]] = select i1 [[TMP50]], i32 [[TMP48]], i32 undef
-; AVX2-NEXT:    [[TMP52:%.*]] = extractelement <32 x i32> [[TMP2]], i32 17
-; AVX2-NEXT:    [[TMP53:%.*]] = icmp sgt i32 [[TMP51]], [[TMP52]]
-; AVX2-NEXT:    [[TMP54:%.*]] = select i1 [[TMP53]], i32 [[TMP51]], i32 undef
-; AVX2-NEXT:    [[TMP55:%.*]] = extractelement <32 x i32> [[TMP2]], i32 18
-; AVX2-NEXT:    [[TMP56:%.*]] = icmp sgt i32 [[TMP54]], [[TMP55]]
-; AVX2-NEXT:    [[TMP57:%.*]] = select i1 [[TMP56]], i32 [[TMP54]], i32 undef
-; AVX2-NEXT:    [[TMP58:%.*]] = extractelement <32 x i32> [[TMP2]], i32 19
-; AVX2-NEXT:    [[TMP59:%.*]] = icmp sgt i32 [[TMP57]], [[TMP58]]
-; AVX2-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 [[TMP57]], i32 undef
-; AVX2-NEXT:    [[TMP61:%.*]] = extractelement <32 x i32> [[TMP2]], i32 20
-; AVX2-NEXT:    [[TMP62:%.*]] = icmp sgt i32 [[TMP60]], [[TMP61]]
-; AVX2-NEXT:    [[TMP63:%.*]] = select i1 [[TMP62]], i32 [[TMP60]], i32 undef
-; AVX2-NEXT:    [[TMP64:%.*]] = extractelement <32 x i32> [[TMP2]], i32 21
-; AVX2-NEXT:    [[TMP65:%.*]] = icmp sgt i32 [[TMP63]], [[TMP64]]
-; AVX2-NEXT:    [[TMP66:%.*]] = select i1 [[TMP65]], i32 [[TMP63]], i32 undef
-; AVX2-NEXT:    [[TMP67:%.*]] = extractelement <32 x i32> [[TMP2]], i32 22
-; AVX2-NEXT:    [[TMP68:%.*]] = icmp sgt i32 [[TMP66]], [[TMP67]]
-; AVX2-NEXT:    [[TMP69:%.*]] = select i1 [[TMP68]], i32 [[TMP66]], i32 undef
-; AVX2-NEXT:    [[TMP70:%.*]] = extractelement <32 x i32> [[TMP2]], i32 23
-; AVX2-NEXT:    [[TMP71:%.*]] = icmp sgt i32 [[TMP69]], [[TMP70]]
-; AVX2-NEXT:    [[TMP72:%.*]] = select i1 [[TMP71]], i32 [[TMP69]], i32 undef
-; AVX2-NEXT:    [[TMP73:%.*]] = extractelement <32 x i32> [[TMP2]], i32 24
-; AVX2-NEXT:    [[TMP74:%.*]] = icmp sgt i32 [[TMP72]], [[TMP73]]
-; AVX2-NEXT:    [[TMP75:%.*]] = select i1 [[TMP74]], i32 [[TMP72]], i32 undef
-; AVX2-NEXT:    [[TMP76:%.*]] = extractelement <32 x i32> [[TMP2]], i32 25
-; AVX2-NEXT:    [[TMP77:%.*]] = icmp sgt i32 [[TMP75]], [[TMP76]]
-; AVX2-NEXT:    [[TMP78:%.*]] = select i1 [[TMP77]], i32 [[TMP75]], i32 undef
-; AVX2-NEXT:    [[TMP79:%.*]] = extractelement <32 x i32> [[TMP2]], i32 26
-; AVX2-NEXT:    [[TMP80:%.*]] = icmp sgt i32 [[TMP78]], [[TMP79]]
-; AVX2-NEXT:    [[TMP81:%.*]] = select i1 [[TMP80]], i32 [[TMP78]], i32 undef
-; AVX2-NEXT:    [[TMP82:%.*]] = extractelement <32 x i32> [[TMP2]], i32 27
-; AVX2-NEXT:    [[TMP83:%.*]] = icmp sgt i32 [[TMP81]], [[TMP82]]
-; AVX2-NEXT:    [[TMP84:%.*]] = select i1 [[TMP83]], i32 [[TMP81]], i32 undef
-; AVX2-NEXT:    [[TMP85:%.*]] = extractelement <32 x i32> [[TMP2]], i32 28
-; AVX2-NEXT:    [[TMP86:%.*]] = icmp sgt i32 [[TMP84]], [[TMP85]]
-; AVX2-NEXT:    [[TMP87:%.*]] = select i1 [[TMP86]], i32 [[TMP84]], i32 undef
-; AVX2-NEXT:    [[TMP88:%.*]] = extractelement <32 x i32> [[TMP2]], i32 29
-; AVX2-NEXT:    [[TMP89:%.*]] = icmp sgt i32 [[TMP87]], [[TMP88]]
-; AVX2-NEXT:    [[TMP90:%.*]] = select i1 [[TMP89]], i32 [[TMP87]], i32 undef
-; AVX2-NEXT:    [[TMP91:%.*]] = extractelement <32 x i32> [[TMP2]], i32 30
-; AVX2-NEXT:    [[TMP92:%.*]] = icmp sgt i32 [[TMP90]], [[TMP91]]
-; AVX2-NEXT:    [[TMP93:%.*]] = select i1 [[TMP92]], i32 [[TMP90]], i32 undef
-; AVX2-NEXT:    [[TMP94:%.*]] = extractelement <32 x i32> [[TMP2]], i32 31
-; AVX2-NEXT:    [[TMP95:%.*]] = icmp sgt i32 [[TMP93]], [[TMP94]]
-; AVX2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT]], <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP2]], <32 x i32> [[RDX_MINMAX_SELECT]], <32 x i32> [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT3]], <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP5]], <32 x i32> [[RDX_MINMAX_SELECT3]], <32 x i32> [[RDX_SHUF4]]
-; AVX2-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT6]], <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP8]], <32 x i32> [[RDX_MINMAX_SELECT6]], <32 x i32> [[RDX_SHUF7]]
-; AVX2-NEXT:    [[RDX_SHUF10:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT9]], <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP11:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT9]], [[RDX_SHUF10]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT12:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP11]], <32 x i32> [[RDX_MINMAX_SELECT9]], <32 x i32> [[RDX_SHUF10]]
-; AVX2-NEXT:    [[TMP96:%.*]] = extractelement <32 x i32> [[RDX_MINMAX_SELECT12]], i32 0
-; AVX2-NEXT:    [[TMP97:%.*]] = select i1 [[TMP95]], i32 [[TMP93]], i32 undef
-; AVX2-NEXT:    ret i32 [[TMP96]]
+; AVX2-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+; AVX2-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+; AVX2-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; AVX2-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; AVX2-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
+; AVX2-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
+; AVX2-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
+; AVX2-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
+; AVX2-NEXT:    [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+; AVX2-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
+; AVX2-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
+; AVX2-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+; AVX2-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; AVX2-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
+; AVX2-NEXT:    [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+; AVX2-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
+; AVX2-NEXT:    [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+; AVX2-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
+; AVX2-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
+; AVX2-NEXT:    [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+; AVX2-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
+; AVX2-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
+; AVX2-NEXT:    [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
+; AVX2-NEXT:    [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]]
+; AVX2-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]]
+; AVX2-NEXT:    [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
+; AVX2-NEXT:    [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]]
+; AVX2-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]]
+; AVX2-NEXT:    [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
+; AVX2-NEXT:    [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]]
+; AVX2-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]]
+; AVX2-NEXT:    [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
+; AVX2-NEXT:    [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]]
+; AVX2-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]]
+; AVX2-NEXT:    [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
+; AVX2-NEXT:    [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
+; AVX2-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]]
+; AVX2-NEXT:    [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
+; AVX2-NEXT:    [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
+; AVX2-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]]
+; AVX2-NEXT:    [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
+; AVX2-NEXT:    [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
+; AVX2-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]]
+; AVX2-NEXT:    [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
+; AVX2-NEXT:    [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
+; AVX2-NEXT:    [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]]
+; AVX2-NEXT:    [[TMP48:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 16), align 16
+; AVX2-NEXT:    [[TMP49:%.*]] = icmp sgt i32 [[TMP47]], [[TMP48]]
+; AVX2-NEXT:    [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP47]], i32 [[TMP48]]
+; AVX2-NEXT:    [[TMP51:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 17), align 4
+; AVX2-NEXT:    [[TMP52:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]]
+; AVX2-NEXT:    [[TMP53:%.*]] = select i1 [[TMP52]], i32 [[TMP50]], i32 [[TMP51]]
+; AVX2-NEXT:    [[TMP54:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 18), align 8
+; AVX2-NEXT:    [[TMP55:%.*]] = icmp sgt i32 [[TMP53]], [[TMP54]]
+; AVX2-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP53]], i32 [[TMP54]]
+; AVX2-NEXT:    [[TMP57:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 19), align 4
+; AVX2-NEXT:    [[TMP58:%.*]] = icmp sgt i32 [[TMP56]], [[TMP57]]
+; AVX2-NEXT:    [[TMP59:%.*]] = select i1 [[TMP58]], i32 [[TMP56]], i32 [[TMP57]]
+; AVX2-NEXT:    [[TMP60:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 20), align 16
+; AVX2-NEXT:    [[TMP61:%.*]] = icmp sgt i32 [[TMP59]], [[TMP60]]
+; AVX2-NEXT:    [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP59]], i32 [[TMP60]]
+; AVX2-NEXT:    [[TMP63:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 21), align 4
+; AVX2-NEXT:    [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], [[TMP63]]
+; AVX2-NEXT:    [[TMP65:%.*]] = select i1 [[TMP64]], i32 [[TMP62]], i32 [[TMP63]]
+; AVX2-NEXT:    [[TMP66:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 22), align 8
+; AVX2-NEXT:    [[TMP67:%.*]] = icmp sgt i32 [[TMP65]], [[TMP66]]
+; AVX2-NEXT:    [[TMP68:%.*]] = select i1 [[TMP67]], i32 [[TMP65]], i32 [[TMP66]]
+; AVX2-NEXT:    [[TMP69:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 23), align 4
+; AVX2-NEXT:    [[TMP70:%.*]] = icmp sgt i32 [[TMP68]], [[TMP69]]
+; AVX2-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP68]], i32 [[TMP69]]
+; AVX2-NEXT:    [[TMP72:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 24), align 16
+; AVX2-NEXT:    [[TMP73:%.*]] = icmp sgt i32 [[TMP71]], [[TMP72]]
+; AVX2-NEXT:    [[TMP74:%.*]] = select i1 [[TMP73]], i32 [[TMP71]], i32 [[TMP72]]
+; AVX2-NEXT:    [[TMP75:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 25), align 4
+; AVX2-NEXT:    [[TMP76:%.*]] = icmp sgt i32 [[TMP74]], [[TMP75]]
+; AVX2-NEXT:    [[TMP77:%.*]] = select i1 [[TMP76]], i32 [[TMP74]], i32 [[TMP75]]
+; AVX2-NEXT:    [[TMP78:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 26), align 8
+; AVX2-NEXT:    [[TMP79:%.*]] = icmp sgt i32 [[TMP77]], [[TMP78]]
+; AVX2-NEXT:    [[TMP80:%.*]] = select i1 [[TMP79]], i32 [[TMP77]], i32 [[TMP78]]
+; AVX2-NEXT:    [[TMP81:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 27), align 4
+; AVX2-NEXT:    [[TMP82:%.*]] = icmp sgt i32 [[TMP80]], [[TMP81]]
+; AVX2-NEXT:    [[TMP83:%.*]] = select i1 [[TMP82]], i32 [[TMP80]], i32 [[TMP81]]
+; AVX2-NEXT:    [[TMP84:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 28), align 16
+; AVX2-NEXT:    [[TMP85:%.*]] = icmp sgt i32 [[TMP83]], [[TMP84]]
+; AVX2-NEXT:    [[TMP86:%.*]] = select i1 [[TMP85]], i32 [[TMP83]], i32 [[TMP84]]
+; AVX2-NEXT:    [[TMP87:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 29), align 4
+; AVX2-NEXT:    [[TMP88:%.*]] = icmp sgt i32 [[TMP86]], [[TMP87]]
+; AVX2-NEXT:    [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[TMP86]], i32 [[TMP87]]
+; AVX2-NEXT:    [[TMP90:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 30), align 8
+; AVX2-NEXT:    [[TMP91:%.*]] = icmp sgt i32 [[TMP89]], [[TMP90]]
+; AVX2-NEXT:    [[TMP92:%.*]] = select i1 [[TMP91]], i32 [[TMP89]], i32 [[TMP90]]
+; AVX2-NEXT:    [[TMP93:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 31), align 4
+; AVX2-NEXT:    [[TMP94:%.*]] = icmp sgt i32 [[TMP92]], [[TMP93]]
+; AVX2-NEXT:    [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP92]], i32 [[TMP93]]
+; AVX2-NEXT:    ret i32 [[TMP95]]
 ;
 ; SKX-LABEL: @maxi32(
-; SKX-NEXT:    [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16
-; SKX-NEXT:    [[TMP3:%.*]] = extractelement <32 x i32> [[TMP2]], i32 0
-; SKX-NEXT:    [[TMP4:%.*]] = extractelement <32 x i32> [[TMP2]], i32 1
-; SKX-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
-; SKX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef
-; SKX-NEXT:    [[TMP7:%.*]] = extractelement <32 x i32> [[TMP2]], i32 2
-; SKX-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
-; SKX-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef
-; SKX-NEXT:    [[TMP10:%.*]] = extractelement <32 x i32> [[TMP2]], i32 3
-; SKX-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
-; SKX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef
-; SKX-NEXT:    [[TMP13:%.*]] = extractelement <32 x i32> [[TMP2]], i32 4
-; SKX-NEXT:    [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
-; SKX-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef
-; SKX-NEXT:    [[TMP16:%.*]] = extractelement <32 x i32> [[TMP2]], i32 5
-; SKX-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]]
-; SKX-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef
-; SKX-NEXT:    [[TMP19:%.*]] = extractelement <32 x i32> [[TMP2]], i32 6
-; SKX-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]]
-; SKX-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 undef
-; SKX-NEXT:    [[TMP22:%.*]] = extractelement <32 x i32> [[TMP2]], i32 7
-; SKX-NEXT:    [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]]
-; SKX-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 undef
-; SKX-NEXT:    [[TMP25:%.*]] = extractelement <32 x i32> [[TMP2]], i32 8
-; SKX-NEXT:    [[TMP26:%.*]] = icmp sgt i32 [[TMP24]], [[TMP25]]
-; SKX-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP24]], i32 undef
-; SKX-NEXT:    [[TMP28:%.*]] = extractelement <32 x i32> [[TMP2]], i32 9
-; SKX-NEXT:    [[TMP29:%.*]] = icmp sgt i32 [[TMP27]], [[TMP28]]
-; SKX-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP27]], i32 undef
-; SKX-NEXT:    [[TMP31:%.*]] = extractelement <32 x i32> [[TMP2]], i32 10
-; SKX-NEXT:    [[TMP32:%.*]] = icmp sgt i32 [[TMP30]], [[TMP31]]
-; SKX-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP30]], i32 undef
-; SKX-NEXT:    [[TMP34:%.*]] = extractelement <32 x i32> [[TMP2]], i32 11
-; SKX-NEXT:    [[TMP35:%.*]] = icmp sgt i32 [[TMP33]], [[TMP34]]
-; SKX-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP33]], i32 undef
-; SKX-NEXT:    [[TMP37:%.*]] = extractelement <32 x i32> [[TMP2]], i32 12
-; SKX-NEXT:    [[TMP38:%.*]] = icmp sgt i32 [[TMP36]], [[TMP37]]
-; SKX-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[TMP36]], i32 undef
-; SKX-NEXT:    [[TMP40:%.*]] = extractelement <32 x i32> [[TMP2]], i32 13
-; SKX-NEXT:    [[TMP41:%.*]] = icmp sgt i32 [[TMP39]], [[TMP40]]
-; SKX-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], i32 [[TMP39]], i32 undef
-; SKX-NEXT:    [[TMP43:%.*]] = extractelement <32 x i32> [[TMP2]], i32 14
-; SKX-NEXT:    [[TMP44:%.*]] = icmp sgt i32 [[TMP42]], [[TMP43]]
-; SKX-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP42]], i32 undef
-; SKX-NEXT:    [[TMP46:%.*]] = extractelement <32 x i32> [[TMP2]], i32 15
-; SKX-NEXT:    [[TMP47:%.*]] = icmp sgt i32 [[TMP45]], [[TMP46]]
-; SKX-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], i32 [[TMP45]], i32 undef
-; SKX-NEXT:    [[TMP49:%.*]] = extractelement <32 x i32> [[TMP2]], i32 16
-; SKX-NEXT:    [[TMP50:%.*]] = icmp sgt i32 [[TMP48]], [[TMP49]]
-; SKX-NEXT:    [[TMP51:%.*]] = select i1 [[TMP50]], i32 [[TMP48]], i32 undef
-; SKX-NEXT:    [[TMP52:%.*]] = extractelement <32 x i32> [[TMP2]], i32 17
-; SKX-NEXT:    [[TMP53:%.*]] = icmp sgt i32 [[TMP51]], [[TMP52]]
-; SKX-NEXT:    [[TMP54:%.*]] = select i1 [[TMP53]], i32 [[TMP51]], i32 undef
-; SKX-NEXT:    [[TMP55:%.*]] = extractelement <32 x i32> [[TMP2]], i32 18
-; SKX-NEXT:    [[TMP56:%.*]] = icmp sgt i32 [[TMP54]], [[TMP55]]
-; SKX-NEXT:    [[TMP57:%.*]] = select i1 [[TMP56]], i32 [[TMP54]], i32 undef
-; SKX-NEXT:    [[TMP58:%.*]] = extractelement <32 x i32> [[TMP2]], i32 19
-; SKX-NEXT:    [[TMP59:%.*]] = icmp sgt i32 [[TMP57]], [[TMP58]]
-; SKX-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 [[TMP57]], i32 undef
-; SKX-NEXT:    [[TMP61:%.*]] = extractelement <32 x i32> [[TMP2]], i32 20
-; SKX-NEXT:    [[TMP62:%.*]] = icmp sgt i32 [[TMP60]], [[TMP61]]
-; SKX-NEXT:    [[TMP63:%.*]] = select i1 [[TMP62]], i32 [[TMP60]], i32 undef
-; SKX-NEXT:    [[TMP64:%.*]] = extractelement <32 x i32> [[TMP2]], i32 21
-; SKX-NEXT:    [[TMP65:%.*]] = icmp sgt i32 [[TMP63]], [[TMP64]]
-; SKX-NEXT:    [[TMP66:%.*]] = select i1 [[TMP65]], i32 [[TMP63]], i32 undef
-; SKX-NEXT:    [[TMP67:%.*]] = extractelement <32 x i32> [[TMP2]], i32 22
-; SKX-NEXT:    [[TMP68:%.*]] = icmp sgt i32 [[TMP66]], [[TMP67]]
-; SKX-NEXT:    [[TMP69:%.*]] = select i1 [[TMP68]], i32 [[TMP66]], i32 undef
-; SKX-NEXT:    [[TMP70:%.*]] = extractelement <32 x i32> [[TMP2]], i32 23
-; SKX-NEXT:    [[TMP71:%.*]] = icmp sgt i32 [[TMP69]], [[TMP70]]
-; SKX-NEXT:    [[TMP72:%.*]] = select i1 [[TMP71]], i32 [[TMP69]], i32 undef
-; SKX-NEXT:    [[TMP73:%.*]] = extractelement <32 x i32> [[TMP2]], i32 24
-; SKX-NEXT:    [[TMP74:%.*]] = icmp sgt i32 [[TMP72]], [[TMP73]]
-; SKX-NEXT:    [[TMP75:%.*]] = select i1 [[TMP74]], i32 [[TMP72]], i32 undef
-; SKX-NEXT:    [[TMP76:%.*]] = extractelement <32 x i32> [[TMP2]], i32 25
-; SKX-NEXT:    [[TMP77:%.*]] = icmp sgt i32 [[TMP75]], [[TMP76]]
-; SKX-NEXT:    [[TMP78:%.*]] = select i1 [[TMP77]], i32 [[TMP75]], i32 undef
-; SKX-NEXT:    [[TMP79:%.*]] = extractelement <32 x i32> [[TMP2]], i32 26
-; SKX-NEXT:    [[TMP80:%.*]] = icmp sgt i32 [[TMP78]], [[TMP79]]
-; SKX-NEXT:    [[TMP81:%.*]] = select i1 [[TMP80]], i32 [[TMP78]], i32 undef
-; SKX-NEXT:    [[TMP82:%.*]] = extractelement <32 x i32> [[TMP2]], i32 27
-; SKX-NEXT:    [[TMP83:%.*]] = icmp sgt i32 [[TMP81]], [[TMP82]]
-; SKX-NEXT:    [[TMP84:%.*]] = select i1 [[TMP83]], i32 [[TMP81]], i32 undef
-; SKX-NEXT:    [[TMP85:%.*]] = extractelement <32 x i32> [[TMP2]], i32 28
-; SKX-NEXT:    [[TMP86:%.*]] = icmp sgt i32 [[TMP84]], [[TMP85]]
-; SKX-NEXT:    [[TMP87:%.*]] = select i1 [[TMP86]], i32 [[TMP84]], i32 undef
-; SKX-NEXT:    [[TMP88:%.*]] = extractelement <32 x i32> [[TMP2]], i32 29
-; SKX-NEXT:    [[TMP89:%.*]] = icmp sgt i32 [[TMP87]], [[TMP88]]
-; SKX-NEXT:    [[TMP90:%.*]] = select i1 [[TMP89]], i32 [[TMP87]], i32 undef
-; SKX-NEXT:    [[TMP91:%.*]] = extractelement <32 x i32> [[TMP2]], i32 30
-; SKX-NEXT:    [[TMP92:%.*]] = icmp sgt i32 [[TMP90]], [[TMP91]]
-; SKX-NEXT:    [[TMP93:%.*]] = select i1 [[TMP92]], i32 [[TMP90]], i32 undef
-; SKX-NEXT:    [[TMP94:%.*]] = extractelement <32 x i32> [[TMP2]], i32 31
-; SKX-NEXT:    [[TMP95:%.*]] = icmp sgt i32 [[TMP93]], [[TMP94]]
-; SKX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT]], <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP2]], <32 x i32> [[RDX_MINMAX_SELECT]], <32 x i32> [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT3]], <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP5]], <32 x i32> [[RDX_MINMAX_SELECT3]], <32 x i32> [[RDX_SHUF4]]
-; SKX-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT6]], <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP8]], <32 x i32> [[RDX_MINMAX_SELECT6]], <32 x i32> [[RDX_SHUF7]]
-; SKX-NEXT:    [[RDX_SHUF10:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT9]], <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP11:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT9]], [[RDX_SHUF10]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT12:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP11]], <32 x i32> [[RDX_MINMAX_SELECT9]], <32 x i32> [[RDX_SHUF10]]
-; SKX-NEXT:    [[TMP96:%.*]] = extractelement <32 x i32> [[RDX_MINMAX_SELECT12]], i32 0
-; SKX-NEXT:    [[TMP97:%.*]] = select i1 [[TMP95]], i32 [[TMP93]], i32 undef
-; SKX-NEXT:    ret i32 [[TMP96]]
+; SKX-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+; SKX-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+; SKX-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; SKX-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; SKX-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
+; SKX-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
+; SKX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
+; SKX-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
+; SKX-NEXT:    [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+; SKX-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
+; SKX-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
+; SKX-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+; SKX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; SKX-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
+; SKX-NEXT:    [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+; SKX-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
+; SKX-NEXT:    [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+; SKX-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
+; SKX-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
+; SKX-NEXT:    [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+; SKX-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
+; SKX-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
+; SKX-NEXT:    [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
+; SKX-NEXT:    [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]]
+; SKX-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 [[TMP24]]
+; SKX-NEXT:    [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
+; SKX-NEXT:    [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]]
+; SKX-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 [[TMP27]]
+; SKX-NEXT:    [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
+; SKX-NEXT:    [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]]
+; SKX-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 [[TMP30]]
+; SKX-NEXT:    [[TMP33:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
+; SKX-NEXT:    [[TMP34:%.*]] = icmp sgt i32 [[TMP32]], [[TMP33]]
+; SKX-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP32]], i32 [[TMP33]]
+; SKX-NEXT:    [[TMP36:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
+; SKX-NEXT:    [[TMP37:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
+; SKX-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP35]], i32 [[TMP36]]
+; SKX-NEXT:    [[TMP39:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
+; SKX-NEXT:    [[TMP40:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
+; SKX-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP38]], i32 [[TMP39]]
+; SKX-NEXT:    [[TMP42:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
+; SKX-NEXT:    [[TMP43:%.*]] = icmp sgt i32 [[TMP41]], [[TMP42]]
+; SKX-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP41]], i32 [[TMP42]]
+; SKX-NEXT:    [[TMP45:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
+; SKX-NEXT:    [[TMP46:%.*]] = icmp sgt i32 [[TMP44]], [[TMP45]]
+; SKX-NEXT:    [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP44]], i32 [[TMP45]]
+; SKX-NEXT:    [[TMP48:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 16), align 16
+; SKX-NEXT:    [[TMP49:%.*]] = icmp sgt i32 [[TMP47]], [[TMP48]]
+; SKX-NEXT:    [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP47]], i32 [[TMP48]]
+; SKX-NEXT:    [[TMP51:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 17), align 4
+; SKX-NEXT:    [[TMP52:%.*]] = icmp sgt i32 [[TMP50]], [[TMP51]]
+; SKX-NEXT:    [[TMP53:%.*]] = select i1 [[TMP52]], i32 [[TMP50]], i32 [[TMP51]]
+; SKX-NEXT:    [[TMP54:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 18), align 8
+; SKX-NEXT:    [[TMP55:%.*]] = icmp sgt i32 [[TMP53]], [[TMP54]]
+; SKX-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP53]], i32 [[TMP54]]
+; SKX-NEXT:    [[TMP57:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 19), align 4
+; SKX-NEXT:    [[TMP58:%.*]] = icmp sgt i32 [[TMP56]], [[TMP57]]
+; SKX-NEXT:    [[TMP59:%.*]] = select i1 [[TMP58]], i32 [[TMP56]], i32 [[TMP57]]
+; SKX-NEXT:    [[TMP60:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 20), align 16
+; SKX-NEXT:    [[TMP61:%.*]] = icmp sgt i32 [[TMP59]], [[TMP60]]
+; SKX-NEXT:    [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP59]], i32 [[TMP60]]
+; SKX-NEXT:    [[TMP63:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 21), align 4
+; SKX-NEXT:    [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], [[TMP63]]
+; SKX-NEXT:    [[TMP65:%.*]] = select i1 [[TMP64]], i32 [[TMP62]], i32 [[TMP63]]
+; SKX-NEXT:    [[TMP66:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 22), align 8
+; SKX-NEXT:    [[TMP67:%.*]] = icmp sgt i32 [[TMP65]], [[TMP66]]
+; SKX-NEXT:    [[TMP68:%.*]] = select i1 [[TMP67]], i32 [[TMP65]], i32 [[TMP66]]
+; SKX-NEXT:    [[TMP69:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 23), align 4
+; SKX-NEXT:    [[TMP70:%.*]] = icmp sgt i32 [[TMP68]], [[TMP69]]
+; SKX-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP68]], i32 [[TMP69]]
+; SKX-NEXT:    [[TMP72:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 24), align 16
+; SKX-NEXT:    [[TMP73:%.*]] = icmp sgt i32 [[TMP71]], [[TMP72]]
+; SKX-NEXT:    [[TMP74:%.*]] = select i1 [[TMP73]], i32 [[TMP71]], i32 [[TMP72]]
+; SKX-NEXT:    [[TMP75:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 25), align 4
+; SKX-NEXT:    [[TMP76:%.*]] = icmp sgt i32 [[TMP74]], [[TMP75]]
+; SKX-NEXT:    [[TMP77:%.*]] = select i1 [[TMP76]], i32 [[TMP74]], i32 [[TMP75]]
+; SKX-NEXT:    [[TMP78:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 26), align 8
+; SKX-NEXT:    [[TMP79:%.*]] = icmp sgt i32 [[TMP77]], [[TMP78]]
+; SKX-NEXT:    [[TMP80:%.*]] = select i1 [[TMP79]], i32 [[TMP77]], i32 [[TMP78]]
+; SKX-NEXT:    [[TMP81:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 27), align 4
+; SKX-NEXT:    [[TMP82:%.*]] = icmp sgt i32 [[TMP80]], [[TMP81]]
+; SKX-NEXT:    [[TMP83:%.*]] = select i1 [[TMP82]], i32 [[TMP80]], i32 [[TMP81]]
+; SKX-NEXT:    [[TMP84:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 28), align 16
+; SKX-NEXT:    [[TMP85:%.*]] = icmp sgt i32 [[TMP83]], [[TMP84]]
+; SKX-NEXT:    [[TMP86:%.*]] = select i1 [[TMP85]], i32 [[TMP83]], i32 [[TMP84]]
+; SKX-NEXT:    [[TMP87:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 29), align 4
+; SKX-NEXT:    [[TMP88:%.*]] = icmp sgt i32 [[TMP86]], [[TMP87]]
+; SKX-NEXT:    [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[TMP86]], i32 [[TMP87]]
+; SKX-NEXT:    [[TMP90:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 30), align 8
+; SKX-NEXT:    [[TMP91:%.*]] = icmp sgt i32 [[TMP89]], [[TMP90]]
+; SKX-NEXT:    [[TMP92:%.*]] = select i1 [[TMP91]], i32 [[TMP89]], i32 [[TMP90]]
+; SKX-NEXT:    [[TMP93:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 31), align 4
+; SKX-NEXT:    [[TMP94:%.*]] = icmp sgt i32 [[TMP92]], [[TMP93]]
+; SKX-NEXT:    [[TMP95:%.*]] = select i1 [[TMP94]], i32 [[TMP92]], i32 [[TMP93]]
+; SKX-NEXT:    ret i32 [[TMP95]]
 ;
   %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
   %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@@ -1036,112 +892,79 @@ define float @maxf8(float) {
 ; CHECK-NEXT:    ret float [[TMP23]]
 ;
 ; AVX-LABEL: @maxf8(
-; AVX-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16
-; AVX-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP2]], i32 0
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP2]], i32 1
-; AVX-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
-; AVX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float undef, float undef
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[TMP2]], i32 2
-; AVX-NEXT:    [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]]
-; AVX-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float undef
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x float> [[TMP2]], i32 3
-; AVX-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]]
-; AVX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float undef
-; AVX-NEXT:    [[TMP13:%.*]] = extractelement <8 x float> [[TMP2]], i32 4
-; AVX-NEXT:    [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]]
-; AVX-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float undef
-; AVX-NEXT:    [[TMP16:%.*]] = extractelement <8 x float> [[TMP2]], i32 5
-; AVX-NEXT:    [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]]
-; AVX-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float undef
-; AVX-NEXT:    [[TMP19:%.*]] = extractelement <8 x float> [[TMP2]], i32 6
-; AVX-NEXT:    [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]]
-; AVX-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float undef
-; AVX-NEXT:    [[TMP22:%.*]] = extractelement <8 x float> [[TMP2]], i32 7
-; AVX-NEXT:    [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]]
-; AVX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x float> [[TMP2]], <8 x float> [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[RDX_MINMAX_SELECT]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <8 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x float> [[RDX_MINMAX_SELECT]], <8 x float> [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <8 x float> [[RDX_MINMAX_SELECT3]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <8 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x float> [[RDX_MINMAX_SELECT3]], <8 x float> [[RDX_SHUF4]]
-; AVX-NEXT:    [[TMP24:%.*]] = extractelement <8 x float> [[RDX_MINMAX_SELECT6]], i32 0
-; AVX-NEXT:    [[TMP25:%.*]] = select i1 [[TMP23]], float [[TMP21]], float undef
-; AVX-NEXT:    ret float [[TMP24]]
+; AVX-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
+; AVX-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
+; AVX-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
+; AVX-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
+; AVX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
+; AVX-NEXT:    [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
+; AVX-NEXT:    [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
+; AVX-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
+; AVX-NEXT:    [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
+; AVX-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
+; AVX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
+; AVX-NEXT:    [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
+; AVX-NEXT:    [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
+; AVX-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
+; AVX-NEXT:    [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
+; AVX-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
+; AVX-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
+; AVX-NEXT:    [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
+; AVX-NEXT:    [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
+; AVX-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
+; AVX-NEXT:    ret float [[TMP23]]
 ;
 ; AVX2-LABEL: @maxf8(
-; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16
-; AVX2-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP2]], i32 0
-; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP2]], i32 1
-; AVX2-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
-; AVX2-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float undef, float undef
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[TMP2]], i32 2
-; AVX2-NEXT:    [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]]
-; AVX2-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float undef
-; AVX2-NEXT:    [[TMP10:%.*]] = extractelement <8 x float> [[TMP2]], i32 3
-; AVX2-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]]
-; AVX2-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float undef
-; AVX2-NEXT:    [[TMP13:%.*]] = extractelement <8 x float> [[TMP2]], i32 4
-; AVX2-NEXT:    [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]]
-; AVX2-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float undef
-; AVX2-NEXT:    [[TMP16:%.*]] = extractelement <8 x float> [[TMP2]], i32 5
-; AVX2-NEXT:    [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]]
-; AVX2-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float undef
-; AVX2-NEXT:    [[TMP19:%.*]] = extractelement <8 x float> [[TMP2]], i32 6
-; AVX2-NEXT:    [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]]
-; AVX2-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float undef
-; AVX2-NEXT:    [[TMP22:%.*]] = extractelement <8 x float> [[TMP2]], i32 7
-; AVX2-NEXT:    [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]]
-; AVX2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x float> [[TMP2]], <8 x float> [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[RDX_MINMAX_SELECT]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <8 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x float> [[RDX_MINMAX_SELECT]], <8 x float> [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <8 x float> [[RDX_MINMAX_SELECT3]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <8 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x float> [[RDX_MINMAX_SELECT3]], <8 x float> [[RDX_SHUF4]]
-; AVX2-NEXT:    [[TMP24:%.*]] = extractelement <8 x float> [[RDX_MINMAX_SELECT6]], i32 0
-; AVX2-NEXT:    [[TMP25:%.*]] = select i1 [[TMP23]], float [[TMP21]], float undef
-; AVX2-NEXT:    ret float [[TMP24]]
+; AVX2-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
+; AVX2-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
+; AVX2-NEXT:    [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
+; AVX2-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
+; AVX2-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
+; AVX2-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
+; AVX2-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
+; AVX2-NEXT:    [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
+; AVX2-NEXT:    [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
+; AVX2-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
+; AVX2-NEXT:    [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
+; AVX2-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
+; AVX2-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
+; AVX2-NEXT:    [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
+; AVX2-NEXT:    [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
+; AVX2-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
+; AVX2-NEXT:    [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
+; AVX2-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
+; AVX2-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
+; AVX2-NEXT:    [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
+; AVX2-NEXT:    [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
+; AVX2-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
+; AVX2-NEXT:    ret float [[TMP23]]
 ;
 ; SKX-LABEL: @maxf8(
-; SKX-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16
-; SKX-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP2]], i32 0
-; SKX-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP2]], i32 1
-; SKX-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
-; SKX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float undef, float undef
-; SKX-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[TMP2]], i32 2
-; SKX-NEXT:    [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]]
-; SKX-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float undef
-; SKX-NEXT:    [[TMP10:%.*]] = extractelement <8 x float> [[TMP2]], i32 3
-; SKX-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]]
-; SKX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float undef
-; SKX-NEXT:    [[TMP13:%.*]] = extractelement <8 x float> [[TMP2]], i32 4
-; SKX-NEXT:    [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]]
-; SKX-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float undef
-; SKX-NEXT:    [[TMP16:%.*]] = extractelement <8 x float> [[TMP2]], i32 5
-; SKX-NEXT:    [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]]
-; SKX-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float undef
-; SKX-NEXT:    [[TMP19:%.*]] = extractelement <8 x float> [[TMP2]], i32 6
-; SKX-NEXT:    [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]]
-; SKX-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float undef
-; SKX-NEXT:    [[TMP22:%.*]] = extractelement <8 x float> [[TMP2]], i32 7
-; SKX-NEXT:    [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]]
-; SKX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x float> [[TMP2]], <8 x float> [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[RDX_MINMAX_SELECT]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <8 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x float> [[RDX_MINMAX_SELECT]], <8 x float> [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <8 x float> [[RDX_MINMAX_SELECT3]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <8 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x float> [[RDX_MINMAX_SELECT3]], <8 x float> [[RDX_SHUF4]]
-; SKX-NEXT:    [[TMP24:%.*]] = extractelement <8 x float> [[RDX_MINMAX_SELECT6]], i32 0
-; SKX-NEXT:    [[TMP25:%.*]] = select i1 [[TMP23]], float [[TMP21]], float undef
-; SKX-NEXT:    ret float [[TMP24]]
+; SKX-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
+; SKX-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
+; SKX-NEXT:    [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
+; SKX-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
+; SKX-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
+; SKX-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
+; SKX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
+; SKX-NEXT:    [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
+; SKX-NEXT:    [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
+; SKX-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
+; SKX-NEXT:    [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
+; SKX-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
+; SKX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
+; SKX-NEXT:    [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
+; SKX-NEXT:    [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
+; SKX-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
+; SKX-NEXT:    [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
+; SKX-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
+; SKX-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
+; SKX-NEXT:    [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
+; SKX-NEXT:    [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
+; SKX-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
+; SKX-NEXT:    ret float [[TMP23]]
 ;
   %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
   %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
@@ -1219,193 +1042,151 @@ define float @maxf16(float) {
 ; CHECK-NEXT:    ret float [[TMP47]]
 ;
 ; AVX-LABEL: @maxf16(
-; AVX-NEXT:    [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16
-; AVX-NEXT:    [[TMP3:%.*]] = extractelement <16 x float> [[TMP2]], i32 0
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <16 x float> [[TMP2]], i32 1
-; AVX-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
-; AVX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float undef, float undef
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <16 x float> [[TMP2]], i32 2
-; AVX-NEXT:    [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]]
-; AVX-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float undef
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <16 x float> [[TMP2]], i32 3
-; AVX-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]]
-; AVX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float undef
-; AVX-NEXT:    [[TMP13:%.*]] = extractelement <16 x float> [[TMP2]], i32 4
-; AVX-NEXT:    [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]]
-; AVX-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float undef
-; AVX-NEXT:    [[TMP16:%.*]] = extractelement <16 x float> [[TMP2]], i32 5
-; AVX-NEXT:    [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]]
-; AVX-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float undef
-; AVX-NEXT:    [[TMP19:%.*]] = extractelement <16 x float> [[TMP2]], i32 6
-; AVX-NEXT:    [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]]
-; AVX-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float undef
-; AVX-NEXT:    [[TMP22:%.*]] = extractelement <16 x float> [[TMP2]], i32 7
-; AVX-NEXT:    [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]]
-; AVX-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP21]], float undef
-; AVX-NEXT:    [[TMP25:%.*]] = extractelement <16 x float> [[TMP2]], i32 8
-; AVX-NEXT:    [[TMP26:%.*]] = fcmp fast ogt float [[TMP24]], [[TMP25]]
-; AVX-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], float [[TMP24]], float undef
-; AVX-NEXT:    [[TMP28:%.*]] = extractelement <16 x float> [[TMP2]], i32 9
-; AVX-NEXT:    [[TMP29:%.*]] = fcmp fast ogt float [[TMP27]], [[TMP28]]
-; AVX-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP27]], float undef
-; AVX-NEXT:    [[TMP31:%.*]] = extractelement <16 x float> [[TMP2]], i32 10
-; AVX-NEXT:    [[TMP32:%.*]] = fcmp fast ogt float [[TMP30]], [[TMP31]]
-; AVX-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], float [[TMP30]], float undef
-; AVX-NEXT:    [[TMP34:%.*]] = extractelement <16 x float> [[TMP2]], i32 11
-; AVX-NEXT:    [[TMP35:%.*]] = fcmp fast ogt float [[TMP33]], [[TMP34]]
-; AVX-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], float [[TMP33]], float undef
-; AVX-NEXT:    [[TMP37:%.*]] = extractelement <16 x float> [[TMP2]], i32 12
-; AVX-NEXT:    [[TMP38:%.*]] = fcmp fast ogt float [[TMP36]], [[TMP37]]
-; AVX-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], float [[TMP36]], float undef
-; AVX-NEXT:    [[TMP40:%.*]] = extractelement <16 x float> [[TMP2]], i32 13
-; AVX-NEXT:    [[TMP41:%.*]] = fcmp fast ogt float [[TMP39]], [[TMP40]]
-; AVX-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], float [[TMP39]], float undef
-; AVX-NEXT:    [[TMP43:%.*]] = extractelement <16 x float> [[TMP2]], i32 14
-; AVX-NEXT:    [[TMP44:%.*]] = fcmp fast ogt float [[TMP42]], [[TMP43]]
-; AVX-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], float [[TMP42]], float undef
-; AVX-NEXT:    [[TMP46:%.*]] = extractelement <16 x float> [[TMP2]], i32 15
-; AVX-NEXT:    [[TMP47:%.*]] = fcmp fast ogt float [[TMP45]], [[TMP46]]
-; AVX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x float> [[TMP2]], <16 x float> [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT]], <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP2]], <16 x float> [[RDX_MINMAX_SELECT]], <16 x float> [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT3]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP5]], <16 x float> [[RDX_MINMAX_SELECT3]], <16 x float> [[RDX_SHUF4]]
-; AVX-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT6]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x float> [[RDX_MINMAX_SELECT6]], <16 x float> [[RDX_SHUF7]]
-; AVX-NEXT:    [[TMP48:%.*]] = extractelement <16 x float> [[RDX_MINMAX_SELECT9]], i32 0
-; AVX-NEXT:    [[TMP49:%.*]] = select i1 [[TMP47]], float [[TMP45]], float undef
-; AVX-NEXT:    ret float [[TMP48]]
+; AVX-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
+; AVX-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
+; AVX-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
+; AVX-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
+; AVX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
+; AVX-NEXT:    [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
+; AVX-NEXT:    [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
+; AVX-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
+; AVX-NEXT:    [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
+; AVX-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
+; AVX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
+; AVX-NEXT:    [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
+; AVX-NEXT:    [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
+; AVX-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
+; AVX-NEXT:    [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
+; AVX-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
+; AVX-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
+; AVX-NEXT:    [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
+; AVX-NEXT:    [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
+; AVX-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
+; AVX-NEXT:    [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
+; AVX-NEXT:    [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]]
+; AVX-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]]
+; AVX-NEXT:    [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
+; AVX-NEXT:    [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]]
+; AVX-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]]
+; AVX-NEXT:    [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
+; AVX-NEXT:    [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]]
+; AVX-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]]
+; AVX-NEXT:    [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
+; AVX-NEXT:    [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]]
+; AVX-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]]
+; AVX-NEXT:    [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
+; AVX-NEXT:    [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]]
+; AVX-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]]
+; AVX-NEXT:    [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
+; AVX-NEXT:    [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]]
+; AVX-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]]
+; AVX-NEXT:    [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
+; AVX-NEXT:    [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]]
+; AVX-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]]
+; AVX-NEXT:    [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
+; AVX-NEXT:    [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]]
+; AVX-NEXT:    [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]]
+; AVX-NEXT:    ret float [[TMP47]]
 ;
 ; AVX2-LABEL: @maxf16(
-; AVX2-NEXT:    [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16
-; AVX2-NEXT:    [[TMP3:%.*]] = extractelement <16 x float> [[TMP2]], i32 0
-; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <16 x float> [[TMP2]], i32 1
-; AVX2-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
-; AVX2-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float undef, float undef
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <16 x float> [[TMP2]], i32 2
-; AVX2-NEXT:    [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]]
-; AVX2-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float undef
-; AVX2-NEXT:    [[TMP10:%.*]] = extractelement <16 x float> [[TMP2]], i32 3
-; AVX2-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]]
-; AVX2-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float undef
-; AVX2-NEXT:    [[TMP13:%.*]] = extractelement <16 x float> [[TMP2]], i32 4
-; AVX2-NEXT:    [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]]
-; AVX2-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float undef
-; AVX2-NEXT:    [[TMP16:%.*]] = extractelement <16 x float> [[TMP2]], i32 5
-; AVX2-NEXT:    [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]]
-; AVX2-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float undef
-; AVX2-NEXT:    [[TMP19:%.*]] = extractelement <16 x float> [[TMP2]], i32 6
-; AVX2-NEXT:    [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]]
-; AVX2-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float undef
-; AVX2-NEXT:    [[TMP22:%.*]] = extractelement <16 x float> [[TMP2]], i32 7
-; AVX2-NEXT:    [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]]
-; AVX2-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP21]], float undef
-; AVX2-NEXT:    [[TMP25:%.*]] = extractelement <16 x float> [[TMP2]], i32 8
-; AVX2-NEXT:    [[TMP26:%.*]] = fcmp fast ogt float [[TMP24]], [[TMP25]]
-; AVX2-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], float [[TMP24]], float undef
-; AVX2-NEXT:    [[TMP28:%.*]] = extractelement <16 x float> [[TMP2]], i32 9
-; AVX2-NEXT:    [[TMP29:%.*]] = fcmp fast ogt float [[TMP27]], [[TMP28]]
-; AVX2-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP27]], float undef
-; AVX2-NEXT:    [[TMP31:%.*]] = extractelement <16 x float> [[TMP2]], i32 10
-; AVX2-NEXT:    [[TMP32:%.*]] = fcmp fast ogt float [[TMP30]], [[TMP31]]
-; AVX2-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], float [[TMP30]], float undef
-; AVX2-NEXT:    [[TMP34:%.*]] = extractelement <16 x float> [[TMP2]], i32 11
-; AVX2-NEXT:    [[TMP35:%.*]] = fcmp fast ogt float [[TMP33]], [[TMP34]]
-; AVX2-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], float [[TMP33]], float undef
-; AVX2-NEXT:    [[TMP37:%.*]] = extractelement <16 x float> [[TMP2]], i32 12
-; AVX2-NEXT:    [[TMP38:%.*]] = fcmp fast ogt float [[TMP36]], [[TMP37]]
-; AVX2-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], float [[TMP36]], float undef
-; AVX2-NEXT:    [[TMP40:%.*]] = extractelement <16 x float> [[TMP2]], i32 13
-; AVX2-NEXT:    [[TMP41:%.*]] = fcmp fast ogt float [[TMP39]], [[TMP40]]
-; AVX2-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], float [[TMP39]], float undef
-; AVX2-NEXT:    [[TMP43:%.*]] = extractelement <16 x float> [[TMP2]], i32 14
-; AVX2-NEXT:    [[TMP44:%.*]] = fcmp fast ogt float [[TMP42]], [[TMP43]]
-; AVX2-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], float [[TMP42]], float undef
-; AVX2-NEXT:    [[TMP46:%.*]] = extractelement <16 x float> [[TMP2]], i32 15
-; AVX2-NEXT:    [[TMP47:%.*]] = fcmp fast ogt float [[TMP45]], [[TMP46]]
-; AVX2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x float> [[TMP2]], <16 x float> [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT]], <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP2]], <16 x float> [[RDX_MINMAX_SELECT]], <16 x float> [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT3]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP5]], <16 x float> [[RDX_MINMAX_SELECT3]], <16 x float> [[RDX_SHUF4]]
-; AVX2-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT6]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x float> [[RDX_MINMAX_SELECT6]], <16 x float> [[RDX_SHUF7]]
-; AVX2-NEXT:    [[TMP48:%.*]] = extractelement <16 x float> [[RDX_MINMAX_SELECT9]], i32 0
-; AVX2-NEXT:    [[TMP49:%.*]] = select i1 [[TMP47]], float [[TMP45]], float undef
-; AVX2-NEXT:    ret float [[TMP48]]
+; AVX2-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
+; AVX2-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
+; AVX2-NEXT:    [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
+; AVX2-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
+; AVX2-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
+; AVX2-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
+; AVX2-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
+; AVX2-NEXT:    [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
+; AVX2-NEXT:    [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
+; AVX2-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
+; AVX2-NEXT:    [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
+; AVX2-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
+; AVX2-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
+; AVX2-NEXT:    [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
+; AVX2-NEXT:    [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
+; AVX2-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
+; AVX2-NEXT:    [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
+; AVX2-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
+; AVX2-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
+; AVX2-NEXT:    [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
+; AVX2-NEXT:    [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
+; AVX2-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
+; AVX2-NEXT:    [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
+; AVX2-NEXT:    [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]]
+; AVX2-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]]
+; AVX2-NEXT:    [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
+; AVX2-NEXT:    [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]]
+; AVX2-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]]
+; AVX2-NEXT:    [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
+; AVX2-NEXT:    [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]]
+; AVX2-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]]
+; AVX2-NEXT:    [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
+; AVX2-NEXT:    [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]]
+; AVX2-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]]
+; AVX2-NEXT:    [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
+; AVX2-NEXT:    [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]]
+; AVX2-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]]
+; AVX2-NEXT:    [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
+; AVX2-NEXT:    [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]]
+; AVX2-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]]
+; AVX2-NEXT:    [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
+; AVX2-NEXT:    [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]]
+; AVX2-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]]
+; AVX2-NEXT:    [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
+; AVX2-NEXT:    [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]]
+; AVX2-NEXT:    [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]]
+; AVX2-NEXT:    ret float [[TMP47]]
 ;
 ; SKX-LABEL: @maxf16(
-; SKX-NEXT:    [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16
-; SKX-NEXT:    [[TMP3:%.*]] = extractelement <16 x float> [[TMP2]], i32 0
-; SKX-NEXT:    [[TMP4:%.*]] = extractelement <16 x float> [[TMP2]], i32 1
-; SKX-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
-; SKX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float undef, float undef
-; SKX-NEXT:    [[TMP7:%.*]] = extractelement <16 x float> [[TMP2]], i32 2
-; SKX-NEXT:    [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]]
-; SKX-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float undef
-; SKX-NEXT:    [[TMP10:%.*]] = extractelement <16 x float> [[TMP2]], i32 3
-; SKX-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]]
-; SKX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float undef
-; SKX-NEXT:    [[TMP13:%.*]] = extractelement <16 x float> [[TMP2]], i32 4
-; SKX-NEXT:    [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]]
-; SKX-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float undef
-; SKX-NEXT:    [[TMP16:%.*]] = extractelement <16 x float> [[TMP2]], i32 5
-; SKX-NEXT:    [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]]
-; SKX-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float undef
-; SKX-NEXT:    [[TMP19:%.*]] = extractelement <16 x float> [[TMP2]], i32 6
-; SKX-NEXT:    [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]]
-; SKX-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float undef
-; SKX-NEXT:    [[TMP22:%.*]] = extractelement <16 x float> [[TMP2]], i32 7
-; SKX-NEXT:    [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]]
-; SKX-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP21]], float undef
-; SKX-NEXT:    [[TMP25:%.*]] = extractelement <16 x float> [[TMP2]], i32 8
-; SKX-NEXT:    [[TMP26:%.*]] = fcmp fast ogt float [[TMP24]], [[TMP25]]
-; SKX-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], float [[TMP24]], float undef
-; SKX-NEXT:    [[TMP28:%.*]] = extractelement <16 x float> [[TMP2]], i32 9
-; SKX-NEXT:    [[TMP29:%.*]] = fcmp fast ogt float [[TMP27]], [[TMP28]]
-; SKX-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP27]], float undef
-; SKX-NEXT:    [[TMP31:%.*]] = extractelement <16 x float> [[TMP2]], i32 10
-; SKX-NEXT:    [[TMP32:%.*]] = fcmp fast ogt float [[TMP30]], [[TMP31]]
-; SKX-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], float [[TMP30]], float undef
-; SKX-NEXT:    [[TMP34:%.*]] = extractelement <16 x float> [[TMP2]], i32 11
-; SKX-NEXT:    [[TMP35:%.*]] = fcmp fast ogt float [[TMP33]], [[TMP34]]
-; SKX-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], float [[TMP33]], float undef
-; SKX-NEXT:    [[TMP37:%.*]] = extractelement <16 x float> [[TMP2]], i32 12
-; SKX-NEXT:    [[TMP38:%.*]] = fcmp fast ogt float [[TMP36]], [[TMP37]]
-; SKX-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], float [[TMP36]], float undef
-; SKX-NEXT:    [[TMP40:%.*]] = extractelement <16 x float> [[TMP2]], i32 13
-; SKX-NEXT:    [[TMP41:%.*]] = fcmp fast ogt float [[TMP39]], [[TMP40]]
-; SKX-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], float [[TMP39]], float undef
-; SKX-NEXT:    [[TMP43:%.*]] = extractelement <16 x float> [[TMP2]], i32 14
-; SKX-NEXT:    [[TMP44:%.*]] = fcmp fast ogt float [[TMP42]], [[TMP43]]
-; SKX-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], float [[TMP42]], float undef
-; SKX-NEXT:    [[TMP46:%.*]] = extractelement <16 x float> [[TMP2]], i32 15
-; SKX-NEXT:    [[TMP47:%.*]] = fcmp fast ogt float [[TMP45]], [[TMP46]]
-; SKX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x float> [[TMP2]], <16 x float> [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT]], <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP2]], <16 x float> [[RDX_MINMAX_SELECT]], <16 x float> [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT3]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP5]], <16 x float> [[RDX_MINMAX_SELECT3]], <16 x float> [[RDX_SHUF4]]
-; SKX-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT6]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x float> [[RDX_MINMAX_SELECT6]], <16 x float> [[RDX_SHUF7]]
-; SKX-NEXT:    [[TMP48:%.*]] = extractelement <16 x float> [[RDX_MINMAX_SELECT9]], i32 0
-; SKX-NEXT:    [[TMP49:%.*]] = select i1 [[TMP47]], float [[TMP45]], float undef
-; SKX-NEXT:    ret float [[TMP48]]
+; SKX-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
+; SKX-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
+; SKX-NEXT:    [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
+; SKX-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
+; SKX-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
+; SKX-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
+; SKX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
+; SKX-NEXT:    [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
+; SKX-NEXT:    [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
+; SKX-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
+; SKX-NEXT:    [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
+; SKX-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
+; SKX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
+; SKX-NEXT:    [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
+; SKX-NEXT:    [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
+; SKX-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
+; SKX-NEXT:    [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
+; SKX-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
+; SKX-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
+; SKX-NEXT:    [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
+; SKX-NEXT:    [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
+; SKX-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
+; SKX-NEXT:    [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
+; SKX-NEXT:    [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]]
+; SKX-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]]
+; SKX-NEXT:    [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
+; SKX-NEXT:    [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]]
+; SKX-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]]
+; SKX-NEXT:    [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
+; SKX-NEXT:    [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]]
+; SKX-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]]
+; SKX-NEXT:    [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
+; SKX-NEXT:    [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]]
+; SKX-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]]
+; SKX-NEXT:    [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
+; SKX-NEXT:    [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]]
+; SKX-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]]
+; SKX-NEXT:    [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
+; SKX-NEXT:    [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]]
+; SKX-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]]
+; SKX-NEXT:    [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
+; SKX-NEXT:    [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]]
+; SKX-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]]
+; SKX-NEXT:    [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
+; SKX-NEXT:    [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]]
+; SKX-NEXT:    [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]]
+; SKX-NEXT:    ret float [[TMP47]]
 ;
   %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
   %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
@@ -1555,346 +1336,295 @@ define float @maxf32(float) {
 ; CHECK-NEXT:    ret float [[TMP95]]
 ;
 ; AVX-LABEL: @maxf32(
-; AVX-NEXT:    [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16
-; AVX-NEXT:    [[TMP3:%.*]] = extractelement <32 x float> [[TMP2]], i32 0
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <32 x float> [[TMP2]], i32 1
-; AVX-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
-; AVX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float undef, float undef
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <32 x float> [[TMP2]], i32 2
-; AVX-NEXT:    [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]]
-; AVX-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float undef
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <32 x float> [[TMP2]], i32 3
-; AVX-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]]
-; AVX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float undef
-; AVX-NEXT:    [[TMP13:%.*]] = extractelement <32 x float> [[TMP2]], i32 4
-; AVX-NEXT:    [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]]
-; AVX-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float undef
-; AVX-NEXT:    [[TMP16:%.*]] = extractelement <32 x float> [[TMP2]], i32 5
-; AVX-NEXT:    [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]]
-; AVX-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float undef
-; AVX-NEXT:    [[TMP19:%.*]] = extractelement <32 x float> [[TMP2]], i32 6
-; AVX-NEXT:    [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]]
-; AVX-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float undef
-; AVX-NEXT:    [[TMP22:%.*]] = extractelement <32 x float> [[TMP2]], i32 7
-; AVX-NEXT:    [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]]
-; AVX-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP21]], float undef
-; AVX-NEXT:    [[TMP25:%.*]] = extractelement <32 x float> [[TMP2]], i32 8
-; AVX-NEXT:    [[TMP26:%.*]] = fcmp fast ogt float [[TMP24]], [[TMP25]]
-; AVX-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], float [[TMP24]], float undef
-; AVX-NEXT:    [[TMP28:%.*]] = extractelement <32 x float> [[TMP2]], i32 9
-; AVX-NEXT:    [[TMP29:%.*]] = fcmp fast ogt float [[TMP27]], [[TMP28]]
-; AVX-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP27]], float undef
-; AVX-NEXT:    [[TMP31:%.*]] = extractelement <32 x float> [[TMP2]], i32 10
-; AVX-NEXT:    [[TMP32:%.*]] = fcmp fast ogt float [[TMP30]], [[TMP31]]
-; AVX-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], float [[TMP30]], float undef
-; AVX-NEXT:    [[TMP34:%.*]] = extractelement <32 x float> [[TMP2]], i32 11
-; AVX-NEXT:    [[TMP35:%.*]] = fcmp fast ogt float [[TMP33]], [[TMP34]]
-; AVX-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], float [[TMP33]], float undef
-; AVX-NEXT:    [[TMP37:%.*]] = extractelement <32 x float> [[TMP2]], i32 12
-; AVX-NEXT:    [[TMP38:%.*]] = fcmp fast ogt float [[TMP36]], [[TMP37]]
-; AVX-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], float [[TMP36]], float undef
-; AVX-NEXT:    [[TMP40:%.*]] = extractelement <32 x float> [[TMP2]], i32 13
-; AVX-NEXT:    [[TMP41:%.*]] = fcmp fast ogt float [[TMP39]], [[TMP40]]
-; AVX-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], float [[TMP39]], float undef
-; AVX-NEXT:    [[TMP43:%.*]] = extractelement <32 x float> [[TMP2]], i32 14
-; AVX-NEXT:    [[TMP44:%.*]] = fcmp fast ogt float [[TMP42]], [[TMP43]]
-; AVX-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], float [[TMP42]], float undef
-; AVX-NEXT:    [[TMP46:%.*]] = extractelement <32 x float> [[TMP2]], i32 15
-; AVX-NEXT:    [[TMP47:%.*]] = fcmp fast ogt float [[TMP45]], [[TMP46]]
-; AVX-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], float [[TMP45]], float undef
-; AVX-NEXT:    [[TMP49:%.*]] = extractelement <32 x float> [[TMP2]], i32 16
-; AVX-NEXT:    [[TMP50:%.*]] = fcmp fast ogt float [[TMP48]], [[TMP49]]
-; AVX-NEXT:    [[TMP51:%.*]] = select i1 [[TMP50]], float [[TMP48]], float undef
-; AVX-NEXT:    [[TMP52:%.*]] = extractelement <32 x float> [[TMP2]], i32 17
-; AVX-NEXT:    [[TMP53:%.*]] = fcmp fast ogt float [[TMP51]], [[TMP52]]
-; AVX-NEXT:    [[TMP54:%.*]] = select i1 [[TMP53]], float [[TMP51]], float undef
-; AVX-NEXT:    [[TMP55:%.*]] = extractelement <32 x float> [[TMP2]], i32 18
-; AVX-NEXT:    [[TMP56:%.*]] = fcmp fast ogt float [[TMP54]], [[TMP55]]
-; AVX-NEXT:    [[TMP57:%.*]] = select i1 [[TMP56]], float [[TMP54]], float undef
-; AVX-NEXT:    [[TMP58:%.*]] = extractelement <32 x float> [[TMP2]], i32 19
-; AVX-NEXT:    [[TMP59:%.*]] = fcmp fast ogt float [[TMP57]], [[TMP58]]
-; AVX-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], float [[TMP57]], float undef
-; AVX-NEXT:    [[TMP61:%.*]] = extractelement <32 x float> [[TMP2]], i32 20
-; AVX-NEXT:    [[TMP62:%.*]] = fcmp fast ogt float [[TMP60]], [[TMP61]]
-; AVX-NEXT:    [[TMP63:%.*]] = select i1 [[TMP62]], float [[TMP60]], float undef
-; AVX-NEXT:    [[TMP64:%.*]] = extractelement <32 x float> [[TMP2]], i32 21
-; AVX-NEXT:    [[TMP65:%.*]] = fcmp fast ogt float [[TMP63]], [[TMP64]]
-; AVX-NEXT:    [[TMP66:%.*]] = select i1 [[TMP65]], float [[TMP63]], float undef
-; AVX-NEXT:    [[TMP67:%.*]] = extractelement <32 x float> [[TMP2]], i32 22
-; AVX-NEXT:    [[TMP68:%.*]] = fcmp fast ogt float [[TMP66]], [[TMP67]]
-; AVX-NEXT:    [[TMP69:%.*]] = select i1 [[TMP68]], float [[TMP66]], float undef
-; AVX-NEXT:    [[TMP70:%.*]] = extractelement <32 x float> [[TMP2]], i32 23
-; AVX-NEXT:    [[TMP71:%.*]] = fcmp fast ogt float [[TMP69]], [[TMP70]]
-; AVX-NEXT:    [[TMP72:%.*]] = select i1 [[TMP71]], float [[TMP69]], float undef
-; AVX-NEXT:    [[TMP73:%.*]] = extractelement <32 x float> [[TMP2]], i32 24
-; AVX-NEXT:    [[TMP74:%.*]] = fcmp fast ogt float [[TMP72]], [[TMP73]]
-; AVX-NEXT:    [[TMP75:%.*]] = select i1 [[TMP74]], float [[TMP72]], float undef
-; AVX-NEXT:    [[TMP76:%.*]] = extractelement <32 x float> [[TMP2]], i32 25
-; AVX-NEXT:    [[TMP77:%.*]] = fcmp fast ogt float [[TMP75]], [[TMP76]]
-; AVX-NEXT:    [[TMP78:%.*]] = select i1 [[TMP77]], float [[TMP75]], float undef
-; AVX-NEXT:    [[TMP79:%.*]] = extractelement <32 x float> [[TMP2]], i32 26
-; AVX-NEXT:    [[TMP80:%.*]] = fcmp fast ogt float [[TMP78]], [[TMP79]]
-; AVX-NEXT:    [[TMP81:%.*]] = select i1 [[TMP80]], float [[TMP78]], float undef
-; AVX-NEXT:    [[TMP82:%.*]] = extractelement <32 x float> [[TMP2]], i32 27
-; AVX-NEXT:    [[TMP83:%.*]] = fcmp fast ogt float [[TMP81]], [[TMP82]]
-; AVX-NEXT:    [[TMP84:%.*]] = select i1 [[TMP83]], float [[TMP81]], float undef
-; AVX-NEXT:    [[TMP85:%.*]] = extractelement <32 x float> [[TMP2]], i32 28
-; AVX-NEXT:    [[TMP86:%.*]] = fcmp fast ogt float [[TMP84]], [[TMP85]]
-; AVX-NEXT:    [[TMP87:%.*]] = select i1 [[TMP86]], float [[TMP84]], float undef
-; AVX-NEXT:    [[TMP88:%.*]] = extractelement <32 x float> [[TMP2]], i32 29
-; AVX-NEXT:    [[TMP89:%.*]] = fcmp fast ogt float [[TMP87]], [[TMP88]]
-; AVX-NEXT:    [[TMP90:%.*]] = select i1 [[TMP89]], float [[TMP87]], float undef
-; AVX-NEXT:    [[TMP91:%.*]] = extractelement <32 x float> [[TMP2]], i32 30
-; AVX-NEXT:    [[TMP92:%.*]] = fcmp fast ogt float [[TMP90]], [[TMP91]]
-; AVX-NEXT:    [[TMP93:%.*]] = select i1 [[TMP92]], float [[TMP90]], float undef
-; AVX-NEXT:    [[TMP94:%.*]] = extractelement <32 x float> [[TMP2]], i32 31
-; AVX-NEXT:    [[TMP95:%.*]] = fcmp fast ogt float [[TMP93]], [[TMP94]]
-; AVX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP]], <32 x float> [[TMP2]], <32 x float> [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP2]], <32 x float> [[RDX_MINMAX_SELECT]], <32 x float> [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT3]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP5]], <32 x float> [[RDX_MINMAX_SELECT3]], <32 x float> [[RDX_SHUF4]]
-; AVX-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT6]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP8]], <32 x float> [[RDX_MINMAX_SELECT6]], <32 x float> [[RDX_SHUF7]]
-; AVX-NEXT:    [[RDX_SHUF10:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT9]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP11:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT9]], [[RDX_SHUF10]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT12:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP11]], <32 x float> [[RDX_MINMAX_SELECT9]], <32 x float> [[RDX_SHUF10]]
-; AVX-NEXT:    [[TMP96:%.*]] = extractelement <32 x float> [[RDX_MINMAX_SELECT12]], i32 0
-; AVX-NEXT:    [[TMP97:%.*]] = select i1 [[TMP95]], float [[TMP93]], float undef
-; AVX-NEXT:    ret float [[TMP96]]
+; AVX-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
+; AVX-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
+; AVX-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
+; AVX-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
+; AVX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
+; AVX-NEXT:    [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
+; AVX-NEXT:    [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
+; AVX-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
+; AVX-NEXT:    [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
+; AVX-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
+; AVX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
+; AVX-NEXT:    [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
+; AVX-NEXT:    [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
+; AVX-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
+; AVX-NEXT:    [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
+; AVX-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
+; AVX-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
+; AVX-NEXT:    [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
+; AVX-NEXT:    [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
+; AVX-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
+; AVX-NEXT:    [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
+; AVX-NEXT:    [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]]
+; AVX-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]]
+; AVX-NEXT:    [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
+; AVX-NEXT:    [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]]
+; AVX-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]]
+; AVX-NEXT:    [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
+; AVX-NEXT:    [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]]
+; AVX-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]]
+; AVX-NEXT:    [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
+; AVX-NEXT:    [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]]
+; AVX-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]]
+; AVX-NEXT:    [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
+; AVX-NEXT:    [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]]
+; AVX-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]]
+; AVX-NEXT:    [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
+; AVX-NEXT:    [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]]
+; AVX-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]]
+; AVX-NEXT:    [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
+; AVX-NEXT:    [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]]
+; AVX-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]]
+; AVX-NEXT:    [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
+; AVX-NEXT:    [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]]
+; AVX-NEXT:    [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]]
+; AVX-NEXT:    [[TMP48:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 16), align 16
+; AVX-NEXT:    [[TMP49:%.*]] = fcmp fast ogt float [[TMP47]], [[TMP48]]
+; AVX-NEXT:    [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP47]], float [[TMP48]]
+; AVX-NEXT:    [[TMP51:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 17), align 4
+; AVX-NEXT:    [[TMP52:%.*]] = fcmp fast ogt float [[TMP50]], [[TMP51]]
+; AVX-NEXT:    [[TMP53:%.*]] = select i1 [[TMP52]], float [[TMP50]], float [[TMP51]]
+; AVX-NEXT:    [[TMP54:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 18), align 8
+; AVX-NEXT:    [[TMP55:%.*]] = fcmp fast ogt float [[TMP53]], [[TMP54]]
+; AVX-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], float [[TMP53]], float [[TMP54]]
+; AVX-NEXT:    [[TMP57:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 19), align 4
+; AVX-NEXT:    [[TMP58:%.*]] = fcmp fast ogt float [[TMP56]], [[TMP57]]
+; AVX-NEXT:    [[TMP59:%.*]] = select i1 [[TMP58]], float [[TMP56]], float [[TMP57]]
+; AVX-NEXT:    [[TMP60:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 20), align 16
+; AVX-NEXT:    [[TMP61:%.*]] = fcmp fast ogt float [[TMP59]], [[TMP60]]
+; AVX-NEXT:    [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP59]], float [[TMP60]]
+; AVX-NEXT:    [[TMP63:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 21), align 4
+; AVX-NEXT:    [[TMP64:%.*]] = fcmp fast ogt float [[TMP62]], [[TMP63]]
+; AVX-NEXT:    [[TMP65:%.*]] = select i1 [[TMP64]], float [[TMP62]], float [[TMP63]]
+; AVX-NEXT:    [[TMP66:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 22), align 8
+; AVX-NEXT:    [[TMP67:%.*]] = fcmp fast ogt float [[TMP65]], [[TMP66]]
+; AVX-NEXT:    [[TMP68:%.*]] = select i1 [[TMP67]], float [[TMP65]], float [[TMP66]]
+; AVX-NEXT:    [[TMP69:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 23), align 4
+; AVX-NEXT:    [[TMP70:%.*]] = fcmp fast ogt float [[TMP68]], [[TMP69]]
+; AVX-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], float [[TMP68]], float [[TMP69]]
+; AVX-NEXT:    [[TMP72:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 24), align 16
+; AVX-NEXT:    [[TMP73:%.*]] = fcmp fast ogt float [[TMP71]], [[TMP72]]
+; AVX-NEXT:    [[TMP74:%.*]] = select i1 [[TMP73]], float [[TMP71]], float [[TMP72]]
+; AVX-NEXT:    [[TMP75:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 25), align 4
+; AVX-NEXT:    [[TMP76:%.*]] = fcmp fast ogt float [[TMP74]], [[TMP75]]
+; AVX-NEXT:    [[TMP77:%.*]] = select i1 [[TMP76]], float [[TMP74]], float [[TMP75]]
+; AVX-NEXT:    [[TMP78:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 26), align 8
+; AVX-NEXT:    [[TMP79:%.*]] = fcmp fast ogt float [[TMP77]], [[TMP78]]
+; AVX-NEXT:    [[TMP80:%.*]] = select i1 [[TMP79]], float [[TMP77]], float [[TMP78]]
+; AVX-NEXT:    [[TMP81:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 27), align 4
+; AVX-NEXT:    [[TMP82:%.*]] = fcmp fast ogt float [[TMP80]], [[TMP81]]
+; AVX-NEXT:    [[TMP83:%.*]] = select i1 [[TMP82]], float [[TMP80]], float [[TMP81]]
+; AVX-NEXT:    [[TMP84:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 28), align 16
+; AVX-NEXT:    [[TMP85:%.*]] = fcmp fast ogt float [[TMP83]], [[TMP84]]
+; AVX-NEXT:    [[TMP86:%.*]] = select i1 [[TMP85]], float [[TMP83]], float [[TMP84]]
+; AVX-NEXT:    [[TMP87:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 29), align 4
+; AVX-NEXT:    [[TMP88:%.*]] = fcmp fast ogt float [[TMP86]], [[TMP87]]
+; AVX-NEXT:    [[TMP89:%.*]] = select i1 [[TMP88]], float [[TMP86]], float [[TMP87]]
+; AVX-NEXT:    [[TMP90:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 30), align 8
+; AVX-NEXT:    [[TMP91:%.*]] = fcmp fast ogt float [[TMP89]], [[TMP90]]
+; AVX-NEXT:    [[TMP92:%.*]] = select i1 [[TMP91]], float [[TMP89]], float [[TMP90]]
+; AVX-NEXT:    [[TMP93:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4
+; AVX-NEXT:    [[TMP94:%.*]] = fcmp fast ogt float [[TMP92]], [[TMP93]]
+; AVX-NEXT:    [[TMP95:%.*]] = select i1 [[TMP94]], float [[TMP92]], float [[TMP93]]
+; AVX-NEXT:    ret float [[TMP95]]
 ;
 ; AVX2-LABEL: @maxf32(
-; AVX2-NEXT:    [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16
-; AVX2-NEXT:    [[TMP3:%.*]] = extractelement <32 x float> [[TMP2]], i32 0
-; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <32 x float> [[TMP2]], i32 1
-; AVX2-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
-; AVX2-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float undef, float undef
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <32 x float> [[TMP2]], i32 2
-; AVX2-NEXT:    [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]]
-; AVX2-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float undef
-; AVX2-NEXT:    [[TMP10:%.*]] = extractelement <32 x float> [[TMP2]], i32 3
-; AVX2-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]]
-; AVX2-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float undef
-; AVX2-NEXT:    [[TMP13:%.*]] = extractelement <32 x float> [[TMP2]], i32 4
-; AVX2-NEXT:    [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]]
-; AVX2-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float undef
-; AVX2-NEXT:    [[TMP16:%.*]] = extractelement <32 x float> [[TMP2]], i32 5
-; AVX2-NEXT:    [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]]
-; AVX2-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float undef
-; AVX2-NEXT:    [[TMP19:%.*]] = extractelement <32 x float> [[TMP2]], i32 6
-; AVX2-NEXT:    [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]]
-; AVX2-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float undef
-; AVX2-NEXT:    [[TMP22:%.*]] = extractelement <32 x float> [[TMP2]], i32 7
-; AVX2-NEXT:    [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]]
-; AVX2-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP21]], float undef
-; AVX2-NEXT:    [[TMP25:%.*]] = extractelement <32 x float> [[TMP2]], i32 8
-; AVX2-NEXT:    [[TMP26:%.*]] = fcmp fast ogt float [[TMP24]], [[TMP25]]
-; AVX2-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], float [[TMP24]], float undef
-; AVX2-NEXT:    [[TMP28:%.*]] = extractelement <32 x float> [[TMP2]], i32 9
-; AVX2-NEXT:    [[TMP29:%.*]] = fcmp fast ogt float [[TMP27]], [[TMP28]]
-; AVX2-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP27]], float undef
-; AVX2-NEXT:    [[TMP31:%.*]] = extractelement <32 x float> [[TMP2]], i32 10
-; AVX2-NEXT:    [[TMP32:%.*]] = fcmp fast ogt float [[TMP30]], [[TMP31]]
-; AVX2-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], float [[TMP30]], float undef
-; AVX2-NEXT:    [[TMP34:%.*]] = extractelement <32 x float> [[TMP2]], i32 11
-; AVX2-NEXT:    [[TMP35:%.*]] = fcmp fast ogt float [[TMP33]], [[TMP34]]
-; AVX2-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], float [[TMP33]], float undef
-; AVX2-NEXT:    [[TMP37:%.*]] = extractelement <32 x float> [[TMP2]], i32 12
-; AVX2-NEXT:    [[TMP38:%.*]] = fcmp fast ogt float [[TMP36]], [[TMP37]]
-; AVX2-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], float [[TMP36]], float undef
-; AVX2-NEXT:    [[TMP40:%.*]] = extractelement <32 x float> [[TMP2]], i32 13
-; AVX2-NEXT:    [[TMP41:%.*]] = fcmp fast ogt float [[TMP39]], [[TMP40]]
-; AVX2-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], float [[TMP39]], float undef
-; AVX2-NEXT:    [[TMP43:%.*]] = extractelement <32 x float> [[TMP2]], i32 14
-; AVX2-NEXT:    [[TMP44:%.*]] = fcmp fast ogt float [[TMP42]], [[TMP43]]
-; AVX2-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], float [[TMP42]], float undef
-; AVX2-NEXT:    [[TMP46:%.*]] = extractelement <32 x float> [[TMP2]], i32 15
-; AVX2-NEXT:    [[TMP47:%.*]] = fcmp fast ogt float [[TMP45]], [[TMP46]]
-; AVX2-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], float [[TMP45]], float undef
-; AVX2-NEXT:    [[TMP49:%.*]] = extractelement <32 x float> [[TMP2]], i32 16
-; AVX2-NEXT:    [[TMP50:%.*]] = fcmp fast ogt float [[TMP48]], [[TMP49]]
-; AVX2-NEXT:    [[TMP51:%.*]] = select i1 [[TMP50]], float [[TMP48]], float undef
-; AVX2-NEXT:    [[TMP52:%.*]] = extractelement <32 x float> [[TMP2]], i32 17
-; AVX2-NEXT:    [[TMP53:%.*]] = fcmp fast ogt float [[TMP51]], [[TMP52]]
-; AVX2-NEXT:    [[TMP54:%.*]] = select i1 [[TMP53]], float [[TMP51]], float undef
-; AVX2-NEXT:    [[TMP55:%.*]] = extractelement <32 x float> [[TMP2]], i32 18
-; AVX2-NEXT:    [[TMP56:%.*]] = fcmp fast ogt float [[TMP54]], [[TMP55]]
-; AVX2-NEXT:    [[TMP57:%.*]] = select i1 [[TMP56]], float [[TMP54]], float undef
-; AVX2-NEXT:    [[TMP58:%.*]] = extractelement <32 x float> [[TMP2]], i32 19
-; AVX2-NEXT:    [[TMP59:%.*]] = fcmp fast ogt float [[TMP57]], [[TMP58]]
-; AVX2-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], float [[TMP57]], float undef
-; AVX2-NEXT:    [[TMP61:%.*]] = extractelement <32 x float> [[TMP2]], i32 20
-; AVX2-NEXT:    [[TMP62:%.*]] = fcmp fast ogt float [[TMP60]], [[TMP61]]
-; AVX2-NEXT:    [[TMP63:%.*]] = select i1 [[TMP62]], float [[TMP60]], float undef
-; AVX2-NEXT:    [[TMP64:%.*]] = extractelement <32 x float> [[TMP2]], i32 21
-; AVX2-NEXT:    [[TMP65:%.*]] = fcmp fast ogt float [[TMP63]], [[TMP64]]
-; AVX2-NEXT:    [[TMP66:%.*]] = select i1 [[TMP65]], float [[TMP63]], float undef
-; AVX2-NEXT:    [[TMP67:%.*]] = extractelement <32 x float> [[TMP2]], i32 22
-; AVX2-NEXT:    [[TMP68:%.*]] = fcmp fast ogt float [[TMP66]], [[TMP67]]
-; AVX2-NEXT:    [[TMP69:%.*]] = select i1 [[TMP68]], float [[TMP66]], float undef
-; AVX2-NEXT:    [[TMP70:%.*]] = extractelement <32 x float> [[TMP2]], i32 23
-; AVX2-NEXT:    [[TMP71:%.*]] = fcmp fast ogt float [[TMP69]], [[TMP70]]
-; AVX2-NEXT:    [[TMP72:%.*]] = select i1 [[TMP71]], float [[TMP69]], float undef
-; AVX2-NEXT:    [[TMP73:%.*]] = extractelement <32 x float> [[TMP2]], i32 24
-; AVX2-NEXT:    [[TMP74:%.*]] = fcmp fast ogt float [[TMP72]], [[TMP73]]
-; AVX2-NEXT:    [[TMP75:%.*]] = select i1 [[TMP74]], float [[TMP72]], float undef
-; AVX2-NEXT:    [[TMP76:%.*]] = extractelement <32 x float> [[TMP2]], i32 25
-; AVX2-NEXT:    [[TMP77:%.*]] = fcmp fast ogt float [[TMP75]], [[TMP76]]
-; AVX2-NEXT:    [[TMP78:%.*]] = select i1 [[TMP77]], float [[TMP75]], float undef
-; AVX2-NEXT:    [[TMP79:%.*]] = extractelement <32 x float> [[TMP2]], i32 26
-; AVX2-NEXT:    [[TMP80:%.*]] = fcmp fast ogt float [[TMP78]], [[TMP79]]
-; AVX2-NEXT:    [[TMP81:%.*]] = select i1 [[TMP80]], float [[TMP78]], float undef
-; AVX2-NEXT:    [[TMP82:%.*]] = extractelement <32 x float> [[TMP2]], i32 27
-; AVX2-NEXT:    [[TMP83:%.*]] = fcmp fast ogt float [[TMP81]], [[TMP82]]
-; AVX2-NEXT:    [[TMP84:%.*]] = select i1 [[TMP83]], float [[TMP81]], float undef
-; AVX2-NEXT:    [[TMP85:%.*]] = extractelement <32 x float> [[TMP2]], i32 28
-; AVX2-NEXT:    [[TMP86:%.*]] = fcmp fast ogt float [[TMP84]], [[TMP85]]
-; AVX2-NEXT:    [[TMP87:%.*]] = select i1 [[TMP86]], float [[TMP84]], float undef
-; AVX2-NEXT:    [[TMP88:%.*]] = extractelement <32 x float> [[TMP2]], i32 29
-; AVX2-NEXT:    [[TMP89:%.*]] = fcmp fast ogt float [[TMP87]], [[TMP88]]
-; AVX2-NEXT:    [[TMP90:%.*]] = select i1 [[TMP89]], float [[TMP87]], float undef
-; AVX2-NEXT:    [[TMP91:%.*]] = extractelement <32 x float> [[TMP2]], i32 30
-; AVX2-NEXT:    [[TMP92:%.*]] = fcmp fast ogt float [[TMP90]], [[TMP91]]
-; AVX2-NEXT:    [[TMP93:%.*]] = select i1 [[TMP92]], float [[TMP90]], float undef
-; AVX2-NEXT:    [[TMP94:%.*]] = extractelement <32 x float> [[TMP2]], i32 31
-; AVX2-NEXT:    [[TMP95:%.*]] = fcmp fast ogt float [[TMP93]], [[TMP94]]
-; AVX2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP]], <32 x float> [[TMP2]], <32 x float> [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP2]], <32 x float> [[RDX_MINMAX_SELECT]], <32 x float> [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT3]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP5]], <32 x float> [[RDX_MINMAX_SELECT3]], <32 x float> [[RDX_SHUF4]]
-; AVX2-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT6]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP8]], <32 x float> [[RDX_MINMAX_SELECT6]], <32 x float> [[RDX_SHUF7]]
-; AVX2-NEXT:    [[RDX_SHUF10:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT9]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP11:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT9]], [[RDX_SHUF10]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT12:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP11]], <32 x float> [[RDX_MINMAX_SELECT9]], <32 x float> [[RDX_SHUF10]]
-; AVX2-NEXT:    [[TMP96:%.*]] = extractelement <32 x float> [[RDX_MINMAX_SELECT12]], i32 0
-; AVX2-NEXT:    [[TMP97:%.*]] = select i1 [[TMP95]], float [[TMP93]], float undef
-; AVX2-NEXT:    ret float [[TMP96]]
+; AVX2-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
+; AVX2-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
+; AVX2-NEXT:    [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
+; AVX2-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
+; AVX2-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
+; AVX2-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
+; AVX2-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
+; AVX2-NEXT:    [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
+; AVX2-NEXT:    [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
+; AVX2-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
+; AVX2-NEXT:    [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
+; AVX2-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
+; AVX2-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
+; AVX2-NEXT:    [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
+; AVX2-NEXT:    [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
+; AVX2-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
+; AVX2-NEXT:    [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
+; AVX2-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
+; AVX2-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
+; AVX2-NEXT:    [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
+; AVX2-NEXT:    [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
+; AVX2-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
+; AVX2-NEXT:    [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
+; AVX2-NEXT:    [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]]
+; AVX2-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]]
+; AVX2-NEXT:    [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
+; AVX2-NEXT:    [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]]
+; AVX2-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]]
+; AVX2-NEXT:    [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
+; AVX2-NEXT:    [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]]
+; AVX2-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]]
+; AVX2-NEXT:    [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
+; AVX2-NEXT:    [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]]
+; AVX2-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]]
+; AVX2-NEXT:    [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
+; AVX2-NEXT:    [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]]
+; AVX2-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]]
+; AVX2-NEXT:    [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
+; AVX2-NEXT:    [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]]
+; AVX2-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]]
+; AVX2-NEXT:    [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
+; AVX2-NEXT:    [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]]
+; AVX2-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]]
+; AVX2-NEXT:    [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
+; AVX2-NEXT:    [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]]
+; AVX2-NEXT:    [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]]
+; AVX2-NEXT:    [[TMP48:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 16), align 16
+; AVX2-NEXT:    [[TMP49:%.*]] = fcmp fast ogt float [[TMP47]], [[TMP48]]
+; AVX2-NEXT:    [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP47]], float [[TMP48]]
+; AVX2-NEXT:    [[TMP51:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 17), align 4
+; AVX2-NEXT:    [[TMP52:%.*]] = fcmp fast ogt float [[TMP50]], [[TMP51]]
+; AVX2-NEXT:    [[TMP53:%.*]] = select i1 [[TMP52]], float [[TMP50]], float [[TMP51]]
+; AVX2-NEXT:    [[TMP54:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 18), align 8
+; AVX2-NEXT:    [[TMP55:%.*]] = fcmp fast ogt float [[TMP53]], [[TMP54]]
+; AVX2-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], float [[TMP53]], float [[TMP54]]
+; AVX2-NEXT:    [[TMP57:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 19), align 4
+; AVX2-NEXT:    [[TMP58:%.*]] = fcmp fast ogt float [[TMP56]], [[TMP57]]
+; AVX2-NEXT:    [[TMP59:%.*]] = select i1 [[TMP58]], float [[TMP56]], float [[TMP57]]
+; AVX2-NEXT:    [[TMP60:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 20), align 16
+; AVX2-NEXT:    [[TMP61:%.*]] = fcmp fast ogt float [[TMP59]], [[TMP60]]
+; AVX2-NEXT:    [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP59]], float [[TMP60]]
+; AVX2-NEXT:    [[TMP63:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 21), align 4
+; AVX2-NEXT:    [[TMP64:%.*]] = fcmp fast ogt float [[TMP62]], [[TMP63]]
+; AVX2-NEXT:    [[TMP65:%.*]] = select i1 [[TMP64]], float [[TMP62]], float [[TMP63]]
+; AVX2-NEXT:    [[TMP66:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 22), align 8
+; AVX2-NEXT:    [[TMP67:%.*]] = fcmp fast ogt float [[TMP65]], [[TMP66]]
+; AVX2-NEXT:    [[TMP68:%.*]] = select i1 [[TMP67]], float [[TMP65]], float [[TMP66]]
+; AVX2-NEXT:    [[TMP69:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 23), align 4
+; AVX2-NEXT:    [[TMP70:%.*]] = fcmp fast ogt float [[TMP68]], [[TMP69]]
+; AVX2-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], float [[TMP68]], float [[TMP69]]
+; AVX2-NEXT:    [[TMP72:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 24), align 16
+; AVX2-NEXT:    [[TMP73:%.*]] = fcmp fast ogt float [[TMP71]], [[TMP72]]
+; AVX2-NEXT:    [[TMP74:%.*]] = select i1 [[TMP73]], float [[TMP71]], float [[TMP72]]
+; AVX2-NEXT:    [[TMP75:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 25), align 4
+; AVX2-NEXT:    [[TMP76:%.*]] = fcmp fast ogt float [[TMP74]], [[TMP75]]
+; AVX2-NEXT:    [[TMP77:%.*]] = select i1 [[TMP76]], float [[TMP74]], float [[TMP75]]
+; AVX2-NEXT:    [[TMP78:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 26), align 8
+; AVX2-NEXT:    [[TMP79:%.*]] = fcmp fast ogt float [[TMP77]], [[TMP78]]
+; AVX2-NEXT:    [[TMP80:%.*]] = select i1 [[TMP79]], float [[TMP77]], float [[TMP78]]
+; AVX2-NEXT:    [[TMP81:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 27), align 4
+; AVX2-NEXT:    [[TMP82:%.*]] = fcmp fast ogt float [[TMP80]], [[TMP81]]
+; AVX2-NEXT:    [[TMP83:%.*]] = select i1 [[TMP82]], float [[TMP80]], float [[TMP81]]
+; AVX2-NEXT:    [[TMP84:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 28), align 16
+; AVX2-NEXT:    [[TMP85:%.*]] = fcmp fast ogt float [[TMP83]], [[TMP84]]
+; AVX2-NEXT:    [[TMP86:%.*]] = select i1 [[TMP85]], float [[TMP83]], float [[TMP84]]
+; AVX2-NEXT:    [[TMP87:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 29), align 4
+; AVX2-NEXT:    [[TMP88:%.*]] = fcmp fast ogt float [[TMP86]], [[TMP87]]
+; AVX2-NEXT:    [[TMP89:%.*]] = select i1 [[TMP88]], float [[TMP86]], float [[TMP87]]
+; AVX2-NEXT:    [[TMP90:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 30), align 8
+; AVX2-NEXT:    [[TMP91:%.*]] = fcmp fast ogt float [[TMP89]], [[TMP90]]
+; AVX2-NEXT:    [[TMP92:%.*]] = select i1 [[TMP91]], float [[TMP89]], float [[TMP90]]
+; AVX2-NEXT:    [[TMP93:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4
+; AVX2-NEXT:    [[TMP94:%.*]] = fcmp fast ogt float [[TMP92]], [[TMP93]]
+; AVX2-NEXT:    [[TMP95:%.*]] = select i1 [[TMP94]], float [[TMP92]], float [[TMP93]]
+; AVX2-NEXT:    ret float [[TMP95]]
 ;
 ; SKX-LABEL: @maxf32(
-; SKX-NEXT:    [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16
-; SKX-NEXT:    [[TMP3:%.*]] = extractelement <32 x float> [[TMP2]], i32 0
-; SKX-NEXT:    [[TMP4:%.*]] = extractelement <32 x float> [[TMP2]], i32 1
-; SKX-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
-; SKX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float undef, float undef
-; SKX-NEXT:    [[TMP7:%.*]] = extractelement <32 x float> [[TMP2]], i32 2
-; SKX-NEXT:    [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]]
-; SKX-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float undef
-; SKX-NEXT:    [[TMP10:%.*]] = extractelement <32 x float> [[TMP2]], i32 3
-; SKX-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]]
-; SKX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float undef
-; SKX-NEXT:    [[TMP13:%.*]] = extractelement <32 x float> [[TMP2]], i32 4
-; SKX-NEXT:    [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]]
-; SKX-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float undef
-; SKX-NEXT:    [[TMP16:%.*]] = extractelement <32 x float> [[TMP2]], i32 5
-; SKX-NEXT:    [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]]
-; SKX-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float undef
-; SKX-NEXT:    [[TMP19:%.*]] = extractelement <32 x float> [[TMP2]], i32 6
-; SKX-NEXT:    [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]]
-; SKX-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float undef
-; SKX-NEXT:    [[TMP22:%.*]] = extractelement <32 x float> [[TMP2]], i32 7
-; SKX-NEXT:    [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]]
-; SKX-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP21]], float undef
-; SKX-NEXT:    [[TMP25:%.*]] = extractelement <32 x float> [[TMP2]], i32 8
-; SKX-NEXT:    [[TMP26:%.*]] = fcmp fast ogt float [[TMP24]], [[TMP25]]
-; SKX-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], float [[TMP24]], float undef
-; SKX-NEXT:    [[TMP28:%.*]] = extractelement <32 x float> [[TMP2]], i32 9
-; SKX-NEXT:    [[TMP29:%.*]] = fcmp fast ogt float [[TMP27]], [[TMP28]]
-; SKX-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP27]], float undef
-; SKX-NEXT:    [[TMP31:%.*]] = extractelement <32 x float> [[TMP2]], i32 10
-; SKX-NEXT:    [[TMP32:%.*]] = fcmp fast ogt float [[TMP30]], [[TMP31]]
-; SKX-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], float [[TMP30]], float undef
-; SKX-NEXT:    [[TMP34:%.*]] = extractelement <32 x float> [[TMP2]], i32 11
-; SKX-NEXT:    [[TMP35:%.*]] = fcmp fast ogt float [[TMP33]], [[TMP34]]
-; SKX-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], float [[TMP33]], float undef
-; SKX-NEXT:    [[TMP37:%.*]] = extractelement <32 x float> [[TMP2]], i32 12
-; SKX-NEXT:    [[TMP38:%.*]] = fcmp fast ogt float [[TMP36]], [[TMP37]]
-; SKX-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], float [[TMP36]], float undef
-; SKX-NEXT:    [[TMP40:%.*]] = extractelement <32 x float> [[TMP2]], i32 13
-; SKX-NEXT:    [[TMP41:%.*]] = fcmp fast ogt float [[TMP39]], [[TMP40]]
-; SKX-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], float [[TMP39]], float undef
-; SKX-NEXT:    [[TMP43:%.*]] = extractelement <32 x float> [[TMP2]], i32 14
-; SKX-NEXT:    [[TMP44:%.*]] = fcmp fast ogt float [[TMP42]], [[TMP43]]
-; SKX-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], float [[TMP42]], float undef
-; SKX-NEXT:    [[TMP46:%.*]] = extractelement <32 x float> [[TMP2]], i32 15
-; SKX-NEXT:    [[TMP47:%.*]] = fcmp fast ogt float [[TMP45]], [[TMP46]]
-; SKX-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], float [[TMP45]], float undef
-; SKX-NEXT:    [[TMP49:%.*]] = extractelement <32 x float> [[TMP2]], i32 16
-; SKX-NEXT:    [[TMP50:%.*]] = fcmp fast ogt float [[TMP48]], [[TMP49]]
-; SKX-NEXT:    [[TMP51:%.*]] = select i1 [[TMP50]], float [[TMP48]], float undef
-; SKX-NEXT:    [[TMP52:%.*]] = extractelement <32 x float> [[TMP2]], i32 17
-; SKX-NEXT:    [[TMP53:%.*]] = fcmp fast ogt float [[TMP51]], [[TMP52]]
-; SKX-NEXT:    [[TMP54:%.*]] = select i1 [[TMP53]], float [[TMP51]], float undef
-; SKX-NEXT:    [[TMP55:%.*]] = extractelement <32 x float> [[TMP2]], i32 18
-; SKX-NEXT:    [[TMP56:%.*]] = fcmp fast ogt float [[TMP54]], [[TMP55]]
-; SKX-NEXT:    [[TMP57:%.*]] = select i1 [[TMP56]], float [[TMP54]], float undef
-; SKX-NEXT:    [[TMP58:%.*]] = extractelement <32 x float> [[TMP2]], i32 19
-; SKX-NEXT:    [[TMP59:%.*]] = fcmp fast ogt float [[TMP57]], [[TMP58]]
-; SKX-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], float [[TMP57]], float undef
-; SKX-NEXT:    [[TMP61:%.*]] = extractelement <32 x float> [[TMP2]], i32 20
-; SKX-NEXT:    [[TMP62:%.*]] = fcmp fast ogt float [[TMP60]], [[TMP61]]
-; SKX-NEXT:    [[TMP63:%.*]] = select i1 [[TMP62]], float [[TMP60]], float undef
-; SKX-NEXT:    [[TMP64:%.*]] = extractelement <32 x float> [[TMP2]], i32 21
-; SKX-NEXT:    [[TMP65:%.*]] = fcmp fast ogt float [[TMP63]], [[TMP64]]
-; SKX-NEXT:    [[TMP66:%.*]] = select i1 [[TMP65]], float [[TMP63]], float undef
-; SKX-NEXT:    [[TMP67:%.*]] = extractelement <32 x float> [[TMP2]], i32 22
-; SKX-NEXT:    [[TMP68:%.*]] = fcmp fast ogt float [[TMP66]], [[TMP67]]
-; SKX-NEXT:    [[TMP69:%.*]] = select i1 [[TMP68]], float [[TMP66]], float undef
-; SKX-NEXT:    [[TMP70:%.*]] = extractelement <32 x float> [[TMP2]], i32 23
-; SKX-NEXT:    [[TMP71:%.*]] = fcmp fast ogt float [[TMP69]], [[TMP70]]
-; SKX-NEXT:    [[TMP72:%.*]] = select i1 [[TMP71]], float [[TMP69]], float undef
-; SKX-NEXT:    [[TMP73:%.*]] = extractelement <32 x float> [[TMP2]], i32 24
-; SKX-NEXT:    [[TMP74:%.*]] = fcmp fast ogt float [[TMP72]], [[TMP73]]
-; SKX-NEXT:    [[TMP75:%.*]] = select i1 [[TMP74]], float [[TMP72]], float undef
-; SKX-NEXT:    [[TMP76:%.*]] = extractelement <32 x float> [[TMP2]], i32 25
-; SKX-NEXT:    [[TMP77:%.*]] = fcmp fast ogt float [[TMP75]], [[TMP76]]
-; SKX-NEXT:    [[TMP78:%.*]] = select i1 [[TMP77]], float [[TMP75]], float undef
-; SKX-NEXT:    [[TMP79:%.*]] = extractelement <32 x float> [[TMP2]], i32 26
-; SKX-NEXT:    [[TMP80:%.*]] = fcmp fast ogt float [[TMP78]], [[TMP79]]
-; SKX-NEXT:    [[TMP81:%.*]] = select i1 [[TMP80]], float [[TMP78]], float undef
-; SKX-NEXT:    [[TMP82:%.*]] = extractelement <32 x float> [[TMP2]], i32 27
-; SKX-NEXT:    [[TMP83:%.*]] = fcmp fast ogt float [[TMP81]], [[TMP82]]
-; SKX-NEXT:    [[TMP84:%.*]] = select i1 [[TMP83]], float [[TMP81]], float undef
-; SKX-NEXT:    [[TMP85:%.*]] = extractelement <32 x float> [[TMP2]], i32 28
-; SKX-NEXT:    [[TMP86:%.*]] = fcmp fast ogt float [[TMP84]], [[TMP85]]
-; SKX-NEXT:    [[TMP87:%.*]] = select i1 [[TMP86]], float [[TMP84]], float undef
-; SKX-NEXT:    [[TMP88:%.*]] = extractelement <32 x float> [[TMP2]], i32 29
-; SKX-NEXT:    [[TMP89:%.*]] = fcmp fast ogt float [[TMP87]], [[TMP88]]
-; SKX-NEXT:    [[TMP90:%.*]] = select i1 [[TMP89]], float [[TMP87]], float undef
-; SKX-NEXT:    [[TMP91:%.*]] = extractelement <32 x float> [[TMP2]], i32 30
-; SKX-NEXT:    [[TMP92:%.*]] = fcmp fast ogt float [[TMP90]], [[TMP91]]
-; SKX-NEXT:    [[TMP93:%.*]] = select i1 [[TMP92]], float [[TMP90]], float undef
-; SKX-NEXT:    [[TMP94:%.*]] = extractelement <32 x float> [[TMP2]], i32 31
-; SKX-NEXT:    [[TMP95:%.*]] = fcmp fast ogt float [[TMP93]], [[TMP94]]
-; SKX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP]], <32 x float> [[TMP2]], <32 x float> [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP2]], <32 x float> [[RDX_MINMAX_SELECT]], <32 x float> [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT3]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP5]], <32 x float> [[RDX_MINMAX_SELECT3]], <32 x float> [[RDX_SHUF4]]
-; SKX-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT6]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP8]], <32 x float> [[RDX_MINMAX_SELECT6]], <32 x float> [[RDX_SHUF7]]
-; SKX-NEXT:    [[RDX_SHUF10:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT9]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP11:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT9]], [[RDX_SHUF10]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT12:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP11]], <32 x float> [[RDX_MINMAX_SELECT9]], <32 x float> [[RDX_SHUF10]]
-; SKX-NEXT:    [[TMP96:%.*]] = extractelement <32 x float> [[RDX_MINMAX_SELECT12]], i32 0
-; SKX-NEXT:    [[TMP97:%.*]] = select i1 [[TMP95]], float [[TMP93]], float undef
-; SKX-NEXT:    ret float [[TMP96]]
+; SKX-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
+; SKX-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
+; SKX-NEXT:    [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
+; SKX-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
+; SKX-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
+; SKX-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
+; SKX-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
+; SKX-NEXT:    [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
+; SKX-NEXT:    [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
+; SKX-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
+; SKX-NEXT:    [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
+; SKX-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
+; SKX-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
+; SKX-NEXT:    [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
+; SKX-NEXT:    [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
+; SKX-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
+; SKX-NEXT:    [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
+; SKX-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
+; SKX-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
+; SKX-NEXT:    [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
+; SKX-NEXT:    [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
+; SKX-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
+; SKX-NEXT:    [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
+; SKX-NEXT:    [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]]
+; SKX-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]]
+; SKX-NEXT:    [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
+; SKX-NEXT:    [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]]
+; SKX-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]]
+; SKX-NEXT:    [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
+; SKX-NEXT:    [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]]
+; SKX-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]]
+; SKX-NEXT:    [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
+; SKX-NEXT:    [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]]
+; SKX-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]]
+; SKX-NEXT:    [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
+; SKX-NEXT:    [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]]
+; SKX-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]]
+; SKX-NEXT:    [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
+; SKX-NEXT:    [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]]
+; SKX-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]]
+; SKX-NEXT:    [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
+; SKX-NEXT:    [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]]
+; SKX-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]]
+; SKX-NEXT:    [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
+; SKX-NEXT:    [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]]
+; SKX-NEXT:    [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]]
+; SKX-NEXT:    [[TMP48:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 16), align 16
+; SKX-NEXT:    [[TMP49:%.*]] = fcmp fast ogt float [[TMP47]], [[TMP48]]
+; SKX-NEXT:    [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP47]], float [[TMP48]]
+; SKX-NEXT:    [[TMP51:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 17), align 4
+; SKX-NEXT:    [[TMP52:%.*]] = fcmp fast ogt float [[TMP50]], [[TMP51]]
+; SKX-NEXT:    [[TMP53:%.*]] = select i1 [[TMP52]], float [[TMP50]], float [[TMP51]]
+; SKX-NEXT:    [[TMP54:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 18), align 8
+; SKX-NEXT:    [[TMP55:%.*]] = fcmp fast ogt float [[TMP53]], [[TMP54]]
+; SKX-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], float [[TMP53]], float [[TMP54]]
+; SKX-NEXT:    [[TMP57:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 19), align 4
+; SKX-NEXT:    [[TMP58:%.*]] = fcmp fast ogt float [[TMP56]], [[TMP57]]
+; SKX-NEXT:    [[TMP59:%.*]] = select i1 [[TMP58]], float [[TMP56]], float [[TMP57]]
+; SKX-NEXT:    [[TMP60:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 20), align 16
+; SKX-NEXT:    [[TMP61:%.*]] = fcmp fast ogt float [[TMP59]], [[TMP60]]
+; SKX-NEXT:    [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP59]], float [[TMP60]]
+; SKX-NEXT:    [[TMP63:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 21), align 4
+; SKX-NEXT:    [[TMP64:%.*]] = fcmp fast ogt float [[TMP62]], [[TMP63]]
+; SKX-NEXT:    [[TMP65:%.*]] = select i1 [[TMP64]], float [[TMP62]], float [[TMP63]]
+; SKX-NEXT:    [[TMP66:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 22), align 8
+; SKX-NEXT:    [[TMP67:%.*]] = fcmp fast ogt float [[TMP65]], [[TMP66]]
+; SKX-NEXT:    [[TMP68:%.*]] = select i1 [[TMP67]], float [[TMP65]], float [[TMP66]]
+; SKX-NEXT:    [[TMP69:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 23), align 4
+; SKX-NEXT:    [[TMP70:%.*]] = fcmp fast ogt float [[TMP68]], [[TMP69]]
+; SKX-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], float [[TMP68]], float [[TMP69]]
+; SKX-NEXT:    [[TMP72:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 24), align 16
+; SKX-NEXT:    [[TMP73:%.*]] = fcmp fast ogt float [[TMP71]], [[TMP72]]
+; SKX-NEXT:    [[TMP74:%.*]] = select i1 [[TMP73]], float [[TMP71]], float [[TMP72]]
+; SKX-NEXT:    [[TMP75:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 25), align 4
+; SKX-NEXT:    [[TMP76:%.*]] = fcmp fast ogt float [[TMP74]], [[TMP75]]
+; SKX-NEXT:    [[TMP77:%.*]] = select i1 [[TMP76]], float [[TMP74]], float [[TMP75]]
+; SKX-NEXT:    [[TMP78:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 26), align 8
+; SKX-NEXT:    [[TMP79:%.*]] = fcmp fast ogt float [[TMP77]], [[TMP78]]
+; SKX-NEXT:    [[TMP80:%.*]] = select i1 [[TMP79]], float [[TMP77]], float [[TMP78]]
+; SKX-NEXT:    [[TMP81:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 27), align 4
+; SKX-NEXT:    [[TMP82:%.*]] = fcmp fast ogt float [[TMP80]], [[TMP81]]
+; SKX-NEXT:    [[TMP83:%.*]] = select i1 [[TMP82]], float [[TMP80]], float [[TMP81]]
+; SKX-NEXT:    [[TMP84:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 28), align 16
+; SKX-NEXT:    [[TMP85:%.*]] = fcmp fast ogt float [[TMP83]], [[TMP84]]
+; SKX-NEXT:    [[TMP86:%.*]] = select i1 [[TMP85]], float [[TMP83]], float [[TMP84]]
+; SKX-NEXT:    [[TMP87:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 29), align 4
+; SKX-NEXT:    [[TMP88:%.*]] = fcmp fast ogt float [[TMP86]], [[TMP87]]
+; SKX-NEXT:    [[TMP89:%.*]] = select i1 [[TMP88]], float [[TMP86]], float [[TMP87]]
+; SKX-NEXT:    [[TMP90:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 30), align 8
+; SKX-NEXT:    [[TMP91:%.*]] = fcmp fast ogt float [[TMP89]], [[TMP90]]
+; SKX-NEXT:    [[TMP92:%.*]] = select i1 [[TMP91]], float [[TMP89]], float [[TMP90]]
+; SKX-NEXT:    [[TMP93:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4
+; SKX-NEXT:    [[TMP94:%.*]] = fcmp fast ogt float [[TMP92]], [[TMP93]]
+; SKX-NEXT:    [[TMP95:%.*]] = select i1 [[TMP94]], float [[TMP92]], float [[TMP93]]
+; SKX-NEXT:    ret float [[TMP95]]
 ;
   %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
   %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
@@ -1993,368 +1723,3 @@ define float @maxf32(float) {
   ret float %95
 }
 
-define i32 @maxi8_mutiple_uses(i32) {
-; CHECK-LABEL: @maxi8_mutiple_uses(
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
-; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
-; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP4]], i32 3, i32 4
-; CHECK-NEXT:    store i32 [[TMP24]], i32* @var, align 8
-; CHECK-NEXT:    ret i32 [[TMP23]]
-;
-; AVX-LABEL: @maxi8_mutiple_uses(
-; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr to <4 x i32>*), align 16
-; AVX-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
-; AVX-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
-; AVX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
-; AVX-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
-; AVX-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-; AVX-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
-; AVX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef
-; AVX-NEXT:    [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; AVX-NEXT:    [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
-; AVX-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 [[TMP13]]
-; AVX-NEXT:    [[TMP16:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; AVX-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]]
-; AVX-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 [[TMP16]]
-; AVX-NEXT:    [[TMP19:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; AVX-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]]
-; AVX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP2]], [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP2]], <4 x i32> [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
-; AVX-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
-; AVX-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP21]], [[TMP13]]
-; AVX-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP21]], i32 [[TMP13]]
-; AVX-NEXT:    [[TMP24:%.*]] = icmp sgt i32 [[TMP23]], [[TMP16]]
-; AVX-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[TMP23]], i32 [[TMP16]]
-; AVX-NEXT:    [[TMP26:%.*]] = icmp sgt i32 [[TMP25]], [[TMP19]]
-; AVX-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP25]], i32 [[TMP19]]
-; AVX-NEXT:    [[TMP28:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 [[TMP19]]
-; AVX-NEXT:    [[TMP29:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; AVX-NEXT:    [[TMP30:%.*]] = icmp sgt i32 [[TMP27]], [[TMP29]]
-; AVX-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[TMP27]], i32 [[TMP29]]
-; AVX-NEXT:    [[TMP32:%.*]] = select i1 [[TMP5]], i32 3, i32 4
-; AVX-NEXT:    store i32 [[TMP32]], i32* @var, align 8
-; AVX-NEXT:    ret i32 [[TMP31]]
-;
-; AVX2-LABEL: @maxi8_mutiple_uses(
-; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr to <4 x i32>*), align 16
-; AVX2-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
-; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
-; AVX2-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
-; AVX2-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
-; AVX2-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
-; AVX2-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef
-; AVX2-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-; AVX2-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
-; AVX2-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef
-; AVX2-NEXT:    [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; AVX2-NEXT:    [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
-; AVX2-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 [[TMP13]]
-; AVX2-NEXT:    [[TMP16:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; AVX2-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]]
-; AVX2-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 [[TMP16]]
-; AVX2-NEXT:    [[TMP19:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; AVX2-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]]
-; AVX2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP2]], [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP2]], <4 x i32> [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
-; AVX2-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
-; AVX2-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP21]], [[TMP13]]
-; AVX2-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP21]], i32 [[TMP13]]
-; AVX2-NEXT:    [[TMP24:%.*]] = icmp sgt i32 [[TMP23]], [[TMP16]]
-; AVX2-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[TMP23]], i32 [[TMP16]]
-; AVX2-NEXT:    [[TMP26:%.*]] = icmp sgt i32 [[TMP25]], [[TMP19]]
-; AVX2-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP25]], i32 [[TMP19]]
-; AVX2-NEXT:    [[TMP28:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 [[TMP19]]
-; AVX2-NEXT:    [[TMP29:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; AVX2-NEXT:    [[TMP30:%.*]] = icmp sgt i32 [[TMP27]], [[TMP29]]
-; AVX2-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[TMP27]], i32 [[TMP29]]
-; AVX2-NEXT:    [[TMP32:%.*]] = select i1 [[TMP5]], i32 3, i32 4
-; AVX2-NEXT:    store i32 [[TMP32]], i32* @var, align 8
-; AVX2-NEXT:    ret i32 [[TMP31]]
-;
-; SKX-LABEL: @maxi8_mutiple_uses(
-; SKX-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr to <4 x i32>*), align 16
-; SKX-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
-; SKX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
-; SKX-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
-; SKX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 undef, i32 undef
-; SKX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
-; SKX-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
-; SKX-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 undef
-; SKX-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-; SKX-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
-; SKX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef
-; SKX-NEXT:    [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; SKX-NEXT:    [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
-; SKX-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 [[TMP13]]
-; SKX-NEXT:    [[TMP16:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; SKX-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]]
-; SKX-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 [[TMP16]]
-; SKX-NEXT:    [[TMP19:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; SKX-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]]
-; SKX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP2]], [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP2]], <4 x i32> [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
-; SKX-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
-; SKX-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP21]], [[TMP13]]
-; SKX-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP21]], i32 [[TMP13]]
-; SKX-NEXT:    [[TMP24:%.*]] = icmp sgt i32 [[TMP23]], [[TMP16]]
-; SKX-NEXT:    [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[TMP23]], i32 [[TMP16]]
-; SKX-NEXT:    [[TMP26:%.*]] = icmp sgt i32 [[TMP25]], [[TMP19]]
-; SKX-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP25]], i32 [[TMP19]]
-; SKX-NEXT:    [[TMP28:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 [[TMP19]]
-; SKX-NEXT:    [[TMP29:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; SKX-NEXT:    [[TMP30:%.*]] = icmp sgt i32 [[TMP27]], [[TMP29]]
-; SKX-NEXT:    [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[TMP27]], i32 [[TMP29]]
-; SKX-NEXT:    [[TMP32:%.*]] = select i1 [[TMP5]], i32 3, i32 4
-; SKX-NEXT:    store i32 [[TMP32]], i32* @var, align 8
-; SKX-NEXT:    ret i32 [[TMP31]]
-;
-  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
-  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
-  %4 = icmp sgt i32 %2, %3
-  %5 = select i1 %4, i32 %2, i32 %3
-  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-  %7 = icmp sgt i32 %5, %6
-  %8 = select i1 %7, i32 %5, i32 %6
-  %9 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-  %10 = icmp sgt i32 %8, %9
-  %11 = select i1 %10, i32 %8, i32 %9
-  %12 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-  %13 = icmp sgt i32 %11, %12
-  %14 = select i1 %13, i32 %11, i32 %12
-  %15 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-  %16 = icmp sgt i32 %14, %15
-  %17 = select i1 %16, i32 %14, i32 %15
-  %18 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-  %19 = icmp sgt i32 %17, %18
-  %20 = select i1 %19, i32 %17, i32 %18
-  %21 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-  %22 = icmp sgt i32 %20, %21
-  %23 = select i1 %22, i32 %20, i32 %21
-  %24 = select i1 %4, i32 3, i32 4
-  store i32 %24, i32* @var, align 8
-  ret i32 %23
-}
-
-define i32 @maxi8_wrong_parent(i32) {
-; CHECK-LABEL: @maxi8_wrong_parent(
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    br label [[PP:%.*]]
-; CHECK:       pp:
-; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
-; CHECK-NEXT:    [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
-; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
-; CHECK-NEXT:    ret i32 [[TMP23]]
-;
-; AVX-LABEL: @maxi8_wrong_parent(
-; AVX-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
-; AVX-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
-; AVX-NEXT:    br label [[PP:%.*]]
-; AVX:       pp:
-; AVX-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; AVX-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
-; AVX-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP5]], [[TMP7]]
-; AVX-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP5]], i32 undef
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP6]], i32 1
-; AVX-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
-; AVX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef
-; AVX-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
-; AVX-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef
-; AVX-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3
-; AVX-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]]
-; AVX-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef
-; AVX-NEXT:    [[TMP19:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; AVX-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]]
-; AVX-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 [[TMP19]]
-; AVX-NEXT:    [[TMP22:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; AVX-NEXT:    [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]]
-; AVX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP6]], [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP6]], <4 x i32> [[RDX_SHUF]]
-; AVX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
-; AVX-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
-; AVX-NEXT:    [[TMP25:%.*]] = icmp sgt i32 [[TMP24]], [[TMP19]]
-; AVX-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP24]], i32 [[TMP19]]
-; AVX-NEXT:    [[TMP27:%.*]] = icmp sgt i32 [[TMP26]], [[TMP22]]
-; AVX-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP26]], i32 [[TMP22]]
-; AVX-NEXT:    [[TMP29:%.*]] = icmp sgt i32 [[TMP28]], [[TMP5]]
-; AVX-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP29]], i32 [[TMP28]], i32 [[TMP5]]
-; AVX-NEXT:    [[TMP30:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 [[TMP22]]
-; AVX-NEXT:    ret i32 [[OP_EXTRA]]
-;
-; AVX2-LABEL: @maxi8_wrong_parent(
-; AVX2-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
-; AVX2-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
-; AVX2-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
-; AVX2-NEXT:    br label [[PP:%.*]]
-; AVX2:       pp:
-; AVX2-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; AVX2-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
-; AVX2-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP5]], [[TMP7]]
-; AVX2-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP5]], i32 undef
-; AVX2-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP6]], i32 1
-; AVX2-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
-; AVX2-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 undef
-; AVX2-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2
-; AVX2-NEXT:    [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]]
-; AVX2-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef
-; AVX2-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3
-; AVX2-NEXT:    [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]]
-; AVX2-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef
-; AVX2-NEXT:    [[TMP19:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; AVX2-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]]
-; AVX2-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[TMP18]], i32 [[TMP19]]
-; AVX2-NEXT:    [[TMP22:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; AVX2-NEXT:    [[TMP23:%.*]] = icmp sgt i32 [[TMP21]], [[TMP22]]
-; AVX2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP6]], [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP6]], <4 x i32> [[RDX_SHUF]]
-; AVX2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; AVX2-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; AVX2-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
-; AVX2-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
-; AVX2-NEXT:    [[TMP25:%.*]] = icmp sgt i32 [[TMP24]], [[TMP19]]
-; AVX2-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP24]], i32 [[TMP19]]
-; AVX2-NEXT:    [[TMP27:%.*]] = icmp sgt i32 [[TMP26]], [[TMP22]]
-; AVX2-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP26]], i32 [[TMP22]]
-; AVX2-NEXT:    [[TMP29:%.*]] = icmp sgt i32 [[TMP28]], [[TMP5]]
-; AVX2-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP29]], i32 [[TMP28]], i32 [[TMP5]]
-; AVX2-NEXT:    [[TMP30:%.*]] = select i1 [[TMP23]], i32 [[TMP21]], i32 [[TMP22]]
-; AVX2-NEXT:    ret i32 [[OP_EXTRA]]
-;
-; SKX-LABEL: @maxi8_wrong_parent(
-; SKX-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
-; SKX-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
-; SKX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-; SKX-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
-; SKX-NEXT:    br label [[PP:%.*]]
-; SKX:       pp:
-; SKX-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
-; SKX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
-; SKX-NEXT:    [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; SKX-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; SKX-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP6]], [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP6]], <4 x i32> [[RDX_SHUF]]
-; SKX-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; SKX-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; SKX-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
-; SKX-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
-; SKX-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], [[TMP8]]
-; SKX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 [[TMP8]]
-; SKX-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP12]], [[TMP9]]
-; SKX-NEXT:    [[TMP14:%.*]] = insertelement <2 x i1> undef, i1 [[TMP13]], i32 0
-; SKX-NEXT:    [[TMP15:%.*]] = insertelement <2 x i1> [[TMP14]], i1 [[TMP5]], i32 1
-; SKX-NEXT:    [[TMP16:%.*]] = insertelement <2 x i32> undef, i32 [[TMP12]], i32 0
-; SKX-NEXT:    [[TMP17:%.*]] = insertelement <2 x i32> [[TMP16]], i32 [[TMP3]], i32 1
-; SKX-NEXT:    [[TMP18:%.*]] = insertelement <2 x i32> undef, i32 [[TMP9]], i32 0
-; SKX-NEXT:    [[TMP19:%.*]] = insertelement <2 x i32> [[TMP18]], i32 [[TMP4]], i32 1
-; SKX-NEXT:    [[TMP20:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP17]], <2 x i32> [[TMP19]]
-; SKX-NEXT:    [[TMP21:%.*]] = extractelement <2 x i32> [[TMP20]], i32 1
-; SKX-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP21]], [[TMP7]]
-; SKX-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP21]], i32 undef
-; SKX-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[TMP6]], i32 1
-; SKX-NEXT:    [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], [[TMP24]]
-; SKX-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP23]], i32 undef
-; SKX-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2
-; SKX-NEXT:    [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], [[TMP27]]
-; SKX-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP26]], i32 undef
-; SKX-NEXT:    [[TMP30:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3
-; SKX-NEXT:    [[TMP31:%.*]] = icmp sgt i32 [[TMP29]], [[TMP30]]
-; SKX-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP29]], i32 undef
-; SKX-NEXT:    [[TMP33:%.*]] = icmp sgt i32 [[TMP32]], [[TMP8]]
-; SKX-NEXT:    [[TMP34:%.*]] = select i1 [[TMP33]], i32 [[TMP32]], i32 [[TMP8]]
-; SKX-NEXT:    [[TMP35:%.*]] = icmp sgt i32 [[TMP34]], [[TMP9]]
-; SKX-NEXT:    [[TMP36:%.*]] = extractelement <2 x i32> [[TMP20]], i32 0
-; SKX-NEXT:    [[TMP37:%.*]] = icmp sgt i32 [[TMP36]], [[TMP21]]
-; SKX-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP37]], i32 [[TMP36]], i32 [[TMP21]]
-; SKX-NEXT:    [[TMP38:%.*]] = select i1 [[TMP35]], i32 [[TMP34]], i32 [[TMP9]]
-; SKX-NEXT:    ret i32 [[OP_EXTRA]]
-;
-  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
-  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
-  %4 = icmp sgt i32 %2, %3
-  br label %pp
-
-pp:
-  %5 = select i1 %4, i32 %2, i32 %3
-  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-  %7 = icmp sgt i32 %5, %6
-  %8 = select i1 %7, i32 %5, i32 %6
-  %9 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-  %10 = icmp sgt i32 %8, %9
-  %11 = select i1 %10, i32 %8, i32 %9
-  %12 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-  %13 = icmp sgt i32 %11, %12
-  %14 = select i1 %13, i32 %11, i32 %12
-  %15 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-  %16 = icmp sgt i32 %14, %15
-  %17 = select i1 %16, i32 %14, i32 %15
-  %18 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-  %19 = icmp sgt i32 %17, %18
-  %20 = select i1 %19, i32 %17, i32 %18
-  %21 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-  %22 = icmp sgt i32 %20, %21
-  %23 = select i1 %22, i32 %20, i32 %21
-  ret i32 %23
-}
-