[llvm] 908b753 - [SLP]Improve vectorization of PHI instructions.

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 21 12:27:27 PDT 2021


Author: Alexey Bataev
Date: 2021-06-21T12:26:24-07:00
New Revision: 908b7536615ee8694d949b60716833893c7f7049

URL: https://github.com/llvm/llvm-project/commit/908b7536615ee8694d949b60716833893c7f7049
DIFF: https://github.com/llvm/llvm-project/commit/908b7536615ee8694d949b60716833893c7f7049.diff

LOG: [SLP]Improve vectorization of PHI instructions.

Perform better analysis when trying to vectorize PHIs.
1. Do not try to vectorize vector PHIs.
2. Do a deeper analysis to find more profitable nodes for vectorization.

Previously, we just tried to vectorize PHIs of the same type. This patch
improves on that and tries to vectorize PHIs whose incoming values come
from the same basic block and have the same and/or alternative
opcodes.

This saves compile time and provides better vectorization results in
general.

Part of D57059.

Differential Revision: https://reviews.llvm.org/D103638

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8d907dfbc34c1..0ffa210ffa62d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -8033,10 +8033,6 @@ static bool findBuildAggregate(Instruction *LastInsertInst,
   return false;
 }
 
-static bool PhiTypeSorterFunc(Value *V, Value *V2) {
-  return V->getType() < V2->getType();
-}
-
 /// Try and get a reduction value from a phi node.
 ///
 /// Given a phi node \p P in a block \p ParentBB, consider possible reductions
@@ -8290,6 +8286,10 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
   bool Changed = false;
   SmallVector<Value *, 4> Incoming;
   SmallPtrSet<Value *, 16> VisitedInstrs;
+  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
+  // node. Allows better to identify the chains that can be vectorized in the
+  // better way.
+  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
 
   bool HaveVectorizedPhiNodes = true;
   while (HaveVectorizedPhiNodes) {
@@ -8302,22 +8302,113 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       if (!P)
         break;
 
-      if (!VisitedInstrs.count(P) && !R.isDeleted(P))
+      // No need to analyze deleted, vectorized and non-vectorizable
+      // instructions.
+      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
+          !P->getType()->isVectorTy())
         Incoming.push_back(P);
     }
 
-    // Sort by type.
-    llvm::stable_sort(Incoming, PhiTypeSorterFunc);
+    // Find the corresponding non-phi nodes for better matching when trying to
+    // build the tree.
+    for (Value *V : Incoming) {
+      SmallVectorImpl<Value *> &Opcodes =
+          PHIToOpcodes.try_emplace(V).first->getSecond();
+      if (!Opcodes.empty())
+        continue;
+      SmallVector<Value *, 4> Nodes(1, V);
+      SmallPtrSet<Value *, 4> Visited;
+      while (!Nodes.empty()) {
+        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
+        if (!Visited.insert(PHI).second)
+          continue;
+        for (Value *V : PHI->incoming_values()) {
+          if (auto *PHI1 = dyn_cast<PHINode>((V))) {
+            Nodes.push_back(PHI1);
+            continue;
+          }
+          Opcodes.emplace_back(V);
+        }
+      }
+    }
+
+    // Sort by type, parent, operands.
+    stable_sort(Incoming, [&PHIToOpcodes](Value *V1, Value *V2) {
+      if (V1->getType() < V2->getType())
+        return true;
+      if (V1->getType() > V2->getType())
+        return false;
+      ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
+      ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
+      if (Opcodes1.size() < Opcodes2.size())
+        return true;
+      if (Opcodes1.size() > Opcodes2.size())
+        return false;
+      for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
+        // Undefs are compatible with any other value.
+        if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
+          continue;
+        if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
+          if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
+            if (I1->getParent() < I2->getParent())
+              return true;
+            if (I1->getParent() > I2->getParent())
+              return false;
+            InstructionsState S = getSameOpcode({I1, I2});
+            if (S.getOpcode())
+              continue;
+            return I1->getOpcode() < I2->getOpcode();
+          }
+        if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
+          continue;
+        if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID())
+          return true;
+        if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID())
+          return false;
+      }
+      return false;
+    });
+
+    auto &&AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) {
+      if (V1 == V2)
+        return true;
+      if (V1->getType() != V2->getType())
+        return false;
+      ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
+      ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
+      if (Opcodes1.size() != Opcodes2.size())
+        return false;
+      for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
+        // Undefs are compatible with any other value.
+        if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
+          continue;
+        if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
+          if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
+            if (I1->getParent() != I2->getParent())
+              return false;
+            InstructionsState S = getSameOpcode({I1, I2});
+            if (S.getOpcode())
+              continue;
+            return false;
+          }
+        if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
+          continue;
+        if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
+          return false;
+      }
+      return true;
+    };
 
     // Try to vectorize elements base on their type.
+    SmallVector<Value *, 4> Candidates;
     for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
                                            E = Incoming.end();
          IncIt != E;) {
 
-      // Look for the next elements with the same type.
+      // Look for the next elements with the same type, parent and operand
+      // kinds.
       SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
-      while (SameTypeIt != E &&
-             (*SameTypeIt)->getType() == (*IncIt)->getType()) {
+      while (SameTypeIt != E && AreCompatiblePHIs(*SameTypeIt, *IncIt)) {
         VisitedInstrs.insert(*SameTypeIt);
         ++SameTypeIt;
       }
@@ -8335,7 +8426,20 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
         // Success start over because instructions might have been changed.
         HaveVectorizedPhiNodes = true;
         Changed = true;
-        break;
+      } else if (NumElts < 4 &&
+                 (Candidates.empty() ||
+                  Candidates.front()->getType() == (*IncIt)->getType())) {
+        Candidates.append(IncIt, std::next(IncIt, NumElts));
+      }
+      // Final attempt to vectorize phis with the same types.
+      if (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType()) {
+        if (Candidates.size() > 1 &&
+            tryToVectorizeList(Candidates, R, /*AllowReorder=*/true)) {
+          // Success start over because instructions might have been changed.
+          HaveVectorizedPhiNodes = true;
+          Changed = true;
+        }
+        Candidates.clear();
       }
 
       // Start over at the next instruction of a different type (or the end).

diff --git a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll
index 22d6892a0f25a..6ebc67a90fe86 100644
--- a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll
+++ b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll
@@ -132,77 +132,58 @@ define void @phi_float32(half %hval, float %fval) {
 ; MAX256-NEXT:  bb:
 ; MAX256-NEXT:    br label [[BB1:%.*]]
 ; MAX256:       bb1:
-; MAX256-NEXT:    [[TMP0:%.*]] = insertelement <4 x half> poison, half [[HVAL:%.*]], i32 0
-; MAX256-NEXT:    [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half [[HVAL]], i32 1
-; MAX256-NEXT:    [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half [[HVAL]], i32 2
-; MAX256-NEXT:    [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half [[HVAL]], i32 3
-; MAX256-NEXT:    [[TMP4:%.*]] = fpext <4 x half> [[TMP3]] to <4 x float>
-; MAX256-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; MAX256-NEXT:    [[TMP5:%.*]] = insertelement <8 x float> poison, float [[FVAL:%.*]], i32 0
-; MAX256-NEXT:    [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[FVAL]], i32 1
-; MAX256-NEXT:    [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[FVAL]], i32 2
-; MAX256-NEXT:    [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[FVAL]], i32 3
-; MAX256-NEXT:    [[TMP9:%.*]] = insertelement <8 x float> [[TMP8]], float [[FVAL]], i32 4
-; MAX256-NEXT:    [[TMP10:%.*]] = insertelement <8 x float> [[TMP9]], float [[FVAL]], i32 5
-; MAX256-NEXT:    [[TMP11:%.*]] = insertelement <8 x float> [[TMP10]], float [[FVAL]], i32 6
-; MAX256-NEXT:    [[TMP12:%.*]] = insertelement <8 x float> [[TMP11]], float [[FVAL]], i32 7
-; MAX256-NEXT:    [[TMP13:%.*]] = fmul <8 x float> [[SHUFFLE]], [[TMP12]]
-; MAX256-NEXT:    [[TMP14:%.*]] = fadd <8 x float> zeroinitializer, [[TMP13]]
-; MAX256-NEXT:    [[TMP15:%.*]] = extractelement <8 x float> [[SHUFFLE]], i32 3
-; MAX256-NEXT:    [[TMP16:%.*]] = extractelement <8 x float> [[SHUFFLE]], i32 2
-; MAX256-NEXT:    [[TMP17:%.*]] = extractelement <8 x float> [[SHUFFLE]], i32 1
-; MAX256-NEXT:    [[TMP18:%.*]] = extractelement <8 x float> [[SHUFFLE]], i32 0
-; MAX256-NEXT:    [[TMP19:%.*]] = insertelement <8 x float> poison, float [[TMP15]], i32 0
-; MAX256-NEXT:    [[TMP20:%.*]] = insertelement <8 x float> [[TMP19]], float [[TMP16]], i32 1
-; MAX256-NEXT:    [[TMP21:%.*]] = insertelement <8 x float> [[TMP20]], float [[TMP17]], i32 2
-; MAX256-NEXT:    [[TMP22:%.*]] = insertelement <8 x float> [[TMP21]], float [[TMP18]], i32 3
-; MAX256-NEXT:    [[TMP23:%.*]] = insertelement <8 x float> [[TMP22]], float [[TMP15]], i32 4
-; MAX256-NEXT:    [[TMP24:%.*]] = insertelement <8 x float> [[TMP23]], float [[TMP16]], i32 5
-; MAX256-NEXT:    [[TMP25:%.*]] = insertelement <8 x float> [[TMP24]], float [[TMP17]], i32 6
-; MAX256-NEXT:    [[TMP26:%.*]] = insertelement <8 x float> [[TMP25]], float [[TMP18]], i32 7
-; MAX256-NEXT:    [[TMP27:%.*]] = fmul <8 x float> [[TMP26]], [[TMP12]]
-; MAX256-NEXT:    [[TMP28:%.*]] = fadd <8 x float> zeroinitializer, [[TMP27]]
-; MAX256-NEXT:    [[TMP29:%.*]] = fmul <8 x float> [[TMP26]], [[TMP12]]
-; MAX256-NEXT:    [[TMP30:%.*]] = fadd <8 x float> zeroinitializer, [[TMP29]]
-; MAX256-NEXT:    [[TMP31:%.*]] = fmul <8 x float> [[TMP26]], [[TMP12]]
-; MAX256-NEXT:    [[TMP32:%.*]] = fadd <8 x float> zeroinitializer, [[TMP31]]
-; MAX256-NEXT:    [[TMP33:%.*]] = insertelement <8 x float> poison, float [[FVAL]], i32 2
-; MAX256-NEXT:    [[TMP34:%.*]] = extractelement <8 x float> [[TMP14]], i32 0
-; MAX256-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP33]], float [[TMP34]], i32 0
-; MAX256-NEXT:    [[TMP36:%.*]] = extractelement <8 x float> [[TMP14]], i32 1
-; MAX256-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP36]], i32 1
-; MAX256-NEXT:    [[TMP38:%.*]] = extractelement <8 x float> [[TMP14]], i32 4
-; MAX256-NEXT:    [[TMP39:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP38]], i32 3
-; MAX256-NEXT:    [[TMP40:%.*]] = extractelement <8 x float> [[TMP14]], i32 5
-; MAX256-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP40]], i32 4
-; MAX256-NEXT:    [[SHUFFLE3:%.*]] = shufflevector <8 x float> [[TMP41]], <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 3, i32 4, i32 2, i32 2>
-; MAX256-NEXT:    [[TMP42:%.*]] = extractelement <8 x float> [[TMP28]], i32 2
-; MAX256-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP5]], float [[TMP42]], i32 1
-; MAX256-NEXT:    [[TMP44:%.*]] = extractelement <8 x float> [[TMP28]], i32 3
-; MAX256-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP44]], i32 2
-; MAX256-NEXT:    [[TMP46:%.*]] = extractelement <8 x float> [[TMP28]], i32 6
-; MAX256-NEXT:    [[TMP47:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP46]], i32 3
-; MAX256-NEXT:    [[TMP48:%.*]] = extractelement <8 x float> [[TMP28]], i32 7
-; MAX256-NEXT:    [[TMP49:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP48]], i32 4
-; MAX256-NEXT:    [[SHUFFLE6:%.*]] = shufflevector <8 x float> [[TMP49]], <8 x float> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 2, i32 0, i32 0, i32 3, i32 4>
-; MAX256-NEXT:    [[TMP50:%.*]] = extractelement <8 x float> [[TMP30]], i32 2
-; MAX256-NEXT:    [[TMP51:%.*]] = insertelement <8 x float> [[TMP5]], float [[TMP50]], i32 1
-; MAX256-NEXT:    [[TMP52:%.*]] = extractelement <8 x float> [[TMP30]], i32 3
-; MAX256-NEXT:    [[TMP53:%.*]] = insertelement <8 x float> [[TMP51]], float [[TMP52]], i32 2
-; MAX256-NEXT:    [[TMP54:%.*]] = extractelement <8 x float> [[TMP30]], i32 6
-; MAX256-NEXT:    [[TMP55:%.*]] = insertelement <8 x float> [[TMP53]], float [[TMP54]], i32 3
-; MAX256-NEXT:    [[TMP56:%.*]] = extractelement <8 x float> [[TMP30]], i32 7
-; MAX256-NEXT:    [[TMP57:%.*]] = insertelement <8 x float> [[TMP55]], float [[TMP56]], i32 4
-; MAX256-NEXT:    [[SHUFFLE9:%.*]] = shufflevector <8 x float> [[TMP57]], <8 x float> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 2, i32 0, i32 0, i32 3, i32 4>
-; MAX256-NEXT:    [[TMP58:%.*]] = extractelement <8 x float> [[TMP32]], i32 2
-; MAX256-NEXT:    [[TMP59:%.*]] = insertelement <8 x float> [[TMP5]], float [[TMP58]], i32 1
-; MAX256-NEXT:    [[TMP60:%.*]] = extractelement <8 x float> [[TMP32]], i32 3
-; MAX256-NEXT:    [[TMP61:%.*]] = insertelement <8 x float> [[TMP59]], float [[TMP60]], i32 2
-; MAX256-NEXT:    [[TMP62:%.*]] = extractelement <8 x float> [[TMP32]], i32 6
-; MAX256-NEXT:    [[TMP63:%.*]] = insertelement <8 x float> [[TMP61]], float [[TMP62]], i32 3
-; MAX256-NEXT:    [[TMP64:%.*]] = extractelement <8 x float> [[TMP32]], i32 7
-; MAX256-NEXT:    [[TMP65:%.*]] = insertelement <8 x float> [[TMP63]], float [[TMP64]], i32 4
-; MAX256-NEXT:    [[SHUFFLE12:%.*]] = shufflevector <8 x float> [[TMP65]], <8 x float> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 2, i32 0, i32 0, i32 3, i32 4>
+; MAX256-NEXT:    [[I:%.*]] = fpext half [[HVAL:%.*]] to float
+; MAX256-NEXT:    [[I3:%.*]] = fpext half [[HVAL]] to float
+; MAX256-NEXT:    [[I6:%.*]] = fpext half [[HVAL]] to float
+; MAX256-NEXT:    [[I9:%.*]] = fpext half [[HVAL]] to float
+; MAX256-NEXT:    [[TMP0:%.*]] = insertelement <8 x float> poison, float [[I]], i32 0
+; MAX256-NEXT:    [[TMP1:%.*]] = insertelement <8 x float> [[TMP0]], float [[I]], i32 1
+; MAX256-NEXT:    [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[I]], i32 2
+; MAX256-NEXT:    [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[I]], i32 3
+; MAX256-NEXT:    [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[I]], i32 4
+; MAX256-NEXT:    [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[I]], i32 5
+; MAX256-NEXT:    [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[I]], i32 6
+; MAX256-NEXT:    [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[I]], i32 7
+; MAX256-NEXT:    [[TMP8:%.*]] = insertelement <8 x float> poison, float [[FVAL:%.*]], i32 0
+; MAX256-NEXT:    [[TMP9:%.*]] = insertelement <8 x float> [[TMP8]], float [[FVAL]], i32 1
+; MAX256-NEXT:    [[TMP10:%.*]] = insertelement <8 x float> [[TMP9]], float [[FVAL]], i32 2
+; MAX256-NEXT:    [[TMP11:%.*]] = insertelement <8 x float> [[TMP10]], float [[FVAL]], i32 3
+; MAX256-NEXT:    [[TMP12:%.*]] = insertelement <8 x float> [[TMP11]], float [[FVAL]], i32 4
+; MAX256-NEXT:    [[TMP13:%.*]] = insertelement <8 x float> [[TMP12]], float [[FVAL]], i32 5
+; MAX256-NEXT:    [[TMP14:%.*]] = insertelement <8 x float> [[TMP13]], float [[FVAL]], i32 6
+; MAX256-NEXT:    [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[FVAL]], i32 7
+; MAX256-NEXT:    [[TMP16:%.*]] = fmul <8 x float> [[TMP7]], [[TMP15]]
+; MAX256-NEXT:    [[TMP17:%.*]] = fadd <8 x float> zeroinitializer, [[TMP16]]
+; MAX256-NEXT:    [[TMP18:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0
+; MAX256-NEXT:    [[TMP19:%.*]] = insertelement <8 x float> [[TMP18]], float [[I3]], i32 1
+; MAX256-NEXT:    [[TMP20:%.*]] = insertelement <8 x float> [[TMP19]], float [[I3]], i32 2
+; MAX256-NEXT:    [[TMP21:%.*]] = insertelement <8 x float> [[TMP20]], float [[I3]], i32 3
+; MAX256-NEXT:    [[TMP22:%.*]] = insertelement <8 x float> [[TMP21]], float [[I3]], i32 4
+; MAX256-NEXT:    [[TMP23:%.*]] = insertelement <8 x float> [[TMP22]], float [[I3]], i32 5
+; MAX256-NEXT:    [[TMP24:%.*]] = insertelement <8 x float> [[TMP23]], float [[I3]], i32 6
+; MAX256-NEXT:    [[TMP25:%.*]] = insertelement <8 x float> [[TMP24]], float [[I3]], i32 7
+; MAX256-NEXT:    [[TMP26:%.*]] = fmul <8 x float> [[TMP25]], [[TMP15]]
+; MAX256-NEXT:    [[TMP27:%.*]] = fadd <8 x float> zeroinitializer, [[TMP26]]
+; MAX256-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0
+; MAX256-NEXT:    [[TMP29:%.*]] = insertelement <8 x float> [[TMP28]], float [[I6]], i32 1
+; MAX256-NEXT:    [[TMP30:%.*]] = insertelement <8 x float> [[TMP29]], float [[I6]], i32 2
+; MAX256-NEXT:    [[TMP31:%.*]] = insertelement <8 x float> [[TMP30]], float [[I6]], i32 3
+; MAX256-NEXT:    [[TMP32:%.*]] = insertelement <8 x float> [[TMP31]], float [[I6]], i32 4
+; MAX256-NEXT:    [[TMP33:%.*]] = insertelement <8 x float> [[TMP32]], float [[I6]], i32 5
+; MAX256-NEXT:    [[TMP34:%.*]] = insertelement <8 x float> [[TMP33]], float [[I6]], i32 6
+; MAX256-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[I6]], i32 7
+; MAX256-NEXT:    [[TMP36:%.*]] = fmul <8 x float> [[TMP35]], [[TMP15]]
+; MAX256-NEXT:    [[TMP37:%.*]] = fadd <8 x float> zeroinitializer, [[TMP36]]
+; MAX256-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0
+; MAX256-NEXT:    [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[I9]], i32 1
+; MAX256-NEXT:    [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[I9]], i32 2
+; MAX256-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[I9]], i32 3
+; MAX256-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[I9]], i32 4
+; MAX256-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[I9]], i32 5
+; MAX256-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[I9]], i32 6
+; MAX256-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[I9]], i32 7
+; MAX256-NEXT:    [[TMP46:%.*]] = fmul <8 x float> [[TMP45]], [[TMP15]]
+; MAX256-NEXT:    [[TMP47:%.*]] = fadd <8 x float> zeroinitializer, [[TMP46]]
 ; MAX256-NEXT:    switch i32 undef, label [[BB5:%.*]] [
 ; MAX256-NEXT:    i32 0, label [[BB2:%.*]]
 ; MAX256-NEXT:    i32 1, label [[BB3:%.*]]
@@ -211,154 +192,74 @@ define void @phi_float32(half %hval, float %fval) {
 ; MAX256:       bb3:
 ; MAX256-NEXT:    br label [[BB2]]
 ; MAX256:       bb4:
-; MAX256-NEXT:    [[TMP66:%.*]] = insertelement <8 x float> poison, float [[FVAL]], i32 1
-; MAX256-NEXT:    [[TMP67:%.*]] = insertelement <8 x float> [[TMP66]], float [[TMP34]], i32 0
-; MAX256-NEXT:    [[TMP68:%.*]] = extractelement <8 x float> [[TMP14]], i32 3
-; MAX256-NEXT:    [[TMP69:%.*]] = insertelement <8 x float> [[TMP67]], float [[TMP68]], i32 2
-; MAX256-NEXT:    [[TMP70:%.*]] = insertelement <8 x float> [[TMP69]], float [[TMP38]], i32 3
-; MAX256-NEXT:    [[TMP71:%.*]] = extractelement <8 x float> [[TMP14]], i32 7
-; MAX256-NEXT:    [[TMP72:%.*]] = insertelement <8 x float> [[TMP70]], float [[TMP71]], i32 4
-; MAX256-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <8 x float> [[TMP72]], <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 4>
-; MAX256-NEXT:    [[TMP73:%.*]] = extractelement <8 x float> [[TMP28]], i32 0
-; MAX256-NEXT:    [[TMP74:%.*]] = insertelement <8 x float> [[TMP66]], float [[TMP73]], i32 0
-; MAX256-NEXT:    [[TMP75:%.*]] = insertelement <8 x float> [[TMP74]], float [[TMP44]], i32 2
-; MAX256-NEXT:    [[TMP76:%.*]] = extractelement <8 x float> [[TMP28]], i32 4
-; MAX256-NEXT:    [[TMP77:%.*]] = insertelement <8 x float> [[TMP75]], float [[TMP76]], i32 3
-; MAX256-NEXT:    [[TMP78:%.*]] = insertelement <8 x float> [[TMP77]], float [[TMP48]], i32 4
-; MAX256-NEXT:    [[SHUFFLE4:%.*]] = shufflevector <8 x float> [[TMP78]], <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 4>
-; MAX256-NEXT:    [[TMP79:%.*]] = extractelement <8 x float> [[TMP30]], i32 0
-; MAX256-NEXT:    [[TMP80:%.*]] = insertelement <8 x float> [[TMP66]], float [[TMP79]], i32 0
-; MAX256-NEXT:    [[TMP81:%.*]] = insertelement <8 x float> [[TMP80]], float [[TMP52]], i32 2
-; MAX256-NEXT:    [[TMP82:%.*]] = extractelement <8 x float> [[TMP30]], i32 4
-; MAX256-NEXT:    [[TMP83:%.*]] = insertelement <8 x float> [[TMP81]], float [[TMP82]], i32 3
-; MAX256-NEXT:    [[TMP84:%.*]] = insertelement <8 x float> [[TMP83]], float [[TMP56]], i32 4
-; MAX256-NEXT:    [[SHUFFLE7:%.*]] = shufflevector <8 x float> [[TMP84]], <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 4>
-; MAX256-NEXT:    [[TMP85:%.*]] = extractelement <8 x float> [[TMP32]], i32 0
-; MAX256-NEXT:    [[TMP86:%.*]] = insertelement <8 x float> [[TMP66]], float [[TMP85]], i32 0
-; MAX256-NEXT:    [[TMP87:%.*]] = insertelement <8 x float> [[TMP86]], float [[TMP60]], i32 2
-; MAX256-NEXT:    [[TMP88:%.*]] = extractelement <8 x float> [[TMP32]], i32 4
-; MAX256-NEXT:    [[TMP89:%.*]] = insertelement <8 x float> [[TMP87]], float [[TMP88]], i32 3
-; MAX256-NEXT:    [[TMP90:%.*]] = insertelement <8 x float> [[TMP89]], float [[TMP64]], i32 4
-; MAX256-NEXT:    [[SHUFFLE10:%.*]] = shufflevector <8 x float> [[TMP90]], <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 4>
 ; MAX256-NEXT:    br label [[BB2]]
 ; MAX256:       bb5:
-; MAX256-NEXT:    [[TMP91:%.*]] = insertelement <8 x float> [[TMP5]], float [[TMP36]], i32 1
-; MAX256-NEXT:    [[TMP92:%.*]] = extractelement <8 x float> [[TMP14]], i32 3
-; MAX256-NEXT:    [[TMP93:%.*]] = insertelement <8 x float> [[TMP91]], float [[TMP92]], i32 2
-; MAX256-NEXT:    [[TMP94:%.*]] = insertelement <8 x float> [[TMP93]], float [[TMP40]], i32 3
-; MAX256-NEXT:    [[TMP95:%.*]] = extractelement <8 x float> [[TMP14]], i32 7
-; MAX256-NEXT:    [[TMP96:%.*]] = insertelement <8 x float> [[TMP94]], float [[TMP95]], i32 4
-; MAX256-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <8 x float> [[TMP96]], <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 2, i32 0, i32 3, i32 0, i32 4>
-; MAX256-NEXT:    [[TMP97:%.*]] = insertelement <8 x float> poison, float [[FVAL]], i32 1
-; MAX256-NEXT:    [[TMP98:%.*]] = extractelement <8 x float> [[TMP28]], i32 0
-; MAX256-NEXT:    [[TMP99:%.*]] = insertelement <8 x float> [[TMP97]], float [[TMP98]], i32 0
-; MAX256-NEXT:    [[TMP100:%.*]] = insertelement <8 x float> [[TMP99]], float [[TMP42]], i32 2
-; MAX256-NEXT:    [[TMP101:%.*]] = extractelement <8 x float> [[TMP28]], i32 4
-; MAX256-NEXT:    [[TMP102:%.*]] = insertelement <8 x float> [[TMP100]], float [[TMP101]], i32 3
-; MAX256-NEXT:    [[TMP103:%.*]] = insertelement <8 x float> [[TMP102]], float [[TMP46]], i32 4
-; MAX256-NEXT:    [[SHUFFLE5:%.*]] = shufflevector <8 x float> [[TMP103]], <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 1, i32 3, i32 1, i32 4, i32 1>
-; MAX256-NEXT:    [[TMP104:%.*]] = extractelement <8 x float> [[TMP30]], i32 0
-; MAX256-NEXT:    [[TMP105:%.*]] = insertelement <8 x float> [[TMP97]], float [[TMP104]], i32 0
-; MAX256-NEXT:    [[TMP106:%.*]] = insertelement <8 x float> [[TMP105]], float [[TMP50]], i32 2
-; MAX256-NEXT:    [[TMP107:%.*]] = extractelement <8 x float> [[TMP30]], i32 4
-; MAX256-NEXT:    [[TMP108:%.*]] = insertelement <8 x float> [[TMP106]], float [[TMP107]], i32 3
-; MAX256-NEXT:    [[TMP109:%.*]] = insertelement <8 x float> [[TMP108]], float [[TMP54]], i32 4
-; MAX256-NEXT:    [[SHUFFLE8:%.*]] = shufflevector <8 x float> [[TMP109]], <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 1, i32 3, i32 1, i32 4, i32 1>
-; MAX256-NEXT:    [[TMP110:%.*]] = extractelement <8 x float> [[TMP32]], i32 0
-; MAX256-NEXT:    [[TMP111:%.*]] = insertelement <8 x float> [[TMP97]], float [[TMP110]], i32 0
-; MAX256-NEXT:    [[TMP112:%.*]] = insertelement <8 x float> [[TMP111]], float [[TMP58]], i32 2
-; MAX256-NEXT:    [[TMP113:%.*]] = extractelement <8 x float> [[TMP32]], i32 4
-; MAX256-NEXT:    [[TMP114:%.*]] = insertelement <8 x float> [[TMP112]], float [[TMP113]], i32 3
-; MAX256-NEXT:    [[TMP115:%.*]] = insertelement <8 x float> [[TMP114]], float [[TMP62]], i32 4
-; MAX256-NEXT:    [[SHUFFLE11:%.*]] = shufflevector <8 x float> [[TMP115]], <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 1, i32 3, i32 1, i32 4, i32 1>
 ; MAX256-NEXT:    br label [[BB2]]
 ; MAX256:       bb2:
-; MAX256-NEXT:    [[TMP116:%.*]] = phi <8 x float> [ [[TMP14]], [[BB3]] ], [ [[SHUFFLE1]], [[BB4]] ], [ [[SHUFFLE2]], [[BB5]] ], [ [[SHUFFLE3]], [[BB1]] ]
-; MAX256-NEXT:    [[TMP117:%.*]] = phi <8 x float> [ [[TMP28]], [[BB3]] ], [ [[SHUFFLE4]], [[BB4]] ], [ [[SHUFFLE5]], [[BB5]] ], [ [[SHUFFLE6]], [[BB1]] ]
-; MAX256-NEXT:    [[TMP118:%.*]] = phi <8 x float> [ [[TMP30]], [[BB3]] ], [ [[SHUFFLE7]], [[BB4]] ], [ [[SHUFFLE8]], [[BB5]] ], [ [[SHUFFLE9]], [[BB1]] ]
-; MAX256-NEXT:    [[TMP119:%.*]] = phi <8 x float> [ [[TMP32]], [[BB3]] ], [ [[SHUFFLE10]], [[BB4]] ], [ [[SHUFFLE11]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ]
-; MAX256-NEXT:    [[TMP120:%.*]] = extractelement <8 x float> [[TMP119]], i32 6
-; MAX256-NEXT:    store float [[TMP120]], float* undef, align 4
+; MAX256-NEXT:    [[TMP48:%.*]] = phi <8 x float> [ [[TMP27]], [[BB3]] ], [ [[TMP15]], [[BB4]] ], [ [[TMP15]], [[BB5]] ], [ [[TMP15]], [[BB1]] ]
+; MAX256-NEXT:    [[TMP49:%.*]] = phi <8 x float> [ [[TMP37]], [[BB3]] ], [ [[TMP15]], [[BB4]] ], [ [[TMP37]], [[BB5]] ], [ [[TMP37]], [[BB1]] ]
+; MAX256-NEXT:    [[TMP50:%.*]] = phi <8 x float> [ [[TMP47]], [[BB3]] ], [ [[TMP47]], [[BB4]] ], [ [[TMP15]], [[BB5]] ], [ [[TMP47]], [[BB1]] ]
+; MAX256-NEXT:    [[TMP51:%.*]] = phi <8 x float> [ [[TMP17]], [[BB3]] ], [ [[TMP17]], [[BB4]] ], [ [[TMP17]], [[BB5]] ], [ [[TMP15]], [[BB1]] ]
+; MAX256-NEXT:    [[TMP52:%.*]] = extractelement <8 x float> [[TMP49]], i32 7
+; MAX256-NEXT:    store float [[TMP52]], float* undef, align 4
 ; MAX256-NEXT:    ret void
 ;
 ; MAX1024-LABEL: @phi_float32(
 ; MAX1024-NEXT:  bb:
 ; MAX1024-NEXT:    br label [[BB1:%.*]]
 ; MAX1024:       bb1:
-; MAX1024-NEXT:    [[TMP0:%.*]] = insertelement <4 x half> poison, half [[HVAL:%.*]], i32 0
-; MAX1024-NEXT:    [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half [[HVAL]], i32 1
-; MAX1024-NEXT:    [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half [[HVAL]], i32 2
-; MAX1024-NEXT:    [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half [[HVAL]], i32 3
-; MAX1024-NEXT:    [[TMP4:%.*]] = fpext <4 x half> [[TMP3]] to <4 x float>
-; MAX1024-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
-; MAX1024-NEXT:    [[TMP5:%.*]] = insertelement <32 x float> poison, float [[FVAL:%.*]], i32 0
-; MAX1024-NEXT:    [[TMP6:%.*]] = insertelement <32 x float> [[TMP5]], float [[FVAL]], i32 1
-; MAX1024-NEXT:    [[TMP7:%.*]] = insertelement <32 x float> [[TMP6]], float [[FVAL]], i32 2
-; MAX1024-NEXT:    [[TMP8:%.*]] = insertelement <32 x float> [[TMP7]], float [[FVAL]], i32 3
-; MAX1024-NEXT:    [[TMP9:%.*]] = insertelement <32 x float> [[TMP8]], float [[FVAL]], i32 4
-; MAX1024-NEXT:    [[TMP10:%.*]] = insertelement <32 x float> [[TMP9]], float [[FVAL]], i32 5
-; MAX1024-NEXT:    [[TMP11:%.*]] = insertelement <32 x float> [[TMP10]], float [[FVAL]], i32 6
-; MAX1024-NEXT:    [[TMP12:%.*]] = insertelement <32 x float> [[TMP11]], float [[FVAL]], i32 7
-; MAX1024-NEXT:    [[TMP13:%.*]] = insertelement <32 x float> [[TMP12]], float [[FVAL]], i32 8
-; MAX1024-NEXT:    [[TMP14:%.*]] = insertelement <32 x float> [[TMP13]], float [[FVAL]], i32 9
-; MAX1024-NEXT:    [[TMP15:%.*]] = insertelement <32 x float> [[TMP14]], float [[FVAL]], i32 10
-; MAX1024-NEXT:    [[TMP16:%.*]] = insertelement <32 x float> [[TMP15]], float [[FVAL]], i32 11
-; MAX1024-NEXT:    [[TMP17:%.*]] = insertelement <32 x float> [[TMP16]], float [[FVAL]], i32 12
-; MAX1024-NEXT:    [[TMP18:%.*]] = insertelement <32 x float> [[TMP17]], float [[FVAL]], i32 13
-; MAX1024-NEXT:    [[TMP19:%.*]] = insertelement <32 x float> [[TMP18]], float [[FVAL]], i32 14
-; MAX1024-NEXT:    [[TMP20:%.*]] = insertelement <32 x float> [[TMP19]], float [[FVAL]], i32 15
-; MAX1024-NEXT:    [[TMP21:%.*]] = insertelement <32 x float> [[TMP20]], float [[FVAL]], i32 16
-; MAX1024-NEXT:    [[TMP22:%.*]] = insertelement <32 x float> [[TMP21]], float [[FVAL]], i32 17
-; MAX1024-NEXT:    [[TMP23:%.*]] = insertelement <32 x float> [[TMP22]], float [[FVAL]], i32 18
-; MAX1024-NEXT:    [[TMP24:%.*]] = insertelement <32 x float> [[TMP23]], float [[FVAL]], i32 19
-; MAX1024-NEXT:    [[TMP25:%.*]] = insertelement <32 x float> [[TMP24]], float [[FVAL]], i32 20
-; MAX1024-NEXT:    [[TMP26:%.*]] = insertelement <32 x float> [[TMP25]], float [[FVAL]], i32 21
-; MAX1024-NEXT:    [[TMP27:%.*]] = insertelement <32 x float> [[TMP26]], float [[FVAL]], i32 22
-; MAX1024-NEXT:    [[TMP28:%.*]] = insertelement <32 x float> [[TMP27]], float [[FVAL]], i32 23
-; MAX1024-NEXT:    [[TMP29:%.*]] = insertelement <32 x float> [[TMP28]], float [[FVAL]], i32 24
-; MAX1024-NEXT:    [[TMP30:%.*]] = insertelement <32 x float> [[TMP29]], float [[FVAL]], i32 25
-; MAX1024-NEXT:    [[TMP31:%.*]] = insertelement <32 x float> [[TMP30]], float [[FVAL]], i32 26
-; MAX1024-NEXT:    [[TMP32:%.*]] = insertelement <32 x float> [[TMP31]], float [[FVAL]], i32 27
-; MAX1024-NEXT:    [[TMP33:%.*]] = insertelement <32 x float> [[TMP32]], float [[FVAL]], i32 28
-; MAX1024-NEXT:    [[TMP34:%.*]] = insertelement <32 x float> [[TMP33]], float [[FVAL]], i32 29
-; MAX1024-NEXT:    [[TMP35:%.*]] = insertelement <32 x float> [[TMP34]], float [[FVAL]], i32 30
-; MAX1024-NEXT:    [[TMP36:%.*]] = insertelement <32 x float> [[TMP35]], float [[FVAL]], i32 31
-; MAX1024-NEXT:    [[TMP37:%.*]] = fmul <32 x float> [[SHUFFLE]], [[TMP36]]
-; MAX1024-NEXT:    [[TMP38:%.*]] = fadd <32 x float> zeroinitializer, [[TMP37]]
-; MAX1024-NEXT:    [[TMP39:%.*]] = insertelement <32 x float> poison, float [[FVAL]], i32 2
-; MAX1024-NEXT:    [[TMP40:%.*]] = extractelement <32 x float> [[TMP38]], i32 0
-; MAX1024-NEXT:    [[TMP41:%.*]] = insertelement <32 x float> [[TMP39]], float [[TMP40]], i32 0
-; MAX1024-NEXT:    [[TMP42:%.*]] = extractelement <32 x float> [[TMP38]], i32 1
-; MAX1024-NEXT:    [[TMP43:%.*]] = insertelement <32 x float> [[TMP41]], float [[TMP42]], i32 1
-; MAX1024-NEXT:    [[TMP44:%.*]] = extractelement <32 x float> [[TMP38]], i32 4
-; MAX1024-NEXT:    [[TMP45:%.*]] = insertelement <32 x float> [[TMP43]], float [[TMP44]], i32 3
-; MAX1024-NEXT:    [[TMP46:%.*]] = extractelement <32 x float> [[TMP38]], i32 5
-; MAX1024-NEXT:    [[TMP47:%.*]] = insertelement <32 x float> [[TMP45]], float [[TMP46]], i32 4
-; MAX1024-NEXT:    [[TMP48:%.*]] = extractelement <32 x float> [[TMP38]], i32 10
-; MAX1024-NEXT:    [[TMP49:%.*]] = insertelement <32 x float> [[TMP47]], float [[TMP48]], i32 5
-; MAX1024-NEXT:    [[TMP50:%.*]] = extractelement <32 x float> [[TMP38]], i32 11
-; MAX1024-NEXT:    [[TMP51:%.*]] = insertelement <32 x float> [[TMP49]], float [[TMP50]], i32 6
-; MAX1024-NEXT:    [[TMP52:%.*]] = extractelement <32 x float> [[TMP38]], i32 14
-; MAX1024-NEXT:    [[TMP53:%.*]] = insertelement <32 x float> [[TMP51]], float [[TMP52]], i32 7
-; MAX1024-NEXT:    [[TMP54:%.*]] = extractelement <32 x float> [[TMP38]], i32 15
-; MAX1024-NEXT:    [[TMP55:%.*]] = insertelement <32 x float> [[TMP53]], float [[TMP54]], i32 8
-; MAX1024-NEXT:    [[TMP56:%.*]] = extractelement <32 x float> [[TMP38]], i32 18
-; MAX1024-NEXT:    [[TMP57:%.*]] = insertelement <32 x float> [[TMP55]], float [[TMP56]], i32 9
-; MAX1024-NEXT:    [[TMP58:%.*]] = extractelement <32 x float> [[TMP38]], i32 19
-; MAX1024-NEXT:    [[TMP59:%.*]] = insertelement <32 x float> [[TMP57]], float [[TMP58]], i32 10
-; MAX1024-NEXT:    [[TMP60:%.*]] = extractelement <32 x float> [[TMP38]], i32 22
-; MAX1024-NEXT:    [[TMP61:%.*]] = insertelement <32 x float> [[TMP59]], float [[TMP60]], i32 11
-; MAX1024-NEXT:    [[TMP62:%.*]] = extractelement <32 x float> [[TMP38]], i32 23
-; MAX1024-NEXT:    [[TMP63:%.*]] = insertelement <32 x float> [[TMP61]], float [[TMP62]], i32 12
-; MAX1024-NEXT:    [[TMP64:%.*]] = extractelement <32 x float> [[TMP38]], i32 26
-; MAX1024-NEXT:    [[TMP65:%.*]] = insertelement <32 x float> [[TMP63]], float [[TMP64]], i32 13
-; MAX1024-NEXT:    [[TMP66:%.*]] = extractelement <32 x float> [[TMP38]], i32 27
-; MAX1024-NEXT:    [[TMP67:%.*]] = insertelement <32 x float> [[TMP65]], float [[TMP66]], i32 14
-; MAX1024-NEXT:    [[TMP68:%.*]] = extractelement <32 x float> [[TMP38]], i32 30
-; MAX1024-NEXT:    [[TMP69:%.*]] = insertelement <32 x float> [[TMP67]], float [[TMP68]], i32 15
-; MAX1024-NEXT:    [[TMP70:%.*]] = extractelement <32 x float> [[TMP38]], i32 31
-; MAX1024-NEXT:    [[TMP71:%.*]] = insertelement <32 x float> [[TMP69]], float [[TMP70]], i32 16
-; MAX1024-NEXT:    [[SHUFFLE3:%.*]] = shufflevector <32 x float> [[TMP71]], <32 x float> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 2, i32 3, i32 4, i32 2, i32 2, i32 2, i32 2, i32 5, i32 6, i32 2, i32 2, i32 7, i32 8, i32 2, i32 2, i32 9, i32 10, i32 2, i32 2, i32 11, i32 12, i32 2, i32 2, i32 13, i32 14, i32 2, i32 2, i32 15, i32 16>
+; MAX1024-NEXT:    [[I:%.*]] = fpext half [[HVAL:%.*]] to float
+; MAX1024-NEXT:    [[I3:%.*]] = fpext half [[HVAL]] to float
+; MAX1024-NEXT:    [[I6:%.*]] = fpext half [[HVAL]] to float
+; MAX1024-NEXT:    [[I9:%.*]] = fpext half [[HVAL]] to float
+; MAX1024-NEXT:    [[TMP0:%.*]] = insertelement <8 x float> poison, float [[I]], i32 0
+; MAX1024-NEXT:    [[TMP1:%.*]] = insertelement <8 x float> [[TMP0]], float [[I]], i32 1
+; MAX1024-NEXT:    [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[I]], i32 2
+; MAX1024-NEXT:    [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[I]], i32 3
+; MAX1024-NEXT:    [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[I]], i32 4
+; MAX1024-NEXT:    [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[I]], i32 5
+; MAX1024-NEXT:    [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[I]], i32 6
+; MAX1024-NEXT:    [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[I]], i32 7
+; MAX1024-NEXT:    [[TMP8:%.*]] = insertelement <8 x float> poison, float [[FVAL:%.*]], i32 0
+; MAX1024-NEXT:    [[TMP9:%.*]] = insertelement <8 x float> [[TMP8]], float [[FVAL]], i32 1
+; MAX1024-NEXT:    [[TMP10:%.*]] = insertelement <8 x float> [[TMP9]], float [[FVAL]], i32 2
+; MAX1024-NEXT:    [[TMP11:%.*]] = insertelement <8 x float> [[TMP10]], float [[FVAL]], i32 3
+; MAX1024-NEXT:    [[TMP12:%.*]] = insertelement <8 x float> [[TMP11]], float [[FVAL]], i32 4
+; MAX1024-NEXT:    [[TMP13:%.*]] = insertelement <8 x float> [[TMP12]], float [[FVAL]], i32 5
+; MAX1024-NEXT:    [[TMP14:%.*]] = insertelement <8 x float> [[TMP13]], float [[FVAL]], i32 6
+; MAX1024-NEXT:    [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[FVAL]], i32 7
+; MAX1024-NEXT:    [[TMP16:%.*]] = fmul <8 x float> [[TMP7]], [[TMP15]]
+; MAX1024-NEXT:    [[TMP17:%.*]] = fadd <8 x float> zeroinitializer, [[TMP16]]
+; MAX1024-NEXT:    [[TMP18:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0
+; MAX1024-NEXT:    [[TMP19:%.*]] = insertelement <8 x float> [[TMP18]], float [[I3]], i32 1
+; MAX1024-NEXT:    [[TMP20:%.*]] = insertelement <8 x float> [[TMP19]], float [[I3]], i32 2
+; MAX1024-NEXT:    [[TMP21:%.*]] = insertelement <8 x float> [[TMP20]], float [[I3]], i32 3
+; MAX1024-NEXT:    [[TMP22:%.*]] = insertelement <8 x float> [[TMP21]], float [[I3]], i32 4
+; MAX1024-NEXT:    [[TMP23:%.*]] = insertelement <8 x float> [[TMP22]], float [[I3]], i32 5
+; MAX1024-NEXT:    [[TMP24:%.*]] = insertelement <8 x float> [[TMP23]], float [[I3]], i32 6
+; MAX1024-NEXT:    [[TMP25:%.*]] = insertelement <8 x float> [[TMP24]], float [[I3]], i32 7
+; MAX1024-NEXT:    [[TMP26:%.*]] = fmul <8 x float> [[TMP25]], [[TMP15]]
+; MAX1024-NEXT:    [[TMP27:%.*]] = fadd <8 x float> zeroinitializer, [[TMP26]]
+; MAX1024-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0
+; MAX1024-NEXT:    [[TMP29:%.*]] = insertelement <8 x float> [[TMP28]], float [[I6]], i32 1
+; MAX1024-NEXT:    [[TMP30:%.*]] = insertelement <8 x float> [[TMP29]], float [[I6]], i32 2
+; MAX1024-NEXT:    [[TMP31:%.*]] = insertelement <8 x float> [[TMP30]], float [[I6]], i32 3
+; MAX1024-NEXT:    [[TMP32:%.*]] = insertelement <8 x float> [[TMP31]], float [[I6]], i32 4
+; MAX1024-NEXT:    [[TMP33:%.*]] = insertelement <8 x float> [[TMP32]], float [[I6]], i32 5
+; MAX1024-NEXT:    [[TMP34:%.*]] = insertelement <8 x float> [[TMP33]], float [[I6]], i32 6
+; MAX1024-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[I6]], i32 7
+; MAX1024-NEXT:    [[TMP36:%.*]] = fmul <8 x float> [[TMP35]], [[TMP15]]
+; MAX1024-NEXT:    [[TMP37:%.*]] = fadd <8 x float> zeroinitializer, [[TMP36]]
+; MAX1024-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0
+; MAX1024-NEXT:    [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[I9]], i32 1
+; MAX1024-NEXT:    [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[I9]], i32 2
+; MAX1024-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[I9]], i32 3
+; MAX1024-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[I9]], i32 4
+; MAX1024-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[I9]], i32 5
+; MAX1024-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[I9]], i32 6
+; MAX1024-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[I9]], i32 7
+; MAX1024-NEXT:    [[TMP46:%.*]] = fmul <8 x float> [[TMP45]], [[TMP15]]
+; MAX1024-NEXT:    [[TMP47:%.*]] = fadd <8 x float> zeroinitializer, [[TMP46]]
 ; MAX1024-NEXT:    switch i32 undef, label [[BB5:%.*]] [
 ; MAX1024-NEXT:    i32 0, label [[BB2:%.*]]
 ; MAX1024-NEXT:    i32 1, label [[BB3:%.*]]
@@ -367,64 +268,16 @@ define void @phi_float32(half %hval, float %fval) {
 ; MAX1024:       bb3:
 ; MAX1024-NEXT:    br label [[BB2]]
 ; MAX1024:       bb4:
-; MAX1024-NEXT:    [[TMP72:%.*]] = insertelement <32 x float> poison, float [[FVAL]], i32 1
-; MAX1024-NEXT:    [[TMP73:%.*]] = insertelement <32 x float> [[TMP72]], float [[TMP40]], i32 0
-; MAX1024-NEXT:    [[TMP74:%.*]] = extractelement <32 x float> [[TMP38]], i32 3
-; MAX1024-NEXT:    [[TMP75:%.*]] = insertelement <32 x float> [[TMP73]], float [[TMP74]], i32 2
-; MAX1024-NEXT:    [[TMP76:%.*]] = insertelement <32 x float> [[TMP75]], float [[TMP44]], i32 3
-; MAX1024-NEXT:    [[TMP77:%.*]] = extractelement <32 x float> [[TMP38]], i32 7
-; MAX1024-NEXT:    [[TMP78:%.*]] = insertelement <32 x float> [[TMP76]], float [[TMP77]], i32 4
-; MAX1024-NEXT:    [[TMP79:%.*]] = extractelement <32 x float> [[TMP38]], i32 8
-; MAX1024-NEXT:    [[TMP80:%.*]] = insertelement <32 x float> [[TMP78]], float [[TMP79]], i32 5
-; MAX1024-NEXT:    [[TMP81:%.*]] = insertelement <32 x float> [[TMP80]], float [[TMP50]], i32 6
-; MAX1024-NEXT:    [[TMP82:%.*]] = extractelement <32 x float> [[TMP38]], i32 12
-; MAX1024-NEXT:    [[TMP83:%.*]] = insertelement <32 x float> [[TMP81]], float [[TMP82]], i32 7
-; MAX1024-NEXT:    [[TMP84:%.*]] = insertelement <32 x float> [[TMP83]], float [[TMP54]], i32 8
-; MAX1024-NEXT:    [[TMP85:%.*]] = extractelement <32 x float> [[TMP38]], i32 16
-; MAX1024-NEXT:    [[TMP86:%.*]] = insertelement <32 x float> [[TMP84]], float [[TMP85]], i32 9
-; MAX1024-NEXT:    [[TMP87:%.*]] = insertelement <32 x float> [[TMP86]], float [[TMP58]], i32 10
-; MAX1024-NEXT:    [[TMP88:%.*]] = extractelement <32 x float> [[TMP38]], i32 20
-; MAX1024-NEXT:    [[TMP89:%.*]] = insertelement <32 x float> [[TMP87]], float [[TMP88]], i32 11
-; MAX1024-NEXT:    [[TMP90:%.*]] = insertelement <32 x float> [[TMP89]], float [[TMP62]], i32 12
-; MAX1024-NEXT:    [[TMP91:%.*]] = extractelement <32 x float> [[TMP38]], i32 24
-; MAX1024-NEXT:    [[TMP92:%.*]] = insertelement <32 x float> [[TMP90]], float [[TMP91]], i32 13
-; MAX1024-NEXT:    [[TMP93:%.*]] = insertelement <32 x float> [[TMP92]], float [[TMP66]], i32 14
-; MAX1024-NEXT:    [[TMP94:%.*]] = extractelement <32 x float> [[TMP38]], i32 28
-; MAX1024-NEXT:    [[TMP95:%.*]] = insertelement <32 x float> [[TMP93]], float [[TMP94]], i32 15
-; MAX1024-NEXT:    [[TMP96:%.*]] = insertelement <32 x float> [[TMP95]], float [[TMP70]], i32 16
-; MAX1024-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <32 x float> [[TMP96]], <32 x float> poison, <32 x i32> <i32 0, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 4, i32 5, i32 1, i32 1, i32 6, i32 7, i32 1, i32 1, i32 8, i32 9, i32 1, i32 1, i32 10, i32 11, i32 1, i32 1, i32 12, i32 13, i32 1, i32 1, i32 14, i32 15, i32 1, i32 1, i32 16>
 ; MAX1024-NEXT:    br label [[BB2]]
 ; MAX1024:       bb5:
-; MAX1024-NEXT:    [[TMP97:%.*]] = insertelement <32 x float> [[TMP5]], float [[TMP42]], i32 1
-; MAX1024-NEXT:    [[TMP98:%.*]] = extractelement <32 x float> [[TMP38]], i32 3
-; MAX1024-NEXT:    [[TMP99:%.*]] = insertelement <32 x float> [[TMP97]], float [[TMP98]], i32 2
-; MAX1024-NEXT:    [[TMP100:%.*]] = insertelement <32 x float> [[TMP99]], float [[TMP46]], i32 3
-; MAX1024-NEXT:    [[TMP101:%.*]] = extractelement <32 x float> [[TMP38]], i32 7
-; MAX1024-NEXT:    [[TMP102:%.*]] = insertelement <32 x float> [[TMP100]], float [[TMP101]], i32 4
-; MAX1024-NEXT:    [[TMP103:%.*]] = extractelement <32 x float> [[TMP38]], i32 8
-; MAX1024-NEXT:    [[TMP104:%.*]] = insertelement <32 x float> [[TMP102]], float [[TMP103]], i32 5
-; MAX1024-NEXT:    [[TMP105:%.*]] = insertelement <32 x float> [[TMP104]], float [[TMP48]], i32 6
-; MAX1024-NEXT:    [[TMP106:%.*]] = extractelement <32 x float> [[TMP38]], i32 12
-; MAX1024-NEXT:    [[TMP107:%.*]] = insertelement <32 x float> [[TMP105]], float [[TMP106]], i32 7
-; MAX1024-NEXT:    [[TMP108:%.*]] = insertelement <32 x float> [[TMP107]], float [[TMP52]], i32 8
-; MAX1024-NEXT:    [[TMP109:%.*]] = extractelement <32 x float> [[TMP38]], i32 16
-; MAX1024-NEXT:    [[TMP110:%.*]] = insertelement <32 x float> [[TMP108]], float [[TMP109]], i32 9
-; MAX1024-NEXT:    [[TMP111:%.*]] = insertelement <32 x float> [[TMP110]], float [[TMP56]], i32 10
-; MAX1024-NEXT:    [[TMP112:%.*]] = extractelement <32 x float> [[TMP38]], i32 20
-; MAX1024-NEXT:    [[TMP113:%.*]] = insertelement <32 x float> [[TMP111]], float [[TMP112]], i32 11
-; MAX1024-NEXT:    [[TMP114:%.*]] = insertelement <32 x float> [[TMP113]], float [[TMP60]], i32 12
-; MAX1024-NEXT:    [[TMP115:%.*]] = extractelement <32 x float> [[TMP38]], i32 24
-; MAX1024-NEXT:    [[TMP116:%.*]] = insertelement <32 x float> [[TMP114]], float [[TMP115]], i32 13
-; MAX1024-NEXT:    [[TMP117:%.*]] = insertelement <32 x float> [[TMP116]], float [[TMP64]], i32 14
-; MAX1024-NEXT:    [[TMP118:%.*]] = extractelement <32 x float> [[TMP38]], i32 28
-; MAX1024-NEXT:    [[TMP119:%.*]] = insertelement <32 x float> [[TMP117]], float [[TMP118]], i32 15
-; MAX1024-NEXT:    [[TMP120:%.*]] = insertelement <32 x float> [[TMP119]], float [[TMP68]], i32 16
-; MAX1024-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <32 x float> [[TMP120]], <32 x float> poison, <32 x i32> <i32 0, i32 1, i32 0, i32 2, i32 0, i32 3, i32 0, i32 4, i32 5, i32 0, i32 6, i32 0, i32 7, i32 0, i32 8, i32 0, i32 9, i32 0, i32 10, i32 0, i32 11, i32 0, i32 12, i32 0, i32 13, i32 0, i32 14, i32 0, i32 15, i32 0, i32 16, i32 0>
 ; MAX1024-NEXT:    br label [[BB2]]
 ; MAX1024:       bb2:
-; MAX1024-NEXT:    [[TMP121:%.*]] = phi <32 x float> [ [[TMP38]], [[BB3]] ], [ [[SHUFFLE1]], [[BB4]] ], [ [[SHUFFLE2]], [[BB5]] ], [ [[SHUFFLE3]], [[BB1]] ]
-; MAX1024-NEXT:    [[TMP122:%.*]] = extractelement <32 x float> [[TMP121]], i32 30
-; MAX1024-NEXT:    store float [[TMP122]], float* undef, align 4
+; MAX1024-NEXT:    [[TMP48:%.*]] = phi <8 x float> [ [[TMP27]], [[BB3]] ], [ [[TMP15]], [[BB4]] ], [ [[TMP15]], [[BB5]] ], [ [[TMP15]], [[BB1]] ]
+; MAX1024-NEXT:    [[TMP49:%.*]] = phi <8 x float> [ [[TMP37]], [[BB3]] ], [ [[TMP15]], [[BB4]] ], [ [[TMP37]], [[BB5]] ], [ [[TMP37]], [[BB1]] ]
+; MAX1024-NEXT:    [[TMP50:%.*]] = phi <8 x float> [ [[TMP47]], [[BB3]] ], [ [[TMP47]], [[BB4]] ], [ [[TMP15]], [[BB5]] ], [ [[TMP47]], [[BB1]] ]
+; MAX1024-NEXT:    [[TMP51:%.*]] = phi <8 x float> [ [[TMP17]], [[BB3]] ], [ [[TMP17]], [[BB4]] ], [ [[TMP17]], [[BB5]] ], [ [[TMP15]], [[BB1]] ]
+; MAX1024-NEXT:    [[TMP52:%.*]] = extractelement <8 x float> [[TMP49]], i32 7
+; MAX1024-NEXT:    store float [[TMP52]], float* undef, align 4
 ; MAX1024-NEXT:    ret void
 ;
 bb:


        


More information about the llvm-commits mailing list