[llvm] r318149 - [LV] Introduce VPBlendRecipe, VPWidenMemoryInstructionRecipe
Gil Rapaport via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 14 04:09:30 PST 2017
Author: gilr
Date: Tue Nov 14 04:09:30 2017
New Revision: 318149
URL: http://llvm.org/viewvc/llvm-project?rev=318149&view=rev
Log:
[LV] Introduce VPBlendRecipe, VPWidenMemoryInstructionRecipe
This patch is part of D38676.
The patch introduces two new Recipes to handle instructions whose vectorization
involves masking. These Recipes take VPlan-level masks in D38676, but still rely
on ILV's existing createEdgeMask(), createBlockInMask() in this patch.
VPBlendRecipe handles intra-loop phi nodes, which are vectorized as a sequence
of SELECTs. Its execute() code is refactored out of ILV::widenPHIInstruction(),
which now handles only loop-header phi nodes.
VPWidenMemoryInstructionRecipe handles load/store which are to be widened
(but are not part of an Interleave Group). In this patch it simply calls
ILV::vectorizeMemoryInstruction on execute().
Differential Revision: https://reviews.llvm.org/D39068
Modified:
llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/trunk/lib/Transforms/Vectorize/VPlan.h
llvm/trunk/test/Transforms/LoopVectorize/X86/x86-predication.ll
Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=318149&r1=318148&r2=318149&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Tue Nov 14 04:09:30 2017
@@ -282,9 +282,12 @@ namespace {
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class LoopVectorizationRequirements;
+class VPBlendRecipe;
class VPInterleaveRecipe;
class VPReplicateRecipe;
class VPWidenIntOrFpInductionRecipe;
+class VPWidenRecipe;
+class VPWidenMemoryInstructionRecipe;
} // end anonymous namespace
@@ -452,6 +455,10 @@ public:
/// mask for the block BB.
VectorParts createBlockInMask(BasicBlock *BB);
+ /// A helper function that computes the predicate of the edge between SRC
+ /// and DST.
+ VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
+
/// Vectorize a single PHINode in a block. This method handles the induction
/// variable canonicalization. It supports both VF = 1 for unrolled loops and
/// arbitrary length vectors.
@@ -504,6 +511,13 @@ public:
/// Try to vectorize the interleaved access group that \p Instr belongs to.
void vectorizeInterleaveGroup(Instruction *Instr);
+ /// Vectorize Load and Store instructions.
+ virtual void vectorizeMemoryInstruction(Instruction *Instr);
+
+ /// \brief Set the debug location in the builder using the debug location in
+ /// the instruction.
+ void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
+
protected:
friend class LoopVectorizationPlanner;
@@ -556,17 +570,10 @@ protected:
/// represented as.
void truncateToMinimalBitwidths();
- /// A helper function that computes the predicate of the edge between SRC
- /// and DST.
- VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
-
/// Insert the new loop to the loop hierarchy and pass manager
/// and update the analysis passes.
void updateAnalysis();
- /// Vectorize Load and Store instructions,
- virtual void vectorizeMemoryInstruction(Instruction *Instr);
-
/// Create a broadcast instruction. This method generates a broadcast
/// instruction (shuffle) for loop invariant values and for the induction
/// value. If this is the induction variable then we extend it to N, N+1, ...
@@ -647,10 +654,6 @@ protected:
/// vector of instructions.
void addMetadata(ArrayRef<Value *> To, Instruction *From);
- /// \brief Set the debug location in the builder using the debug location in
- /// the instruction.
- void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
-
/// The original loop.
Loop *OrigLoop;
@@ -2295,6 +2298,11 @@ private:
/// to \p Range.End.
VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
+ /// Check if \p I is a memory instruction to be widened for \p Range.Start and
+ /// potentially masked.
+ VPWidenMemoryInstructionRecipe *tryToWidenMemory(Instruction *I,
+ VFRange &Range);
+
/// Check if an induction recipe should be constructed for \I within the given
/// VF \p Range. If so build and return it. If not, return null. \p Range.End
/// may be decreased to ensure same decision from \p Range.Start to
@@ -2302,6 +2310,11 @@ private:
VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I,
VFRange &Range);
+ /// Handle non-loop phi nodes. Currently all such phi nodes are turned into
+ /// a sequence of select instructions as the vectorizer currently performs
+ /// full if-conversion.
+ VPBlendRecipe *tryToBlend(Instruction *I);
+
/// Check if \p I can be widened within the given VF \p Range. If \p I can be
/// widened for \p Range.Start, check if the last recipe of \p VPBB can be
/// extended to include \p I or else build a new VPWidenRecipe for it and
@@ -4497,77 +4510,6 @@ void InnerLoopVectorizer::sinkScalarOper
} while (Changed);
}
-InnerLoopVectorizer::VectorParts
-InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
- assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
-
- // Look for cached value.
- std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
- EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
- if (ECEntryIt != EdgeMaskCache.end())
- return ECEntryIt->second;
-
- VectorParts SrcMask = createBlockInMask(Src);
-
- // The terminator has to be a branch inst!
- BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
- assert(BI && "Unexpected terminator found");
-
- if (!BI->isConditional())
- return EdgeMaskCache[Edge] = SrcMask;
-
- VectorParts EdgeMask(UF);
- for (unsigned Part = 0; Part < UF; ++Part) {
- auto *EdgeMaskPart = getOrCreateVectorValue(BI->getCondition(), Part);
- if (BI->getSuccessor(0) != Dst)
- EdgeMaskPart = Builder.CreateNot(EdgeMaskPart);
-
- if (SrcMask[Part]) // Otherwise block in-mask is all-one, no need to AND.
- EdgeMaskPart = Builder.CreateAnd(EdgeMaskPart, SrcMask[Part]);
-
- EdgeMask[Part] = EdgeMaskPart;
- }
-
- return EdgeMaskCache[Edge] = EdgeMask;
-}
-
-InnerLoopVectorizer::VectorParts
-InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
- assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
-
- // Look for cached value.
- BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
- if (BCEntryIt != BlockMaskCache.end())
- return BCEntryIt->second;
-
- // All-one mask is modelled as no-mask following the convention for masked
- // load/store/gather/scatter. Initialize BlockMask to no-mask.
- VectorParts BlockMask(UF);
- for (unsigned Part = 0; Part < UF; ++Part)
- BlockMask[Part] = nullptr;
-
- // Loop incoming mask is all-one.
- if (OrigLoop->getHeader() == BB)
- return BlockMaskCache[BB] = BlockMask;
-
- // This is the block mask. We OR all incoming edges.
- for (auto *Predecessor : predecessors(BB)) {
- VectorParts EdgeMask = createEdgeMask(Predecessor, BB);
- if (!EdgeMask[0]) // Mask of predecessor is all-one so mask of block is too.
- return BlockMaskCache[BB] = EdgeMask;
-
- if (!BlockMask[0]) { // BlockMask has its initialized nullptr value.
- BlockMask = EdgeMask;
- continue;
- }
-
- for (unsigned Part = 0; Part < UF; ++Part)
- BlockMask[Part] = Builder.CreateOr(BlockMask[Part], EdgeMask[Part]);
- }
-
- return BlockMaskCache[BB] = BlockMask;
-}
-
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
unsigned VF) {
PHINode *P = cast<PHINode>(PN);
@@ -4588,43 +4530,6 @@ void InnerLoopVectorizer::widenPHIInstru
}
setDebugLocFromInst(Builder, P);
- // Check for PHI nodes that are lowered to vector selects.
- if (P->getParent() != OrigLoop->getHeader()) {
- // We know that all PHIs in non-header blocks are converted into
- // selects, so we don't have to worry about the insertion order and we
- // can just use the builder.
- // At this point we generate the predication tree. There may be
- // duplications since this is a simple recursive scan, but future
- // optimizations will clean it up.
-
- unsigned NumIncoming = P->getNumIncomingValues();
-
- // Generate a sequence of selects of the form:
- // SELECT(Mask3, In3,
- // SELECT(Mask2, In2,
- // ( ...)))
- VectorParts Entry(UF);
- for (unsigned In = 0; In < NumIncoming; In++) {
- VectorParts Cond =
- createEdgeMask(P->getIncomingBlock(In), P->getParent());
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *In0 = getOrCreateVectorValue(P->getIncomingValue(In), Part);
- assert((Cond[Part] || NumIncoming == 1) &&
- "Multiple predecessors with one predecessor having a full mask");
- if (In == 0)
- Entry[Part] = In0; // Initialize with the first incoming value.
- else
- // Select between the current value and the previous incoming edge
- // based on the incoming mask.
- Entry[Part] = Builder.CreateSelect(Cond[Part], In0, Entry[Part],
- "predphi");
- }
- }
- for (unsigned Part = 0; Part < UF; ++Part)
- VectorLoopValueMap.setVectorValue(P, Part, Entry[Part]);
- return;
- }
// This PHINode must be an induction variable.
// Make sure that we know about it.
@@ -4848,10 +4753,6 @@ void InnerLoopVectorizer::widenInstructi
break;
}
- case Instruction::Store:
- case Instruction::Load:
- vectorizeMemoryInstruction(&I);
- break;
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -4956,7 +4857,7 @@ void InnerLoopVectorizer::widenInstructi
}
default:
- // All other instructions are scalarized.
+ // This instruction is not vectorized by simple widening.
DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
llvm_unreachable("Unhandled instruction!");
} // end of switch.
@@ -7827,6 +7728,82 @@ public:
}
};
+/// A recipe for vectorizing a phi-node as a sequence of mask-based select
+/// instructions.
+class VPBlendRecipe : public VPRecipeBase {
+private:
+ PHINode *Phi;
+
+public:
+ VPBlendRecipe(PHINode *Phi) : VPRecipeBase(VPBlendSC), Phi(Phi) {}
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPBlendSC;
+ }
+
+ /// Generate the phi/select nodes.
+ void execute(VPTransformState &State) override {
+ State.ILV->setDebugLocFromInst(State.Builder, Phi);
+ // We know that all PHIs in non-header blocks are converted into
+ // selects, so we don't have to worry about the insertion order and we
+ // can just use the builder.
+ // At this point we generate the predication tree. There may be
+ // duplications since this is a simple recursive scan, but future
+ // optimizations will clean it up.
+
+ unsigned NumIncoming = Phi->getNumIncomingValues();
+
+ // Generate a sequence of selects of the form:
+ // SELECT(Mask3, In3,
+ // SELECT(Mask2, In2,
+ // ( ...)))
+ InnerLoopVectorizer::VectorParts Entry(State.UF);
+ for (unsigned In = 0; In < NumIncoming; In++) {
+ InnerLoopVectorizer::VectorParts Cond =
+ State.ILV->createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *In0 =
+ State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
+ assert((Cond[Part] || NumIncoming == 1) &&
+ "Multiple predecessors with one predecessor having a full mask");
+ if (In == 0)
+ Entry[Part] = In0; // Initialize with the first incoming value.
+ else
+ // Select between the current value and the previous incoming edge
+ // based on the incoming mask.
+ Entry[Part] = State.Builder.CreateSelect(Cond[Part], In0, Entry[Part],
+ "predphi");
+ }
+ }
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
+ }
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override {
+ O << " +\n" << Indent << "\"BLEND ";
+ Phi->printAsOperand(O, false);
+ O << " =";
+ if (Phi->getNumIncomingValues() == 1) {
+ // Not a User of any mask: not really blending, this is a
+ // single-predecessor phi.
+ O << " ";
+ Phi->getIncomingValue(0)->printAsOperand(O, false);
+ } else {
+ for (unsigned I = 0, E = Phi->getNumIncomingValues(); I < E; ++I) {
+ O << " ";
+ Phi->getIncomingValue(I)->printAsOperand(O, false);
+ O << "/";
+ Phi->getIncomingBlock(I)->printAsOperand(O, false);
+ }
+ }
+ O << "\\l\"";
+
+ }
+};
+
/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
/// or stores into one wide load/store and shuffles.
class VPInterleaveRecipe : public VPRecipeBase {
@@ -7970,6 +7947,31 @@ public:
}
};
+/// A Recipe for widening load/store operations.
+class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
+private:
+ Instruction &Instr;
+
+public:
+ VPWidenMemoryInstructionRecipe(Instruction &Instr)
+ : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr) {}
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPWidenMemoryInstructionSC;
+ }
+
+ /// Generate the wide load/store.
+ void execute(VPTransformState &State) override {
+ State.ILV->vectorizeMemoryInstruction(&Instr);
+ }
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override {
+ O << " +\n" << Indent << "\"WIDEN " << VPlanIngredient(&Instr);
+ O << "\\l\"";
+ }
+};
} // end anonymous namespace
bool LoopVectorizationPlanner::getDecisionAndClampRange(
@@ -7999,6 +8001,77 @@ void LoopVectorizationPlanner::buildVPla
}
}
+InnerLoopVectorizer::VectorParts
+InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
+ assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
+
+ // Look for cached value.
+ std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
+ EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
+ if (ECEntryIt != EdgeMaskCache.end())
+ return ECEntryIt->second;
+
+ VectorParts SrcMask = createBlockInMask(Src);
+
+ // The terminator has to be a branch inst!
+ BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
+ assert(BI && "Unexpected terminator found");
+
+ if (!BI->isConditional())
+ return EdgeMaskCache[Edge] = SrcMask;
+
+ VectorParts EdgeMask(UF);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ auto *EdgeMaskPart = getOrCreateVectorValue(BI->getCondition(), Part);
+ if (BI->getSuccessor(0) != Dst)
+ EdgeMaskPart = Builder.CreateNot(EdgeMaskPart);
+
+ if (SrcMask[Part]) // Otherwise block in-mask is all-one, no need to AND.
+ EdgeMaskPart = Builder.CreateAnd(EdgeMaskPart, SrcMask[Part]);
+
+ EdgeMask[Part] = EdgeMaskPart;
+ }
+
+ return EdgeMaskCache[Edge] = EdgeMask;
+}
+
+InnerLoopVectorizer::VectorParts
+InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
+ assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
+
+ // Look for cached value.
+ BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
+ if (BCEntryIt != BlockMaskCache.end())
+ return BCEntryIt->second;
+
+ // All-one mask is modelled as no-mask following the convention for masked
+ // load/store/gather/scatter. Initialize BlockMask to no-mask.
+ VectorParts BlockMask(UF);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ BlockMask[Part] = nullptr;
+
+ // Loop incoming mask is all-one.
+ if (OrigLoop->getHeader() == BB)
+ return BlockMaskCache[BB] = BlockMask;
+
+ // This is the block mask. We OR all incoming edges.
+ for (auto *Predecessor : predecessors(BB)) {
+ VectorParts EdgeMask = createEdgeMask(Predecessor, BB);
+ if (!EdgeMask[0]) // Mask of predecessor is all-one so mask of block is too.
+ return BlockMaskCache[BB] = EdgeMask;
+
+ if (!BlockMask[0]) { // BlockMask has its initialized nullptr value.
+ BlockMask = EdgeMask;
+ continue;
+ }
+
+ for (unsigned Part = 0; Part < UF; ++Part)
+ BlockMask[Part] = Builder.CreateOr(BlockMask[Part], EdgeMask[Part]);
+ }
+
+ return BlockMaskCache[BB] = BlockMask;
+}
+
VPInterleaveRecipe *
LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I,
VFRange &Range) {
@@ -8026,6 +8099,32 @@ LoopVectorizationPlanner::tryToInterleav
return new VPInterleaveRecipe(IG);
}
+VPWidenMemoryInstructionRecipe *
+LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range) {
+ if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
+ return nullptr;
+
+ auto willWiden = [&](unsigned VF) -> bool {
+ if (VF == 1)
+ return false;
+ if (CM.isScalarAfterVectorization(I, VF) ||
+ CM.isProfitableToScalarize(I, VF))
+ return false;
+ LoopVectorizationCostModel::InstWidening Decision =
+ CM.getWideningDecision(I, VF);
+ assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
+ "CM decision should be taken at this point.");
+ assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
+ "Interleave memory opportunity should be caught earlier.");
+ return Decision != LoopVectorizationCostModel::CM_Scalarize;
+ };
+
+ if (!getDecisionAndClampRange(willWiden, Range))
+ return nullptr;
+
+ return new VPWidenMemoryInstructionRecipe(*I);
+}
+
VPWidenIntOrFpInductionRecipe *
LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I,
VFRange &Range) {
@@ -8060,6 +8159,14 @@ LoopVectorizationPlanner::tryToOptimizeI
return nullptr;
}
+VPBlendRecipe *LoopVectorizationPlanner::tryToBlend(Instruction *I) {
+ PHINode *Phi = dyn_cast<PHINode>(I);
+ if (!Phi || Phi->getParent() == OrigLoop->getHeader())
+ return nullptr;
+
+ return new VPBlendRecipe(Phi);
+}
+
bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
VFRange &Range) {
if (Legal->isScalarWithPredication(I))
@@ -8313,11 +8420,21 @@ std::unique_ptr<VPlan> LoopVectorization
continue;
}
+ // Check if Instr is a memory operation that should be widened.
+ if ((Recipe = tryToWidenMemory(Instr, Range))) {
+ VPBB->appendRecipe(Recipe);
+ continue;
+ }
+
// Check if Instr should form some PHI recipe.
if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
VPBB->appendRecipe(Recipe);
continue;
}
+ if ((Recipe = tryToBlend(Instr))) {
+ VPBB->appendRecipe(Recipe);
+ continue;
+ }
if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
continue;
Modified: llvm/trunk/lib/Transforms/Vectorize/VPlan.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/VPlan.h?rev=318149&r1=318148&r2=318149&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/VPlan.h (original)
+++ llvm/trunk/lib/Transforms/Vectorize/VPlan.h Tue Nov 14 04:09:30 2017
@@ -452,11 +452,13 @@ public:
/// SubclassID field of the VPRecipeBase objects. They are used for concrete
/// type identification.
using VPRecipeTy = enum {
+ VPBlendSC,
VPBranchOnMaskSC,
VPInterleaveSC,
VPPredInstPHISC,
VPReplicateSC,
VPWidenIntOrFpInductionSC,
+ VPWidenMemoryInstructionSC,
VPWidenPHISC,
VPWidenSC,
};
Modified: llvm/trunk/test/Transforms/LoopVectorize/X86/x86-predication.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/x86-predication.ll?rev=318149&r1=318148&r2=318149&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/x86-predication.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/x86-predication.ll Tue Nov 14 04:09:30 2017
@@ -1,4 +1,5 @@
; RUN: opt < %s -mattr=avx -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -simplifycfg -S | FileCheck %s
+; RUN: opt -mcpu=skylake-avx512 -S -force-vector-width=8 -force-vector-interleave=1 -loop-vectorize < %s | FileCheck %s --check-prefix=SINK-GATHER
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
@@ -58,3 +59,40 @@ for.end:
%tmp8 = phi i32 [ %tmp7, %for.inc ]
ret i32 %tmp8
}
+
+; This test ensures that a load, which would have been widened otherwise is
+; instead scalarized if Cost-Model so decided as part of its
+; sink-scalar-operands optimization for predicated instructions.
+;
+; SINK-GATHER: vector.body:
+; SINK-GATHER: pred.udiv.if:
+; SINK-GATHER: %[[T0:.+]] = load i32, i32* %{{.*}}, align 4
+; SINK-GATHER: %{{.*}} = udiv i32 %[[T0]], %{{.*}}
+; SINK-GATHER: pred.udiv.continue:
+define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+ %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
+ %i7 = mul i64 %i, 777
+ br i1 %c, label %if.then, label %for.inc
+
+if.then:
+ %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i7
+ %tmp2 = load i32, i32* %tmp0, align 4
+ %tmp4 = udiv i32 %tmp2, %x
+ br label %for.inc
+
+for.inc:
+ %tmp5 = phi i32 [ %x, %for.body ], [ %tmp4, %if.then]
+ %tmp6 = add i32 %r, %tmp5
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %tmp7 = phi i32 [ %tmp6, %for.inc ]
+ ret i32 %tmp7
+}
More information about the llvm-commits
mailing list