[llvm] b55d4c2 - Revert "[LV] Remove `LoopVectorizationCostModel::useEmulatedMaskMemRefHack()`"

David Green via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 9 12:03:01 PST 2022


Author: David Green
Date: 2022-02-09T20:02:54Z
New Revision: b55d4c2ad8ea083f64f9cd851ac28bfd9a1a020c

URL: https://github.com/llvm/llvm-project/commit/b55d4c2ad8ea083f64f9cd851ac28bfd9a1a020c
DIFF: https://github.com/llvm/llvm-project/commit/b55d4c2ad8ea083f64f9cd851ac28bfd9a1a020c.diff

LOG: Revert "[LV] Remove `LoopVectorizationCostModel::useEmulatedMaskMemRefHack()`"

This reverts commit 77a0da926c9ea86afa9baf28158d79c7678fc6b9, as we've
received multiple reports of it significantly impacting performance
in ways that don't seem to be just target-specific cost models going
wrong. I would offer some reproducers, but the test changes here seem to
be full of them!

Reverting for now; hopefully we can remove the "hack" more carefully
as we go.

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll
    llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll
    llvm/test/Analysis/CostModel/X86/masked-interleaved-load-i16.ll
    llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
    llvm/test/Analysis/CostModel/X86/masked-load-i16.ll
    llvm/test/Analysis/CostModel/X86/masked-load-i32.ll
    llvm/test/Analysis/CostModel/X86/masked-load-i64.ll
    llvm/test/Analysis/CostModel/X86/masked-load-i8.ll
    llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
    llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
    llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
    llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
    llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll
    llvm/test/Transforms/LoopVectorize/optsize.ll
    llvm/test/Transforms/LoopVectorize/tripcount.ll
    llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 79d3d0a815a4..54c3089c62fd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -307,6 +307,11 @@ static cl::opt<bool> InterleaveSmallLoopScalarReduction(
     cl::desc("Enable interleaving for loops with small iteration counts that "
              "contain scalar reductions to expose ILP."));
 
+/// The number of stores in a loop that are allowed to need predication.
+static cl::opt<unsigned> NumberOfStoresToPredicate(
+    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
+    cl::desc("Max number of stores to be predicated behind an if."));
+
 static cl::opt<bool> EnableIndVarRegisterHeur(
     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
     cl::desc("Count the induction variable only once when interleaving"));
@@ -1773,6 +1778,10 @@ class LoopVectorizationCostModel {
   /// as a vector operation.
   bool isConsecutiveLoadOrStore(Instruction *I);
 
+  /// Returns true if an artificially high cost for emulated masked memrefs
+  /// should be used.
+  bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
+
   /// Map of scalar integer values to the smallest bitwidth they can be legally
   /// represented as. The vector equivalents of these values should be truncated
   /// to this type.
@@ -6421,6 +6430,22 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
   return RUs;
 }
 
+bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
+                                                           ElementCount VF) {
+  // TODO: Cost model for emulated masked load/store is completely
+  // broken. This hack guides the cost model to use an artificially
+  // high enough value to practically disable vectorization with such
+  // operations, except where previously deployed legality hack allowed
+  // using very low cost values. This is to avoid regressions coming simply
+  // from moving "masked load/store" check from legality to cost model.
+  // Masked Load/Gather emulation was previously never allowed.
+  // Limited number of Masked Store/Scatter emulation was allowed.
+  assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
+  return isa<LoadInst>(I) ||
+         (isa<StoreInst>(I) &&
+          NumPredStores > NumberOfStoresToPredicate);
+}
+
 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
   // If we aren't vectorizing the loop, or if we've already collected the
   // instructions to scalarize, there's nothing to do. Collection may already
@@ -6446,7 +6471,9 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
         ScalarCostsTy ScalarCosts;
         // Do not apply discount if scalable, because that would lead to
         // invalid scalarization costs.
-        if (!VF.isScalable() &&
+        // Do not apply discount logic if hacked cost is needed
+        // for emulated masked memrefs.
+        if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
         // Remember that BB will remain after vectorization.
@@ -6702,6 +6729,11 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
         /*Insert=*/false, /*Extract=*/true);
     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
+
+    if (useEmulatedMaskMemRefHack(I, VF))
+      // Artificially setting to a high enough value to practically disable
+      // vectorization with such operations.
+      Cost = 3000000;
   }
 
   return Cost;

diff  --git a/llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll
index c52755b7d65c..62412a5d1af0 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll
@@ -17,30 +17,30 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 5 for VF 4 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 11 for VF 8 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 22 for VF 16 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 5 for VF 4 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 11 for VF 8 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 22 for VF 16 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 4 for VF 4 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 9 for VF 8 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 18 for VF 16 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 36 for VF 32 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 3000000 for VF 32 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 4 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 9 for VF 8 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 18 for VF 16 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 36 for VF 32 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 32 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
@@ -50,8 +50,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 48 for VF 32 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; AVX512: LV: Found an estimated cost of 11 for VF 4 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; AVX512: LV: Found an estimated cost of 10 for VF 2 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; AVX512: LV: Found an estimated cost of 22 for VF 4 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
 ; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
 ; AVX512: LV: Found an estimated cost of 18 for VF 16 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
 ; AVX512: LV: Found an estimated cost of 36 for VF 32 For instruction:   %valB.loaded = load i32, i32* %inB, align 4

diff  --git a/llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll
index b38026c824b5..b8eba8b0327b 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll
@@ -17,30 +17,30 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 5 for VF 4 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 10 for VF 8 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 20 for VF 16 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 5 for VF 4 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 10 for VF 8 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 20 for VF 16 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 5 for VF 4 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 10 for VF 8 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 20 for VF 16 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 40 for VF 32 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 3000000 for VF 32 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 5 for VF 4 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 10 for VF 8 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 20 for VF 16 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 40 for VF 32 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 32 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
@@ -50,8 +50,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 48 for VF 32 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; AVX512: LV: Found an estimated cost of 12 for VF 4 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 2 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; AVX512: LV: Found an estimated cost of 24 for VF 4 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
 ; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
 ; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
 ; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction:   %valB.loaded = load i64, i64* %inB, align 8

diff  --git a/llvm/test/Analysis/CostModel/X86/masked-interleaved-load-i16.ll b/llvm/test/Analysis/CostModel/X86/masked-interleaved-load-i16.ll
index 184e23a0128b..d6bfdf9d3848 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-interleaved-load-i16.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-interleaved-load-i16.ll
@@ -89,30 +89,30 @@ for.end:
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
 ;
-; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
-; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
 ;
-; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
-; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
 ;
-; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
-; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
 ;
-; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 17 for VF 16 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
-; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 17 for VF 16 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
 
 ; ENABLED_MASKED_STRIDED: LV: Checking a loop in "test2"
 ;
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
 ;
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 2 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
 ;
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 11 for VF 4 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
 ;
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 11 for VF 8 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
 ;
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 17 for VF 16 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
@@ -164,17 +164,17 @@ for.end:
 ; DISABLED_MASKED_STRIDED: LV: Checking a loop in "test"
 ;
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
-; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
-; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
-; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
-; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 17 for VF 16 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
 
 ; ENABLED_MASKED_STRIDED: LV: Checking a loop in "test"
 ;
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 7 for VF 2 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 9 for VF 4 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 9 for VF 8 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 14 for VF 16 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
 
 define void @test(i16* noalias nocapture %points, i16* noalias nocapture readonly %x, i16* noalias nocapture readnone %y) {

diff  --git a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll b/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
index 224dd75a4dc5..5f67026737fc 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
@@ -89,17 +89,17 @@ for.end:
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %0, i16* %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %2, i16* %arrayidx7, align 2
 ;
-; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i16 %0, i16* %arrayidx2, align 2
-; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i16 %2, i16* %arrayidx7, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction:   store i16 %0, i16* %arrayidx2, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   store i16 %2, i16* %arrayidx7, align 2
 ;
-; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction:   store i16 %0, i16* %arrayidx2, align 2
-; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction:   store i16 %2, i16* %arrayidx7, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 11 for VF 4 For instruction:   store i16 %0, i16* %arrayidx2, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   store i16 %2, i16* %arrayidx7, align 2
 ;
-; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction:   store i16 %0, i16* %arrayidx2, align 2
-; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction:   store i16 %2, i16* %arrayidx7, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 23 for VF 8 For instruction:   store i16 %0, i16* %arrayidx2, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   store i16 %2, i16* %arrayidx7, align 2
 ;
-; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 20 for VF 16 For instruction:   store i16 %0, i16* %arrayidx2, align 2
-; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 20 for VF 16 For instruction:   store i16 %2, i16* %arrayidx7, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 50 for VF 16 For instruction:   store i16 %0, i16* %arrayidx2, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   store i16 %2, i16* %arrayidx7, align 2
 
 ; ENABLED_MASKED_STRIDED: LV: Checking a loop in "test2"
 ;
@@ -107,16 +107,16 @@ for.end:
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %2, i16* %arrayidx7, align 2
 ;
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction:   store i16 %0, i16* %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i16 %2, i16* %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 10 for VF 2 For instruction:   store i16 %2, i16* %arrayidx7, align 2
 ;
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction:   store i16 %0, i16* %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction:   store i16 %2, i16* %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 14 for VF 4 For instruction:   store i16 %2, i16* %arrayidx7, align 2
 ;
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction:   store i16 %0, i16* %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction:   store i16 %2, i16* %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 14 for VF 8 For instruction:   store i16 %2, i16* %arrayidx7, align 2
 ;
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 16 For instruction:   store i16 %0, i16* %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 20 for VF 16 For instruction:   store i16 %2, i16* %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 27 for VF 16 For instruction:   store i16 %2, i16* %arrayidx7, align 2
 
 define void @test2(i16* noalias nocapture %points, i32 %numPoints, i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y) {
 entry:

diff  --git a/llvm/test/Analysis/CostModel/X86/masked-load-i16.ll b/llvm/test/Analysis/CostModel/X86/masked-load-i16.ll
index 2722a52c3d96..c8c3078f1625 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-load-i16.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-load-i16.ll
@@ -16,37 +16,37 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
-; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
-; SSE2: LV: Found an estimated cost of 4 for VF 4 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
-; SSE2: LV: Found an estimated cost of 8 for VF 8 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
-; SSE2: LV: Found an estimated cost of 16 for VF 16 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
+; SSE2: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
+; SSE2: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
+; SSE2: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
+; SSE2: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
-; SSE42: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
-; SSE42: LV: Found an estimated cost of 4 for VF 4 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
-; SSE42: LV: Found an estimated cost of 8 for VF 8 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
-; SSE42: LV: Found an estimated cost of 16 for VF 16 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
+; SSE42: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
+; SSE42: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
+; SSE42: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
+; SSE42: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 4 for VF 4 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 8 for VF 8 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 17 for VF 16 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 34 for VF 32 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 3000000 for VF 32 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 4 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 8 for VF 8 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 17 for VF 16 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 34 for VF 32 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 32 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 4 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 8 for VF 8 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 17 for VF 16 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 34 for VF 32 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 3000000 for VF 32 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i16, i16* %inB, align 2
 ; AVX512: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i16, i16* %inB, align 2

diff  --git a/llvm/test/Analysis/CostModel/X86/masked-load-i32.ll b/llvm/test/Analysis/CostModel/X86/masked-load-i32.ll
index 16c00cfc03b5..f74c9f044d0b 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-load-i32.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-load-i32.ll
@@ -16,16 +16,16 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 5 for VF 4 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 11 for VF 8 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 22 for VF 16 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 5 for VF 4 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 11 for VF 8 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 22 for VF 16 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i32, i32* %inB, align 4
 ; AVX1: LV: Found an estimated cost of 3 for VF 2 For instruction:   %valB.loaded = load i32, i32* %inB, align 4

diff  --git a/llvm/test/Analysis/CostModel/X86/masked-load-i64.ll b/llvm/test/Analysis/CostModel/X86/masked-load-i64.ll
index 1baeff242304..c5a7825348e9 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-load-i64.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-load-i64.ll
@@ -16,16 +16,16 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 5 for VF 4 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 10 for VF 8 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 20 for VF 16 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 5 for VF 4 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 10 for VF 8 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 20 for VF 16 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i64, i64* %inB, align 8
 ; AVX1: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i64, i64* %inB, align 8

diff  --git a/llvm/test/Analysis/CostModel/X86/masked-load-i8.ll b/llvm/test/Analysis/CostModel/X86/masked-load-i8.ll
index 99d0f28a03f8..fc540da58700 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-load-i8.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-load-i8.ll
@@ -16,37 +16,37 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
-; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
-; SSE2: LV: Found an estimated cost of 5 for VF 4 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
-; SSE2: LV: Found an estimated cost of 11 for VF 8 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
-; SSE2: LV: Found an estimated cost of 23 for VF 16 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
+; SSE2: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
+; SSE2: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
+; SSE2: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
+; SSE2: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
-; SSE42: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
-; SSE42: LV: Found an estimated cost of 5 for VF 4 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
-; SSE42: LV: Found an estimated cost of 11 for VF 8 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
-; SSE42: LV: Found an estimated cost of 23 for VF 16 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
+; SSE42: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
+; SSE42: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
+; SSE42: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
+; SSE42: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 4 for VF 4 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 8 for VF 8 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 16 for VF 16 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 33 for VF 32 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 3000000 for VF 32 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 4 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 8 for VF 8 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 16 for VF 16 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 33 for VF 32 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 32 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 4 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 8 for VF 8 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 16 for VF 16 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 33 for VF 32 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 3000000 for VF 32 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %valB.loaded = load i8, i8* %inB, align 1
 ; AVX512: LV: Found an estimated cost of 2 for VF 2 For instruction:   %valB.loaded = load i8, i8* %inB, align 1

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
index 8ce310962b48..bf0aba1931d1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -scalable-vectorization=off -force-vector-width=4 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s
 
 ; NOTE: These tests aren't really target-specific, but it's convenient to target AArch64
@@ -10,43 +9,21 @@ target triple = "aarch64-linux-gnu"
 ; we don't artificially create new predicated blocks for the load.
 define void @uniform_load(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 {
 ; CHECK-LABEL: @uniform_load(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP0]], i64 [[N]])
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[SRC:%.*]], align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[BROADCAST_SPLAT]], <4 x i32>* [[TMP4]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[VAL]], i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-NEXT:    [[IDX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[IDX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[IDX]], 0
+; CHECK-NEXT:    [[LOOP_PRED:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP3]], i64 %n)
+; CHECK-NEXT:    [[LOAD_VAL:%.*]] = load i32, i32* %src, align 4
+; CHECK-NOT:     load i32, i32* %src, align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[LOAD_VAL]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* %dst, i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
+; CHECK-NEXT:    [[STORE_PTR:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP5]], <4 x i32>* [[STORE_PTR]], i32 4, <4 x i1> [[LOOP_PRED]])
+; CHECK-NEXT:    [[IDX_NEXT]] = add i64 [[IDX]], 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IDX_NEXT]], %n.vec
+; CHECK-NEXT:    br i1 [[CMP]], label %middle.block, label %vector.body
 
 entry:
   br label %for.body
@@ -70,108 +47,18 @@ for.end:                                          ; preds = %for.body, %entry
 ; and the original condition.
 define void @cond_uniform_load(i32* nocapture %dst, i32* nocapture readonly %src, i32* nocapture readonly %cond, i64 %n) #0 {
 ; CHECK-LABEL: @cond_uniform_load(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[DST1:%.*]] = bitcast i32* [[DST:%.*]] to i8*
-; CHECK-NEXT:    [[COND3:%.*]] = bitcast i32* [[COND:%.*]] to i8*
-; CHECK-NEXT:    [[SRC6:%.*]] = bitcast i32* [[SRC:%.*]] to i8*
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[DST]], i64 [[N:%.*]]
-; CHECK-NEXT:    [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[COND]], i64 [[N]]
-; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-NEXT:    [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[SRC]], i64 1
-; CHECK-NEXT:    [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8*
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[DST1]], [[SCEVGEP45]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[COND3]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    [[BOUND09:%.*]] = icmp ult i8* [[DST1]], [[SCEVGEP78]]
-; CHECK-NEXT:    [[BOUND110:%.*]] = icmp ult i8* [[SRC6]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
-; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
-; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:         [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* %src, i32 0
+; CHECK-NEXT:    [[SRC_SPLAT:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX12:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT19:%.*]], [[PRED_LOAD_CONTINUE18:%.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX12]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP0]], i64 [[N]])
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison), !alias.scope !4
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[IDX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[IDX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[IDX]], 0
+; CHECK-NEXT:    [[LOOP_PRED:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP3]], i64 %n)
+; CHECK:         [[COND_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{%.*}}, i32 4, <4 x i1> [[LOOP_PRED]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i32> [[COND_LOAD]], zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
-; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK:       pred.load.if:
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[SRC]], align 4, !alias.scope !7
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> poison, i32 [[TMP8]], i32 0
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; CHECK:       pred.load.continue:
-; CHECK-NEXT:    [[TMP10:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP6]], i32 1
-; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
-; CHECK:       pred.load.if13:
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[SRC]], align 4, !alias.scope !7
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP12]], i32 1
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE14]]
-; CHECK:       pred.load.continue14:
-; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF13]] ]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i1> [[TMP6]], i32 2
-; CHECK-NEXT:    br i1 [[TMP15]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
-; CHECK:       pred.load.if15:
-; CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* [[SRC]], align 4, !alias.scope !7
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP16]], i32 2
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE16]]
-; CHECK:       pred.load.continue16:
-; CHECK-NEXT:    [[TMP18:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP17]], [[PRED_LOAD_IF15]] ]
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i1> [[TMP6]], i32 3
-; CHECK-NEXT:    br i1 [[TMP19]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18]]
-; CHECK:       pred.load.if17:
-; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* [[SRC]], align 4, !alias.scope !7
-; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP20]], i32 3
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE18]]
-; CHECK:       pred.load.continue18:
-; CHECK-NEXT:    [[TMP22:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP21]], [[PRED_LOAD_IF17]] ]
-; CHECK-NEXT:    [[TMP23:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP23]], <4 x i32> zeroinitializer, <4 x i32> [[TMP22]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP25:%.*]] = or <4 x i1> [[TMP6]], [[TMP23]]
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i32 0
-; CHECK-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[PREDPHI]], <4 x i32>* [[TMP27]], i32 4, <4 x i1> [[TMP25]]), !alias.scope !9, !noalias !11
-; CHECK-NEXT:    [[INDEX_NEXT19]] = add i64 [[INDEX12]], 4
-; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT19]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP29:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP29]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[TMP30:%.*]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT:    br label [[IF_END]]
-; CHECK:       if.end:
-; CHECK-NEXT:    [[VAL_0:%.*]] = phi i32 [ [[TMP30]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDEX]]
-; CHECK-NEXT:    store i32 [[VAL_0]], i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret void
-;
+; CHECK-NEXT:    [[MASK:%.*]] = select <4 x i1> [[LOOP_PRED]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[SRC_SPLAT]], i32 4, <4 x i1> [[MASK]], <4 x i32> undef)
 entry:
   br label %for.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
index d13942e85466..def98e03030f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -25,22 +25,22 @@ define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger
 ; AVX512-NEXT:  iter.check:
 ; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AVX512:       vector.body:
-; AVX512-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
-; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX7]]
+; AVX512-NEXT:    [[INDEX8:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX8]]
 ; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
 ; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4
 ; AVX512-NEXT:    [[TMP2:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD]], zeroinitializer
-; AVX512-NEXT:    [[TMP3:%.*]] = getelementptr i32, i32* [[INDEX:%.*]], i64 [[INDEX7]]
+; AVX512-NEXT:    [[TMP3:%.*]] = getelementptr i32, i32* [[INDEX:%.*]], i64 [[INDEX8]]
 ; AVX512-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <16 x i32>*
 ; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP4]], i32 4, <16 x i1> [[TMP2]], <16 x i32> poison)
 ; AVX512-NEXT:    [[TMP5:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD]] to <16 x i64>
 ; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <16 x i64> [[TMP5]]
 ; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP6]], i32 4, <16 x i1> [[TMP2]], <16 x float> undef)
 ; AVX512-NEXT:    [[TMP7:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
-; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr float, float* [[OUT:%.*]], i64 [[INDEX7]]
+; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr float, float* [[OUT:%.*]], i64 [[INDEX8]]
 ; AVX512-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <16 x float>*
 ; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP7]], <16 x float>* [[TMP9]], i32 4, <16 x i1> [[TMP2]])
-; AVX512-NEXT:    [[INDEX_NEXT:%.*]] = or i64 [[INDEX7]], 16
+; AVX512-NEXT:    [[INDEX_NEXT:%.*]] = or i64 [[INDEX8]], 16
 ; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]]
 ; AVX512-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>*
 ; AVX512-NEXT:    [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4
@@ -55,7 +55,7 @@ define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger
 ; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr float, float* [[OUT]], i64 [[INDEX_NEXT]]
 ; AVX512-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>*
 ; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP17]], <16 x float>* [[TMP19]], i32 4, <16 x i1> [[TMP12]])
-; AVX512-NEXT:    [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX7]], 32
+; AVX512-NEXT:    [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX8]], 32
 ; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_1]]
 ; AVX512-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <16 x i32>*
 ; AVX512-NEXT:    [[WIDE_LOAD_2:%.*]] = load <16 x i32>, <16 x i32>* [[TMP21]], align 4
@@ -70,7 +70,7 @@ define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger
 ; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr float, float* [[OUT]], i64 [[INDEX_NEXT_1]]
 ; AVX512-NEXT:    [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>*
 ; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP27]], <16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP22]])
-; AVX512-NEXT:    [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX7]], 48
+; AVX512-NEXT:    [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX8]], 48
 ; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_2]]
 ; AVX512-NEXT:    [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>*
 ; AVX512-NEXT:    [[WIDE_LOAD_3:%.*]] = load <16 x i32>, <16 x i32>* [[TMP31]], align 4
@@ -85,7 +85,7 @@ define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger
 ; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr float, float* [[OUT]], i64 [[INDEX_NEXT_2]]
 ; AVX512-NEXT:    [[TMP39:%.*]] = bitcast float* [[TMP38]] to <16 x float>*
 ; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP37]], <16 x float>* [[TMP39]], i32 4, <16 x i1> [[TMP32]])
-; AVX512-NEXT:    [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX7]], 64
+; AVX512-NEXT:    [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX8]], 64
 ; AVX512-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT_3]], 4096
 ; AVX512-NEXT:    br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; AVX512:       for.end:
@@ -95,8 +95,8 @@ define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger
 ; FVW2-NEXT:  entry:
 ; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FVW2:       vector.body:
-; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE27:%.*]] ]
-; FVW2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX7]]
+; FVW2-NEXT:    [[INDEX17:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FVW2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX17]]
 ; FVW2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>*
 ; FVW2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
 ; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
@@ -112,7 +112,7 @@ define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger
 ; FVW2-NEXT:    [[TMP9:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD8]], zeroinitializer
 ; FVW2-NEXT:    [[TMP10:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD9]], zeroinitializer
 ; FVW2-NEXT:    [[TMP11:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD10]], zeroinitializer
-; FVW2-NEXT:    [[TMP12:%.*]] = getelementptr i32, i32* [[INDEX:%.*]], i64 [[INDEX7]]
+; FVW2-NEXT:    [[TMP12:%.*]] = getelementptr i32, i32* [[INDEX:%.*]], i64 [[INDEX17]]
 ; FVW2-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
 ; FVW2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP13]], i32 4, <2 x i1> [[TMP8]], <2 x i32> poison)
 ; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i64 2
@@ -128,105 +128,33 @@ define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger
 ; FVW2-NEXT:    [[TMP21:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD11]] to <2 x i64>
 ; FVW2-NEXT:    [[TMP22:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD12]] to <2 x i64>
 ; FVW2-NEXT:    [[TMP23:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD13]] to <2 x i64>
-; FVW2-NEXT:    [[TMP24:%.*]] = extractelement <2 x i1> [[TMP8]], i64 0
-; FVW2-NEXT:    br i1 [[TMP24]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; FVW2:       pred.load.if:
-; FVW2-NEXT:    [[TMP25:%.*]] = extractelement <2 x i64> [[TMP20]], i64 0
-; FVW2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], i64 [[TMP25]]
-; FVW2-NEXT:    [[TMP27:%.*]] = load float, float* [[TMP26]], align 4
-; FVW2-NEXT:    [[TMP28:%.*]] = insertelement <2 x float> poison, float [[TMP27]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; FVW2:       pred.load.continue:
-; FVW2-NEXT:    [[TMP29:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP28]], [[PRED_LOAD_IF]] ]
-; FVW2-NEXT:    [[TMP30:%.*]] = extractelement <2 x i1> [[TMP8]], i64 1
-; FVW2-NEXT:    br i1 [[TMP30]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]]
-; FVW2:       pred.load.if14:
-; FVW2-NEXT:    [[TMP31:%.*]] = extractelement <2 x i64> [[TMP20]], i64 1
-; FVW2-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[TMP31]]
-; FVW2-NEXT:    [[TMP33:%.*]] = load float, float* [[TMP32]], align 4
-; FVW2-NEXT:    [[TMP34:%.*]] = insertelement <2 x float> [[TMP29]], float [[TMP33]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE15]]
-; FVW2:       pred.load.continue15:
-; FVW2-NEXT:    [[TMP35:%.*]] = phi <2 x float> [ [[TMP29]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP34]], [[PRED_LOAD_IF14]] ]
-; FVW2-NEXT:    [[TMP36:%.*]] = extractelement <2 x i1> [[TMP9]], i64 0
-; FVW2-NEXT:    br i1 [[TMP36]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]]
-; FVW2:       pred.load.if16:
-; FVW2-NEXT:    [[TMP37:%.*]] = extractelement <2 x i64> [[TMP21]], i64 0
-; FVW2-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[TMP37]]
-; FVW2-NEXT:    [[TMP39:%.*]] = load float, float* [[TMP38]], align 4
-; FVW2-NEXT:    [[TMP40:%.*]] = insertelement <2 x float> poison, float [[TMP39]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE17]]
-; FVW2:       pred.load.continue17:
-; FVW2-NEXT:    [[TMP41:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE15]] ], [ [[TMP40]], [[PRED_LOAD_IF16]] ]
-; FVW2-NEXT:    [[TMP42:%.*]] = extractelement <2 x i1> [[TMP9]], i64 1
-; FVW2-NEXT:    br i1 [[TMP42]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]]
-; FVW2:       pred.load.if18:
-; FVW2-NEXT:    [[TMP43:%.*]] = extractelement <2 x i64> [[TMP21]], i64 1
-; FVW2-NEXT:    [[TMP44:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[TMP43]]
-; FVW2-NEXT:    [[TMP45:%.*]] = load float, float* [[TMP44]], align 4
-; FVW2-NEXT:    [[TMP46:%.*]] = insertelement <2 x float> [[TMP41]], float [[TMP45]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE19]]
-; FVW2:       pred.load.continue19:
-; FVW2-NEXT:    [[TMP47:%.*]] = phi <2 x float> [ [[TMP41]], [[PRED_LOAD_CONTINUE17]] ], [ [[TMP46]], [[PRED_LOAD_IF18]] ]
-; FVW2-NEXT:    [[TMP48:%.*]] = extractelement <2 x i1> [[TMP10]], i64 0
-; FVW2-NEXT:    br i1 [[TMP48]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]]
-; FVW2:       pred.load.if20:
-; FVW2-NEXT:    [[TMP49:%.*]] = extractelement <2 x i64> [[TMP22]], i64 0
-; FVW2-NEXT:    [[TMP50:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[TMP49]]
-; FVW2-NEXT:    [[TMP51:%.*]] = load float, float* [[TMP50]], align 4
-; FVW2-NEXT:    [[TMP52:%.*]] = insertelement <2 x float> poison, float [[TMP51]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE21]]
-; FVW2:       pred.load.continue21:
-; FVW2-NEXT:    [[TMP53:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE19]] ], [ [[TMP52]], [[PRED_LOAD_IF20]] ]
-; FVW2-NEXT:    [[TMP54:%.*]] = extractelement <2 x i1> [[TMP10]], i64 1
-; FVW2-NEXT:    br i1 [[TMP54]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]]
-; FVW2:       pred.load.if22:
-; FVW2-NEXT:    [[TMP55:%.*]] = extractelement <2 x i64> [[TMP22]], i64 1
-; FVW2-NEXT:    [[TMP56:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[TMP55]]
-; FVW2-NEXT:    [[TMP57:%.*]] = load float, float* [[TMP56]], align 4
-; FVW2-NEXT:    [[TMP58:%.*]] = insertelement <2 x float> [[TMP53]], float [[TMP57]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE23]]
-; FVW2:       pred.load.continue23:
-; FVW2-NEXT:    [[TMP59:%.*]] = phi <2 x float> [ [[TMP53]], [[PRED_LOAD_CONTINUE21]] ], [ [[TMP58]], [[PRED_LOAD_IF22]] ]
-; FVW2-NEXT:    [[TMP60:%.*]] = extractelement <2 x i1> [[TMP11]], i64 0
-; FVW2-NEXT:    br i1 [[TMP60]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]]
-; FVW2:       pred.load.if24:
-; FVW2-NEXT:    [[TMP61:%.*]] = extractelement <2 x i64> [[TMP23]], i64 0
-; FVW2-NEXT:    [[TMP62:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[TMP61]]
-; FVW2-NEXT:    [[TMP63:%.*]] = load float, float* [[TMP62]], align 4
-; FVW2-NEXT:    [[TMP64:%.*]] = insertelement <2 x float> poison, float [[TMP63]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE25]]
-; FVW2:       pred.load.continue25:
-; FVW2-NEXT:    [[TMP65:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE23]] ], [ [[TMP64]], [[PRED_LOAD_IF24]] ]
-; FVW2-NEXT:    [[TMP66:%.*]] = extractelement <2 x i1> [[TMP11]], i64 1
-; FVW2-NEXT:    br i1 [[TMP66]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27]]
-; FVW2:       pred.load.if26:
-; FVW2-NEXT:    [[TMP67:%.*]] = extractelement <2 x i64> [[TMP23]], i64 1
-; FVW2-NEXT:    [[TMP68:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[TMP67]]
-; FVW2-NEXT:    [[TMP69:%.*]] = load float, float* [[TMP68]], align 4
-; FVW2-NEXT:    [[TMP70:%.*]] = insertelement <2 x float> [[TMP65]], float [[TMP69]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE27]]
-; FVW2:       pred.load.continue27:
-; FVW2-NEXT:    [[TMP71:%.*]] = phi <2 x float> [ [[TMP65]], [[PRED_LOAD_CONTINUE25]] ], [ [[TMP70]], [[PRED_LOAD_IF26]] ]
-; FVW2-NEXT:    [[TMP72:%.*]] = fadd <2 x float> [[TMP35]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP73:%.*]] = fadd <2 x float> [[TMP47]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP74:%.*]] = fadd <2 x float> [[TMP59]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP75:%.*]] = fadd <2 x float> [[TMP71]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP76:%.*]] = getelementptr float, float* [[OUT:%.*]], i64 [[INDEX7]]
-; FVW2-NEXT:    [[TMP77:%.*]] = bitcast float* [[TMP76]] to <2 x float>*
-; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP72]], <2 x float>* [[TMP77]], i32 4, <2 x i1> [[TMP8]])
-; FVW2-NEXT:    [[TMP78:%.*]] = getelementptr float, float* [[TMP76]], i64 2
-; FVW2-NEXT:    [[TMP79:%.*]] = bitcast float* [[TMP78]] to <2 x float>*
-; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP73]], <2 x float>* [[TMP79]], i32 4, <2 x i1> [[TMP9]])
-; FVW2-NEXT:    [[TMP80:%.*]] = getelementptr float, float* [[TMP76]], i64 4
-; FVW2-NEXT:    [[TMP81:%.*]] = bitcast float* [[TMP80]] to <2 x float>*
-; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP74]], <2 x float>* [[TMP81]], i32 4, <2 x i1> [[TMP10]])
-; FVW2-NEXT:    [[TMP82:%.*]] = getelementptr float, float* [[TMP76]], i64 6
-; FVW2-NEXT:    [[TMP83:%.*]] = bitcast float* [[TMP82]] to <2 x float>*
-; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP75]], <2 x float>* [[TMP83]], i32 4, <2 x i1> [[TMP11]])
-; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 8
-; FVW2-NEXT:    [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; FVW2-NEXT:    br i1 [[TMP84]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; FVW2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <2 x i64> [[TMP20]]
+; FVW2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP21]]
+; FVW2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP22]]
+; FVW2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP23]]
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP24]], i32 4, <2 x i1> [[TMP8]], <2 x float> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP25]], i32 4, <2 x i1> [[TMP9]], <2 x float> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP26]], i32 4, <2 x i1> [[TMP10]], <2 x float> undef)
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP27]], i32 4, <2 x i1> [[TMP11]], <2 x float> undef)
+; FVW2-NEXT:    [[TMP28:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP29:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP30:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP31:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP32:%.*]] = getelementptr float, float* [[OUT:%.*]], i64 [[INDEX17]]
+; FVW2-NEXT:    [[TMP33:%.*]] = bitcast float* [[TMP32]] to <2 x float>*
+; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP28]], <2 x float>* [[TMP33]], i32 4, <2 x i1> [[TMP8]])
+; FVW2-NEXT:    [[TMP34:%.*]] = getelementptr float, float* [[TMP32]], i64 2
+; FVW2-NEXT:    [[TMP35:%.*]] = bitcast float* [[TMP34]] to <2 x float>*
+; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP29]], <2 x float>* [[TMP35]], i32 4, <2 x i1> [[TMP9]])
+; FVW2-NEXT:    [[TMP36:%.*]] = getelementptr float, float* [[TMP32]], i64 4
+; FVW2-NEXT:    [[TMP37:%.*]] = bitcast float* [[TMP36]] to <2 x float>*
+; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP30]], <2 x float>* [[TMP37]], i32 4, <2 x i1> [[TMP10]])
+; FVW2-NEXT:    [[TMP38:%.*]] = getelementptr float, float* [[TMP32]], i64 6
+; FVW2-NEXT:    [[TMP39:%.*]] = bitcast float* [[TMP38]] to <2 x float>*
+; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP31]], <2 x float>* [[TMP39]], i32 4, <2 x i1> [[TMP11]])
+; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX17]], 8
+; FVW2-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; FVW2-NEXT:    br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; FVW2:       for.end:
 ; FVW2-NEXT:    ret void
 ;
@@ -437,186 +365,40 @@ define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %tr
 ; FVW2-NEXT:  entry:
 ; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FVW2:       vector.body:
-; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE35:%.*]] ]
-; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4
+; FVW2-NEXT:    [[INDEX10:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ]
+; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ]
+; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX10]], 4
 ; FVW2-NEXT:    [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16
-; FVW2-NEXT:    [[TMP1:%.*]] = or i64 [[OFFSET_IDX]], 32
-; FVW2-NEXT:    [[TMP2:%.*]] = or i64 [[OFFSET_IDX]], 48
-; FVW2-NEXT:    [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 64
-; FVW2-NEXT:    [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 80
-; FVW2-NEXT:    [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 96
-; FVW2-NEXT:    [[TMP6:%.*]] = or i64 [[OFFSET_IDX]], 112
-; FVW2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]]
-; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
-; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
-; FVW2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
-; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
-; FVW2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]]
-; FVW2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]]
-; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP6]]
-; FVW2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4
-; FVW2-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4
-; FVW2-NEXT:    [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP15]], i64 0
-; FVW2-NEXT:    [[TMP18:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP16]], i64 1
-; FVW2-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP9]], align 4
-; FVW2-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP10]], align 4
-; FVW2-NEXT:    [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
-; FVW2-NEXT:    [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
-; FVW2-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP11]], align 4
-; FVW2-NEXT:    [[TMP24:%.*]] = load i32, i32* [[TMP12]], align 4
-; FVW2-NEXT:    [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP23]], i64 0
-; FVW2-NEXT:    [[TMP26:%.*]] = insertelement <2 x i32> [[TMP25]], i32 [[TMP24]], i64 1
-; FVW2-NEXT:    [[TMP27:%.*]] = load i32, i32* [[TMP13]], align 4
-; FVW2-NEXT:    [[TMP28:%.*]] = load i32, i32* [[TMP14]], align 4
-; FVW2-NEXT:    [[TMP29:%.*]] = insertelement <2 x i32> poison, i32 [[TMP27]], i64 0
-; FVW2-NEXT:    [[TMP30:%.*]] = insertelement <2 x i32> [[TMP29]], i32 [[TMP28]], i64 1
-; FVW2-NEXT:    [[TMP31:%.*]] = icmp sgt <2 x i32> [[TMP18]], zeroinitializer
-; FVW2-NEXT:    [[TMP32:%.*]] = icmp sgt <2 x i32> [[TMP22]], zeroinitializer
-; FVW2-NEXT:    [[TMP33:%.*]] = icmp sgt <2 x i32> [[TMP26]], zeroinitializer
-; FVW2-NEXT:    [[TMP34:%.*]] = icmp sgt <2 x i32> [[TMP30]], zeroinitializer
-; FVW2-NEXT:    [[TMP35:%.*]] = extractelement <2 x i1> [[TMP31]], i64 0
-; FVW2-NEXT:    br i1 [[TMP35]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; FVW2:       pred.load.if:
-; FVW2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], i64 [[OFFSET_IDX]], i32 1
-; FVW2-NEXT:    [[TMP37:%.*]] = load float, float* [[TMP36]], align 4
-; FVW2-NEXT:    [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP37]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; FVW2:       pred.load.continue:
-; FVW2-NEXT:    [[TMP39:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP38]], [[PRED_LOAD_IF]] ]
-; FVW2-NEXT:    [[TMP40:%.*]] = extractelement <2 x i1> [[TMP31]], i64 1
-; FVW2-NEXT:    br i1 [[TMP40]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]]
-; FVW2:       pred.load.if8:
-; FVW2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP0]], i32 1
-; FVW2-NEXT:    [[TMP42:%.*]] = load float, float* [[TMP41]], align 4
-; FVW2-NEXT:    [[TMP43:%.*]] = insertelement <2 x float> [[TMP39]], float [[TMP42]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE9]]
-; FVW2:       pred.load.continue9:
-; FVW2-NEXT:    [[TMP44:%.*]] = phi <2 x float> [ [[TMP39]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP43]], [[PRED_LOAD_IF8]] ]
-; FVW2-NEXT:    [[TMP45:%.*]] = extractelement <2 x i1> [[TMP32]], i64 0
-; FVW2-NEXT:    br i1 [[TMP45]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]]
-; FVW2:       pred.load.if10:
-; FVW2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP1]], i32 1
-; FVW2-NEXT:    [[TMP47:%.*]] = load float, float* [[TMP46]], align 4
-; FVW2-NEXT:    [[TMP48:%.*]] = insertelement <2 x float> poison, float [[TMP47]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE11]]
-; FVW2:       pred.load.continue11:
-; FVW2-NEXT:    [[TMP49:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP48]], [[PRED_LOAD_IF10]] ]
-; FVW2-NEXT:    [[TMP50:%.*]] = extractelement <2 x i1> [[TMP32]], i64 1
-; FVW2-NEXT:    br i1 [[TMP50]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]]
-; FVW2:       pred.load.if12:
-; FVW2-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP2]], i32 1
-; FVW2-NEXT:    [[TMP52:%.*]] = load float, float* [[TMP51]], align 4
-; FVW2-NEXT:    [[TMP53:%.*]] = insertelement <2 x float> [[TMP49]], float [[TMP52]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE13]]
-; FVW2:       pred.load.continue13:
-; FVW2-NEXT:    [[TMP54:%.*]] = phi <2 x float> [ [[TMP49]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP53]], [[PRED_LOAD_IF12]] ]
-; FVW2-NEXT:    [[TMP55:%.*]] = extractelement <2 x i1> [[TMP33]], i64 0
-; FVW2-NEXT:    br i1 [[TMP55]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]]
-; FVW2:       pred.load.if14:
-; FVW2-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP3]], i32 1
-; FVW2-NEXT:    [[TMP57:%.*]] = load float, float* [[TMP56]], align 4
-; FVW2-NEXT:    [[TMP58:%.*]] = insertelement <2 x float> poison, float [[TMP57]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE15]]
-; FVW2:       pred.load.continue15:
-; FVW2-NEXT:    [[TMP59:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE13]] ], [ [[TMP58]], [[PRED_LOAD_IF14]] ]
-; FVW2-NEXT:    [[TMP60:%.*]] = extractelement <2 x i1> [[TMP33]], i64 1
-; FVW2-NEXT:    br i1 [[TMP60]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]]
-; FVW2:       pred.load.if16:
-; FVW2-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP4]], i32 1
-; FVW2-NEXT:    [[TMP62:%.*]] = load float, float* [[TMP61]], align 4
-; FVW2-NEXT:    [[TMP63:%.*]] = insertelement <2 x float> [[TMP59]], float [[TMP62]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE17]]
-; FVW2:       pred.load.continue17:
-; FVW2-NEXT:    [[TMP64:%.*]] = phi <2 x float> [ [[TMP59]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP63]], [[PRED_LOAD_IF16]] ]
-; FVW2-NEXT:    [[TMP65:%.*]] = extractelement <2 x i1> [[TMP34]], i64 0
-; FVW2-NEXT:    br i1 [[TMP65]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]]
-; FVW2:       pred.load.if18:
-; FVW2-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP5]], i32 1
-; FVW2-NEXT:    [[TMP67:%.*]] = load float, float* [[TMP66]], align 4
-; FVW2-NEXT:    [[TMP68:%.*]] = insertelement <2 x float> poison, float [[TMP67]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE19]]
-; FVW2:       pred.load.continue19:
-; FVW2-NEXT:    [[TMP69:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP68]], [[PRED_LOAD_IF18]] ]
-; FVW2-NEXT:    [[TMP70:%.*]] = extractelement <2 x i1> [[TMP34]], i64 1
-; FVW2-NEXT:    br i1 [[TMP70]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]]
-; FVW2:       pred.load.if20:
-; FVW2-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP6]], i32 1
-; FVW2-NEXT:    [[TMP72:%.*]] = load float, float* [[TMP71]], align 4
-; FVW2-NEXT:    [[TMP73:%.*]] = insertelement <2 x float> [[TMP69]], float [[TMP72]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE21]]
-; FVW2:       pred.load.continue21:
-; FVW2-NEXT:    [[TMP74:%.*]] = phi <2 x float> [ [[TMP69]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP73]], [[PRED_LOAD_IF20]] ]
-; FVW2-NEXT:    [[TMP75:%.*]] = fadd <2 x float> [[TMP44]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP76:%.*]] = fadd <2 x float> [[TMP54]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP77:%.*]] = fadd <2 x float> [[TMP64]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP78:%.*]] = fadd <2 x float> [[TMP74]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP79:%.*]] = extractelement <2 x i1> [[TMP31]], i64 0
-; FVW2-NEXT:    br i1 [[TMP79]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
+; FVW2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4
+; FVW2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
+; FVW2-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
+; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
+; FVW2-NEXT:    [[TMP7:%.*]] = icmp sgt <2 x i32> [[TMP6]], zeroinitializer
+; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
+; FVW2-NEXT:    [[TMP9:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i64 0
+; FVW2-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; FVW2:       pred.store.if:
-; FVW2-NEXT:    [[TMP80:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]]
-; FVW2-NEXT:    [[TMP81:%.*]] = extractelement <2 x float> [[TMP75]], i64 0
-; FVW2-NEXT:    store float [[TMP81]], float* [[TMP80]], align 4
+; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i64 0
+; FVW2-NEXT:    store float [[TMP12]], float* [[TMP11]], align 4
 ; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; FVW2:       pred.store.continue:
-; FVW2-NEXT:    [[TMP82:%.*]] = extractelement <2 x i1> [[TMP31]], i64 1
-; FVW2-NEXT:    br i1 [[TMP82]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
-; FVW2:       pred.store.if22:
-; FVW2-NEXT:    [[TMP83:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP0]]
-; FVW2-NEXT:    [[TMP84:%.*]] = extractelement <2 x float> [[TMP75]], i64 1
-; FVW2-NEXT:    store float [[TMP84]], float* [[TMP83]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE23]]
-; FVW2:       pred.store.continue23:
-; FVW2-NEXT:    [[TMP85:%.*]] = extractelement <2 x i1> [[TMP32]], i64 0
-; FVW2-NEXT:    br i1 [[TMP85]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
-; FVW2:       pred.store.if24:
-; FVW2-NEXT:    [[TMP86:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP1]]
-; FVW2-NEXT:    [[TMP87:%.*]] = extractelement <2 x float> [[TMP76]], i64 0
-; FVW2-NEXT:    store float [[TMP87]], float* [[TMP86]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE25]]
-; FVW2:       pred.store.continue25:
-; FVW2-NEXT:    [[TMP88:%.*]] = extractelement <2 x i1> [[TMP32]], i64 1
-; FVW2-NEXT:    br i1 [[TMP88]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
-; FVW2:       pred.store.if26:
-; FVW2-NEXT:    [[TMP89:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP2]]
-; FVW2-NEXT:    [[TMP90:%.*]] = extractelement <2 x float> [[TMP76]], i64 1
-; FVW2-NEXT:    store float [[TMP90]], float* [[TMP89]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE27]]
-; FVW2:       pred.store.continue27:
-; FVW2-NEXT:    [[TMP91:%.*]] = extractelement <2 x i1> [[TMP33]], i64 0
-; FVW2-NEXT:    br i1 [[TMP91]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]]
-; FVW2:       pred.store.if28:
-; FVW2-NEXT:    [[TMP92:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP3]]
-; FVW2-NEXT:    [[TMP93:%.*]] = extractelement <2 x float> [[TMP77]], i64 0
-; FVW2-NEXT:    store float [[TMP93]], float* [[TMP92]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE29]]
-; FVW2:       pred.store.continue29:
-; FVW2-NEXT:    [[TMP94:%.*]] = extractelement <2 x i1> [[TMP33]], i64 1
-; FVW2-NEXT:    br i1 [[TMP94]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]]
-; FVW2:       pred.store.if30:
-; FVW2-NEXT:    [[TMP95:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP4]]
-; FVW2-NEXT:    [[TMP96:%.*]] = extractelement <2 x float> [[TMP77]], i64 1
-; FVW2-NEXT:    store float [[TMP96]], float* [[TMP95]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE31]]
-; FVW2:       pred.store.continue31:
-; FVW2-NEXT:    [[TMP97:%.*]] = extractelement <2 x i1> [[TMP34]], i64 0
-; FVW2-NEXT:    br i1 [[TMP97]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]]
-; FVW2:       pred.store.if32:
-; FVW2-NEXT:    [[TMP98:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP5]]
-; FVW2-NEXT:    [[TMP99:%.*]] = extractelement <2 x float> [[TMP78]], i64 0
-; FVW2-NEXT:    store float [[TMP99]], float* [[TMP98]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE33]]
-; FVW2:       pred.store.continue33:
-; FVW2-NEXT:    [[TMP100:%.*]] = extractelement <2 x i1> [[TMP34]], i64 1
-; FVW2-NEXT:    br i1 [[TMP100]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35]]
-; FVW2:       pred.store.if34:
-; FVW2-NEXT:    [[TMP101:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP6]]
-; FVW2-NEXT:    [[TMP102:%.*]] = extractelement <2 x float> [[TMP78]], i64 1
-; FVW2-NEXT:    store float [[TMP102]], float* [[TMP101]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE35]]
-; FVW2:       pred.store.continue35:
-; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 8
-; FVW2-NEXT:    [[TMP103:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; FVW2-NEXT:    br i1 [[TMP103]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i64 1
+; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]]
+; FVW2:       pred.store.if8:
+; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP0]]
+; FVW2-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i64 1
+; FVW2-NEXT:    store float [[TMP15]], float* [[TMP14]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE9]]
+; FVW2:       pred.store.continue9:
+; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX10]], 2
+; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
+; FVW2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT:    br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; FVW2:       for.end:
 ; FVW2-NEXT:    ret void
 ;
@@ -828,186 +610,40 @@ define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noali
 ; FVW2-NEXT:  entry:
 ; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FVW2:       vector.body:
-; FVW2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE34:%.*]] ]
+; FVW2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
+; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ]
 ; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 4
 ; FVW2-NEXT:    [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16
-; FVW2-NEXT:    [[TMP1:%.*]] = or i64 [[OFFSET_IDX]], 32
-; FVW2-NEXT:    [[TMP2:%.*]] = or i64 [[OFFSET_IDX]], 48
-; FVW2-NEXT:    [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 64
-; FVW2-NEXT:    [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 80
-; FVW2-NEXT:    [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 96
-; FVW2-NEXT:    [[TMP6:%.*]] = or i64 [[OFFSET_IDX]], 112
-; FVW2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]]
-; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
-; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
-; FVW2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
-; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
-; FVW2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]]
-; FVW2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]]
-; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP6]]
-; FVW2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4
-; FVW2-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4
-; FVW2-NEXT:    [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP15]], i64 0
-; FVW2-NEXT:    [[TMP18:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP16]], i64 1
-; FVW2-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP9]], align 4
-; FVW2-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP10]], align 4
-; FVW2-NEXT:    [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
-; FVW2-NEXT:    [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
-; FVW2-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP11]], align 4
-; FVW2-NEXT:    [[TMP24:%.*]] = load i32, i32* [[TMP12]], align 4
-; FVW2-NEXT:    [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP23]], i64 0
-; FVW2-NEXT:    [[TMP26:%.*]] = insertelement <2 x i32> [[TMP25]], i32 [[TMP24]], i64 1
-; FVW2-NEXT:    [[TMP27:%.*]] = load i32, i32* [[TMP13]], align 4
-; FVW2-NEXT:    [[TMP28:%.*]] = load i32, i32* [[TMP14]], align 4
-; FVW2-NEXT:    [[TMP29:%.*]] = insertelement <2 x i32> poison, i32 [[TMP27]], i64 0
-; FVW2-NEXT:    [[TMP30:%.*]] = insertelement <2 x i32> [[TMP29]], i32 [[TMP28]], i64 1
-; FVW2-NEXT:    [[TMP31:%.*]] = icmp sgt <2 x i32> [[TMP18]], zeroinitializer
-; FVW2-NEXT:    [[TMP32:%.*]] = icmp sgt <2 x i32> [[TMP22]], zeroinitializer
-; FVW2-NEXT:    [[TMP33:%.*]] = icmp sgt <2 x i32> [[TMP26]], zeroinitializer
-; FVW2-NEXT:    [[TMP34:%.*]] = icmp sgt <2 x i32> [[TMP30]], zeroinitializer
-; FVW2-NEXT:    [[TMP35:%.*]] = extractelement <2 x i1> [[TMP31]], i64 0
-; FVW2-NEXT:    br i1 [[TMP35]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; FVW2:       pred.load.if:
-; FVW2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], i64 [[OFFSET_IDX]], i32 1
-; FVW2-NEXT:    [[TMP37:%.*]] = load float, float* [[TMP36]], align 4
-; FVW2-NEXT:    [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP37]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; FVW2:       pred.load.continue:
-; FVW2-NEXT:    [[TMP39:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP38]], [[PRED_LOAD_IF]] ]
-; FVW2-NEXT:    [[TMP40:%.*]] = extractelement <2 x i1> [[TMP31]], i64 1
-; FVW2-NEXT:    br i1 [[TMP40]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
-; FVW2:       pred.load.if7:
-; FVW2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP0]], i32 1
-; FVW2-NEXT:    [[TMP42:%.*]] = load float, float* [[TMP41]], align 4
-; FVW2-NEXT:    [[TMP43:%.*]] = insertelement <2 x float> [[TMP39]], float [[TMP42]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE8]]
-; FVW2:       pred.load.continue8:
-; FVW2-NEXT:    [[TMP44:%.*]] = phi <2 x float> [ [[TMP39]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP43]], [[PRED_LOAD_IF7]] ]
-; FVW2-NEXT:    [[TMP45:%.*]] = extractelement <2 x i1> [[TMP32]], i64 0
-; FVW2-NEXT:    br i1 [[TMP45]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
-; FVW2:       pred.load.if9:
-; FVW2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP1]], i32 1
-; FVW2-NEXT:    [[TMP47:%.*]] = load float, float* [[TMP46]], align 4
-; FVW2-NEXT:    [[TMP48:%.*]] = insertelement <2 x float> poison, float [[TMP47]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE10]]
-; FVW2:       pred.load.continue10:
-; FVW2-NEXT:    [[TMP49:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE8]] ], [ [[TMP48]], [[PRED_LOAD_IF9]] ]
-; FVW2-NEXT:    [[TMP50:%.*]] = extractelement <2 x i1> [[TMP32]], i64 1
-; FVW2-NEXT:    br i1 [[TMP50]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
-; FVW2:       pred.load.if11:
-; FVW2-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP2]], i32 1
-; FVW2-NEXT:    [[TMP52:%.*]] = load float, float* [[TMP51]], align 4
-; FVW2-NEXT:    [[TMP53:%.*]] = insertelement <2 x float> [[TMP49]], float [[TMP52]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE12]]
-; FVW2:       pred.load.continue12:
-; FVW2-NEXT:    [[TMP54:%.*]] = phi <2 x float> [ [[TMP49]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP53]], [[PRED_LOAD_IF11]] ]
-; FVW2-NEXT:    [[TMP55:%.*]] = extractelement <2 x i1> [[TMP33]], i64 0
-; FVW2-NEXT:    br i1 [[TMP55]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
-; FVW2:       pred.load.if13:
-; FVW2-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP3]], i32 1
-; FVW2-NEXT:    [[TMP57:%.*]] = load float, float* [[TMP56]], align 4
-; FVW2-NEXT:    [[TMP58:%.*]] = insertelement <2 x float> poison, float [[TMP57]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE14]]
-; FVW2:       pred.load.continue14:
-; FVW2-NEXT:    [[TMP59:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE12]] ], [ [[TMP58]], [[PRED_LOAD_IF13]] ]
-; FVW2-NEXT:    [[TMP60:%.*]] = extractelement <2 x i1> [[TMP33]], i64 1
-; FVW2-NEXT:    br i1 [[TMP60]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
-; FVW2:       pred.load.if15:
-; FVW2-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP4]], i32 1
-; FVW2-NEXT:    [[TMP62:%.*]] = load float, float* [[TMP61]], align 4
-; FVW2-NEXT:    [[TMP63:%.*]] = insertelement <2 x float> [[TMP59]], float [[TMP62]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE16]]
-; FVW2:       pred.load.continue16:
-; FVW2-NEXT:    [[TMP64:%.*]] = phi <2 x float> [ [[TMP59]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP63]], [[PRED_LOAD_IF15]] ]
-; FVW2-NEXT:    [[TMP65:%.*]] = extractelement <2 x i1> [[TMP34]], i64 0
-; FVW2-NEXT:    br i1 [[TMP65]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
-; FVW2:       pred.load.if17:
-; FVW2-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP5]], i32 1
-; FVW2-NEXT:    [[TMP67:%.*]] = load float, float* [[TMP66]], align 4
-; FVW2-NEXT:    [[TMP68:%.*]] = insertelement <2 x float> poison, float [[TMP67]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE18]]
-; FVW2:       pred.load.continue18:
-; FVW2-NEXT:    [[TMP69:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE16]] ], [ [[TMP68]], [[PRED_LOAD_IF17]] ]
-; FVW2-NEXT:    [[TMP70:%.*]] = extractelement <2 x i1> [[TMP34]], i64 1
-; FVW2-NEXT:    br i1 [[TMP70]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
-; FVW2:       pred.load.if19:
-; FVW2-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP6]], i32 1
-; FVW2-NEXT:    [[TMP72:%.*]] = load float, float* [[TMP71]], align 4
-; FVW2-NEXT:    [[TMP73:%.*]] = insertelement <2 x float> [[TMP69]], float [[TMP72]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE20]]
-; FVW2:       pred.load.continue20:
-; FVW2-NEXT:    [[TMP74:%.*]] = phi <2 x float> [ [[TMP69]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP73]], [[PRED_LOAD_IF19]] ]
-; FVW2-NEXT:    [[TMP75:%.*]] = fadd <2 x float> [[TMP44]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP76:%.*]] = fadd <2 x float> [[TMP54]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP77:%.*]] = fadd <2 x float> [[TMP64]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP78:%.*]] = fadd <2 x float> [[TMP74]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP79:%.*]] = extractelement <2 x i1> [[TMP31]], i64 0
-; FVW2-NEXT:    br i1 [[TMP79]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
+; FVW2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4
+; FVW2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
+; FVW2-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
+; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
+; FVW2-NEXT:    [[TMP7:%.*]] = icmp sgt <2 x i32> [[TMP6]], zeroinitializer
+; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
+; FVW2-NEXT:    [[TMP9:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i64 0
+; FVW2-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; FVW2:       pred.store.if:
-; FVW2-NEXT:    [[TMP80:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], i64 [[OFFSET_IDX]], i32 1
-; FVW2-NEXT:    [[TMP81:%.*]] = extractelement <2 x float> [[TMP75]], i64 0
-; FVW2-NEXT:    store float [[TMP81]], float* [[TMP80]], align 4
+; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], i64 [[OFFSET_IDX]], i32 1
+; FVW2-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i64 0
+; FVW2-NEXT:    store float [[TMP12]], float* [[TMP11]], align 4
 ; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; FVW2:       pred.store.continue:
-; FVW2-NEXT:    [[TMP82:%.*]] = extractelement <2 x i1> [[TMP31]], i64 1
-; FVW2-NEXT:    br i1 [[TMP82]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
-; FVW2:       pred.store.if21:
-; FVW2-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP0]], i32 1
-; FVW2-NEXT:    [[TMP84:%.*]] = extractelement <2 x float> [[TMP75]], i64 1
-; FVW2-NEXT:    store float [[TMP84]], float* [[TMP83]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE22]]
-; FVW2:       pred.store.continue22:
-; FVW2-NEXT:    [[TMP85:%.*]] = extractelement <2 x i1> [[TMP32]], i64 0
-; FVW2-NEXT:    br i1 [[TMP85]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
-; FVW2:       pred.store.if23:
-; FVW2-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP1]], i32 1
-; FVW2-NEXT:    [[TMP87:%.*]] = extractelement <2 x float> [[TMP76]], i64 0
-; FVW2-NEXT:    store float [[TMP87]], float* [[TMP86]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE24]]
-; FVW2:       pred.store.continue24:
-; FVW2-NEXT:    [[TMP88:%.*]] = extractelement <2 x i1> [[TMP32]], i64 1
-; FVW2-NEXT:    br i1 [[TMP88]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
-; FVW2:       pred.store.if25:
-; FVW2-NEXT:    [[TMP89:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP2]], i32 1
-; FVW2-NEXT:    [[TMP90:%.*]] = extractelement <2 x float> [[TMP76]], i64 1
-; FVW2-NEXT:    store float [[TMP90]], float* [[TMP89]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE26]]
-; FVW2:       pred.store.continue26:
-; FVW2-NEXT:    [[TMP91:%.*]] = extractelement <2 x i1> [[TMP33]], i64 0
-; FVW2-NEXT:    br i1 [[TMP91]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
-; FVW2:       pred.store.if27:
-; FVW2-NEXT:    [[TMP92:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP3]], i32 1
-; FVW2-NEXT:    [[TMP93:%.*]] = extractelement <2 x float> [[TMP77]], i64 0
-; FVW2-NEXT:    store float [[TMP93]], float* [[TMP92]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE28]]
-; FVW2:       pred.store.continue28:
-; FVW2-NEXT:    [[TMP94:%.*]] = extractelement <2 x i1> [[TMP33]], i64 1
-; FVW2-NEXT:    br i1 [[TMP94]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]]
-; FVW2:       pred.store.if29:
-; FVW2-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP4]], i32 1
-; FVW2-NEXT:    [[TMP96:%.*]] = extractelement <2 x float> [[TMP77]], i64 1
-; FVW2-NEXT:    store float [[TMP96]], float* [[TMP95]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE30]]
-; FVW2:       pred.store.continue30:
-; FVW2-NEXT:    [[TMP97:%.*]] = extractelement <2 x i1> [[TMP34]], i64 0
-; FVW2-NEXT:    br i1 [[TMP97]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]]
-; FVW2:       pred.store.if31:
-; FVW2-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP5]], i32 1
-; FVW2-NEXT:    [[TMP99:%.*]] = extractelement <2 x float> [[TMP78]], i64 0
-; FVW2-NEXT:    store float [[TMP99]], float* [[TMP98]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE32]]
-; FVW2:       pred.store.continue32:
-; FVW2-NEXT:    [[TMP100:%.*]] = extractelement <2 x i1> [[TMP34]], i64 1
-; FVW2-NEXT:    br i1 [[TMP100]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34]]
-; FVW2:       pred.store.if33:
-; FVW2-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP6]], i32 1
-; FVW2-NEXT:    [[TMP102:%.*]] = extractelement <2 x float> [[TMP78]], i64 1
-; FVW2-NEXT:    store float [[TMP102]], float* [[TMP101]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE34]]
-; FVW2:       pred.store.continue34:
-; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; FVW2-NEXT:    [[TMP103:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; FVW2-NEXT:    br i1 [[TMP103]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i64 1
+; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]]
+; FVW2:       pred.store.if7:
+; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP0]], i32 1
+; FVW2-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i64 1
+; FVW2-NEXT:    store float [[TMP15]], float* [[TMP14]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE8]]
+; FVW2:       pred.store.continue8:
+; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
+; FVW2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT:    br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; FVW2:       for.end:
 ; FVW2-NEXT:    ret void
 ;
@@ -1205,186 +841,40 @@ define void @foo2_addrspace(%struct.In addrspace(1)* noalias %in, float addrspac
 ; FVW2-NEXT:  entry:
 ; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FVW2:       vector.body:
-; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE35:%.*]] ]
-; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4
+; FVW2-NEXT:    [[INDEX10:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ]
+; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ]
+; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX10]], 4
 ; FVW2-NEXT:    [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16
-; FVW2-NEXT:    [[TMP1:%.*]] = or i64 [[OFFSET_IDX]], 32
-; FVW2-NEXT:    [[TMP2:%.*]] = or i64 [[OFFSET_IDX]], 48
-; FVW2-NEXT:    [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 64
-; FVW2-NEXT:    [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 80
-; FVW2-NEXT:    [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 96
-; FVW2-NEXT:    [[TMP6:%.*]] = or i64 [[OFFSET_IDX]], 112
-; FVW2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]]
-; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
-; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
-; FVW2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
-; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
-; FVW2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]]
-; FVW2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]]
-; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP6]]
-; FVW2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4
-; FVW2-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4
-; FVW2-NEXT:    [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP15]], i64 0
-; FVW2-NEXT:    [[TMP18:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP16]], i64 1
-; FVW2-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP9]], align 4
-; FVW2-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP10]], align 4
-; FVW2-NEXT:    [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
-; FVW2-NEXT:    [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
-; FVW2-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP11]], align 4
-; FVW2-NEXT:    [[TMP24:%.*]] = load i32, i32* [[TMP12]], align 4
-; FVW2-NEXT:    [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP23]], i64 0
-; FVW2-NEXT:    [[TMP26:%.*]] = insertelement <2 x i32> [[TMP25]], i32 [[TMP24]], i64 1
-; FVW2-NEXT:    [[TMP27:%.*]] = load i32, i32* [[TMP13]], align 4
-; FVW2-NEXT:    [[TMP28:%.*]] = load i32, i32* [[TMP14]], align 4
-; FVW2-NEXT:    [[TMP29:%.*]] = insertelement <2 x i32> poison, i32 [[TMP27]], i64 0
-; FVW2-NEXT:    [[TMP30:%.*]] = insertelement <2 x i32> [[TMP29]], i32 [[TMP28]], i64 1
-; FVW2-NEXT:    [[TMP31:%.*]] = icmp sgt <2 x i32> [[TMP18]], zeroinitializer
-; FVW2-NEXT:    [[TMP32:%.*]] = icmp sgt <2 x i32> [[TMP22]], zeroinitializer
-; FVW2-NEXT:    [[TMP33:%.*]] = icmp sgt <2 x i32> [[TMP26]], zeroinitializer
-; FVW2-NEXT:    [[TMP34:%.*]] = icmp sgt <2 x i32> [[TMP30]], zeroinitializer
-; FVW2-NEXT:    [[TMP35:%.*]] = extractelement <2 x i1> [[TMP31]], i64 0
-; FVW2-NEXT:    br i1 [[TMP35]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; FVW2:       pred.load.if:
-; FVW2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], i64 [[OFFSET_IDX]], i32 1
-; FVW2-NEXT:    [[TMP37:%.*]] = load float, float addrspace(1)* [[TMP36]], align 4
-; FVW2-NEXT:    [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP37]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; FVW2:       pred.load.continue:
-; FVW2-NEXT:    [[TMP39:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP38]], [[PRED_LOAD_IF]] ]
-; FVW2-NEXT:    [[TMP40:%.*]] = extractelement <2 x i1> [[TMP31]], i64 1
-; FVW2-NEXT:    br i1 [[TMP40]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]]
-; FVW2:       pred.load.if8:
-; FVW2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP0]], i32 1
-; FVW2-NEXT:    [[TMP42:%.*]] = load float, float addrspace(1)* [[TMP41]], align 4
-; FVW2-NEXT:    [[TMP43:%.*]] = insertelement <2 x float> [[TMP39]], float [[TMP42]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE9]]
-; FVW2:       pred.load.continue9:
-; FVW2-NEXT:    [[TMP44:%.*]] = phi <2 x float> [ [[TMP39]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP43]], [[PRED_LOAD_IF8]] ]
-; FVW2-NEXT:    [[TMP45:%.*]] = extractelement <2 x i1> [[TMP32]], i64 0
-; FVW2-NEXT:    br i1 [[TMP45]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]]
-; FVW2:       pred.load.if10:
-; FVW2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP1]], i32 1
-; FVW2-NEXT:    [[TMP47:%.*]] = load float, float addrspace(1)* [[TMP46]], align 4
-; FVW2-NEXT:    [[TMP48:%.*]] = insertelement <2 x float> poison, float [[TMP47]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE11]]
-; FVW2:       pred.load.continue11:
-; FVW2-NEXT:    [[TMP49:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP48]], [[PRED_LOAD_IF10]] ]
-; FVW2-NEXT:    [[TMP50:%.*]] = extractelement <2 x i1> [[TMP32]], i64 1
-; FVW2-NEXT:    br i1 [[TMP50]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]]
-; FVW2:       pred.load.if12:
-; FVW2-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP2]], i32 1
-; FVW2-NEXT:    [[TMP52:%.*]] = load float, float addrspace(1)* [[TMP51]], align 4
-; FVW2-NEXT:    [[TMP53:%.*]] = insertelement <2 x float> [[TMP49]], float [[TMP52]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE13]]
-; FVW2:       pred.load.continue13:
-; FVW2-NEXT:    [[TMP54:%.*]] = phi <2 x float> [ [[TMP49]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP53]], [[PRED_LOAD_IF12]] ]
-; FVW2-NEXT:    [[TMP55:%.*]] = extractelement <2 x i1> [[TMP33]], i64 0
-; FVW2-NEXT:    br i1 [[TMP55]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]]
-; FVW2:       pred.load.if14:
-; FVW2-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP3]], i32 1
-; FVW2-NEXT:    [[TMP57:%.*]] = load float, float addrspace(1)* [[TMP56]], align 4
-; FVW2-NEXT:    [[TMP58:%.*]] = insertelement <2 x float> poison, float [[TMP57]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE15]]
-; FVW2:       pred.load.continue15:
-; FVW2-NEXT:    [[TMP59:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE13]] ], [ [[TMP58]], [[PRED_LOAD_IF14]] ]
-; FVW2-NEXT:    [[TMP60:%.*]] = extractelement <2 x i1> [[TMP33]], i64 1
-; FVW2-NEXT:    br i1 [[TMP60]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]]
-; FVW2:       pred.load.if16:
-; FVW2-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP4]], i32 1
-; FVW2-NEXT:    [[TMP62:%.*]] = load float, float addrspace(1)* [[TMP61]], align 4
-; FVW2-NEXT:    [[TMP63:%.*]] = insertelement <2 x float> [[TMP59]], float [[TMP62]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE17]]
-; FVW2:       pred.load.continue17:
-; FVW2-NEXT:    [[TMP64:%.*]] = phi <2 x float> [ [[TMP59]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP63]], [[PRED_LOAD_IF16]] ]
-; FVW2-NEXT:    [[TMP65:%.*]] = extractelement <2 x i1> [[TMP34]], i64 0
-; FVW2-NEXT:    br i1 [[TMP65]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]]
-; FVW2:       pred.load.if18:
-; FVW2-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP5]], i32 1
-; FVW2-NEXT:    [[TMP67:%.*]] = load float, float addrspace(1)* [[TMP66]], align 4
-; FVW2-NEXT:    [[TMP68:%.*]] = insertelement <2 x float> poison, float [[TMP67]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE19]]
-; FVW2:       pred.load.continue19:
-; FVW2-NEXT:    [[TMP69:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP68]], [[PRED_LOAD_IF18]] ]
-; FVW2-NEXT:    [[TMP70:%.*]] = extractelement <2 x i1> [[TMP34]], i64 1
-; FVW2-NEXT:    br i1 [[TMP70]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]]
-; FVW2:       pred.load.if20:
-; FVW2-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP6]], i32 1
-; FVW2-NEXT:    [[TMP72:%.*]] = load float, float addrspace(1)* [[TMP71]], align 4
-; FVW2-NEXT:    [[TMP73:%.*]] = insertelement <2 x float> [[TMP69]], float [[TMP72]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE21]]
-; FVW2:       pred.load.continue21:
-; FVW2-NEXT:    [[TMP74:%.*]] = phi <2 x float> [ [[TMP69]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP73]], [[PRED_LOAD_IF20]] ]
-; FVW2-NEXT:    [[TMP75:%.*]] = fadd <2 x float> [[TMP44]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP76:%.*]] = fadd <2 x float> [[TMP54]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP77:%.*]] = fadd <2 x float> [[TMP64]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP78:%.*]] = fadd <2 x float> [[TMP74]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP79:%.*]] = extractelement <2 x i1> [[TMP31]], i64 0
-; FVW2-NEXT:    br i1 [[TMP79]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
+; FVW2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4
+; FVW2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
+; FVW2-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
+; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
+; FVW2-NEXT:    [[TMP7:%.*]] = icmp sgt <2 x i32> [[TMP6]], zeroinitializer
+; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP8]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
+; FVW2-NEXT:    [[TMP9:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i64 0
+; FVW2-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; FVW2:       pred.store.if:
-; FVW2-NEXT:    [[TMP80:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]]
-; FVW2-NEXT:    [[TMP81:%.*]] = extractelement <2 x float> [[TMP75]], i64 0
-; FVW2-NEXT:    store float [[TMP81]], float addrspace(1)* [[TMP80]], align 4
+; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i64 0
+; FVW2-NEXT:    store float [[TMP12]], float addrspace(1)* [[TMP11]], align 4
 ; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; FVW2:       pred.store.continue:
-; FVW2-NEXT:    [[TMP82:%.*]] = extractelement <2 x i1> [[TMP31]], i64 1
-; FVW2-NEXT:    br i1 [[TMP82]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
-; FVW2:       pred.store.if22:
-; FVW2-NEXT:    [[TMP83:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP0]]
-; FVW2-NEXT:    [[TMP84:%.*]] = extractelement <2 x float> [[TMP75]], i64 1
-; FVW2-NEXT:    store float [[TMP84]], float addrspace(1)* [[TMP83]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE23]]
-; FVW2:       pred.store.continue23:
-; FVW2-NEXT:    [[TMP85:%.*]] = extractelement <2 x i1> [[TMP32]], i64 0
-; FVW2-NEXT:    br i1 [[TMP85]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
-; FVW2:       pred.store.if24:
-; FVW2-NEXT:    [[TMP86:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP1]]
-; FVW2-NEXT:    [[TMP87:%.*]] = extractelement <2 x float> [[TMP76]], i64 0
-; FVW2-NEXT:    store float [[TMP87]], float addrspace(1)* [[TMP86]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE25]]
-; FVW2:       pred.store.continue25:
-; FVW2-NEXT:    [[TMP88:%.*]] = extractelement <2 x i1> [[TMP32]], i64 1
-; FVW2-NEXT:    br i1 [[TMP88]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
-; FVW2:       pred.store.if26:
-; FVW2-NEXT:    [[TMP89:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP2]]
-; FVW2-NEXT:    [[TMP90:%.*]] = extractelement <2 x float> [[TMP76]], i64 1
-; FVW2-NEXT:    store float [[TMP90]], float addrspace(1)* [[TMP89]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE27]]
-; FVW2:       pred.store.continue27:
-; FVW2-NEXT:    [[TMP91:%.*]] = extractelement <2 x i1> [[TMP33]], i64 0
-; FVW2-NEXT:    br i1 [[TMP91]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]]
-; FVW2:       pred.store.if28:
-; FVW2-NEXT:    [[TMP92:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP3]]
-; FVW2-NEXT:    [[TMP93:%.*]] = extractelement <2 x float> [[TMP77]], i64 0
-; FVW2-NEXT:    store float [[TMP93]], float addrspace(1)* [[TMP92]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE29]]
-; FVW2:       pred.store.continue29:
-; FVW2-NEXT:    [[TMP94:%.*]] = extractelement <2 x i1> [[TMP33]], i64 1
-; FVW2-NEXT:    br i1 [[TMP94]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]]
-; FVW2:       pred.store.if30:
-; FVW2-NEXT:    [[TMP95:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP4]]
-; FVW2-NEXT:    [[TMP96:%.*]] = extractelement <2 x float> [[TMP77]], i64 1
-; FVW2-NEXT:    store float [[TMP96]], float addrspace(1)* [[TMP95]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE31]]
-; FVW2:       pred.store.continue31:
-; FVW2-NEXT:    [[TMP97:%.*]] = extractelement <2 x i1> [[TMP34]], i64 0
-; FVW2-NEXT:    br i1 [[TMP97]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]]
-; FVW2:       pred.store.if32:
-; FVW2-NEXT:    [[TMP98:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP5]]
-; FVW2-NEXT:    [[TMP99:%.*]] = extractelement <2 x float> [[TMP78]], i64 0
-; FVW2-NEXT:    store float [[TMP99]], float addrspace(1)* [[TMP98]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE33]]
-; FVW2:       pred.store.continue33:
-; FVW2-NEXT:    [[TMP100:%.*]] = extractelement <2 x i1> [[TMP34]], i64 1
-; FVW2-NEXT:    br i1 [[TMP100]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35]]
-; FVW2:       pred.store.if34:
-; FVW2-NEXT:    [[TMP101:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP6]]
-; FVW2-NEXT:    [[TMP102:%.*]] = extractelement <2 x float> [[TMP78]], i64 1
-; FVW2-NEXT:    store float [[TMP102]], float addrspace(1)* [[TMP101]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE35]]
-; FVW2:       pred.store.continue35:
-; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 8
-; FVW2-NEXT:    [[TMP103:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; FVW2-NEXT:    br i1 [[TMP103]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i64 1
+; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]]
+; FVW2:       pred.store.if8:
+; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP0]]
+; FVW2-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i64 1
+; FVW2-NEXT:    store float [[TMP15]], float addrspace(1)* [[TMP14]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE9]]
+; FVW2:       pred.store.continue9:
+; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX10]], 2
+; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
+; FVW2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT:    br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; FVW2:       for.end:
 ; FVW2-NEXT:    ret void
 ;
@@ -1582,186 +1072,40 @@ define void @foo2_addrspace2(%struct.In addrspace(1)* noalias %in, float addrspa
 ; FVW2-NEXT:  entry:
 ; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FVW2:       vector.body:
-; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE35:%.*]] ]
-; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4
+; FVW2-NEXT:    [[INDEX10:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ]
+; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ]
+; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX10]], 4
 ; FVW2-NEXT:    [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16
-; FVW2-NEXT:    [[TMP1:%.*]] = or i64 [[OFFSET_IDX]], 32
-; FVW2-NEXT:    [[TMP2:%.*]] = or i64 [[OFFSET_IDX]], 48
-; FVW2-NEXT:    [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 64
-; FVW2-NEXT:    [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 80
-; FVW2-NEXT:    [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 96
-; FVW2-NEXT:    [[TMP6:%.*]] = or i64 [[OFFSET_IDX]], 112
-; FVW2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]]
-; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
-; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
-; FVW2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
-; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
-; FVW2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]]
-; FVW2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]]
-; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP6]]
-; FVW2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4
-; FVW2-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4
-; FVW2-NEXT:    [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP15]], i64 0
-; FVW2-NEXT:    [[TMP18:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP16]], i64 1
-; FVW2-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP9]], align 4
-; FVW2-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP10]], align 4
-; FVW2-NEXT:    [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
-; FVW2-NEXT:    [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
-; FVW2-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP11]], align 4
-; FVW2-NEXT:    [[TMP24:%.*]] = load i32, i32* [[TMP12]], align 4
-; FVW2-NEXT:    [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP23]], i64 0
-; FVW2-NEXT:    [[TMP26:%.*]] = insertelement <2 x i32> [[TMP25]], i32 [[TMP24]], i64 1
-; FVW2-NEXT:    [[TMP27:%.*]] = load i32, i32* [[TMP13]], align 4
-; FVW2-NEXT:    [[TMP28:%.*]] = load i32, i32* [[TMP14]], align 4
-; FVW2-NEXT:    [[TMP29:%.*]] = insertelement <2 x i32> poison, i32 [[TMP27]], i64 0
-; FVW2-NEXT:    [[TMP30:%.*]] = insertelement <2 x i32> [[TMP29]], i32 [[TMP28]], i64 1
-; FVW2-NEXT:    [[TMP31:%.*]] = icmp sgt <2 x i32> [[TMP18]], zeroinitializer
-; FVW2-NEXT:    [[TMP32:%.*]] = icmp sgt <2 x i32> [[TMP22]], zeroinitializer
-; FVW2-NEXT:    [[TMP33:%.*]] = icmp sgt <2 x i32> [[TMP26]], zeroinitializer
-; FVW2-NEXT:    [[TMP34:%.*]] = icmp sgt <2 x i32> [[TMP30]], zeroinitializer
-; FVW2-NEXT:    [[TMP35:%.*]] = extractelement <2 x i1> [[TMP31]], i64 0
-; FVW2-NEXT:    br i1 [[TMP35]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; FVW2:       pred.load.if:
-; FVW2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], i64 [[OFFSET_IDX]], i32 1
-; FVW2-NEXT:    [[TMP37:%.*]] = load float, float addrspace(1)* [[TMP36]], align 4
-; FVW2-NEXT:    [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP37]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; FVW2:       pred.load.continue:
-; FVW2-NEXT:    [[TMP39:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP38]], [[PRED_LOAD_IF]] ]
-; FVW2-NEXT:    [[TMP40:%.*]] = extractelement <2 x i1> [[TMP31]], i64 1
-; FVW2-NEXT:    br i1 [[TMP40]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]]
-; FVW2:       pred.load.if8:
-; FVW2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP0]], i32 1
-; FVW2-NEXT:    [[TMP42:%.*]] = load float, float addrspace(1)* [[TMP41]], align 4
-; FVW2-NEXT:    [[TMP43:%.*]] = insertelement <2 x float> [[TMP39]], float [[TMP42]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE9]]
-; FVW2:       pred.load.continue9:
-; FVW2-NEXT:    [[TMP44:%.*]] = phi <2 x float> [ [[TMP39]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP43]], [[PRED_LOAD_IF8]] ]
-; FVW2-NEXT:    [[TMP45:%.*]] = extractelement <2 x i1> [[TMP32]], i64 0
-; FVW2-NEXT:    br i1 [[TMP45]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]]
-; FVW2:       pred.load.if10:
-; FVW2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP1]], i32 1
-; FVW2-NEXT:    [[TMP47:%.*]] = load float, float addrspace(1)* [[TMP46]], align 4
-; FVW2-NEXT:    [[TMP48:%.*]] = insertelement <2 x float> poison, float [[TMP47]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE11]]
-; FVW2:       pred.load.continue11:
-; FVW2-NEXT:    [[TMP49:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP48]], [[PRED_LOAD_IF10]] ]
-; FVW2-NEXT:    [[TMP50:%.*]] = extractelement <2 x i1> [[TMP32]], i64 1
-; FVW2-NEXT:    br i1 [[TMP50]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]]
-; FVW2:       pred.load.if12:
-; FVW2-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP2]], i32 1
-; FVW2-NEXT:    [[TMP52:%.*]] = load float, float addrspace(1)* [[TMP51]], align 4
-; FVW2-NEXT:    [[TMP53:%.*]] = insertelement <2 x float> [[TMP49]], float [[TMP52]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE13]]
-; FVW2:       pred.load.continue13:
-; FVW2-NEXT:    [[TMP54:%.*]] = phi <2 x float> [ [[TMP49]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP53]], [[PRED_LOAD_IF12]] ]
-; FVW2-NEXT:    [[TMP55:%.*]] = extractelement <2 x i1> [[TMP33]], i64 0
-; FVW2-NEXT:    br i1 [[TMP55]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]]
-; FVW2:       pred.load.if14:
-; FVW2-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP3]], i32 1
-; FVW2-NEXT:    [[TMP57:%.*]] = load float, float addrspace(1)* [[TMP56]], align 4
-; FVW2-NEXT:    [[TMP58:%.*]] = insertelement <2 x float> poison, float [[TMP57]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE15]]
-; FVW2:       pred.load.continue15:
-; FVW2-NEXT:    [[TMP59:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE13]] ], [ [[TMP58]], [[PRED_LOAD_IF14]] ]
-; FVW2-NEXT:    [[TMP60:%.*]] = extractelement <2 x i1> [[TMP33]], i64 1
-; FVW2-NEXT:    br i1 [[TMP60]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]]
-; FVW2:       pred.load.if16:
-; FVW2-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP4]], i32 1
-; FVW2-NEXT:    [[TMP62:%.*]] = load float, float addrspace(1)* [[TMP61]], align 4
-; FVW2-NEXT:    [[TMP63:%.*]] = insertelement <2 x float> [[TMP59]], float [[TMP62]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE17]]
-; FVW2:       pred.load.continue17:
-; FVW2-NEXT:    [[TMP64:%.*]] = phi <2 x float> [ [[TMP59]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP63]], [[PRED_LOAD_IF16]] ]
-; FVW2-NEXT:    [[TMP65:%.*]] = extractelement <2 x i1> [[TMP34]], i64 0
-; FVW2-NEXT:    br i1 [[TMP65]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]]
-; FVW2:       pred.load.if18:
-; FVW2-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP5]], i32 1
-; FVW2-NEXT:    [[TMP67:%.*]] = load float, float addrspace(1)* [[TMP66]], align 4
-; FVW2-NEXT:    [[TMP68:%.*]] = insertelement <2 x float> poison, float [[TMP67]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE19]]
-; FVW2:       pred.load.continue19:
-; FVW2-NEXT:    [[TMP69:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP68]], [[PRED_LOAD_IF18]] ]
-; FVW2-NEXT:    [[TMP70:%.*]] = extractelement <2 x i1> [[TMP34]], i64 1
-; FVW2-NEXT:    br i1 [[TMP70]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]]
-; FVW2:       pred.load.if20:
-; FVW2-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP6]], i32 1
-; FVW2-NEXT:    [[TMP72:%.*]] = load float, float addrspace(1)* [[TMP71]], align 4
-; FVW2-NEXT:    [[TMP73:%.*]] = insertelement <2 x float> [[TMP69]], float [[TMP72]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE21]]
-; FVW2:       pred.load.continue21:
-; FVW2-NEXT:    [[TMP74:%.*]] = phi <2 x float> [ [[TMP69]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP73]], [[PRED_LOAD_IF20]] ]
-; FVW2-NEXT:    [[TMP75:%.*]] = fadd <2 x float> [[TMP44]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP76:%.*]] = fadd <2 x float> [[TMP54]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP77:%.*]] = fadd <2 x float> [[TMP64]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP78:%.*]] = fadd <2 x float> [[TMP74]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP79:%.*]] = extractelement <2 x i1> [[TMP31]], i64 0
-; FVW2-NEXT:    br i1 [[TMP79]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
+; FVW2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4
+; FVW2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
+; FVW2-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
+; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
+; FVW2-NEXT:    [[TMP7:%.*]] = icmp sgt <2 x i32> [[TMP6]], zeroinitializer
+; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP8]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
+; FVW2-NEXT:    [[TMP9:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i64 0
+; FVW2-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; FVW2:       pred.store.if:
-; FVW2-NEXT:    [[TMP80:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]]
-; FVW2-NEXT:    [[TMP81:%.*]] = extractelement <2 x float> [[TMP75]], i64 0
-; FVW2-NEXT:    store float [[TMP81]], float* [[TMP80]], align 4
+; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i64 0
+; FVW2-NEXT:    store float [[TMP12]], float* [[TMP11]], align 4
 ; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; FVW2:       pred.store.continue:
-; FVW2-NEXT:    [[TMP82:%.*]] = extractelement <2 x i1> [[TMP31]], i64 1
-; FVW2-NEXT:    br i1 [[TMP82]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
-; FVW2:       pred.store.if22:
-; FVW2-NEXT:    [[TMP83:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP0]]
-; FVW2-NEXT:    [[TMP84:%.*]] = extractelement <2 x float> [[TMP75]], i64 1
-; FVW2-NEXT:    store float [[TMP84]], float* [[TMP83]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE23]]
-; FVW2:       pred.store.continue23:
-; FVW2-NEXT:    [[TMP85:%.*]] = extractelement <2 x i1> [[TMP32]], i64 0
-; FVW2-NEXT:    br i1 [[TMP85]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
-; FVW2:       pred.store.if24:
-; FVW2-NEXT:    [[TMP86:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP1]]
-; FVW2-NEXT:    [[TMP87:%.*]] = extractelement <2 x float> [[TMP76]], i64 0
-; FVW2-NEXT:    store float [[TMP87]], float* [[TMP86]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE25]]
-; FVW2:       pred.store.continue25:
-; FVW2-NEXT:    [[TMP88:%.*]] = extractelement <2 x i1> [[TMP32]], i64 1
-; FVW2-NEXT:    br i1 [[TMP88]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
-; FVW2:       pred.store.if26:
-; FVW2-NEXT:    [[TMP89:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP2]]
-; FVW2-NEXT:    [[TMP90:%.*]] = extractelement <2 x float> [[TMP76]], i64 1
-; FVW2-NEXT:    store float [[TMP90]], float* [[TMP89]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE27]]
-; FVW2:       pred.store.continue27:
-; FVW2-NEXT:    [[TMP91:%.*]] = extractelement <2 x i1> [[TMP33]], i64 0
-; FVW2-NEXT:    br i1 [[TMP91]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]]
-; FVW2:       pred.store.if28:
-; FVW2-NEXT:    [[TMP92:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP3]]
-; FVW2-NEXT:    [[TMP93:%.*]] = extractelement <2 x float> [[TMP77]], i64 0
-; FVW2-NEXT:    store float [[TMP93]], float* [[TMP92]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE29]]
-; FVW2:       pred.store.continue29:
-; FVW2-NEXT:    [[TMP94:%.*]] = extractelement <2 x i1> [[TMP33]], i64 1
-; FVW2-NEXT:    br i1 [[TMP94]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]]
-; FVW2:       pred.store.if30:
-; FVW2-NEXT:    [[TMP95:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP4]]
-; FVW2-NEXT:    [[TMP96:%.*]] = extractelement <2 x float> [[TMP77]], i64 1
-; FVW2-NEXT:    store float [[TMP96]], float* [[TMP95]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE31]]
-; FVW2:       pred.store.continue31:
-; FVW2-NEXT:    [[TMP97:%.*]] = extractelement <2 x i1> [[TMP34]], i64 0
-; FVW2-NEXT:    br i1 [[TMP97]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]]
-; FVW2:       pred.store.if32:
-; FVW2-NEXT:    [[TMP98:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP5]]
-; FVW2-NEXT:    [[TMP99:%.*]] = extractelement <2 x float> [[TMP78]], i64 0
-; FVW2-NEXT:    store float [[TMP99]], float* [[TMP98]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE33]]
-; FVW2:       pred.store.continue33:
-; FVW2-NEXT:    [[TMP100:%.*]] = extractelement <2 x i1> [[TMP34]], i64 1
-; FVW2-NEXT:    br i1 [[TMP100]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35]]
-; FVW2:       pred.store.if34:
-; FVW2-NEXT:    [[TMP101:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP6]]
-; FVW2-NEXT:    [[TMP102:%.*]] = extractelement <2 x float> [[TMP78]], i64 1
-; FVW2-NEXT:    store float [[TMP102]], float* [[TMP101]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE35]]
-; FVW2:       pred.store.continue35:
-; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 8
-; FVW2-NEXT:    [[TMP103:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; FVW2-NEXT:    br i1 [[TMP103]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i64 1
+; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]]
+; FVW2:       pred.store.if8:
+; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP0]]
+; FVW2-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i64 1
+; FVW2-NEXT:    store float [[TMP15]], float* [[TMP14]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE9]]
+; FVW2:       pred.store.continue9:
+; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX10]], 2
+; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
+; FVW2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT:    br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; FVW2:       for.end:
 ; FVW2-NEXT:    ret void
 ;
@@ -1959,186 +1303,40 @@ define void @foo2_addrspace3(%struct.In addrspace(0)* noalias %in, float addrspa
 ; FVW2-NEXT:  entry:
 ; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FVW2:       vector.body:
-; FVW2-NEXT:    [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE35:%.*]] ]
-; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4
+; FVW2-NEXT:    [[INDEX10:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ]
+; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ]
+; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX10]], 4
 ; FVW2-NEXT:    [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16
-; FVW2-NEXT:    [[TMP1:%.*]] = or i64 [[OFFSET_IDX]], 32
-; FVW2-NEXT:    [[TMP2:%.*]] = or i64 [[OFFSET_IDX]], 48
-; FVW2-NEXT:    [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 64
-; FVW2-NEXT:    [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 80
-; FVW2-NEXT:    [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 96
-; FVW2-NEXT:    [[TMP6:%.*]] = or i64 [[OFFSET_IDX]], 112
-; FVW2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]]
-; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
-; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
-; FVW2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
-; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
-; FVW2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]]
-; FVW2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]]
-; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP6]]
-; FVW2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4
-; FVW2-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4
-; FVW2-NEXT:    [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP15]], i64 0
-; FVW2-NEXT:    [[TMP18:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP16]], i64 1
-; FVW2-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP9]], align 4
-; FVW2-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP10]], align 4
-; FVW2-NEXT:    [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
-; FVW2-NEXT:    [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
-; FVW2-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP11]], align 4
-; FVW2-NEXT:    [[TMP24:%.*]] = load i32, i32* [[TMP12]], align 4
-; FVW2-NEXT:    [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP23]], i64 0
-; FVW2-NEXT:    [[TMP26:%.*]] = insertelement <2 x i32> [[TMP25]], i32 [[TMP24]], i64 1
-; FVW2-NEXT:    [[TMP27:%.*]] = load i32, i32* [[TMP13]], align 4
-; FVW2-NEXT:    [[TMP28:%.*]] = load i32, i32* [[TMP14]], align 4
-; FVW2-NEXT:    [[TMP29:%.*]] = insertelement <2 x i32> poison, i32 [[TMP27]], i64 0
-; FVW2-NEXT:    [[TMP30:%.*]] = insertelement <2 x i32> [[TMP29]], i32 [[TMP28]], i64 1
-; FVW2-NEXT:    [[TMP31:%.*]] = icmp sgt <2 x i32> [[TMP18]], zeroinitializer
-; FVW2-NEXT:    [[TMP32:%.*]] = icmp sgt <2 x i32> [[TMP22]], zeroinitializer
-; FVW2-NEXT:    [[TMP33:%.*]] = icmp sgt <2 x i32> [[TMP26]], zeroinitializer
-; FVW2-NEXT:    [[TMP34:%.*]] = icmp sgt <2 x i32> [[TMP30]], zeroinitializer
-; FVW2-NEXT:    [[TMP35:%.*]] = extractelement <2 x i1> [[TMP31]], i64 0
-; FVW2-NEXT:    br i1 [[TMP35]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; FVW2:       pred.load.if:
-; FVW2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], i64 [[OFFSET_IDX]], i32 1
-; FVW2-NEXT:    [[TMP37:%.*]] = load float, float* [[TMP36]], align 4
-; FVW2-NEXT:    [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP37]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; FVW2:       pred.load.continue:
-; FVW2-NEXT:    [[TMP39:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP38]], [[PRED_LOAD_IF]] ]
-; FVW2-NEXT:    [[TMP40:%.*]] = extractelement <2 x i1> [[TMP31]], i64 1
-; FVW2-NEXT:    br i1 [[TMP40]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]]
-; FVW2:       pred.load.if8:
-; FVW2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP0]], i32 1
-; FVW2-NEXT:    [[TMP42:%.*]] = load float, float* [[TMP41]], align 4
-; FVW2-NEXT:    [[TMP43:%.*]] = insertelement <2 x float> [[TMP39]], float [[TMP42]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE9]]
-; FVW2:       pred.load.continue9:
-; FVW2-NEXT:    [[TMP44:%.*]] = phi <2 x float> [ [[TMP39]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP43]], [[PRED_LOAD_IF8]] ]
-; FVW2-NEXT:    [[TMP45:%.*]] = extractelement <2 x i1> [[TMP32]], i64 0
-; FVW2-NEXT:    br i1 [[TMP45]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]]
-; FVW2:       pred.load.if10:
-; FVW2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP1]], i32 1
-; FVW2-NEXT:    [[TMP47:%.*]] = load float, float* [[TMP46]], align 4
-; FVW2-NEXT:    [[TMP48:%.*]] = insertelement <2 x float> poison, float [[TMP47]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE11]]
-; FVW2:       pred.load.continue11:
-; FVW2-NEXT:    [[TMP49:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP48]], [[PRED_LOAD_IF10]] ]
-; FVW2-NEXT:    [[TMP50:%.*]] = extractelement <2 x i1> [[TMP32]], i64 1
-; FVW2-NEXT:    br i1 [[TMP50]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]]
-; FVW2:       pred.load.if12:
-; FVW2-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP2]], i32 1
-; FVW2-NEXT:    [[TMP52:%.*]] = load float, float* [[TMP51]], align 4
-; FVW2-NEXT:    [[TMP53:%.*]] = insertelement <2 x float> [[TMP49]], float [[TMP52]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE13]]
-; FVW2:       pred.load.continue13:
-; FVW2-NEXT:    [[TMP54:%.*]] = phi <2 x float> [ [[TMP49]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP53]], [[PRED_LOAD_IF12]] ]
-; FVW2-NEXT:    [[TMP55:%.*]] = extractelement <2 x i1> [[TMP33]], i64 0
-; FVW2-NEXT:    br i1 [[TMP55]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]]
-; FVW2:       pred.load.if14:
-; FVW2-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP3]], i32 1
-; FVW2-NEXT:    [[TMP57:%.*]] = load float, float* [[TMP56]], align 4
-; FVW2-NEXT:    [[TMP58:%.*]] = insertelement <2 x float> poison, float [[TMP57]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE15]]
-; FVW2:       pred.load.continue15:
-; FVW2-NEXT:    [[TMP59:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE13]] ], [ [[TMP58]], [[PRED_LOAD_IF14]] ]
-; FVW2-NEXT:    [[TMP60:%.*]] = extractelement <2 x i1> [[TMP33]], i64 1
-; FVW2-NEXT:    br i1 [[TMP60]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]]
-; FVW2:       pred.load.if16:
-; FVW2-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP4]], i32 1
-; FVW2-NEXT:    [[TMP62:%.*]] = load float, float* [[TMP61]], align 4
-; FVW2-NEXT:    [[TMP63:%.*]] = insertelement <2 x float> [[TMP59]], float [[TMP62]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE17]]
-; FVW2:       pred.load.continue17:
-; FVW2-NEXT:    [[TMP64:%.*]] = phi <2 x float> [ [[TMP59]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP63]], [[PRED_LOAD_IF16]] ]
-; FVW2-NEXT:    [[TMP65:%.*]] = extractelement <2 x i1> [[TMP34]], i64 0
-; FVW2-NEXT:    br i1 [[TMP65]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]]
-; FVW2:       pred.load.if18:
-; FVW2-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP5]], i32 1
-; FVW2-NEXT:    [[TMP67:%.*]] = load float, float* [[TMP66]], align 4
-; FVW2-NEXT:    [[TMP68:%.*]] = insertelement <2 x float> poison, float [[TMP67]], i64 0
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE19]]
-; FVW2:       pred.load.continue19:
-; FVW2-NEXT:    [[TMP69:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP68]], [[PRED_LOAD_IF18]] ]
-; FVW2-NEXT:    [[TMP70:%.*]] = extractelement <2 x i1> [[TMP34]], i64 1
-; FVW2-NEXT:    br i1 [[TMP70]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]]
-; FVW2:       pred.load.if20:
-; FVW2-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP6]], i32 1
-; FVW2-NEXT:    [[TMP72:%.*]] = load float, float* [[TMP71]], align 4
-; FVW2-NEXT:    [[TMP73:%.*]] = insertelement <2 x float> [[TMP69]], float [[TMP72]], i64 1
-; FVW2-NEXT:    br label [[PRED_LOAD_CONTINUE21]]
-; FVW2:       pred.load.continue21:
-; FVW2-NEXT:    [[TMP74:%.*]] = phi <2 x float> [ [[TMP69]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP73]], [[PRED_LOAD_IF20]] ]
-; FVW2-NEXT:    [[TMP75:%.*]] = fadd <2 x float> [[TMP44]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP76:%.*]] = fadd <2 x float> [[TMP54]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP77:%.*]] = fadd <2 x float> [[TMP64]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP78:%.*]] = fadd <2 x float> [[TMP74]], <float 5.000000e-01, float 5.000000e-01>
-; FVW2-NEXT:    [[TMP79:%.*]] = extractelement <2 x i1> [[TMP31]], i64 0
-; FVW2-NEXT:    br i1 [[TMP79]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
+; FVW2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4
+; FVW2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
+; FVW2-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
+; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
+; FVW2-NEXT:    [[TMP7:%.*]] = icmp sgt <2 x i32> [[TMP6]], zeroinitializer
+; FVW2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
+; FVW2-NEXT:    [[TMP9:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i64 0
+; FVW2-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; FVW2:       pred.store.if:
-; FVW2-NEXT:    [[TMP80:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]]
-; FVW2-NEXT:    [[TMP81:%.*]] = extractelement <2 x float> [[TMP75]], i64 0
-; FVW2-NEXT:    store float [[TMP81]], float addrspace(1)* [[TMP80]], align 4
+; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i64 0
+; FVW2-NEXT:    store float [[TMP12]], float addrspace(1)* [[TMP11]], align 4
 ; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; FVW2:       pred.store.continue:
-; FVW2-NEXT:    [[TMP82:%.*]] = extractelement <2 x i1> [[TMP31]], i64 1
-; FVW2-NEXT:    br i1 [[TMP82]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
-; FVW2:       pred.store.if22:
-; FVW2-NEXT:    [[TMP83:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP0]]
-; FVW2-NEXT:    [[TMP84:%.*]] = extractelement <2 x float> [[TMP75]], i64 1
-; FVW2-NEXT:    store float [[TMP84]], float addrspace(1)* [[TMP83]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE23]]
-; FVW2:       pred.store.continue23:
-; FVW2-NEXT:    [[TMP85:%.*]] = extractelement <2 x i1> [[TMP32]], i64 0
-; FVW2-NEXT:    br i1 [[TMP85]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
-; FVW2:       pred.store.if24:
-; FVW2-NEXT:    [[TMP86:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP1]]
-; FVW2-NEXT:    [[TMP87:%.*]] = extractelement <2 x float> [[TMP76]], i64 0
-; FVW2-NEXT:    store float [[TMP87]], float addrspace(1)* [[TMP86]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE25]]
-; FVW2:       pred.store.continue25:
-; FVW2-NEXT:    [[TMP88:%.*]] = extractelement <2 x i1> [[TMP32]], i64 1
-; FVW2-NEXT:    br i1 [[TMP88]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
-; FVW2:       pred.store.if26:
-; FVW2-NEXT:    [[TMP89:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP2]]
-; FVW2-NEXT:    [[TMP90:%.*]] = extractelement <2 x float> [[TMP76]], i64 1
-; FVW2-NEXT:    store float [[TMP90]], float addrspace(1)* [[TMP89]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE27]]
-; FVW2:       pred.store.continue27:
-; FVW2-NEXT:    [[TMP91:%.*]] = extractelement <2 x i1> [[TMP33]], i64 0
-; FVW2-NEXT:    br i1 [[TMP91]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]]
-; FVW2:       pred.store.if28:
-; FVW2-NEXT:    [[TMP92:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP3]]
-; FVW2-NEXT:    [[TMP93:%.*]] = extractelement <2 x float> [[TMP77]], i64 0
-; FVW2-NEXT:    store float [[TMP93]], float addrspace(1)* [[TMP92]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE29]]
-; FVW2:       pred.store.continue29:
-; FVW2-NEXT:    [[TMP94:%.*]] = extractelement <2 x i1> [[TMP33]], i64 1
-; FVW2-NEXT:    br i1 [[TMP94]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]]
-; FVW2:       pred.store.if30:
-; FVW2-NEXT:    [[TMP95:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP4]]
-; FVW2-NEXT:    [[TMP96:%.*]] = extractelement <2 x float> [[TMP77]], i64 1
-; FVW2-NEXT:    store float [[TMP96]], float addrspace(1)* [[TMP95]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE31]]
-; FVW2:       pred.store.continue31:
-; FVW2-NEXT:    [[TMP97:%.*]] = extractelement <2 x i1> [[TMP34]], i64 0
-; FVW2-NEXT:    br i1 [[TMP97]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]]
-; FVW2:       pred.store.if32:
-; FVW2-NEXT:    [[TMP98:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP5]]
-; FVW2-NEXT:    [[TMP99:%.*]] = extractelement <2 x float> [[TMP78]], i64 0
-; FVW2-NEXT:    store float [[TMP99]], float addrspace(1)* [[TMP98]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE33]]
-; FVW2:       pred.store.continue33:
-; FVW2-NEXT:    [[TMP100:%.*]] = extractelement <2 x i1> [[TMP34]], i64 1
-; FVW2-NEXT:    br i1 [[TMP100]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35]]
-; FVW2:       pred.store.if34:
-; FVW2-NEXT:    [[TMP101:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP6]]
-; FVW2-NEXT:    [[TMP102:%.*]] = extractelement <2 x float> [[TMP78]], i64 1
-; FVW2-NEXT:    store float [[TMP102]], float addrspace(1)* [[TMP101]], align 4
-; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE35]]
-; FVW2:       pred.store.continue35:
-; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 8
-; FVW2-NEXT:    [[TMP103:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; FVW2-NEXT:    br i1 [[TMP103]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i64 1
+; FVW2-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]]
+; FVW2:       pred.store.if8:
+; FVW2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP0]]
+; FVW2-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i64 1
+; FVW2-NEXT:    store float [[TMP15]], float addrspace(1)* [[TMP14]], align 4
+; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE9]]
+; FVW2:       pred.store.continue9:
+; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX10]], 2
+; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
+; FVW2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT:    br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; FVW2:       for.end:
 ; FVW2-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
index b09da953cc79..d91cd28c0bb4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -1103,8 +1103,8 @@ define dso_local void @masked_strided2(i8* noalias nocapture readonly %p, i8* no
 ; DISABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; DISABLED_MASKED_STRIDED:       vector.body:
-; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE44:%.*]] ]
-; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE44]] ]
+; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE60:%.*]] ]
+; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE60]] ]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = extractelement <8 x i1> [[TMP0]], i64 0
@@ -1277,115 +1277,148 @@ define dso_local void @masked_strided2(i8* noalias nocapture readonly %p, i8* no
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP103:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[TMP102]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP104:%.*]] = extractelement <8 x i8> [[TMP100]], i64 0
 ; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP104]], i8* [[TMP103]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP105:%.*]] = extractelement <8 x i8> [[TMP100]], i64 0
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP106:%.*]] = sub i8 0, [[TMP105]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP107:%.*]] = extractelement <8 x i32> [[TMP50]], i64 0
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP108:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP107]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP106]], i8* [[TMP108]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP109:%.*]] = extractelement <8 x i1> [[TMP0]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP109]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP105:%.*]] = extractelement <8 x i1> [[TMP0]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP105]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if31:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP110:%.*]] = extractelement <8 x i32> [[TMP1]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP111:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP110]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP112:%.*]] = extractelement <8 x i8> [[TMP100]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP112]], i8* [[TMP111]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP113:%.*]] = extractelement <8 x i8> [[TMP100]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP114:%.*]] = sub i8 0, [[TMP113]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP115:%.*]] = extractelement <8 x i32> [[TMP50]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP116:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP115]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP114]], i8* [[TMP116]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP106:%.*]] = extractelement <8 x i32> [[TMP1]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP107:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP106]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP108:%.*]] = extractelement <8 x i8> [[TMP100]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP108]], i8* [[TMP107]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE32]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue32:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP117:%.*]] = extractelement <8 x i1> [[TMP0]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP117]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP109:%.*]] = extractelement <8 x i1> [[TMP0]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP109]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if33:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP118:%.*]] = extractelement <8 x i32> [[TMP1]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP119:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP118]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP120:%.*]] = extractelement <8 x i8> [[TMP100]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP120]], i8* [[TMP119]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP121:%.*]] = extractelement <8 x i8> [[TMP100]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP122:%.*]] = sub i8 0, [[TMP121]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP123:%.*]] = extractelement <8 x i32> [[TMP50]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP124:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP123]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP122]], i8* [[TMP124]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP110:%.*]] = extractelement <8 x i32> [[TMP1]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP111:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP110]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP112:%.*]] = extractelement <8 x i8> [[TMP100]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP112]], i8* [[TMP111]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE34]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue34:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP125:%.*]] = extractelement <8 x i1> [[TMP0]], i64 3
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP125]], label [[PRED_STORE_IF35:%.*]], label [[PRED_STORE_CONTINUE36:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP113:%.*]] = extractelement <8 x i1> [[TMP0]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP113]], label [[PRED_STORE_IF35:%.*]], label [[PRED_STORE_CONTINUE36:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if35:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP126:%.*]] = extractelement <8 x i32> [[TMP1]], i64 3
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP127:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP126]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP128:%.*]] = extractelement <8 x i8> [[TMP100]], i64 3
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP128]], i8* [[TMP127]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP129:%.*]] = extractelement <8 x i8> [[TMP100]], i64 3
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP130:%.*]] = sub i8 0, [[TMP129]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP131:%.*]] = extractelement <8 x i32> [[TMP50]], i64 3
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP132:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP131]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP130]], i8* [[TMP132]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP114:%.*]] = extractelement <8 x i32> [[TMP1]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP115:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP114]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP116:%.*]] = extractelement <8 x i8> [[TMP100]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP116]], i8* [[TMP115]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE36]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue36:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP133:%.*]] = extractelement <8 x i1> [[TMP0]], i64 4
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP133]], label [[PRED_STORE_IF37:%.*]], label [[PRED_STORE_CONTINUE38:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP117:%.*]] = extractelement <8 x i1> [[TMP0]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP117]], label [[PRED_STORE_IF37:%.*]], label [[PRED_STORE_CONTINUE38:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if37:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP134:%.*]] = extractelement <8 x i32> [[TMP1]], i64 4
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP135:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP134]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP136:%.*]] = extractelement <8 x i8> [[TMP100]], i64 4
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP136]], i8* [[TMP135]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP137:%.*]] = extractelement <8 x i8> [[TMP100]], i64 4
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP138:%.*]] = sub i8 0, [[TMP137]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP139:%.*]] = extractelement <8 x i32> [[TMP50]], i64 4
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP140:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP139]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP138]], i8* [[TMP140]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP118:%.*]] = extractelement <8 x i32> [[TMP1]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP119:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP118]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP120:%.*]] = extractelement <8 x i8> [[TMP100]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP120]], i8* [[TMP119]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE38]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue38:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP141:%.*]] = extractelement <8 x i1> [[TMP0]], i64 5
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP141]], label [[PRED_STORE_IF39:%.*]], label [[PRED_STORE_CONTINUE40:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP121:%.*]] = extractelement <8 x i1> [[TMP0]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP121]], label [[PRED_STORE_IF39:%.*]], label [[PRED_STORE_CONTINUE40:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if39:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP142:%.*]] = extractelement <8 x i32> [[TMP1]], i64 5
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP143:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP142]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP144:%.*]] = extractelement <8 x i8> [[TMP100]], i64 5
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP144]], i8* [[TMP143]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP145:%.*]] = extractelement <8 x i8> [[TMP100]], i64 5
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP146:%.*]] = sub i8 0, [[TMP145]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP147:%.*]] = extractelement <8 x i32> [[TMP50]], i64 5
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP148:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP147]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP146]], i8* [[TMP148]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP122:%.*]] = extractelement <8 x i32> [[TMP1]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP123:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP122]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP124:%.*]] = extractelement <8 x i8> [[TMP100]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP124]], i8* [[TMP123]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE40]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue40:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP149:%.*]] = extractelement <8 x i1> [[TMP0]], i64 6
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP149]], label [[PRED_STORE_IF41:%.*]], label [[PRED_STORE_CONTINUE42:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP125:%.*]] = extractelement <8 x i1> [[TMP0]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP125]], label [[PRED_STORE_IF41:%.*]], label [[PRED_STORE_CONTINUE42:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if41:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP150:%.*]] = extractelement <8 x i32> [[TMP1]], i64 6
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP151:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP150]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP152:%.*]] = extractelement <8 x i8> [[TMP100]], i64 6
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP152]], i8* [[TMP151]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP153:%.*]] = extractelement <8 x i8> [[TMP100]], i64 6
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP154:%.*]] = sub i8 0, [[TMP153]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP155:%.*]] = extractelement <8 x i32> [[TMP50]], i64 6
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP156:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP155]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP154]], i8* [[TMP156]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP126:%.*]] = extractelement <8 x i32> [[TMP1]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP127:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP126]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP128:%.*]] = extractelement <8 x i8> [[TMP100]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP128]], i8* [[TMP127]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE42]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue42:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP157:%.*]] = extractelement <8 x i1> [[TMP0]], i64 7
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP157]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP129:%.*]] = extractelement <8 x i1> [[TMP0]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP129]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if43:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP158:%.*]] = extractelement <8 x i32> [[TMP1]], i64 7
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP159:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP158]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP160:%.*]] = extractelement <8 x i8> [[TMP100]], i64 7
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP160]], i8* [[TMP159]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP161:%.*]] = extractelement <8 x i8> [[TMP100]], i64 7
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP162:%.*]] = sub i8 0, [[TMP161]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP163:%.*]] = extractelement <8 x i32> [[TMP50]], i64 7
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP164:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP163]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP162]], i8* [[TMP164]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP130:%.*]] = extractelement <8 x i32> [[TMP1]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP131:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP130]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP132:%.*]] = extractelement <8 x i8> [[TMP100]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP132]], i8* [[TMP131]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE44]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue44:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP133:%.*]] = sub <8 x i8> zeroinitializer, [[TMP100]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP134:%.*]] = extractelement <8 x i1> [[TMP0]], i64 0
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP134]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if45:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP135:%.*]] = extractelement <8 x i32> [[TMP50]], i64 0
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP136:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP135]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP137:%.*]] = extractelement <8 x i8> [[TMP133]], i64 0
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP137]], i8* [[TMP136]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE46]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue46:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP138:%.*]] = extractelement <8 x i1> [[TMP0]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP138]], label [[PRED_STORE_IF47:%.*]], label [[PRED_STORE_CONTINUE48:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if47:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP139:%.*]] = extractelement <8 x i32> [[TMP50]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP140:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP139]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP141:%.*]] = extractelement <8 x i8> [[TMP133]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP141]], i8* [[TMP140]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE48]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue48:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP142:%.*]] = extractelement <8 x i1> [[TMP0]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP142]], label [[PRED_STORE_IF49:%.*]], label [[PRED_STORE_CONTINUE50:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if49:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP143:%.*]] = extractelement <8 x i32> [[TMP50]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP144:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP143]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP145:%.*]] = extractelement <8 x i8> [[TMP133]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP145]], i8* [[TMP144]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE50]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue50:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP146:%.*]] = extractelement <8 x i1> [[TMP0]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP146]], label [[PRED_STORE_IF51:%.*]], label [[PRED_STORE_CONTINUE52:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if51:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP147:%.*]] = extractelement <8 x i32> [[TMP50]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP148:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP147]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP149:%.*]] = extractelement <8 x i8> [[TMP133]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP149]], i8* [[TMP148]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE52]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue52:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP150:%.*]] = extractelement <8 x i1> [[TMP0]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP150]], label [[PRED_STORE_IF53:%.*]], label [[PRED_STORE_CONTINUE54:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if53:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP151:%.*]] = extractelement <8 x i32> [[TMP50]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP152:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP151]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP153:%.*]] = extractelement <8 x i8> [[TMP133]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP153]], i8* [[TMP152]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE54]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue54:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP154:%.*]] = extractelement <8 x i1> [[TMP0]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP154]], label [[PRED_STORE_IF55:%.*]], label [[PRED_STORE_CONTINUE56:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if55:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP155:%.*]] = extractelement <8 x i32> [[TMP50]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP156:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP155]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP157:%.*]] = extractelement <8 x i8> [[TMP133]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP157]], i8* [[TMP156]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE56]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue56:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP158:%.*]] = extractelement <8 x i1> [[TMP0]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP158]], label [[PRED_STORE_IF57:%.*]], label [[PRED_STORE_CONTINUE58:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if57:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP159:%.*]] = extractelement <8 x i32> [[TMP50]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP160:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP159]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP161:%.*]] = extractelement <8 x i8> [[TMP133]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP161]], i8* [[TMP160]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE58]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue58:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP162:%.*]] = extractelement <8 x i1> [[TMP0]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP162]], label [[PRED_STORE_IF59:%.*]], label [[PRED_STORE_CONTINUE60]]
+; DISABLED_MASKED_STRIDED:       pred.store.if59:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP163:%.*]] = extractelement <8 x i32> [[TMP50]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP164:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP163]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP165:%.*]] = extractelement <8 x i8> [[TMP133]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP165]], i8* [[TMP164]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE60]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue60:
 ; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP165:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP165]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP166:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP166]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; DISABLED_MASKED_STRIDED:       for.end:
 ; DISABLED_MASKED_STRIDED-NEXT:    ret void
 ;
@@ -1490,8 +1523,8 @@ define dso_local void @masked_strided2_reverse(i8* noalias nocapture readonly %p
 ; DISABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; DISABLED_MASKED_STRIDED:       vector.body:
-; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE49:%.*]] ]
-; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 1024, i32 1023, i32 1022, i32 1021, i32 1020, i32 1019, i32 1018, i32 1017>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE49]] ]
+; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE65:%.*]] ]
+; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 1024, i32 1023, i32 1022, i32 1021, i32 1020, i32 1019, i32 1018, i32 1017>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE65]] ]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0
@@ -1664,115 +1697,148 @@ define dso_local void @masked_strided2_reverse(i8* noalias nocapture readonly %p
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP108:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP107]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP109:%.*]] = extractelement <8 x i8> [[TMP105]], i64 0
 ; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP109]], i8* [[TMP108]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP110:%.*]] = extractelement <8 x i8> [[TMP105]], i64 0
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP111:%.*]] = sub i8 0, [[TMP110]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP112:%.*]] = extractelement <8 x i32> [[TMP55]], i64 0
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP113:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP112]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP111]], i8* [[TMP113]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP114:%.*]] = extractelement <8 x i1> [[TMP5]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP114]], label [[PRED_STORE_IF36:%.*]], label [[PRED_STORE_CONTINUE37:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP110:%.*]] = extractelement <8 x i1> [[TMP5]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP110]], label [[PRED_STORE_IF36:%.*]], label [[PRED_STORE_CONTINUE37:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if36:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP115:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP116:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP115]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP117:%.*]] = extractelement <8 x i8> [[TMP105]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP117]], i8* [[TMP116]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP118:%.*]] = extractelement <8 x i8> [[TMP105]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP119:%.*]] = sub i8 0, [[TMP118]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP120:%.*]] = extractelement <8 x i32> [[TMP55]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP121:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP120]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP119]], i8* [[TMP121]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP111:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP112:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP111]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP113:%.*]] = extractelement <8 x i8> [[TMP105]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP113]], i8* [[TMP112]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE37]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue37:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP122:%.*]] = extractelement <8 x i1> [[TMP5]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP122]], label [[PRED_STORE_IF38:%.*]], label [[PRED_STORE_CONTINUE39:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP114:%.*]] = extractelement <8 x i1> [[TMP5]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP114]], label [[PRED_STORE_IF38:%.*]], label [[PRED_STORE_CONTINUE39:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if38:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP123:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP124:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP123]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP125:%.*]] = extractelement <8 x i8> [[TMP105]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP125]], i8* [[TMP124]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP126:%.*]] = extractelement <8 x i8> [[TMP105]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP127:%.*]] = sub i8 0, [[TMP126]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP128:%.*]] = extractelement <8 x i32> [[TMP55]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP129:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP128]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP127]], i8* [[TMP129]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP115:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP116:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP115]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP117:%.*]] = extractelement <8 x i8> [[TMP105]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP117]], i8* [[TMP116]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE39]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue39:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP130:%.*]] = extractelement <8 x i1> [[TMP5]], i64 3
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP130]], label [[PRED_STORE_IF40:%.*]], label [[PRED_STORE_CONTINUE41:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP118:%.*]] = extractelement <8 x i1> [[TMP5]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP118]], label [[PRED_STORE_IF40:%.*]], label [[PRED_STORE_CONTINUE41:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if40:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP131:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP132:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP131]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP133:%.*]] = extractelement <8 x i8> [[TMP105]], i64 3
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP133]], i8* [[TMP132]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP134:%.*]] = extractelement <8 x i8> [[TMP105]], i64 3
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP135:%.*]] = sub i8 0, [[TMP134]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP136:%.*]] = extractelement <8 x i32> [[TMP55]], i64 3
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP137:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP136]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP135]], i8* [[TMP137]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP119:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP120:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP119]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP121:%.*]] = extractelement <8 x i8> [[TMP105]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP121]], i8* [[TMP120]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE41]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue41:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP138:%.*]] = extractelement <8 x i1> [[TMP5]], i64 4
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP138]], label [[PRED_STORE_IF42:%.*]], label [[PRED_STORE_CONTINUE43:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP122:%.*]] = extractelement <8 x i1> [[TMP5]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP122]], label [[PRED_STORE_IF42:%.*]], label [[PRED_STORE_CONTINUE43:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if42:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP139:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP140:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP139]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP141:%.*]] = extractelement <8 x i8> [[TMP105]], i64 4
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP141]], i8* [[TMP140]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP142:%.*]] = extractelement <8 x i8> [[TMP105]], i64 4
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP143:%.*]] = sub i8 0, [[TMP142]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP144:%.*]] = extractelement <8 x i32> [[TMP55]], i64 4
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP145:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP144]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP143]], i8* [[TMP145]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP123:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP124:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP123]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP125:%.*]] = extractelement <8 x i8> [[TMP105]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP125]], i8* [[TMP124]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE43]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue43:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP146:%.*]] = extractelement <8 x i1> [[TMP5]], i64 5
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP146]], label [[PRED_STORE_IF44:%.*]], label [[PRED_STORE_CONTINUE45:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP126:%.*]] = extractelement <8 x i1> [[TMP5]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP126]], label [[PRED_STORE_IF44:%.*]], label [[PRED_STORE_CONTINUE45:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if44:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP147:%.*]] = extractelement <8 x i32> [[TMP6]], i64 5
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP148:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP147]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP149:%.*]] = extractelement <8 x i8> [[TMP105]], i64 5
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP149]], i8* [[TMP148]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP150:%.*]] = extractelement <8 x i8> [[TMP105]], i64 5
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP151:%.*]] = sub i8 0, [[TMP150]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP152:%.*]] = extractelement <8 x i32> [[TMP55]], i64 5
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP153:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP152]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP151]], i8* [[TMP153]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP127:%.*]] = extractelement <8 x i32> [[TMP6]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP128:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP127]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP129:%.*]] = extractelement <8 x i8> [[TMP105]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP129]], i8* [[TMP128]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE45]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue45:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP154:%.*]] = extractelement <8 x i1> [[TMP5]], i64 6
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP154]], label [[PRED_STORE_IF46:%.*]], label [[PRED_STORE_CONTINUE47:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP130:%.*]] = extractelement <8 x i1> [[TMP5]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP130]], label [[PRED_STORE_IF46:%.*]], label [[PRED_STORE_CONTINUE47:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if46:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP155:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP156:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP155]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP157:%.*]] = extractelement <8 x i8> [[TMP105]], i64 6
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP157]], i8* [[TMP156]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP158:%.*]] = extractelement <8 x i8> [[TMP105]], i64 6
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP159:%.*]] = sub i8 0, [[TMP158]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP160:%.*]] = extractelement <8 x i32> [[TMP55]], i64 6
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP161:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP160]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP159]], i8* [[TMP161]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP131:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP132:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP131]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP133:%.*]] = extractelement <8 x i8> [[TMP105]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP133]], i8* [[TMP132]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE47]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue47:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP162:%.*]] = extractelement <8 x i1> [[TMP5]], i64 7
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP162]], label [[PRED_STORE_IF48:%.*]], label [[PRED_STORE_CONTINUE49]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP134:%.*]] = extractelement <8 x i1> [[TMP5]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP134]], label [[PRED_STORE_IF48:%.*]], label [[PRED_STORE_CONTINUE49:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if48:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP163:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP164:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP163]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP165:%.*]] = extractelement <8 x i8> [[TMP105]], i64 7
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP165]], i8* [[TMP164]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP166:%.*]] = extractelement <8 x i8> [[TMP105]], i64 7
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP167:%.*]] = sub i8 0, [[TMP166]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP168:%.*]] = extractelement <8 x i32> [[TMP55]], i64 7
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP169:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP168]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP167]], i8* [[TMP169]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP135:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP136:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP135]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP137:%.*]] = extractelement <8 x i8> [[TMP105]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP137]], i8* [[TMP136]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE49]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue49:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP138:%.*]] = sub <8 x i8> zeroinitializer, [[TMP105]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP139:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP139]], label [[PRED_STORE_IF50:%.*]], label [[PRED_STORE_CONTINUE51:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if50:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP140:%.*]] = extractelement <8 x i32> [[TMP55]], i64 0
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP141:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP140]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP142:%.*]] = extractelement <8 x i8> [[TMP138]], i64 0
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP142]], i8* [[TMP141]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE51]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue51:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP143:%.*]] = extractelement <8 x i1> [[TMP5]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP143]], label [[PRED_STORE_IF52:%.*]], label [[PRED_STORE_CONTINUE53:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if52:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP144:%.*]] = extractelement <8 x i32> [[TMP55]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP145:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP144]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP146:%.*]] = extractelement <8 x i8> [[TMP138]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP146]], i8* [[TMP145]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE53]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue53:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP147:%.*]] = extractelement <8 x i1> [[TMP5]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP147]], label [[PRED_STORE_IF54:%.*]], label [[PRED_STORE_CONTINUE55:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if54:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP148:%.*]] = extractelement <8 x i32> [[TMP55]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP149:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP148]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP150:%.*]] = extractelement <8 x i8> [[TMP138]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP150]], i8* [[TMP149]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE55]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue55:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP151:%.*]] = extractelement <8 x i1> [[TMP5]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP151]], label [[PRED_STORE_IF56:%.*]], label [[PRED_STORE_CONTINUE57:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if56:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP152:%.*]] = extractelement <8 x i32> [[TMP55]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP153:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP152]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP154:%.*]] = extractelement <8 x i8> [[TMP138]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP154]], i8* [[TMP153]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE57]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue57:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP155:%.*]] = extractelement <8 x i1> [[TMP5]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP155]], label [[PRED_STORE_IF58:%.*]], label [[PRED_STORE_CONTINUE59:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if58:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP156:%.*]] = extractelement <8 x i32> [[TMP55]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP157:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP156]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP158:%.*]] = extractelement <8 x i8> [[TMP138]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP158]], i8* [[TMP157]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE59]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue59:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP159:%.*]] = extractelement <8 x i1> [[TMP5]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP159]], label [[PRED_STORE_IF60:%.*]], label [[PRED_STORE_CONTINUE61:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if60:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP160:%.*]] = extractelement <8 x i32> [[TMP55]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP161:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP160]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP162:%.*]] = extractelement <8 x i8> [[TMP138]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP162]], i8* [[TMP161]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE61]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue61:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP163:%.*]] = extractelement <8 x i1> [[TMP5]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP163]], label [[PRED_STORE_IF62:%.*]], label [[PRED_STORE_CONTINUE63:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if62:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP164:%.*]] = extractelement <8 x i32> [[TMP55]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP165:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP164]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP166:%.*]] = extractelement <8 x i8> [[TMP138]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP166]], i8* [[TMP165]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE63]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue63:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP167:%.*]] = extractelement <8 x i1> [[TMP5]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP167]], label [[PRED_STORE_IF64:%.*]], label [[PRED_STORE_CONTINUE65]]
+; DISABLED_MASKED_STRIDED:       pred.store.if64:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP168:%.*]] = extractelement <8 x i32> [[TMP55]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP169:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP168]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP170:%.*]] = extractelement <8 x i8> [[TMP138]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP170]], i8* [[TMP169]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE65]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue65:
 ; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8>
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP170:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP170]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP171:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP171]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; DISABLED_MASKED_STRIDED:       for.body:
 ; DISABLED_MASKED_STRIDED-NEXT:    [[IX_024:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 1024, [[ENTRY:%.*]] ]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[CMP1:%.*]] = icmp ugt i32 [[IX_024]], [[CONV]]
@@ -1780,12 +1846,12 @@ define dso_local void @masked_strided2_reverse(i8* noalias nocapture readonly %p
 ; DISABLED_MASKED_STRIDED:       if.then:
 ; DISABLED_MASKED_STRIDED-NEXT:    [[MUL:%.*]] = shl nuw nsw i32 [[IX_024]], 1
 ; DISABLED_MASKED_STRIDED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[MUL]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP171:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP172:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    [[ADD:%.*]] = or i32 [[MUL]], 1
 ; DISABLED_MASKED_STRIDED-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[ADD]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP172:%.*]] = load i8, i8* [[ARRAYIDX4]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[CMP_I:%.*]] = icmp slt i8 [[TMP171]], [[TMP172]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[SPEC_SELECT_I:%.*]] = select i1 [[CMP_I]], i8 [[TMP172]], i8 [[TMP171]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP173:%.*]] = load i8, i8* [[ARRAYIDX4]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[CMP_I:%.*]] = icmp slt i8 [[TMP172]], [[TMP173]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[SPEC_SELECT_I:%.*]] = select i1 [[CMP_I]], i8 [[TMP173]], i8 [[TMP172]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[MUL]]
 ; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[SPEC_SELECT_I]], i8* [[ARRAYIDX6]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    [[SUB:%.*]] = sub i8 0, [[SPEC_SELECT_I]]
@@ -1815,8 +1881,8 @@ define dso_local void @masked_strided2_reverse(i8* noalias nocapture readonly %p
 ; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; ENABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; ENABLED_MASKED_STRIDED:       vector.body:
-; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE49:%.*]] ]
-; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 1024, i32 1023, i32 1022, i32 1021, i32 1020, i32 1019, i32 1018, i32 1017>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE49]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE65:%.*]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 1024, i32 1023, i32 1022, i32 1021, i32 1020, i32 1019, i32 1018, i32 1017>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE65]] ]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0
@@ -1989,115 +2055,148 @@ define dso_local void @masked_strided2_reverse(i8* noalias nocapture readonly %p
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP108:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP107]]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP109:%.*]] = extractelement <8 x i8> [[TMP105]], i64 0
 ; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP109]], i8* [[TMP108]], align 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP110:%.*]] = extractelement <8 x i8> [[TMP105]], i64 0
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP111:%.*]] = sub i8 0, [[TMP110]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP112:%.*]] = extractelement <8 x i32> [[TMP55]], i64 0
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP113:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP112]]
-; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP111]], i8* [[TMP113]], align 1
 ; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; ENABLED_MASKED_STRIDED:       pred.store.continue:
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP114:%.*]] = extractelement <8 x i1> [[TMP5]], i64 1
-; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP114]], label [[PRED_STORE_IF36:%.*]], label [[PRED_STORE_CONTINUE37:%.*]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP110:%.*]] = extractelement <8 x i1> [[TMP5]], i64 1
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP110]], label [[PRED_STORE_IF36:%.*]], label [[PRED_STORE_CONTINUE37:%.*]]
 ; ENABLED_MASKED_STRIDED:       pred.store.if36:
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP115:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP116:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP115]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP117:%.*]] = extractelement <8 x i8> [[TMP105]], i64 1
-; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP117]], i8* [[TMP116]], align 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP118:%.*]] = extractelement <8 x i8> [[TMP105]], i64 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP119:%.*]] = sub i8 0, [[TMP118]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP120:%.*]] = extractelement <8 x i32> [[TMP55]], i64 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP121:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP120]]
-; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP119]], i8* [[TMP121]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP111:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP112:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP111]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP113:%.*]] = extractelement <8 x i8> [[TMP105]], i64 1
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP113]], i8* [[TMP112]], align 1
 ; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE37]]
 ; ENABLED_MASKED_STRIDED:       pred.store.continue37:
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP122:%.*]] = extractelement <8 x i1> [[TMP5]], i64 2
-; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP122]], label [[PRED_STORE_IF38:%.*]], label [[PRED_STORE_CONTINUE39:%.*]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP114:%.*]] = extractelement <8 x i1> [[TMP5]], i64 2
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP114]], label [[PRED_STORE_IF38:%.*]], label [[PRED_STORE_CONTINUE39:%.*]]
 ; ENABLED_MASKED_STRIDED:       pred.store.if38:
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP123:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP124:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP123]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP125:%.*]] = extractelement <8 x i8> [[TMP105]], i64 2
-; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP125]], i8* [[TMP124]], align 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP126:%.*]] = extractelement <8 x i8> [[TMP105]], i64 2
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP127:%.*]] = sub i8 0, [[TMP126]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP128:%.*]] = extractelement <8 x i32> [[TMP55]], i64 2
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP129:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP128]]
-; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP127]], i8* [[TMP129]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP115:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP116:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP115]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP117:%.*]] = extractelement <8 x i8> [[TMP105]], i64 2
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP117]], i8* [[TMP116]], align 1
 ; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE39]]
 ; ENABLED_MASKED_STRIDED:       pred.store.continue39:
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP130:%.*]] = extractelement <8 x i1> [[TMP5]], i64 3
-; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP130]], label [[PRED_STORE_IF40:%.*]], label [[PRED_STORE_CONTINUE41:%.*]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP118:%.*]] = extractelement <8 x i1> [[TMP5]], i64 3
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP118]], label [[PRED_STORE_IF40:%.*]], label [[PRED_STORE_CONTINUE41:%.*]]
 ; ENABLED_MASKED_STRIDED:       pred.store.if40:
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP131:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP132:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP131]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP133:%.*]] = extractelement <8 x i8> [[TMP105]], i64 3
-; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP133]], i8* [[TMP132]], align 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP134:%.*]] = extractelement <8 x i8> [[TMP105]], i64 3
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP135:%.*]] = sub i8 0, [[TMP134]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP136:%.*]] = extractelement <8 x i32> [[TMP55]], i64 3
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP137:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP136]]
-; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP135]], i8* [[TMP137]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP119:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP120:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP119]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP121:%.*]] = extractelement <8 x i8> [[TMP105]], i64 3
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP121]], i8* [[TMP120]], align 1
 ; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE41]]
 ; ENABLED_MASKED_STRIDED:       pred.store.continue41:
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP138:%.*]] = extractelement <8 x i1> [[TMP5]], i64 4
-; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP138]], label [[PRED_STORE_IF42:%.*]], label [[PRED_STORE_CONTINUE43:%.*]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP122:%.*]] = extractelement <8 x i1> [[TMP5]], i64 4
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP122]], label [[PRED_STORE_IF42:%.*]], label [[PRED_STORE_CONTINUE43:%.*]]
 ; ENABLED_MASKED_STRIDED:       pred.store.if42:
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP139:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP140:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP139]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP141:%.*]] = extractelement <8 x i8> [[TMP105]], i64 4
-; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP141]], i8* [[TMP140]], align 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP142:%.*]] = extractelement <8 x i8> [[TMP105]], i64 4
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP143:%.*]] = sub i8 0, [[TMP142]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP144:%.*]] = extractelement <8 x i32> [[TMP55]], i64 4
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP145:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP144]]
-; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP143]], i8* [[TMP145]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP123:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP124:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP123]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP125:%.*]] = extractelement <8 x i8> [[TMP105]], i64 4
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP125]], i8* [[TMP124]], align 1
 ; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE43]]
 ; ENABLED_MASKED_STRIDED:       pred.store.continue43:
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP146:%.*]] = extractelement <8 x i1> [[TMP5]], i64 5
-; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP146]], label [[PRED_STORE_IF44:%.*]], label [[PRED_STORE_CONTINUE45:%.*]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP126:%.*]] = extractelement <8 x i1> [[TMP5]], i64 5
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP126]], label [[PRED_STORE_IF44:%.*]], label [[PRED_STORE_CONTINUE45:%.*]]
 ; ENABLED_MASKED_STRIDED:       pred.store.if44:
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP147:%.*]] = extractelement <8 x i32> [[TMP6]], i64 5
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP148:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP147]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP149:%.*]] = extractelement <8 x i8> [[TMP105]], i64 5
-; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP149]], i8* [[TMP148]], align 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP150:%.*]] = extractelement <8 x i8> [[TMP105]], i64 5
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP151:%.*]] = sub i8 0, [[TMP150]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP152:%.*]] = extractelement <8 x i32> [[TMP55]], i64 5
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP153:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP152]]
-; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP151]], i8* [[TMP153]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP127:%.*]] = extractelement <8 x i32> [[TMP6]], i64 5
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP128:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP127]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP129:%.*]] = extractelement <8 x i8> [[TMP105]], i64 5
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP129]], i8* [[TMP128]], align 1
 ; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE45]]
 ; ENABLED_MASKED_STRIDED:       pred.store.continue45:
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP154:%.*]] = extractelement <8 x i1> [[TMP5]], i64 6
-; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP154]], label [[PRED_STORE_IF46:%.*]], label [[PRED_STORE_CONTINUE47:%.*]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP130:%.*]] = extractelement <8 x i1> [[TMP5]], i64 6
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP130]], label [[PRED_STORE_IF46:%.*]], label [[PRED_STORE_CONTINUE47:%.*]]
 ; ENABLED_MASKED_STRIDED:       pred.store.if46:
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP155:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP156:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP155]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP157:%.*]] = extractelement <8 x i8> [[TMP105]], i64 6
-; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP157]], i8* [[TMP156]], align 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP158:%.*]] = extractelement <8 x i8> [[TMP105]], i64 6
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP159:%.*]] = sub i8 0, [[TMP158]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP160:%.*]] = extractelement <8 x i32> [[TMP55]], i64 6
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP161:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP160]]
-; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP159]], i8* [[TMP161]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP131:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP132:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP131]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP133:%.*]] = extractelement <8 x i8> [[TMP105]], i64 6
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP133]], i8* [[TMP132]], align 1
 ; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE47]]
 ; ENABLED_MASKED_STRIDED:       pred.store.continue47:
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP162:%.*]] = extractelement <8 x i1> [[TMP5]], i64 7
-; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP162]], label [[PRED_STORE_IF48:%.*]], label [[PRED_STORE_CONTINUE49]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP134:%.*]] = extractelement <8 x i1> [[TMP5]], i64 7
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP134]], label [[PRED_STORE_IF48:%.*]], label [[PRED_STORE_CONTINUE49:%.*]]
 ; ENABLED_MASKED_STRIDED:       pred.store.if48:
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP163:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP164:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP163]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP165:%.*]] = extractelement <8 x i8> [[TMP105]], i64 7
-; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP165]], i8* [[TMP164]], align 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP166:%.*]] = extractelement <8 x i8> [[TMP105]], i64 7
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP167:%.*]] = sub i8 0, [[TMP166]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP168:%.*]] = extractelement <8 x i32> [[TMP55]], i64 7
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP169:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP168]]
-; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP167]], i8* [[TMP169]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP135:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP136:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP135]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP137:%.*]] = extractelement <8 x i8> [[TMP105]], i64 7
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP137]], i8* [[TMP136]], align 1
 ; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE49]]
 ; ENABLED_MASKED_STRIDED:       pred.store.continue49:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP138:%.*]] = sub <8 x i8> zeroinitializer, [[TMP105]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP139:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP139]], label [[PRED_STORE_IF50:%.*]], label [[PRED_STORE_CONTINUE51:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if50:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP140:%.*]] = extractelement <8 x i32> [[TMP55]], i64 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP141:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP140]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP142:%.*]] = extractelement <8 x i8> [[TMP138]], i64 0
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP142]], i8* [[TMP141]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE51]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue51:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP143:%.*]] = extractelement <8 x i1> [[TMP5]], i64 1
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP143]], label [[PRED_STORE_IF52:%.*]], label [[PRED_STORE_CONTINUE53:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if52:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP144:%.*]] = extractelement <8 x i32> [[TMP55]], i64 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP145:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP144]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP146:%.*]] = extractelement <8 x i8> [[TMP138]], i64 1
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP146]], i8* [[TMP145]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE53]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue53:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP147:%.*]] = extractelement <8 x i1> [[TMP5]], i64 2
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP147]], label [[PRED_STORE_IF54:%.*]], label [[PRED_STORE_CONTINUE55:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if54:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP148:%.*]] = extractelement <8 x i32> [[TMP55]], i64 2
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP149:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP148]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP150:%.*]] = extractelement <8 x i8> [[TMP138]], i64 2
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP150]], i8* [[TMP149]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE55]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue55:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP151:%.*]] = extractelement <8 x i1> [[TMP5]], i64 3
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP151]], label [[PRED_STORE_IF56:%.*]], label [[PRED_STORE_CONTINUE57:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if56:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP152:%.*]] = extractelement <8 x i32> [[TMP55]], i64 3
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP153:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP152]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP154:%.*]] = extractelement <8 x i8> [[TMP138]], i64 3
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP154]], i8* [[TMP153]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE57]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue57:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP155:%.*]] = extractelement <8 x i1> [[TMP5]], i64 4
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP155]], label [[PRED_STORE_IF58:%.*]], label [[PRED_STORE_CONTINUE59:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if58:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP156:%.*]] = extractelement <8 x i32> [[TMP55]], i64 4
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP157:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP156]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP158:%.*]] = extractelement <8 x i8> [[TMP138]], i64 4
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP158]], i8* [[TMP157]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE59]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue59:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP159:%.*]] = extractelement <8 x i1> [[TMP5]], i64 5
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP159]], label [[PRED_STORE_IF60:%.*]], label [[PRED_STORE_CONTINUE61:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if60:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP160:%.*]] = extractelement <8 x i32> [[TMP55]], i64 5
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP161:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP160]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP162:%.*]] = extractelement <8 x i8> [[TMP138]], i64 5
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP162]], i8* [[TMP161]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE61]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue61:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP163:%.*]] = extractelement <8 x i1> [[TMP5]], i64 6
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP163]], label [[PRED_STORE_IF62:%.*]], label [[PRED_STORE_CONTINUE63:%.*]]
+; ENABLED_MASKED_STRIDED:       pred.store.if62:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP164:%.*]] = extractelement <8 x i32> [[TMP55]], i64 6
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP165:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP164]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP166:%.*]] = extractelement <8 x i8> [[TMP138]], i64 6
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP166]], i8* [[TMP165]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE63]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue63:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP167:%.*]] = extractelement <8 x i1> [[TMP5]], i64 7
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP167]], label [[PRED_STORE_IF64:%.*]], label [[PRED_STORE_CONTINUE65]]
+; ENABLED_MASKED_STRIDED:       pred.store.if64:
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP168:%.*]] = extractelement <8 x i32> [[TMP55]], i64 7
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP169:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP168]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP170:%.*]] = extractelement <8 x i8> [[TMP138]], i64 7
+; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP170]], i8* [[TMP169]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE65]]
+; ENABLED_MASKED_STRIDED:       pred.store.continue65:
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8, i32 -8>
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP170:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
-; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP170]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP171:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP171]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; ENABLED_MASKED_STRIDED:       for.body:
 ; ENABLED_MASKED_STRIDED-NEXT:    [[IX_024:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 1024, [[ENTRY:%.*]] ]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[CMP1:%.*]] = icmp ugt i32 [[IX_024]], [[CONV]]
@@ -2105,12 +2204,12 @@ define dso_local void @masked_strided2_reverse(i8* noalias nocapture readonly %p
 ; ENABLED_MASKED_STRIDED:       if.then:
 ; ENABLED_MASKED_STRIDED-NEXT:    [[MUL:%.*]] = shl nuw nsw i32 [[IX_024]], 1
 ; ENABLED_MASKED_STRIDED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[MUL]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP171:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP172:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
 ; ENABLED_MASKED_STRIDED-NEXT:    [[ADD:%.*]] = or i32 [[MUL]], 1
 ; ENABLED_MASKED_STRIDED-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[ADD]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP172:%.*]] = load i8, i8* [[ARRAYIDX4]], align 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[CMP_I:%.*]] = icmp slt i8 [[TMP171]], [[TMP172]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[SPEC_SELECT_I:%.*]] = select i1 [[CMP_I]], i8 [[TMP172]], i8 [[TMP171]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP173:%.*]] = load i8, i8* [[ARRAYIDX4]], align 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[CMP_I:%.*]] = icmp slt i8 [[TMP172]], [[TMP173]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[SPEC_SELECT_I:%.*]] = select i1 [[CMP_I]], i8 [[TMP173]], i8 [[TMP172]]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[MUL]]
 ; ENABLED_MASKED_STRIDED-NEXT:    store i8 [[SPEC_SELECT_I]], i8* [[ARRAYIDX6]], align 1
 ; ENABLED_MASKED_STRIDED-NEXT:    [[SUB:%.*]] = sub i8 0, [[SPEC_SELECT_I]]
@@ -2199,8 +2298,8 @@ define dso_local void @masked_strided2_unknown_tc(i8* noalias nocapture readonly
 ; DISABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; DISABLED_MASKED_STRIDED:       vector.body:
-; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE46:%.*]] ]
-; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE46]] ]
+; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE62:%.*]] ]
+; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE62]] ]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = icmp sgt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -2375,115 +2474,148 @@ define dso_local void @masked_strided2_unknown_tc(i8* noalias nocapture readonly
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP105:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[TMP104]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP106:%.*]] = extractelement <8 x i8> [[TMP102]], i64 0
 ; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP106]], i8* [[TMP105]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP107:%.*]] = extractelement <8 x i8> [[TMP102]], i64 0
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP108:%.*]] = sub i8 0, [[TMP107]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP109:%.*]] = extractelement <8 x i32> [[TMP52]], i64 0
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP110:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP109]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP108]], i8* [[TMP110]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP111:%.*]] = extractelement <8 x i1> [[TMP3]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP111]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP107:%.*]] = extractelement <8 x i1> [[TMP3]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP107]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if33:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP112:%.*]] = extractelement <8 x i32> [[TMP2]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP113:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP112]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP114:%.*]] = extractelement <8 x i8> [[TMP102]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP114]], i8* [[TMP113]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP115:%.*]] = extractelement <8 x i8> [[TMP102]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP116:%.*]] = sub i8 0, [[TMP115]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP117:%.*]] = extractelement <8 x i32> [[TMP52]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP118:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP117]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP116]], i8* [[TMP118]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP108:%.*]] = extractelement <8 x i32> [[TMP2]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP109:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP108]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP110:%.*]] = extractelement <8 x i8> [[TMP102]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP110]], i8* [[TMP109]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE34]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue34:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP119:%.*]] = extractelement <8 x i1> [[TMP3]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP119]], label [[PRED_STORE_IF35:%.*]], label [[PRED_STORE_CONTINUE36:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP111:%.*]] = extractelement <8 x i1> [[TMP3]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP111]], label [[PRED_STORE_IF35:%.*]], label [[PRED_STORE_CONTINUE36:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if35:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP120:%.*]] = extractelement <8 x i32> [[TMP2]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP121:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP120]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP122:%.*]] = extractelement <8 x i8> [[TMP102]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP122]], i8* [[TMP121]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP123:%.*]] = extractelement <8 x i8> [[TMP102]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP124:%.*]] = sub i8 0, [[TMP123]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP125:%.*]] = extractelement <8 x i32> [[TMP52]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP126:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP125]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP124]], i8* [[TMP126]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP112:%.*]] = extractelement <8 x i32> [[TMP2]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP113:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP112]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP114:%.*]] = extractelement <8 x i8> [[TMP102]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP114]], i8* [[TMP113]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE36]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue36:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP127:%.*]] = extractelement <8 x i1> [[TMP3]], i64 3
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP127]], label [[PRED_STORE_IF37:%.*]], label [[PRED_STORE_CONTINUE38:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP115:%.*]] = extractelement <8 x i1> [[TMP3]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP115]], label [[PRED_STORE_IF37:%.*]], label [[PRED_STORE_CONTINUE38:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if37:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP128:%.*]] = extractelement <8 x i32> [[TMP2]], i64 3
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP129:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP128]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP130:%.*]] = extractelement <8 x i8> [[TMP102]], i64 3
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP130]], i8* [[TMP129]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP131:%.*]] = extractelement <8 x i8> [[TMP102]], i64 3
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP132:%.*]] = sub i8 0, [[TMP131]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP133:%.*]] = extractelement <8 x i32> [[TMP52]], i64 3
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP134:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP133]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP132]], i8* [[TMP134]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP116:%.*]] = extractelement <8 x i32> [[TMP2]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP117:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP116]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP118:%.*]] = extractelement <8 x i8> [[TMP102]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP118]], i8* [[TMP117]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE38]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue38:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP135:%.*]] = extractelement <8 x i1> [[TMP3]], i64 4
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP135]], label [[PRED_STORE_IF39:%.*]], label [[PRED_STORE_CONTINUE40:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP119:%.*]] = extractelement <8 x i1> [[TMP3]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP119]], label [[PRED_STORE_IF39:%.*]], label [[PRED_STORE_CONTINUE40:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if39:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP136:%.*]] = extractelement <8 x i32> [[TMP2]], i64 4
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP137:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP136]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP138:%.*]] = extractelement <8 x i8> [[TMP102]], i64 4
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP138]], i8* [[TMP137]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP139:%.*]] = extractelement <8 x i8> [[TMP102]], i64 4
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP140:%.*]] = sub i8 0, [[TMP139]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP141:%.*]] = extractelement <8 x i32> [[TMP52]], i64 4
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP142:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP141]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP140]], i8* [[TMP142]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP120:%.*]] = extractelement <8 x i32> [[TMP2]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP121:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP120]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP122:%.*]] = extractelement <8 x i8> [[TMP102]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP122]], i8* [[TMP121]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE40]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue40:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP143:%.*]] = extractelement <8 x i1> [[TMP3]], i64 5
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP143]], label [[PRED_STORE_IF41:%.*]], label [[PRED_STORE_CONTINUE42:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP123:%.*]] = extractelement <8 x i1> [[TMP3]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP123]], label [[PRED_STORE_IF41:%.*]], label [[PRED_STORE_CONTINUE42:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if41:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP144:%.*]] = extractelement <8 x i32> [[TMP2]], i64 5
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP145:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP144]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP146:%.*]] = extractelement <8 x i8> [[TMP102]], i64 5
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP146]], i8* [[TMP145]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP147:%.*]] = extractelement <8 x i8> [[TMP102]], i64 5
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP148:%.*]] = sub i8 0, [[TMP147]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP149:%.*]] = extractelement <8 x i32> [[TMP52]], i64 5
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP150:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP149]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP148]], i8* [[TMP150]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP124:%.*]] = extractelement <8 x i32> [[TMP2]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP125:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP124]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP126:%.*]] = extractelement <8 x i8> [[TMP102]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP126]], i8* [[TMP125]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE42]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue42:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP151:%.*]] = extractelement <8 x i1> [[TMP3]], i64 6
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP151]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP127:%.*]] = extractelement <8 x i1> [[TMP3]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP127]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if43:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP152:%.*]] = extractelement <8 x i32> [[TMP2]], i64 6
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP153:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP152]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP154:%.*]] = extractelement <8 x i8> [[TMP102]], i64 6
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP154]], i8* [[TMP153]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP155:%.*]] = extractelement <8 x i8> [[TMP102]], i64 6
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP156:%.*]] = sub i8 0, [[TMP155]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP157:%.*]] = extractelement <8 x i32> [[TMP52]], i64 6
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP158:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP157]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP156]], i8* [[TMP158]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP128:%.*]] = extractelement <8 x i32> [[TMP2]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP129:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP128]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP130:%.*]] = extractelement <8 x i8> [[TMP102]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP130]], i8* [[TMP129]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE44]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue44:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP159:%.*]] = extractelement <8 x i1> [[TMP3]], i64 7
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP159]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP131:%.*]] = extractelement <8 x i1> [[TMP3]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP131]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if45:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP160:%.*]] = extractelement <8 x i32> [[TMP2]], i64 7
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP161:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP160]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP162:%.*]] = extractelement <8 x i8> [[TMP102]], i64 7
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP162]], i8* [[TMP161]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP163:%.*]] = extractelement <8 x i8> [[TMP102]], i64 7
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP164:%.*]] = sub i8 0, [[TMP163]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP165:%.*]] = extractelement <8 x i32> [[TMP52]], i64 7
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP166:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP165]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP164]], i8* [[TMP166]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP132:%.*]] = extractelement <8 x i32> [[TMP2]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP133:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP132]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP134:%.*]] = extractelement <8 x i8> [[TMP102]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP134]], i8* [[TMP133]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE46]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue46:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP135:%.*]] = sub <8 x i8> zeroinitializer, [[TMP102]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP136:%.*]] = extractelement <8 x i1> [[TMP3]], i64 0
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP136]], label [[PRED_STORE_IF47:%.*]], label [[PRED_STORE_CONTINUE48:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if47:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP137:%.*]] = extractelement <8 x i32> [[TMP52]], i64 0
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP138:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP137]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP139:%.*]] = extractelement <8 x i8> [[TMP135]], i64 0
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP139]], i8* [[TMP138]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE48]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue48:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP140:%.*]] = extractelement <8 x i1> [[TMP3]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP140]], label [[PRED_STORE_IF49:%.*]], label [[PRED_STORE_CONTINUE50:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if49:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP141:%.*]] = extractelement <8 x i32> [[TMP52]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP142:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP141]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP143:%.*]] = extractelement <8 x i8> [[TMP135]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP143]], i8* [[TMP142]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE50]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue50:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP144:%.*]] = extractelement <8 x i1> [[TMP3]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP144]], label [[PRED_STORE_IF51:%.*]], label [[PRED_STORE_CONTINUE52:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if51:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP145:%.*]] = extractelement <8 x i32> [[TMP52]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP146:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP145]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP147:%.*]] = extractelement <8 x i8> [[TMP135]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP147]], i8* [[TMP146]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE52]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue52:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP148:%.*]] = extractelement <8 x i1> [[TMP3]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP148]], label [[PRED_STORE_IF53:%.*]], label [[PRED_STORE_CONTINUE54:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if53:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP149:%.*]] = extractelement <8 x i32> [[TMP52]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP150:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP149]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP151:%.*]] = extractelement <8 x i8> [[TMP135]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP151]], i8* [[TMP150]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE54]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue54:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP152:%.*]] = extractelement <8 x i1> [[TMP3]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP152]], label [[PRED_STORE_IF55:%.*]], label [[PRED_STORE_CONTINUE56:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if55:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP153:%.*]] = extractelement <8 x i32> [[TMP52]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP154:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP153]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP155:%.*]] = extractelement <8 x i8> [[TMP135]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP155]], i8* [[TMP154]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE56]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue56:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP156:%.*]] = extractelement <8 x i1> [[TMP3]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP156]], label [[PRED_STORE_IF57:%.*]], label [[PRED_STORE_CONTINUE58:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if57:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP157:%.*]] = extractelement <8 x i32> [[TMP52]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP158:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP157]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP159:%.*]] = extractelement <8 x i8> [[TMP135]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP159]], i8* [[TMP158]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE58]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue58:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP160:%.*]] = extractelement <8 x i1> [[TMP3]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP160]], label [[PRED_STORE_IF59:%.*]], label [[PRED_STORE_CONTINUE60:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if59:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP161:%.*]] = extractelement <8 x i32> [[TMP52]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP162:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP161]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP163:%.*]] = extractelement <8 x i8> [[TMP135]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP163]], i8* [[TMP162]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE60]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue60:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP164:%.*]] = extractelement <8 x i1> [[TMP3]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP164]], label [[PRED_STORE_IF61:%.*]], label [[PRED_STORE_CONTINUE62]]
+; DISABLED_MASKED_STRIDED:       pred.store.if61:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP165:%.*]] = extractelement <8 x i32> [[TMP52]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP166:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP165]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP167:%.*]] = extractelement <8 x i8> [[TMP135]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP167]], i8* [[TMP166]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE62]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue62:
 ; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
 ; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP167:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP167]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP168:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP168]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; DISABLED_MASKED_STRIDED:       for.end:
 ; DISABLED_MASKED_STRIDED-NEXT:    ret void
 ;
@@ -2608,8 +2740,8 @@ define dso_local void @unconditional_masked_strided2_unknown_tc(i8* noalias noca
 ; DISABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; DISABLED_MASKED_STRIDED:       vector.body:
-; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE44:%.*]] ]
-; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE44]] ]
+; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE60:%.*]] ]
+; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE60]] ]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = extractelement <8 x i1> [[TMP0]], i64 0
@@ -2782,115 +2914,148 @@ define dso_local void @unconditional_masked_strided2_unknown_tc(i8* noalias noca
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP103:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[TMP102]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP104:%.*]] = extractelement <8 x i8> [[TMP100]], i64 0
 ; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP104]], i8* [[TMP103]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP105:%.*]] = extractelement <8 x i8> [[TMP100]], i64 0
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP106:%.*]] = sub i8 0, [[TMP105]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP107:%.*]] = extractelement <8 x i32> [[TMP50]], i64 0
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP108:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP107]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP106]], i8* [[TMP108]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP109:%.*]] = extractelement <8 x i1> [[TMP0]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP109]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP105:%.*]] = extractelement <8 x i1> [[TMP0]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP105]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if31:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP110:%.*]] = extractelement <8 x i32> [[TMP1]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP111:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP110]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP112:%.*]] = extractelement <8 x i8> [[TMP100]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP112]], i8* [[TMP111]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP113:%.*]] = extractelement <8 x i8> [[TMP100]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP114:%.*]] = sub i8 0, [[TMP113]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP115:%.*]] = extractelement <8 x i32> [[TMP50]], i64 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP116:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP115]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP114]], i8* [[TMP116]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP106:%.*]] = extractelement <8 x i32> [[TMP1]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP107:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP106]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP108:%.*]] = extractelement <8 x i8> [[TMP100]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP108]], i8* [[TMP107]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE32]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue32:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP117:%.*]] = extractelement <8 x i1> [[TMP0]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP117]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP109:%.*]] = extractelement <8 x i1> [[TMP0]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP109]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if33:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP118:%.*]] = extractelement <8 x i32> [[TMP1]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP119:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP118]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP120:%.*]] = extractelement <8 x i8> [[TMP100]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP120]], i8* [[TMP119]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP121:%.*]] = extractelement <8 x i8> [[TMP100]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP122:%.*]] = sub i8 0, [[TMP121]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP123:%.*]] = extractelement <8 x i32> [[TMP50]], i64 2
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP124:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP123]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP122]], i8* [[TMP124]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP110:%.*]] = extractelement <8 x i32> [[TMP1]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP111:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP110]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP112:%.*]] = extractelement <8 x i8> [[TMP100]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP112]], i8* [[TMP111]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE34]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue34:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP125:%.*]] = extractelement <8 x i1> [[TMP0]], i64 3
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP125]], label [[PRED_STORE_IF35:%.*]], label [[PRED_STORE_CONTINUE36:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP113:%.*]] = extractelement <8 x i1> [[TMP0]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP113]], label [[PRED_STORE_IF35:%.*]], label [[PRED_STORE_CONTINUE36:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if35:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP126:%.*]] = extractelement <8 x i32> [[TMP1]], i64 3
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP127:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP126]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP128:%.*]] = extractelement <8 x i8> [[TMP100]], i64 3
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP128]], i8* [[TMP127]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP129:%.*]] = extractelement <8 x i8> [[TMP100]], i64 3
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP130:%.*]] = sub i8 0, [[TMP129]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP131:%.*]] = extractelement <8 x i32> [[TMP50]], i64 3
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP132:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP131]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP130]], i8* [[TMP132]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP114:%.*]] = extractelement <8 x i32> [[TMP1]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP115:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP114]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP116:%.*]] = extractelement <8 x i8> [[TMP100]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP116]], i8* [[TMP115]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE36]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue36:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP133:%.*]] = extractelement <8 x i1> [[TMP0]], i64 4
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP133]], label [[PRED_STORE_IF37:%.*]], label [[PRED_STORE_CONTINUE38:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP117:%.*]] = extractelement <8 x i1> [[TMP0]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP117]], label [[PRED_STORE_IF37:%.*]], label [[PRED_STORE_CONTINUE38:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if37:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP134:%.*]] = extractelement <8 x i32> [[TMP1]], i64 4
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP135:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP134]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP136:%.*]] = extractelement <8 x i8> [[TMP100]], i64 4
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP136]], i8* [[TMP135]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP137:%.*]] = extractelement <8 x i8> [[TMP100]], i64 4
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP138:%.*]] = sub i8 0, [[TMP137]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP139:%.*]] = extractelement <8 x i32> [[TMP50]], i64 4
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP140:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP139]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP138]], i8* [[TMP140]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP118:%.*]] = extractelement <8 x i32> [[TMP1]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP119:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP118]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP120:%.*]] = extractelement <8 x i8> [[TMP100]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP120]], i8* [[TMP119]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE38]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue38:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP141:%.*]] = extractelement <8 x i1> [[TMP0]], i64 5
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP141]], label [[PRED_STORE_IF39:%.*]], label [[PRED_STORE_CONTINUE40:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP121:%.*]] = extractelement <8 x i1> [[TMP0]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP121]], label [[PRED_STORE_IF39:%.*]], label [[PRED_STORE_CONTINUE40:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if39:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP142:%.*]] = extractelement <8 x i32> [[TMP1]], i64 5
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP143:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP142]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP144:%.*]] = extractelement <8 x i8> [[TMP100]], i64 5
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP144]], i8* [[TMP143]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP145:%.*]] = extractelement <8 x i8> [[TMP100]], i64 5
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP146:%.*]] = sub i8 0, [[TMP145]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP147:%.*]] = extractelement <8 x i32> [[TMP50]], i64 5
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP148:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP147]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP146]], i8* [[TMP148]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP122:%.*]] = extractelement <8 x i32> [[TMP1]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP123:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP122]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP124:%.*]] = extractelement <8 x i8> [[TMP100]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP124]], i8* [[TMP123]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE40]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue40:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP149:%.*]] = extractelement <8 x i1> [[TMP0]], i64 6
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP149]], label [[PRED_STORE_IF41:%.*]], label [[PRED_STORE_CONTINUE42:%.*]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP125:%.*]] = extractelement <8 x i1> [[TMP0]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP125]], label [[PRED_STORE_IF41:%.*]], label [[PRED_STORE_CONTINUE42:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if41:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP150:%.*]] = extractelement <8 x i32> [[TMP1]], i64 6
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP151:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP150]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP152:%.*]] = extractelement <8 x i8> [[TMP100]], i64 6
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP152]], i8* [[TMP151]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP153:%.*]] = extractelement <8 x i8> [[TMP100]], i64 6
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP154:%.*]] = sub i8 0, [[TMP153]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP155:%.*]] = extractelement <8 x i32> [[TMP50]], i64 6
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP156:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP155]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP154]], i8* [[TMP156]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP126:%.*]] = extractelement <8 x i32> [[TMP1]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP127:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP126]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP128:%.*]] = extractelement <8 x i8> [[TMP100]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP128]], i8* [[TMP127]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE42]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue42:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP157:%.*]] = extractelement <8 x i1> [[TMP0]], i64 7
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP157]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP129:%.*]] = extractelement <8 x i1> [[TMP0]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP129]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.store.if43:
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP158:%.*]] = extractelement <8 x i32> [[TMP1]], i64 7
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP159:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP158]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP160:%.*]] = extractelement <8 x i8> [[TMP100]], i64 7
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP160]], i8* [[TMP159]], align 1
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP161:%.*]] = extractelement <8 x i8> [[TMP100]], i64 7
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP162:%.*]] = sub i8 0, [[TMP161]]
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP163:%.*]] = extractelement <8 x i32> [[TMP50]], i64 7
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP164:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP163]]
-; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP162]], i8* [[TMP164]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP130:%.*]] = extractelement <8 x i32> [[TMP1]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP131:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP130]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP132:%.*]] = extractelement <8 x i8> [[TMP100]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP132]], i8* [[TMP131]], align 1
 ; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE44]]
 ; DISABLED_MASKED_STRIDED:       pred.store.continue44:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP133:%.*]] = sub <8 x i8> zeroinitializer, [[TMP100]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP134:%.*]] = extractelement <8 x i1> [[TMP0]], i64 0
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP134]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if45:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP135:%.*]] = extractelement <8 x i32> [[TMP50]], i64 0
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP136:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP135]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP137:%.*]] = extractelement <8 x i8> [[TMP133]], i64 0
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP137]], i8* [[TMP136]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE46]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue46:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP138:%.*]] = extractelement <8 x i1> [[TMP0]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP138]], label [[PRED_STORE_IF47:%.*]], label [[PRED_STORE_CONTINUE48:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if47:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP139:%.*]] = extractelement <8 x i32> [[TMP50]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP140:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP139]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP141:%.*]] = extractelement <8 x i8> [[TMP133]], i64 1
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP141]], i8* [[TMP140]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE48]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue48:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP142:%.*]] = extractelement <8 x i1> [[TMP0]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP142]], label [[PRED_STORE_IF49:%.*]], label [[PRED_STORE_CONTINUE50:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if49:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP143:%.*]] = extractelement <8 x i32> [[TMP50]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP144:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP143]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP145:%.*]] = extractelement <8 x i8> [[TMP133]], i64 2
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP145]], i8* [[TMP144]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE50]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue50:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP146:%.*]] = extractelement <8 x i1> [[TMP0]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP146]], label [[PRED_STORE_IF51:%.*]], label [[PRED_STORE_CONTINUE52:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if51:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP147:%.*]] = extractelement <8 x i32> [[TMP50]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP148:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP147]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP149:%.*]] = extractelement <8 x i8> [[TMP133]], i64 3
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP149]], i8* [[TMP148]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE52]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue52:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP150:%.*]] = extractelement <8 x i1> [[TMP0]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP150]], label [[PRED_STORE_IF53:%.*]], label [[PRED_STORE_CONTINUE54:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if53:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP151:%.*]] = extractelement <8 x i32> [[TMP50]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP152:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP151]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP153:%.*]] = extractelement <8 x i8> [[TMP133]], i64 4
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP153]], i8* [[TMP152]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE54]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue54:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP154:%.*]] = extractelement <8 x i1> [[TMP0]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP154]], label [[PRED_STORE_IF55:%.*]], label [[PRED_STORE_CONTINUE56:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if55:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP155:%.*]] = extractelement <8 x i32> [[TMP50]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP156:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP155]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP157:%.*]] = extractelement <8 x i8> [[TMP133]], i64 5
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP157]], i8* [[TMP156]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE56]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue56:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP158:%.*]] = extractelement <8 x i1> [[TMP0]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP158]], label [[PRED_STORE_IF57:%.*]], label [[PRED_STORE_CONTINUE58:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.store.if57:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP159:%.*]] = extractelement <8 x i32> [[TMP50]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP160:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP159]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP161:%.*]] = extractelement <8 x i8> [[TMP133]], i64 6
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP161]], i8* [[TMP160]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE58]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue58:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP162:%.*]] = extractelement <8 x i1> [[TMP0]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP162]], label [[PRED_STORE_IF59:%.*]], label [[PRED_STORE_CONTINUE60]]
+; DISABLED_MASKED_STRIDED:       pred.store.if59:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP163:%.*]] = extractelement <8 x i32> [[TMP50]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP164:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP163]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP165:%.*]] = extractelement <8 x i8> [[TMP133]], i64 7
+; DISABLED_MASKED_STRIDED-NEXT:    store i8 [[TMP165]], i8* [[TMP164]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_STORE_CONTINUE60]]
+; DISABLED_MASKED_STRIDED:       pred.store.continue60:
 ; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
 ; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP165:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP165]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP166:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP166]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; DISABLED_MASKED_STRIDED:       for.end:
 ; DISABLED_MASKED_STRIDED-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
index 78bcb52c9d5e..65d24b08df99 100644
--- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -verify-loop-info -simplifycfg -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck %s --check-prefix=UNROLL
-; RUN: opt -S -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -verify-loop-info < %s | FileCheck %s --check-prefix=UNROLL-NOSIMPLIFY
-; RUN: opt -S -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -verify-loop-info -simplifycfg -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck %s --check-prefix=VEC
+; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -verify-loop-info -simplifycfg -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck %s --check-prefix=UNROLL
+; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -verify-loop-info < %s | FileCheck %s --check-prefix=UNROLL-NOSIMPLIFY
+; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -verify-loop-info -simplifycfg -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck %s --check-prefix=VEC
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 

diff  --git a/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll b/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll
index 888f09fdb68e..8e007a63ba40 100644
--- a/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll
+++ b/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -vectorize-num-stores-pred=2 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
@@ -67,7 +67,7 @@ define void @maxvf3() {
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -82,7 +82,7 @@ define void @maxvf3() {
 ; CHECK-NEXT:    store i8 7, i8* [[AJP3]], align 8
 ; CHECK-NEXT:    [[J_NEXT]] = add nuw nsw i32 [[J]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[J_NEXT]], 15
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !2
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll
index 65c624de842b..67b8b84e0305 100644
--- a/llvm/test/Transforms/LoopVectorize/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/optsize.ll
@@ -1,122 +1,22 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -enable-new-pm=0 -loop-vectorize -S | FileCheck %s -check-prefixes=DEFAULT,PGSO
-; RUN: opt < %s -enable-new-pm=0 -loop-vectorize -pgso -S | FileCheck %s -check-prefixes=DEFAULT,PGSO
-; RUN: opt < %s -enable-new-pm=0 -loop-vectorize -pgso=false -S | FileCheck %s -check-prefixes=DEFAULT,NPGSO
-; RUN: opt < %s -passes='require<profile-summary>,loop-vectorize' -S | FileCheck %s -check-prefixes=DEFAULT,PGSO
-; RUN: opt < %s -passes='require<profile-summary>,loop-vectorize' -pgso -S | FileCheck %s -check-prefixes=DEFAULT,PGSO
-; RUN: opt < %s -passes='require<profile-summary>,loop-vectorize' -pgso=false -S | FileCheck %s -check-prefixes=DEFAULT,NPGSO
-
-; REQUIRES: asserts
-
 ; This test verifies that the loop vectorizer will NOT produce a tail
 ; loop with the optimize for size or the minimize size attributes.
+; REQUIRES: asserts
+; RUN: opt < %s -enable-new-pm=0 -loop-vectorize -S | FileCheck %s
+; RUN: opt < %s -enable-new-pm=0 -loop-vectorize -pgso -S | FileCheck %s -check-prefix=PGSO
+; RUN: opt < %s -enable-new-pm=0 -loop-vectorize -pgso=false -S | FileCheck %s -check-prefix=NPGSO
+; RUN: opt < %s -passes='require<profile-summary>,loop-vectorize' -S | FileCheck %s
+; RUN: opt < %s -passes='require<profile-summary>,loop-vectorize' -pgso -S | FileCheck %s -check-prefix=PGSO
+; RUN: opt < %s -passes='require<profile-summary>,loop-vectorize' -pgso=false -S | FileCheck %s -check-prefix=NPGSO
 
 target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"
 
 @tab = common global [32 x i8] zeroinitializer, align 1
 
 define i32 @foo_optsize() #0 {
-; DEFAULT-LABEL: @foo_optsize(
-; DEFAULT-NEXT:  entry:
-; DEFAULT-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DEFAULT:       vector.ph:
-; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DEFAULT:       vector.body:
-; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE12:%.*]] ]
-; DEFAULT-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE12]] ]
-; DEFAULT-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; DEFAULT-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 1
-; DEFAULT-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 2
-; DEFAULT-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 3
-; DEFAULT-NEXT:    [[TMP4:%.*]] = icmp ule <4 x i32> [[VEC_IND]], <i32 202, i32 202, i32 202, i32 202>
-; DEFAULT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
-; DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP1]]
-; DEFAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP2]]
-; DEFAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP3]]
-; DEFAULT-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; DEFAULT-NEXT:    br i1 [[TMP9]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; DEFAULT:       pred.load.if:
-; DEFAULT-NEXT:    [[TMP10:%.*]] = load i8, i8* [[TMP5]], align 1
-; DEFAULT-NEXT:    [[TMP11:%.*]] = insertelement <4 x i8> poison, i8 [[TMP10]], i32 0
-; DEFAULT-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; DEFAULT:       pred.load.continue:
-; DEFAULT-NEXT:    [[TMP12:%.*]] = phi <4 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
-; DEFAULT-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
-; DEFAULT-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
-; DEFAULT:       pred.load.if1:
-; DEFAULT-NEXT:    [[TMP14:%.*]] = load i8, i8* [[TMP6]], align 1
-; DEFAULT-NEXT:    [[TMP15:%.*]] = insertelement <4 x i8> [[TMP12]], i8 [[TMP14]], i32 1
-; DEFAULT-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
-; DEFAULT:       pred.load.continue2:
-; DEFAULT-NEXT:    [[TMP16:%.*]] = phi <4 x i8> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP15]], [[PRED_LOAD_IF1]] ]
-; DEFAULT-NEXT:    [[TMP17:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
-; DEFAULT-NEXT:    br i1 [[TMP17]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; DEFAULT:       pred.load.if3:
-; DEFAULT-NEXT:    [[TMP18:%.*]] = load i8, i8* [[TMP7]], align 1
-; DEFAULT-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP18]], i32 2
-; DEFAULT-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
-; DEFAULT:       pred.load.continue4:
-; DEFAULT-NEXT:    [[TMP20:%.*]] = phi <4 x i8> [ [[TMP16]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], [[PRED_LOAD_IF3]] ]
-; DEFAULT-NEXT:    [[TMP21:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
-; DEFAULT-NEXT:    br i1 [[TMP21]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
-; DEFAULT:       pred.load.if5:
-; DEFAULT-NEXT:    [[TMP22:%.*]] = load i8, i8* [[TMP8]], align 1
-; DEFAULT-NEXT:    [[TMP23:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP22]], i32 3
-; DEFAULT-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
-; DEFAULT:       pred.load.continue6:
-; DEFAULT-NEXT:    [[TMP24:%.*]] = phi <4 x i8> [ [[TMP20]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP23]], [[PRED_LOAD_IF5]] ]
-; DEFAULT-NEXT:    [[TMP25:%.*]] = icmp eq <4 x i8> [[TMP24]], zeroinitializer
-; DEFAULT-NEXT:    [[TMP26:%.*]] = select <4 x i1> [[TMP25]], <4 x i8> <i8 2, i8 2, i8 2, i8 2>, <4 x i8> <i8 1, i8 1, i8 1, i8 1>
-; DEFAULT-NEXT:    [[TMP27:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; DEFAULT-NEXT:    br i1 [[TMP27]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; DEFAULT:       pred.store.if:
-; DEFAULT-NEXT:    [[TMP28:%.*]] = extractelement <4 x i8> [[TMP26]], i32 0
-; DEFAULT-NEXT:    store i8 [[TMP28]], i8* [[TMP5]], align 1
-; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; DEFAULT:       pred.store.continue:
-; DEFAULT-NEXT:    [[TMP29:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
-; DEFAULT-NEXT:    br i1 [[TMP29]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
-; DEFAULT:       pred.store.if7:
-; DEFAULT-NEXT:    [[TMP30:%.*]] = extractelement <4 x i8> [[TMP26]], i32 1
-; DEFAULT-NEXT:    store i8 [[TMP30]], i8* [[TMP6]], align 1
-; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE8]]
-; DEFAULT:       pred.store.continue8:
-; DEFAULT-NEXT:    [[TMP31:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
-; DEFAULT-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
-; DEFAULT:       pred.store.if9:
-; DEFAULT-NEXT:    [[TMP32:%.*]] = extractelement <4 x i8> [[TMP26]], i32 2
-; DEFAULT-NEXT:    store i8 [[TMP32]], i8* [[TMP7]], align 1
-; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE10]]
-; DEFAULT:       pred.store.continue10:
-; DEFAULT-NEXT:    [[TMP33:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
-; DEFAULT-NEXT:    br i1 [[TMP33]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]]
-; DEFAULT:       pred.store.if11:
-; DEFAULT-NEXT:    [[TMP34:%.*]] = extractelement <4 x i8> [[TMP26]], i32 3
-; DEFAULT-NEXT:    store i8 [[TMP34]], i8* [[TMP8]], align 1
-; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE12]]
-; DEFAULT:       pred.store.continue12:
-; DEFAULT-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; DEFAULT-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; DEFAULT-NEXT:    [[TMP35:%.*]] = icmp eq i32 [[INDEX_NEXT]], 204
-; DEFAULT-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; DEFAULT:       middle.block:
-; DEFAULT-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; DEFAULT:       scalar.ph:
-; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 204, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; DEFAULT-NEXT:    br label [[FOR_BODY:%.*]]
-; DEFAULT:       for.body:
-; DEFAULT-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; DEFAULT-NEXT:    [[TMP36:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; DEFAULT-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP36]], 0
-; DEFAULT-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; DEFAULT-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; DEFAULT-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
-; DEFAULT-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
-; DEFAULT-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; DEFAULT:       for.end:
-; DEFAULT-NEXT:    ret i32 0
-;
+; CHECK-LABEL: @foo_optsize(
+; CHECK-NOT: <2 x i8>
+; CHECK-NOT: <4 x i8>
+
 entry:
   br label %for.body
 
@@ -138,107 +38,11 @@ for.end:                                          ; preds = %for.body
 attributes #0 = { optsize }
 
 define i32 @foo_minsize() #1 {
-; DEFAULT-LABEL: @foo_minsize(
-; DEFAULT-NEXT:  entry:
-; DEFAULT-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DEFAULT:       vector.ph:
-; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DEFAULT:       vector.body:
-; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE12:%.*]] ]
-; DEFAULT-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE12]] ]
-; DEFAULT-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; DEFAULT-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 1
-; DEFAULT-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 2
-; DEFAULT-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 3
-; DEFAULT-NEXT:    [[TMP4:%.*]] = icmp ule <4 x i32> [[VEC_IND]], <i32 202, i32 202, i32 202, i32 202>
-; DEFAULT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
-; DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP1]]
-; DEFAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP2]]
-; DEFAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP3]]
-; DEFAULT-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; DEFAULT-NEXT:    br i1 [[TMP9]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; DEFAULT:       pred.load.if:
-; DEFAULT-NEXT:    [[TMP10:%.*]] = load i8, i8* [[TMP5]], align 1
-; DEFAULT-NEXT:    [[TMP11:%.*]] = insertelement <4 x i8> poison, i8 [[TMP10]], i32 0
-; DEFAULT-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; DEFAULT:       pred.load.continue:
-; DEFAULT-NEXT:    [[TMP12:%.*]] = phi <4 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
-; DEFAULT-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
-; DEFAULT-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
-; DEFAULT:       pred.load.if1:
-; DEFAULT-NEXT:    [[TMP14:%.*]] = load i8, i8* [[TMP6]], align 1
-; DEFAULT-NEXT:    [[TMP15:%.*]] = insertelement <4 x i8> [[TMP12]], i8 [[TMP14]], i32 1
-; DEFAULT-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
-; DEFAULT:       pred.load.continue2:
-; DEFAULT-NEXT:    [[TMP16:%.*]] = phi <4 x i8> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP15]], [[PRED_LOAD_IF1]] ]
-; DEFAULT-NEXT:    [[TMP17:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
-; DEFAULT-NEXT:    br i1 [[TMP17]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; DEFAULT:       pred.load.if3:
-; DEFAULT-NEXT:    [[TMP18:%.*]] = load i8, i8* [[TMP7]], align 1
-; DEFAULT-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP18]], i32 2
-; DEFAULT-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
-; DEFAULT:       pred.load.continue4:
-; DEFAULT-NEXT:    [[TMP20:%.*]] = phi <4 x i8> [ [[TMP16]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], [[PRED_LOAD_IF3]] ]
-; DEFAULT-NEXT:    [[TMP21:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
-; DEFAULT-NEXT:    br i1 [[TMP21]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
-; DEFAULT:       pred.load.if5:
-; DEFAULT-NEXT:    [[TMP22:%.*]] = load i8, i8* [[TMP8]], align 1
-; DEFAULT-NEXT:    [[TMP23:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP22]], i32 3
-; DEFAULT-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
-; DEFAULT:       pred.load.continue6:
-; DEFAULT-NEXT:    [[TMP24:%.*]] = phi <4 x i8> [ [[TMP20]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP23]], [[PRED_LOAD_IF5]] ]
-; DEFAULT-NEXT:    [[TMP25:%.*]] = icmp eq <4 x i8> [[TMP24]], zeroinitializer
-; DEFAULT-NEXT:    [[TMP26:%.*]] = select <4 x i1> [[TMP25]], <4 x i8> <i8 2, i8 2, i8 2, i8 2>, <4 x i8> <i8 1, i8 1, i8 1, i8 1>
-; DEFAULT-NEXT:    [[TMP27:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; DEFAULT-NEXT:    br i1 [[TMP27]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; DEFAULT:       pred.store.if:
-; DEFAULT-NEXT:    [[TMP28:%.*]] = extractelement <4 x i8> [[TMP26]], i32 0
-; DEFAULT-NEXT:    store i8 [[TMP28]], i8* [[TMP5]], align 1
-; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; DEFAULT:       pred.store.continue:
-; DEFAULT-NEXT:    [[TMP29:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
-; DEFAULT-NEXT:    br i1 [[TMP29]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
-; DEFAULT:       pred.store.if7:
-; DEFAULT-NEXT:    [[TMP30:%.*]] = extractelement <4 x i8> [[TMP26]], i32 1
-; DEFAULT-NEXT:    store i8 [[TMP30]], i8* [[TMP6]], align 1
-; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE8]]
-; DEFAULT:       pred.store.continue8:
-; DEFAULT-NEXT:    [[TMP31:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
-; DEFAULT-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
-; DEFAULT:       pred.store.if9:
-; DEFAULT-NEXT:    [[TMP32:%.*]] = extractelement <4 x i8> [[TMP26]], i32 2
-; DEFAULT-NEXT:    store i8 [[TMP32]], i8* [[TMP7]], align 1
-; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE10]]
-; DEFAULT:       pred.store.continue10:
-; DEFAULT-NEXT:    [[TMP33:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
-; DEFAULT-NEXT:    br i1 [[TMP33]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]]
-; DEFAULT:       pred.store.if11:
-; DEFAULT-NEXT:    [[TMP34:%.*]] = extractelement <4 x i8> [[TMP26]], i32 3
-; DEFAULT-NEXT:    store i8 [[TMP34]], i8* [[TMP8]], align 1
-; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE12]]
-; DEFAULT:       pred.store.continue12:
-; DEFAULT-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; DEFAULT-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; DEFAULT-NEXT:    [[TMP35:%.*]] = icmp eq i32 [[INDEX_NEXT]], 204
-; DEFAULT-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; DEFAULT:       middle.block:
-; DEFAULT-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; DEFAULT:       scalar.ph:
-; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 204, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; DEFAULT-NEXT:    br label [[FOR_BODY:%.*]]
-; DEFAULT:       for.body:
-; DEFAULT-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; DEFAULT-NEXT:    [[TMP36:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; DEFAULT-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP36]], 0
-; DEFAULT-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; DEFAULT-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; DEFAULT-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
-; DEFAULT-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
-; DEFAULT-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; DEFAULT:       for.end:
-; DEFAULT-NEXT:    ret i32 0
-;
+; CHECK-LABEL: @foo_minsize(
+; CHECK-NOT: <2 x i8>
+; CHECK-NOT: <4 x i8>
+; CHECK-LABEL: @foo_pgso(
+
 entry:
   br label %for.body
 
@@ -261,144 +65,10 @@ attributes #1 = { minsize }
 
 define i32 @foo_pgso() !prof !14 {
 ; PGSO-LABEL: @foo_pgso(
-; PGSO-NEXT:  entry:
-; PGSO-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; PGSO:       vector.ph:
-; PGSO-NEXT:    br label [[VECTOR_BODY:%.*]]
-; PGSO:       vector.body:
-; PGSO-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE12:%.*]] ]
-; PGSO-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE12]] ]
-; PGSO-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; PGSO-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 1
-; PGSO-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 2
-; PGSO-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 3
-; PGSO-NEXT:    [[TMP4:%.*]] = icmp ule <4 x i32> [[VEC_IND]], <i32 202, i32 202, i32 202, i32 202>
-; PGSO-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
-; PGSO-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP1]]
-; PGSO-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP2]]
-; PGSO-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP3]]
-; PGSO-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; PGSO-NEXT:    br i1 [[TMP9]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; PGSO:       pred.load.if:
-; PGSO-NEXT:    [[TMP10:%.*]] = load i8, i8* [[TMP5]], align 1
-; PGSO-NEXT:    [[TMP11:%.*]] = insertelement <4 x i8> poison, i8 [[TMP10]], i32 0
-; PGSO-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; PGSO:       pred.load.continue:
-; PGSO-NEXT:    [[TMP12:%.*]] = phi <4 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
-; PGSO-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
-; PGSO-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
-; PGSO:       pred.load.if1:
-; PGSO-NEXT:    [[TMP14:%.*]] = load i8, i8* [[TMP6]], align 1
-; PGSO-NEXT:    [[TMP15:%.*]] = insertelement <4 x i8> [[TMP12]], i8 [[TMP14]], i32 1
-; PGSO-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
-; PGSO:       pred.load.continue2:
-; PGSO-NEXT:    [[TMP16:%.*]] = phi <4 x i8> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP15]], [[PRED_LOAD_IF1]] ]
-; PGSO-NEXT:    [[TMP17:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
-; PGSO-NEXT:    br i1 [[TMP17]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; PGSO:       pred.load.if3:
-; PGSO-NEXT:    [[TMP18:%.*]] = load i8, i8* [[TMP7]], align 1
-; PGSO-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP18]], i32 2
-; PGSO-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
-; PGSO:       pred.load.continue4:
-; PGSO-NEXT:    [[TMP20:%.*]] = phi <4 x i8> [ [[TMP16]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], [[PRED_LOAD_IF3]] ]
-; PGSO-NEXT:    [[TMP21:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
-; PGSO-NEXT:    br i1 [[TMP21]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
-; PGSO:       pred.load.if5:
-; PGSO-NEXT:    [[TMP22:%.*]] = load i8, i8* [[TMP8]], align 1
-; PGSO-NEXT:    [[TMP23:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP22]], i32 3
-; PGSO-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
-; PGSO:       pred.load.continue6:
-; PGSO-NEXT:    [[TMP24:%.*]] = phi <4 x i8> [ [[TMP20]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP23]], [[PRED_LOAD_IF5]] ]
-; PGSO-NEXT:    [[TMP25:%.*]] = icmp eq <4 x i8> [[TMP24]], zeroinitializer
-; PGSO-NEXT:    [[TMP26:%.*]] = select <4 x i1> [[TMP25]], <4 x i8> <i8 2, i8 2, i8 2, i8 2>, <4 x i8> <i8 1, i8 1, i8 1, i8 1>
-; PGSO-NEXT:    [[TMP27:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; PGSO-NEXT:    br i1 [[TMP27]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; PGSO:       pred.store.if:
-; PGSO-NEXT:    [[TMP28:%.*]] = extractelement <4 x i8> [[TMP26]], i32 0
-; PGSO-NEXT:    store i8 [[TMP28]], i8* [[TMP5]], align 1
-; PGSO-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; PGSO:       pred.store.continue:
-; PGSO-NEXT:    [[TMP29:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
-; PGSO-NEXT:    br i1 [[TMP29]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
-; PGSO:       pred.store.if7:
-; PGSO-NEXT:    [[TMP30:%.*]] = extractelement <4 x i8> [[TMP26]], i32 1
-; PGSO-NEXT:    store i8 [[TMP30]], i8* [[TMP6]], align 1
-; PGSO-NEXT:    br label [[PRED_STORE_CONTINUE8]]
-; PGSO:       pred.store.continue8:
-; PGSO-NEXT:    [[TMP31:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
-; PGSO-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
-; PGSO:       pred.store.if9:
-; PGSO-NEXT:    [[TMP32:%.*]] = extractelement <4 x i8> [[TMP26]], i32 2
-; PGSO-NEXT:    store i8 [[TMP32]], i8* [[TMP7]], align 1
-; PGSO-NEXT:    br label [[PRED_STORE_CONTINUE10]]
-; PGSO:       pred.store.continue10:
-; PGSO-NEXT:    [[TMP33:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
-; PGSO-NEXT:    br i1 [[TMP33]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]]
-; PGSO:       pred.store.if11:
-; PGSO-NEXT:    [[TMP34:%.*]] = extractelement <4 x i8> [[TMP26]], i32 3
-; PGSO-NEXT:    store i8 [[TMP34]], i8* [[TMP8]], align 1
-; PGSO-NEXT:    br label [[PRED_STORE_CONTINUE12]]
-; PGSO:       pred.store.continue12:
-; PGSO-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; PGSO-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; PGSO-NEXT:    [[TMP35:%.*]] = icmp eq i32 [[INDEX_NEXT]], 204
-; PGSO-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-; PGSO:       middle.block:
-; PGSO-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; PGSO:       scalar.ph:
-; PGSO-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 204, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; PGSO-NEXT:    br label [[FOR_BODY:%.*]]
-; PGSO:       for.body:
-; PGSO-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; PGSO-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; PGSO-NEXT:    [[TMP36:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; PGSO-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP36]], 0
-; PGSO-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; PGSO-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; PGSO-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
-; PGSO-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
-; PGSO-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; PGSO:       for.end:
-; PGSO-NEXT:    ret i32 0
-;
+; PGSO-NOT: <{{[0-9]+}} x i8>
 ; NPGSO-LABEL: @foo_pgso(
-; NPGSO-NEXT:  entry:
-; NPGSO-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NPGSO:       vector.ph:
-; NPGSO-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NPGSO:       vector.body:
-; NPGSO-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NPGSO-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; NPGSO-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
-; NPGSO-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; NPGSO-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
-; NPGSO-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
-; NPGSO-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
-; NPGSO-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i8> <i8 2, i8 2, i8 2, i8 2>, <4 x i8> <i8 1, i8 1, i8 1, i8 1>
-; NPGSO-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
-; NPGSO-NEXT:    store <4 x i8> [[TMP5]], <4 x i8>* [[TMP6]], align 1
-; NPGSO-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; NPGSO-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 200
-; NPGSO-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-; NPGSO:       middle.block:
-; NPGSO-NEXT:    [[CMP_N:%.*]] = icmp eq i32 203, 200
-; NPGSO-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; NPGSO:       scalar.ph:
-; NPGSO-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; NPGSO-NEXT:    br label [[FOR_BODY:%.*]]
-; NPGSO:       for.body:
-; NPGSO-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; NPGSO-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; NPGSO-NEXT:    [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; NPGSO-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP8]], 0
-; NPGSO-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; NPGSO-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; NPGSO-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
-; NPGSO-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
-; NPGSO-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; NPGSO:       for.end:
-; NPGSO-NEXT:    ret i32 0
-;
+; NPGSO: <{{[0-9]+}} x i8>
+
 entry:
   br label %for.body
 
@@ -423,43 +93,19 @@ for.end:                                          ; preds = %for.body
 @cm_array = external global [2592 x i16], align 1
 
 define void @pr43371() optsize {
-; DEFAULT-LABEL: @pr43371(
-; DEFAULT-NEXT:  entry:
-; DEFAULT-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DEFAULT:       vector.ph:
-; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DEFAULT:       vector.body:
-; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DEFAULT-NEXT:    [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DEFAULT-NEXT:    [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]]
-; DEFAULT-NEXT:    [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
-; DEFAULT-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; DEFAULT-NEXT:    [[TMP3:%.*]] = getelementptr [2592 x i16], [2592 x i16]* @cm_array, i32 0, i32 [[TMP2]]
-; DEFAULT-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-; DEFAULT-NEXT:    [[TMP5:%.*]] = getelementptr [2592 x i16], [2592 x i16]* @cm_array, i32 0, i32 [[TMP4]]
-; DEFAULT-NEXT:    store i16 0, i16* [[TMP3]], align 1
-; DEFAULT-NEXT:    store i16 0, i16* [[TMP5]], align 1
-; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; DEFAULT-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], <i16 2, i16 2>
-; DEFAULT-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756
-; DEFAULT-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; DEFAULT:       middle.block:
-; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i32 756, 756
-; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP28:%.*]], label [[SCALAR_PH]]
-; DEFAULT:       scalar.ph:
-; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 756, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; DEFAULT-NEXT:    br label [[FOR_BODY29:%.*]]
-; DEFAULT:       for.cond.cleanup28:
-; DEFAULT-NEXT:    unreachable
-; DEFAULT:       for.body29:
-; DEFAULT-NEXT:    [[I24_0170:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC37:%.*]], [[FOR_BODY29]] ]
-; DEFAULT-NEXT:    [[ADD33:%.*]] = add i16 undef, [[I24_0170]]
-; DEFAULT-NEXT:    [[IDXPROM34:%.*]] = zext i16 [[ADD33]] to i32
-; DEFAULT-NEXT:    [[ARRAYIDX35:%.*]] = getelementptr [2592 x i16], [2592 x i16]* @cm_array, i32 0, i32 [[IDXPROM34]]
-; DEFAULT-NEXT:    store i16 0, i16* [[ARRAYIDX35]], align 1
-; DEFAULT-NEXT:    [[INC37]] = add i16 [[I24_0170]], 1
-; DEFAULT-NEXT:    [[CMP26:%.*]] = icmp ult i16 [[INC37]], 756
-; DEFAULT-NEXT:    br i1 [[CMP26]], label [[FOR_BODY29]], label [[FOR_COND_CLEANUP28]], !llvm.loop [[LOOP24:![0-9]+]]
+;
+; CHECK-LABEL: @pr43371
+; CHECK-NOT:   vector.scevcheck
+;
+; We do not want to generate SCEV predicates when optimising for size, because
+; that will lead to extra code generation such as the SCEV overflow runtime
+; checks. Not generating SCEV predicates can still result in vectorisation as
+; the non-consecutive loads/stores can be scalarized:
+;
+; CHECK: vector.body:
+; CHECK: store i16 0, i16* %{{.*}}, align 1
+; CHECK: store i16 0, i16* %{{.*}}, align 1
+; CHECK: br i1 {{.*}}, label %vector.body
 ;
 entry:
   br label %for.body29
@@ -479,81 +125,19 @@ for.body29:
 }
 
 define void @pr43371_pgso() !prof !14 {
-; PGSO-LABEL: @pr43371_pgso(
-; PGSO-NEXT:  entry:
-; PGSO-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; PGSO:       vector.ph:
-; PGSO-NEXT:    br label [[VECTOR_BODY:%.*]]
-; PGSO:       vector.body:
-; PGSO-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PGSO-NEXT:    [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PGSO-NEXT:    [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]]
-; PGSO-NEXT:    [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32>
-; PGSO-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; PGSO-NEXT:    [[TMP3:%.*]] = getelementptr [2592 x i16], [2592 x i16]* @cm_array, i32 0, i32 [[TMP2]]
-; PGSO-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-; PGSO-NEXT:    [[TMP5:%.*]] = getelementptr [2592 x i16], [2592 x i16]* @cm_array, i32 0, i32 [[TMP4]]
-; PGSO-NEXT:    store i16 0, i16* [[TMP3]], align 1
-; PGSO-NEXT:    store i16 0, i16* [[TMP5]], align 1
-; PGSO-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; PGSO-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], <i16 2, i16 2>
-; PGSO-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756
-; PGSO-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
-; PGSO:       middle.block:
-; PGSO-NEXT:    [[CMP_N:%.*]] = icmp eq i32 756, 756
-; PGSO-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP28:%.*]], label [[SCALAR_PH]]
-; PGSO:       scalar.ph:
-; PGSO-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 756, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; PGSO-NEXT:    br label [[FOR_BODY29:%.*]]
-; PGSO:       for.cond.cleanup28:
-; PGSO-NEXT:    unreachable
-; PGSO:       for.body29:
-; PGSO-NEXT:    [[I24_0170:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC37:%.*]], [[FOR_BODY29]] ]
-; PGSO-NEXT:    [[ADD33:%.*]] = add i16 undef, [[I24_0170]]
-; PGSO-NEXT:    [[IDXPROM34:%.*]] = zext i16 [[ADD33]] to i32
-; PGSO-NEXT:    [[ARRAYIDX35:%.*]] = getelementptr [2592 x i16], [2592 x i16]* @cm_array, i32 0, i32 [[IDXPROM34]]
-; PGSO-NEXT:    store i16 0, i16* [[ARRAYIDX35]], align 1
-; PGSO-NEXT:    [[INC37]] = add i16 [[I24_0170]], 1
-; PGSO-NEXT:    [[CMP26:%.*]] = icmp ult i16 [[INC37]], 756
-; PGSO-NEXT:    br i1 [[CMP26]], label [[FOR_BODY29]], label [[FOR_COND_CLEANUP28]], !llvm.loop [[LOOP26:![0-9]+]]
 ;
-; NPGSO-LABEL: @pr43371_pgso(
-; NPGSO-NEXT:  entry:
-; NPGSO-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
-; NPGSO:       vector.scevcheck:
-; NPGSO-NEXT:    br i1 undef, label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; NPGSO:       vector.ph:
-; NPGSO-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NPGSO:       vector.body:
-; NPGSO-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NPGSO-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
-; NPGSO-NEXT:    [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
-; NPGSO-NEXT:    [[TMP1:%.*]] = add i16 undef, [[TMP0]]
-; NPGSO-NEXT:    [[TMP2:%.*]] = zext i16 [[TMP1]] to i32
-; NPGSO-NEXT:    [[TMP3:%.*]] = getelementptr [2592 x i16], [2592 x i16]* @cm_array, i32 0, i32 [[TMP2]]
-; NPGSO-NEXT:    [[TMP4:%.*]] = getelementptr i16, i16* [[TMP3]], i32 0
-; NPGSO-NEXT:    [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <2 x i16>*
-; NPGSO-NEXT:    store <2 x i16> zeroinitializer, <2 x i16>* [[TMP5]], align 1
-; NPGSO-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; NPGSO-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756
-; NPGSO-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
-; NPGSO:       middle.block:
-; NPGSO-NEXT:    [[CMP_N:%.*]] = icmp eq i32 756, 756
-; NPGSO-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP28:%.*]], label [[SCALAR_PH]]
-; NPGSO:       scalar.ph:
-; NPGSO-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 756, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
-; NPGSO-NEXT:    br label [[FOR_BODY29:%.*]]
-; NPGSO:       for.cond.cleanup28:
-; NPGSO-NEXT:    unreachable
-; NPGSO:       for.body29:
-; NPGSO-NEXT:    [[I24_0170:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC37:%.*]], [[FOR_BODY29]] ]
-; NPGSO-NEXT:    [[ADD33:%.*]] = add i16 undef, [[I24_0170]]
-; NPGSO-NEXT:    [[IDXPROM34:%.*]] = zext i16 [[ADD33]] to i32
-; NPGSO-NEXT:    [[ARRAYIDX35:%.*]] = getelementptr [2592 x i16], [2592 x i16]* @cm_array, i32 0, i32 [[IDXPROM34]]
-; NPGSO-NEXT:    store i16 0, i16* [[ARRAYIDX35]], align 1
-; NPGSO-NEXT:    [[INC37]] = add i16 [[I24_0170]], 1
-; NPGSO-NEXT:    [[CMP26:%.*]] = icmp ult i16 [[INC37]], 756
-; NPGSO-NEXT:    br i1 [[CMP26]], label [[FOR_BODY29]], label [[FOR_COND_CLEANUP28]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-LABEL: @pr43371_pgso
+; CHECK-NOT:   vector.scevcheck
+;
+; We do not want to generate SCEV predicates when optimising for size, because
+; that will lead to extra code generation such as the SCEV overflow runtime
+; checks. Not generating SCEV predicates can still result in vectorisation as
+; the non-consecutive loads/stores can be scalarized:
+;
+; CHECK: vector.body:
+; CHECK: store i16 0, i16* %{{.*}}, align 1
+; CHECK: store i16 0, i16* %{{.*}}, align 1
+; CHECK: br i1 {{.*}}, label %vector.body
 ;
 entry:
   br label %for.body29
@@ -575,18 +159,21 @@ for.body29:
 ; PR45526: don't vectorize with fold-tail if first-order-recurrence is live-out.
 ;
 define i32 @pr45526() optsize {
-; DEFAULT-LABEL: @pr45526(
-; DEFAULT-NEXT:  entry:
-; DEFAULT-NEXT:    br label [[LOOP:%.*]]
-; DEFAULT:       loop:
-; DEFAULT-NEXT:    [[PIV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[PIVPLUS1:%.*]], [[LOOP]] ]
-; DEFAULT-NEXT:    [[FOR:%.*]] = phi i32 [ 5, [[ENTRY]] ], [ [[PIVPLUS1]], [[LOOP]] ]
-; DEFAULT-NEXT:    [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1
-; DEFAULT-NEXT:    [[COND:%.*]] = icmp ult i32 [[PIV]], 510
-; DEFAULT-NEXT:    br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]]
-; DEFAULT:       exit:
-; DEFAULT-NEXT:    [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ]
-; DEFAULT-NEXT:    ret i32 [[FOR_LCSSA]]
+;
+; CHECK-LABEL: @pr45526
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   br label %loop
+; CHECK-EMPTY:
+; CHECK-NEXT: loop:
+; CHECK-NEXT:   %piv = phi i32 [ 0, %entry ], [ %pivPlus1, %loop ]
+; CHECK-NEXT:   %for = phi i32 [ 5, %entry ], [ %pivPlus1, %loop ]
+; CHECK-NEXT:   %pivPlus1 = add nuw nsw i32 %piv, 1
+; CHECK-NEXT:   %cond = icmp ult i32 %piv, 510
+; CHECK-NEXT:   br i1 %cond, label %loop, label %exit
+; CHECK-EMPTY:
+; CHECK-NEXT: exit:
+; CHECK-NEXT:   %for.lcssa = phi i32 [ %for, %loop ]
+; CHECK-NEXT:   ret i32 %for.lcssa
 ;
 entry:
   br label %loop
@@ -603,56 +190,21 @@ exit:
 }
 
 define i32 @pr45526_pgso() !prof !14 {
-; PGSO-LABEL: @pr45526_pgso(
-; PGSO-NEXT:  entry:
-; PGSO-NEXT:    br label [[LOOP:%.*]]
-; PGSO:       loop:
-; PGSO-NEXT:    [[PIV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[PIVPLUS1:%.*]], [[LOOP]] ]
-; PGSO-NEXT:    [[FOR:%.*]] = phi i32 [ 5, [[ENTRY]] ], [ [[PIVPLUS1]], [[LOOP]] ]
-; PGSO-NEXT:    [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1
-; PGSO-NEXT:    [[COND:%.*]] = icmp ult i32 [[PIV]], 510
-; PGSO-NEXT:    br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]]
-; PGSO:       exit:
-; PGSO-NEXT:    [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ]
-; PGSO-NEXT:    ret i32 [[FOR_LCSSA]]
 ;
-; NPGSO-LABEL: @pr45526_pgso(
-; NPGSO-NEXT:  entry:
-; NPGSO-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NPGSO:       vector.ph:
-; NPGSO-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NPGSO:       vector.body:
-; NPGSO-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NPGSO-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NPGSO-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 5>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; NPGSO-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; NPGSO-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 1
-; NPGSO-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 2
-; NPGSO-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 3
-; NPGSO-NEXT:    [[TMP4]] = add nuw nsw <4 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1>
-; NPGSO-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; NPGSO-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; NPGSO-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; NPGSO-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 508
-; NPGSO-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
-; NPGSO:       middle.block:
-; NPGSO-NEXT:    [[CMP_N:%.*]] = icmp eq i32 511, 508
-; NPGSO-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
-; NPGSO-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
-; NPGSO-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; NPGSO:       scalar.ph:
-; NPGSO-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 5, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
-; NPGSO-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 508, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; NPGSO-NEXT:    br label [[LOOP:%.*]]
-; NPGSO:       loop:
-; NPGSO-NEXT:    [[PIV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[PIVPLUS1:%.*]], [[LOOP]] ]
-; NPGSO-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[PIVPLUS1]], [[LOOP]] ]
-; NPGSO-NEXT:    [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1
-; NPGSO-NEXT:    [[COND:%.*]] = icmp ult i32 [[PIV]], 510
-; NPGSO-NEXT:    br i1 [[COND]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP28:![0-9]+]]
-; NPGSO:       exit:
-; NPGSO-NEXT:    [[FOR_LCSSA:%.*]] = phi i32 [ [[SCALAR_RECUR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
-; NPGSO-NEXT:    ret i32 [[FOR_LCSSA]]
+; CHECK-LABEL: @pr45526_pgso
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   br label %loop
+; CHECK-EMPTY:
+; CHECK-NEXT: loop:
+; CHECK-NEXT:   %piv = phi i32 [ 0, %entry ], [ %pivPlus1, %loop ]
+; CHECK-NEXT:   %for = phi i32 [ 5, %entry ], [ %pivPlus1, %loop ]
+; CHECK-NEXT:   %pivPlus1 = add nuw nsw i32 %piv, 1
+; CHECK-NEXT:   %cond = icmp ult i32 %piv, 510
+; CHECK-NEXT:   br i1 %cond, label %loop, label %exit
+; CHECK-EMPTY:
+; CHECK-NEXT: exit:
+; CHECK-NEXT:   %for.lcssa = phi i32 [ %for, %loop ]
+; CHECK-NEXT:   ret i32 %for.lcssa
 ;
 entry:
   br label %loop
@@ -673,102 +225,52 @@ exit:
 
 ; NOTE: Some assertions have been autogenerated by utils/update_test_checks.py
 define void @stride1(i16* noalias %B, i32 %BStride) optsize {
+; CHECK-LABEL: @stride1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[BSTRIDE:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <2 x i32> [[VEC_IND]], <i32 1024, i32 1024>
+; CHECK-NEXT:    [[TMP0:%.*]] = mul nsw <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[TMP3]]
+; CHECK-NEXT:    store i16 42, i16* [[TMP4]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
+; CHECK-NEXT:    br i1 [[TMP5]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.if1:
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[TMP6]]
+; CHECK-NEXT:    store i16 42, i16* [[TMP7]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.continue2:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !21
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
 ; PGSO-LABEL: @stride1(
 ; PGSO-NEXT:  entry:
-; PGSO-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; PGSO:       vector.ph:
-; PGSO-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[BSTRIDE:%.*]], i32 0
-; PGSO-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; PGSO-NEXT:    br label [[VECTOR_BODY:%.*]]
-; PGSO:       vector.body:
-; PGSO-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
-; PGSO-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
-; PGSO-NEXT:    [[TMP0:%.*]] = icmp ule <2 x i32> [[VEC_IND]], <i32 1024, i32 1024>
-; PGSO-NEXT:    [[TMP1:%.*]] = mul nsw <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PGSO-NEXT:    [[TMP2:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
-; PGSO-NEXT:    br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; PGSO:       pred.store.if:
-; PGSO-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; PGSO-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[TMP3]]
-; PGSO-NEXT:    store i16 42, i16* [[TMP4]], align 4
-; PGSO-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; PGSO:       pred.store.continue:
-; PGSO-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
-; PGSO-NEXT:    br i1 [[TMP5]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
-; PGSO:       pred.store.if1:
-; PGSO-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-; PGSO-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[TMP6]]
-; PGSO-NEXT:    store i16 42, i16* [[TMP7]], align 4
-; PGSO-NEXT:    br label [[PRED_STORE_CONTINUE2]]
-; PGSO:       pred.store.continue2:
-; PGSO-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
-; PGSO-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; PGSO-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026
-; PGSO-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
-; PGSO:       middle.block:
-; PGSO-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; PGSO:       scalar.ph:
-; PGSO-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 1026, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; PGSO-NEXT:    br label [[FOR_BODY:%.*]]
-; PGSO:       for.body:
-; PGSO-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; PGSO-NEXT:    [[MULB:%.*]] = mul nsw i32 [[IV]], [[BSTRIDE]]
-; PGSO-NEXT:    [[GEPOFB:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[MULB]]
-; PGSO-NEXT:    store i16 42, i16* [[GEPOFB]], align 4
-; PGSO-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; PGSO-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025
-; PGSO-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
-; PGSO:       for.end:
-; PGSO-NEXT:    ret void
+; PGSO-NEXT:    br i1 false, label %scalar.ph, label %vector.ph
 ;
 ; NPGSO-LABEL: @stride1(
 ; NPGSO-NEXT:  entry:
-; NPGSO-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NPGSO:       vector.ph:
-; NPGSO-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[BSTRIDE:%.*]], i32 0
-; NPGSO-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; NPGSO-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NPGSO:       vector.body:
-; NPGSO-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
-; NPGSO-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
-; NPGSO-NEXT:    [[TMP0:%.*]] = icmp ule <2 x i32> [[VEC_IND]], <i32 1024, i32 1024>
-; NPGSO-NEXT:    [[TMP1:%.*]] = mul nsw <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; NPGSO-NEXT:    [[TMP2:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
-; NPGSO-NEXT:    br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; NPGSO:       pred.store.if:
-; NPGSO-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; NPGSO-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[TMP3]]
-; NPGSO-NEXT:    store i16 42, i16* [[TMP4]], align 4
-; NPGSO-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; NPGSO:       pred.store.continue:
-; NPGSO-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
-; NPGSO-NEXT:    br i1 [[TMP5]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
-; NPGSO:       pred.store.if1:
-; NPGSO-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-; NPGSO-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[TMP6]]
-; NPGSO-NEXT:    store i16 42, i16* [[TMP7]], align 4
-; NPGSO-NEXT:    br label [[PRED_STORE_CONTINUE2]]
-; NPGSO:       pred.store.continue2:
-; NPGSO-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
-; NPGSO-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; NPGSO-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026
-; NPGSO-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
-; NPGSO:       middle.block:
-; NPGSO-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; NPGSO:       scalar.ph:
-; NPGSO-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 1026, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; NPGSO-NEXT:    br label [[FOR_BODY:%.*]]
-; NPGSO:       for.body:
-; NPGSO-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; NPGSO-NEXT:    [[MULB:%.*]] = mul nsw i32 [[IV]], [[BSTRIDE]]
-; NPGSO-NEXT:    [[GEPOFB:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[MULB]]
-; NPGSO-NEXT:    store i16 42, i16* [[GEPOFB]], align 4
-; NPGSO-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; NPGSO-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025
-; NPGSO-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
-; NPGSO:       for.end:
-; NPGSO-NEXT:    ret void
-;
+; NPGSO-NEXT:    br i1 false, label %scalar.ph, label %vector.ph
+
 entry:
   br label %for.body
 
@@ -788,78 +290,15 @@ for.end:
 ; Vectorize with versioning for unit stride for PGSO and enabled vectorization.
 ;
 define void @stride1_pgso(i16* noalias %B, i32 %BStride) !prof !14 {
+; CHECK-LABEL: @stride1_pgso(
+; CHECK: vector.body
+;
 ; PGSO-LABEL: @stride1_pgso(
-; PGSO-NEXT:  entry:
-; PGSO-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
-; PGSO:       vector.scevcheck:
-; PGSO-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i32 [[BSTRIDE:%.*]], 1
-; PGSO-NEXT:    br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; PGSO:       vector.ph:
-; PGSO-NEXT:    br label [[VECTOR_BODY:%.*]]
-; PGSO:       vector.body:
-; PGSO-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PGSO-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; PGSO-NEXT:    [[TMP1:%.*]] = mul nsw i32 [[TMP0]], [[BSTRIDE]]
-; PGSO-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[TMP1]]
-; PGSO-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[TMP2]], i32 0
-; PGSO-NEXT:    [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <2 x i16>*
-; PGSO-NEXT:    store <2 x i16> <i16 42, i16 42>, <2 x i16>* [[TMP4]], align 4
-; PGSO-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; PGSO-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
-; PGSO-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
-; PGSO:       middle.block:
-; PGSO-NEXT:    [[CMP_N:%.*]] = icmp eq i32 1025, 1024
-; PGSO-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; PGSO:       scalar.ph:
-; PGSO-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
-; PGSO-NEXT:    br label [[FOR_BODY:%.*]]
-; PGSO:       for.body:
-; PGSO-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; PGSO-NEXT:    [[MULB:%.*]] = mul nsw i32 [[IV]], [[BSTRIDE]]
-; PGSO-NEXT:    [[GEPOFB:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[MULB]]
-; PGSO-NEXT:    store i16 42, i16* [[GEPOFB]], align 4
-; PGSO-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; PGSO-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025
-; PGSO-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
-; PGSO:       for.end:
-; PGSO-NEXT:    ret void
+; PGSO: vector.body
 ;
 ; NPGSO-LABEL: @stride1_pgso(
-; NPGSO-NEXT:  entry:
-; NPGSO-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
-; NPGSO:       vector.scevcheck:
-; NPGSO-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i32 [[BSTRIDE:%.*]], 1
-; NPGSO-NEXT:    br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; NPGSO:       vector.ph:
-; NPGSO-NEXT:    br label [[VECTOR_BODY:%.*]]
-; NPGSO:       vector.body:
-; NPGSO-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NPGSO-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; NPGSO-NEXT:    [[TMP1:%.*]] = mul nsw i32 [[TMP0]], [[BSTRIDE]]
-; NPGSO-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[TMP1]]
-; NPGSO-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[TMP2]], i32 0
-; NPGSO-NEXT:    [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <2 x i16>*
-; NPGSO-NEXT:    store <2 x i16> <i16 42, i16 42>, <2 x i16>* [[TMP4]], align 4
-; NPGSO-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; NPGSO-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
-; NPGSO-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
-; NPGSO:       middle.block:
-; NPGSO-NEXT:    [[CMP_N:%.*]] = icmp eq i32 1025, 1024
-; NPGSO-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; NPGSO:       scalar.ph:
-; NPGSO-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
-; NPGSO-NEXT:    br label [[FOR_BODY:%.*]]
-; NPGSO:       for.body:
-; NPGSO-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; NPGSO-NEXT:    [[MULB:%.*]] = mul nsw i32 [[IV]], [[BSTRIDE]]
-; NPGSO-NEXT:    [[GEPOFB:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[MULB]]
-; NPGSO-NEXT:    store i16 42, i16* [[GEPOFB]], align 4
-; NPGSO-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; NPGSO-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025
-; NPGSO-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
-; NPGSO:       for.end:
-; NPGSO-NEXT:    ret void
-;
+; NPGSO: vector.body
+
 entry:
   br label %for.body
 
@@ -878,24 +317,14 @@ for.end:
 
 ; PR46652: Check that the need for stride==1 check prevents vectorizing a loop
 ; having tiny trip count, when compiling w/o -Os/-Oz.
+; CHECK-LABEL: @pr46652
+; CHECK-NOT: vector.scevcheck
+; CHECK-NOT: vector.body
+; CHECK-LABEL: for.body
 
 @g = external global [1 x i16], align 1
 
 define void @pr46652(i16 %stride) {
-; DEFAULT-LABEL: @pr46652(
-; DEFAULT-NEXT:  entry:
-; DEFAULT-NEXT:    br label [[FOR_BODY:%.*]]
-; DEFAULT:       for.body:
-; DEFAULT-NEXT:    [[L1_02:%.*]] = phi i16 [ 1, [[ENTRY:%.*]] ], [ [[INC9:%.*]], [[FOR_BODY]] ]
-; DEFAULT-NEXT:    [[MUL:%.*]] = mul nsw i16 [[L1_02]], [[STRIDE:%.*]]
-; DEFAULT-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [1 x i16], [1 x i16]* @g, i16 0, i16 [[MUL]]
-; DEFAULT-NEXT:    [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX6]], align 1
-; DEFAULT-NEXT:    [[INC9]] = add nuw nsw i16 [[L1_02]], 1
-; DEFAULT-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i16 [[INC9]], 16
-; DEFAULT-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; DEFAULT:       for.end:
-; DEFAULT-NEXT:    ret void
-;
 entry:
   br label %for.body
 
@@ -915,22 +344,8 @@ for.end:                                        ; preds = %for.body
 ; Make sure we do not crash while building the VPlan for the loop with the
 ; select below.
 define i32 @PR48142(i32* %ptr.start, i32* %ptr.end) optsize {
-; DEFAULT-LABEL: @PR48142(
-; DEFAULT-NEXT:  entry:
-; DEFAULT-NEXT:    br label [[FOR_BODY:%.*]]
-; DEFAULT:       for.body:
-; DEFAULT-NEXT:    [[I_014:%.*]] = phi i32 [ 20, [[ENTRY:%.*]] ], [ [[COND:%.*]], [[FOR_BODY]] ]
-; DEFAULT-NEXT:    [[PTR_IV:%.*]] = phi i32* [ [[PTR_START:%.*]], [[ENTRY]] ], [ [[PTR_NEXT:%.*]], [[FOR_BODY]] ]
-; DEFAULT-NEXT:    [[CMP4:%.*]] = icmp slt i32 [[I_014]], 99
-; DEFAULT-NEXT:    [[COND]] = select i1 [[CMP4]], i32 99, i32 [[I_014]]
-; DEFAULT-NEXT:    store i32 0, i32* [[PTR_IV]], align 4
-; DEFAULT-NEXT:    [[PTR_NEXT]] = getelementptr inbounds i32, i32* [[PTR_IV]], i64 1
-; DEFAULT-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32* [[PTR_NEXT]], [[PTR_END:%.*]]
-; DEFAULT-NEXT:    br i1 [[CMP_NOT]], label [[EXIT:%.*]], label [[FOR_BODY]]
-; DEFAULT:       exit:
-; DEFAULT-NEXT:    [[RES:%.*]] = phi i32 [ [[COND]], [[FOR_BODY]] ]
-; DEFAULT-NEXT:    ret i32 [[RES]]
-;
+; CHECK-LABEL: PR48142
+; CHECK-NOT: vector.body
 entry:
   br label %for.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/tripcount.ll b/llvm/test/Transforms/LoopVectorize/tripcount.ll
index eee01deb737d..e19f4aa85c02 100644
--- a/llvm/test/Transforms/LoopVectorize/tripcount.ll
+++ b/llvm/test/Transforms/LoopVectorize/tripcount.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; This test verifies that the loop vectorizer will not vectorizes low trip count
 ; loops that require runtime checks (Trip count is computed with profile info).
 ; REQUIRES: asserts
@@ -10,114 +9,9 @@ target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"
 
 define i32 @foo_low_trip_count1(i32 %bound) {
 ; Simple loop with low tripcount. Should not be vectorized.
+
 ; CHECK-LABEL: @foo_low_trip_count1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[BOUND:%.*]], 1
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP0]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP0]], 1
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE12:%.*]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE12]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ule <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP4]]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK:       pred.load.if:
-; CHECK-NEXT:    [[TMP11:%.*]] = load i8, i8* [[TMP6]], align 1
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i8> poison, i8 [[TMP11]], i32 0
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; CHECK:       pred.load.continue:
-; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP12]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1
-; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
-; CHECK:       pred.load.if1:
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, i8* [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i8> [[TMP13]], i8 [[TMP15]], i32 1
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
-; CHECK:       pred.load.continue2:
-; CHECK-NEXT:    [[TMP17:%.*]] = phi <4 x i8> [ [[TMP13]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF1]] ]
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2
-; CHECK-NEXT:    br i1 [[TMP18]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK:       pred.load.if3:
-; CHECK-NEXT:    [[TMP19:%.*]] = load i8, i8* [[TMP8]], align 1
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP17]], i8 [[TMP19]], i32 2
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
-; CHECK:       pred.load.continue4:
-; CHECK-NEXT:    [[TMP21:%.*]] = phi <4 x i8> [ [[TMP17]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP20]], [[PRED_LOAD_IF3]] ]
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3
-; CHECK-NEXT:    br i1 [[TMP22]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
-; CHECK:       pred.load.if5:
-; CHECK-NEXT:    [[TMP23:%.*]] = load i8, i8* [[TMP9]], align 1
-; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP23]], i32 3
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
-; CHECK:       pred.load.continue6:
-; CHECK-NEXT:    [[TMP25:%.*]] = phi <4 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq <4 x i8> [[TMP25]], zeroinitializer
-; CHECK-NEXT:    [[TMP27:%.*]] = select <4 x i1> [[TMP26]], <4 x i8> <i8 2, i8 2, i8 2, i8 2>, <4 x i8> <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    br i1 [[TMP28]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; CHECK:       pred.store.if:
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i8> [[TMP27]], i32 0
-; CHECK-NEXT:    store i8 [[TMP29]], i8* [[TMP6]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1
-; CHECK-NEXT:    br i1 [[TMP30]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
-; CHECK:       pred.store.if7:
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i8> [[TMP27]], i32 1
-; CHECK-NEXT:    store i8 [[TMP31]], i8* [[TMP7]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
-; CHECK:       pred.store.continue8:
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2
-; CHECK-NEXT:    br i1 [[TMP32]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
-; CHECK:       pred.store.if9:
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x i8> [[TMP27]], i32 2
-; CHECK-NEXT:    store i8 [[TMP33]], i8* [[TMP8]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE10]]
-; CHECK:       pred.store.continue10:
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3
-; CHECK-NEXT:    br i1 [[TMP34]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]]
-; CHECK:       pred.store.if11:
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <4 x i8> [[TMP27]], i32 3
-; CHECK-NEXT:    store i8 [[TMP35]], i8* [[TMP9]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
-; CHECK:       pred.store.continue12:
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF0:![0-9]+]], !llvm.loop [[LOOP1:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT:    [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP37]], 0
-; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], [[BOUND]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !prof [[PROF3:![0-9]+]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 0
-;
+; CHECK-NOT: <{{[0-9]+}} x i8>
 
 entry:
   br label %for.body
@@ -140,114 +34,9 @@ for.end:                                          ; preds = %for.body
 define i32 @foo_low_trip_count2(i32 %bound) !prof !0 {
 ; The loop has a same invocation count with the function, but has a low
 ; trip_count per invocation and not worth to vectorize.
+
 ; CHECK-LABEL: @foo_low_trip_count2(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[BOUND:%.*]], 1
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP0]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP0]], 1
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE12:%.*]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE12]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ule <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP4]]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK:       pred.load.if:
-; CHECK-NEXT:    [[TMP11:%.*]] = load i8, i8* [[TMP6]], align 1
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i8> poison, i8 [[TMP11]], i32 0
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; CHECK:       pred.load.continue:
-; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP12]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1
-; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
-; CHECK:       pred.load.if1:
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, i8* [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i8> [[TMP13]], i8 [[TMP15]], i32 1
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
-; CHECK:       pred.load.continue2:
-; CHECK-NEXT:    [[TMP17:%.*]] = phi <4 x i8> [ [[TMP13]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF1]] ]
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2
-; CHECK-NEXT:    br i1 [[TMP18]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK:       pred.load.if3:
-; CHECK-NEXT:    [[TMP19:%.*]] = load i8, i8* [[TMP8]], align 1
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP17]], i8 [[TMP19]], i32 2
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
-; CHECK:       pred.load.continue4:
-; CHECK-NEXT:    [[TMP21:%.*]] = phi <4 x i8> [ [[TMP17]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP20]], [[PRED_LOAD_IF3]] ]
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3
-; CHECK-NEXT:    br i1 [[TMP22]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
-; CHECK:       pred.load.if5:
-; CHECK-NEXT:    [[TMP23:%.*]] = load i8, i8* [[TMP9]], align 1
-; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP23]], i32 3
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
-; CHECK:       pred.load.continue6:
-; CHECK-NEXT:    [[TMP25:%.*]] = phi <4 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq <4 x i8> [[TMP25]], zeroinitializer
-; CHECK-NEXT:    [[TMP27:%.*]] = select <4 x i1> [[TMP26]], <4 x i8> <i8 2, i8 2, i8 2, i8 2>, <4 x i8> <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    br i1 [[TMP28]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; CHECK:       pred.store.if:
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i8> [[TMP27]], i32 0
-; CHECK-NEXT:    store i8 [[TMP29]], i8* [[TMP6]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1
-; CHECK-NEXT:    br i1 [[TMP30]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
-; CHECK:       pred.store.if7:
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i8> [[TMP27]], i32 1
-; CHECK-NEXT:    store i8 [[TMP31]], i8* [[TMP7]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
-; CHECK:       pred.store.continue8:
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2
-; CHECK-NEXT:    br i1 [[TMP32]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
-; CHECK:       pred.store.if9:
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x i8> [[TMP27]], i32 2
-; CHECK-NEXT:    store i8 [[TMP33]], i8* [[TMP8]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE10]]
-; CHECK:       pred.store.continue10:
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3
-; CHECK-NEXT:    br i1 [[TMP34]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]]
-; CHECK:       pred.store.if11:
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <4 x i8> [[TMP27]], i32 3
-; CHECK-NEXT:    store i8 [[TMP35]], i8* [[TMP9]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
-; CHECK:       pred.store.continue12:
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF0]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT:    [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP37]], 0
-; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], [[BOUND]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !prof [[PROF3]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 0
-;
+; CHECK-NOT: <{{[0-9]+}} x i8>
 
 entry:
   br label %for.body
@@ -270,52 +59,12 @@ for.end:                                          ; preds = %for.body
 define i32 @foo_low_trip_count3(i1 %cond, i32 %bound) !prof !0 {
 ; The loop has low invocation count compare to the function invocation count,
 ; but has a high trip count per invocation. Vectorize it.
+
 ; CHECK-LABEL: @foo_low_trip_count3(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[COND:%.*]], label [[FOR_PREHEADER:%.*]], label [[FOR_END:%.*]], !prof [[PROF9:![0-9]+]]
-; CHECK:       for.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[BOUND:%.*]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i8> <i8 2, i8 2, i8 2, i8 2>, <4 x i8> <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
-; CHECK-NEXT:    store <4 x i8> [[TMP6]], <4 x i8>* [[TMP7]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF10:![0-9]+]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP9]], 0
-; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], [[BOUND]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !prof [[PROF12:![0-9]+]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 0
-;
+; CHECK:  [[VECTOR_BODY:vector\.body]]:
+; CHECK:    br i1 [[TMP9:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP3:\!.*]],
+; CHECK:  [[FOR_BODY:for\.body]]:
+; CHECK:    br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP6:\!.*]],
 entry:
   br i1 %cond, label %for.preheader, label %for.end, !prof !2
 
@@ -340,115 +89,9 @@ for.end:                                          ; preds = %for.body
 define i32 @foo_low_trip_count_icmp_sgt(i32 %bound) {
 ; Simple loop with low tripcount and inequality test for exit.
 ; Should not be vectorized.
+
 ; CHECK-LABEL: @foo_low_trip_count_icmp_sgt(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[BOUND:%.*]], i32 -1)
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[SMAX]], 2
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP0]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP0]], 1
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE12:%.*]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE12]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ule <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP4]]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK:       pred.load.if:
-; CHECK-NEXT:    [[TMP11:%.*]] = load i8, i8* [[TMP6]], align 1
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i8> poison, i8 [[TMP11]], i32 0
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; CHECK:       pred.load.continue:
-; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP12]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1
-; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
-; CHECK:       pred.load.if1:
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, i8* [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i8> [[TMP13]], i8 [[TMP15]], i32 1
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
-; CHECK:       pred.load.continue2:
-; CHECK-NEXT:    [[TMP17:%.*]] = phi <4 x i8> [ [[TMP13]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF1]] ]
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2
-; CHECK-NEXT:    br i1 [[TMP18]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK:       pred.load.if3:
-; CHECK-NEXT:    [[TMP19:%.*]] = load i8, i8* [[TMP8]], align 1
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP17]], i8 [[TMP19]], i32 2
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
-; CHECK:       pred.load.continue4:
-; CHECK-NEXT:    [[TMP21:%.*]] = phi <4 x i8> [ [[TMP17]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP20]], [[PRED_LOAD_IF3]] ]
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3
-; CHECK-NEXT:    br i1 [[TMP22]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
-; CHECK:       pred.load.if5:
-; CHECK-NEXT:    [[TMP23:%.*]] = load i8, i8* [[TMP9]], align 1
-; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP23]], i32 3
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
-; CHECK:       pred.load.continue6:
-; CHECK-NEXT:    [[TMP25:%.*]] = phi <4 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq <4 x i8> [[TMP25]], zeroinitializer
-; CHECK-NEXT:    [[TMP27:%.*]] = select <4 x i1> [[TMP26]], <4 x i8> <i8 2, i8 2, i8 2, i8 2>, <4 x i8> <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    br i1 [[TMP28]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; CHECK:       pred.store.if:
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i8> [[TMP27]], i32 0
-; CHECK-NEXT:    store i8 [[TMP29]], i8* [[TMP6]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1
-; CHECK-NEXT:    br i1 [[TMP30]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
-; CHECK:       pred.store.if7:
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i8> [[TMP27]], i32 1
-; CHECK-NEXT:    store i8 [[TMP31]], i8* [[TMP7]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
-; CHECK:       pred.store.continue8:
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2
-; CHECK-NEXT:    br i1 [[TMP32]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
-; CHECK:       pred.store.if9:
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x i8> [[TMP27]], i32 2
-; CHECK-NEXT:    store i8 [[TMP33]], i8* [[TMP8]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE10]]
-; CHECK:       pred.store.continue10:
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3
-; CHECK-NEXT:    br i1 [[TMP34]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]]
-; CHECK:       pred.store.if11:
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <4 x i8> [[TMP27]], i32 3
-; CHECK-NEXT:    store i8 [[TMP35]], i8* [[TMP9]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
-; CHECK:       pred.store.continue12:
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF0]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT:    [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP37]], 0
-; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp sgt i32 [[I_08]], [[BOUND]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !prof [[PROF3]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 0
-;
+; CHECK-NOT: <{{[0-9]+}} x i8>
 
 entry:
   br label %for.body
@@ -470,107 +113,9 @@ for.end:                                          ; preds = %for.body
 
 define i32 @const_low_trip_count() {
 ; Simple loop with constant, small trip count and no profiling info.
-; CHECK-LABEL: @const_low_trip_count(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE12:%.*]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE12]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ule <4 x i32> [[VEC_IND]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK:       pred.load.if:
-; CHECK-NEXT:    [[TMP10:%.*]] = load i8, i8* [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i8> poison, i8 [[TMP10]], i32 0
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; CHECK:       pred.load.continue:
-; CHECK-NEXT:    [[TMP12:%.*]] = phi <4 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
-; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
-; CHECK:       pred.load.if1:
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, i8* [[TMP6]], align 1
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x i8> [[TMP12]], i8 [[TMP14]], i32 1
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
-; CHECK:       pred.load.continue2:
-; CHECK-NEXT:    [[TMP16:%.*]] = phi <4 x i8> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP15]], [[PRED_LOAD_IF1]] ]
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
-; CHECK-NEXT:    br i1 [[TMP17]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK:       pred.load.if3:
-; CHECK-NEXT:    [[TMP18:%.*]] = load i8, i8* [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP18]], i32 2
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
-; CHECK:       pred.load.continue4:
-; CHECK-NEXT:    [[TMP20:%.*]] = phi <4 x i8> [ [[TMP16]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], [[PRED_LOAD_IF3]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
-; CHECK-NEXT:    br i1 [[TMP21]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
-; CHECK:       pred.load.if5:
-; CHECK-NEXT:    [[TMP22:%.*]] = load i8, i8* [[TMP8]], align 1
-; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP22]], i32 3
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
-; CHECK:       pred.load.continue6:
-; CHECK-NEXT:    [[TMP24:%.*]] = phi <4 x i8> [ [[TMP20]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP23]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq <4 x i8> [[TMP24]], zeroinitializer
-; CHECK-NEXT:    [[TMP26:%.*]] = select <4 x i1> [[TMP25]], <4 x i8> <i8 2, i8 2, i8 2, i8 2>, <4 x i8> <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; CHECK-NEXT:    br i1 [[TMP27]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; CHECK:       pred.store.if:
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i8> [[TMP26]], i32 0
-; CHECK-NEXT:    store i8 [[TMP28]], i8* [[TMP5]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
-; CHECK-NEXT:    br i1 [[TMP29]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
-; CHECK:       pred.store.if7:
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i8> [[TMP26]], i32 1
-; CHECK-NEXT:    store i8 [[TMP30]], i8* [[TMP6]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
-; CHECK:       pred.store.continue8:
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
-; CHECK-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
-; CHECK:       pred.store.if9:
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i8> [[TMP26]], i32 2
-; CHECK-NEXT:    store i8 [[TMP32]], i8* [[TMP7]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE10]]
-; CHECK:       pred.store.continue10:
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
-; CHECK-NEXT:    br i1 [[TMP33]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]]
-; CHECK:       pred.store.if11:
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i8> [[TMP26]], i32 3
-; CHECK-NEXT:    store i8 [[TMP34]], i8* [[TMP8]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
-; CHECK:       pred.store.continue12:
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
-; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT:    [[TMP36:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP36]], 0
-; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp slt i32 [[I_08]], 2
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 0
-;
+
+; CHECK-LABEL: @const_low_trip_count
+; CHECK-NOT: <{{[0-9]+}} x i8>
 
 entry:
   br label %for.body
@@ -592,44 +137,9 @@ for.end:                                          ; preds = %for.body
 
 define i32 @const_large_trip_count() {
 ; Simple loop with constant large trip count and no profiling info.
-; CHECK-LABEL: @const_large_trip_count(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i8> <i8 2, i8 2, i8 2, i8 2>, <4 x i8> <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
-; CHECK-NEXT:    store <4 x i8> [[TMP5]], <4 x i8>* [[TMP6]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 1001, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP8]], 0
-; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp slt i32 [[I_08]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 0
-;
+
+; CHECK-LABEL: @const_large_trip_count
+; CHECK: <{{[0-9]+}} x i8>
 
 entry:
   br label %for.body
@@ -651,109 +161,9 @@ for.end:                                          ; preds = %for.body
 
 define i32 @const_small_trip_count_step() {
 ; Simple loop with static, small trip count and no profiling info.
-; CHECK-LABEL: @const_small_trip_count_step(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE12:%.*]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], 5
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 5
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 10
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], 15
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ule <4 x i32> [[VEC_IV]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK:       pred.load.if:
-; CHECK-NEXT:    [[TMP10:%.*]] = load i8, i8* [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i8> poison, i8 [[TMP10]], i32 0
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; CHECK:       pred.load.continue:
-; CHECK-NEXT:    [[TMP12:%.*]] = phi <4 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
-; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
-; CHECK:       pred.load.if1:
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, i8* [[TMP6]], align 1
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x i8> [[TMP12]], i8 [[TMP14]], i32 1
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
-; CHECK:       pred.load.continue2:
-; CHECK-NEXT:    [[TMP16:%.*]] = phi <4 x i8> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP15]], [[PRED_LOAD_IF1]] ]
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
-; CHECK-NEXT:    br i1 [[TMP17]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK:       pred.load.if3:
-; CHECK-NEXT:    [[TMP18:%.*]] = load i8, i8* [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[TMP18]], i32 2
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
-; CHECK:       pred.load.continue4:
-; CHECK-NEXT:    [[TMP20:%.*]] = phi <4 x i8> [ [[TMP16]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], [[PRED_LOAD_IF3]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
-; CHECK-NEXT:    br i1 [[TMP21]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
-; CHECK:       pred.load.if5:
-; CHECK-NEXT:    [[TMP22:%.*]] = load i8, i8* [[TMP8]], align 1
-; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP22]], i32 3
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
-; CHECK:       pred.load.continue6:
-; CHECK-NEXT:    [[TMP24:%.*]] = phi <4 x i8> [ [[TMP20]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP23]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq <4 x i8> [[TMP24]], zeroinitializer
-; CHECK-NEXT:    [[TMP26:%.*]] = select <4 x i1> [[TMP25]], <4 x i8> <i8 2, i8 2, i8 2, i8 2>, <4 x i8> <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; CHECK-NEXT:    br i1 [[TMP27]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; CHECK:       pred.store.if:
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i8> [[TMP26]], i32 0
-; CHECK-NEXT:    store i8 [[TMP28]], i8* [[TMP5]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
-; CHECK-NEXT:    br i1 [[TMP29]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
-; CHECK:       pred.store.if7:
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i8> [[TMP26]], i32 1
-; CHECK-NEXT:    store i8 [[TMP30]], i8* [[TMP6]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
-; CHECK:       pred.store.continue8:
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
-; CHECK-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
-; CHECK:       pred.store.if9:
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i8> [[TMP26]], i32 2
-; CHECK-NEXT:    store i8 [[TMP32]], i8* [[TMP7]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE10]]
-; CHECK:       pred.store.continue10:
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
-; CHECK-NEXT:    br i1 [[TMP33]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]]
-; CHECK:       pred.store.if11:
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i8> [[TMP26]], i32 3
-; CHECK-NEXT:    store i8 [[TMP34]], i8* [[TMP8]], align 1
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
-; CHECK:       pred.store.continue12:
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
-; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 20, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT:    [[TMP36:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP36]], 0
-; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 5
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp slt i32 [[I_08]], 10
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP21:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 0
-;
+
+; CHECK-LABEL: @const_small_trip_count_step
+; CHECK-NOT: <{{[0-9]+}} x i8>
 
 entry:
   br label %for.body
@@ -775,44 +185,9 @@ for.end:                                          ; preds = %for.body
 
 define i32 @const_trip_over_profile() {
 ; constant trip count takes precedence over profile data
-; CHECK-LABEL: @const_trip_over_profile(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i8> <i8 2, i8 2, i8 2, i8 2>, <4 x i8> <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
-; CHECK-NEXT:    store <4 x i8> [[TMP5]], <4 x i8>* [[TMP6]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 1001, 1000
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP8]], 0
-; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp slt i32 [[I_08]], 1000
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END]], !prof [[PROF3]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 0
-;
+
+; CHECK-LABEL: @const_trip_over_profile
+; CHECK: <{{[0-9]+}} x i8>
 
 entry:
   br label %for.body
@@ -832,6 +207,8 @@ for.end:                                          ; preds = %for.body
   ret i32 0
 }
 
+; CHECK: [[LP3]] = !{!"branch_weights", i32 10, i32 2490}
+; CHECK: [[LP6]] = !{!"branch_weights", i32 10, i32 0}
 ; original loop has latchExitWeight=10 and backedgeTakenWeight=10,000,
 ; therefore estimatedBackedgeTakenCount=1,000 and estimatedTripCount=1,001.
 ; Vectorizing by 4 produces estimatedTripCounts of 1,001/4=250 and 1,001%4=1

diff  --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
index dd8e02013e2c..1c94abe7bce1 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
@@ -702,6 +702,7 @@ define void @merge_3_replicate_region(i32 %k, i32 %j) {
 ; CHECK-NEXT: Successor(s): then.0
 ; CHECK-EMPTY:
 ; CHECK-NEXT: then.0:
+; CHECK-NEXT:   WIDEN ir<%mul> = mul vp<[[PRED1]]>, vp<[[PRED2]]>
 ; CHECK-NEXT:   EMIT vp<[[MASK2:%.+]]> = select vp<[[MASK]]> ir<%c.0> ir<false>
 ; CHECK-NEXT: Successor(s): pred.store
 ; CHECK-EMPTY:
@@ -712,7 +713,6 @@ define void @merge_3_replicate_region(i32 %k, i32 %j) {
 ; CHECK-NEXT:   CondBit: vp<[[MASK2]]> (then.0)
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   pred.store.if:
-; CHECK-NEXT:     REPLICATE ir<%mul> = mul vp<[[PRED1]]>, vp<[[PRED2]]>
 ; CHECK-NEXT:     REPLICATE ir<%gep.c.1> = getelementptr ir<@c>, ir<0>, ir<%iv>
 ; CHECK-NEXT:     REPLICATE store ir<%mul>, ir<%gep.c.1>
 ; CHECK-NEXT:   Successor(s): pred.store.continue
@@ -732,8 +732,6 @@ define void @merge_3_replicate_region(i32 %k, i32 %j) {
 ; CHECK-NEXT:   EMIT branch-on-count vp<[[CAN_IV_NEXT]]> vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
 ;
 entry:
   br label %loop


        


More information about the llvm-commits mailing list