[llvm] fd9cd34 - Revert "[InstCombine] fold opposite shifts around an add"

Sanjay Patel via llvm-commits <llvm-commits at lists.llvm.org>
Thu Dec 30 08:55:46 PST 2021


Author: Sanjay Patel
Date: 2021-12-30T11:54:55-05:00
New Revision: fd9cd3408baff99e4982be5357057909b7b2b005

URL: https://github.com/llvm/llvm-project/commit/fd9cd3408baff99e4982be5357057909b7b2b005
DIFF: https://github.com/llvm/llvm-project/commit/fd9cd3408baff99e4982be5357057909b7b2b005.diff

LOG: Revert "[InstCombine] fold opposite shifts around an add"

This reverts commit 2e3e0a5c288041745f88a06e66a831c236a3bb1f.
Some unintended diffs snuck into this patch.
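
For context, the reverted fold rewrote ((X << C) + Y) >>u C as
(X + (Y >>u C)) & (-1 >>u C); see the InstCombineShifts.cpp hunk below.
A minimal standalone C++ check of that identity for 8-bit values follows.
This is an illustrative sketch, not LLVM code:

    #include <cassert>
    #include <cstdint>

    // Exhaustively verify ((X << C) + Y) >>u C == (X + (Y >>u C)) & (-1 >>u C)
    // for all 8-bit X, Y and shift amounts 1..7; arithmetic wraps mod 256.
    int main() {
      for (unsigned x = 0; x < 256; ++x)
        for (unsigned y = 0; y < 256; ++y)
          for (unsigned c = 1; c < 8; ++c) {
            uint8_t shl = uint8_t(x << c);          // X << C  (mod 256)
            uint8_t lhs = uint8_t(shl + y) >> c;    // ((X << C) + Y) >>u C
            uint8_t rhs = uint8_t(x + (y >> c)) & uint8_t(0xFFu >> c);
            assert(lhs == rhs);
          }
      return 0;
    }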

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/BasicTTIImpl.h
    llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
    llvm/test/Transforms/InstCombine/lshr.ll
    llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 61be99d2b9f05..324b7dcfb3aca 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -84,13 +84,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
 
   /// Estimate a cost of Broadcast as an extract and sequence of insert
   /// operations.
-  InstructionCost getBroadcastShuffleOverhead(VectorType *VTy) {
+  InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) {
     InstructionCost Cost = 0;
     // Broadcast cost is equal to the cost of extracting the zero'th element
     // plus the cost of inserting it into every element of the result vector.
     Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0);
 
-    for (int i = 0, e = VTy->getElementCount().getKnownMinValue(); i < e; ++i) {
+    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
     }
     return Cost;
@@ -875,7 +875,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
 
     switch (improveShuffleKindFromMask(Kind, Mask)) {
     case TTI::SK_Broadcast:
-      return getBroadcastShuffleOverhead(Tp);
+      return getBroadcastShuffleOverhead(cast<FixedVectorType>(Tp));
     case TTI::SK_Select:
     case TTI::SK_Splice:
     case TTI::SK_Reverse:

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 0ade25f768253..06421d5539156 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -1057,21 +1057,6 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
       return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
     }
 
-    // ((X << C) + Y) >>u C --> (X + (Y >>u C)) & (-1 >>u C)
-    // TODO: Consolidate with the more general transform that starts from shl
-    //       (the shifts are in the opposite order).
-    Value *Y;
-    if (match(Op0,
-              m_OneUse(m_c_Add(m_OneUse(m_Shl(m_Value(X), m_Specific(Op1))),
-                               m_Value(Y))))) {
-      Value *NewLshr = Builder.CreateLShr(Y, Op1);
-      Value *NewAdd = Builder.CreateAdd(NewLshr, X);
-      unsigned Op1Val = C->getLimitedValue(BitWidth);
-      APInt Bits = APInt::getLowBitsSet(BitWidth, BitWidth - Op1Val);
-      Constant *Mask = ConstantInt::get(Ty, Bits);
-      return BinaryOperator::CreateAnd(NewAdd, Mask);
-    }
-
     if (match(Op0, m_OneUse(m_ZExt(m_Value(X)))) &&
         (!Ty->isIntegerTy() || shouldChangeType(Ty, X->getType()))) {
       assert(ShAmtC < X->getType()->getScalarSizeInBits() &&
@@ -1109,6 +1094,7 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
       }
     }
 
+    Value *Y;
     if (ShAmtC == BitWidth - 1) {
       // lshr i32 or(X,-X), 31 --> zext (X != 0)
       if (match(Op0, m_OneUse(m_c_Or(m_Neg(m_Value(X)), m_Deferred(X)))))

diff --git a/llvm/test/Transforms/InstCombine/lshr.ll b/llvm/test/Transforms/InstCombine/lshr.ll
index a158f5ecba92c..bfad4bcd27ab3 100644
--- a/llvm/test/Transforms/InstCombine/lshr.ll
+++ b/llvm/test/Transforms/InstCombine/lshr.ll
@@ -138,8 +138,9 @@ define i8 @lshr_cttz_zero_is_undef_vec(<2 x i8> %x) {
 
 define i8 @lshr_exact(i8 %x) {
 ; CHECK-LABEL: @lshr_exact(
-; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X:%.*]], 1
-; CHECK-NEXT:    [[LSHR:%.*]] = and i8 [[TMP1]], 63
+; CHECK-NEXT:    [[SHL:%.*]] = shl i8 [[X:%.*]], 2
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[SHL]], 4
+; CHECK-NEXT:    [[LSHR:%.*]] = lshr exact i8 [[ADD]], 2
 ; CHECK-NEXT:    ret i8 [[LSHR]]
 ;
   %shl = shl i8 %x, 2
@@ -150,8 +151,9 @@ define i8 @lshr_exact(i8 %x) {
 
 define <2 x i8> @lshr_exact_splat_vec(<2 x i8> %x) {
 ; CHECK-LABEL: @lshr_exact_splat_vec(
-; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i8> [[X:%.*]], <i8 1, i8 1>
-; CHECK-NEXT:    [[LSHR:%.*]] = and <2 x i8> [[TMP1]], <i8 63, i8 63>
+; CHECK-NEXT:    [[SHL:%.*]] = shl <2 x i8> [[X:%.*]], <i8 2, i8 2>
+; CHECK-NEXT:    [[ADD:%.*]] = add <2 x i8> [[SHL]], <i8 4, i8 4>
+; CHECK-NEXT:    [[LSHR:%.*]] = lshr exact <2 x i8> [[ADD]], <i8 2, i8 2>
 ; CHECK-NEXT:    ret <2 x i8> [[LSHR]]
 ;
   %shl = shl <2 x i8> %x, <i8 2, i8 2>
@@ -162,9 +164,9 @@ define <2 x i8> @lshr_exact_splat_vec(<2 x i8> %x) {
 
 define i8 @shl_add(i8 %x, i8 %y) {
 ; CHECK-LABEL: @shl_add(
-; CHECK-NEXT:    [[TMP1:%.*]] = lshr i8 [[Y:%.*]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[TMP1]], [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[TMP2]], 63
+; CHECK-NEXT:    [[L:%.*]] = shl i8 [[X:%.*]], 2
+; CHECK-NEXT:    [[A:%.*]] = add i8 [[L]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = lshr i8 [[A]], 2
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %l = shl i8 %x, 2
@@ -176,9 +178,9 @@ define i8 @shl_add(i8 %x, i8 %y) {
 define <2 x i8> @shl_add_commute_vec(<2 x i8> %x, <2 x i8> %py) {
 ; CHECK-LABEL: @shl_add_commute_vec(
 ; CHECK-NEXT:    [[Y:%.*]] = mul <2 x i8> [[PY:%.*]], [[PY]]
-; CHECK-NEXT:    [[TMP1:%.*]] = lshr <2 x i8> [[Y]], <i8 3, i8 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = add <2 x i8> [[TMP1]], [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = and <2 x i8> [[TMP2]], <i8 31, i8 31>
+; CHECK-NEXT:    [[L:%.*]] = shl <2 x i8> [[X:%.*]], <i8 3, i8 3>
+; CHECK-NEXT:    [[A:%.*]] = add <2 x i8> [[Y]], [[L]]
+; CHECK-NEXT:    [[R:%.*]] = lshr <2 x i8> [[A]], <i8 3, i8 3>
 ; CHECK-NEXT:    ret <2 x i8> [[R]]
 ;
   %y = mul <2 x i8> %py, %py ; thwart complexity-based canonicalization

diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
index c1363547ad989..fbdd6b539d216 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -1406,23 +1406,25 @@ define void @test_gather_not_profitable_pr48429(i32 %d, float* readonly %ptr, fl
 ; AVX512:       for.body.lr.ph:
 ; AVX512-NEXT:    [[MUL:%.*]] = sub nsw i32 0, [[D]]
 ; AVX512-NEXT:    [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64
-; AVX512-NEXT:    [[TMP0:%.*]] = add nsw i64 [[IDX_EXT]], 4611686018427387903
-; AVX512-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 4611686018427387903
-; AVX512-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; AVX512-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 15
+; AVX512-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[IDX_EXT]], 2
+; AVX512-NEXT:    [[TMP1:%.*]] = add nsw i64 [[TMP0]], -4
+; AVX512-NEXT:    [[TMP2:%.*]] = lshr exact i64 [[TMP1]], 2
+; AVX512-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; AVX512-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 60
 ; AVX512-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; AVX512:       vector.memcheck:
-; AVX512-NEXT:    [[TMP3:%.*]] = add nsw i64 [[IDX_EXT]], 4611686018427387903
-; AVX512-NEXT:    [[TMP4:%.*]] = and i64 [[TMP3]], 4611686018427387903
-; AVX512-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP3]], 4
-; AVX512-NEXT:    [[TMP6:%.*]] = or i64 [[TMP5]], 2
-; AVX512-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[DEST:%.*]], i64 [[TMP6]]
-; AVX512-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[TMP4]], 1
-; AVX512-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP7]]
+; AVX512-NEXT:    [[TMP4:%.*]] = shl nsw i64 [[IDX_EXT]], 2
+; AVX512-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP4]], -4
+; AVX512-NEXT:    [[TMP6:%.*]] = lshr exact i64 [[TMP5]], 2
+; AVX512-NEXT:    [[TMP7:%.*]] = shl nsw i64 [[TMP5]], 2
+; AVX512-NEXT:    [[TMP8:%.*]] = or i64 [[TMP7]], 2
+; AVX512-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[DEST:%.*]], i64 [[TMP8]]
+; AVX512-NEXT:    [[TMP9:%.*]] = add nuw nsw i64 [[TMP6]], 1
+; AVX512-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP9]]
 ; AVX512-NEXT:    [[SCEVGEP6:%.*]] = getelementptr float, float* [[PTR]], i64 [[IDXPROM]]
-; AVX512-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP4]], 1
-; AVX512-NEXT:    [[TMP9:%.*]] = sub i64 [[TMP8]], [[IDX_EXT]]
-; AVX512-NEXT:    [[SCEVGEP8:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP9]]
+; AVX512-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 1
+; AVX512-NEXT:    [[TMP11:%.*]] = sub i64 [[TMP10]], [[IDX_EXT]]
+; AVX512-NEXT:    [[SCEVGEP8:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP11]]
 ; AVX512-NEXT:    [[BOUND0:%.*]] = icmp ugt float* [[SCEVGEP4]], [[DEST]]
 ; AVX512-NEXT:    [[BOUND1:%.*]] = icmp ugt float* [[SCEVGEP]], [[PTR]]
 ; AVX512-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
@@ -1432,117 +1434,117 @@ define void @test_gather_not_profitable_pr48429(i32 %d, float* readonly %ptr, fl
 ; AVX512-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]]
 ; AVX512-NEXT:    br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_PH:%.*]]
 ; AVX512:       vector.ph:
-; AVX512-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775792
+; AVX512-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP3]], 9223372036854775792
 ; AVX512-NEXT:    [[IND_END:%.*]] = getelementptr float, float* [[PTR]], i64 [[N_VEC]]
-; AVX512-NEXT:    [[TMP10:%.*]] = shl i64 [[N_VEC]], 4
-; AVX512-NEXT:    [[IND_END14:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP10]]
-; AVX512-NEXT:    [[TMP11:%.*]] = add nsw i64 [[N_VEC]], -16
-; AVX512-NEXT:    [[TMP12:%.*]] = lshr exact i64 [[TMP11]], 4
-; AVX512-NEXT:    [[TMP13:%.*]] = add nuw nsw i64 [[TMP12]], 1
-; AVX512-NEXT:    [[XTRAITER:%.*]] = and i64 [[TMP13]], 7
-; AVX512-NEXT:    [[TMP14:%.*]] = icmp ult i64 [[TMP11]], 112
-; AVX512-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
+; AVX512-NEXT:    [[TMP12:%.*]] = shl i64 [[N_VEC]], 4
+; AVX512-NEXT:    [[IND_END14:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP12]]
+; AVX512-NEXT:    [[TMP13:%.*]] = add nsw i64 [[N_VEC]], -16
+; AVX512-NEXT:    [[TMP14:%.*]] = lshr exact i64 [[TMP13]], 4
+; AVX512-NEXT:    [[TMP15:%.*]] = add nuw nsw i64 [[TMP14]], 1
+; AVX512-NEXT:    [[XTRAITER:%.*]] = and i64 [[TMP15]], 7
+; AVX512-NEXT:    [[TMP16:%.*]] = icmp ult i64 [[TMP13]], 112
+; AVX512-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
 ; AVX512:       vector.ph.new:
-; AVX512-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[TMP13]], 2305843009213693944
+; AVX512-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[TMP15]], 2305843009213693944
 ; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AVX512:       vector.body:
 ; AVX512-NEXT:    [[POINTER_PHI:%.*]] = phi float* [ [[DEST]], [[VECTOR_PH_NEW]] ], [ [[PTR_IND_7:%.*]], [[VECTOR_BODY]] ]
 ; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_7:%.*]], [[VECTOR_BODY]] ]
 ; AVX512-NEXT:    [[NITER:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[NITER_NEXT_7:%.*]], [[VECTOR_BODY]] ]
 ; AVX512-NEXT:    [[NEXT_GEP:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX]]
-; AVX512-NEXT:    [[TMP15:%.*]] = getelementptr float, float* [[POINTER_PHI]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
-; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP]], i64 [[IDXPROM]]
-; AVX512-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP16]] to <16 x float>*
-; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x float>, <16 x float>* [[TMP17]], align 4, !alias.scope !2
-; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD]], <16 x float*> [[TMP15]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
-; AVX512-NEXT:    [[TMP18:%.*]] = bitcast float* [[NEXT_GEP]] to <16 x float>*
-; AVX512-NEXT:    [[WIDE_LOAD15:%.*]] = load <16 x float>, <16 x float>* [[TMP18]], align 4, !alias.scope !9
-; AVX512-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP15]], i64 1
-; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15]], <16 x float*> [[TMP19]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP17:%.*]] = getelementptr float, float* [[POINTER_PHI]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP]], i64 [[IDXPROM]]
+; AVX512-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>*
+; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x float>, <16 x float>* [[TMP19]], align 4, !alias.scope !2
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD]], <16 x float*> [[TMP17]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP20:%.*]] = bitcast float* [[NEXT_GEP]] to <16 x float>*
+; AVX512-NEXT:    [[WIDE_LOAD15:%.*]] = load <16 x float>, <16 x float>* [[TMP20]], align 4, !alias.scope !9
+; AVX512-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP17]], i64 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15]], <16 x float*> [[TMP21]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
 ; AVX512-NEXT:    [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 16
 ; AVX512-NEXT:    [[PTR_IND:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 256
 ; AVX512-NEXT:    [[NEXT_GEP_1:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT]]
-; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr float, float* [[PTR_IND]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
-; AVX512-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_1]], i64 [[IDXPROM]]
-; AVX512-NEXT:    [[TMP22:%.*]] = bitcast float* [[TMP21]] to <16 x float>*
-; AVX512-NEXT:    [[WIDE_LOAD_1:%.*]] = load <16 x float>, <16 x float>* [[TMP22]], align 4, !alias.scope !2
-; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_1]], <16 x float*> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
-; AVX512-NEXT:    [[TMP23:%.*]] = bitcast float* [[NEXT_GEP_1]] to <16 x float>*
-; AVX512-NEXT:    [[WIDE_LOAD15_1:%.*]] = load <16 x float>, <16 x float>* [[TMP23]], align 4, !alias.scope !9
-; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP20]], i64 1
-; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_1]], <16 x float*> [[TMP24]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr float, float* [[PTR_IND]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_1]], i64 [[IDXPROM]]
+; AVX512-NEXT:    [[TMP24:%.*]] = bitcast float* [[TMP23]] to <16 x float>*
+; AVX512-NEXT:    [[WIDE_LOAD_1:%.*]] = load <16 x float>, <16 x float>* [[TMP24]], align 4, !alias.scope !2
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_1]], <16 x float*> [[TMP22]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP25:%.*]] = bitcast float* [[NEXT_GEP_1]] to <16 x float>*
+; AVX512-NEXT:    [[WIDE_LOAD15_1:%.*]] = load <16 x float>, <16 x float>* [[TMP25]], align 4, !alias.scope !9
+; AVX512-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP22]], i64 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_1]], <16 x float*> [[TMP26]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
 ; AVX512-NEXT:    [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX]], 32
 ; AVX512-NEXT:    [[PTR_IND_1:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 512
 ; AVX512-NEXT:    [[NEXT_GEP_2:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_1]]
-; AVX512-NEXT:    [[TMP25:%.*]] = getelementptr float, float* [[PTR_IND_1]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
-; AVX512-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_2]], i64 [[IDXPROM]]
-; AVX512-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP26]] to <16 x float>*
-; AVX512-NEXT:    [[WIDE_LOAD_2:%.*]] = load <16 x float>, <16 x float>* [[TMP27]], align 4, !alias.scope !2
-; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_2]], <16 x float*> [[TMP25]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
-; AVX512-NEXT:    [[TMP28:%.*]] = bitcast float* [[NEXT_GEP_2]] to <16 x float>*
-; AVX512-NEXT:    [[WIDE_LOAD15_2:%.*]] = load <16 x float>, <16 x float>* [[TMP28]], align 4, !alias.scope !9
-; AVX512-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP25]], i64 1
-; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_2]], <16 x float*> [[TMP29]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP27:%.*]] = getelementptr float, float* [[PTR_IND_1]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_2]], i64 [[IDXPROM]]
+; AVX512-NEXT:    [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>*
+; AVX512-NEXT:    [[WIDE_LOAD_2:%.*]] = load <16 x float>, <16 x float>* [[TMP29]], align 4, !alias.scope !2
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_2]], <16 x float*> [[TMP27]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP30:%.*]] = bitcast float* [[NEXT_GEP_2]] to <16 x float>*
+; AVX512-NEXT:    [[WIDE_LOAD15_2:%.*]] = load <16 x float>, <16 x float>* [[TMP30]], align 4, !alias.scope !9
+; AVX512-NEXT:    [[TMP31:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP27]], i64 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_2]], <16 x float*> [[TMP31]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
 ; AVX512-NEXT:    [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX]], 48
 ; AVX512-NEXT:    [[PTR_IND_2:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 768
 ; AVX512-NEXT:    [[NEXT_GEP_3:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_2]]
-; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr float, float* [[PTR_IND_2]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
-; AVX512-NEXT:    [[TMP31:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_3]], i64 [[IDXPROM]]
-; AVX512-NEXT:    [[TMP32:%.*]] = bitcast float* [[TMP31]] to <16 x float>*
-; AVX512-NEXT:    [[WIDE_LOAD_3:%.*]] = load <16 x float>, <16 x float>* [[TMP32]], align 4, !alias.scope !2
-; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_3]], <16 x float*> [[TMP30]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
-; AVX512-NEXT:    [[TMP33:%.*]] = bitcast float* [[NEXT_GEP_3]] to <16 x float>*
-; AVX512-NEXT:    [[WIDE_LOAD15_3:%.*]] = load <16 x float>, <16 x float>* [[TMP33]], align 4, !alias.scope !9
-; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP30]], i64 1
-; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_3]], <16 x float*> [[TMP34]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr float, float* [[PTR_IND_2]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_3]], i64 [[IDXPROM]]
+; AVX512-NEXT:    [[TMP34:%.*]] = bitcast float* [[TMP33]] to <16 x float>*
+; AVX512-NEXT:    [[WIDE_LOAD_3:%.*]] = load <16 x float>, <16 x float>* [[TMP34]], align 4, !alias.scope !2
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_3]], <16 x float*> [[TMP32]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP35:%.*]] = bitcast float* [[NEXT_GEP_3]] to <16 x float>*
+; AVX512-NEXT:    [[WIDE_LOAD15_3:%.*]] = load <16 x float>, <16 x float>* [[TMP35]], align 4, !alias.scope !9
+; AVX512-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP32]], i64 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_3]], <16 x float*> [[TMP36]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
 ; AVX512-NEXT:    [[INDEX_NEXT_3:%.*]] = or i64 [[INDEX]], 64
 ; AVX512-NEXT:    [[PTR_IND_3:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 1024
 ; AVX512-NEXT:    [[NEXT_GEP_4:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_3]]
-; AVX512-NEXT:    [[TMP35:%.*]] = getelementptr float, float* [[PTR_IND_3]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
-; AVX512-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_4]], i64 [[IDXPROM]]
-; AVX512-NEXT:    [[TMP37:%.*]] = bitcast float* [[TMP36]] to <16 x float>*
-; AVX512-NEXT:    [[WIDE_LOAD_4:%.*]] = load <16 x float>, <16 x float>* [[TMP37]], align 4, !alias.scope !2
-; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_4]], <16 x float*> [[TMP35]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
-; AVX512-NEXT:    [[TMP38:%.*]] = bitcast float* [[NEXT_GEP_4]] to <16 x float>*
-; AVX512-NEXT:    [[WIDE_LOAD15_4:%.*]] = load <16 x float>, <16 x float>* [[TMP38]], align 4, !alias.scope !9
-; AVX512-NEXT:    [[TMP39:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP35]], i64 1
-; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_4]], <16 x float*> [[TMP39]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP37:%.*]] = getelementptr float, float* [[PTR_IND_3]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_4]], i64 [[IDXPROM]]
+; AVX512-NEXT:    [[TMP39:%.*]] = bitcast float* [[TMP38]] to <16 x float>*
+; AVX512-NEXT:    [[WIDE_LOAD_4:%.*]] = load <16 x float>, <16 x float>* [[TMP39]], align 4, !alias.scope !2
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_4]], <16 x float*> [[TMP37]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP40:%.*]] = bitcast float* [[NEXT_GEP_4]] to <16 x float>*
+; AVX512-NEXT:    [[WIDE_LOAD15_4:%.*]] = load <16 x float>, <16 x float>* [[TMP40]], align 4, !alias.scope !9
+; AVX512-NEXT:    [[TMP41:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP37]], i64 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_4]], <16 x float*> [[TMP41]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
 ; AVX512-NEXT:    [[INDEX_NEXT_4:%.*]] = or i64 [[INDEX]], 80
 ; AVX512-NEXT:    [[PTR_IND_4:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 1280
 ; AVX512-NEXT:    [[NEXT_GEP_5:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_4]]
-; AVX512-NEXT:    [[TMP40:%.*]] = getelementptr float, float* [[PTR_IND_4]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
-; AVX512-NEXT:    [[TMP41:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_5]], i64 [[IDXPROM]]
-; AVX512-NEXT:    [[TMP42:%.*]] = bitcast float* [[TMP41]] to <16 x float>*
-; AVX512-NEXT:    [[WIDE_LOAD_5:%.*]] = load <16 x float>, <16 x float>* [[TMP42]], align 4, !alias.scope !2
-; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_5]], <16 x float*> [[TMP40]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
-; AVX512-NEXT:    [[TMP43:%.*]] = bitcast float* [[NEXT_GEP_5]] to <16 x float>*
-; AVX512-NEXT:    [[WIDE_LOAD15_5:%.*]] = load <16 x float>, <16 x float>* [[TMP43]], align 4, !alias.scope !9
-; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP40]], i64 1
-; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_5]], <16 x float*> [[TMP44]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP42:%.*]] = getelementptr float, float* [[PTR_IND_4]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT:    [[TMP43:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_5]], i64 [[IDXPROM]]
+; AVX512-NEXT:    [[TMP44:%.*]] = bitcast float* [[TMP43]] to <16 x float>*
+; AVX512-NEXT:    [[WIDE_LOAD_5:%.*]] = load <16 x float>, <16 x float>* [[TMP44]], align 4, !alias.scope !2
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_5]], <16 x float*> [[TMP42]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP45:%.*]] = bitcast float* [[NEXT_GEP_5]] to <16 x float>*
+; AVX512-NEXT:    [[WIDE_LOAD15_5:%.*]] = load <16 x float>, <16 x float>* [[TMP45]], align 4, !alias.scope !9
+; AVX512-NEXT:    [[TMP46:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP42]], i64 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_5]], <16 x float*> [[TMP46]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
 ; AVX512-NEXT:    [[INDEX_NEXT_5:%.*]] = or i64 [[INDEX]], 96
 ; AVX512-NEXT:    [[PTR_IND_5:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 1536
 ; AVX512-NEXT:    [[NEXT_GEP_6:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_5]]
-; AVX512-NEXT:    [[TMP45:%.*]] = getelementptr float, float* [[PTR_IND_5]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
-; AVX512-NEXT:    [[TMP46:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_6]], i64 [[IDXPROM]]
-; AVX512-NEXT:    [[TMP47:%.*]] = bitcast float* [[TMP46]] to <16 x float>*
-; AVX512-NEXT:    [[WIDE_LOAD_6:%.*]] = load <16 x float>, <16 x float>* [[TMP47]], align 4, !alias.scope !2
-; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_6]], <16 x float*> [[TMP45]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
-; AVX512-NEXT:    [[TMP48:%.*]] = bitcast float* [[NEXT_GEP_6]] to <16 x float>*
-; AVX512-NEXT:    [[WIDE_LOAD15_6:%.*]] = load <16 x float>, <16 x float>* [[TMP48]], align 4, !alias.scope !9
-; AVX512-NEXT:    [[TMP49:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP45]], i64 1
-; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_6]], <16 x float*> [[TMP49]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP47:%.*]] = getelementptr float, float* [[PTR_IND_5]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT:    [[TMP48:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_6]], i64 [[IDXPROM]]
+; AVX512-NEXT:    [[TMP49:%.*]] = bitcast float* [[TMP48]] to <16 x float>*
+; AVX512-NEXT:    [[WIDE_LOAD_6:%.*]] = load <16 x float>, <16 x float>* [[TMP49]], align 4, !alias.scope !2
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_6]], <16 x float*> [[TMP47]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP50:%.*]] = bitcast float* [[NEXT_GEP_6]] to <16 x float>*
+; AVX512-NEXT:    [[WIDE_LOAD15_6:%.*]] = load <16 x float>, <16 x float>* [[TMP50]], align 4, !alias.scope !9
+; AVX512-NEXT:    [[TMP51:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP47]], i64 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_6]], <16 x float*> [[TMP51]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
 ; AVX512-NEXT:    [[INDEX_NEXT_6:%.*]] = or i64 [[INDEX]], 112
 ; AVX512-NEXT:    [[PTR_IND_6:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 1792
 ; AVX512-NEXT:    [[NEXT_GEP_7:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_6]]
-; AVX512-NEXT:    [[TMP50:%.*]] = getelementptr float, float* [[PTR_IND_6]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
-; AVX512-NEXT:    [[TMP51:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_7]], i64 [[IDXPROM]]
-; AVX512-NEXT:    [[TMP52:%.*]] = bitcast float* [[TMP51]] to <16 x float>*
-; AVX512-NEXT:    [[WIDE_LOAD_7:%.*]] = load <16 x float>, <16 x float>* [[TMP52]], align 4, !alias.scope !2
-; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_7]], <16 x float*> [[TMP50]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
-; AVX512-NEXT:    [[TMP53:%.*]] = bitcast float* [[NEXT_GEP_7]] to <16 x float>*
-; AVX512-NEXT:    [[WIDE_LOAD15_7:%.*]] = load <16 x float>, <16 x float>* [[TMP53]], align 4, !alias.scope !9
-; AVX512-NEXT:    [[TMP54:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP50]], i64 1
-; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_7]], <16 x float*> [[TMP54]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP52:%.*]] = getelementptr float, float* [[PTR_IND_6]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT:    [[TMP53:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_7]], i64 [[IDXPROM]]
+; AVX512-NEXT:    [[TMP54:%.*]] = bitcast float* [[TMP53]] to <16 x float>*
+; AVX512-NEXT:    [[WIDE_LOAD_7:%.*]] = load <16 x float>, <16 x float>* [[TMP54]], align 4, !alias.scope !2
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_7]], <16 x float*> [[TMP52]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP55:%.*]] = bitcast float* [[NEXT_GEP_7]] to <16 x float>*
+; AVX512-NEXT:    [[WIDE_LOAD15_7:%.*]] = load <16 x float>, <16 x float>* [[TMP55]], align 4, !alias.scope !9
+; AVX512-NEXT:    [[TMP56:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP52]], i64 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_7]], <16 x float*> [[TMP56]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
 ; AVX512-NEXT:    [[INDEX_NEXT_7]] = add nuw i64 [[INDEX]], 128
 ; AVX512-NEXT:    [[PTR_IND_7]] = getelementptr float, float* [[POINTER_PHI]], i64 2048
 ; AVX512-NEXT:    [[NITER_NEXT_7]] = add i64 [[NITER]], 8
@@ -1558,22 +1560,22 @@ define void @test_gather_not_profitable_pr48429(i32 %d, float* readonly %ptr, fl
 ; AVX512-NEXT:    [[INDEX_EPIL:%.*]] = phi i64 [ [[INDEX_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[INDEX_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ]
 ; AVX512-NEXT:    [[EPIL_ITER:%.*]] = phi i64 [ [[EPIL_ITER_NEXT:%.*]], [[VECTOR_BODY_EPIL]] ], [ 0, [[MIDDLE_BLOCK_UNR_LCSSA]] ]
 ; AVX512-NEXT:    [[NEXT_GEP_EPIL:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_EPIL]]
-; AVX512-NEXT:    [[TMP55:%.*]] = getelementptr float, float* [[POINTER_PHI_EPIL]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
-; AVX512-NEXT:    [[TMP56:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_EPIL]], i64 [[IDXPROM]]
-; AVX512-NEXT:    [[TMP57:%.*]] = bitcast float* [[TMP56]] to <16 x float>*
-; AVX512-NEXT:    [[WIDE_LOAD_EPIL:%.*]] = load <16 x float>, <16 x float>* [[TMP57]], align 4, !alias.scope !2
-; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_EPIL]], <16 x float*> [[TMP55]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
-; AVX512-NEXT:    [[TMP58:%.*]] = bitcast float* [[NEXT_GEP_EPIL]] to <16 x float>*
-; AVX512-NEXT:    [[WIDE_LOAD15_EPIL:%.*]] = load <16 x float>, <16 x float>* [[TMP58]], align 4, !alias.scope !9
-; AVX512-NEXT:    [[TMP59:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP55]], i64 1
-; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_EPIL]], <16 x float*> [[TMP59]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP57:%.*]] = getelementptr float, float* [[POINTER_PHI_EPIL]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT:    [[TMP58:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_EPIL]], i64 [[IDXPROM]]
+; AVX512-NEXT:    [[TMP59:%.*]] = bitcast float* [[TMP58]] to <16 x float>*
+; AVX512-NEXT:    [[WIDE_LOAD_EPIL:%.*]] = load <16 x float>, <16 x float>* [[TMP59]], align 4, !alias.scope !2
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_EPIL]], <16 x float*> [[TMP57]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
+; AVX512-NEXT:    [[TMP60:%.*]] = bitcast float* [[NEXT_GEP_EPIL]] to <16 x float>*
+; AVX512-NEXT:    [[WIDE_LOAD15_EPIL:%.*]] = load <16 x float>, <16 x float>* [[TMP60]], align 4, !alias.scope !9
+; AVX512-NEXT:    [[TMP61:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP57]], i64 1
+; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_EPIL]], <16 x float*> [[TMP61]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !5, !noalias !7
 ; AVX512-NEXT:    [[INDEX_NEXT_EPIL]] = add nuw i64 [[INDEX_EPIL]], 16
 ; AVX512-NEXT:    [[PTR_IND_EPIL]] = getelementptr float, float* [[POINTER_PHI_EPIL]], i64 256
 ; AVX512-NEXT:    [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
 ; AVX512-NEXT:    [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
 ; AVX512-NEXT:    br i1 [[EPIL_ITER_CMP_NOT]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY_EPIL]], !llvm.loop [[LOOP11:![0-9]+]]
 ; AVX512:       middle.block:
-; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER]]
 ; AVX512:       for.body.preheader:
 ; AVX512-NEXT:    [[PTR_ADDR_012_PH:%.*]] = phi float* [ [[PTR]], [[VECTOR_MEMCHECK]] ], [ [[PTR]], [[FOR_BODY_LR_PH]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
@@ -1583,11 +1585,11 @@ define void @test_gather_not_profitable_pr48429(i32 %d, float* readonly %ptr, fl
 ; AVX512-NEXT:    [[PTR_ADDR_012:%.*]] = phi float* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[PTR_ADDR_012_PH]], [[FOR_BODY_PREHEADER]] ]
 ; AVX512-NEXT:    [[DEST_ADDR_011:%.*]] = phi float* [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ], [ [[DEST_ADDR_011_PH]], [[FOR_BODY_PREHEADER]] ]
 ; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 [[IDXPROM]]
-; AVX512-NEXT:    [[TMP60:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; AVX512-NEXT:    store float [[TMP60]], float* [[DEST_ADDR_011]], align 4
-; AVX512-NEXT:    [[TMP61:%.*]] = load float, float* [[PTR_ADDR_012]], align 4
+; AVX512-NEXT:    [[TMP62:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; AVX512-NEXT:    store float [[TMP62]], float* [[DEST_ADDR_011]], align 4
+; AVX512-NEXT:    [[TMP63:%.*]] = load float, float* [[PTR_ADDR_012]], align 4
 ; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 1
-; AVX512-NEXT:    store float [[TMP61]], float* [[ARRAYIDX5]], align 4
+; AVX512-NEXT:    store float [[TMP63]], float* [[ARRAYIDX5]], align 4
 ; AVX512-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 1
 ; AVX512-NEXT:    [[ADD_PTR6]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 16
 ; AVX512-NEXT:    [[CMP_NOT:%.*]] = icmp eq float* [[INCDEC_PTR]], [[ADD_PTR]]
@@ -1604,23 +1606,25 @@ define void @test_gather_not_profitable_pr48429(i32 %d, float* readonly %ptr, fl
 ; FVW2:       for.body.lr.ph:
 ; FVW2-NEXT:    [[MUL:%.*]] = sub nsw i32 0, [[D]]
 ; FVW2-NEXT:    [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64
-; FVW2-NEXT:    [[TMP0:%.*]] = add nsw i64 [[IDX_EXT]], 4611686018427387903
-; FVW2-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 4611686018427387903
-; FVW2-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; FVW2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 3
+; FVW2-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[IDX_EXT]], 2
+; FVW2-NEXT:    [[TMP1:%.*]] = add nsw i64 [[TMP0]], -4
+; FVW2-NEXT:    [[TMP2:%.*]] = lshr exact i64 [[TMP1]], 2
+; FVW2-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; FVW2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 12
 ; FVW2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; FVW2:       vector.memcheck:
-; FVW2-NEXT:    [[TMP3:%.*]] = add nsw i64 [[IDX_EXT]], 4611686018427387903
-; FVW2-NEXT:    [[TMP4:%.*]] = and i64 [[TMP3]], 4611686018427387903
-; FVW2-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP3]], 4
-; FVW2-NEXT:    [[TMP6:%.*]] = or i64 [[TMP5]], 2
-; FVW2-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[DEST:%.*]], i64 [[TMP6]]
-; FVW2-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[TMP4]], 1
-; FVW2-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP7]]
+; FVW2-NEXT:    [[TMP4:%.*]] = shl nsw i64 [[IDX_EXT]], 2
+; FVW2-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP4]], -4
+; FVW2-NEXT:    [[TMP6:%.*]] = lshr exact i64 [[TMP5]], 2
+; FVW2-NEXT:    [[TMP7:%.*]] = shl nsw i64 [[TMP5]], 2
+; FVW2-NEXT:    [[TMP8:%.*]] = or i64 [[TMP7]], 2
+; FVW2-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[DEST:%.*]], i64 [[TMP8]]
+; FVW2-NEXT:    [[TMP9:%.*]] = add nuw nsw i64 [[TMP6]], 1
+; FVW2-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP9]]
 ; FVW2-NEXT:    [[SCEVGEP6:%.*]] = getelementptr float, float* [[PTR]], i64 [[IDXPROM]]
-; FVW2-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP4]], 1
-; FVW2-NEXT:    [[TMP9:%.*]] = sub i64 [[TMP8]], [[IDX_EXT]]
-; FVW2-NEXT:    [[SCEVGEP8:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP9]]
+; FVW2-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 1
+; FVW2-NEXT:    [[TMP11:%.*]] = sub i64 [[TMP10]], [[IDX_EXT]]
+; FVW2-NEXT:    [[SCEVGEP8:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP11]]
 ; FVW2-NEXT:    [[BOUND0:%.*]] = icmp ugt float* [[SCEVGEP4]], [[DEST]]
 ; FVW2-NEXT:    [[BOUND1:%.*]] = icmp ugt float* [[SCEVGEP]], [[PTR]]
 ; FVW2-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
@@ -1630,61 +1634,61 @@ define void @test_gather_not_profitable_pr48429(i32 %d, float* readonly %ptr, fl
 ; FVW2-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]]
 ; FVW2-NEXT:    br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_PH:%.*]]
 ; FVW2:       vector.ph:
-; FVW2-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775804
+; FVW2-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP3]], 9223372036854775804
 ; FVW2-NEXT:    [[IND_END:%.*]] = getelementptr float, float* [[PTR]], i64 [[N_VEC]]
-; FVW2-NEXT:    [[TMP10:%.*]] = shl i64 [[N_VEC]], 4
-; FVW2-NEXT:    [[IND_END14:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP10]]
+; FVW2-NEXT:    [[TMP12:%.*]] = shl i64 [[N_VEC]], 4
+; FVW2-NEXT:    [[IND_END14:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP12]]
 ; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FVW2:       vector.body:
 ; FVW2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; FVW2-NEXT:    [[NEXT_GEP:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX]]
-; FVW2-NEXT:    [[TMP11:%.*]] = shl i64 [[INDEX]], 4
-; FVW2-NEXT:    [[NEXT_GEP16:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP11]]
-; FVW2-NEXT:    [[TMP12:%.*]] = shl i64 [[INDEX]], 4
-; FVW2-NEXT:    [[TMP13:%.*]] = or i64 [[TMP12]], 16
-; FVW2-NEXT:    [[NEXT_GEP17:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP13]]
+; FVW2-NEXT:    [[TMP13:%.*]] = shl i64 [[INDEX]], 4
+; FVW2-NEXT:    [[NEXT_GEP16:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP13]]
 ; FVW2-NEXT:    [[TMP14:%.*]] = shl i64 [[INDEX]], 4
-; FVW2-NEXT:    [[TMP15:%.*]] = or i64 [[TMP14]], 32
-; FVW2-NEXT:    [[NEXT_GEP18:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP15]]
+; FVW2-NEXT:    [[TMP15:%.*]] = or i64 [[TMP14]], 16
+; FVW2-NEXT:    [[NEXT_GEP17:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP15]]
 ; FVW2-NEXT:    [[TMP16:%.*]] = shl i64 [[INDEX]], 4
-; FVW2-NEXT:    [[TMP17:%.*]] = or i64 [[TMP16]], 48
-; FVW2-NEXT:    [[NEXT_GEP19:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP17]]
-; FVW2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP]], i64 [[IDXPROM]]
-; FVW2-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP18]] to <2 x float>*
-; FVW2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP19]], align 4, !alias.scope !7
-; FVW2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP18]], i64 2
+; FVW2-NEXT:    [[TMP17:%.*]] = or i64 [[TMP16]], 32
+; FVW2-NEXT:    [[NEXT_GEP18:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP17]]
+; FVW2-NEXT:    [[TMP18:%.*]] = shl i64 [[INDEX]], 4
+; FVW2-NEXT:    [[TMP19:%.*]] = or i64 [[TMP18]], 48
+; FVW2-NEXT:    [[NEXT_GEP19:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP19]]
+; FVW2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP]], i64 [[IDXPROM]]
 ; FVW2-NEXT:    [[TMP21:%.*]] = bitcast float* [[TMP20]] to <2 x float>*
-; FVW2-NEXT:    [[WIDE_LOAD20:%.*]] = load <2 x float>, <2 x float>* [[TMP21]], align 4, !alias.scope !7
-; FVW2-NEXT:    [[TMP22:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i64 0
-; FVW2-NEXT:    store float [[TMP22]], float* [[NEXT_GEP16]], align 4, !alias.scope !10, !noalias !12
-; FVW2-NEXT:    [[TMP23:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i64 1
-; FVW2-NEXT:    store float [[TMP23]], float* [[NEXT_GEP17]], align 4, !alias.scope !10, !noalias !12
-; FVW2-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[WIDE_LOAD20]], i64 0
-; FVW2-NEXT:    store float [[TMP24]], float* [[NEXT_GEP18]], align 4, !alias.scope !10, !noalias !12
-; FVW2-NEXT:    [[TMP25:%.*]] = extractelement <2 x float> [[WIDE_LOAD20]], i64 1
-; FVW2-NEXT:    store float [[TMP25]], float* [[NEXT_GEP19]], align 4, !alias.scope !10, !noalias !12
-; FVW2-NEXT:    [[TMP26:%.*]] = bitcast float* [[NEXT_GEP]] to <2 x float>*
-; FVW2-NEXT:    [[WIDE_LOAD21:%.*]] = load <2 x float>, <2 x float>* [[TMP26]], align 4, !alias.scope !14
-; FVW2-NEXT:    [[TMP27:%.*]] = getelementptr float, float* [[NEXT_GEP]], i64 2
-; FVW2-NEXT:    [[TMP28:%.*]] = bitcast float* [[TMP27]] to <2 x float>*
-; FVW2-NEXT:    [[WIDE_LOAD22:%.*]] = load <2 x float>, <2 x float>* [[TMP28]], align 4, !alias.scope !14
-; FVW2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP16]], i64 1
-; FVW2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP17]], i64 1
-; FVW2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP18]], i64 1
-; FVW2-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP19]], i64 1
-; FVW2-NEXT:    [[TMP33:%.*]] = extractelement <2 x float> [[WIDE_LOAD21]], i64 0
-; FVW2-NEXT:    store float [[TMP33]], float* [[TMP29]], align 4, !alias.scope !10, !noalias !12
-; FVW2-NEXT:    [[TMP34:%.*]] = extractelement <2 x float> [[WIDE_LOAD21]], i64 1
-; FVW2-NEXT:    store float [[TMP34]], float* [[TMP30]], align 4, !alias.scope !10, !noalias !12
-; FVW2-NEXT:    [[TMP35:%.*]] = extractelement <2 x float> [[WIDE_LOAD22]], i64 0
+; FVW2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP21]], align 4, !alias.scope !7
+; FVW2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 2
+; FVW2-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP22]] to <2 x float>*
+; FVW2-NEXT:    [[WIDE_LOAD20:%.*]] = load <2 x float>, <2 x float>* [[TMP23]], align 4, !alias.scope !7
+; FVW2-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i64 0
+; FVW2-NEXT:    store float [[TMP24]], float* [[NEXT_GEP16]], align 4, !alias.scope !10, !noalias !12
+; FVW2-NEXT:    [[TMP25:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i64 1
+; FVW2-NEXT:    store float [[TMP25]], float* [[NEXT_GEP17]], align 4, !alias.scope !10, !noalias !12
+; FVW2-NEXT:    [[TMP26:%.*]] = extractelement <2 x float> [[WIDE_LOAD20]], i64 0
+; FVW2-NEXT:    store float [[TMP26]], float* [[NEXT_GEP18]], align 4, !alias.scope !10, !noalias !12
+; FVW2-NEXT:    [[TMP27:%.*]] = extractelement <2 x float> [[WIDE_LOAD20]], i64 1
+; FVW2-NEXT:    store float [[TMP27]], float* [[NEXT_GEP19]], align 4, !alias.scope !10, !noalias !12
+; FVW2-NEXT:    [[TMP28:%.*]] = bitcast float* [[NEXT_GEP]] to <2 x float>*
+; FVW2-NEXT:    [[WIDE_LOAD21:%.*]] = load <2 x float>, <2 x float>* [[TMP28]], align 4, !alias.scope !14
+; FVW2-NEXT:    [[TMP29:%.*]] = getelementptr float, float* [[NEXT_GEP]], i64 2
+; FVW2-NEXT:    [[TMP30:%.*]] = bitcast float* [[TMP29]] to <2 x float>*
+; FVW2-NEXT:    [[WIDE_LOAD22:%.*]] = load <2 x float>, <2 x float>* [[TMP30]], align 4, !alias.scope !14
+; FVW2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP16]], i64 1
+; FVW2-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP17]], i64 1
+; FVW2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP18]], i64 1
+; FVW2-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP19]], i64 1
+; FVW2-NEXT:    [[TMP35:%.*]] = extractelement <2 x float> [[WIDE_LOAD21]], i64 0
 ; FVW2-NEXT:    store float [[TMP35]], float* [[TMP31]], align 4, !alias.scope !10, !noalias !12
-; FVW2-NEXT:    [[TMP36:%.*]] = extractelement <2 x float> [[WIDE_LOAD22]], i64 1
+; FVW2-NEXT:    [[TMP36:%.*]] = extractelement <2 x float> [[WIDE_LOAD21]], i64 1
 ; FVW2-NEXT:    store float [[TMP36]], float* [[TMP32]], align 4, !alias.scope !10, !noalias !12
+; FVW2-NEXT:    [[TMP37:%.*]] = extractelement <2 x float> [[WIDE_LOAD22]], i64 0
+; FVW2-NEXT:    store float [[TMP37]], float* [[TMP33]], align 4, !alias.scope !10, !noalias !12
+; FVW2-NEXT:    [[TMP38:%.*]] = extractelement <2 x float> [[WIDE_LOAD22]], i64 1
+; FVW2-NEXT:    store float [[TMP38]], float* [[TMP34]], align 4, !alias.scope !10, !noalias !12
 ; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; FVW2-NEXT:    [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; FVW2-NEXT:    br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; FVW2-NEXT:    [[TMP39:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; FVW2-NEXT:    br i1 [[TMP39]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; FVW2:       middle.block:
-; FVW2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; FVW2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; FVW2-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER]]
 ; FVW2:       for.body.preheader:
 ; FVW2-NEXT:    [[PTR_ADDR_012_PH:%.*]] = phi float* [ [[PTR]], [[VECTOR_MEMCHECK]] ], [ [[PTR]], [[FOR_BODY_LR_PH]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
@@ -1694,11 +1698,11 @@ define void @test_gather_not_profitable_pr48429(i32 %d, float* readonly %ptr, fl
 ; FVW2-NEXT:    [[PTR_ADDR_012:%.*]] = phi float* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[PTR_ADDR_012_PH]], [[FOR_BODY_PREHEADER]] ]
 ; FVW2-NEXT:    [[DEST_ADDR_011:%.*]] = phi float* [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ], [ [[DEST_ADDR_011_PH]], [[FOR_BODY_PREHEADER]] ]
 ; FVW2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 [[IDXPROM]]
-; FVW2-NEXT:    [[TMP38:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; FVW2-NEXT:    store float [[TMP38]], float* [[DEST_ADDR_011]], align 4
-; FVW2-NEXT:    [[TMP39:%.*]] = load float, float* [[PTR_ADDR_012]], align 4
+; FVW2-NEXT:    [[TMP40:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; FVW2-NEXT:    store float [[TMP40]], float* [[DEST_ADDR_011]], align 4
+; FVW2-NEXT:    [[TMP41:%.*]] = load float, float* [[PTR_ADDR_012]], align 4
 ; FVW2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 1
-; FVW2-NEXT:    store float [[TMP39]], float* [[ARRAYIDX5]], align 4
+; FVW2-NEXT:    store float [[TMP41]], float* [[ARRAYIDX5]], align 4
 ; FVW2-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 1
 ; FVW2-NEXT:    [[ADD_PTR6]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 16
 ; FVW2-NEXT:    [[CMP_NOT:%.*]] = icmp eq float* [[INCDEC_PTR]], [[ADD_PTR]]

