[llvm] e85fd3c - Revert "[LV] Complete load groups and release store groups in presence of dependency"

Wed Jul 26 12:07:40 PDT 2023

Author: Anna Thomas
Date: 2023-07-26T15:07:26-04:00
New Revision: e85fd3cbdd68a80fba855f0dbf8bc0a4abe791df

URL: https://github.com/llvm/llvm-project/commit/e85fd3cbdd68a80fba855f0dbf8bc0a4abe791df
DIFF: https://github.com/llvm/llvm-project/commit/e85fd3cbdd68a80fba855f0dbf8bc0a4abe791df.diff

LOG: Revert "[LV] Complete load groups and release store groups in presence of dependency"

This reverts commit eaf6117f3388615f51198e47c0d6be0252729508 (D155520).
There's an ASAN build failure that needs investigation.

Added: 
    

Modified: 
    llvm/lib/Analysis/VectorUtils.cpp
    llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll
    llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 8cbe0bd5fc091b..87f0bb6904776c 100644

--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -1158,11 +1158,14 @@ void InterleavedAccessInfo::analyzeInterleaving(
         LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B
                           << '\n');
         GroupB = createInterleaveGroup(B, DesB.Stride, DesB.Alignment);
-        if (B->mayWriteToMemory())
-          StoreGroups.insert(GroupB);
-        else
-          LoadGroups.insert(GroupB);
+      } else if (CompletedLoadGroups.contains(GroupB)) {
+        // Skip B if no new instructions can be added to its load group.
+        continue;
       }
+      if (B->mayWriteToMemory())
+        StoreGroups.insert(GroupB);
+      else
+        LoadGroups.insert(GroupB);
     }
 
     for (auto AI = std::next(BI); AI != E; ++AI) {
@@ -1188,59 +1191,38 @@ void InterleavedAccessInfo::analyzeInterleaving(
       // Because accesses (2) and (3) are dependent, we can group (2) with (1)
       // but not with (4). If we did, the dependent access (3) would be within
       // the boundaries of the (2, 4) group.
-      //
-      auto DependentMember = [&](InterleaveGroup<Instruction> *Group,
-                                 StrideEntry *A) -> Instruction * {
-        for (uint32_t Index = 0; Index < Group->getFactor(); ++Index) {
-          Instruction *MemberOfGroupB = Group->getMember(Index);
-          if (MemberOfGroupB && !canReorderMemAccessesForInterleavedGroups(
-                                    A, &*AccessStrideInfo.find(MemberOfGroupB)))
-            return MemberOfGroupB;
+      if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI)) {
+        // If a dependence exists and A is already in a group, we know that A
+        // must be a store since A precedes B and WAR dependences are allowed.
+        // Thus, A would be sunk below B. We release A's group to prevent this
+        // illegal code motion. A will then be free to form another group with
+        // instructions that precede it.
+        if (isInterleaved(A)) {
+          InterleaveGroup<Instruction> *StoreGroup = getInterleaveGroup(A);
+
+          LLVM_DEBUG(dbgs() << "LV: Invalidated store group due to "
+                               "dependence between " << *A << " and "<< *B << '\n');
+
+          StoreGroups.remove(StoreGroup);
+          releaseGroup(StoreGroup);
         }
-        return nullptr;
-      };
-
-      if (A->mayWriteToMemory()) { // Otherwise dependencies are tolerable.
-        Instruction *DependentInst = nullptr;
-        // If GroupB is a load group, we have to compare AI against all
-        // members of GroupB because if any load within GroupB has a dependency
-        // on AI, we need to mark GroupB as complete and also release the
-        // store GroupA (if A belongs to one). The former prevents incorrect
-        // hoisting of load B above store A while the latter prevents incorrect
-        // sinking of store A below load B.
-        if (GroupB && LoadGroups.contains(GroupB))
-          DependentInst = DependentMember(GroupB, &*AI);
-        else if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI))
-          DependentInst = B;
-
-        if (DependentInst) {
-          auto GroupA = getInterleaveGroup(A);
-          // A has a store dependence on B (or on some load within GroupB) and
-          // is part of a store group. Release A's group to prevent illegal
-          // sinking of A below B. A will then be free to form another group
-          // with instructions that precede it.
-          if (GroupA && StoreGroups.contains(GroupA)) {
-            LLVM_DEBUG(dbgs() << "LV: Invalidated store group due to "
-                                 "dependence between "
-                              << *A << " and " << *DependentInst << '\n');
-            StoreGroups.remove(GroupA);
-            releaseGroup(GroupA);
-          }
-          // If B is a load and part of an interleave group, no earlier loads
-          // can be added to B's interleave group, because this would mean the
-          // DependentInst would move across store A. Mark the interleave group
-          // as complete.
-          if (GroupB && LoadGroups.contains(GroupB)) {
-            LLVM_DEBUG(dbgs() << "LV: Marking interleave group for " << *B
-                              << " as complete.\n");
-            CompletedLoadGroups.insert(GroupB);
-          }
+        // If B is a load and part of an interleave group, no earlier loads can
+        // be added to B's interleave group, because this would mean the load B
+        // would need to be moved across store A. Mark the interleave group as
+        // complete.
+        if (GroupB && isa<LoadInst>(B)) {
+          LLVM_DEBUG(dbgs() << "LV: Marking interleave group for " << *B
+                            << " as complete.\n");
+
+          CompletedLoadGroups.insert(GroupB);
         }
-      }
-      if (CompletedLoadGroups.contains(GroupB)) {
-        // Skip trying to add A to B, continue to look for other conflicting A's
-        // in groups to be released.
-        continue;
+
+        // If a dependence exists and A is not already in a group (or it was
+        // and we just released it), B might be hoisted above A (if B is a
+        // load) or another store might be sunk below A (if B is a store). In
+        // either case, we can't add additional instructions to B's group. B
+        // will only form a group with instructions that it precedes.
+        break;
       }
 
       // At this point, we've checked for illegal code motion. If either A or B

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll
index b3eb55bc27a505..0280a8b3ab839a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll
@@ -121,6 +121,7 @@ exit:
 ; compare against the obstructing stores (%l2 versus the store) there is no
 ; dependency. However, the other load in %l2's interleave group (%l3) does
 ; obstruct with the store.
+; FIXME: The test case is currently mis-compiled.
 define void @pr63602_2(ptr %arr) {
 ; CHECK-LABEL: define void @pr63602_2
 ; CHECK-SAME: (ptr [[ARR:%.*]]) {
@@ -139,64 +140,40 @@ define void @pr63602_2(ptr %arr) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[OFFSET_IDX2:%.*]] = add i64 1, [[TMP5]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX2]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX2]], 3
-; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX2]], 6
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[OFFSET_IDX2]], 9
-; CHECK-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 4
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[TMP6]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 -2
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 0
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[STRIDED_VEC4]], i32 0
+; CHECK-NEXT:    store i32 [[TMP14]], ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[STRIDED_VEC4]], i32 1
+; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[STRIDED_VEC4]], i32 2
+; CHECK-NEXT:    store i32 [[TMP16]], ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[STRIDED_VEC4]], i32 3
 ; CHECK-NEXT:    store i32 [[TMP17]], ptr [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 1
-; CHECK-NEXT:    store i32 [[TMP18]], ptr [[TMP14]], align 4
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 2
-; CHECK-NEXT:    store i32 [[TMP19]], ptr [[TMP15]], align 4
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 3
-; CHECK-NEXT:    store i32 [[TMP20]], ptr [[TMP16]], align 4
-; CHECK-NEXT:    [[TMP21:%.*]] = add nuw nsw i64 [[TMP6]], 2
-; CHECK-NEXT:    [[TMP22:%.*]] = add nuw nsw i64 [[TMP7]], 2
-; CHECK-NEXT:    [[TMP23:%.*]] = add nuw nsw i64 [[TMP8]], 2
-; CHECK-NEXT:    [[TMP24:%.*]] = add nuw nsw i64 [[TMP9]], 2
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP21]]
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP22]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP23]]
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP24]]
-; CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP30:%.*]] = load i32, ptr [[TMP14]], align 4
-; CHECK-NEXT:    [[TMP31:%.*]] = load i32, ptr [[TMP15]], align 4
-; CHECK-NEXT:    [[TMP32:%.*]] = load i32, ptr [[TMP16]], align 4
-; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <4 x i32> poison, i32 [[TMP29]], i32 0
-; CHECK-NEXT:    [[TMP34:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP30]], i32 1
-; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <4 x i32> [[TMP34]], i32 [[TMP31]], i32 2
-; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i32> [[TMP35]], i32 [[TMP32]], i32 3
-; CHECK-NEXT:    [[TMP37:%.*]] = load i32, ptr [[TMP25]], align 4
-; CHECK-NEXT:    [[TMP38:%.*]] = load i32, ptr [[TMP26]], align 4
-; CHECK-NEXT:    [[TMP39:%.*]] = load i32, ptr [[TMP27]], align 4
-; CHECK-NEXT:    [[TMP40:%.*]] = load i32, ptr [[TMP28]], align 4
-; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x i32> poison, i32 [[TMP37]], i32 0
-; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP38]], i32 1
-; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[TMP39]], i32 2
-; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[TMP40]], i32 3
-; CHECK-NEXT:    [[TMP45:%.*]] = add <4 x i32> [[TMP36]], [[TMP44]]
-; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <4 x i32> [[TMP45]], i32 0
-; CHECK-NEXT:    store i32 [[TMP46]], ptr [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <4 x i32> [[TMP45]], i32 1
-; CHECK-NEXT:    store i32 [[TMP47]], ptr [[TMP14]], align 4
-; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <4 x i32> [[TMP45]], i32 2
-; CHECK-NEXT:    store i32 [[TMP48]], ptr [[TMP15]], align 4
-; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <4 x i32> [[TMP45]], i32 3
-; CHECK-NEXT:    store i32 [[TMP49]], ptr [[TMP16]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]]
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[TMP18]], i32 0
+; CHECK-NEXT:    store i32 [[TMP19]], ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i32> [[TMP18]], i32 1
+; CHECK-NEXT:    store i32 [[TMP20]], ptr [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[TMP18]], i32 2
+; CHECK-NEXT:    store i32 [[TMP21]], ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[TMP18]], i32 3
+; CHECK-NEXT:    store i32 [[TMP22]], ptr [[TMP13]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT:    br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br label [[SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 17, 16
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 49, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 52, [[MIDDLE_BLOCK]] ], [ 4, [[ENTRY]] ]
@@ -218,7 +195,7 @@ define void @pr63602_2(ptr %arr) {
 ; CHECK-NEXT:    store i32 [[ADD]], ptr [[GEP_IV_2]], align 4
 ; CHECK-NEXT:    [[IV_2_NEXT]] = add nuw nsw i64 [[IV_2]], 3
 ; CHECK-NEXT:    [[ICMP:%.*]] = icmp ugt i64 [[IV_2]], 50
-; CHECK-NEXT:    br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[ICMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll
index 1452675fdc72e4..bf53833a22ce0c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll
@@ -3,9 +3,15 @@
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128-ni:1-p2:32:8:8:32-ni:2"
 target triple = "x86_64-apple-macos"
 
-; %l2 load and the preceeding store has a dependency. We should not sink
+; This is currently miscompiled.
+; %l2 load and the preceeding store has a dependency. However, we currently sink
 ; that store into the last store (by creating an interleaved store group). This
-; means the loaded %l2 will have incorrect value.
+; means the loaded %l2 has incorrect value.
+; We do not release this store group correctly because the next interleave group
+; chosen compares only the memory access of last load in program (%l3) against the dependent store location
+; (%gep.iv.1.plus.2) and they are 
diff erent, thereby incorrectly assuming no
+; dependency. We need to compare against all loads in that interleaved group
+; (%l2 is part of it).
 define void @avoid_sinking_store_across_load(ptr %arr) {
 ; CHECK-LABEL: define void @avoid_sinking_store_across_load
 ; CHECK-SAME: (ptr [[ARR:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -22,28 +28,26 @@ define void @avoid_sinking_store_across_load(ptr %arr) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 -2
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[TMP4]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[ARR]], <4 x i64> [[VEC_IND2]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 2, i64 2, i64 2, i64 2>
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[ARR]], <4 x i64> [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i32> [[STRIDED_VEC]], <i32 25, i32 25, i32 25, i32 25>
+; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i32> [[STRIDED_VEC5]], <i32 25, i32 25, i32 25, i32 25>
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP8]], <4 x ptr> [[TMP7]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x ptr> [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[TMP9]], i32 0
-; CHECK-NEXT:    [[WIDE_VEC4:%.*]] = load <12 x i32>, ptr [[TMP10]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <12 x i32> [[WIDE_VEC4]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <12 x i32> [[WIDE_VEC4]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-; CHECK-NEXT:    [[TMP11:%.*]] = add <4 x i32> [[STRIDED_VEC6]], [[STRIDED_VEC5]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP11]], <4 x ptr> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP9]], <4 x ptr> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 12, i64 12, i64 12, i64 12>
 ; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i64> [[VEC_IND2]], <i64 12, i64 12, i64 12, i64 12>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br label [[SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 17, 16
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 49, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 52, [[MIDDLE_BLOCK]] ], [ 4, [[ENTRY]] ]
@@ -66,7 +70,7 @@ define void @avoid_sinking_store_across_load(ptr %arr) {
 ; CHECK-NEXT:    store i32 [[ADD]], ptr [[GEP_IV_2]], align 4
 ; CHECK-NEXT:    [[IV_2_NEXT]] = add nuw nsw i64 [[IV_2]], 3
 ; CHECK-NEXT:    [[ICMP:%.*]] = icmp ugt i64 [[IV_2]], 50
-; CHECK-NEXT:    br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[ICMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;