[llvm] 4d847bf - [LV] Do not add load to group if it moves across conflicting store.

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Fri Jul 7 03:06:53 PDT 2023


Author: Florian Hahn
Date: 2023-07-07T11:06:30+01:00
New Revision: 4d847bf4d06589ff90648f3857ae73a12950d16c

URL: https://github.com/llvm/llvm-project/commit/4d847bf4d06589ff90648f3857ae73a12950d16c
DIFF: https://github.com/llvm/llvm-project/commit/4d847bf4d06589ff90648f3857ae73a12950d16c.diff

LOG: [LV] Do not add load to group if it moves across conflicting store.

This patch prevents the formation of invalid load groups, in which a load
would need to be moved across a conflicting store.

Once we hit a store that conflicts with a load that already belongs to an
interleave group, we need to stop adding earlier loads to the group, as
adding them would force hoisting the group's existing loads across the
conflicting store.

To detect such cases, add a new CompletedLoadGroups set, which is used
to keep track of load groups to which no earlier loads can be added.
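As a sanity check of the mechanics, here is a small self-contained C++
simulation of the patched bottom-up scan (a minimal sketch, not the actual
LLVM sources; Access, conflicts() and the toy group bookkeeping are
stand-ins for the dependence checker and InterleavedAccessInfo state):

  #include <cstdio>
  #include <iterator>
  #include <map>
  #include <set>
  #include <string>
  #include <vector>

  // Toy memory access: a name and whether it writes memory.
  struct Access {
    std::string Name;
    bool IsStore;
  };

  // Stand-in for the dependence checker: in this toy, only the pair
  // (S, L3) is flagged as a conflict, modelling a dependence that is
  // visible for one group member but does not block grouping the others.
  static bool conflicts(const Access &A, const Access &B) {
    return A.Name == "S" && B.Name == "L3";
  }

  int main() {
    // Program order mirrors the shape of pr63602: load L1, store S,
    // loads L2 and L3 (all notionally part of one stride-3 group).
    std::vector<Access> Prog = {{"L1", false}, {"S", true},
                                {"L2", false}, {"L3", false}};
    std::map<std::string, int> GroupOf; // load name -> toy group id
    std::set<int> CompletedLoadGroups;  // groups taking no new members
    int NextGroup = 0;

    // Bottom-up over B, as in analyzeInterleaving.
    for (auto BI = Prog.rbegin(); BI != Prog.rend(); ++BI) {
      if (BI->IsStore)
        continue; // store grouping is irrelevant to this sketch
      if (!GroupOf.count(BI->Name))
        GroupOf[BI->Name] = NextGroup++; // B seeds a fresh load group
      int G = GroupOf[BI->Name];
      if (CompletedLoadGroups.count(G)) {
        // The fix: a completed group accepts no earlier loads.
        std::printf("%s skipped: group %d is complete\n",
                    BI->Name.c_str(), G);
        continue;
      }
      // A runs over the accesses above B.
      for (auto AI = std::next(BI); AI != Prog.rend(); ++AI) {
        if (conflicts(*AI, *BI)) {
          // Hoisting B's group above store A would be invalid, so the
          // group must stop growing upwards.
          CompletedLoadGroups.insert(G);
          std::printf("%s conflicts with %s: group %d completed\n",
                      AI->Name.c_str(), BI->Name.c_str(), G);
          break;
        }
        if (!AI->IsStore && !GroupOf.count(AI->Name)) {
          GroupOf[AI->Name] = G; // earlier load A joins B's group
          std::printf("%s joins group %d\n", AI->Name.c_str(), G);
        }
      }
    }
    return 0;
  }

With the CompletedLoadGroups check in place, L1 never joins group 0.
Dropping the check lets L1 join the group when B is L2, recreating the
invalid grouping this patch prevents.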

Fixes https://github.com/llvm/llvm-project/issues/63602

Reviewed By: anna

Differential Revision: https://reviews.llvm.org/D154309

Added: 
    

Modified: 
    llvm/lib/Analysis/VectorUtils.cpp
    llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 5da21e75a20cbe..04387221cf8e01 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -1107,6 +1107,8 @@ void InterleavedAccessInfo::analyzeInterleaving(
   SmallSetVector<InterleaveGroup<Instruction> *, 4> StoreGroups;
   // Holds all interleaved load groups temporarily.
   SmallSetVector<InterleaveGroup<Instruction> *, 4> LoadGroups;
+  // Groups added to this set cannot have new members added.
+  SmallPtrSet<InterleaveGroup<Instruction> *, 4> CompletedLoadGroups;
 
   // Search in bottom-up program order for pairs of accesses (A and B) that can
   // form interleaved load or store groups. In the algorithm below, access A
@@ -1139,8 +1141,12 @@ void InterleavedAccessInfo::analyzeInterleaving(
       }
       if (B->mayWriteToMemory())
         StoreGroups.insert(Group);
-      else
+      else {
+        // Skip B if no new instructions can be added to its load group.
+        if (CompletedLoadGroups.contains(Group))
+          continue;
         LoadGroups.insert(Group);
+      }
     }
 
     for (auto AI = std::next(BI); AI != E; ++AI) {
@@ -1181,6 +1187,18 @@ void InterleavedAccessInfo::analyzeInterleaving(
           StoreGroups.remove(StoreGroup);
           releaseGroup(StoreGroup);
         }
+        // If B is a load and part of an interleave group, no earlier loads can
+        // be added to B's interleave group, because this would mean the load B
+        // would need to be moved across store A. Mark the interleave group as
+        // complete.
+        if (isInterleaved(B) && isa<LoadInst>(B)) {
+          InterleaveGroup<Instruction> *LoadGroup = getInterleaveGroup(B);
+
+          LLVM_DEBUG(dbgs() << "LV: Marking interleave group for " << *B
+                            << " as complete.\n");
+
+          CompletedLoadGroups.insert(LoadGroup);
+        }
 
         // If a dependence exists and A is not already in a group (or it was
         // and we just released it), B might be hoisted above A (if B is a

diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll
index 0ca233a16da61e..36fd6a297be5ba 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll
@@ -28,38 +28,41 @@ define void @pr63602_1(ptr %arr) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX2]], 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[TMP6]], 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 -2
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[STRIDED_VEC4]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 0
 ; CHECK-NEXT:    store i32 [[TMP14]], ptr [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[STRIDED_VEC4]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 1
 ; CHECK-NEXT:    store i32 [[TMP15]], ptr [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[STRIDED_VEC4]], i32 2
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 2
 ; CHECK-NEXT:    store i32 [[TMP16]], ptr [[TMP12]], align 4
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[STRIDED_VEC4]], i32 3
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 3
 ; CHECK-NEXT:    store i32 [[TMP17]], ptr [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]]
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[TMP18]], i32 0
-; CHECK-NEXT:    store i32 [[TMP19]], ptr [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i32> [[TMP18]], i32 1
-; CHECK-NEXT:    store i32 [[TMP20]], ptr [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[TMP18]], i32 2
-; CHECK-NEXT:    store i32 [[TMP21]], ptr [[TMP12]], align 4
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[TMP18]], i32 3
-; CHECK-NEXT:    store i32 [[TMP22]], ptr [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = add nuw nsw i64 [[TMP6]], 2
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC3:%.*]] = load <12 x i32>, ptr [[TMP20]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <12 x i32> [[WIDE_VEC3]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <12 x i32> [[WIDE_VEC3]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT:    [[TMP21:%.*]] = add <4 x i32> [[STRIDED_VEC5]], [[STRIDED_VEC4]]
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[TMP21]], i32 0
+; CHECK-NEXT:    store i32 [[TMP22]], ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[TMP21]], i32 1
+; CHECK-NEXT:    store i32 [[TMP23]], ptr [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[TMP21]], i32 2
+; CHECK-NEXT:    store i32 [[TMP24]], ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[TMP21]], i32 3
+; CHECK-NEXT:    store i32 [[TMP25]], ptr [[TMP13]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 17, 16
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 49, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 52, [[MIDDLE_BLOCK]] ], [ 4, [[ENTRY]] ]
@@ -81,7 +84,7 @@ define void @pr63602_1(ptr %arr) {
 ; CHECK-NEXT:    store i32 [[ADD]], ptr [[GEP_IV_2]], align 4
 ; CHECK-NEXT:    [[IV_2_NEXT]] = add nuw nsw i64 [[IV_2]], 3
 ; CHECK-NEXT:    [[ICMP:%.*]] = icmp ugt i64 [[IV_2]], 50
-; CHECK-NEXT:    br i1 [[ICMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -110,4 +113,3 @@ loop:
 exit:
   ret void
 }
-

