[llvm] [SLP] Invariant loads cannot have a memory dependency on stores. (PR #167929)

David Stuttard via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 17 08:02:49 PST 2025


https://github.com/dstutt updated https://github.com/llvm/llvm-project/pull/167929

From 513c98602fe4cbed34eddb5d2cbfbf242ad52e24 Mon Sep 17 00:00:00 2001
From: Michael Bedy <Michael.Bedy at amd.com>
Date: Thu, 13 Nov 2025 13:49:14 -0500
Subject: [PATCH 1/3] [SLP] Invariant loads cannot have a memory dependency on
 stores.

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 19 ++++-
 .../AMDGPU/invariant-load-no-alias-store.ll   | 82 +++++++++++++++++++
 2 files changed, 100 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index cc53b0dd3577e..e67e607ee1632 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21623,6 +21623,17 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
       }
     }
 
+    // Helper to detect loads marked with !invariant.load metadata. Such loads
+    // are defined to read from memory that never changes for the lifetime of
+    // the program; any store to the same location would be UB. Therefore we
+    // can safely treat an invariant load and any store as non-aliasing for
+    // scheduling/dependency purposes and skip creating a dependency edge.
+    auto IsInvariantLoad = [](const Instruction *I) {
+      if (const auto *LI = dyn_cast<LoadInst>(I))
+        return LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr;
+      return false;
+    };
+
     // Handle the memory dependencies (if any).
     ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
     if (!NextLoadStore)
@@ -21636,10 +21647,15 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
     unsigned DistToSrc = 1;
     bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
 
+    if (IsInvariantLoad(SrcInst))
+      return; // Invariant load cannot have memory dependencies.
+
     for (ScheduleData *DepDest = NextLoadStore; DepDest;
          DepDest = DepDest->getNextLoadStore()) {
       assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
 
+      Instruction *DestInst = DepDest->getInst();
+
       // We have two limits to reduce the complexity:
       // 1) AliasedCheckLimit: It's a small limit to reduce calls to
       //    SLP->isAliased (which is the expensive part in this loop).
@@ -21648,7 +21664,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
       //    It's important for the loop break condition (see below) to
       //    check this limit even between two read-only instructions.
       if (DistToSrc >= MaxMemDepDistance ||
-          ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
+          (!IsInvariantLoad(DestInst) && // Cannot have memory deps.
+           (SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
            (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
             SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
 
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll
new file mode 100644
index 0000000000000..00bd0feb8c0fa
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes="function(slp-vectorizer)" -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 %s -S | FileCheck %s
+
+define void @test(ptr addrspace(1) %base, ptr addrspace(1) %otherA, ptr addrspace(1) %otherB) #0 {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr addrspace(1) [[BASE:%.*]], ptr addrspace(1) [[OTHERA:%.*]], ptr addrspace(1) [[OTHERB:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[P0:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 0
+; CHECK-NEXT:    [[A0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 0
+; CHECK-NEXT:    [[B0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[A0PTR]], align 2, !invariant.load [[META0:![0-9]+]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[B0PTR]], align 2, !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd reassoc <2 x half> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    store <2 x half> [[TMP2]], ptr addrspace(1) [[P0]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %p0 = getelementptr half, ptr addrspace(1) %base, i32 0
+  %p1 = getelementptr half, ptr addrspace(1) %base, i32 1
+  ; First pair of invariant loads from otherA and otherB.
+  %A0PTR = getelementptr half, ptr addrspace(1) %otherA, i32 0
+  %B0PTR = getelementptr half, ptr addrspace(1) %otherB, i32 0
+  %A0 = load half, ptr addrspace(1) %A0PTR, align 2, !invariant.load !0
+  %B0 = load half, ptr addrspace(1) %B0PTR, align 2, !invariant.load !0
+  %add0 = fadd reassoc half %A0, %B0
+  store half %add0, ptr addrspace(1) %p0, align 2
+  %A1PTR = getelementptr half, ptr addrspace(1) %otherA, i32 1
+  %B1PTR = getelementptr half, ptr addrspace(1) %otherB, i32 1
+  %A1 = load half, ptr addrspace(1) %A1PTR, align 2, !invariant.load !0
+  %B1 = load half, ptr addrspace(1) %B1PTR, align 2, !invariant.load !0
+  %add1 = fadd reassoc half %A1, %B1
+  store half %add1, ptr addrspace(1) %p1, align 2
+  ret void
+}
+
+
+define void @aliastest(ptr addrspace(1) %base, ptr addrspace(1) %otherA, ptr addrspace(1) %otherB) #0 {
+; CHECK-LABEL: define void @aliastest(
+; CHECK-SAME: ptr addrspace(1) [[BASE:%.*]], ptr addrspace(1) [[OTHERA:%.*]], ptr addrspace(1) [[OTHERB:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[P0:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 0
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 1
+; CHECK-NEXT:    [[A0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 0
+; CHECK-NEXT:    [[B0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 0
+; CHECK-NEXT:    [[A0:%.*]] = load half, ptr addrspace(1) [[A0PTR]], align 2
+; CHECK-NEXT:    [[B0:%.*]] = load half, ptr addrspace(1) [[B0PTR]], align 2
+; CHECK-NEXT:    [[ADD0:%.*]] = fadd reassoc half [[A0]], [[B0]]
+; CHECK-NEXT:    store half [[ADD0]], ptr addrspace(1) [[P0]], align 2
+; CHECK-NEXT:    [[A1PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 1
+; CHECK-NEXT:    [[B1PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 1
+; CHECK-NEXT:    [[A1:%.*]] = load half, ptr addrspace(1) [[A1PTR]], align 2
+; CHECK-NEXT:    [[B1:%.*]] = load half, ptr addrspace(1) [[B1PTR]], align 2
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd reassoc half [[A1]], [[B1]]
+; CHECK-NEXT:    store half [[ADD1]], ptr addrspace(1) [[P1]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %p0 = getelementptr half, ptr addrspace(1) %base, i32 0
+  %p1 = getelementptr half, ptr addrspace(1) %base, i32 1
+  ; First pair of plain (non-invariant) loads from otherA and otherB.
+  %A0PTR = getelementptr half, ptr addrspace(1) %otherA, i32 0
+  %B0PTR = getelementptr half, ptr addrspace(1) %otherB, i32 0
+  %A0 = load half, ptr addrspace(1) %A0PTR, align 2
+  %B0 = load half, ptr addrspace(1) %B0PTR, align 2
+  %add0 = fadd reassoc half %A0, %B0
+  store half %add0, ptr addrspace(1) %p0, align 2
+  %A1PTR = getelementptr half, ptr addrspace(1) %otherA, i32 1
+  %B1PTR = getelementptr half, ptr addrspace(1) %otherB, i32 1
+  %A1 = load half, ptr addrspace(1) %A1PTR, align 2
+  %B1 = load half, ptr addrspace(1) %B1PTR, align 2
+  %add1 = fadd reassoc half %A1, %B1
+  store half %add1, ptr addrspace(1) %p1, align 2
+  ret void
+}
+
+
+attributes #0 = { nounwind }
+
+!0 = !{}
+;.
+; CHECK: [[META0]] = !{}
+;.
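
For reference, the !invariant.load guarantee this patch relies on can be
illustrated with a minimal hand-written IR fragment (not part of the patch):
per the LangRef, the loaded location may never change while it is
dereferenceable, so any store to it would be UB, and no scheduling dependency
edge between an invariant load and a store is needed:

  define half @example(ptr addrspace(1) %p, ptr addrspace(1) %q, half %v) {
    %a = load half, ptr addrspace(1) %p, align 2, !invariant.load !0
    store half %v, ptr addrspace(1) %q, align 2 ; cannot clobber %p's location
    %b = load half, ptr addrspace(1) %p, align 2, !invariant.load !0
    ; %a and %b are guaranteed equal; a store to %p here would be UB
    %s = fadd half %a, %b
    ret half %s
  }
  !0 = !{}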

From 4d014b8e1e3f004248eaefbd50353a4147bcd4cb Mon Sep 17 00:00:00 2001
From: Michael Bedy <Michael.Bedy at amd.com>
Date: Thu, 13 Nov 2025 22:10:06 -0500
Subject: [PATCH 2/3] Update location; exclude simple invariant loads from the
 memory-instruction scheduler data list. Add a volatile test.

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 30 ++++++--------
 .../AMDGPU/invariant-load-no-alias-store.ll   | 39 +++++++++++++++++++
 2 files changed, 51 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e67e607ee1632..8cf5f9e133b6e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21412,7 +21412,18 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
            "new ScheduleData already in scheduling region");
     SD->init(SchedulingRegionID, I);
 
+    auto CanIgnoreLoad = [](const Instruction *I) {
+      const LoadInst *LI = dyn_cast<LoadInst>(I);
+      // If there is a simple load marked as invariant, we can ignore it.
+      // But, in the (unlikely) case of a non-simple invariant load,
+      // we should not ignore it.
+      return LI && LI->isSimple() &&
+             LI->getMetadata(LLVMContext::MD_invariant_load);
+    };
+
     if (I->mayReadOrWriteMemory() &&
+        // A simple invariant load does not depend on other memory accesses.
+        !CanIgnoreLoad(I) &&
         (!isa<IntrinsicInst>(I) ||
          (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
           cast<IntrinsicInst>(I)->getIntrinsicID() !=
@@ -21623,17 +21634,6 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
       }
     }
 
-    // Helper to detect loads marked with !invariant.load metadata. Such loads
-    // are defined to read from memory that never changes for the lifetime of
-    // the program; any store to the same location would be UB. Therefore we
-    // can safely treat an invariant load and any store as non-aliasing for
-    // scheduling/dependency purposes and skip creating a dependency edge.
-    auto IsInvariantLoad = [](const Instruction *I) {
-      if (const auto *LI = dyn_cast<LoadInst>(I))
-        return LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr;
-      return false;
-    };
-
     // Handle the memory dependencies (if any).
     ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
     if (!NextLoadStore)
@@ -21647,15 +21647,10 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
     unsigned DistToSrc = 1;
     bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
 
-    if (IsInvariantLoad(SrcInst))
-      return; // Invariant load cannot have memory dependencies.
-
     for (ScheduleData *DepDest = NextLoadStore; DepDest;
          DepDest = DepDest->getNextLoadStore()) {
       assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
 
-      Instruction *DestInst = DepDest->getInst();
-
       // We have two limits to reduce the complexity:
       // 1) AliasedCheckLimit: It's a small limit to reduce calls to
       //    SLP->isAliased (which is the expensive part in this loop).
@@ -21664,8 +21659,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
       //    It's important for the loop break condition (see below) to
       //    check this limit even between two read-only instructions.
       if (DistToSrc >= MaxMemDepDistance ||
-          (!IsInvariantLoad(DestInst) && // Cannot have memory deps.
-           (SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
+          ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
            (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
             SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
 
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll
index 00bd0feb8c0fa..87537c05573ae 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll
@@ -73,6 +73,45 @@ entry:
   ret void
 }
 
+define void @voltest(ptr addrspace(1) %base, ptr addrspace(1) %otherA, ptr addrspace(1) %otherB) #0 {
+; CHECK-LABEL: define void @voltest(
+; CHECK-SAME: ptr addrspace(1) [[BASE:%.*]], ptr addrspace(1) [[OTHERA:%.*]], ptr addrspace(1) [[OTHERB:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[P0:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 0
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 1
+; CHECK-NEXT:    [[A0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 0
+; CHECK-NEXT:    [[B0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 0
+; CHECK-NEXT:    [[A0:%.*]] = load volatile half, ptr addrspace(1) [[A0PTR]], align 2, !invariant.load [[META0]]
+; CHECK-NEXT:    [[B0:%.*]] = load volatile half, ptr addrspace(1) [[B0PTR]], align 2, !invariant.load [[META0]]
+; CHECK-NEXT:    [[ADD0:%.*]] = fadd reassoc half [[A0]], [[B0]]
+; CHECK-NEXT:    store half [[ADD0]], ptr addrspace(1) [[P0]], align 2
+; CHECK-NEXT:    [[A1PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 1
+; CHECK-NEXT:    [[B1PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 1
+; CHECK-NEXT:    [[A1:%.*]] = load volatile half, ptr addrspace(1) [[A1PTR]], align 2, !invariant.load [[META0]]
+; CHECK-NEXT:    [[B1:%.*]] = load volatile half, ptr addrspace(1) [[B1PTR]], align 2, !invariant.load [[META0]]
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd reassoc half [[A1]], [[B1]]
+; CHECK-NEXT:    store half [[ADD1]], ptr addrspace(1) [[P1]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %p0 = getelementptr half, ptr addrspace(1) %base, i32 0
+  %p1 = getelementptr half, ptr addrspace(1) %base, i32 1
+  ; First pair of volatile invariant loads from otherA and otherB.
+  %A0PTR = getelementptr half, ptr addrspace(1) %otherA, i32 0
+  %B0PTR = getelementptr half, ptr addrspace(1) %otherB, i32 0
+  %A0 = load volatile half, ptr addrspace(1) %A0PTR, align 2, !invariant.load !0
+  %B0 = load volatile half, ptr addrspace(1) %B0PTR, align 2, !invariant.load !0
+  %add0 = fadd reassoc half %A0, %B0
+  store half %add0, ptr addrspace(1) %p0, align 2
+  %A1PTR = getelementptr half, ptr addrspace(1) %otherA, i32 1
+  %B1PTR = getelementptr half, ptr addrspace(1) %otherB, i32 1
+  %A1 = load volatile half, ptr addrspace(1) %A1PTR, align 2, !invariant.load !0
+  %B1 = load volatile half, ptr addrspace(1) %B1PTR, align 2, !invariant.load !0
+  %add1 = fadd reassoc half %A1, %B1
+  store half %add1, ptr addrspace(1) %p1, align 2
+  ret void
+}
+
 
 attributes #0 = { nounwind }
 

From d6fdd3d2f09812d3d2146b400da7bf6eb5804d96 Mon Sep 17 00:00:00 2001
From: Michael Bedy <Michael.Bedy at amd.com>
Date: Fri, 14 Nov 2025 14:38:50 -0500
Subject: [PATCH 3/3] Fix nit.

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8cf5f9e133b6e..8231f5234eac2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21413,7 +21413,7 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
     SD->init(SchedulingRegionID, I);
 
     auto CanIgnoreLoad = [](const Instruction *I) {
-      const LoadInst *LI = dyn_cast<LoadInst>(I);
+      const auto *LI = dyn_cast<LoadInst>(I);
       // If there is a simple load marked as invariant, we can ignore it.
       // But, in the (unlikely) case of a non-simple invariant load,
       // we should not ignore it.
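
As a minimal standalone sketch (the free-function name below is ours, not the
patch's), the final predicate uses only public LLVM APIs: LoadInst::isSimple()
rules out volatile and atomic loads, which is why the volatile loads in
@voltest above stay scalar and keep their ordering constraints.

  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/LLVMContext.h"
  using namespace llvm;

  // A load may be dropped from the memory-dependency chain only if it is
  // both simple (neither volatile nor atomic) and marked !invariant.load.
  static bool isIgnorableInvariantLoad(const Instruction *I) {
    const auto *LI = dyn_cast<LoadInst>(I);
    return LI && LI->isSimple() &&
           LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr;
  }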


