[llvm] [DA] Cache delinearization results. NFCI. (PR #164379)

Tue Oct 21 02:14:05 PDT 2025

https://github.com/sjoerdmeijer created https://github.com/llvm/llvm-project/pull/164379

An instruction can appear in multiple source-destination dependency pairs. When it does, delinearization is requested and recomputed for the same instruction again and again. Instead, cache the delinearization results and query the cache before recomputing them. I made this observation while going through debug logs for DA, and wanted to find out whether you like this idea before I try to measure whether it has a compile-time benefit, which is of course the reason to do this.

I was just looking at this example:

```
loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %subscript.0 = mul i64 %mk, %i
  %subscript.1 = add i64 %subscript.0, %kk.inc
  %idx.0 = getelementptr i8, ptr %a, i64 %subscript.0 ; a[-k * i]
  %idx.1 = getelementptr i8, ptr %a, i64 %subscript.1 ; a[-k * i + (2 * k + 1)]
  store i8 42, ptr %idx.0
  store i8 42, ptr %idx.1
  %i.next = add i64 %i, 1
  %cond.exit = icmp eq i64 %i.next, 3
  br i1 %cond.exit, label %exit, label %loop
```

and noticed that we first delinearize this:

```
Src:  store i8 42, ptr %idx.0, align 1 --> Dst:  store i8 42, ptr %idx.0, align 1
  da analyze -     SrcSCEV = {%a,+,(-1 * %k)}<%loop>
    DstSCEV = {%a,+,(-1 * %k)}<%loop>

GEP to delinearize:   %idx.0 = getelementptr i8, ptr %a, i64 %subscript.0
```

then this:

```
Src:  store i8 42, ptr %idx.0, align 1 --> Dst:  store i8 42, ptr %idx.1, align 1
  da analyze -     SrcSCEV = {%a,+,(-1 * %k)}<%loop>
    DstSCEV = {(1 + (2 * %k) + %a),+,(-1 * %k)}<%loop>

GEP to delinearize:   %idx.0 = getelementptr i8, ptr %a, i64 %subscript.0
```

and then this:

```
Src:  store i8 42, ptr %idx.1, align 1 --> Dst:  store i8 42, ptr %idx.1, align 1
  da analyze -     SrcSCEV = {(1 + (2 * %k) + %a),+,(-1 * %k)}<%loop>
    DstSCEV = {(1 + (2 * %k) + %a),+,(-1 * %k)}<%loop>

GEP to delinearize:   %idx.1 = getelementptr i8, ptr %a, i64 %subscript.1
```

With this change, we will cache the src and dst subscripts in the first call:

```
Src:  store i8 42, ptr %idx.0, align 1 --> Dst:  store i8 42, ptr %idx.0, align 1
  da analyze -     SrcSCEV = {%a,+,(-1 * %k)}<%loop>
    DstSCEV = {%a,+,(-1 * %k)}<%loop>
  Cached Src subscripts
  Cached Dst subscripts
```

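In the second call, the src subscripts are already cached but the dst subscripts are not, so both are still recomputed (the cache is only used when both src and dst hit) and only the dst result is newly cached: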

```
Src:  store i8 42, ptr %idx.0, align 1 --> Dst:  store i8 42, ptr %idx.1, align 1
  da analyze -     SrcSCEV = {%a,+,(-1 * %k)}<%loop>
    DstSCEV = {(1 + (2 * %k) + %a),+,(-1 * %k)}<%loop>
  Cached Dst subscripts
```

and the third call has a cache hit for both the dst and src:

```
Src:  store i8 42, ptr %idx.1, align 1 --> Dst:  store i8 42, ptr %idx.1, align 1
  da analyze -     SrcSCEV = {(1 + (2 * %k) + %a),+,(-1 * %k)}<%loop>
    DstSCEV = {(1 + (2 * %k) + %a),+,(-1 * %k)}<%loop>
  Delinearization cache hit for both Src and Dst
```

>From f56dd8217c1be10ba615433f03bae5cb8ab25da2 Mon Sep 17 00:00:00 2001
From: Sjoerd Meijer <smeijer at nvidia.com>
Date: Tue, 21 Oct 2025 01:45:27 -0700
Subject: [PATCH] [DA] Cache delinearization results. NFCI.

An instruction can appear in multiple source-destination dependency
pairs. If this is the case, delinearization is requested and recomputed
for the same instruction again and again. Instead, cache the
delinearization and query the cache first before computing it. I made this
observation while going through debug logs for DA, and wanted to test
whether you like this idea or not before I try to measure whether this
has a compile-time benefit, which is of course the reason to do this.
---
 .../llvm/Analysis/DependenceAnalysis.h        |  6 ++++
 llvm/lib/Analysis/DependenceAnalysis.cpp      | 36 ++++++++++++++++---
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/Analysis/DependenceAnalysis.h b/llvm/include/llvm/Analysis/DependenceAnalysis.h
index 18a8f8aabb44a..04fa9ad0774bd 100644
--- a/llvm/include/llvm/Analysis/DependenceAnalysis.h
+++ b/llvm/include/llvm/Analysis/DependenceAnalysis.h
@@ -420,6 +420,12 @@ class DependenceInfo {
   Function *F;
   SmallVector<const SCEVPredicate *, 4> Assumptions;
 
+  /// Cache for delinearized subscripts to avoid recomputation.
+  /// Maps (Instruction, Loop, AccessFn) -> Subscripts
+  DenseMap<std::tuple<Instruction *, Loop *, const SCEV *>,
+           SmallVector<const SCEV *, 4>>
+      DelinearizationCache;
+
   /// Subscript - This private struct represents a pair of subscripts from
   /// a pair of potentially multi-dimensional array references. We use a
   /// vector of them to guide subscript partitioning.
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp
index 805b6820e1e1c..7e413c65a71a6 100644
--- a/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -3463,11 +3463,37 @@ bool DependenceInfo::tryDelinearize(Instruction *Src, Instruction *Dst,
 
   SmallVector<const SCEV *, 4> SrcSubscripts, DstSubscripts;
 
-  if (!tryDelinearizeFixedSize(Src, Dst, SrcAccessFn, DstAccessFn,
-                               SrcSubscripts, DstSubscripts) &&
-      !tryDelinearizeParametricSize(Src, Dst, SrcAccessFn, DstAccessFn,
-                                    SrcSubscripts, DstSubscripts))
-    return false;
+  // Check cache for both Src and Dst subscripts
+  auto SrcCacheKey = std::make_tuple(Src, SrcLoop, SrcAccessFn);
+  auto DstCacheKey = std::make_tuple(Dst, DstLoop, DstAccessFn);
+  auto SrcCacheIt = DelinearizationCache.find(SrcCacheKey);
+  auto DstCacheIt = DelinearizationCache.find(DstCacheKey);
+  bool SrcCached = (SrcCacheIt != DelinearizationCache.end());
+  bool DstCached = (DstCacheIt != DelinearizationCache.end());
+
+  if (SrcCached && DstCached) {
+    // Both are cached - use cached values and skip delinearization
+    SrcSubscripts = SrcCacheIt->second;
+    DstSubscripts = DstCacheIt->second;
+    LLVM_DEBUG(dbgs() << "  Delinearization cache hit for both Src and Dst\n");
+  } else {
+    // At least one is not cached - need to compute both
+    if (!tryDelinearizeFixedSize(Src, Dst, SrcAccessFn, DstAccessFn,
+                                 SrcSubscripts, DstSubscripts) &&
+        !tryDelinearizeParametricSize(Src, Dst, SrcAccessFn, DstAccessFn,
+                                      SrcSubscripts, DstSubscripts))
+      return false;
+
+    // Cache the results
+    if (!SrcCached) {
+      DelinearizationCache[SrcCacheKey] = SrcSubscripts;
+      LLVM_DEBUG(dbgs() << "  Cached Src subscripts\n");
+    }
+    if (!DstCached) {
+      DelinearizationCache[DstCacheKey] = DstSubscripts;
+      LLVM_DEBUG(dbgs() << "  Cached Dst subscripts\n");
+    }
+  }
 
   assert(isLoopInvariant(SrcBase, SrcLoop) &&
          isLoopInvariant(DstBase, DstLoop) &&