[llvm] [DA] Add batch delinearization support for improved precision (PR #170519)

Wed Dec 3 09:43:14 PST 2025

https://github.com/sebpop created https://github.com/llvm/llvm-project/pull/170519

This patch adds support for batch delinearization in DependenceAnalysis, similar to how Polly processes delinearization. Instead of analyzing pairs of memory accesses independently, this approach:

1. Collects all memory accesses in the function, grouped by base pointer.
2. For each base pointer, collects delinearization terms from ALL accesses.
3. Computes array dimensions once using all available terms.
4. Caches the results for use during pairwise dependence analysis.

This leads to better precision because more terms are available when inferring array dimensions, especially for parametric arrays where dimension information may be spread across multiple accesses.

From 546777fa8b29add3fa019a5244dff0e7e6766f2d Mon Sep 17 00:00:00 2001
From: Sebastian Pop <spop at nvidia.com>
Date: Tue, 2 Dec 2025 13:47:05 -0600
Subject: [PATCH] [DA] Add batch delinearization support for improved precision

This patch adds support for batch delinearization in DependenceAnalysis,
similar to how Polly processes delinearization. Instead of analyzing pairs
of memory accesses independently, this approach:

1. Collects all memory accesses in the function, grouped by base pointer.
2. For each base pointer, collects delinearization terms from ALL accesses.
3. Computes array dimensions once using all available terms.
4. Caches the results for use during pairwise dependence analysis.

This leads to better precision because more terms are available when
inferring array dimensions, especially for parametric arrays where
dimension information may be spread across multiple accesses.
---
 .../llvm/Analysis/DependenceAnalysis.h        |  31 +++
 llvm/lib/Analysis/DDG.cpp                     |   1 +
 llvm/lib/Analysis/DependenceAnalysis.cpp      | 217 +++++++++++++++++-
 llvm/lib/Transforms/Scalar/LoopFuse.cpp       |   1 +
 .../lib/Transforms/Scalar/LoopInterchange.cpp |   1 +
 .../Scalar/LoopUnrollAndJamPass.cpp           |   1 +
 llvm/test/Analysis/DDG/basic-loopnest.ll      |   2 -
 .../BatchDelinearization.ll                   | 147 ++++++++++++
 llvm/test/Transforms/LICM/lnicm.ll            |   3 -
 .../loop-interchange-optimization-remarks.ll  |  29 +--
 .../LoopInterchange/outer-dependency-lte.ll   |   2 +-
 11 files changed, 409 insertions(+), 26 deletions(-)
 create mode 100644 llvm/test/Analysis/DependenceAnalysis/BatchDelinearization.ll

diff --git a/llvm/include/llvm/Analysis/DependenceAnalysis.h b/llvm/include/llvm/Analysis/DependenceAnalysis.h
index 6dec24fc9f104..a07ee7edf5c83 100644
--- a/llvm/include/llvm/Analysis/DependenceAnalysis.h
+++ b/llvm/include/llvm/Analysis/DependenceAnalysis.h
@@ -355,6 +355,14 @@ class DependenceInfo {
 
   Function *getFunction() const { return F; }
 
+  /// precomputeDelinearization - Pre-compute delinearization information for
+  /// all memory accesses in the function. This collects all load/store
+  /// instructions, groups them by base pointer, and computes array dimensions
+  /// using terms from all accesses to each base pointer.
+  /// This approach (similar to Polly's) can provide better precision than
+  /// pairwise delinearization.
+  LLVM_ABI void precomputeDelinearization();
+
 private:
   AAResults *AA;
   ScalarEvolution *SE;
@@ -388,6 +396,24 @@ class DependenceInfo {
     unsigned char DirSet;
   };
 
+  /// DelinearizationCache - Cache for batch delinearization results.
+  /// When analyzing multiple memory accesses to the same base pointer,
+  /// caching allows us to compute array dimensions once using all accesses,
+  /// leading to better precision (similar to Polly's approach).
+  struct DelinearizationCache {
+    /// Map from base pointer to computed array dimension sizes.
+    DenseMap<const SCEVUnknown *, SmallVector<const SCEV *, 4>> ArraySizes;
+    /// Map from instruction to pre-computed subscripts.
+    DenseMap<const Instruction *, SmallVector<const SCEV *, 4>> Subscripts;
+    /// Element size for the array (used for validation).
+    DenseMap<const SCEVUnknown *, const SCEV *> ElementSizes;
+    /// Flag indicating whether the cache has been populated.
+    bool IsPopulated = false;
+  };
+
+  /// Cache for batch delinearization results.
+  DelinearizationCache DelinCache;
+
   /// Returns true if two loops have the Same iteration Space and Depth. To be
   /// more specific, two loops have SameSD if they are in the same nesting
   /// depth and have the same backedge count. SameSD stands for Same iteration
@@ -773,6 +799,11 @@ class DependenceInfo {
                                SmallVectorImpl<const SCEV *> &SrcSubscripts,
                                SmallVectorImpl<const SCEV *> &DstSubscripts);
 
+  /// populateDelinearizationCache - Collects all load/store instructions in
+  /// the function, groups them by base pointer, and computes array dimensions
+  /// for each base pointer using terms from all accesses.
+  void populateDelinearizationCache();
+
   /// checkSubscript - Helper function for checkSrcSubscript and
   /// checkDstSubscript to avoid duplicate code
   bool checkSubscript(const SCEV *Expr, const Loop *LoopNest,
diff --git a/llvm/lib/Analysis/DDG.cpp b/llvm/lib/Analysis/DDG.cpp
index 0907a7fb021fc..d6eda7cfece7a 100644
--- a/llvm/lib/Analysis/DDG.cpp
+++ b/llvm/lib/Analysis/DDG.cpp
@@ -309,6 +309,7 @@ DDGAnalysis::Result DDGAnalysis::run(Loop &L, LoopAnalysisManager &AM,
                                      LoopStandardAnalysisResults &AR) {
   Function *F = L.getHeader()->getParent();
   DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI);
+  DI.precomputeDelinearization();
   return std::make_unique<DataDependenceGraph>(L, AR.LI, DI);
 }
 AnalysisKey DDGAnalysis::Key;
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp
index fe07b7edb6713..d9fa257c19d8b 100644
--- a/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -3255,6 +3255,21 @@ bool DependenceInfo::tryDelinearizeFixedSize(
            "expected src and dst scev unknowns to be equal");
   });
 
+  // Try to use cached subscripts from batch delinearization.
+  if (DelinCache.IsPopulated) {
+    auto SrcSubIt = DelinCache.Subscripts.find(Src);
+    auto DstSubIt = DelinCache.Subscripts.find(Dst);
+    if (SrcSubIt != DelinCache.Subscripts.end() &&
+        DstSubIt != DelinCache.Subscripts.end() &&
+        SrcSubIt->second.size() >= 2 && DstSubIt->second.size() >= 2 &&
+        SrcSubIt->second.size() == DstSubIt->second.size()) {
+      SrcSubscripts = SrcSubIt->second;
+      DstSubscripts = DstSubIt->second;
+      LLVM_DEBUG(dbgs() << "Using cached fixed-size delinearization results\n");
+      return true;
+    }
+  }
+
   const SCEV *ElemSize = SE->getElementSize(Src);
   assert(ElemSize == SE->getElementSize(Dst) && "Different element sizes");
   SmallVector<const SCEV *, 4> SrcSizes, DstSizes;
@@ -3328,16 +3343,80 @@ bool DependenceInfo::tryDelinearizeParametricSize(
   if (!SrcAR || !DstAR || !SrcAR->isAffine() || !DstAR->isAffine())
     return false;
 
+  SmallVector<const SCEV *, 4> Sizes;
+
+  // Try to use cached results from batch delinearization.
+  // This provides better precision by using terms from all accesses.
+  if (DelinCache.IsPopulated) {
+    auto SizesIt = DelinCache.ArraySizes.find(SrcBase);
+    if (SizesIt != DelinCache.ArraySizes.end()) {
+      // Check element size compatibility.
+      auto ElemSizeIt = DelinCache.ElementSizes.find(SrcBase);
+      if (ElemSizeIt != DelinCache.ElementSizes.end() &&
+          ElemSizeIt->second == ElementSize) {
+        Sizes = SizesIt->second;
+
+        // Try to use pre-computed subscripts if available.
+        auto SrcSubIt = DelinCache.Subscripts.find(Src);
+        auto DstSubIt = DelinCache.Subscripts.find(Dst);
+        if (SrcSubIt != DelinCache.Subscripts.end() &&
+            DstSubIt != DelinCache.Subscripts.end()) {
+          SrcSubscripts = SrcSubIt->second;
+          DstSubscripts = DstSubIt->second;
+
+          if (SrcSubscripts.size() >= 2 && DstSubscripts.size() >= 2 &&
+              SrcSubscripts.size() == DstSubscripts.size()) {
+            LLVM_DEBUG(dbgs() << "Using cached delinearization results\n");
+
+            // Validate the cached subscripts.
+            if (!DisableDelinearizationChecks)
+              if (!validateDelinearizationResult(*SE, Sizes, SrcSubscripts,
+                                                 SrcPtr) ||
+                  !validateDelinearizationResult(*SE, Sizes, DstSubscripts,
+                                                 DstPtr))
+                return false;
+
+            return true;
+          }
+        }
+
+        // Cache had sizes but no usable pre-computed subscripts for these
+        // instructions (missing, or mismatched in dimension count).
+        // Recompute subscripts using the cached sizes.
+        LLVM_DEBUG(dbgs() << "Using cached array sizes for delinearization\n");
+        SrcSubscripts.clear();
+        DstSubscripts.clear();
+        computeAccessFunctions(*SE, SrcAR, SrcSubscripts, Sizes);
+        computeAccessFunctions(*SE, DstAR, DstSubscripts, Sizes);
+
+        if (SrcSubscripts.size() >= 2 && DstSubscripts.size() >= 2 &&
+            SrcSubscripts.size() == DstSubscripts.size()) {
+          if (!DisableDelinearizationChecks)
+            if (!validateDelinearizationResult(*SE, Sizes, SrcSubscripts,
+                                               SrcPtr) ||
+                !validateDelinearizationResult(*SE, Sizes, DstSubscripts,
+                                               DstPtr))
+              return false;
+
+          return true;
+        }
+      }
+    }
+  }
+
+  // Fall back to pairwise delinearization.
   // First step: collect parametric terms in both array references.
   SmallVector<const SCEV *, 4> Terms;
   collectParametricTerms(*SE, SrcAR, Terms);
   collectParametricTerms(*SE, DstAR, Terms);
 
   // Second step: find subscript sizes.
-  SmallVector<const SCEV *, 4> Sizes;
+  Sizes.clear();
   findArrayDimensions(*SE, Terms, Sizes, ElementSize);
 
   // Third step: compute the access functions for each subscript.
+  SrcSubscripts.clear();
+  DstSubscripts.clear();
   computeAccessFunctions(*SE, SrcAR, SrcSubscripts, Sizes);
   computeAccessFunctions(*SE, DstAR, DstSubscripts, Sizes);
 
@@ -3687,3 +3766,139 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
 
   return std::make_unique<FullDependence>(std::move(Result));
 }
+
+//===----------------------------------------------------------------------===//
+// Batch Delinearization Support
+//===----------------------------------------------------------------------===//
+
+void DependenceInfo::precomputeDelinearization() {
+  populateDelinearizationCache();
+}
+
+void DependenceInfo::populateDelinearizationCache() {
+  if (DelinCache.IsPopulated)
+    return;
+
+  DelinCache.IsPopulated = true;
+
+  // Step 1: Collect all memory accesses grouped by base pointer.
+  // Map from base pointer to list of (Instruction, AccessFunction) pairs.
+  DenseMap<const SCEVUnknown *,
+           SmallVector<std::pair<Instruction *, const SCEV *>, 4>>
+      AccessesByBase;
+
+  for (Instruction &I : instructions(*F)) {
+    if (!isLoadOrStore(&I))
+      continue;
+
+    Value *Ptr = getLoadStorePointerOperand(&I);
+    Loop *L = LI->getLoopFor(I.getParent());
+    const SCEV *AccessFn = SE->getSCEVAtScope(Ptr, L);
+    const SCEVUnknown *Base =
+        dyn_cast<SCEVUnknown>(SE->getPointerBase(AccessFn));
+
+    if (!Base)
+      continue;
+
+    // Only consider accesses where the base is loop invariant.
+    if (L && !SE->isLoopInvariant(Base, L))
+      continue;
+
+    AccessesByBase[Base].push_back({&I, AccessFn});
+  }
+
+  // Step 2: For each base pointer, collect terms from ALL accesses and
+  // compute array dimensions once.
+  for (auto &Entry : AccessesByBase) {
+    const SCEVUnknown *Base = Entry.first;
+    auto &Accesses = Entry.second;
+
+    // Skip if there's only one access - no benefit from batch processing.
+    if (Accesses.size() < 2)
+      continue;
+
+    // Determine element size - use the smallest among all accesses.
+    const SCEV *ElementSize = nullptr;
+    for (auto &Access : Accesses) {
+      const SCEV *EltSize = SE->getElementSize(Access.first);
+      if (!ElementSize)
+        ElementSize = EltSize;
+      else if (SE->isKnownPredicate(ICmpInst::ICMP_ULT, EltSize, ElementSize))
+        ElementSize = EltSize;
+    }
+
+    if (!ElementSize)
+      continue;
+
+    DelinCache.ElementSizes[Base] = ElementSize;
+
+    // Collect parametric terms from all accesses to this base.
+    SmallVector<const SCEV *, 8> Terms;
+    for (auto &Access : Accesses) {
+      const SCEV *AccessFn = Access.second;
+      const SCEV *OffsetSCEV = SE->getMinusSCEV(AccessFn, Base);
+      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OffsetSCEV);
+      if (AR && AR->isAffine())
+        collectParametricTerms(*SE, AR, Terms);
+    }
+
+    // Find array dimensions using all collected terms.
+    SmallVector<const SCEV *, 4> Sizes;
+    findArrayDimensions(*SE, Terms, Sizes, ElementSize);
+
+    // Skip if we couldn't determine dimensions.
+    if (Sizes.size() < 2)
+      continue;
+
+    DelinCache.ArraySizes[Base] = Sizes;
+
+    // Pre-compute subscripts for each access using parametric sizes.
+    for (auto &Access : Accesses) {
+      Instruction *Inst = Access.first;
+      const SCEV *AccessFn = Access.second;
+      const SCEV *OffsetSCEV = SE->getMinusSCEV(AccessFn, Base);
+      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OffsetSCEV);
+
+      if (!AR || !AR->isAffine())
+        continue;
+
+      SmallVector<const SCEV *, 4> Subscripts;
+      computeAccessFunctions(*SE, AR, Subscripts, Sizes);
+
+      if (Subscripts.size() >= 2)
+        DelinCache.Subscripts[Inst] = std::move(Subscripts);
+    }
+  }
+
+  // Step 3: Try fixed-size array delinearization for accesses not yet cached.
+  // This handles arrays with known compile-time dimensions.
+  for (auto &Entry : AccessesByBase) {
+    auto &Accesses = Entry.second;
+
+    for (auto &Access : Accesses) {
+      Instruction *Inst = Access.first;
+
+      // Skip if already cached from parametric delinearization.
+      if (DelinCache.Subscripts.count(Inst))
+        continue;
+
+      const SCEV *AccessFn = Access.second;
+      const SCEV *ElemSize = SE->getElementSize(Inst);
+      SmallVector<const SCEV *, 4> Subscripts, Sizes;
+
+      if (delinearizeFixedSizeArray(*SE, SE->removePointerBase(AccessFn),
+                                    Subscripts, Sizes, ElemSize) &&
+          Subscripts.size() >= 2) {
+        DelinCache.Subscripts[Inst] = std::move(Subscripts);
+      }
+    }
+  }
+
+  LLVM_DEBUG({
+    dbgs() << "Batch delinearization cache populated:\n";
+    dbgs() << "  Base pointers with cached dimensions: "
+           << DelinCache.ArraySizes.size() << "\n";
+    dbgs() << "  Instructions with cached subscripts: "
+           << DelinCache.Subscripts.size() << "\n";
+  });
+}
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index 9ffa602416b05..521c1fadf4561 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -2142,6 +2142,7 @@ PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) {
   auto &LI = AM.getResult<LoopAnalysis>(F);
   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
   auto &DI = AM.getResult<DependenceAnalysis>(F);
+  DI.precomputeDelinearization();
   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
   auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 330b4abb9942f..c3e44b7ecd701 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -2140,6 +2140,7 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
   });
 
   DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
+  DI.precomputeDelinearization();
   if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN))
     return PreservedAnalyses::all();
   U.markLoopNestChanged(true);
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
index 4fe74c7c3bbcd..a3e00826c31c5 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -458,6 +458,7 @@ PreservedAnalyses LoopUnrollAndJamPass::run(LoopNest &LN,
   Function &F = *LN.getParent();
 
   DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
+  DI.precomputeDelinearization();
   OptimizationRemarkEmitter ORE(&F);
 
   bool AnyLoopRemoved = false;
diff --git a/llvm/test/Analysis/DDG/basic-loopnest.ll b/llvm/test/Analysis/DDG/basic-loopnest.ll
index 75efff570048b..61003298438f6 100644
--- a/llvm/test/Analysis/DDG/basic-loopnest.ll
+++ b/llvm/test/Analysis/DDG/basic-loopnest.ll
@@ -1,7 +1,5 @@
 ; RUN: opt < %s -disable-output "-passes=print<ddg>" 2>&1 | FileCheck %s
 
-; XFAIL: *
-; At the moment, DependenceAnalysis cannot infer `n` to be positive.
 
 
 ; CHECK-LABEL: 'DDG' for loop 'test1.for.cond1.preheader':
diff --git a/llvm/test/Analysis/DependenceAnalysis/BatchDelinearization.ll b/llvm/test/Analysis/DependenceAnalysis/BatchDelinearization.ll
new file mode 100644
index 0000000000000..27ef4c6db3f27
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/BatchDelinearization.ll
@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -disable-output "-passes=print<da>" -aa-pipeline=basic-aa 2>&1 \
+; RUN: | FileCheck %s
+
+; Test case for batch delinearization. When multiple accesses to the same
+; base pointer are analyzed together, terms from all accesses are collected
+; to determine array dimensions, leading to better precision.
+;
+; This test has three accesses to array A:
+;   A[i*m + j]      (in the read)
+;   A[i*m + j]      (in the write)
+;   A[(i+1)*m + j]  (third access that provides additional context)
+;
+; The third access helps provide more terms for delinearization,
+; which can improve precision when analyzing the first two accesses.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Three accesses to the same 2D array A[n][m].
+; Batch delinearization collects terms from all accesses.
+define void @batch_delin_test(i64 %n, i64 %m, ptr nocapture %A) {
+; CHECK-LABEL: 'batch_delin_test'
+; CHECK-NEXT:  Src: %load1 = load double, ptr %arrayidx1, align 8 --> Dst: %load1 = load double, ptr %arrayidx1, align 8
+; CHECK-NEXT:    da analyze - input [* *]!
+; CHECK-NEXT:  Src: %load1 = load double, ptr %arrayidx1, align 8 --> Dst: store double %add, ptr %arrayidx1, align 8
+; CHECK-NEXT:    da analyze - anti [* *|<]!
+; CHECK-NEXT:  Src: %load1 = load double, ptr %arrayidx1, align 8 --> Dst: %load2 = load double, ptr %arrayidx2, align 8
+; CHECK-NEXT:    da analyze - input [<> *]!
+; CHECK-NEXT:  Src: store double %add, ptr %arrayidx1, align 8 --> Dst: store double %add, ptr %arrayidx1, align 8
+; CHECK-NEXT:    da analyze - output [* *]!
+; CHECK-NEXT:  Src: store double %add, ptr %arrayidx1, align 8 --> Dst: %load2 = load double, ptr %arrayidx2, align 8
+; CHECK-NEXT:    da analyze - flow [<> *]!
+; CHECK-NEXT:  Src: %load2 = load double, ptr %arrayidx2, align 8 --> Dst: %load2 = load double, ptr %arrayidx2, align 8
+; CHECK-NEXT:    da analyze - input [* *]!
+;
+entry:
+  %cmp1 = icmp sgt i64 %n, 0
+  %cmp2 = icmp sgt i64 %m, 0
+  %cond = and i1 %cmp1, %cmp2
+  br i1 %cond, label %loop.i.preheader, label %exit
+
+loop.i.preheader:
+  br label %loop.i
+
+loop.i:
+  %i = phi i64 [ 0, %loop.i.preheader ], [ %i.next, %loop.i.latch ]
+  br label %loop.j
+
+loop.j:
+  %j = phi i64 [ 0, %loop.i ], [ %j.next, %loop.j ]
+  ; Compute linear index: i*m + j
+  %mul1 = mul nsw i64 %i, %m
+  %idx1 = add nsw i64 %mul1, %j
+  %arrayidx1 = getelementptr inbounds double, ptr %A, i64 %idx1
+  ; First access: load A[i*m + j]
+  %load1 = load double, ptr %arrayidx1, align 8
+  %add = fadd double %load1, 1.0
+  ; Second access: store A[i*m + j]
+  store double %add, ptr %arrayidx1, align 8
+  ; Third access at a different index: load A[(i+1)*m + j]
+  ; This provides additional terms for delinearization.
+  %i_plus_1 = add nsw i64 %i, 1
+  %mul2 = mul nsw i64 %i_plus_1, %m
+  %idx2 = add nsw i64 %mul2, %j
+  %arrayidx2 = getelementptr inbounds double, ptr %A, i64 %idx2
+  %load2 = load double, ptr %arrayidx2, align 8
+  %j.next = add nuw nsw i64 %j, 1
+  %j.cond = icmp slt i64 %j.next, %m
+  br i1 %j.cond, label %loop.j, label %loop.i.latch
+
+loop.i.latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %i.cond = icmp slt i64 %i.next, %n
+  br i1 %i.cond, label %loop.i, label %exit
+
+exit:
+  ret void
+}
+
+; Test with parametric sizes where batch delinearization helps.
+; Two separate loop nests accessing the same array.
+define void @batch_delin_two_nests(i64 %n, i64 %m, ptr nocapture %A) {
+; CHECK-LABEL: 'batch_delin_two_nests'
+; CHECK-NEXT:  Src: store double 1.000000e+00, ptr %arrayidx1, align 8 --> Dst: store double 1.000000e+00, ptr %arrayidx1, align 8
+; CHECK-NEXT:    da analyze - output [* *]!
+; CHECK-NEXT:  Src: store double 1.000000e+00, ptr %arrayidx1, align 8 --> Dst: %load = load double, ptr %arrayidx2, align 8
+; CHECK-NEXT:    da analyze - flow [|<]!
+; CHECK-NEXT:  Src: %load = load double, ptr %arrayidx2, align 8 --> Dst: %load = load double, ptr %arrayidx2, align 8
+; CHECK-NEXT:    da analyze - input [* *]!
+;
+entry:
+  %cmp1 = icmp sgt i64 %n, 0
+  %cmp2 = icmp sgt i64 %m, 0
+  %cond = and i1 %cmp1, %cmp2
+  br i1 %cond, label %nest1.i.preheader, label %exit
+
+; First loop nest: stores to A[i*m + j]
+nest1.i.preheader:
+  br label %nest1.i
+
+nest1.i:
+  %i1 = phi i64 [ 0, %nest1.i.preheader ], [ %i1.next, %nest1.i.latch ]
+  br label %nest1.j
+
+nest1.j:
+  %j1 = phi i64 [ 0, %nest1.i ], [ %j1.next, %nest1.j ]
+  %mul1 = mul nsw i64 %i1, %m
+  %idx1 = add nsw i64 %mul1, %j1
+  %arrayidx1 = getelementptr inbounds double, ptr %A, i64 %idx1
+  store double 1.0, ptr %arrayidx1, align 8
+  %j1.next = add nuw nsw i64 %j1, 1
+  %j1.cond = icmp slt i64 %j1.next, %m
+  br i1 %j1.cond, label %nest1.j, label %nest1.i.latch
+
+nest1.i.latch:
+  %i1.next = add nuw nsw i64 %i1, 1
+  %i1.cond = icmp slt i64 %i1.next, %n
+  br i1 %i1.cond, label %nest1.i, label %nest2.i.preheader
+
+; Second loop nest: reads from A[k*m + l]
+nest2.i.preheader:
+  br label %nest2.i
+
+nest2.i:
+  %i2 = phi i64 [ 0, %nest2.i.preheader ], [ %i2.next, %nest2.i.latch ]
+  br label %nest2.j
+
+nest2.j:
+  %j2 = phi i64 [ 0, %nest2.i ], [ %j2.next, %nest2.j ]
+  %mul2 = mul nsw i64 %i2, %m
+  %idx2 = add nsw i64 %mul2, %j2
+  %arrayidx2 = getelementptr inbounds double, ptr %A, i64 %idx2
+  %load = load double, ptr %arrayidx2, align 8
+  %j2.next = add nuw nsw i64 %j2, 1
+  %j2.cond = icmp slt i64 %j2.next, %m
+  br i1 %j2.cond, label %nest2.j, label %nest2.i.latch
+
+nest2.i.latch:
+  %i2.next = add nuw nsw i64 %i2, 1
+  %i2.cond = icmp slt i64 %i2.next, %n
+  br i1 %i2.cond, label %nest2.i, label %exit
+
+exit:
+  ret void
+}
+
diff --git a/llvm/test/Transforms/LICM/lnicm.ll b/llvm/test/Transforms/LICM/lnicm.ll
index e331ab7d39e83..814f964666305 100644
--- a/llvm/test/Transforms/LICM/lnicm.ll
+++ b/llvm/test/Transforms/LICM/lnicm.ll
@@ -3,9 +3,6 @@
 ; RUN: opt -aa-pipeline=basic-aa -passes='loop-mssa(lnicm),loop(loop-interchange)' -cache-line-size=64 -S %s | FileCheck %s --check-prefixes LNICM
 ; RUN: opt -aa-pipeline=basic-aa -passes='loop-mssa(licm),loop(loop-interchange)' -cache-line-size=64 -S %s | FileCheck %s --check-prefixes LICM
 
-; XFAIL: *
-; Loop interchange currently fails due to a failure in dependence analysis.
-
 ; This test represents the following function:
 ; void test(int n, int m, int x[m][n], int y[n], int *z) {
 ;   for (int k = 0; k < n; k++) {
diff --git a/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll b/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
index 14836ba73433d..a5cd1cb924e84 100644
--- a/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
+++ b/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
@@ -58,21 +58,17 @@ for.end19:
   ret void
 }
 
+; With batch delinearization, the dependences are now computed correctly.
+; The interchange is still not performed, but now because it is not profitable.
 ; CHECK: --- !Analysis
 ; CHECK-NEXT: Pass:            loop-interchange
 ; CHECK-NEXT: Name:            Dependence
 ; CHECK-NEXT: Function:        test01
-; CHECK-NEXT: Args:
-; CHECK-NEXT:   - String:          Computed dependence info, invoking the transform.
-; CHECK-NEXT: ...
 
 ; CHECK: --- !Missed
 ; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            Dependence
+; CHECK-NEXT: Name:            InterchangeNotProfitable
 ; CHECK-NEXT: Function:        test01
-; CHECK-NEXT: Args:
-; CHECK-NEXT:   - String:          All loops have dependencies in all directions.
-; CHECK-NEXT: ...
 
 ; DELIN: --- !Analysis
 ; DELIN-NEXT: Pass:            loop-interchange
@@ -134,21 +130,17 @@ define void @test02(i32 %k, i32 %N) {
    ret void
 }
 
+; With batch delinearization, the dependences are now computed correctly
+; and the loop can be interchanged (same behavior as DELIN).
 ; CHECK: --- !Analysis
 ; CHECK-NEXT: Pass:            loop-interchange
 ; CHECK-NEXT: Name:            Dependence
 ; CHECK-NEXT: Function:        test02
-; CHECK-NEXT: Args:
-; CHECK-NEXT:   - String:          Computed dependence info, invoking the transform.
-; CHECK-NEXT: ...
 
-; CHECK: --- !Missed
+; CHECK: --- !Passed
 ; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            Dependence
+; CHECK-NEXT: Name:            Interchanged
 ; CHECK-NEXT: Function:        test02
-; CHECK-NEXT: Args:
-; CHECK-NEXT:   - String:          All loops have dependencies in all directions.
-; CHECK-NEXT: ...
 
 ; DELIN: --- !Analysis
 ; DELIN-NEXT: Pass:            loop-interchange
@@ -285,13 +277,12 @@ for.end17:
   ret void
 }
 
+; With batch delinearization, the dependences are now computed correctly.
+; The real reason for not interchanging is that loops are not tightly nested.
 ; CHECK: --- !Missed
 ; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            Dependence
+; CHECK-NEXT: Name:            NotTightlyNested
 ; CHECK-NEXT: Function:        test04
-; CHECK-NEXT: Args:
-; CHECK-NEXT:   - String:          All loops have dependencies in all directions.
-; CHECK-NEXT: ...
 
 ; DELIN: --- !Missed
 ; DELIN-NEXT: Pass:            loop-interchange
diff --git a/llvm/test/Transforms/LoopInterchange/outer-dependency-lte.ll b/llvm/test/Transforms/LoopInterchange/outer-dependency-lte.ll
index 4aba99f35678e..c8e79dc169b1d 100644
--- a/llvm/test/Transforms/LoopInterchange/outer-dependency-lte.ll
+++ b/llvm/test/Transforms/LoopInterchange/outer-dependency-lte.ll
@@ -22,7 +22,7 @@
 ; CHECK-NEXT: Name:            Dependence
 ; CHECK-NEXT: Function:        f
 ; CHECK-NEXT: Args:
-; CHECK-NEXT:   - String:          All loops have dependencies in all directions.
+; CHECK-NEXT:   - String:          Cannot interchange loops due to dependences.
 ; CHECK-NEXT: ...