[llvm] [LoopInterchange] Add metadata to control loop-interchange (PR #127474)

Ryotaro Kasuga via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 7 09:48:50 PST 2025


https://github.com/kasuga-fj updated https://github.com/llvm/llvm-project/pull/127474

>From 1b11ebe3fa04e07b6cb6ceeaeb50d44fac7aa983 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Mon, 17 Feb 2025 11:23:46 +0000
Subject: [PATCH 1/2] [LoopInterchange] Add metadata to control
 loop-interchange

This patch adds metadata to enable/disable loop-interchange for a
loop nest. This is a prelude to introducing a new pragma directive for
loop-interchange, as other loop optimizations (unroll, vectorize,
distribute, etc.) already have.
---
 .../lib/Transforms/Scalar/LoopInterchange.cpp |  76 ++++
 .../Transforms/LoopInterchange/metadata.ll    | 325 ++++++++++++++++++
 2 files changed, 401 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopInterchange/metadata.ll

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 967be109a7ba6..97b2d0c494e4c 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -51,6 +51,16 @@ using namespace llvm;
 
 #define DEBUG_TYPE "loop-interchange"
 
+/// @{
+/// Metadata attribute names
+static const char *const LLVMLoopInterchangeFollowupAll =
+    "llvm.loop.interchange.followup_all";
+static const char *const LLVMLoopInterchangeFollowupOuter =
+    "llvm.loop.interchange.followup_outer";
+static const char *const LLVMLoopInterchangeFollowupInner =
+    "llvm.loop.interchange.followup_inner";
+/// @}
+
 STATISTIC(LoopsInterchanged, "Number of loops interchanged");
 
 static cl::opt<int> LoopInterchangeCostThreshold(
@@ -65,6 +75,14 @@ static cl::opt<unsigned int> MaxMemInstrCount(
         "in the dependency matrix. Higher value may lead to more interchanges "
         "at the cost of compile-time"));
 
+// Whether to apply by default.
+// TODO: Once this pass is enabled by default, remove this option and use the
+// value of PipelineTuningOptions.
+static cl::opt<bool> OnlyWhenForced(
+    "loop-interchange-only-when-forced", cl::init(false), cl::ReallyHidden,
+    cl::desc(
+        "Apply interchanges only when explicitly specified metadata exists"));
+
 namespace {
 
 using LoopVector = SmallVector<Loop *, 8>;
@@ -297,6 +315,16 @@ static bool isComputableLoopNest(ScalarEvolution *SE,
   return true;
 }
 
+static std::optional<bool> findMetadata(Loop *L) {
+  auto Value = findStringMetadataForLoop(L, "llvm.loop.interchange.enable");
+  if (!Value)
+    return std::nullopt;
+
+  const MDOperand *Op = *Value;
+  assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
+  return mdconst::extract<ConstantInt>(*Op)->getZExtValue();
+}
+
 namespace {
 
 /// LoopInterchangeLegality checks if it is legal to interchange the loop.
@@ -504,6 +532,10 @@ struct LoopInterchange {
         CostMap[LoopCosts[i].first] = i;
       }
     }
+
+    if (OnlyWhenForced)
+      return processEnabledLoop(LoopList, DependencyMatrix, CostMap);
+
     // We try to achieve the globally optimal memory access for the loopnest,
     // and do interchange based on a bubble-sort fasion. We start from
     // the innermost loop, move it outwards to the best possible position
@@ -532,6 +564,8 @@ struct LoopInterchange {
     Loop *InnerLoop = LoopList[InnerLoopId];
     LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
                       << " and OuterLoopId = " << OuterLoopId << "\n");
+    if (findMetadata(OuterLoop) == false || findMetadata(InnerLoop) == false)
+      return false;
     LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
     if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
       LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
@@ -569,6 +603,48 @@ struct LoopInterchange {
 
     return true;
   }
+
+  bool processEnabledLoop(SmallVectorImpl<Loop *> &LoopList,
+                          std::vector<std::vector<char>> &DependencyMatrix,
+                          const DenseMap<const Loop *, unsigned> &CostMap) {
+    bool Changed = false;
+    for (unsigned InnerLoopId = LoopList.size() - 1; InnerLoopId > 0;
+         InnerLoopId--) {
+      unsigned OuterLoopId = InnerLoopId - 1;
+      if (findMetadata(LoopList[OuterLoopId]) != true)
+        continue;
+
+      MDNode *MDOrigLoopID = LoopList[OuterLoopId]->getLoopID();
+      bool Interchanged =
+          processLoop(LoopList[InnerLoopId], LoopList[OuterLoopId], InnerLoopId,
+                      OuterLoopId, DependencyMatrix, CostMap);
+
+      // TODO: Consolidate the duplicate code in `processLoopList`.
+      if (Interchanged) {
+        std::swap(LoopList[OuterLoopId], LoopList[InnerLoopId]);
+        // Update the DependencyMatrix
+        interChangeDependencies(DependencyMatrix, InnerLoopId, OuterLoopId);
+
+        LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n";
+                   printDepMatrix(DependencyMatrix));
+      }
+
+      std::optional<MDNode *> MDOuterLoopID =
+          makeFollowupLoopID(MDOrigLoopID, {LLVMLoopInterchangeFollowupAll,
+                                            LLVMLoopInterchangeFollowupOuter});
+      if (MDOuterLoopID)
+        LoopList[OuterLoopId]->setLoopID(*MDOuterLoopID);
+
+      std::optional<MDNode *> MDInnerLoopID =
+          makeFollowupLoopID(MDOrigLoopID, {LLVMLoopInterchangeFollowupAll,
+                                            LLVMLoopInterchangeFollowupInner});
+      if (MDInnerLoopID)
+        LoopList[InnerLoopId]->setLoopID(*MDInnerLoopID);
+
+      Changed |= Interchanged;
+    }
+    return Changed;
+  }
 };
 
 } // end anonymous namespace
diff --git a/llvm/test/Transforms/LoopInterchange/metadata.ll b/llvm/test/Transforms/LoopInterchange/metadata.ll
new file mode 100644
index 0000000000000..9838abb905a7e
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/metadata.ll
@@ -0,0 +1,325 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-interchange -loop-interchange-only-when-forced=0 --cache-line-size=64 -S < %s | FileCheck %s --check-prefix=DEFAULT-ON
+; RUN: opt -passes=loop-interchange -loop-interchange-only-when-forced=1 --cache-line-size=64 -S < %s | FileCheck %s --check-prefix=DEFAULT-OFF
+
+; Test if the metadata works correctly. The code is as follows:
+; 
+; #define N 2
+; int a[N][N][N][N];
+; int b[N][N][N][N];
+; void f() {
+;   for (int i = 0; i < N; i++)
+;     for (int j = 0; j < N; j++)
+;       #pragma clang loop interchange(enable or disable)
+;       for (int k = 0; k < N; k++)
+;         for (int l = 0; l < N; l++)
+;           a[l][k][j][i] += b[l][k][j][i];
+; }
+;
+; In the functions explicit_on and explicit_off, the values enable and disable
+; are specified in the pragma, respectively. If
+; `loop-interchange-only-when-forced` is set to 0, loop-interchange is
+; performed on the loop nest unless it is explicitly disabled. If the value is
+; set to 1, loop-interchange is performed on the loop nest only when it is
+; explicitly enabled.
+
+@a = dso_local local_unnamed_addr global [2 x [2 x [2 x [2 x i32]]]] zeroinitializer, align 4
+@b = dso_local local_unnamed_addr global [2 x [2 x [2 x [2 x i32]]]] zeroinitializer, align 4
+
+define void @explicit_on() {
+; DEFAULT-ON-LABEL: define void @explicit_on() {
+; DEFAULT-ON-NEXT:  [[ENTRY:.*:]]
+; DEFAULT-ON-NEXT:    br label %[[FOR_BODY12_PREHEADER:.*]]
+; DEFAULT-ON:       [[FOR_COND1_PREHEADER_PREHEADER:.*]]:
+; DEFAULT-ON-NEXT:    br label %[[FOR_COND1_PREHEADER:.*]]
+; DEFAULT-ON:       [[FOR_COND1_PREHEADER]]:
+; DEFAULT-ON-NEXT:    [[INDVARS_IV61:%.*]] = phi i64 [ [[INDVARS_IV_NEXT62:%.*]], %[[FOR_COND_CLEANUP3:.*]] ], [ 0, %[[FOR_COND1_PREHEADER_PREHEADER]] ]
+; DEFAULT-ON-NEXT:    br label %[[FOR_BODY12_SPLIT1:.*]]
+; DEFAULT-ON:       [[FOR_COND5_PREHEADER_PREHEADER:.*]]:
+; DEFAULT-ON-NEXT:    br label %[[FOR_COND5_PREHEADER:.*]]
+; DEFAULT-ON:       [[FOR_COND_CLEANUP3]]:
+; DEFAULT-ON-NEXT:    [[INDVARS_IV_NEXT62]] = add nuw nsw i64 [[INDVARS_IV61]], 1
+; DEFAULT-ON-NEXT:    [[EXITCOND64:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT62]], 2
+; DEFAULT-ON-NEXT:    br i1 [[EXITCOND64]], label %[[FOR_COND1_PREHEADER]], label %[[FOR_COND_CLEANUP7_SPLIT:.*]]
+; DEFAULT-ON:       [[FOR_COND_CLEANUP7:.*]]:
+; DEFAULT-ON-NEXT:    [[INDVARS_IV_NEXT58:%.*]] = add nuw nsw i64 [[INDVARS_IV57:%.*]], 1
+; DEFAULT-ON-NEXT:    [[EXITCOND60:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT58]], 2
+; DEFAULT-ON-NEXT:    br label %[[FOR_COND_CLEANUP3]]
+; DEFAULT-ON:       [[FOR_COND_CLEANUP7_SPLIT]]:
+; DEFAULT-ON-NEXT:    [[TMP0:%.*]] = add nuw nsw i64 [[INDVARS_IV57]], 1
+; DEFAULT-ON-NEXT:    [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 2
+; DEFAULT-ON-NEXT:    br i1 [[TMP1]], label %[[FOR_COND5_PREHEADER]], label %[[FOR_COND_CLEANUP11_SPLIT:.*]]
+; DEFAULT-ON:       [[FOR_COND_CLEANUP11:.*]]:
+; DEFAULT-ON-NEXT:    [[INDVARS_IV_NEXT54:%.*]] = add nuw nsw i64 [[INDVARS_IV53:%.*]], 1
+; DEFAULT-ON-NEXT:    [[EXITCOND56:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT54]], 2
+; DEFAULT-ON-NEXT:    br label %[[FOR_COND_CLEANUP7]]
+; DEFAULT-ON:       [[FOR_COND_CLEANUP11_SPLIT]]:
+; DEFAULT-ON-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[INDVARS_IV53]], 1
+; DEFAULT-ON-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP2]], 2
+; DEFAULT-ON-NEXT:    br i1 [[TMP3]], label %[[FOR_COND9_PREHEADER:.*]], label %[[FOR_BODY12_SPLIT:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; DEFAULT-ON:       [[FOR_BODY12:.*]]:
+; DEFAULT-ON-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP6:%.*]], %[[FOR_BODY12_SPLIT]] ], [ 0, %[[FOR_BODY12_PREHEADER]] ]
+; DEFAULT-ON-NEXT:    br label %[[FOR_COND9_PREHEADER_PREHEADER:.*]]
+; DEFAULT-ON:       [[FOR_BODY12_SPLIT1]]:
+; DEFAULT-ON-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @b, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV53]], i64 [[INDVARS_IV57]], i64 [[INDVARS_IV61]]
+; DEFAULT-ON-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX18]], align 4
+; DEFAULT-ON-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @a, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV53]], i64 [[INDVARS_IV57]], i64 [[INDVARS_IV61]]
+; DEFAULT-ON-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4
+; DEFAULT-ON-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP5]], [[TMP4]]
+; DEFAULT-ON-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX26]], align 4
+; DEFAULT-ON-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DEFAULT-ON-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 2
+; DEFAULT-ON-NEXT:    br label %[[FOR_COND_CLEANUP11]]
+; DEFAULT-ON:       [[FOR_BODY12_SPLIT]]:
+; DEFAULT-ON-NEXT:    [[TMP6]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DEFAULT-ON-NEXT:    [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 2
+; DEFAULT-ON-NEXT:    br i1 [[TMP7]], label %[[FOR_BODY12]], label %[[FOR_COND_CLEANUP:.*]]
+; DEFAULT-ON:       [[FOR_COND9_PREHEADER]]:
+; DEFAULT-ON-NEXT:    [[INDVARS_IV53]] = phi i64 [ [[TMP2]], %[[FOR_COND_CLEANUP11_SPLIT]] ], [ 0, %[[FOR_COND9_PREHEADER_PREHEADER]] ]
+; DEFAULT-ON-NEXT:    br label %[[FOR_COND5_PREHEADER_PREHEADER]]
+; DEFAULT-ON:       [[FOR_BODY12_PREHEADER]]:
+; DEFAULT-ON-NEXT:    br label %[[FOR_BODY12]]
+; DEFAULT-ON:       [[FOR_COND5_PREHEADER]]:
+; DEFAULT-ON-NEXT:    [[INDVARS_IV57]] = phi i64 [ [[TMP0]], %[[FOR_COND_CLEANUP7_SPLIT]] ], [ 0, %[[FOR_COND5_PREHEADER_PREHEADER]] ]
+; DEFAULT-ON-NEXT:    br label %[[FOR_COND1_PREHEADER_PREHEADER]]
+; DEFAULT-ON:       [[FOR_COND9_PREHEADER_PREHEADER]]:
+; DEFAULT-ON-NEXT:    br label %[[FOR_COND9_PREHEADER]]
+; DEFAULT-ON:       [[FOR_COND_CLEANUP]]:
+; DEFAULT-ON-NEXT:    ret void
+;
+; DEFAULT-OFF-LABEL: define void @explicit_on() {
+; DEFAULT-OFF-NEXT:  [[ENTRY:.*]]:
+; DEFAULT-OFF-NEXT:    br label %[[FOR_COND1_PREHEADER:.*]]
+; DEFAULT-OFF:       [[FOR_COND1_PREHEADER]]:
+; DEFAULT-OFF-NEXT:    [[INDVARS_IV61:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT62:%.*]], %[[FOR_COND_CLEANUP3:.*]] ]
+; DEFAULT-OFF-NEXT:    br label %[[FOR_COND5_PREHEADER:.*]]
+; DEFAULT-OFF:       [[FOR_COND_CLEANUP3]]:
+; DEFAULT-OFF-NEXT:    [[INDVARS_IV_NEXT62]] = add nuw nsw i64 [[INDVARS_IV61]], 1
+; DEFAULT-OFF-NEXT:    [[EXITCOND64:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT62]], 2
+; DEFAULT-OFF-NEXT:    br i1 [[EXITCOND64]], label %[[FOR_COND1_PREHEADER]], label %[[FOR_COND_CLEANUP:.*]]
+; DEFAULT-OFF:       [[FOR_COND_CLEANUP7:.*]]:
+; DEFAULT-OFF-NEXT:    [[INDVARS_IV_NEXT58:%.*]] = add nuw nsw i64 [[INDVARS_IV57:%.*]], 1
+; DEFAULT-OFF-NEXT:    [[EXITCOND60:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT58]], 2
+; DEFAULT-OFF-NEXT:    br i1 [[EXITCOND60]], label %[[FOR_COND5_PREHEADER]], label %[[FOR_COND_CLEANUP3]]
+; DEFAULT-OFF:       [[FOR_COND_CLEANUP11:.*]]:
+; DEFAULT-OFF-NEXT:    [[INDVARS_IV_NEXT54:%.*]] = add nuw nsw i64 [[INDVARS_IV53:%.*]], 1
+; DEFAULT-OFF-NEXT:    [[EXITCOND56:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT54]], 2
+; DEFAULT-OFF-NEXT:    br i1 [[EXITCOND56]], label %[[FOR_COND9_PREHEADER:.*]], label %[[FOR_BODY12_SPLIT:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; DEFAULT-OFF:       [[FOR_BODY12:.*]]:
+; DEFAULT-OFF-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP2:%.*]], %[[FOR_BODY12_SPLIT]] ], [ 0, %[[FOR_BODY12_PREHEADER:.*]] ]
+; DEFAULT-OFF-NEXT:    br label %[[FOR_COND9_PREHEADER_PREHEADER:.*]]
+; DEFAULT-OFF:       [[FOR_BODY12_SPLIT1:.*]]:
+; DEFAULT-OFF-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @b, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV53]], i64 [[INDVARS_IV57]], i64 [[INDVARS_IV61]]
+; DEFAULT-OFF-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX18]], align 4
+; DEFAULT-OFF-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @a, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV53]], i64 [[INDVARS_IV57]], i64 [[INDVARS_IV61]]
+; DEFAULT-OFF-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4
+; DEFAULT-OFF-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; DEFAULT-OFF-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX26]], align 4
+; DEFAULT-OFF-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DEFAULT-OFF-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 2
+; DEFAULT-OFF-NEXT:    br label %[[FOR_COND_CLEANUP11]]
+; DEFAULT-OFF:       [[FOR_BODY12_SPLIT]]:
+; DEFAULT-OFF-NEXT:    [[TMP2]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DEFAULT-OFF-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP2]], 2
+; DEFAULT-OFF-NEXT:    br i1 [[TMP3]], label %[[FOR_BODY12]], label %[[FOR_COND_CLEANUP7]]
+; DEFAULT-OFF:       [[FOR_COND9_PREHEADER]]:
+; DEFAULT-OFF-NEXT:    [[INDVARS_IV53]] = phi i64 [ [[INDVARS_IV_NEXT54]], %[[FOR_COND_CLEANUP11]] ], [ 0, %[[FOR_COND9_PREHEADER_PREHEADER]] ]
+; DEFAULT-OFF-NEXT:    br label %[[FOR_BODY12_SPLIT1]]
+; DEFAULT-OFF:       [[FOR_BODY12_PREHEADER]]:
+; DEFAULT-OFF-NEXT:    br label %[[FOR_BODY12]]
+; DEFAULT-OFF:       [[FOR_COND5_PREHEADER]]:
+; DEFAULT-OFF-NEXT:    [[INDVARS_IV57]] = phi i64 [ 0, %[[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT58]], %[[FOR_COND_CLEANUP7]] ]
+; DEFAULT-OFF-NEXT:    br label %[[FOR_BODY12_PREHEADER]]
+; DEFAULT-OFF:       [[FOR_COND9_PREHEADER_PREHEADER]]:
+; DEFAULT-OFF-NEXT:    br label %[[FOR_COND9_PREHEADER]]
+; DEFAULT-OFF:       [[FOR_COND_CLEANUP]]:
+; DEFAULT-OFF-NEXT:    ret void
+;
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %indvars.iv61 = phi i64 [ 0, %entry ], [ %indvars.iv.next62, %for.cond.cleanup3 ]
+  br label %for.cond5.preheader
+
+for.cond.cleanup3:
+  %indvars.iv.next62 = add nuw nsw i64 %indvars.iv61, 1
+  %exitcond64 = icmp ne i64 %indvars.iv.next62, 2
+  br i1 %exitcond64, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond.cleanup7:
+  %indvars.iv.next58 = add nuw nsw i64 %indvars.iv57, 1
+  %exitcond60 = icmp ne i64 %indvars.iv.next58, 2
+  br i1 %exitcond60, label %for.cond5.preheader, label %for.cond.cleanup3
+
+for.cond.cleanup11:
+  %indvars.iv.next54 = add nuw nsw i64 %indvars.iv53, 1
+  %exitcond56 = icmp ne i64 %indvars.iv.next54, 2
+  br i1 %exitcond56, label %for.cond9.preheader, label %for.cond.cleanup7, !llvm.loop !0
+
+for.body12:
+  %indvars.iv = phi i64 [ 0, %for.cond9.preheader ], [ %indvars.iv.next, %for.body12 ]
+  %arrayidx18 = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @b, i64 0, i64 %indvars.iv, i64 %indvars.iv53, i64 %indvars.iv57, i64 %indvars.iv61
+  %0 = load i32, ptr %arrayidx18, align 4
+  %arrayidx26 = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @a, i64 0, i64 %indvars.iv, i64 %indvars.iv53, i64 %indvars.iv57, i64 %indvars.iv61
+  %1 = load i32, ptr %arrayidx26, align 4
+  %add = add nsw i32 %1, %0
+  store i32 %add, ptr %arrayidx26, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 2
+  br i1 %exitcond, label %for.body12, label %for.cond.cleanup11
+
+for.cond9.preheader:
+  %indvars.iv53 = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next54, %for.cond.cleanup11 ]
+  br label %for.body12
+
+for.cond5.preheader:
+  %indvars.iv57 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next58, %for.cond.cleanup7 ]
+  br label %for.cond9.preheader
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @explicit_off() {
+; DEFAULT-ON-LABEL: define void @explicit_off() {
+; DEFAULT-ON-NEXT:  [[ENTRY:.*:]]
+; DEFAULT-ON-NEXT:    br label %[[FOR_COND5_PREHEADER_PREHEADER:.*]]
+; DEFAULT-ON:       [[FOR_COND1_PREHEADER_PREHEADER:.*]]:
+; DEFAULT-ON-NEXT:    br label %[[FOR_COND1_PREHEADER:.*]]
+; DEFAULT-ON:       [[FOR_COND1_PREHEADER]]:
+; DEFAULT-ON-NEXT:    [[INDVARS_IV61:%.*]] = phi i64 [ [[INDVARS_IV_NEXT62:%.*]], %[[FOR_COND_CLEANUP3:.*]] ], [ 0, %[[FOR_COND1_PREHEADER_PREHEADER]] ]
+; DEFAULT-ON-NEXT:    br label %[[FOR_COND9_PREHEADER:.*]]
+; DEFAULT-ON:       [[FOR_COND5_PREHEADER_PREHEADER]]:
+; DEFAULT-ON-NEXT:    br label %[[FOR_COND5_PREHEADER:.*]]
+; DEFAULT-ON:       [[FOR_COND_CLEANUP3]]:
+; DEFAULT-ON-NEXT:    [[INDVARS_IV_NEXT62]] = add nuw nsw i64 [[INDVARS_IV61]], 1
+; DEFAULT-ON-NEXT:    [[EXITCOND64:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT62]], 2
+; DEFAULT-ON-NEXT:    br i1 [[EXITCOND64]], label %[[FOR_COND1_PREHEADER]], label %[[FOR_COND_CLEANUP7:.*]]
+; DEFAULT-ON:       [[FOR_COND_CLEANUP7]]:
+; DEFAULT-ON-NEXT:    [[INDVARS_IV_NEXT58:%.*]] = add nuw nsw i64 [[INDVARS_IV57:%.*]], 1
+; DEFAULT-ON-NEXT:    [[EXITCOND60:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT58]], 2
+; DEFAULT-ON-NEXT:    br i1 [[EXITCOND60]], label %[[FOR_COND5_PREHEADER]], label %[[FOR_COND_CLEANUP:.*]]
+; DEFAULT-ON:       [[FOR_COND_CLEANUP11:.*]]:
+; DEFAULT-ON-NEXT:    [[INDVARS_IV_NEXT54:%.*]] = add nuw nsw i64 [[INDVARS_IV53:%.*]], 1
+; DEFAULT-ON-NEXT:    [[EXITCOND56:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT54]], 2
+; DEFAULT-ON-NEXT:    br i1 [[EXITCOND56]], label %[[FOR_COND9_PREHEADER]], label %[[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP2:![0-9]+]]
+; DEFAULT-ON:       [[FOR_BODY12:.*]]:
+; DEFAULT-ON-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_COND9_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY12]] ]
+; DEFAULT-ON-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @b, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV53]], i64 [[INDVARS_IV57]], i64 [[INDVARS_IV61]]
+; DEFAULT-ON-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX18]], align 4
+; DEFAULT-ON-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @a, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV53]], i64 [[INDVARS_IV57]], i64 [[INDVARS_IV61]]
+; DEFAULT-ON-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4
+; DEFAULT-ON-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; DEFAULT-ON-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX26]], align 4
+; DEFAULT-ON-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DEFAULT-ON-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 2
+; DEFAULT-ON-NEXT:    br i1 [[EXITCOND]], label %[[FOR_BODY12]], label %[[FOR_COND_CLEANUP11]]
+; DEFAULT-ON:       [[FOR_COND9_PREHEADER]]:
+; DEFAULT-ON-NEXT:    [[INDVARS_IV53]] = phi i64 [ 0, %[[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT54]], %[[FOR_COND_CLEANUP11]] ]
+; DEFAULT-ON-NEXT:    br label %[[FOR_BODY12]]
+; DEFAULT-ON:       [[FOR_COND5_PREHEADER]]:
+; DEFAULT-ON-NEXT:    [[INDVARS_IV57]] = phi i64 [ [[INDVARS_IV_NEXT58]], %[[FOR_COND_CLEANUP7]] ], [ 0, %[[FOR_COND5_PREHEADER_PREHEADER]] ]
+; DEFAULT-ON-NEXT:    br label %[[FOR_COND1_PREHEADER_PREHEADER]]
+; DEFAULT-ON:       [[FOR_COND_CLEANUP]]:
+; DEFAULT-ON-NEXT:    ret void
+;
+; DEFAULT-OFF-LABEL: define void @explicit_off() {
+; DEFAULT-OFF-NEXT:  [[ENTRY:.*]]:
+; DEFAULT-OFF-NEXT:    br label %[[FOR_COND1_PREHEADER:.*]]
+; DEFAULT-OFF:       [[FOR_COND1_PREHEADER]]:
+; DEFAULT-OFF-NEXT:    [[INDVARS_IV61:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT62:%.*]], %[[FOR_COND_CLEANUP3:.*]] ]
+; DEFAULT-OFF-NEXT:    br label %[[FOR_COND5_PREHEADER:.*]]
+; DEFAULT-OFF:       [[FOR_COND_CLEANUP3]]:
+; DEFAULT-OFF-NEXT:    [[INDVARS_IV_NEXT62]] = add nuw nsw i64 [[INDVARS_IV61]], 1
+; DEFAULT-OFF-NEXT:    [[EXITCOND64:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT62]], 2
+; DEFAULT-OFF-NEXT:    br i1 [[EXITCOND64]], label %[[FOR_COND1_PREHEADER]], label %[[FOR_COND_CLEANUP:.*]]
+; DEFAULT-OFF:       [[FOR_COND_CLEANUP7:.*]]:
+; DEFAULT-OFF-NEXT:    [[INDVARS_IV_NEXT58:%.*]] = add nuw nsw i64 [[INDVARS_IV57:%.*]], 1
+; DEFAULT-OFF-NEXT:    [[EXITCOND60:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT58]], 2
+; DEFAULT-OFF-NEXT:    br i1 [[EXITCOND60]], label %[[FOR_COND5_PREHEADER]], label %[[FOR_COND_CLEANUP3]]
+; DEFAULT-OFF:       [[FOR_COND_CLEANUP11:.*]]:
+; DEFAULT-OFF-NEXT:    [[INDVARS_IV_NEXT54:%.*]] = add nuw nsw i64 [[INDVARS_IV53:%.*]], 1
+; DEFAULT-OFF-NEXT:    [[EXITCOND56:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT54]], 2
+; DEFAULT-OFF-NEXT:    br i1 [[EXITCOND56]], label %[[FOR_COND9_PREHEADER:.*]], label %[[FOR_COND_CLEANUP7]], !llvm.loop [[LOOP2:![0-9]+]]
+; DEFAULT-OFF:       [[FOR_BODY12:.*]]:
+; DEFAULT-OFF-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_COND9_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY12]] ]
+; DEFAULT-OFF-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @b, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV53]], i64 [[INDVARS_IV57]], i64 [[INDVARS_IV61]]
+; DEFAULT-OFF-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX18]], align 4
+; DEFAULT-OFF-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @a, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV53]], i64 [[INDVARS_IV57]], i64 [[INDVARS_IV61]]
+; DEFAULT-OFF-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4
+; DEFAULT-OFF-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; DEFAULT-OFF-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX26]], align 4
+; DEFAULT-OFF-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DEFAULT-OFF-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 2
+; DEFAULT-OFF-NEXT:    br i1 [[EXITCOND]], label %[[FOR_BODY12]], label %[[FOR_COND_CLEANUP11]]
+; DEFAULT-OFF:       [[FOR_COND9_PREHEADER]]:
+; DEFAULT-OFF-NEXT:    [[INDVARS_IV53]] = phi i64 [ 0, %[[FOR_COND5_PREHEADER]] ], [ [[INDVARS_IV_NEXT54]], %[[FOR_COND_CLEANUP11]] ]
+; DEFAULT-OFF-NEXT:    br label %[[FOR_BODY12]]
+; DEFAULT-OFF:       [[FOR_COND5_PREHEADER]]:
+; DEFAULT-OFF-NEXT:    [[INDVARS_IV57]] = phi i64 [ 0, %[[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT58]], %[[FOR_COND_CLEANUP7]] ]
+; DEFAULT-OFF-NEXT:    br label %[[FOR_COND9_PREHEADER]]
+; DEFAULT-OFF:       [[FOR_COND_CLEANUP]]:
+; DEFAULT-OFF-NEXT:    ret void
+;
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %indvars.iv61 = phi i64 [ 0, %entry ], [ %indvars.iv.next62, %for.cond.cleanup3 ]
+  br label %for.cond5.preheader
+
+for.cond.cleanup3:
+  %indvars.iv.next62 = add nuw nsw i64 %indvars.iv61, 1
+  %exitcond64 = icmp ne i64 %indvars.iv.next62, 2
+  br i1 %exitcond64, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond.cleanup7:
+  %indvars.iv.next58 = add nuw nsw i64 %indvars.iv57, 1
+  %exitcond60 = icmp ne i64 %indvars.iv.next58, 2
+  br i1 %exitcond60, label %for.cond5.preheader, label %for.cond.cleanup3
+
+for.cond.cleanup11:
+  %indvars.iv.next54 = add nuw nsw i64 %indvars.iv53, 1
+  %exitcond56 = icmp ne i64 %indvars.iv.next54, 2
+  br i1 %exitcond56, label %for.cond9.preheader, label %for.cond.cleanup7, !llvm.loop !2
+
+for.body12:
+  %indvars.iv = phi i64 [ 0, %for.cond9.preheader ], [ %indvars.iv.next, %for.body12 ]
+  %arrayidx18 = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @b, i64 0, i64 %indvars.iv, i64 %indvars.iv53, i64 %indvars.iv57, i64 %indvars.iv61
+  %0 = load i32, ptr %arrayidx18, align 4
+  %arrayidx26 = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @a, i64 0, i64 %indvars.iv, i64 %indvars.iv53, i64 %indvars.iv57, i64 %indvars.iv61
+  %1 = load i32, ptr %arrayidx26, align 4
+  %add = add nsw i32 %1, %0
+  store i32 %add, ptr %arrayidx26, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 2
+  br i1 %exitcond, label %for.body12, label %for.cond.cleanup11
+
+for.cond9.preheader:
+  %indvars.iv53 = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next54, %for.cond.cleanup11 ]
+  br label %for.body12
+
+for.cond5.preheader:
+  %indvars.iv57 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next58, %for.cond.cleanup7 ]
+  br label %for.cond9.preheader
+
+for.cond.cleanup:
+  ret void
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.interchange.enable", i1 true}
+!2 = distinct !{!2, !3}
+!3 = !{!"llvm.loop.interchange.enable", i1 false}
+;.
+; DEFAULT-ON: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; DEFAULT-ON: [[META1]] = !{!"llvm.loop.interchange.enable", i1 true}
+; DEFAULT-ON: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]}
+; DEFAULT-ON: [[META3]] = !{!"llvm.loop.interchange.enable", i1 false}
+;.
+; DEFAULT-OFF: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; DEFAULT-OFF: [[META1]] = !{!"llvm.loop.interchange.enable", i1 true}
+; DEFAULT-OFF: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]}
+; DEFAULT-OFF: [[META3]] = !{!"llvm.loop.interchange.enable", i1 false}
+;.

>From 2418ad8fbc3103a5263f08002f434ef102970bab Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Mon, 3 Mar 2025 11:47:30 +0000
Subject: [PATCH 2/2] Fix metadata to express the application order explicitly.

---
 .../lib/Transforms/Scalar/LoopInterchange.cpp | 166 +++++--
 .../LoopInterchange/metadata-disable.ll       | 109 +++++
 .../LoopInterchange/metadata-interruption.ll  |  94 ++++
 .../Transforms/LoopInterchange/metadata.ll    | 404 ++++--------------
 4 files changed, 434 insertions(+), 339 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopInterchange/metadata-disable.ll
 create mode 100644 llvm/test/Transforms/LoopInterchange/metadata-interruption.ll

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 97b2d0c494e4c..baeeb3cb598ec 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -55,6 +55,8 @@ using namespace llvm;
 /// Metadata attribute names
 static const char *const LLVMLoopInterchangeFollowupAll =
     "llvm.loop.interchange.followup_all";
+static const char *const LLVMLoopInterchangeFollowupNextOuter =
+    "llvm.loop.interchange.followup_next_outer";
 static const char *const LLVMLoopInterchangeFollowupOuter =
     "llvm.loop.interchange.followup_outer";
 static const char *const LLVMLoopInterchangeFollowupInner =
@@ -533,6 +535,8 @@ struct LoopInterchange {
       }
     }
 
+    // If OnlyWhenForced is true, only process loops for which interchange is
+    // explicitly enabled.
     if (OnlyWhenForced)
       return processEnabledLoop(LoopList, DependencyMatrix, CostMap);
 
@@ -564,8 +568,10 @@ struct LoopInterchange {
     Loop *InnerLoop = LoopList[InnerLoopId];
     LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
                       << " and OuterLoopId = " << OuterLoopId << "\n");
-    if (findMetadata(OuterLoop) == false || findMetadata(InnerLoop) == false)
+    if (findMetadata(OuterLoop) == false || findMetadata(InnerLoop) == false) {
+      LLVM_DEBUG(dbgs() << "Not interchanging loops. It is disabled.\n");
       return false;
+    }
     LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
     if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
       LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
@@ -608,41 +614,145 @@ struct LoopInterchange {
                           std::vector<std::vector<char>> &DependencyMatrix,
                           const DenseMap<const Loop *, unsigned> &CostMap) {
     bool Changed = false;
-    for (unsigned InnerLoopId = LoopList.size() - 1; InnerLoopId > 0;
-         InnerLoopId--) {
-      unsigned OuterLoopId = InnerLoopId - 1;
-      if (findMetadata(LoopList[OuterLoopId]) != true)
-        continue;
 
-      MDNode *MDOrigLoopID = LoopList[OuterLoopId]->getLoopID();
-      bool Interchanged =
-          processLoop(LoopList[InnerLoopId], LoopList[OuterLoopId], InnerLoopId,
-                      OuterLoopId, DependencyMatrix, CostMap);
-
-      // TODO: Consolidate the duplicate code in `processLoopList`.
-      if (Interchanged) {
-        std::swap(LoopList[OuterLoopId], LoopList[InnerLoopId]);
-        // Update the DependencyMatrix
-        interChangeDependencies(DependencyMatrix, InnerLoopId, OuterLoopId);
+    // Manage the index so that LoopList[Loop2Index[L]] == L for each loop L.
+    DenseMap<Loop *, unsigned> Loop2Index;
+    for (unsigned I = 0; I != LoopList.size(); I++)
+      Loop2Index[LoopList[I]] = I;
+
+    // Hold outer loops to be exchanged (i.e., loops for which
+    // "llvm.loop.interchange.enable" is true), in the current nest order.
+    SmallVector<Loop *, 4> Worklist;
+
+    // Helper function to try to add a new loop into the Worklist. Return false
+    // if there is a duplicate in the loop to be interchanged.
+    auto AddLoopIfEnabled = [&](Loop *L) {
+      if (findMetadata(L) == true) {
+        if (!Worklist.empty()) {
+          // Because the loops are sorted in the order of the current nest, it
+          // is sufficient to compare with the last element.
+          unsigned InnerLoopId = Loop2Index[Worklist.back()] + 1;
+          unsigned OuterLoopId = Loop2Index[L];
+          if (OuterLoopId <= InnerLoopId) {
+            ORE->emit([&]() {
+              return OptimizationRemarkMissed(DEBUG_TYPE, "AmbiguousOrder",
+                                              L->getStartLoc(), L->getHeader())
+                     << "The loops to be interchanged are overlapping.";
+            });
+            return false;
+          }
+        }
+        Worklist.push_back(L);
+      }
+      return true;
+    };
 
-        LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n";
-                   printDepMatrix(DependencyMatrix));
+    // Initialize Worklist. To process the loops in inner-loop-first order, add
+    // them to the worklist in the outer-loop-first order.
+    for (unsigned I = 0; I != LoopList.size(); I++)
+      if (!AddLoopIfEnabled(LoopList[I]))
+        return Changed;
+
+    // Set an upper bound of the number of transformations to avoid infinite
+    // loop. There is no deep meaning behind the current value (square of the
+    // size of LoopList).
+    // TODO: Is this really necessary?
+    const unsigned MaxAttemptsCount = LoopList.size() * LoopList.size();
+    unsigned Attempts = 0;
+
+    // Process the loops. An exchange is applied to two loops, but a metadata
+    // replacement can be applied to three loops: the two loops plus the next
+    // outer loop, if it exists. This is because it's necessary to express the
+    // information about the order of the application of interchanges in cases
+    // where the target loops to be exchanged are overlapping, e.g.,
+    //
+    // #pragma clang loop interchange(enable)
+    // for(int i=0;i<N;i++)
+    //   #pragma clang loop interchange(enable)
+    //   for (int j=0;j<N;j++)
+    //     for (int k=0;k<N;k++)
+    //       ...
+    //
+    // In this case we will exchange the innermost two loops at first, the
+    // follow-up metadata including enabling interchange is attached on the
+    // outermost loop, and it is enqueued as the next candidate to be processed.
+    while (!Worklist.empty() && Attempts < MaxAttemptsCount) {
+      Loop *TargetLoop = Worklist.pop_back_val();
+      assert(findMetadata(TargetLoop) == true &&
+             "Some metadata was unexpectedlly removed");
+      unsigned OuterLoopId = Loop2Index[TargetLoop];
+      unsigned InnerLoopId = OuterLoopId + 1;
+      if (InnerLoopId >= LoopList.size()) {
+        ORE->emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE, "InnermostLoop",
+                                          TargetLoop->getStartLoc(),
+                                          TargetLoop->getHeader())
+                 << "The metadata is invalid with an innermost loop.";
+        });
+        break;
+      }
+      MDNode *LoopID = TargetLoop->getLoopID();
+      bool Interchanged = processLoop(LoopList, InnerLoopId, OuterLoopId,
+                                      DependencyMatrix, CostMap);
+      if (!Interchanged) {
+        ORE->emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE, "NotInterchanged",
+                                          TargetLoop->getStartLoc(),
+                                          TargetLoop->getHeader())
+                 << "Failed to perform explicitly specified loop interchange.";
+        });
+        break;
       }
 
-      std::optional<MDNode *> MDOuterLoopID =
-          makeFollowupLoopID(MDOrigLoopID, {LLVMLoopInterchangeFollowupAll,
-                                            LLVMLoopInterchangeFollowupOuter});
-      if (MDOuterLoopID)
-        LoopList[OuterLoopId]->setLoopID(*MDOuterLoopID);
+      // The next outer loop, or nullptr if TargetLoop is the outermost one.
+      Loop *NextOuterLoop = nullptr;
+      if (0 < OuterLoopId)
+        NextOuterLoop = LoopList[OuterLoopId - 1];
+      Loop *OuterLoop = LoopList[OuterLoopId];
+      Loop *InnerLoop = LoopList[InnerLoopId];
+      Attempts++;
+      Changed = true;
+      Loop2Index[OuterLoop] = OuterLoopId;
+      Loop2Index[InnerLoop] = InnerLoopId;
 
+      // Update the metadata.
+      std::optional<MDNode *> MDNextOuterLoopID =
+          makeFollowupLoopID(LoopID, {LLVMLoopInterchangeFollowupAll,
+                                      LLVMLoopInterchangeFollowupNextOuter});
+      std::optional<MDNode *> MDOuterLoopID =
+          makeFollowupLoopID(LoopID, {LLVMLoopInterchangeFollowupAll,
+                                      LLVMLoopInterchangeFollowupOuter});
       std::optional<MDNode *> MDInnerLoopID =
-          makeFollowupLoopID(MDOrigLoopID, {LLVMLoopInterchangeFollowupAll,
-                                            LLVMLoopInterchangeFollowupInner});
+          makeFollowupLoopID(LoopID, {LLVMLoopInterchangeFollowupAll,
+                                      LLVMLoopInterchangeFollowupInner});
+      if (MDNextOuterLoopID) {
+        if (NextOuterLoop) {
+          NextOuterLoop->setLoopID(*MDNextOuterLoopID);
+        } else {
+          LLVM_DEBUG(dbgs()
+                     << "New metadata for the next outer loop is ignored.\n");
+        }
+      }
+      if (MDOuterLoopID)
+        OuterLoop->setLoopID(*MDOuterLoopID);
       if (MDInnerLoopID)
-        LoopList[InnerLoopId]->setLoopID(*MDInnerLoopID);
-
-      Changed |= Interchanged;
+        InnerLoop->setLoopID(*MDInnerLoopID);
+
+      // Add new elements, paying attention to the order.
+      bool Valid = true;
+      if (NextOuterLoop)
+        Valid &= AddLoopIfEnabled(NextOuterLoop);
+      Valid &= AddLoopIfEnabled(OuterLoop);
+      Valid &= AddLoopIfEnabled(InnerLoop);
+      if (!Valid)
+        break;
     }
+
+    LLVM_DEBUG({
+      if (!Worklist.empty())
+        dbgs() << "Some metadata was ignored because the maximum number of "
+                  "attempts was reached.\n";
+    });
     return Changed;
   }
 };
diff --git a/llvm/test/Transforms/LoopInterchange/metadata-disable.ll b/llvm/test/Transforms/LoopInterchange/metadata-disable.ll
new file mode 100644
index 0000000000000..af7af8892cb35
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/metadata-disable.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-interchange -loop-interchange-only-when-forced=0 --cache-line-size=64 -S < %s | FileCheck %s
+
+; Check that the interchange is not applied to the loop that is disabled by
+; metadata. The original code is as below:
+;
+; for (int i=0; i<128; i++)
+;   for (int j=0; j<128; j++)
+;    #pragma clang loop interchange(disable)
+;     for (int k=0; k<128; k++)
+;       for (int l=0; l<128; l++)
+;         a[l][k][j][i]++;
+;
+; Since interchange is not applied to the k-loop, the pair (i, j) is the
+; only candidate for exchange.
+
+ at a = dso_local local_unnamed_addr global [128 x [128 x [128 x [128 x i32]]]] zeroinitializer, align 4
+
+define void @f() {
+; CHECK-LABEL: define void @f() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[FOR_J_HEADER_PREHEADER:.*]]
+; CHECK:       [[FOR_I_HEADER_PREHEADER:.*]]:
+; CHECK-NEXT:    br label %[[FOR_I_HEADER:.*]]
+; CHECK:       [[FOR_I_HEADER]]:
+; CHECK-NEXT:    [[IV_I:%.*]] = phi i64 [ [[IV_I_NEXT:%.*]], %[[FOR_I_CLEANUP:.*]] ], [ 0, %[[FOR_I_HEADER_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[FOR_K_HEADER:.*]]
+; CHECK:       [[FOR_J_HEADER_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_J_HEADER:.*]]
+; CHECK:       [[FOR_J_HEADER]]:
+; CHECK-NEXT:    [[IV_J:%.*]] = phi i64 [ [[IV_J_NEXT:%.*]], %[[FOR_J_CLEANUP:.*]] ], [ 0, %[[FOR_J_HEADER_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[FOR_I_HEADER_PREHEADER]]
+; CHECK:       [[FOR_K_HEADER]]:
+; CHECK-NEXT:    [[IV_K:%.*]] = phi i64 [ 0, %[[FOR_I_HEADER]] ], [ [[IV_K_NEXT:%.*]], %[[FOR_K_CLEANUP:.*]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[IV_L:%.*]] = phi i64 [ 0, %[[FOR_K_HEADER]] ], [ [[TMP0:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds nuw [128 x [128 x [128 x [128 x i32]]]], ptr @a, i64 [[IV_L]], i64 [[IV_K]], i64 [[IV_J]], i64 [[IV_I]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[PTR]], align 4
+; CHECK-NEXT:    [[INC:%.*]] = add nuw nsw i32 [[VAL]], 1
+; CHECK-NEXT:    store i32 [[INC]], ptr [[PTR]], align 4
+; CHECK-NEXT:    [[TMP0]] = add nuw nsw i64 [[IV_L]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[TMP0]], 128
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[FOR_K_CLEANUP]], label %[[FOR_BODY]]
+; CHECK:       [[FOR_K_CLEANUP]]:
+; CHECK-NEXT:    [[IV_K_NEXT]] = add nuw nsw i64 [[IV_K]], 1
+; CHECK-NEXT:    [[EXITCOND_K:%.*]] = icmp eq i64 [[IV_K_NEXT]], 128
+; CHECK-NEXT:    br i1 [[EXITCOND_K]], label %[[FOR_I_CLEANUP]], label %[[FOR_K_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[FOR_J_CLEANUP]]:
+; CHECK-NEXT:    [[IV_J_NEXT]] = add nuw nsw i64 [[IV_J]], 1
+; CHECK-NEXT:    [[EXITCOND_J:%.*]] = icmp eq i64 [[IV_J_NEXT]], 128
+; CHECK-NEXT:    br i1 [[EXITCOND_J]], label %[[EXIT:.*]], label %[[FOR_J_HEADER]]
+; CHECK:       [[FOR_I_CLEANUP]]:
+; CHECK-NEXT:    [[IV_I_NEXT]] = add nuw nsw i64 [[IV_I]], 1
+; CHECK-NEXT:    [[EXITCOND_I:%.*]] = icmp eq i64 [[IV_I_NEXT]], 128
+; CHECK-NEXT:    br i1 [[EXITCOND_I]], label %[[FOR_J_CLEANUP]], label %[[FOR_I_HEADER]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %iv.i = phi i64 [ 0, %entry ], [ %iv.i.next, %for.i.cleanup ]
+  br label %for.j.header
+
+for.j.header:
+  %iv.j = phi i64 [ 0, %for.i.header ], [ %iv.j.next, %for.j.cleanup ]
+  br label %for.k.header
+
+for.k.header:
+  %iv.k = phi i64 [ 0, %for.j.header ], [ %iv.k.next, %for.k.cleanup ]
+  br label %for.body
+
+for.body:
+  %iv.l = phi i64 [ 0, %for.k.header ], [ %iv.l.next, %for.body ]
+  %ptr = getelementptr inbounds nuw [128 x [128 x [128 x [128 x i32]]]], ptr @a, i64 %iv.l, i64 %iv.k, i64 %iv.j, i64 %iv.i
+  %val = load i32, ptr %ptr, align 4
+  %inc = add nuw nsw i32 %val, 1
+  store i32 %inc, ptr %ptr, align 4
+  %iv.l.next = add nuw nsw i64 %iv.l, 1
+  %exitcond.l = icmp eq i64 %iv.l.next, 128
+  br i1 %exitcond.l, label %for.k.cleanup, label %for.body
+
+for.k.cleanup:
+  %iv.k.next = add nuw nsw i64 %iv.k, 1
+  %exitcond.k = icmp eq i64 %iv.k.next, 128
+  br i1 %exitcond.k, label %for.j.cleanup, label %for.k.header, !llvm.loop !0
+
+for.j.cleanup:
+  %iv.j.next = add nuw nsw i64 %iv.j, 1
+  %exitcond.j = icmp eq i64 %iv.j.next, 128
+  br i1 %exitcond.j, label %for.i.cleanup, label %for.j.header
+
+for.i.cleanup:
+  %iv.i.next = add nuw nsw i64 %iv.i, 1
+  %exitcond.i = icmp eq i64 %iv.i.next, 128
+  br i1 %exitcond.i, label %exit, label %for.i.header
+
+exit:
+  ret void
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.interchange.enable", i1 false}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.interchange.enable", i1 false}
+;.
diff --git a/llvm/test/Transforms/LoopInterchange/metadata-interruption.ll b/llvm/test/Transforms/LoopInterchange/metadata-interruption.ll
new file mode 100644
index 0000000000000..aba7fc39b23a4
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/metadata-interruption.ll
@@ -0,0 +1,94 @@
+; RUN: opt < %s -passes=loop-interchange -loop-interchange-only-when-forced=1 -pass-remarks-output=%t -disable-output
+; RUN: FileCheck -input-file %t %s
+
+; Test that the loop-interchange stops processing for some reason even though
+; some loops have metadata specifying interchange enable.
+
+ at a = dso_local local_unnamed_addr global [128 x [128 x [128 x i32]]] zeroinitializer, align 4
+
+; CHECK:      --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            InnermostLoop
+; CHECK-NEXT: Function:        enable_innermost
+; CHECK-NEXT: Args:
+; CHECK-NEXT:  - String:       The metadata is invalid with an innermost loop.
+define void @enable_innermost() {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %iv.i = phi i64 [ 0, %entry ], [ %iv.i.next, %for.i.cleanup ]
+  br label %for.j.header
+
+for.j.header:
+  %iv.j = phi i64 [ 0, %for.i.header ], [ %iv.j.next, %for.j.cleanup ]
+  br label %for.body
+
+for.body:
+  %iv.k = phi i64 [ 0, %for.j.header ], [ %iv.k.next, %for.body ]
+  %ptr = getelementptr inbounds nuw [128 x [128 x [128 x i32]]], ptr @a, i64 %iv.k, i64 %iv.j, i64 %iv.i
+  %val = load i32, ptr %ptr, align 4
+  %inc = add nuw nsw i32 %val, 1
+  store i32 %inc, ptr %ptr, align 4
+  %iv.k.next = add nuw nsw i64 %iv.k, 1
+  %exitcond.k = icmp eq i64 %iv.k.next, 128
+  br i1 %exitcond.k, label %for.j.cleanup, label %for.body, !llvm.loop !0
+
+for.j.cleanup:
+  %iv.j.next = add nuw nsw i64 %iv.j, 1
+  %exitcond.j = icmp eq i64 %iv.j.next, 128
+  br i1 %exitcond.j, label %for.i.cleanup, label %for.j.header
+
+for.i.cleanup:
+  %iv.i.next = add nuw nsw i64 %iv.i, 1
+  %exitcond.i = icmp eq i64 %iv.i.next, 128
+  br i1 %exitcond.i, label %exit, label %for.i.header
+
+exit:
+  ret void
+}
+
+; CHECK:      --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            AmbiguousOrder
+; CHECK-NEXT: Function:        ambiguous_order
+; CHECK-NEXT: Args:
+; CHECK-NEXT:  - String:       The loops to be interchanged are overlapping.
+define void @ambiguous_order() {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %iv.i = phi i64 [ 0, %entry ], [ %iv.i.next, %for.i.cleanup ]
+  br label %for.j.header
+
+for.j.header:
+  %iv.j = phi i64 [ 0, %for.i.header ], [ %iv.j.next, %for.j.cleanup ]
+  br label %for.body
+
+for.body:
+  %iv.k = phi i64 [ 0, %for.j.header ], [ %iv.k.next, %for.body ]
+  %ptr = getelementptr inbounds nuw [128 x [128 x [128 x i32]]], ptr @a, i64 %iv.k, i64 %iv.j, i64 %iv.i
+  %val = load i32, ptr %ptr, align 4
+  %inc = add nuw nsw i32 %val, 1
+  store i32 %inc, ptr %ptr, align 4
+  %iv.k.next = add nuw nsw i64 %iv.k, 1
+  %exitcond.k = icmp eq i64 %iv.k.next, 128
+  br i1 %exitcond.k, label %for.j.cleanup, label %for.body
+
+for.j.cleanup:
+  %iv.j.next = add nuw nsw i64 %iv.j, 1
+  %exitcond.j = icmp eq i64 %iv.j.next, 128
+  br i1 %exitcond.j, label %for.i.cleanup, label %for.j.header, !llvm.loop !0
+
+for.i.cleanup:
+  %iv.i.next = add nuw nsw i64 %iv.i, 1
+  %exitcond.i = icmp eq i64 %iv.i.next, 128
+  br i1 %exitcond.i, label %exit, label %for.i.header, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.interchange.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopInterchange/metadata.ll b/llvm/test/Transforms/LoopInterchange/metadata.ll
index 9838abb905a7e..e31bc3cb8e907 100644
--- a/llvm/test/Transforms/LoopInterchange/metadata.ll
+++ b/llvm/test/Transforms/LoopInterchange/metadata.ll
@@ -1,325 +1,107 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=loop-interchange -loop-interchange-only-when-forced=0 --cache-line-size=64 -S < %s | FileCheck %s --check-prefix=DEFAULT-ON
-; RUN: opt -passes=loop-interchange -loop-interchange-only-when-forced=1 --cache-line-size=64 -S < %s | FileCheck %s --check-prefix=DEFAULT-OFF
+; RUN: opt -passes=loop-interchange -loop-interchange-only-when-forced=1 --cache-line-size=64 -S < %s | FileCheck %s
 
-; Test if the metadata works correctly. The code is as follows:
-; 
-; #define N 4
-; int a[N][N][N][N];
-; int b[N][N][N][N];
-; void f() {
-;   for (int i = 0; i < N; i++)
-;     for (int j = 0; j < N; j++)
-;       #pragma clang loop interchange(enable or disable)
-;       for (int k = 0; k < N; k++)
-;         for (int l = 0; l < N; l++)
-;           a[l][k][j][i] += b[l][k][j][i];
-; }
-;
-; In the functions explicit_on and explicit_off, the values enable and disable
-; are specified in the pragma, respectively. If the
-; `loop-interchange-only-when-forced` is set to 0, the loop-interchange will be
-; performed to the loop nest unless it is explicitly disabled. If the value is
-; set to 1, the loop-interchange will be performed to the loop nest only when
-; it is explicitly enabled.
-
- at a = dso_local local_unnamed_addr global [2 x [2 x [2 x [2 x i32]]]] zeroinitializer, align 4
- at b = dso_local local_unnamed_addr global [2 x [2 x [2 x [2 x i32]]]] zeroinitializer, align 4
+ at a = dso_local local_unnamed_addr global [128 x [128 x [128 x i32]]] zeroinitializer, align 4
 
-define void @explicit_on() {
-; DEFAULT-ON-LABEL: define void @explicit_on() {
-; DEFAULT-ON-NEXT:  [[ENTRY:.*:]]
-; DEFAULT-ON-NEXT:    br label %[[FOR_BODY12_PREHEADER:.*]]
-; DEFAULT-ON:       [[FOR_COND1_PREHEADER_PREHEADER:.*]]:
-; DEFAULT-ON-NEXT:    br label %[[FOR_COND1_PREHEADER:.*]]
-; DEFAULT-ON:       [[FOR_COND1_PREHEADER]]:
-; DEFAULT-ON-NEXT:    [[INDVARS_IV61:%.*]] = phi i64 [ [[INDVARS_IV_NEXT62:%.*]], %[[FOR_COND_CLEANUP3:.*]] ], [ 0, %[[FOR_COND1_PREHEADER_PREHEADER]] ]
-; DEFAULT-ON-NEXT:    br label %[[FOR_BODY12_SPLIT1:.*]]
-; DEFAULT-ON:       [[FOR_COND5_PREHEADER_PREHEADER:.*]]:
-; DEFAULT-ON-NEXT:    br label %[[FOR_COND5_PREHEADER:.*]]
-; DEFAULT-ON:       [[FOR_COND_CLEANUP3]]:
-; DEFAULT-ON-NEXT:    [[INDVARS_IV_NEXT62]] = add nuw nsw i64 [[INDVARS_IV61]], 1
-; DEFAULT-ON-NEXT:    [[EXITCOND64:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT62]], 2
-; DEFAULT-ON-NEXT:    br i1 [[EXITCOND64]], label %[[FOR_COND1_PREHEADER]], label %[[FOR_COND_CLEANUP7_SPLIT:.*]]
-; DEFAULT-ON:       [[FOR_COND_CLEANUP7:.*]]:
-; DEFAULT-ON-NEXT:    [[INDVARS_IV_NEXT58:%.*]] = add nuw nsw i64 [[INDVARS_IV57:%.*]], 1
-; DEFAULT-ON-NEXT:    [[EXITCOND60:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT58]], 2
-; DEFAULT-ON-NEXT:    br label %[[FOR_COND_CLEANUP3]]
-; DEFAULT-ON:       [[FOR_COND_CLEANUP7_SPLIT]]:
-; DEFAULT-ON-NEXT:    [[TMP0:%.*]] = add nuw nsw i64 [[INDVARS_IV57]], 1
-; DEFAULT-ON-NEXT:    [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 2
-; DEFAULT-ON-NEXT:    br i1 [[TMP1]], label %[[FOR_COND5_PREHEADER]], label %[[FOR_COND_CLEANUP11_SPLIT:.*]]
-; DEFAULT-ON:       [[FOR_COND_CLEANUP11:.*]]:
-; DEFAULT-ON-NEXT:    [[INDVARS_IV_NEXT54:%.*]] = add nuw nsw i64 [[INDVARS_IV53:%.*]], 1
-; DEFAULT-ON-NEXT:    [[EXITCOND56:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT54]], 2
-; DEFAULT-ON-NEXT:    br label %[[FOR_COND_CLEANUP7]]
-; DEFAULT-ON:       [[FOR_COND_CLEANUP11_SPLIT]]:
-; DEFAULT-ON-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[INDVARS_IV53]], 1
-; DEFAULT-ON-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP2]], 2
-; DEFAULT-ON-NEXT:    br i1 [[TMP3]], label %[[FOR_COND9_PREHEADER:.*]], label %[[FOR_BODY12_SPLIT:.*]], !llvm.loop [[LOOP0:![0-9]+]]
-; DEFAULT-ON:       [[FOR_BODY12:.*]]:
-; DEFAULT-ON-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP6:%.*]], %[[FOR_BODY12_SPLIT]] ], [ 0, %[[FOR_BODY12_PREHEADER]] ]
-; DEFAULT-ON-NEXT:    br label %[[FOR_COND9_PREHEADER_PREHEADER:.*]]
-; DEFAULT-ON:       [[FOR_BODY12_SPLIT1]]:
-; DEFAULT-ON-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @b, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV53]], i64 [[INDVARS_IV57]], i64 [[INDVARS_IV61]]
-; DEFAULT-ON-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX18]], align 4
-; DEFAULT-ON-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @a, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV53]], i64 [[INDVARS_IV57]], i64 [[INDVARS_IV61]]
-; DEFAULT-ON-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4
-; DEFAULT-ON-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP5]], [[TMP4]]
-; DEFAULT-ON-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX26]], align 4
-; DEFAULT-ON-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DEFAULT-ON-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 2
-; DEFAULT-ON-NEXT:    br label %[[FOR_COND_CLEANUP11]]
-; DEFAULT-ON:       [[FOR_BODY12_SPLIT]]:
-; DEFAULT-ON-NEXT:    [[TMP6]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DEFAULT-ON-NEXT:    [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 2
-; DEFAULT-ON-NEXT:    br i1 [[TMP7]], label %[[FOR_BODY12]], label %[[FOR_COND_CLEANUP:.*]]
-; DEFAULT-ON:       [[FOR_COND9_PREHEADER]]:
-; DEFAULT-ON-NEXT:    [[INDVARS_IV53]] = phi i64 [ [[TMP2]], %[[FOR_COND_CLEANUP11_SPLIT]] ], [ 0, %[[FOR_COND9_PREHEADER_PREHEADER]] ]
-; DEFAULT-ON-NEXT:    br label %[[FOR_COND5_PREHEADER_PREHEADER]]
-; DEFAULT-ON:       [[FOR_BODY12_PREHEADER]]:
-; DEFAULT-ON-NEXT:    br label %[[FOR_BODY12]]
-; DEFAULT-ON:       [[FOR_COND5_PREHEADER]]:
-; DEFAULT-ON-NEXT:    [[INDVARS_IV57]] = phi i64 [ [[TMP0]], %[[FOR_COND_CLEANUP7_SPLIT]] ], [ 0, %[[FOR_COND5_PREHEADER_PREHEADER]] ]
-; DEFAULT-ON-NEXT:    br label %[[FOR_COND1_PREHEADER_PREHEADER]]
-; DEFAULT-ON:       [[FOR_COND9_PREHEADER_PREHEADER]]:
-; DEFAULT-ON-NEXT:    br label %[[FOR_COND9_PREHEADER]]
-; DEFAULT-ON:       [[FOR_COND_CLEANUP]]:
-; DEFAULT-ON-NEXT:    ret void
+; Check that the interchanges are applied in the expected order. The original
+; code looks as follows:
 ;
-; DEFAULT-OFF-LABEL: define void @explicit_on() {
-; DEFAULT-OFF-NEXT:  [[ENTRY:.*]]:
-; DEFAULT-OFF-NEXT:    br label %[[FOR_COND1_PREHEADER:.*]]
-; DEFAULT-OFF:       [[FOR_COND1_PREHEADER]]:
-; DEFAULT-OFF-NEXT:    [[INDVARS_IV61:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT62:%.*]], %[[FOR_COND_CLEANUP3:.*]] ]
-; DEFAULT-OFF-NEXT:    br label %[[FOR_COND5_PREHEADER:.*]]
-; DEFAULT-OFF:       [[FOR_COND_CLEANUP3]]:
-; DEFAULT-OFF-NEXT:    [[INDVARS_IV_NEXT62]] = add nuw nsw i64 [[INDVARS_IV61]], 1
-; DEFAULT-OFF-NEXT:    [[EXITCOND64:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT62]], 2
-; DEFAULT-OFF-NEXT:    br i1 [[EXITCOND64]], label %[[FOR_COND1_PREHEADER]], label %[[FOR_COND_CLEANUP:.*]]
-; DEFAULT-OFF:       [[FOR_COND_CLEANUP7:.*]]:
-; DEFAULT-OFF-NEXT:    [[INDVARS_IV_NEXT58:%.*]] = add nuw nsw i64 [[INDVARS_IV57:%.*]], 1
-; DEFAULT-OFF-NEXT:    [[EXITCOND60:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT58]], 2
-; DEFAULT-OFF-NEXT:    br i1 [[EXITCOND60]], label %[[FOR_COND5_PREHEADER]], label %[[FOR_COND_CLEANUP3]]
-; DEFAULT-OFF:       [[FOR_COND_CLEANUP11:.*]]:
-; DEFAULT-OFF-NEXT:    [[INDVARS_IV_NEXT54:%.*]] = add nuw nsw i64 [[INDVARS_IV53:%.*]], 1
-; DEFAULT-OFF-NEXT:    [[EXITCOND56:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT54]], 2
-; DEFAULT-OFF-NEXT:    br i1 [[EXITCOND56]], label %[[FOR_COND9_PREHEADER:.*]], label %[[FOR_BODY12_SPLIT:.*]], !llvm.loop [[LOOP0:![0-9]+]]
-; DEFAULT-OFF:       [[FOR_BODY12:.*]]:
-; DEFAULT-OFF-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP2:%.*]], %[[FOR_BODY12_SPLIT]] ], [ 0, %[[FOR_BODY12_PREHEADER:.*]] ]
-; DEFAULT-OFF-NEXT:    br label %[[FOR_COND9_PREHEADER_PREHEADER:.*]]
-; DEFAULT-OFF:       [[FOR_BODY12_SPLIT1:.*]]:
-; DEFAULT-OFF-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @b, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV53]], i64 [[INDVARS_IV57]], i64 [[INDVARS_IV61]]
-; DEFAULT-OFF-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX18]], align 4
-; DEFAULT-OFF-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @a, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV53]], i64 [[INDVARS_IV57]], i64 [[INDVARS_IV61]]
-; DEFAULT-OFF-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4
-; DEFAULT-OFF-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
-; DEFAULT-OFF-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX26]], align 4
-; DEFAULT-OFF-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DEFAULT-OFF-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 2
-; DEFAULT-OFF-NEXT:    br label %[[FOR_COND_CLEANUP11]]
-; DEFAULT-OFF:       [[FOR_BODY12_SPLIT]]:
-; DEFAULT-OFF-NEXT:    [[TMP2]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DEFAULT-OFF-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP2]], 2
-; DEFAULT-OFF-NEXT:    br i1 [[TMP3]], label %[[FOR_BODY12]], label %[[FOR_COND_CLEANUP7]]
-; DEFAULT-OFF:       [[FOR_COND9_PREHEADER]]:
-; DEFAULT-OFF-NEXT:    [[INDVARS_IV53]] = phi i64 [ [[INDVARS_IV_NEXT54]], %[[FOR_COND_CLEANUP11]] ], [ 0, %[[FOR_COND9_PREHEADER_PREHEADER]] ]
-; DEFAULT-OFF-NEXT:    br label %[[FOR_BODY12_SPLIT1]]
-; DEFAULT-OFF:       [[FOR_BODY12_PREHEADER]]:
-; DEFAULT-OFF-NEXT:    br label %[[FOR_BODY12]]
-; DEFAULT-OFF:       [[FOR_COND5_PREHEADER]]:
-; DEFAULT-OFF-NEXT:    [[INDVARS_IV57]] = phi i64 [ 0, %[[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT58]], %[[FOR_COND_CLEANUP7]] ]
-; DEFAULT-OFF-NEXT:    br label %[[FOR_BODY12_PREHEADER]]
-; DEFAULT-OFF:       [[FOR_COND9_PREHEADER_PREHEADER]]:
-; DEFAULT-OFF-NEXT:    br label %[[FOR_COND9_PREHEADER]]
-; DEFAULT-OFF:       [[FOR_COND_CLEANUP]]:
-; DEFAULT-OFF-NEXT:    ret void
+; #pragma clang loop interchange(enable)
+; for (int j=0; j<128; j++)
+;   #pragma clang loop interchange(enable)
+;   for (int i=0; i<128; i++)
+;     for (int k=0; k<128; k++)
+;       a[k][j][i]++;
 ;
-entry:
-  br label %for.cond1.preheader
-
-for.cond1.preheader:
-  %indvars.iv61 = phi i64 [ 0, %entry ], [ %indvars.iv.next62, %for.cond.cleanup3 ]
-  br label %for.cond5.preheader
-
-for.cond.cleanup3:
-  %indvars.iv.next62 = add nuw nsw i64 %indvars.iv61, 1
-  %exitcond64 = icmp ne i64 %indvars.iv.next62, 2
-  br i1 %exitcond64, label %for.cond1.preheader, label %for.cond.cleanup
-
-for.cond.cleanup7:
-  %indvars.iv.next58 = add nuw nsw i64 %indvars.iv57, 1
-  %exitcond60 = icmp ne i64 %indvars.iv.next58, 2
-  br i1 %exitcond60, label %for.cond5.preheader, label %for.cond.cleanup3
-
-for.cond.cleanup11:
-  %indvars.iv.next54 = add nuw nsw i64 %indvars.iv53, 1
-  %exitcond56 = icmp ne i64 %indvars.iv.next54, 2
-  br i1 %exitcond56, label %for.cond9.preheader, label %for.cond.cleanup7, !llvm.loop !0
-
-for.body12:
-  %indvars.iv = phi i64 [ 0, %for.cond9.preheader ], [ %indvars.iv.next, %for.body12 ]
-  %arrayidx18 = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @b, i64 0, i64 %indvars.iv, i64 %indvars.iv53, i64 %indvars.iv57, i64 %indvars.iv61
-  %0 = load i32, ptr %arrayidx18, align 4
-  %arrayidx26 = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @a, i64 0, i64 %indvars.iv, i64 %indvars.iv53, i64 %indvars.iv57, i64 %indvars.iv61
-  %1 = load i32, ptr %arrayidx26, align 4
-  %add = add nsw i32 %1, %0
-  store i32 %add, ptr %arrayidx26, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp ne i64 %indvars.iv.next, 2
-  br i1 %exitcond, label %for.body12, label %for.cond.cleanup11
-
-for.cond9.preheader:
-  %indvars.iv53 = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next54, %for.cond.cleanup11 ]
-  br label %for.body12
-
-for.cond5.preheader:
-  %indvars.iv57 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next58, %for.cond.cleanup7 ]
-  br label %for.cond9.preheader
-
-for.cond.cleanup:
-  ret void
-}
-
-define void @explicit_off() {
-; DEFAULT-ON-LABEL: define void @explicit_off() {
-; DEFAULT-ON-NEXT:  [[ENTRY:.*:]]
-; DEFAULT-ON-NEXT:    br label %[[FOR_COND5_PREHEADER_PREHEADER:.*]]
-; DEFAULT-ON:       [[FOR_COND1_PREHEADER_PREHEADER:.*]]:
-; DEFAULT-ON-NEXT:    br label %[[FOR_COND1_PREHEADER:.*]]
-; DEFAULT-ON:       [[FOR_COND1_PREHEADER]]:
-; DEFAULT-ON-NEXT:    [[INDVARS_IV61:%.*]] = phi i64 [ [[INDVARS_IV_NEXT62:%.*]], %[[FOR_COND_CLEANUP3:.*]] ], [ 0, %[[FOR_COND1_PREHEADER_PREHEADER]] ]
-; DEFAULT-ON-NEXT:    br label %[[FOR_COND9_PREHEADER:.*]]
-; DEFAULT-ON:       [[FOR_COND5_PREHEADER_PREHEADER]]:
-; DEFAULT-ON-NEXT:    br label %[[FOR_COND5_PREHEADER:.*]]
-; DEFAULT-ON:       [[FOR_COND_CLEANUP3]]:
-; DEFAULT-ON-NEXT:    [[INDVARS_IV_NEXT62]] = add nuw nsw i64 [[INDVARS_IV61]], 1
-; DEFAULT-ON-NEXT:    [[EXITCOND64:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT62]], 2
-; DEFAULT-ON-NEXT:    br i1 [[EXITCOND64]], label %[[FOR_COND1_PREHEADER]], label %[[FOR_COND_CLEANUP7:.*]]
-; DEFAULT-ON:       [[FOR_COND_CLEANUP7]]:
-; DEFAULT-ON-NEXT:    [[INDVARS_IV_NEXT58:%.*]] = add nuw nsw i64 [[INDVARS_IV57:%.*]], 1
-; DEFAULT-ON-NEXT:    [[EXITCOND60:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT58]], 2
-; DEFAULT-ON-NEXT:    br i1 [[EXITCOND60]], label %[[FOR_COND5_PREHEADER]], label %[[FOR_COND_CLEANUP:.*]]
-; DEFAULT-ON:       [[FOR_COND_CLEANUP11:.*]]:
-; DEFAULT-ON-NEXT:    [[INDVARS_IV_NEXT54:%.*]] = add nuw nsw i64 [[INDVARS_IV53:%.*]], 1
-; DEFAULT-ON-NEXT:    [[EXITCOND56:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT54]], 2
-; DEFAULT-ON-NEXT:    br i1 [[EXITCOND56]], label %[[FOR_COND9_PREHEADER]], label %[[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP2:![0-9]+]]
-; DEFAULT-ON:       [[FOR_BODY12:.*]]:
-; DEFAULT-ON-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_COND9_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY12]] ]
-; DEFAULT-ON-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @b, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV53]], i64 [[INDVARS_IV57]], i64 [[INDVARS_IV61]]
-; DEFAULT-ON-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX18]], align 4
-; DEFAULT-ON-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @a, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV53]], i64 [[INDVARS_IV57]], i64 [[INDVARS_IV61]]
-; DEFAULT-ON-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4
-; DEFAULT-ON-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
-; DEFAULT-ON-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX26]], align 4
-; DEFAULT-ON-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DEFAULT-ON-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 2
-; DEFAULT-ON-NEXT:    br i1 [[EXITCOND]], label %[[FOR_BODY12]], label %[[FOR_COND_CLEANUP11]]
-; DEFAULT-ON:       [[FOR_COND9_PREHEADER]]:
-; DEFAULT-ON-NEXT:    [[INDVARS_IV53]] = phi i64 [ 0, %[[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT54]], %[[FOR_COND_CLEANUP11]] ]
-; DEFAULT-ON-NEXT:    br label %[[FOR_BODY12]]
-; DEFAULT-ON:       [[FOR_COND5_PREHEADER]]:
-; DEFAULT-ON-NEXT:    [[INDVARS_IV57]] = phi i64 [ [[INDVARS_IV_NEXT58]], %[[FOR_COND_CLEANUP7]] ], [ 0, %[[FOR_COND5_PREHEADER_PREHEADER]] ]
-; DEFAULT-ON-NEXT:    br label %[[FOR_COND1_PREHEADER_PREHEADER]]
-; DEFAULT-ON:       [[FOR_COND_CLEANUP]]:
-; DEFAULT-ON-NEXT:    ret void
+; At first the interchange is applied to the j-loop and the k-loop. The
+; follow-up metadata is attached to the outermost loop, then the interchange to
+; the i-loop and the k-loop.
 ;
-; DEFAULT-OFF-LABEL: define void @explicit_off() {
-; DEFAULT-OFF-NEXT:  [[ENTRY:.*]]:
-; DEFAULT-OFF-NEXT:    br label %[[FOR_COND1_PREHEADER:.*]]
-; DEFAULT-OFF:       [[FOR_COND1_PREHEADER]]:
-; DEFAULT-OFF-NEXT:    [[INDVARS_IV61:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT62:%.*]], %[[FOR_COND_CLEANUP3:.*]] ]
-; DEFAULT-OFF-NEXT:    br label %[[FOR_COND5_PREHEADER:.*]]
-; DEFAULT-OFF:       [[FOR_COND_CLEANUP3]]:
-; DEFAULT-OFF-NEXT:    [[INDVARS_IV_NEXT62]] = add nuw nsw i64 [[INDVARS_IV61]], 1
-; DEFAULT-OFF-NEXT:    [[EXITCOND64:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT62]], 2
-; DEFAULT-OFF-NEXT:    br i1 [[EXITCOND64]], label %[[FOR_COND1_PREHEADER]], label %[[FOR_COND_CLEANUP:.*]]
-; DEFAULT-OFF:       [[FOR_COND_CLEANUP7:.*]]:
-; DEFAULT-OFF-NEXT:    [[INDVARS_IV_NEXT58:%.*]] = add nuw nsw i64 [[INDVARS_IV57:%.*]], 1
-; DEFAULT-OFF-NEXT:    [[EXITCOND60:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT58]], 2
-; DEFAULT-OFF-NEXT:    br i1 [[EXITCOND60]], label %[[FOR_COND5_PREHEADER]], label %[[FOR_COND_CLEANUP3]]
-; DEFAULT-OFF:       [[FOR_COND_CLEANUP11:.*]]:
-; DEFAULT-OFF-NEXT:    [[INDVARS_IV_NEXT54:%.*]] = add nuw nsw i64 [[INDVARS_IV53:%.*]], 1
-; DEFAULT-OFF-NEXT:    [[EXITCOND56:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT54]], 2
-; DEFAULT-OFF-NEXT:    br i1 [[EXITCOND56]], label %[[FOR_COND9_PREHEADER:.*]], label %[[FOR_COND_CLEANUP7]], !llvm.loop [[LOOP2:![0-9]+]]
-; DEFAULT-OFF:       [[FOR_BODY12:.*]]:
-; DEFAULT-OFF-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_COND9_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY12]] ]
-; DEFAULT-OFF-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @b, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV53]], i64 [[INDVARS_IV57]], i64 [[INDVARS_IV61]]
-; DEFAULT-OFF-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX18]], align 4
-; DEFAULT-OFF-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @a, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV53]], i64 [[INDVARS_IV57]], i64 [[INDVARS_IV61]]
-; DEFAULT-OFF-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4
-; DEFAULT-OFF-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
-; DEFAULT-OFF-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX26]], align 4
-; DEFAULT-OFF-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DEFAULT-OFF-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 2
-; DEFAULT-OFF-NEXT:    br i1 [[EXITCOND]], label %[[FOR_BODY12]], label %[[FOR_COND_CLEANUP11]]
-; DEFAULT-OFF:       [[FOR_COND9_PREHEADER]]:
-; DEFAULT-OFF-NEXT:    [[INDVARS_IV53]] = phi i64 [ 0, %[[FOR_COND5_PREHEADER]] ], [ [[INDVARS_IV_NEXT54]], %[[FOR_COND_CLEANUP11]] ]
-; DEFAULT-OFF-NEXT:    br label %[[FOR_BODY12]]
-; DEFAULT-OFF:       [[FOR_COND5_PREHEADER]]:
-; DEFAULT-OFF-NEXT:    [[INDVARS_IV57]] = phi i64 [ 0, %[[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT58]], %[[FOR_COND_CLEANUP7]] ]
-; DEFAULT-OFF-NEXT:    br label %[[FOR_COND9_PREHEADER]]
-; DEFAULT-OFF:       [[FOR_COND_CLEANUP]]:
-; DEFAULT-OFF-NEXT:    ret void
+define void @f() {
+; CHECK-LABEL: define void @f() {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_I_HEADER:.*]]
+; CHECK:       [[FOR_I_HEADER]]:
+; CHECK-NEXT:    [[IV_I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_I_NEXT:%.*]], %[[FOR_I_CLEANUP:.*]] ]
+; CHECK-NEXT:    br label %[[FOR_J_HEADER:.*]]
+; CHECK:       [[FOR_J_HEADER]]:
+; CHECK-NEXT:    [[IV_J:%.*]] = phi i64 [ 0, %[[FOR_I_HEADER]] ], [ [[IV_J_NEXT:%.*]], %[[FOR_J_CLEANUP:.*]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[IV_K:%.*]] = phi i64 [ 0, %[[FOR_J_HEADER]] ], [ [[TMP0:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds nuw [128 x [128 x [128 x i32]]], ptr @a, i64 [[IV_K]], i64 [[IV_J]], i64 [[IV_I]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[PTR]], align 4
+; CHECK-NEXT:    [[INC:%.*]] = add nuw nsw i32 [[VAL]], 1
+; CHECK-NEXT:    store i32 [[INC]], ptr [[PTR]], align 4
+; CHECK-NEXT:    [[TMP0]] = add nuw nsw i64 [[IV_K]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[TMP0]], 128
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[FOR_J_CLEANUP]], label %[[FOR_BODY]]
+; CHECK:       [[FOR_J_CLEANUP]]:
+; CHECK-NEXT:    [[IV_J_NEXT]] = add nuw nsw i64 [[IV_J]], 1
+; CHECK-NEXT:    [[EXITCOND_J:%.*]] = icmp eq i64 [[IV_J_NEXT]], 128
+; CHECK-NEXT:    br i1 [[EXITCOND_J]], label %[[FOR_I_CLEANUP]], label %[[FOR_J_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[FOR_I_CLEANUP]]:
+; CHECK-NEXT:    [[IV_I_NEXT]] = add nuw nsw i64 [[IV_I]], 1
+; CHECK-NEXT:    [[EXITCOND_I:%.*]] = icmp eq i64 [[IV_I_NEXT]], 128
+; CHECK-NEXT:    br i1 [[EXITCOND_I]], label %[[EXIT:.*]], label %[[FOR_I_HEADER]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
 ;
 entry:
-  br label %for.cond1.preheader
-
-for.cond1.preheader:
-  %indvars.iv61 = phi i64 [ 0, %entry ], [ %indvars.iv.next62, %for.cond.cleanup3 ]
-  br label %for.cond5.preheader
-
-for.cond.cleanup3:
-  %indvars.iv.next62 = add nuw nsw i64 %indvars.iv61, 1
-  %exitcond64 = icmp ne i64 %indvars.iv.next62, 2
-  br i1 %exitcond64, label %for.cond1.preheader, label %for.cond.cleanup
-
-for.cond.cleanup7:
-  %indvars.iv.next58 = add nuw nsw i64 %indvars.iv57, 1
-  %exitcond60 = icmp ne i64 %indvars.iv.next58, 2
-  br i1 %exitcond60, label %for.cond5.preheader, label %for.cond.cleanup3
-
-for.cond.cleanup11:
-  %indvars.iv.next54 = add nuw nsw i64 %indvars.iv53, 1
-  %exitcond56 = icmp ne i64 %indvars.iv.next54, 2
-  br i1 %exitcond56, label %for.cond9.preheader, label %for.cond.cleanup7, !llvm.loop !2
-
-for.body12:
-  %indvars.iv = phi i64 [ 0, %for.cond9.preheader ], [ %indvars.iv.next, %for.body12 ]
-  %arrayidx18 = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @b, i64 0, i64 %indvars.iv, i64 %indvars.iv53, i64 %indvars.iv57, i64 %indvars.iv61
-  %0 = load i32, ptr %arrayidx18, align 4
-  %arrayidx26 = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @a, i64 0, i64 %indvars.iv, i64 %indvars.iv53, i64 %indvars.iv57, i64 %indvars.iv61
-  %1 = load i32, ptr %arrayidx26, align 4
-  %add = add nsw i32 %1, %0
-  store i32 %add, ptr %arrayidx26, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp ne i64 %indvars.iv.next, 2
-  br i1 %exitcond, label %for.body12, label %for.cond.cleanup11
-
-for.cond9.preheader:
-  %indvars.iv53 = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next54, %for.cond.cleanup11 ]
-  br label %for.body12
-
-for.cond5.preheader:
-  %indvars.iv57 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next58, %for.cond.cleanup7 ]
-  br label %for.cond9.preheader
-
-for.cond.cleanup:
+  br label %for.i.header
+
+for.i.header:
+  %iv.i = phi i64 [ 0, %entry ], [ %iv.i.next, %for.i.cleanup ]
+  br label %for.j.header
+
+for.j.header:
+  %iv.j = phi i64 [ 0, %for.i.header ], [ %iv.j.next, %for.j.cleanup ]
+  br label %for.body
+
+for.body:
+  %iv.k = phi i64 [ 0, %for.j.header ], [ %iv.k.next, %for.body ]
+  %ptr = getelementptr inbounds nuw [128 x [128 x [128 x i32]]], ptr @a, i64 %iv.k, i64 %iv.j, i64 %iv.i
+  %val = load i32, ptr %ptr, align 4
+  %inc = add nuw nsw i32 %val, 1
+  store i32 %inc, ptr %ptr, align 4
+  %iv.k.next = add nuw nsw i64 %iv.k, 1
+  %exitcond.k = icmp eq i64 %iv.k.next, 128
+  br i1 %exitcond.k, label %for.j.cleanup, label %for.body
+
+for.j.cleanup:
+  %iv.j.next = add nuw nsw i64 %iv.j, 1
+  %exitcond.j = icmp eq i64 %iv.j.next, 128
+  br i1 %exitcond.j, label %for.i.cleanup, label %for.j.header, !llvm.loop !0
+
+for.i.cleanup:
+  %iv.i.next = add nuw nsw i64 %iv.i, 1
+  %exitcond.i = icmp eq i64 %iv.i.next, 128
+  br i1 %exitcond.i, label %exit, label %for.i.header, !llvm.loop !5
+
+exit:
   ret void
 }
 
-!0 = distinct !{!0, !1}
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
 !1 = !{!"llvm.loop.interchange.enable", i1 true}
-!2 = distinct !{!2, !3}
-!3 = !{!"llvm.loop.interchange.enable", i1 false}
-;.
-; DEFAULT-ON: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
-; DEFAULT-ON: [[META1]] = !{!"llvm.loop.interchange.enable", i1 true}
-; DEFAULT-ON: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]}
-; DEFAULT-ON: [[META3]] = !{!"llvm.loop.interchange.enable", i1 false}
+!2 = !{!"llvm.loop.interchange.followup_all", !{!"FolloupAll"}}
+!3 = !{!"llvm.loop.interchange.followup_inner", !{!"FollowupInner0"}}
+!4 = !{!"llvm.loop.interchange.followup_outer", !{!"FollowupOuter0"}}
+!5 = !{!"llvm.loop.interchange.followup_next_outer", !1, !2, !6, !7}
+!6 = !{!"llvm.loop.interchange.followup_inner", !{!"FollowupInner1"}}
+!7 = !{!"llvm.loop.interchange.followup_outer", !{!"FollowupOuter1"}}
 ;.
-; DEFAULT-OFF: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
-; DEFAULT-OFF: [[META1]] = !{!"llvm.loop.interchange.enable", i1 true}
-; DEFAULT-OFF: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]}
-; DEFAULT-OFF: [[META3]] = !{!"llvm.loop.interchange.enable", i1 false}
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META4:![0-9]+]], [[META6:![0-9]+]], [[LOOP8]]}
+; CHECK: [[META1]] = !{!"llvm.loop.interchange.enable", i1 true}
+; CHECK: [[META2]] = !{!"llvm.loop.interchange.followup_all", [[META3:![0-9]+]]}
+; CHECK: [[META3]] = !{!"FolloupAll"}
+; CHECK: [[META4]] = !{!"llvm.loop.interchange.followup_inner", [[META5:![0-9]+]]}
+; CHECK: [[META5]] = !{!"FollowupInner0"}
+; CHECK: [[META6]] = !{!"llvm.loop.interchange.followup_outer", [[META7:![0-9]+]]}
+; CHECK: [[META7]] = !{!"FollowupOuter0"}
+; CHECK: [[LOOP8]] = !{!"llvm.loop.interchange.followup_next_outer", [[META1]], [[META2]], [[META9:![0-9]+]], [[META11:![0-9]+]]}
+; CHECK: [[META9]] = !{!"llvm.loop.interchange.followup_inner", [[META10:![0-9]+]]}
+; CHECK: [[META10]] = !{!"FollowupInner1"}
+; CHECK: [[META11]] = !{!"llvm.loop.interchange.followup_outer", [[META12:![0-9]+]]}
+; CHECK: [[META12]] = !{!"FollowupOuter1"}
 ;.



More information about the llvm-commits mailing list