[llvm] [LoopInterchange] Relax the legality check to accept more patterns (PR #118267)

Sun Dec 1 23:22:55 PST 2024

https://github.com/kasuga-fj created https://github.com/llvm/llvm-project/pull/118267

We lose opportunities to interchange loops because the current legality check is stricter than necessary. This patch relaxes the restriction and increases the number of acceptable patterns. Here is a motivating example.

```
for (int nl=0;nl<100;nl++) {
  for (int i=0;i<256;i++) {
    for (int j=1;j<256;j++)
      aa[j][i] = aa[j-1][i] + bb[j][i];
  }
  dummy(aa, bb);
}
```

This patch allows us to interchange the two innermost loops. Note, however, that the current implementation interchanges these loops twice, so they end up back in their original order.

Related issue: https://github.com/llvm/llvm-project/issues/71519

>From e5f6b7d0e682c3ed797a40d2a45b7fcb879ef31e Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Wed, 27 Nov 2024 12:32:24 +0000
Subject: [PATCH] [LoopInterchange] Relax the legality check to accept more
 patterns

We lose opportunities to interchange loops because the current legality
check is stricter than necessary. This patch relaxes the restriction and
increases the number of acceptable patterns. Here is a motivating
example.

```
for (int nl=0;nl<100;nl++) {
  for (int i=0;i<256;i++) {
    for (int j=1;j<256;j++)
      aa[j][i] = aa[j-1][i] + bb[j][i];
  }
  dummy(aa, bb);
}
```

This patch allows us to interchange the two innermost loops. Note,
however, that the current implementation interchanges these loops twice,
so they end up back in their original order.

Related issue: https://github.com/llvm/llvm-project/issues/71519
---
 .../lib/Transforms/Scalar/LoopInterchange.cpp | 39 +++++++---
 .../direction-vector-legality-negative.ll     | 74 +++++++++++++++++++
 .../direction-vector-legality-none.ll         | 54 ++++++++++++++
 .../direction-vector-legality-opposite.ll     | 53 +++++++++++++
 4 files changed, 211 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopInterchange/direction-vector-legality-negative.ll
 create mode 100644 llvm/test/Transforms/LoopInterchange/direction-vector-legality-none.ll
 create mode 100644 llvm/test/Transforms/LoopInterchange/direction-vector-legality-opposite.ll

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index a0c0080c0bda1c..29a53a258f4ca1 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -64,6 +64,15 @@ using LoopVector = SmallVector<Loop *, 8>;
 // TODO: Check if we can use a sparse matrix here.
 using CharMatrix = std::vector<std::vector<char>>;
 
+// Classify direction vectors according to the leftmost non-"=" direction. "S"
+// and "I" are treated the same as "=".
+enum class DirectionVectorOrder {
+  Zero,     ///< The direction vector consists only of "=", "S", and "I".
+  Positive, ///< The leftmost non-"=" direction is "<".
+  Negative, ///< The leftmost non-"=" direction is ">".
+  All,      ///< The leftmost non-"=" direction is "*".
+};
+
 } // end anonymous namespace
 
 // Maximum number of dependencies that can be handled in the dependency matrix.
@@ -185,15 +194,25 @@ static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
 // After interchanging, check if the direction vector is valid.
 // [Theorem] A permutation of the loops in a perfect nest is legal if and only
 // if the direction matrix, after the same permutation is applied to its
-// columns, has no ">" direction as the leftmost non-"=" direction in any row.
-static bool isLexicographicallyPositive(std::vector<char> &DV) {
+// columns, satisfies one of the following conditions in every row.
+//
+// - The row consists only of "=", "S", and "I".
+// - The leftmost direction in the row that is not "=", "S", or "I" is
+//   "<" or ">", and it is the same before and after the permutation is
+//   applied.
+static DirectionVectorOrder
+calcDirectionVectorOrder(const std::vector<char> &DV) {
   for (unsigned char Direction : DV) {
-    if (Direction == '<')
-      return true;
-    if (Direction == '>' || Direction == '*')
-      return false;
+    switch (Direction) {
+    case '<':
+      return DirectionVectorOrder::Positive;
+    case '>':
+      return DirectionVectorOrder::Negative;
+    case '*':
+      return DirectionVectorOrder::All;
+    }
   }
-  return true;
+  return DirectionVectorOrder::Zero;
 }
 
 // Checks if it is legal to interchange 2 loops.
@@ -207,10 +226,12 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
     // Create temporary DepVector check its lexicographical order
     // before and after swapping OuterLoop vs InnerLoop
     Cur = DepMatrix[Row];
-    if (!isLexicographicallyPositive(Cur))
+    auto OrderBefore = calcDirectionVectorOrder(Cur);
+    if (OrderBefore == DirectionVectorOrder::All)
       return false;
     std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
-    if (!isLexicographicallyPositive(Cur))
+    auto OrderAfter = calcDirectionVectorOrder(Cur);
+    if (OrderBefore != OrderAfter)
       return false;
   }
   return true;
diff --git a/llvm/test/Transforms/LoopInterchange/direction-vector-legality-negative.ll b/llvm/test/Transforms/LoopInterchange/direction-vector-legality-negative.ll
new file mode 100644
index 00000000000000..b81a2c96c9cb72
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/direction-vector-legality-negative.ll
@@ -0,0 +1,74 @@
+; REQUIRES: asserts
+; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -verify-dom-info -verify-loop-info \
+; RUN:     -S -debug 2>&1 | FileCheck %s
+
+@aa = dso_local global [256 x [256 x float]] zeroinitializer, align 64
+@bb = dso_local global [256 x [256 x float]] zeroinitializer, align 64
+
+declare i32 @dummy(ptr noundef, ptr noundef)
+
+;;  for (int nl=0;nl<100;++nl) {
+;;    for (int i=0;i<256;++i) {
+;;      for (int j=1;j<256;++j)
+;;        aa[j][i] = aa[j-1][i] + bb[j][i];
+;;    }
+;;    dummy(aa, bb);
+;;  }
+;;
+;; The direction vector of `aa` is [S = >]. We can swap the innermost two
+;; loops. The direction vector after interchanging will be [S > =].
+
+; CHECK: Dependency matrix before interchange:
+; CHECK-NEXT: S = >
+; CHECK-NEXT: S = =
+; CHECK-NEXT: Processing InnerLoopId = 2 and OuterLoopId = 1
+; CHECK-NEXT: Checking if loops are tightly nested
+; CHECK-NEXT: Checking instructions in Loop header and Loop latch
+; CHECK-NEXT: Loops are perfectly nested
+; CHECK-NEXT: Loops are legal to interchange
+; CHECK: Dependency matrix after interchange:
+; CHECK-NEXT: S > =
+; CHECK-NEXT: S = =
+
+define void @f() {
+entry:
+  br label %for.cond1.preheader
+
+; Loop:
+for.cond1.preheader:                              ; preds = %entry, %for.cond.cleanup3
+  %nl.036 = phi i32 [ 0, %entry ], [ %inc23, %for.cond.cleanup3 ]
+  br label %for.cond5.preheader
+
+for.cond.cleanup3:                                ; preds = %for.cond.cleanup7
+  %call = tail call i32 @dummy(ptr noundef nonnull @aa, ptr noundef nonnull @bb)
+  %inc23 = add nuw nsw i32 %nl.036, 1
+  %exitcond43 = icmp ne i32 %inc23, 100
+  br i1 %exitcond43, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond.cleanup7:                                ; preds = %for.body8
+  %indvars.iv.next40 = add nuw nsw i64 %indvars.iv39, 1
+  %exitcond42 = icmp ne i64 %indvars.iv.next40, 256
+  br i1 %exitcond42, label %for.cond5.preheader, label %for.cond.cleanup3
+
+for.body8:                                        ; preds = %for.cond5.preheader, %for.body8
+  %indvars.iv = phi i64 [ 1, %for.cond5.preheader ], [ %indvars.iv.next, %for.body8 ]
+  %0 = add nsw i64 %indvars.iv, -1
+  %arrayidx10 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %0, i64 %indvars.iv39
+  %1 = load float, ptr %arrayidx10, align 4
+  %arrayidx14 = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 %indvars.iv, i64 %indvars.iv39
+  %2 = load float, ptr %arrayidx14, align 4
+  %add = fadd fast float %2, %1
+  %arrayidx18 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %indvars.iv, i64 %indvars.iv39
+  store float %add, ptr %arrayidx18, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 256
+  br i1 %exitcond, label %for.body8, label %for.cond.cleanup7
+
+for.cond5.preheader:                              ; preds = %for.cond1.preheader, %for.cond.cleanup7
+  %indvars.iv39 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next40, %for.cond.cleanup7 ]
+  br label %for.body8
+
+; Exit blocks
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup3
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopInterchange/direction-vector-legality-none.ll b/llvm/test/Transforms/LoopInterchange/direction-vector-legality-none.ll
new file mode 100644
index 00000000000000..6b131a74148a08
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/direction-vector-legality-none.ll
@@ -0,0 +1,54 @@
+; REQUIRES: asserts
+; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -verify-dom-info -verify-loop-info \
+; RUN:     -S -debug 2>&1 | FileCheck %s
+
+@aa = dso_local global [256 x [256 x float]] zeroinitializer, align 64
+@bb = dso_local global [256 x [256 x float]] zeroinitializer, align 64
+
+;;  for (int i=0;i<256;++i)
+;;    for (int j=1;j<256;++j)
+;;      aa[j][i] = aa[j-1][255-i] + bb[j][i];
+;;
+;; The direction vector of `aa` is [* >]. We cannot interchange the loops
+;; because we must handle a `*` dependence conservatively.
+
+; CHECK: Dependency matrix before interchange:
+; CHECK-NEXT: * >
+; CHECK-NEXT: Processing InnerLoopId = 1 and OuterLoopId = 0
+; CHECK-NEXT: Failed interchange InnerLoopId = 1 and OuterLoopId = 0 due to dependence
+; CHECK-NEXT: Not interchanging loops. Cannot prove legality.
+
+define void @f() {
+; Preheader:
+entry:
+  br label %for.cond1.preheader
+
+; Loop:
+for.cond1.preheader:                              ; preds = %entry, %for.cond.cleanup3
+  %indvars.iv31 = phi i64 [ 0, %entry ], [ %indvars.iv.next32, %for.cond.cleanup3 ]
+  %0 = sub nuw nsw i64 255, %indvars.iv31
+  br label %for.body4
+
+for.cond.cleanup3:                                ; preds = %for.body4
+  %indvars.iv.next32 = add nuw nsw i64 %indvars.iv31, 1
+  %exitcond35 = icmp ne i64 %indvars.iv.next32, 256
+  br i1 %exitcond35, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.body4:                                        ; preds = %for.cond1.preheader, %for.body4
+  %indvars.iv = phi i64 [ 1, %for.cond1.preheader ], [ %indvars.iv.next, %for.body4 ]
+  %1 = add nsw i64 %indvars.iv, -1
+  %arrayidx7 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %1, i64 %0
+  %2 = load float, ptr %arrayidx7, align 4
+  %arrayidx11 = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 %indvars.iv, i64 %indvars.iv31
+  %3 = load float, ptr %arrayidx11, align 4
+  %add = fadd fast float %3, %2
+  %arrayidx15 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %indvars.iv, i64 %indvars.iv31
+  store float %add, ptr %arrayidx15, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 256
+  br i1 %exitcond, label %for.body4, label %for.cond.cleanup3
+
+; Exit blocks
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup3
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopInterchange/direction-vector-legality-opposite.ll b/llvm/test/Transforms/LoopInterchange/direction-vector-legality-opposite.ll
new file mode 100644
index 00000000000000..978b21e195101f
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/direction-vector-legality-opposite.ll
@@ -0,0 +1,53 @@
+; REQUIRES: asserts
+; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -verify-dom-info -verify-loop-info \
+; RUN:     -S -debug 2>&1 | FileCheck %s
+
+@aa = dso_local global [256 x [256 x float]] zeroinitializer, align 64
+@bb = dso_local global [256 x [256 x float]] zeroinitializer, align 64
+
+;;  for (int i=0;i<255;++i)
+;;    for (int j=1;j<256;++j)
+;;      aa[j][i] = aa[j-1][i+1] + bb[j][i];
+;;
+;; The direction vector of `aa` is [< >]. We cannot interchange the loops
+;; because the read/write order for `aa` cannot be changed.
+
+; CHECK: Dependency matrix before interchange:
+; CHECK-NEXT: < >
+; CHECK-NEXT: Processing InnerLoopId = 1 and OuterLoopId = 0
+; CHECK-NEXT: Failed interchange InnerLoopId = 1 and OuterLoopId = 0 due to dependence
+; CHECK-NEXT: Not interchanging loops. Cannot prove legality.
+
+define void @f() {
+; Preheader:
+entry:
+  br label %for.cond1.preheader
+
+; Loop:
+for.cond1.preheader:                              ; preds = %entry, %for.cond.cleanup3
+  %indvars.iv31 = phi i64 [ 0, %entry ], [ %indvars.iv.next32, %for.cond.cleanup3 ]
+  %indvars.iv.next32 = add nuw nsw i64 %indvars.iv31, 1
+  br label %for.body4
+
+for.cond.cleanup3:                                ; preds = %for.body4
+  %exitcond34 = icmp ne i64 %indvars.iv.next32, 255
+  br i1 %exitcond34, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.body4:                                        ; preds = %for.cond1.preheader, %for.body4
+  %indvars.iv = phi i64 [ 1, %for.cond1.preheader ], [ %indvars.iv.next, %for.body4 ]
+  %0 = add nsw i64 %indvars.iv, -1
+  %arrayidx6 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %0, i64 %indvars.iv.next32
+  %1 = load float, ptr %arrayidx6, align 4
+  %arrayidx10 = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 %indvars.iv, i64 %indvars.iv31
+  %2 = load float, ptr %arrayidx10, align 4
+  %add11 = fadd fast float %2, %1
+  %arrayidx15 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %indvars.iv, i64 %indvars.iv31
+  store float %add11, ptr %arrayidx15, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 256
+  br i1 %exitcond, label %for.body4, label %for.cond.cleanup3
+
+; Exit blocks
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup3
+  ret void
+}