[llvm] eac3487 - [LoopInterchange] Try to achieve the most optimal access pattern after interchange

Wed Apr 6 12:32:34 PDT 2022

Author: Congzhe Cao
Date: 2022-04-06T15:31:56-04:00
New Revision: eac34875109898ac01985f4afa937eec30c1c387

URL: https://github.com/llvm/llvm-project/commit/eac34875109898ac01985f4afa937eec30c1c387
DIFF: https://github.com/llvm/llvm-project/commit/eac34875109898ac01985f4afa937eec30c1c387.diff

LOG: [LoopInterchange] Try to achieve the most optimal access pattern after interchange

Motivated by pr43326 (https://bugs.llvm.org/show_bug.cgi?id=43326), where a slightly
modified case is as follows.

 void f(int e[10][10][10], int f[10][10][10]) {
   for (int a = 0; a < 10; a++)
     for (int b = 0; b < 10; b++)
       for (int c = 0; c < 10; c++)
         f[c][b][a] = e[c][b][a];
 }

The ideal optimal access pattern after running interchange is supposed to be the following

 void f(int e[10][10][10], int f[10][10][10]) {
   for (int c = 0;  c < 10; c++)
     for (int b = 0; b < 10; b++)
       for (int a = 0; a < 10; a++)
         f[c][b][a] = e[c][b][a];
 }

Currently loop interchange is limited to picking up the innermost loop and finding an order
that is locally optimal for it. However, the pass failed to produce the globally optimal
loop access order. For more complex examples what we get could be quite far from the
globally optimal ordering.

What is proposed in this patch is to do a "bubble-sort" fashion when doing interchange.
By comparing neighbors in `LoopList` in each iteration, we would be able to move each loop
onto a most appropriate place, hence this is an approach that tries to achieve the
globally optimal ordering.

The motivating example above is added as a test case.

Reviewed By: Meinersbur

Differential Revision: https://reviews.llvm.org/D120386

Added: 
    llvm/test/Transforms/LoopInterchange/pr43326-ideal-access-pattern.ll

Modified: 
    llvm/lib/Transforms/Scalar/LoopInterchange.cpp
    llvm/test/Transforms/LoopInterchange/phi-ordering.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index c59d39e9d7a1c..ced4396d5e798 100644

--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -266,26 +266,28 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
   return true;
 }
 
-static LoopVector populateWorklist(Loop &L) {
+static void populateWorklist(Loop &L, LoopVector &LoopList) {
   LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
                     << L.getHeader()->getParent()->getName() << " Loop: %"
                     << L.getHeader()->getName() << '\n');
-  LoopVector LoopList;
+  assert(LoopList.empty() && "LoopList should initially be empty!");
   Loop *CurrentLoop = &L;
   const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
   while (!Vec->empty()) {
     // The current loop has multiple subloops in it hence it is not tightly
     // nested.
     // Discard all loops above it added into Worklist.
-    if (Vec->size() != 1)
-      return {};
+    if (Vec->size() != 1) {
+      LoopList = {};
+      return;
+    }
 
     LoopList.push_back(CurrentLoop);
     CurrentLoop = Vec->front();
     Vec = &CurrentLoop->getSubLoops();
   }
   LoopList.push_back(CurrentLoop);
-  return LoopList;
+  return;
 }
 
 namespace {
@@ -419,12 +421,13 @@ struct LoopInterchange {
   bool run(Loop *L) {
     if (L->getParentLoop())
       return false;
-
-    return processLoopList(populateWorklist(*L));
+    SmallVector<Loop *, 8> LoopList;
+    populateWorklist(*L, LoopList);
+    return processLoopList(LoopList);
   }
 
   bool run(LoopNest &LN) {
-    const auto &LoopList = LN.getLoops();
+    SmallVector<Loop *, 8> LoopList(LN.getLoops().begin(), LN.getLoops().end());
     for (unsigned I = 1; I < LoopList.size(); ++I)
       if (LoopList[I]->getParentLoop() != LoopList[I - 1])
         return false;
@@ -456,7 +459,7 @@ struct LoopInterchange {
     return LoopList.size() - 1;
   }
 
-  bool processLoopList(ArrayRef<Loop *> LoopList) {
+  bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
     bool Changed = false;
     unsigned LoopNestDepth = LoopList.size();
     if (LoopNestDepth < 2) {
@@ -496,20 +499,32 @@ struct LoopInterchange {
     }
 
     unsigned SelecLoopId = selectLoopForInterchange(LoopList);
-    // Move the selected loop outwards to the best possible position.
-    Loop *LoopToBeInterchanged = LoopList[SelecLoopId];
-    for (unsigned i = SelecLoopId; i > 0; i--) {
-      bool Interchanged = processLoop(LoopToBeInterchanged, LoopList[i - 1], i,
-                                      i - 1, DependencyMatrix);
-      if (!Interchanged)
-        return Changed;
-      // Update the DependencyMatrix
-      interChangeDependencies(DependencyMatrix, i, i - 1);
+    // We try to achieve the globally optimal memory access for the loopnest,
+    // and do interchange based on a bubble-sort fasion. We start from
+    // the innermost loop, move it outwards to the best possible position
+    // and repeat this process.
+    for (unsigned j = SelecLoopId; j > 0; j--) {
+      bool ChangedPerIter = false;
+      for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
+        bool Interchanged = processLoop(LoopList[i], LoopList[i - 1], i, i - 1,
+                                        DependencyMatrix);
+        if (!Interchanged)
+          continue;
+        // Loops interchanged, update LoopList accordingly.
+        std::swap(LoopList[i - 1], LoopList[i]);
+        // Update the DependencyMatrix
+        interChangeDependencies(DependencyMatrix, i, i - 1);
 #ifdef DUMP_DEP_MATRICIES
-      LLVM_DEBUG(dbgs() << "Dependence after interchange\n");
-      printDepMatrix(DependencyMatrix);
+        LLVM_DEBUG(dbgs() << "Dependence after interchange\n");
+        printDepMatrix(DependencyMatrix);
 #endif
-      Changed |= Interchanged;
+        ChangedPerIter |= Interchanged;
+        Changed |= Interchanged;
+      }
+      // Early abort if there was no interchange during an entire round of
+      // moving loops outwards.
+      if (!ChangedPerIter)
+        break;
     }
     return Changed;
   }
@@ -1419,9 +1434,13 @@ static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
 
     // Incoming values are guaranteed be instructions currently.
     auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch));
+    // In case of multi-level nested loops, follow LCSSA to find the incoming
+    // value defined from the innermost loop.
+    auto IncIInnerMost = cast<Instruction>(followLCSSA(IncI));
     // Skip phis with incoming values from the inner loop body, excluding the
     // header and latch.
-    if (IncI->getParent() != InnerLatch && IncI->getParent() != InnerHeader)
+    if (IncIInnerMost->getParent() != InnerLatch &&
+        IncIInnerMost->getParent() != InnerHeader)
       continue;
 
     assert(all_of(P.users(),

diff  --git a/llvm/test/Transforms/LoopInterchange/phi-ordering.ll b/llvm/test/Transforms/LoopInterchange/phi-ordering.ll
index 00dc5c3d74dc7..78e333b83d3a4 100644
--- a/llvm/test/Transforms/LoopInterchange/phi-ordering.ll
+++ b/llvm/test/Transforms/LoopInterchange/phi-ordering.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -loop-interchange-threshold=-1000 -S 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -loop-interchange-threshold=0 -S 2>&1 | FileCheck %s
 ;; Checks the order of the inner phi nodes does not cause havoc.
 ;; The inner loop has a reduction into c. The IV is not the first phi.
 
@@ -9,7 +9,7 @@ target triple = "armv8--linux-gnueabihf"
 
 
 ; Function Attrs: norecurse nounwind
-define void @test(i32 %T, [90 x i32]* noalias nocapture %C, i16* noalias nocapture readonly %A, i16* noalias nocapture readonly %B) local_unnamed_addr #0 {
+define void @test(i32 %T, [90 x i32]* noalias nocapture %C, [90 x [90 x i16]]* noalias nocapture readonly %A, i16* noalias nocapture readonly %B) local_unnamed_addr #0 {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR3_PREHEADER:%.*]]
@@ -31,7 +31,7 @@ define void @test(i32 %T, [90 x i32]* noalias nocapture %C, i16* noalias nocaptu
 ; CHECK-NEXT:    br label [[FOR1_HEADER_PREHEADER]]
 ; CHECK:       for3.split1:
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[K]], [[MUL]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[A:%.*]], i32 [[ADD]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [90 x [90 x i16]], [90 x [90 x i16]]* [[A:%.*]], i32 [[ADD]], i32 [[J]], i32 [[I]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
 ; CHECK-NEXT:    [[ADD15:%.*]] = add nsw i16 [[TMP0]], 1
 ; CHECK-NEXT:    store i16 [[ADD15]], i16* [[ARRAYIDX]]
@@ -70,7 +70,7 @@ for2.header:                                  ; preds = %for2.inc16, %for1.heade
 for3:                                        ; preds = %for3, %for2.header
   %k = phi i32 [ 1, %for2.header ], [ %inc, %for3 ]
   %add = add nsw i32 %k, %mul
-  %arrayidx = getelementptr inbounds i16, i16* %A, i32 %add
+  %arrayidx = getelementptr inbounds [90 x [90 x i16]], [90 x [90 x i16]]* %A, i32 %add, i32 %j, i32 %i
   %0 = load i16, i16* %arrayidx, align 2
   %add15 = add nsw i16 %0, 1
   store i16 %add15, i16* %arrayidx

diff  --git a/llvm/test/Transforms/LoopInterchange/pr43326-ideal-access-pattern.ll b/llvm/test/Transforms/LoopInterchange/pr43326-ideal-access-pattern.ll
new file mode 100644
index 0000000000000..443f61fbe2a0a
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/pr43326-ideal-access-pattern.ll
@@ -0,0 +1,64 @@
+; RUN: opt < %s -basic-aa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \
+; RUN:     -verify-dom-info -verify-loop-info -verify-loop-lcssa -stats 2>&1
+; RUN: FileCheck --input-file=%t --check-prefix=REMARKS %s
+
+; Triply nested loop, should be able to do interchange three times
+; to get the ideal access pattern.
+; void f(int e[10][10][10], int f[10][10][10]) {
+;   for (int a = 0; a < 10; a++) {
+;     for (int b = 0; b < 10; b++) {
+;       for (int c = 0; c < 10; c++) {
+;         f[c][b][a] = e[c][b][a];
+;       }
+;     }
+;   }
+; }
+
+; REMARKS: --- !Passed
+; REMARKS-NEXT: Pass:            loop-interchange
+; REMARKS-NEXT: Name:            Interchanged
+; REMARKS-NEXT: Function:        pr43326-triply-nested
+; REMARKS: --- !Passed
+; REMARKS-NEXT: Pass:            loop-interchange
+; REMARKS-NEXT: Name:            Interchanged
+; REMARKS-NEXT: Function:        pr43326-triply-nested
+; REMARKS: --- !Passed
+; REMARKS-NEXT: Pass:            loop-interchange
+; REMARKS-NEXT: Name:            Interchanged
+; REMARKS-NEXT: Function:        pr43326-triply-nested
+
+define void @pr43326-triply-nested([10 x [10 x i32]]* %e, [10 x [10 x i32]]* %f) {
+entry:
+  br label %for.outermost.header
+
+for.outermost.header:                              ; preds = %entry, %for.outermost.latch
+  %indvars.outermost = phi i64 [ 0, %entry ], [ %indvars.outermost.next, %for.outermost.latch ]
+  br label %for.middle.header
+
+for.cond.cleanup:                                 ; preds = %for.outermost.latch
+  ret void
+
+for.middle.header:                              ; preds = %for.outermost.header, %for.middle.latch
+  %indvars.middle = phi i64 [ 0, %for.outermost.header ], [ %indvars.middle.next, %for.middle.latch ]
+  br label %for.innermost
+
+for.outermost.latch:                                ; preds = %for.middle.latch
+  %indvars.outermost.next = add nuw nsw i64 %indvars.outermost, 1
+  %exitcond.outermost = icmp ne i64 %indvars.outermost.next, 10
+  br i1 %exitcond.outermost, label %for.outermost.header, label %for.cond.cleanup
+
+for.middle.latch:                                ; preds = %for.innermost
+  %indvars.middle.next = add nuw nsw i64 %indvars.middle, 1
+  %exitcond.middle = icmp ne i64 %indvars.middle.next, 10
+  br i1 %exitcond.middle, label %for.middle.header, label %for.outermost.latch
+
+for.innermost:                                        ; preds = %for.middle.header, %for.innermost
+  %indvars.innermost = phi i64 [ 0, %for.middle.header ], [ %indvars.innermost.next, %for.innermost ]
+  %arrayidx12 = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* %e, i64 %indvars.innermost, i64 %indvars.middle, i64 %indvars.outermost
+  %0 = load i32, i32* %arrayidx12
+  %arrayidx18 = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* %f, i64 %indvars.innermost, i64 %indvars.middle, i64 %indvars.outermost
+  store i32 %0, i32* %arrayidx18
+  %indvars.innermost.next = add nuw nsw i64 %indvars.innermost, 1
+  %exitcond.innermost = icmp ne i64 %indvars.innermost.next, 10
+  br i1 %exitcond.innermost, label %for.innermost, label %for.middle.latch
+}
\ No newline at end of file