[llvm] [MachinePipeliner] Limit the number of instructions to reduce compile time (PR #80840)

Tue Feb 6 06:37:57 PST 2024

https://github.com/ytmukai created https://github.com/llvm/llvm-project/pull/80840

Creating the data dependence graph (DDG) takes more than O(#instructions^2) time, so this change skips pipelining when a loop contains too many instructions. The default maximum number of instructions is 200 and can be changed with -pipeliner-max-insts.

The table below shows the time taken by the MachinePipeliner pass and its percentage of the total compile time, measured on a server processor with -ftime-trace.

| #instructions | time (ms) | % of total |
|--------------:|----------:|-----------:|
|           100 |         7 |        14% |
|           200 |        32 |        31% |
|           400 |       234 |        49% |
|           800 |     2,114 |        69% |

The test program is shown below; each line of the loop body contains four loads/stores.

```
void f(int n, int *a) {
  for (int i=0; i<n; i++) {
    a[a[i]] = a[a[i+10]];
    a[a[i]] = a[a[i+10]];
    ...
  }
}
```

>From a2baa9d9973fd9198d58053e04abd5df810bd55c Mon Sep 17 00:00:00 2001
From: Yuta Mukai <mukai.yuta at fujitsu.com>
Date: Wed, 24 Jan 2024 21:29:24 +0900
Subject: [PATCH] [MachinePipeliner] Limit the number of instructions to reduce
 compile time

Creating the data dependence graph (DDG) takes more than
O(#instructions^2) time, so this change skips pipelining when a loop
contains too many instructions. The default maximum number of
instructions is 200 and can be changed with -pipeliner-max-insts.

The table below shows the time taken by the MachinePipeliner pass and
its percentage of the total compile time, measured on a server
processor with -ftime-trace.

| #instructions | time (ms) | % of total |
|--------------:|----------:|-----------:|
|           100 |         7 |        14% |
|           200 |        32 |        31% |
|           400 |       234 |        49% |
|           800 |     2,114 |        69% |

The test program is shown below; each line of the loop body contains
four loads/stores.

```
void f(int n, int *a) {
  for (int i=0; i<n; i++) {
    a[a[i]] = a[a[i+10]];
    a[a[i]] = a[a[i+10]];
    ...
  }
}
```
---
 llvm/lib/CodeGen/MachinePipeliner.cpp         | 24 ++++++
 .../AArch64/sms-fail-too-many-insts.mir       | 79 +++++++++++++++++++
 2 files changed, 103 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/sms-fail-too-many-insts.mir

diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 2d2d0bffe2169b..897917264d094f 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -108,6 +108,8 @@ STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
 STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
 STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
 STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
+STATISTIC(NumFailLargeNumInsts,
+          "Pipeliner abort due to the number of instructions too large");
 STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
 STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
 STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
@@ -123,6 +125,12 @@ static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
                                       cl::desc("Enable SWP at Os."), cl::Hidden,
                                       cl::init(false));
 
+/// A command line argument to limit the number of instructions for pipelining.
+static cl::opt<unsigned> SwpMaxInsts(
+    "pipeliner-max-insts",
+    cl::desc("Maximum number of instructions in a loop for pipeliner."),
+    cl::Hidden, cl::init(200));
+
 /// A command line argument to limit minimum initial interval for pipelining.
 static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
                               cl::desc("Size limit for the MII."),
@@ -469,6 +477,22 @@ bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
        I != E; ++I, --size)
     ;
 
+  // Suspend before DAG computation if there are too many instructions
+  if (size > SwpMaxInsts) {
+    LLVM_DEBUG(dbgs() << "#Instructions > " << SwpMaxInsts
+                      << ", we don't pipeline large loops\n");
+    NumFailLargeNumInsts++;
+    ORE->emit([&]() {
+      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "schedule",
+                                               L.getStartLoc(), L.getHeader())
+             << "The number of instructions too large: "
+             << ore::NV("#Instructions", size) << " > "
+             << ore::NV("SwpMaxInsts", SwpMaxInsts) << "."
+             << "Refer to -pipeliner-max-insts.";
+    });
+    return false;
+  }
+
   SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
   SMS.schedule();
   SMS.exitRegion();
diff --git a/llvm/test/CodeGen/AArch64/sms-fail-too-many-insts.mir b/llvm/test/CodeGen/AArch64/sms-fail-too-many-insts.mir
new file mode 100644
index 00000000000000..240fc46b7f6a92
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-fail-too-many-insts.mir
@@ -0,0 +1,79 @@
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -stats -pipeliner-max-insts=1 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# Verify that the limit on the number of instructions works
+# CHECK: 1 pipeliner - Pipeliner abort due to the number of instructions too large
+
+--- |
+  define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 {
+  entry:
+    %cmp6 = icmp sgt i32 %n, 0
+    br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+  for.body.preheader:                               ; preds = %entry
+    %wide.trip.count = zext nneg i32 %n to i64
+    br label %for.body
+
+  for.cond.cleanup:                                 ; preds = %for.body, %entry
+    ret void
+
+  for.body:                                         ; preds = %for.body.preheader, %for.body
+    %lsr.iv11 = phi i64 [ %wide.trip.count, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+    %lsr.iv9 = phi ptr [ %b, %for.body.preheader ], [ %scevgep10, %for.body ]
+    %lsr.iv = phi ptr [ %a, %for.body.preheader ], [ %scevgep, %for.body ]
+    %0 = load float, ptr %lsr.iv9, align 4
+    %add = fadd float %0, 1.000000e+00
+    store float %add, ptr %lsr.iv, align 4
+    %scevgep = getelementptr i8, ptr %lsr.iv, i64 4
+    %scevgep10 = getelementptr i8, ptr %lsr.iv9, i64 4
+    %lsr.iv.next = add nsw i64 %lsr.iv11, -1
+    %exitcond.not = icmp eq i64 %lsr.iv.next, 0
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+
+...
+---
+name:            func
+tracksRegLiveness: true
+liveins:
+  - { reg: '$x0', virtual-reg: '%7' }
+  - { reg: '$x1', virtual-reg: '%8' }
+  - { reg: '$w2', virtual-reg: '%9' }
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x50000000), %bb.2(0x30000000)
+    liveins: $x0, $x1, $w2
+
+    %9:gpr32common = COPY $w2
+    %8:gpr64 = COPY $x1
+    %7:gpr64 = COPY $x0
+    dead $wzr = SUBSWri %9, 1, 0, implicit-def $nzcv
+    Bcc 11, %bb.2, implicit $nzcv
+    B %bb.1
+
+  bb.1.for.body.preheader:
+    %11:gpr32 = ORRWrs $wzr, %9, 0
+    %0:gpr64all = SUBREG_TO_REG 0, killed %11, %subreg.sub_32
+    %14:fpr32 = FMOVSi 112
+    B %bb.3
+
+  bb.2.for.cond.cleanup:
+    RET_ReallyLR
+
+  bb.3.for.body:
+    successors: %bb.2(0x04000000), %bb.3(0x7c000000)
+
+    %1:gpr64sp = PHI %0, %bb.1, %6, %bb.3
+    %2:gpr64sp = PHI %8, %bb.1, %5, %bb.3
+    %3:gpr64sp = PHI %7, %bb.1, %4, %bb.3
+    early-clobber %12:gpr64sp, %13:fpr32 = LDRSpost %2, 4 :: (load (s32) from %ir.lsr.iv9)
+    %15:fpr32 = nofpexcept FADDSrr killed %13, %14, implicit $fpcr
+    early-clobber %16:gpr64sp = STRSpost killed %15, %3, 4 :: (store (s32) into %ir.lsr.iv)
+    %4:gpr64all = COPY %16
+    %5:gpr64all = COPY %12
+    %17:gpr64 = nsw SUBSXri %1, 1, 0, implicit-def $nzcv
+    %6:gpr64all = COPY %17
+    Bcc 0, %bb.2, implicit $nzcv
+    B %bb.3
+
+...