[llvm] [MachinePipeliner] Limit the number of instructions to reduce compile time (PR #80840)

Tue Feb 6 06:38:29 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-backend-aarch64

Author: Yuta Mukai (ytmukai)

<details>
<summary>Changes</summary>

Building the data dependence graph (DDG) takes more than O(#instructions^2) time, so this change skips pipelining when a loop contains too many instructions. The default limit is 200 instructions and can be changed with `-pipeliner-max-insts`.

The table below shows the time taken by the MachinePipeliner pass and its percentage of the total time, measured on a server processor with `-ftime-trace`.

| #instructions | time (ms) | % of total |
|--------------:|----------:|-----------:|
|           100 |         7 |        14% |
|           200 |        32 |        31% |
|           400 |       234 |        49% |
|           800 |     2,114 |        69% |

The test program is shown below; each line of the loop body performs four memory accesses (three loads and one store).

```
void f(int n, int *a) {
  for (int i=0; i<n; i++) {
    a[a[i]] = a[a[i+10]];
    a[a[i]] = a[a[i+10]];
    ...
  }
}
```

---
Full diff: https://github.com/llvm/llvm-project/pull/80840.diff


2 Files Affected:

- (modified) llvm/lib/CodeGen/MachinePipeliner.cpp (+24) 
- (added) llvm/test/CodeGen/AArch64/sms-fail-too-many-insts.mir (+79) 


``````````diff

diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 2d2d0bffe2169..897917264d094 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -108,6 +108,8 @@ STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
 STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
 STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
 STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
+STATISTIC(NumFailLargeNumInsts,
+          "Pipeliner abort due to the number of instructions too large");
 STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
 STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
 STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
@@ -123,6 +125,12 @@ static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
                                       cl::desc("Enable SWP at Os."), cl::Hidden,
                                       cl::init(false));
 
+/// A command line argument to limit the number of instructions for pipelining.
+static cl::opt<unsigned> SwpMaxInsts(
+    "pipeliner-max-insts",
+    cl::desc("Maximum number of instructions in a loop for pipeliner."),
+    cl::Hidden, cl::init(200));
+
 /// A command line argument to limit minimum initial interval for pipelining.
 static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
                               cl::desc("Size limit for the MII."),
@@ -469,6 +477,22 @@ bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
        I != E; ++I, --size)
     ;
 
+  // Suspend before DAG computation if there are too many instructions
+  if (size > SwpMaxInsts) {
+    LLVM_DEBUG(dbgs() << "#Instructions > " << SwpMaxInsts
+                      << ", we don't pipeline large loops\n");
+    NumFailLargeNumInsts++;
+    ORE->emit([&]() {
+      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "schedule",
+                                               L.getStartLoc(), L.getHeader())
+             << "The number of instructions too large: "
+             << ore::NV("#Instructions", size) << " > "
+             << ore::NV("SwpMaxInsts", SwpMaxInsts) << "."
+             << "Refer to -pipeliner-max-insts.";
+    });
+    return false;
+  }
+
   SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
   SMS.schedule();
   SMS.exitRegion();
diff --git a/llvm/test/CodeGen/AArch64/sms-fail-too-many-insts.mir b/llvm/test/CodeGen/AArch64/sms-fail-too-many-insts.mir
new file mode 100644
index 0000000000000..240fc46b7f6a9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-fail-too-many-insts.mir
@@ -0,0 +1,79 @@
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -stats -pipeliner-max-insts=1 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# Verify that the limit on the number of instructions works
+# CHECK: 1 pipeliner - Pipeliner abort due to the number of instructions too large
+
+--- |
+  define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 {
+  entry:
+    %cmp6 = icmp sgt i32 %n, 0
+    br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+  for.body.preheader:                               ; preds = %entry
+    %wide.trip.count = zext nneg i32 %n to i64
+    br label %for.body
+
+  for.cond.cleanup:                                 ; preds = %for.body, %entry
+    ret void
+
+  for.body:                                         ; preds = %for.body.preheader, %for.body
+    %lsr.iv11 = phi i64 [ %wide.trip.count, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+    %lsr.iv9 = phi ptr [ %b, %for.body.preheader ], [ %scevgep10, %for.body ]
+    %lsr.iv = phi ptr [ %a, %for.body.preheader ], [ %scevgep, %for.body ]
+    %0 = load float, ptr %lsr.iv9, align 4
+    %add = fadd float %0, 1.000000e+00
+    store float %add, ptr %lsr.iv, align 4
+    %scevgep = getelementptr i8, ptr %lsr.iv, i64 4
+    %scevgep10 = getelementptr i8, ptr %lsr.iv9, i64 4
+    %lsr.iv.next = add nsw i64 %lsr.iv11, -1
+    %exitcond.not = icmp eq i64 %lsr.iv.next, 0
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+
+...
+---
+name:            func
+tracksRegLiveness: true
+liveins:
+  - { reg: '$x0', virtual-reg: '%7' }
+  - { reg: '$x1', virtual-reg: '%8' }
+  - { reg: '$w2', virtual-reg: '%9' }
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x50000000), %bb.2(0x30000000)
+    liveins: $x0, $x1, $w2
+
+    %9:gpr32common = COPY $w2
+    %8:gpr64 = COPY $x1
+    %7:gpr64 = COPY $x0
+    dead $wzr = SUBSWri %9, 1, 0, implicit-def $nzcv
+    Bcc 11, %bb.2, implicit $nzcv
+    B %bb.1
+
+  bb.1.for.body.preheader:
+    %11:gpr32 = ORRWrs $wzr, %9, 0
+    %0:gpr64all = SUBREG_TO_REG 0, killed %11, %subreg.sub_32
+    %14:fpr32 = FMOVSi 112
+    B %bb.3
+
+  bb.2.for.cond.cleanup:
+    RET_ReallyLR
+
+  bb.3.for.body:
+    successors: %bb.2(0x04000000), %bb.3(0x7c000000)
+
+    %1:gpr64sp = PHI %0, %bb.1, %6, %bb.3
+    %2:gpr64sp = PHI %8, %bb.1, %5, %bb.3
+    %3:gpr64sp = PHI %7, %bb.1, %4, %bb.3
+    early-clobber %12:gpr64sp, %13:fpr32 = LDRSpost %2, 4 :: (load (s32) from %ir.lsr.iv9)
+    %15:fpr32 = nofpexcept FADDSrr killed %13, %14, implicit $fpcr
+    early-clobber %16:gpr64sp = STRSpost killed %15, %3, 4 :: (store (s32) into %ir.lsr.iv)
+    %4:gpr64all = COPY %16
+    %5:gpr64all = COPY %12
+    %17:gpr64 = nsw SUBSXri %1, 1, 0, implicit-def $nzcv
+    %6:gpr64all = COPY %17
+    Bcc 0, %bb.2, implicit $nzcv
+    B %bb.3
+
+...

``````````

</details>


https://github.com/llvm/llvm-project/pull/80840