[llvm] [CodeGen] Limit mem ops checks count for reasonable compilation speed (PR #147151)

Sat Jul 5 07:41:26 PDT 2025

https://github.com/ivafanas created https://github.com/llvm/llvm-project/pull/147151

We've got a ~5-hour compilation of the HistogramGIFFTMap.cpp file from the Firefox project with a custom out-of-tree backend. The time-consuming part is the analysis of thousands of memory instructions with thousands of memory operands each.

The proposed fix is to limit the number of memory operand checks in places where it is possible to fall back to a conservative answer. With the fix applied, compilation takes ~0.3 sec.

**Details:**

It happens in a huge switch construct with ~1000 cases. The root cause is an interaction between the `BranchFolder` optimization called inside the post-RA `IfConverter` pass and the `MachineBlockPlacement` pass:

1. `BranchFolder` extracts identical store instructions into a block from its predecessors (~1000 predecessors).
2. Memory operands are merged for the extracted store instructions. So, the MIR contains 1 block with store instructions, each of which carries ~1000 memory operands.
3. The `MachineBlockPlacement` pass decides to tail merge these instructions back into the predecessors. So, the MIR contains ~1000 blocks with store instructions, each of which carries ~1000 memory operands.

After that, analysis of memory instructions becomes really time-consuming.

In MIR it looks like the following.

MIR before `IfConverter`:

```
bb.2.sw.bb:
; predecessors: %bb.1
  successors: %bb.1019(0x80000000); %bb.1019(100.00%)
  liveins: $dr0
  $r1 = ADDs 0, 1
  $r3 = ADDs 0, 0
  STB $r1, $dr0, 4 :: (store (s8) into %ir.mIsSome.i.i.i, align 4, !alias.scope !4)
  STW $r3, $dr0, 0 :: (store (s32) into %ir.agg.result, !alias.scope !4)
  IBRANCH %bb.1019

bb.3.sw.bb1:
; predecessors: %bb.1
  successors: %bb.1019(0x80000000); %bb.1019(100.00%)
  liveins: $dr0
  $r1 = ADDs 0, 1
  STB $r1, $dr0, 4 :: (store (s8) into %ir.mIsSome.i.i.i2034, align 4, !alias.scope !7)
  STW $r1, $dr0, 0 :: (store (s32) into %ir.agg.result, !alias.scope !7)
  IBRANCH %bb.1019

bb.4.sw.bb3:
; predecessors: %bb.1
  successors: %bb.1019(0x80000000); %bb.1019(100.00%)
  liveins: $dr0
  $r1 = ADDs 0, 1
  $r3 = ADDs 0, 13
  STB $r1, $dr0, 4 :: (store (s8) into %ir.mIsSome.i.i.i2035, align 4, !alias.scope !10)
  STW $r3, $dr0, 0 :: (store (s32) into %ir.agg.result, !alias.scope !10)
  IBRANCH %bb.1019

...

bb.1019.return:
; predecessors: %bb.1008, %bb.1007, %bb.1006, ... ; TOO MANY PREDECESSORS

  RETURN
```

After `BranchFolder` run from `IfConverter`:

```
bb.1.sw.bb:
; predecessors: %bb.0
  successors: %bb.109(0x80000000); %bb.109(100.00%)
  liveins: $dr0
  $r1 = ADDs 0, 1
  $r3 = ADDs 0, 0
  IBRANCH %bb.109

bb.2.sw.bb1:
; predecessors: %bb.0
  successors: %bb.1017(0x80000000); %bb.1017(100.00%)
  liveins: $dr0
  $r1 = ADDs 0, 1
  STB $r1, $dr0, 4 :: (store (s8) into %ir.mIsSome.i.i.i2034, align 4, !alias.scope !7)
  STW $r1, $dr0, 0 :: (store (s32) into %ir.agg.result, !alias.scope !7)
  IBRANCH %bb.1017

bb.3.sw.bb3:
; predecessors: %bb.0
  successors: %bb.109(0x80000000); %bb.109(100.00%)
  liveins: $dr0
  $r1 = ADDs 0, 1
  $r3 = ADDs 0, 13
  IBRANCH %bb.109

...

bb.109.return:
; predecessors: %bb.1008, %bb.1007, %bb.1006, ... ; TOO MANY PREDECESSORS
  successors: %bb.1017(0x80000000); %bb.1017(100.00%)
  liveins: $r3, $dr0, $r1
   STB $r1, $dr0, 4 :: (store (s8) into %ir.mIsSome.i.i.i2140, align 4, !alias.scope !325), (store (s8) into %ir.mIsSome.i.i.i2289, align 4, !alias.scope !772), (store (s8) into %ir.mIsSome.i.i.i2288, align 4, !alias.scope !769), (store (s8) into %ir.mIsSome.i.i.i2287, align 4, !alias.scope !766), (store (s8) into %ir.mIsSome.i.i.i2286, align 4, !alias.scope !763), (store (s8) into %ir.mIsSome.i.i.i2285, align 4,  ... ; TOO MANY MEM OPERANDS
  STW $r3, $dr0, 0 :: (store (s32) into %ir.agg.result, !alias.scope !325), (store (s32) into %ir.agg.result, !alias.scope !772), (store (s32) into %ir.agg.result, !alias.scope !769), (store (s32) into %ir.agg.result, !alias.scope !766), (store (s32) into %ir.agg.result, !alias.scope !763), (store (s32) into %ir.agg.result, !alias.scope !760), (store (s32) into %ir.agg.result, !alias.scope !757), (store (s32) into %ir.agg.result, !alias.scope !754),   ... ; TOO MANY MEM OPERANDS
  RETURN

```

And after `MachineBlockPlacement` pass:

```

bb.17.sw.bb31:
; predecessors: %bb.0
  liveins: $dr0
  $r1 = ADDs 0, 1
  $r3 = ADDs 0, 360
  STB $r1, $dr0, 4 :: (store (s8) into %ir.mIsSome.i.i.i2140, align 4, !alias.scope !325), (store (s8) into %ir.mIsSome.i.i.i2289, align 4, !alias.scope !772), ... ; TOO MANY MEM OPERANDS
  STW $r3, $dr0, 0 :: (store (s32) into %ir.agg.result, !alias.scope !325), (store (s32) into %ir.agg.result, !alias.scope !772), ... ; TOO MANY MEM OPERANDS
  RETURN

bb.84.sw.bb165:
; predecessors: %bb.0
  liveins: $dr0
  $r1 = ADDs 0, 1
  $r3 = ADDs 0, 578
  STB $r1, $dr0, 4 :: (store (s8) into %ir.mIsSome.i.i.i2140, align 4, !alias.scope !325), (store (s8) into %ir.mIsSome.i.i.i2289, align 4, !alias.scope !772), ... ; TOO MANY MEM OPERANDS
  STW $r3, $dr0, 0 :: (store (s32) into %ir.agg.result, !alias.scope !325), (store (s32) into %ir.agg.result, !alias.scope !772) ... ; TOO MANY MEM OPERANDS
  RETURN

bb.85.sw.bb167:
; predecessors: %bb.0
  liveins: $dr0
  $r1 = ADDs 0, 1
  $r3 = ADDs 0, 581
  STB $r1, $dr0, 4 :: (store (s8) into %ir.mIsSome.i.i.i2140, align 4, !alias.scope !325), (store (s8) into %ir.mIsSome.i.i.i2289, align 4, !alias.scope !772), ... ; TOO MANY MEM OPERANDS
  STW $r3, $dr0, 0 :: (store (s32) into %ir.agg.result, !alias.scope !325), (store (s32) into %ir.agg.result, !alias.scope !772), ... ; TOO MANY MEM OPERANDS
  RETURN

...

```

It seems the issue is related only to backends which use the post-RA `IfConverter` pass. It affects PowerPC, Hexagon, SystemZ and AMDGPU.

We would like to share the fix with the community if that is OK.

>From 309efd2074f667089694789cef98a71aacc42044 Mon Sep 17 00:00:00 2001
From: Ivan Afanasyev <ivafanas at gmail.com>
Date: Sat, 5 Jul 2025 21:02:28 +0700
Subject: [PATCH] [CodeGen] Limit mem operands checks count for reasonable
 compilation speed.

---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h | 15 +++++++++++++++
 llvm/lib/CodeGen/MachineInstr.cpp           | 16 +++++++++++++++-
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index b5b83c7ff1164..5a001b7083945 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -2057,6 +2057,21 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
   /// overhead or too rigid restriction.
   virtual unsigned getMemOperandAACheckLimit() const { return 16; }
 
+  /// Return the maximum number of memory operands to check instruction for
+  /// memory-related properties.
+  ///
+  /// After MIR transformations like tail merging etc. memory operands are
+  /// united for the merged result instructions. Compiler might end up with
+  /// thousands of memory operands for each instruction for tricky CFGs like
+  /// for switch construction.
+  ///
+  /// Even linear algorithms on instructions with thousands of memory operands
+  /// lead to significant compilation slowdown.
+  ///
+  /// Heuristic is designed to limit checks count for algorithms where
+  /// conservative answer like "I don't know" is possible.
+  virtual unsigned getMemOperandLinearCheckLimit() const { return 16; }
+
   /// Return an array that contains the ids of the target indices (used for the
   /// TargetIndex machine operand) and their names.
   ///
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp
index da3665b3b6a0b..f93018dc8b94b 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -1581,6 +1581,12 @@ bool MachineInstr::hasOrderedMemoryRef() const {
   if (memoperands_empty())
     return true;
 
+  // Conservatively skip analysis if there are too many memory operands. Keep
+  // compilation time reasonable.
+  const TargetInstrInfo *TII = getMF()->getSubtarget().getInstrInfo();
+  if (getNumMemOperands() > TII->getMemOperandLinearCheckLimit())
+    return true;
+
   // Check if any of our memory operands are ordered.
   return llvm::any_of(memoperands(), [](const MachineMemOperand *MMO) {
     return !MMO->isUnordered();
@@ -1600,7 +1606,15 @@ bool MachineInstr::isDereferenceableInvariantLoad() const {
   if (memoperands_empty())
     return false;
 
-  const MachineFrameInfo &MFI = getParent()->getParent()->getFrameInfo();
+  const MachineFunction &MF = *getMF();
+
+  // Conservatively skip analysis if there are too many memory operands. Keep
+  // compilation time reasonable.
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  if (getNumMemOperands() > TII->getMemOperandLinearCheckLimit())
+    return false;
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
 
   for (MachineMemOperand *MMO : memoperands()) {
     if (!MMO->isUnordered())