[llvm] [RegAllocFast] fold foldable inline asm (PR #74344)

Nick Desaulniers via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 4 09:24:54 PST 2023


https://github.com/nickdesaulniers created https://github.com/llvm/llvm-project/pull/74344

For each MachineBasicBlock, perform a pass that pessimistically folds all
INLINEASM MachineInstrs that have foldable operands. Technically, such folds
are only needed when there's register pressure, but by the time that's
discovered, RegAllocFast isn't really set up to replace the MachineInstr it's
trying to allocate a physreg for. And if you want better codegen, you probably
want -regalloc=greedy.

The pre-existing spill/reload infrastructure spills outgoing values and
reloads incoming values at MachineBasicBlock boundaries. In our case, we
instead want to create a new stack slot, convert the INLINEASM to use that
slot, manually store to the slot BEFORE the INLINEASM, then finally load from
the slot AFTER the INLINEASM.
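
Roughly, for a spillable "=rm" output (illustrative MIR, simplified from the
"output" test case below, with most operand flags elided), the fold turns

  INLINEASM &"# $0", ... regdef:GR32 spillable ..., def %0, ...
  MOV32mr %stack.0, ..., %0

into

  INLINEASM &"# $0", maystore ... mem:m ..., %stack.1, ...
  %0 = MOV32rm %stack.1, ...
  MOV32mr %stack.0, ..., %0

so the reload from the new slot lands AFTER the INLINEASM.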


From c74e7ab11cf1424b94b183c5bc74fc545e05a756 Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers at google.com>
Date: Wed, 25 Oct 2023 09:30:50 -0700
Subject: [PATCH] [RegAllocFast] fold foldable inline asm

For each MachineBasicBlock, perform a pass that pessimistically folds all
INLINEASM MachineInstrs that have foldable operands. Technically, such folds
are only needed when there's register pressure, but by the time that's
discovered, RegAllocFast isn't really set up to replace the MachineInstr it's
trying to allocate a physreg for. And if you want better codegen, you probably
want -regalloc=greedy.

The pre-existing spill/reload infrastructure spills outgoing values and
reloads incoming values at MachineBasicBlock boundaries. In our case, we
instead want to create a new stack slot, convert the INLINEASM to use that
slot, manually store to the slot BEFORE the INLINEASM, then finally load from
the slot AFTER the INLINEASM.
---
 llvm/lib/CodeGen/RegAllocFast.cpp             | 74 +++++++++++++++++++
 .../MIR/X86/inline-asm-rm-exhaustion.mir      | 20 +++++
 2 files changed, 94 insertions(+)

diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index b216d72964462..88d71b0b1426c 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -290,6 +290,11 @@ class RegAllocFast : public MachineFunctionPass {
   bool mayLiveOut(Register VirtReg);
   bool mayLiveIn(Register VirtReg);
 
+  // For each register operand marked spillable, spill it and then memory-fold
+  // the operand.
+  void preemptivelySpillInlineAsmOperands(MachineInstr *MI);
+  void preemptivelySpillInlineAsmOperands(MachineBasicBlock *MBB);
+
   void dumpState() const;
 };
 
@@ -1578,6 +1583,12 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
 
   Coalesced.clear();
 
+  // If an operand can be folded, just fold it immediately so that we don't
+  // have to deal with register exhaustion later if there's register pressure.
+  // Do this before useVirtReg, which will attempt to allocate a physreg and
+  // may fail under register pressure.
+  preemptivelySpillInlineAsmOperands(&MBB);
+
   // Traverse block in reverse order allocating instructions one by one.
   for (MachineInstr &MI : reverse(MBB)) {
     LLVM_DEBUG(dbgs() << "\n>> " << MI << "Regs:"; dumpState());
@@ -1626,6 +1637,69 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
   LLVM_DEBUG(MBB.dump());
 }
 
+void RegAllocFast::preemptivelySpillInlineAsmOperands(MachineInstr *MI) {
+  assert(MI->isInlineAsm() && "should only be used by inline asm");
+  // BOTH MI and its number of operands may change in this loop; cache neither.
+  for (unsigned I = InlineAsm::MIOp_FirstOperand; I < MI->getNumOperands();
+       ++I) {
+    MachineOperand &MO = MI->getOperand(I);
+    if (!(MO.isReg() && MI->mayFoldInlineAsmRegOp(I)))
+      continue;
+
+    const bool IsDef = MO.isDef();
+    const bool IsUse = MO.isUse();
+    const bool IsKill = MO.isKill();
+    const MachineOperand *TiedOp = nullptr;
+
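+    // For a def tied to a use, record the tied use so that the store inserted
+    // before the INLINEASM spills the incoming value.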
+    if (MO.isTied()) {
+      // getOperand returns a reference, so checking its address for null (as a
+      // pointer init in an if condition) would always succeed; test isUse
+      // directly instead.
+      MachineOperand &T = MI->getOperand(MI->findTiedOperandIdx(I));
+      if (T.isUse())
+        TiedOp = &T;
+    }
+
+    Register Reg = MO.getReg();
+    const bool IsVirt = Reg.isVirtual();
+    const TargetRegisterClass *RC =
+        IsVirt ? MRI->getRegClass(Reg) : TRI->getMinimalPhysRegClass(Reg);
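+    // A virtual register reuses the stack slot that RegAllocFast assigned to
+    // it (creating one if needed); a physical register gets a fresh spill
+    // slot.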
+    int FrameIndex = IsVirt
+                         ? getStackSpaceFor(Reg)
+                         : MFI->CreateSpillStackObject(TRI->getSpillSize(*RC),
+                                                       TRI->getSpillAlign(*RC));
+
+    MachineInstr *NewMI = TII->foldMemoryOperand(*MI, {I}, FrameIndex);
+    // foldMemoryOperand inserts the new instruction before the instruction
+    // passed as its first argument. Here, we want the newly created
+    // replacement to go after the old one, so that we don't attempt to
+    // regalloc it again.
+    MI->getParent()->splice(std::next(MI->getIterator()), NewMI->getParent(),
+                            NewMI->getIterator());
+
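+    // The def is reloaded from the stack slot AFTER the folded INLINEASM.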
+    if (IsDef) {
+      // TODO: asm goto
+      TII->loadRegFromStackSlot(*MBB, std::next(NewMI->getIterator()), Reg,
+                                FrameIndex, RC, TRI, {});
+      ++NumLoads;
+    }
+
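+    // Uses (and the tied input of an inout) are stored to the stack slot
+    // BEFORE the old INLINEASM, which still precedes NewMI at this point.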
+    if (IsUse || TiedOp) {
+      if (TiedOp)
+        Reg = TiedOp->getReg();
+      TII->storeRegToStackSlot(*MBB, MI, Reg, IsKill, FrameIndex, RC, TRI, {});
+      ++NumStores;
+    }
+
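+    // Unlink the old INLINEASM and continue scanning its replacement for
+    // additional foldable operands.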
+    MI->removeFromParent();
+    MI = NewMI;
+  }
+}
+
+void RegAllocFast::preemptivelySpillInlineAsmOperands(MachineBasicBlock *MBB) {
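+  // Collect the INLINEASMs first; folding replaces the instruction, which
+  // would invalidate iteration over the block's instruction list.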
+  SmallVector<MachineInstr *> InlineAsms;
+  for (MachineInstr &MI : *MBB)
+    if (MI.isInlineAsm())
+      InlineAsms.push_back(&MI);
+  for (MachineInstr *MI : InlineAsms)
+    preemptivelySpillInlineAsmOperands(MI);
+}
+
 bool RegAllocFast::runOnMachineFunction(MachineFunction &MF) {
   LLVM_DEBUG(dbgs() << "********** FAST REGISTER ALLOCATION **********\n"
                     << "********** Function: " << MF.getName() << '\n');
diff --git a/llvm/test/CodeGen/MIR/X86/inline-asm-rm-exhaustion.mir b/llvm/test/CodeGen/MIR/X86/inline-asm-rm-exhaustion.mir
index 6ca19aba1a65d..426d63df5dd38 100644
--- a/llvm/test/CodeGen/MIR/X86/inline-asm-rm-exhaustion.mir
+++ b/llvm/test/CodeGen/MIR/X86/inline-asm-rm-exhaustion.mir
@@ -1,6 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
 # RUN: llc -start-after=finalize-isel -regalloc=greedy -stop-after=greedy \
 # RUN:   -verify-machineinstrs -verify-regalloc %s -o - | FileCheck %s
+# RUN: llc -start-after=finalize-isel -regalloc=fast -stop-after=regallocfast \
+# RUN:   -verify-machineinstrs -verify-regalloc %s -o - | FileCheck %s --check-prefix=FAST
 --- |
   target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-f64:32:64-f80:32-n8:16:32-S128"
   target triple = "i386-unknown-linux-gnu"
@@ -86,6 +88,10 @@ body:             |
     ; CHECK-LABEL: name: input
     ; CHECK: INLINEASM &"# $0", 8 /* mayload attdialect */, 262190 /* mem:m */, %fixed-stack.0, 1, $noreg, 0, $noreg, 12 /* clobber */, implicit-def dead early-clobber $ax, 12 /* clobber */, implicit-def dead early-clobber $cx, 12 /* clobber */, implicit-def dead early-clobber $dx, 12 /* clobber */, implicit-def dead early-clobber $si, 12 /* clobber */, implicit-def dead early-clobber $di, 12 /* clobber */, implicit-def dead early-clobber $bx, 12 /* clobber */, implicit-def dead early-clobber $bp :: (load (s32) from %fixed-stack.0, align 16)
     ; CHECK-NEXT: RET 0
+    ;
+    ; FAST-LABEL: name: input
+    ; FAST: INLINEASM &"# $0", 8 /* mayload attdialect */, 262190 /* mem:m */, %fixed-stack.0, 1, $noreg, 0, $noreg, 12 /* clobber */, implicit-def dead early-clobber $ax, 12 /* clobber */, implicit-def dead early-clobber $cx, 12 /* clobber */, implicit-def dead early-clobber $dx, 12 /* clobber */, implicit-def dead early-clobber $si, 12 /* clobber */, implicit-def dead early-clobber $di, 12 /* clobber */, implicit-def dead early-clobber $bx, 12 /* clobber */, implicit-def dead early-clobber $bp :: (load (s32) from %fixed-stack.0, align 16)
+    ; FAST-NEXT: RET 0
     %0:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.0, align 16)
     INLINEASM &"# $0", 0 /* attdialect */, 1076101129 /* reguse:GR32 spillable */, %0, 12 /* clobber */, implicit-def early-clobber $ax, 12 /* clobber */, implicit-def early-clobber $cx, 12 /* clobber */, implicit-def early-clobber $dx, 12 /* clobber */, implicit-def early-clobber $si, 12 /* clobber */, implicit-def early-clobber $di, 12 /* clobber */, implicit-def early-clobber $bx, 12 /* clobber */, implicit-def early-clobber $bp
     RET 0
@@ -152,6 +158,12 @@ body:             |
     ; CHECK-NEXT: MOV32mr %stack.0, 1, $noreg, 0, $noreg, [[MOV32rm]] :: (store (s32) into %ir.1)
     ; CHECK-NEXT: $eax = COPY [[MOV32rm]]
     ; CHECK-NEXT: RET 0, $eax
+    ;
+    ; FAST-LABEL: name: output
+    ; FAST: INLINEASM &"# $0", 16 /* maystore attdialect */, 262190 /* mem:m */, %stack.1, 1, $noreg, 0, $noreg, 12 /* clobber */, implicit-def dead early-clobber $ax, 12 /* clobber */, implicit-def dead early-clobber $cx, 12 /* clobber */, implicit-def dead early-clobber $dx, 12 /* clobber */, implicit-def dead early-clobber $si, 12 /* clobber */, implicit-def dead early-clobber $di, 12 /* clobber */, implicit-def dead early-clobber $bx, 12 /* clobber */, implicit-def dead early-clobber $bp :: (store (s32) into %stack.1)
+    ; FAST-NEXT: renamable $eax = MOV32rm %stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %stack.1)
+    ; FAST-NEXT: MOV32mr %stack.0, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.1)
+    ; FAST-NEXT: RET 0, killed $eax
     INLINEASM &"# $0", 0 /* attdialect */, 1076101130 /* regdef:GR32 spillable */, def %0, 12 /* clobber */, implicit-def early-clobber $ax, 12 /* clobber */, implicit-def early-clobber $cx, 12 /* clobber */, implicit-def early-clobber $dx, 12 /* clobber */, implicit-def early-clobber $si, 12 /* clobber */, implicit-def early-clobber $di, 12 /* clobber */, implicit-def early-clobber $bx, 12 /* clobber */, implicit-def early-clobber $bp
     MOV32mr %stack.0, 1, $noreg, 0, $noreg, %0 :: (store (s32) into %ir.1)
     $eax = COPY %0
@@ -223,6 +235,14 @@ body:             |
     ; CHECK-NEXT: MOV32mr %fixed-stack.0, 1, $noreg, 0, $noreg, [[MOV32rm1]] :: (store (s32) into %ir.2, align 16)
     ; CHECK-NEXT: $eax = COPY [[MOV32rm1]]
     ; CHECK-NEXT: RET 0, $eax
+    ;
+    ; FAST-LABEL: name: inout
+    ; FAST: renamable $eax = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.0, align 16)
+    ; FAST-NEXT: MOV32mr %stack.0, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %stack.0)
+    ; FAST-NEXT: INLINEASM &"# $0 $1", 24 /* mayload maystore attdialect */, 262190 /* mem:m */, %stack.0, 1, $noreg, 0, $noreg, 262190 /* mem:m */, %stack.0, 1, $noreg, 0, $noreg, 12 /* clobber */, implicit-def dead early-clobber $ax, 12 /* clobber */, implicit-def dead early-clobber $cx, 12 /* clobber */, implicit-def dead early-clobber $dx, 12 /* clobber */, implicit-def dead early-clobber $si, 12 /* clobber */, implicit-def dead early-clobber $di, 12 /* clobber */, implicit-def dead early-clobber $bx, 12 /* clobber */, implicit-def dead early-clobber $bp :: (load store (s32) on %stack.0)
+    ; FAST-NEXT: renamable $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0)
+    ; FAST-NEXT: MOV32mr %fixed-stack.0, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.2, align 16)
+    ; FAST-NEXT: RET 0, killed $eax
     %1:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.0, align 16)
     INLINEASM &"# $0 $1", 0 /* attdialect */, 1076101130 /* regdef:GR32 spillable */, def %0, 2147483657 /* reguse tiedto:$0 */, %1(tied-def 3), 12 /* clobber */, implicit-def early-clobber $ax, 12 /* clobber */, implicit-def early-clobber $cx, 12 /* clobber */, implicit-def early-clobber $dx, 12 /* clobber */, implicit-def early-clobber $si, 12 /* clobber */, implicit-def early-clobber $di, 12 /* clobber */, implicit-def early-clobber $bx, 12 /* clobber */, implicit-def early-clobber $bp
     MOV32mr %fixed-stack.0, 1, $noreg, 0, $noreg, %0 :: (store (s32) into %ir.2, align 16)


