[llvm] [CodeGen][MachineLICM] Use RegUnits in HoistRegionPostRA (PR #94608)

Pierre van Houtryve via llvm-commits llvm-commits at lists.llvm.org
Tue Jun 11 02:12:47 PDT 2024


https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/94608

>From 310deea0d4652100c55e9047a55b1caf56cb5318 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 6 Jun 2024 14:40:07 +0200
Subject: [PATCH 1/3] [CodeGen][MachineLICM] Use RegUnits in 
 HoistRegionPostRA

Those bitvectors get expensive on targets like AMDGPU with thousands of registers, and MCRegAliasIterator is also expensive.

We can move all liveness calculations to use RegUnits instead to speed it up.
---
 llvm/lib/CodeGen/MachineLICM.cpp          | 114 ++++++++++++++--------
 llvm/test/CodeGen/AMDGPU/indirect-call.ll |   4 +-
 2 files changed, 73 insertions(+), 45 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 86eb259c09015..8426218b3f9fc 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -223,8 +223,8 @@ namespace {
     void HoistPostRA(MachineInstr *MI, unsigned Def, MachineLoop *CurLoop,
                      MachineBasicBlock *CurPreheader);
 
-    void ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
-                   BitVector &PhysRegClobbers, SmallSet<int, 32> &StoredFIs,
+    void ProcessMI(MachineInstr *MI, BitVector &RUDefs,
+                   BitVector &RUClobbers, SmallSet<int, 32> &StoredFIs,
                    SmallVectorImpl<CandidateInfo> &Candidates,
                    MachineLoop *CurLoop);
 
@@ -423,10 +423,23 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
   return false;
 }
 
+static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo *TRI, BitVector &RUs, const uint32_t *Mask) {
+  // FIXME: Use RUs better here
+  BitVector MaskedRegs(TRI->getNumRegs());
+  MaskedRegs.setBitsNotInMask(Mask);
+  for(const auto &Set: MaskedRegs.set_bits()) {
+    if (!Set)
+      continue;
+
+    for (MCRegUnitIterator RUI(Set, TRI); RUI.isValid(); ++RUI)
+      RUs.set(*RUI);
+  }
+}
+
 /// Examine the instruction for potentai LICM candidate. Also
 /// gather register def and frame object update information.
-void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
-                                BitVector &PhysRegClobbers,
+void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &RUDefs,
+                                BitVector &RUClobbers,
                                 SmallSet<int, 32> &StoredFIs,
                                 SmallVectorImpl<CandidateInfo> &Candidates,
                                 MachineLoop *CurLoop) {
@@ -448,7 +461,7 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
     // We can't hoist an instruction defining a physreg that is clobbered in
     // the loop.
     if (MO.isRegMask()) {
-      PhysRegClobbers.setBitsNotInMask(MO.getRegMask());
+      applyBitsNotInRegMaskToRegUnitsMask(TRI, RUClobbers, MO.getRegMask());
       continue;
     }
 
@@ -460,16 +473,18 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
     assert(Reg.isPhysical() && "Not expecting virtual register!");
 
     if (!MO.isDef()) {
-      if (Reg && (PhysRegDefs.test(Reg) || PhysRegClobbers.test(Reg)))
+      for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
         // If it's using a non-loop-invariant register, then it's obviously not
         // safe to hoist.
-        HasNonInvariantUse = true;
+        if (RUDefs.test(*RUI) || RUClobbers.test(*RUI))
+          HasNonInvariantUse = true;
+      }
       continue;
     }
 
     if (MO.isImplicit()) {
-      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
-        PhysRegClobbers.set(*AI);
+      for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
+        RUClobbers.set(*RUI);
       if (!MO.isDead())
         // Non-dead implicit def? This cannot be hoisted.
         RuledOut = true;
@@ -488,19 +503,18 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
     // If we have already seen another instruction that defines the same
     // register, then this is not safe.  Two defs is indicated by setting a
     // PhysRegClobbers bit.
-    for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) {
-      if (PhysRegDefs.test(*AS))
-        PhysRegClobbers.set(*AS);
+    for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
+      if (RUDefs.test(*RUI)) {
+        RUClobbers.set(*RUI);
+        RuledOut = true;
+      } else if (RUClobbers.test(*RUI)) {
+        // MI defined register is seen defined by another instruction in
+        // the loop, it cannot be a LICM candidate.
+        RuledOut = true;
+      }
+
+      RUDefs.set(*RUI);
     }
-    // Need a second loop because MCRegAliasIterator can visit the same
-    // register twice.
-    for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS)
-      PhysRegDefs.set(*AS);
-
-    if (PhysRegClobbers.test(Reg))
-      // MI defined register is seen defined by another instruction in
-      // the loop, it cannot be a LICM candidate.
-      RuledOut = true;
   }
 
   // Only consider reloads for now and remats which do not have register
@@ -521,9 +535,9 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
   if (!Preheader)
     return;
 
-  unsigned NumRegs = TRI->getNumRegs();
-  BitVector PhysRegDefs(NumRegs); // Regs defined once in the loop.
-  BitVector PhysRegClobbers(NumRegs); // Regs defined more than once.
+  unsigned NumRegUnits = TRI->getNumRegUnits();
+  BitVector RUDefs(NumRegUnits); // RUs defined once in the loop.
+  BitVector RUClobbers(NumRegUnits); // RUs defined more than once.
 
   SmallVector<CandidateInfo, 32> Candidates;
   SmallSet<int, 32> StoredFIs;
@@ -540,22 +554,23 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
     // FIXME: That means a reload that're reused in successor block(s) will not
     // be LICM'ed.
     for (const auto &LI : BB->liveins()) {
-      for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI)
-        PhysRegDefs.set(*AI);
+      for (MCRegUnitIterator RUI(LI.PhysReg, TRI); RUI.isValid(); ++RUI)
+        RUDefs.set(*RUI);
     }
 
     // Funclet entry blocks will clobber all registers
-    if (const uint32_t *Mask = BB->getBeginClobberMask(TRI))
-      PhysRegClobbers.setBitsNotInMask(Mask);
+    if (const uint32_t *Mask = BB->getBeginClobberMask(TRI)) {
+      applyBitsNotInRegMaskToRegUnitsMask(TRI, RUClobbers, Mask);
+    }
 
     SpeculationState = SpeculateUnknown;
     for (MachineInstr &MI : *BB)
-      ProcessMI(&MI, PhysRegDefs, PhysRegClobbers, StoredFIs, Candidates,
+      ProcessMI(&MI, RUDefs, RUClobbers, StoredFIs, Candidates,
                 CurLoop);
   }
 
   // Gather the registers read / clobbered by the terminator.
-  BitVector TermRegs(NumRegs);
+  BitVector TermRUs(NumRegUnits);
   MachineBasicBlock::iterator TI = Preheader->getFirstTerminator();
   if (TI != Preheader->end()) {
     for (const MachineOperand &MO : TI->operands()) {
@@ -564,8 +579,8 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
       Register Reg = MO.getReg();
       if (!Reg)
         continue;
-      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
-        TermRegs.set(*AI);
+      for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
+        TermRUs.set(*RUI);
     }
   }
 
@@ -583,24 +598,37 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
       continue;
 
     unsigned Def = Candidate.Def;
-    if (!PhysRegClobbers.test(Def) && !TermRegs.test(Def)) {
-      bool Safe = true;
-      MachineInstr *MI = Candidate.MI;
-      for (const MachineOperand &MO : MI->all_uses()) {
-        if (!MO.getReg())
-          continue;
-        Register Reg = MO.getReg();
-        if (PhysRegDefs.test(Reg) ||
-            PhysRegClobbers.test(Reg)) {
+    bool Safe = true;
+    for (MCRegUnitIterator RUI(Def, TRI); RUI.isValid(); ++RUI) {
+      if(RUClobbers.test(*RUI) || TermRUs.test(*RUI)) {
+        Safe = false;
+        break;
+      }
+    }
+
+    if (!Safe)
+      continue;
+
+    MachineInstr *MI = Candidate.MI;
+    for (const MachineOperand &MO : MI->all_uses()) {
+      if (!MO.getReg())
+        continue;
+      for (MCRegUnitIterator RUI(MO.getReg(), TRI); RUI.isValid(); ++RUI) {
+        if (RUDefs.test(*RUI) ||
+            RUClobbers.test(*RUI)) {
           // If it's using a non-loop-invariant register, then it's obviously
           // not safe to hoist.
           Safe = false;
           break;
         }
       }
-      if (Safe)
-        HoistPostRA(MI, Candidate.Def, CurLoop, CurPreheader);
+
+      if (!Safe)
+        break;
     }
+
+    if (Safe)
+      HoistPostRA(MI, Candidate.Def, CurLoop, CurPreheader);
   }
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 7799b9509ceb0..da8aa54469835 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -886,12 +886,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GCN-NEXT:    v_writelane_b32 v40, s62, 30
 ; GCN-NEXT:    v_writelane_b32 v40, s63, 31
 ; GCN-NEXT:    s_mov_b64 s[6:7], exec
-; GCN-NEXT:    s_movk_i32 s4, 0x7b
 ; GCN-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s8, v0
 ; GCN-NEXT:    v_readfirstlane_b32 s9, v1
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GCN-NEXT:    s_and_saveexec_b64 s[10:11], vcc
+; GCN-NEXT:    s_movk_i32 s4, 0x7b
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN-NEXT:    s_xor_b64 exec, exec, s[10:11]
@@ -980,12 +980,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GISEL-NEXT:    v_writelane_b32 v40, s62, 30
 ; GISEL-NEXT:    v_writelane_b32 v40, s63, 31
 ; GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GISEL-NEXT:    s_movk_i32 s4, 0x7b
 ; GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s8, v0
 ; GISEL-NEXT:    v_readfirstlane_b32 s9, v1
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GISEL-NEXT:    s_and_saveexec_b64 s[10:11], vcc
+; GISEL-NEXT:    s_movk_i32 s4, 0x7b
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GISEL-NEXT:    s_xor_b64 exec, exec, s[10:11]

>From c88667d6bfd635bf43f54d7195a4850956f7934e Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 6 Jun 2024 14:41:36 +0200
Subject: [PATCH 2/3] clang-format

---
 llvm/lib/CodeGen/MachineLICM.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 8426218b3f9fc..d802867f1fe8e 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -223,8 +223,8 @@ namespace {
     void HoistPostRA(MachineInstr *MI, unsigned Def, MachineLoop *CurLoop,
                      MachineBasicBlock *CurPreheader);
 
-    void ProcessMI(MachineInstr *MI, BitVector &RUDefs,
-                   BitVector &RUClobbers, SmallSet<int, 32> &StoredFIs,
+    void ProcessMI(MachineInstr *MI, BitVector &RUDefs, BitVector &RUClobbers,
+                   SmallSet<int, 32> &StoredFIs,
                    SmallVectorImpl<CandidateInfo> &Candidates,
                    MachineLoop *CurLoop);
 
@@ -423,11 +423,13 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
   return false;
 }
 
-static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo *TRI, BitVector &RUs, const uint32_t *Mask) {
+static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo *TRI,
+                                                BitVector &RUs,
+                                                const uint32_t *Mask) {
   // FIXME: Use RUs better here
   BitVector MaskedRegs(TRI->getNumRegs());
   MaskedRegs.setBitsNotInMask(Mask);
-  for(const auto &Set: MaskedRegs.set_bits()) {
+  for (const auto &Set : MaskedRegs.set_bits()) {
     if (!Set)
       continue;
 
@@ -536,7 +538,7 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
     return;
 
   unsigned NumRegUnits = TRI->getNumRegUnits();
-  BitVector RUDefs(NumRegUnits); // RUs defined once in the loop.
+  BitVector RUDefs(NumRegUnits);     // RUs defined once in the loop.
   BitVector RUClobbers(NumRegUnits); // RUs defined more than once.
 
   SmallVector<CandidateInfo, 32> Candidates;
@@ -565,8 +567,7 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
 
     SpeculationState = SpeculateUnknown;
     for (MachineInstr &MI : *BB)
-      ProcessMI(&MI, RUDefs, RUClobbers, StoredFIs, Candidates,
-                CurLoop);
+      ProcessMI(&MI, RUDefs, RUClobbers, StoredFIs, Candidates, CurLoop);
   }
 
   // Gather the registers read / clobbered by the terminator.
@@ -600,7 +601,7 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
     unsigned Def = Candidate.Def;
     bool Safe = true;
     for (MCRegUnitIterator RUI(Def, TRI); RUI.isValid(); ++RUI) {
-      if(RUClobbers.test(*RUI) || TermRUs.test(*RUI)) {
+      if (RUClobbers.test(*RUI) || TermRUs.test(*RUI)) {
         Safe = false;
         break;
       }
@@ -614,8 +615,7 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
       if (!MO.getReg())
         continue;
       for (MCRegUnitIterator RUI(MO.getReg(), TRI); RUI.isValid(); ++RUI) {
-        if (RUDefs.test(*RUI) ||
-            RUClobbers.test(*RUI)) {
+        if (RUDefs.test(*RUI) || RUClobbers.test(*RUI)) {
           // If it's using a non-loop-invariant register, then it's obviously
           // not safe to hoist.
           Safe = false;

>From 443b9962c481ed06979e3f7e695613ea87c66d85 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 11 Jun 2024 10:59:51 +0200
Subject: [PATCH 3/3] Optimize BitVector usage, address comments

---
 llvm/lib/CodeGen/MachineLICM.cpp | 40 +++++++++++++++++++++++++-------
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index d802867f1fe8e..39cfa27aad86f 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -426,15 +426,37 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
 static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo *TRI,
                                                 BitVector &RUs,
                                                 const uint32_t *Mask) {
-  // FIXME: Use RUs better here
-  BitVector MaskedRegs(TRI->getNumRegs());
-  MaskedRegs.setBitsNotInMask(Mask);
-  for (const auto &Set : MaskedRegs.set_bits()) {
-    if (!Set)
-      continue;
-
-    for (MCRegUnitIterator RUI(Set, TRI); RUI.isValid(); ++RUI)
-      RUs.set(*RUI);
+  // Iterate over the RegMask raw to avoid constructing a BitVector, which is
+  // expensive as it implies dynamically allocating memory.
+  //
+  // We also work backwards.
+  const unsigned NumRegs = TRI->getNumRegs();
+  const unsigned MaskWords = (NumRegs + 31) / 32;
+  for (unsigned K = 0; K < MaskWords; ++K) {
+    // We want to set the bits that aren't in RegMask, so flip it.
+    uint32_t Word = ~Mask[K];
+
+    // Iterate all set bits, starting from the right.
+    while (Word) {
+      const unsigned SetBitIdx = countr_zero(Word);
+
+      // The bits are numbered from the LSB in each word.
+      const unsigned PhysReg = (K * 32) + SetBitIdx;
+
+      // Clear the bit at SetBitIdx. Doing it this way appears to generate less
+      // instructions on x86. This works because negating a number will flip all
+      // the bits after SetBitIdx. So (Word & -Word) == (1 << SetBitIdx), but
+      // faster.
+      Word ^= Word & -Word;
+
+      if (PhysReg == NumRegs)
+        return;
+
+      if (PhysReg) {
+        for (MCRegUnitIterator RUI(PhysReg, TRI); RUI.isValid(); ++RUI)
+          RUs.set(*RUI);
+      }
+    }
   }
 }
 



More information about the llvm-commits mailing list