[llvm] 770393b - [MachineLICM] Correctly Apply Register Masks (#95746)

Mon Jun 17 04:42:04 PDT 2024

Author: Pierre van Houtryve
Date: 2024-06-17T13:42:00+02:00
New Revision: 770393bb99d947b908031f83e2d064b9666740e4

URL: https://github.com/llvm/llvm-project/commit/770393bb99d947b908031f83e2d064b9666740e4
DIFF: https://github.com/llvm/llvm-project/commit/770393bb99d947b908031f83e2d064b9666740e4.diff

LOG: [MachineLICM] Correctly Apply Register Masks (#95746)

Fix regression introduced in d4b8b72

Added: 
    

Modified: 
    llvm/lib/CodeGen/MachineLICM.cpp
    llvm/test/CodeGen/AMDGPU/indirect-call.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 6c5170e918e00..1c76d72ed5152 100644

--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -426,38 +426,29 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
 static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI,
                                                 BitVector &RUs,
                                                 const uint32_t *Mask) {
-  // Iterate over the RegMask raw to avoid constructing a BitVector, which is
-  // expensive as it implies dynamically allocating memory.
-  //
-  // We also work backwards.
+  BitVector ClobberedRUs(TRI.getNumRegUnits(), true);
   const unsigned NumRegs = TRI.getNumRegs();
   const unsigned MaskWords = (NumRegs + 31) / 32;
   for (unsigned K = 0; K < MaskWords; ++K) {
-    // We want to set the bits that aren't in RegMask, so flip it.
-    uint32_t Word = ~Mask[K];
-
-    // Iterate all set bits, starting from the right.
-    while (Word) {
-      const unsigned SetBitIdx = countr_zero(Word);
-
-      // The bits are numbered from the LSB in each word.
-      const unsigned PhysReg = (K * 32) + SetBitIdx;
-
-      // Clear the bit at SetBitIdx. Doing it this way appears to generate less
-      // instructions on x86. This works because negating a number will flip all
-      // the bits after SetBitIdx. So (Word & -Word) == (1 << SetBitIdx), but
-      // faster.
-      Word ^= Word & -Word;
+    const uint32_t Word = Mask[K];
+    if (!Word)
+      continue;
 
+    for (unsigned Bit = 0; Bit < 32; ++Bit) {
+      const unsigned PhysReg = (K * 32) + Bit;
       if (PhysReg == NumRegs)
-        return;
+        break;
 
-      if (PhysReg) {
+      // Check if we have a valid PhysReg that is set in the mask.
+      // FIXME: We shouldn't have to check for PhysReg.
+      if (PhysReg && ((Word >> Bit) & 1)) {
         for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
-          RUs.set(*RUI);
+          ClobberedRUs.reset(*RUI);
       }
     }
   }
+
+  RUs |= ClobberedRUs;
 }
 
 /// Examine the instruction for potentai LICM candidate. Also

diff  --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index da8aa54469835..7799b9509ceb0 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -886,12 +886,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GCN-NEXT:    v_writelane_b32 v40, s62, 30
 ; GCN-NEXT:    v_writelane_b32 v40, s63, 31
 ; GCN-NEXT:    s_mov_b64 s[6:7], exec
+; GCN-NEXT:    s_movk_i32 s4, 0x7b
 ; GCN-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s8, v0
 ; GCN-NEXT:    v_readfirstlane_b32 s9, v1
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GCN-NEXT:    s_and_saveexec_b64 s[10:11], vcc
-; GCN-NEXT:    s_movk_i32 s4, 0x7b
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN-NEXT:    s_xor_b64 exec, exec, s[10:11]
@@ -980,12 +980,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GISEL-NEXT:    v_writelane_b32 v40, s62, 30
 ; GISEL-NEXT:    v_writelane_b32 v40, s63, 31
 ; GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GISEL-NEXT:    s_movk_i32 s4, 0x7b
 ; GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s8, v0
 ; GISEL-NEXT:    v_readfirstlane_b32 s9, v1
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GISEL-NEXT:    s_and_saveexec_b64 s[10:11], vcc
-; GISEL-NEXT:    s_movk_i32 s4, 0x7b
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GISEL-NEXT:    s_xor_b64 exec, exec, s[10:11]