[llvm] [MachineLICM] Fix regression introduced by d4b8b72 (PR #95746)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 17 00:41:02 PDT 2024
https://github.com/Pierre-vh created https://github.com/llvm/llvm-project/pull/95746
None
>From 63224bb5b28f01fbda49f44e8733a5bd2710c640 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 17 Jun 2024 09:40:14 +0200
Subject: [PATCH] [MachineLICM] Fix regression introduced by d4b8b72
---
llvm/lib/CodeGen/MachineLICM.cpp | 45 ++++++++++-------------
llvm/test/CodeGen/AMDGPU/indirect-call.ll | 4 +-
2 files changed, 22 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 6c5170e918e00..237527a165e85 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -426,38 +426,33 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI,
BitVector &RUs,
const uint32_t *Mask) {
- // Iterate over the RegMask raw to avoid constructing a BitVector, which is
- // expensive as it implies dynamically allocating memory.
- //
- // We also work backwards.
+ const unsigned NumRUs = TRI.getNumRegUnits();
+ BitVector RUsInMask(NumRUs);
+ BitVector RUsNotInMask(NumRUs);
const unsigned NumRegs = TRI.getNumRegs();
const unsigned MaskWords = (NumRegs + 31) / 32;
for (unsigned K = 0; K < MaskWords; ++K) {
- // We want to set the bits that aren't in RegMask, so flip it.
- uint32_t Word = ~Mask[K];
-
- // Iterate all set bits, starting from the right.
- while (Word) {
- const unsigned SetBitIdx = countr_zero(Word);
-
- // The bits are numbered from the LSB in each word.
- const unsigned PhysReg = (K * 32) + SetBitIdx;
-
- // Clear the bit at SetBitIdx. Doing it this way appears to generate less
- // instructions on x86. This works because negating a number will flip all
- // the bits after SetBitIdx. So (Word & -Word) == (1 << SetBitIdx), but
- // faster.
- Word ^= Word & -Word;
-
+ uint32_t Word = Mask[K];
+ for (unsigned Bit = 0; Bit < 32; ++Bit) {
+ const unsigned PhysReg = (K * 32) + Bit;
if (PhysReg == NumRegs)
- return;
+ break;
- if (PhysReg) {
- for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
- RUs.set(*RUI);
- }
+ if (!PhysReg)
+ continue;
+
+ // Extract the bit and apply it to the appropriate mask.
+ auto &Mask = ((Word >> Bit) & 1) ? RUsInMask : RUsNotInMask;
+ for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
+ Mask.set(*RUI);
}
}
+
+ // If a RU needs to be set because it's not in the RegMask, only set it
+ // if all registers from that RU are not in the mask either.
+ RUsNotInMask &= RUsInMask.flip();
+
+ RUs |= RUsNotInMask;
}
/// Examine the instruction for potential LICM candidate. Also
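
As a standalone illustration of the bit-scanning idiom described in the comments the hunk above removes (a sketch only, using C++20's std::countr_zero in place of llvm::countr_zero):

#include <bit>
#include <cstdint>
#include <iostream>

int main() {
  uint32_t Word = 0b1010'0100; // bits 2, 5 and 7 are set
  // Visit every set bit, starting from the LSB.
  while (Word) {
    const unsigned SetBitIdx = std::countr_zero(Word);
    std::cout << "bit " << SetBitIdx << " is set\n";
    // (Word & -Word) isolates the lowest set bit, i.e. 1u << SetBitIdx,
    // so XOR-ing it back into Word clears exactly that bit.
    Word ^= Word & -Word;
  }
}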
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index da8aa54469835..7799b9509ceb0 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -886,12 +886,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
; GCN-NEXT: v_writelane_b32 v40, s62, 30
; GCN-NEXT: v_writelane_b32 v40, s63, 31
; GCN-NEXT: s_mov_b64 s[6:7], exec
+; GCN-NEXT: s_movk_i32 s4, 0x7b
; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_readfirstlane_b32 s8, v0
; GCN-NEXT: v_readfirstlane_b32 s9, v1
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc
-; GCN-NEXT: s_movk_i32 s4, 0x7b
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN-NEXT: s_xor_b64 exec, exec, s[10:11]
@@ -980,12 +980,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
; GISEL-NEXT: v_writelane_b32 v40, s62, 30
; GISEL-NEXT: v_writelane_b32 v40, s63, 31
; GISEL-NEXT: s_mov_b64 s[6:7], exec
+; GISEL-NEXT: s_movk_i32 s4, 0x7b
; GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
; GISEL-NEXT: v_readfirstlane_b32 s8, v0
; GISEL-NEXT: v_readfirstlane_b32 s9, v1
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
; GISEL-NEXT: s_and_saveexec_b64 s[10:11], vcc
-; GISEL-NEXT: s_movk_i32 s4, 0x7b
; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GISEL-NEXT: ; implicit-def: $vgpr0
; GISEL-NEXT: s_xor_b64 exec, exec, s[10:11]
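
For reference, a minimal standalone sketch of what the new MachineLICM.cpp hunk computes, assuming a toy register file in place of TargetRegisterInfo: RegUnitsOf below is a hypothetical stand-in for MCRegUnitIterator, and std::vector<bool> replaces llvm::BitVector.

#include <cstdint>
#include <iostream>
#include <vector>

// Toy register file: register 0 is "no register"; each register maps to the
// register units it covers (e.g. a register pair covers two units).
static const std::vector<std::vector<unsigned>> RegUnitsOf = {
    {},      // 0: invalid register
    {0},     // 1: covers unit 0
    {1},     // 2: covers unit 1
    {0, 1},  // 3: pair covering units 0 and 1
};

// Mark in RUs every register unit clobbered by a call with the given RegMask:
// a unit is only marked if none of the registers covering it has its
// "preserved" bit set in Mask.
static void applyBitsNotInRegMaskToRegUnitsMask(std::vector<bool> &RUs,
                                                const uint32_t *Mask,
                                                unsigned NumRegs,
                                                unsigned NumRegUnits) {
  std::vector<bool> RUsInMask(NumRegUnits), RUsNotInMask(NumRegUnits);

  const unsigned MaskWords = (NumRegs + 31) / 32;
  for (unsigned K = 0; K < MaskWords; ++K) {
    const uint32_t Word = Mask[K];
    for (unsigned Bit = 0; Bit < 32; ++Bit) {
      const unsigned PhysReg = K * 32 + Bit;
      if (PhysReg == NumRegs)
        break;
      if (!PhysReg) // Skip the invalid register 0.
        continue;

      // A set bit means PhysReg is preserved across the call; route its
      // units into the "in mask" set, otherwise into the "not in mask" set.
      auto &Dest = ((Word >> Bit) & 1) ? RUsInMask : RUsNotInMask;
      for (unsigned RU : RegUnitsOf[PhysReg])
        Dest[RU] = true;
    }
  }

  // Only treat a unit as clobbered if no register covering it is preserved.
  for (unsigned RU = 0; RU < NumRegUnits; ++RU)
    if (RUsNotInMask[RU] && !RUsInMask[RU])
      RUs[RU] = true;
}

int main() {
  // Registers 1 and 2 are clobbered (bits clear); the pair 3 is preserved.
  const uint32_t Mask[1] = {1u << 3};
  std::vector<bool> RUs(2);
  applyBitsNotInRegMaskToRegUnitsMask(RUs, Mask, /*NumRegs=*/4,
                                      /*NumRegUnits=*/2);
  // Neither unit is reported clobbered: the preserved pair covers both.
  std::cout << RUs[0] << RUs[1] << '\n'; // prints "00"
}

The demo shows the behavioral change: a unit covered by a preserved register is no longer reported as clobbered just because another register containing it is absent from the mask.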