[llvm] [GlobalISel][Localizer] Allow localization of a small number of repeated phi uses. (PR #77566)
Amara Emerson via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 11 02:56:26 PST 2024
https://github.com/aemerson updated https://github.com/llvm/llvm-project/pull/77566
>From 329c60645e4622354ca9f764a6d5853e82f37256 Mon Sep 17 00:00:00 2001
From: Amara Emerson <amara at apple.com>
Date: Mon, 8 Jan 2024 20:38:57 -0800
Subject: [PATCH 1/3] [GlobalISel][Localizer] Allow localization of a small
number of repeated phi uses.
We previously had a heuristic that if a value V was used multiple times in a
single PHI, then to avoid potentially rematerializing into many predecessors we
bail out. The phi uses only counted as a single use in the shouldLocalize() hook
because it counted the PHI as a single instruction use, not factoring in it may
have many incoming edges.
It turns out this heuristic is slightly too pessimistic, and allowing a small number
of these uses to be localized can improve code size due to shortening live ranges,
especially if those ranges span a call.
This change results in some improvements in size on CTMark -Os:
Program size.__text
before after diff
kimwitu++/kc 451676.00 451860.00 0.0%
mafft/pairlocalalign 241460.00 241540.00 0.0%
tramp3d-v4/tramp3d-v4 389216.00 389208.00 -0.0%
7zip/7zip-benchmark 587528.00 587464.00 -0.0%
Bullet/bullet 457424.00 457348.00 -0.0%
consumer-typeset/consumer-typeset 405472.00 405376.00 -0.0%
SPASS/SPASS 410288.00 410120.00 -0.0%
lencod/lencod 426396.00 426108.00 -0.1%
ClamAV/clamscan 380108.00 379756.00 -0.1%
sqlite3/sqlite3 283664.00 283372.00 -0.1%
Geomean difference -0.0%
I experimented with different variations and thresholds. Using 3 instead of 2 resulted in
a further 0.1% improvement on ClamAV but also regressed sqlite3 by the same %.
---
.../llvm/CodeGen/GlobalISel/Localizer.h | 5 +-
llvm/lib/CodeGen/GlobalISel/Localizer.cpp | 46 ++++++++-------
.../AArch64/GlobalISel/invoke-region.ll | 16 +++---
.../irtranslator-hoisted-constants.ll | 10 ++--
.../CodeGen/AArch64/GlobalISel/localizer.mir | 56 ++++++++++++++++---
...-divergent-i1-phis-no-lane-mask-merging.ll | 24 ++++----
.../GlobalISel/llvm.amdgcn.set.inactive.ll | 10 ++--
.../CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll | 2 +-
.../CodeGen/AMDGPU/GlobalISel/localizer.ll | 2 +-
.../CodeGen/AMDGPU/GlobalISel/udiv.i64.ll | 10 ++--
.../CodeGen/AMDGPU/GlobalISel/urem.i64.ll | 10 ++--
11 files changed, 118 insertions(+), 73 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h b/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h
index b1fcdd207a60a1..3f4165acb85be0 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h
@@ -67,10 +67,7 @@ class Localizer : public MachineFunctionPass {
typedef SmallSetVector<MachineInstr *, 32> LocalizedSetVecT;
- /// If \p Op is a phi operand and not unique in that phi, that is,
- /// there are other operands in the phi with the same register,
- /// return true.
- bool isNonUniquePhiValue(MachineOperand &Op) const;
+ unsigned getNumPhiUses(MachineOperand &Op) const;
/// Do inter-block localization from the entry block.
bool localizeInterBlock(MachineFunction &MF,
diff --git a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
index 55984423e5bc63..a83c57e627ca55 100644
--- a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
@@ -58,18 +58,19 @@ bool Localizer::isLocalUse(MachineOperand &MOUse, const MachineInstr &Def,
return InsertMBB == Def.getParent();
}
-bool Localizer::isNonUniquePhiValue(MachineOperand &Op) const {
+unsigned Localizer::getNumPhiUses(MachineOperand &Op) const {
MachineInstr *MI = Op.getParent();
if (!MI->isPHI())
- return false;
+ return 0;
Register SrcReg = Op.getReg();
+ unsigned NumUses = 0;
for (unsigned Idx = 1; Idx < MI->getNumOperands(); Idx += 2) {
auto &MO = MI->getOperand(Idx);
if (&MO != &Op && MO.isReg() && MO.getReg() == SrcReg)
- return true;
+ ++NumUses;
}
- return false;
+ return NumUses;
}
bool Localizer::localizeInterBlock(MachineFunction &MF,
@@ -108,11 +109,12 @@ bool Localizer::localizeInterBlock(MachineFunction &MF,
continue;
}
- // If the use is a phi operand that's not unique, don't try to localize.
- // If we do, we can cause unnecessary instruction bloat by duplicating
- // into each predecessor block, when the existing one is sufficient and
- // allows for easier optimization later.
- if (isNonUniquePhiValue(MOUse))
+ // PHIs look like a single user but can use the same register in multiple
+ // edges, causing remat into each predecessor. Allow this to a certain
+ // extent.
+ unsigned NumPhiUses = getNumPhiUses(MOUse);
+ const unsigned PhiThreshold = 2; // FIXME: Tune this more.
+ if (NumPhiUses > PhiThreshold)
continue;
LLVM_DEBUG(dbgs() << "Fixing non-local use\n");
@@ -164,19 +166,23 @@ bool Localizer::localizeIntraBlock(LocalizedSetVecT &LocalizedInstrs) {
if (!UseMI.isPHI())
Users.insert(&UseMI);
}
- // If all the users were PHIs then they're not going to be in our block,
- // don't try to move this instruction.
- if (Users.empty())
- continue;
-
MachineBasicBlock::iterator II(MI);
- ++II;
- while (II != MBB.end() && !Users.count(&*II))
+ // If all the users were PHIs then they're not going to be in our block, we
+ // may still benefit from sinking, especially since the value might be live
+ // across a call.
+ if (Users.empty()) {
+ // Make sure we don't sink in between
+ // two terminator sequences by scanning forward, not backward.
+ II = MBB.getFirstTerminatorForward();
+ LLVM_DEBUG(dbgs() << "Only phi users: moving " << *MI << " to the end\n");
+ } else {
++II;
-
- assert(II != MBB.end() && "Didn't find the user in the MBB");
- LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *II
- << '\n');
+ while (II != MBB.end() && !Users.count(&*II))
+ ++II;
+ assert(II != MBB.end() && "Didn't find the user in the MBB");
+ LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *II
+ << '\n');
+ }
MI->removeFromParent();
MBB.insert(II, MI);
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/invoke-region.ll b/llvm/test/CodeGen/AArch64/GlobalISel/invoke-region.ll
index 007e1fb3d63dad..39ad002a0763f9 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/invoke-region.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/invoke-region.ll
@@ -12,10 +12,10 @@ define i1 @test_lpad_phi_widen_into_pred() personality ptr @__gxx_personality_v0
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
- ; CHECK-NEXT: G_STORE [[C1]](s32), [[GV]](p0) :: (store (s32) into @global_var)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
+ ; CHECK-NEXT: G_STORE [[C]](s32), [[GV]](p0) :: (store (s32) into @global_var)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
; CHECK-NEXT: G_INVOKE_REGION_START
; CHECK-NEXT: EH_LABEL <mcsymbol >
@@ -29,7 +29,7 @@ define i1 @test_lpad_phi_widen_into_pred() personality ptr @__gxx_personality_v0
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: liveins: $x0, $x1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.1
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1
; CHECK-NEXT: EH_LABEL <mcsymbol >
; CHECK-NEXT: [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
; CHECK-NEXT: G_STORE [[PHI]](s32), [[GV1]](p0) :: (store (s32) into @global_var)
@@ -67,12 +67,12 @@ define i1 @test_lpad_phi_widen_into_pred_ext(ptr %ptr) personality ptr @__gxx_pe
; CHECK-NEXT: liveins: $x0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
- ; CHECK-NEXT: G_STORE [[C1]](s32), [[GV]](p0) :: (store (s32) into @global_var)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
+ ; CHECK-NEXT: G_STORE [[C]](s32), [[GV]](p0) :: (store (s32) into @global_var)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (load (s8) from %ir.ptr)
; CHECK-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s8) = G_ASSERT_ZEXT [[LOAD]], 1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[ASSERT_ZEXT]](s8)
; CHECK-NEXT: G_INVOKE_REGION_START
; CHECK-NEXT: EH_LABEL <mcsymbol >
@@ -86,7 +86,7 @@ define i1 @test_lpad_phi_widen_into_pred_ext(ptr %ptr) personality ptr @__gxx_pe
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: liveins: $x0, $x1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.1
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1
; CHECK-NEXT: EH_LABEL <mcsymbol >
; CHECK-NEXT: [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
; CHECK-NEXT: G_STORE [[PHI]](s32), [[GV1]](p0) :: (store (s32) into @global_var)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-hoisted-constants.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-hoisted-constants.ll
index 5867326c18aa6c..1602480ea3e0da 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-hoisted-constants.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-hoisted-constants.ll
@@ -46,16 +46,16 @@ define i32 @test(i32 %a, i1 %c) {
; PRESELECTION-NEXT: {{ $}}
; PRESELECTION-NEXT: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
; PRESELECTION-NEXT: [[COPY1:%[0-9]+]]:gpr(s32) = COPY $w1
- ; PRESELECTION-NEXT: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
- ; PRESELECTION-NEXT: [[C1:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 100000
- ; PRESELECTION-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:gpr(s32) = G_CONSTANT_FOLD_BARRIER [[C1]]
+ ; PRESELECTION-NEXT: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 100000
+ ; PRESELECTION-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:gpr(s32) = G_CONSTANT_FOLD_BARRIER [[C]]
+ ; PRESELECTION-NEXT: [[C1:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
; PRESELECTION-NEXT: [[C2:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
; PRESELECTION-NEXT: [[AND:%[0-9]+]]:gpr(s32) = G_AND [[COPY1]], [[C2]]
; PRESELECTION-NEXT: G_BRCOND [[AND]](s32), %bb.3
; PRESELECTION-NEXT: G_BR %bb.2
; PRESELECTION-NEXT: {{ $}}
; PRESELECTION-NEXT: bb.2.common.ret:
- ; PRESELECTION-NEXT: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI %7(s32), %bb.3, [[C]](s32), %bb.1
+ ; PRESELECTION-NEXT: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI %7(s32), %bb.3, [[C1]](s32), %bb.1
; PRESELECTION-NEXT: $w0 = COPY [[PHI]](s32)
; PRESELECTION-NEXT: RET_ReallyLR implicit $w0
; PRESELECTION-NEXT: {{ $}}
@@ -75,8 +75,8 @@ define i32 @test(i32 %a, i1 %c) {
; POSTSELECTION-NEXT: {{ $}}
; POSTSELECTION-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
; POSTSELECTION-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1
- ; POSTSELECTION-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $wzr
; POSTSELECTION-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 100000
+ ; POSTSELECTION-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $wzr
; POSTSELECTION-NEXT: TBNZW [[COPY1]], 0, %bb.3
; POSTSELECTION-NEXT: B %bb.2
; POSTSELECTION-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/localizer.mir b/llvm/test/CodeGen/AArch64/GlobalISel/localizer.mir
index 90580c847f290d..942844e0d04440 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/localizer.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/localizer.mir
@@ -56,7 +56,9 @@
define void @test_inttoptr() { ret void }
define void @many_local_use_intra_block() { ret void }
- define void @non_local_phi_use_nonunique() { ret void }
+ define void @non_local_phi_single_use() { ret void }
+ define void @non_local_phi_three_uses() { ret void }
+
...
---
@@ -285,8 +287,8 @@ body: |
; CHECK: bb.1:
; CHECK: successors: %bb.1(0x80000000)
; CHECK: [[PHI:%[0-9]+]]:fpr(s32) = PHI [[FADD]](s32), %bb.0, %4(s32), %bb.1
- ; CHECK: [[C1:%[0-9]+]]:fpr(s32) = G_FCONSTANT float 1.000000e+00
; CHECK: [[FADD1:%[0-9]+]]:fpr(s32) = G_FADD [[PHI]], [[FADD]]
+ ; CHECK: [[C1:%[0-9]+]]:fpr(s32) = G_FCONSTANT float 1.000000e+00
; CHECK: G_BR %bb.1
; Existing registers should be left untouched
@@ -566,12 +568,12 @@ body: |
...
---
-name: non_local_phi_use_nonunique
+name: non_local_phi_single_use
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
- ; CHECK-LABEL: name: non_local_phi_use_nonunique
+ ; CHECK-LABEL: name: non_local_phi_single_use
; CHECK: bb.0:
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
@@ -582,12 +584,12 @@ body: |
; CHECK: G_BR %bb.2
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
+ ; CHECK: [[C2:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
; CHECK: bb.2:
- ; CHECK: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI [[C]](s32), %bb.1
+ ; CHECK: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI [[C2]](s32), %bb.1
; CHECK: [[ADD1:%[0-9]+]]:gpr(s32) = G_ADD [[PHI]], [[PHI]]
- ; Don't localize the 1 into bb.1, because there are multiple edges
- ; using that register.
+ ; Localize the 1 into bb.1, since the number of uses is under the threshold.
bb.0:
successors: %bb.1, %bb.2
@@ -606,3 +608,43 @@ body: |
%3:gpr(s32) = G_PHI %0(s32), %bb.1, %0(s32), %bb.0
%2:gpr(s32) = G_ADD %3, %3
...
+---
+name: non_local_phi_three_uses
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: non_local_phi_three_uses
+ ; CHECK: bb.0:
+ ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
+ ; CHECK: [[ADD:%[0-9]+]]:gpr(s32) = G_ADD [[C]], [[C]]
+ ; CHECK: %cmp:gpr(s32) = G_ICMP intpred(eq), [[ADD]](s32), [[C]]
+ ; CHECK: %cond:gpr(s1) = G_TRUNC %cmp(s32)
+ ; CHECK: G_BRCOND %cond(s1), %bb.1
+ ; CHECK: G_BR %bb.2
+ ; CHECK: bb.1:
+ ; CHECK: successors: %bb.2(0x80000000)
+ ; CHECK: bb.2:
+ ; CHECK: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI [[C]](s32), %bb.1
+ ; CHECK: [[ADD1:%[0-9]+]]:gpr(s32) = G_ADD [[PHI]], [[PHI]]
+
+ ; Don't localize the 1 into bb.1, since it's above the threshold of uses in the phi.
+
+ bb.0:
+ successors: %bb.1, %bb.2
+
+ %0:gpr(s32) = G_CONSTANT i32 1
+ %1:gpr(s32) = G_ADD %0, %0
+ %cmp:gpr(s32) = G_ICMP intpred(eq), %1(s32), %0
+ %cond:gpr(s1) = G_TRUNC %cmp(s32)
+ G_BRCOND %cond(s1), %bb.1
+ G_BR %bb.2
+
+ bb.1:
+ successors: %bb.2
+
+ bb.2:
+ %3:gpr(s32) = G_PHI %0(s32), %bb.1, %0(s32), %bb.0, %0(s32), %bb.0, %0(s32), %bb.0
+ %2:gpr(s32) = G_ADD %3, %3
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index 4ac1fad6deecdc..7a68aec1a1c555 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -230,32 +230,32 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3
; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GFX10-NEXT: s_mov_b32 s2, 1
-; GFX10-NEXT: v_mbcnt_hi_u32_b32 v2, -1, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v2
-; GFX10-NEXT: v_and_b32_e32 v3, 1, v2
+; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1
+; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
+; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
; GFX10-NEXT: ; implicit-def: $vgpr3
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2
; GFX10-NEXT: s_cbranch_vccnz .LBB4_4
; GFX10-NEXT: ; %bb.1: ; %.preheader.preheader
-; GFX10-NEXT: v_mov_b32_e32 v4, s12
; GFX10-NEXT: v_mov_b32_e32 v3, s12
+; GFX10-NEXT: v_mov_b32_e32 v4, s12
; GFX10-NEXT: .LBB4_2: ; %.preheader
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: buffer_load_dword v5, v4, s[4:7], 0 offen
-; GFX10-NEXT: v_add_nc_u32_e32 v2, -1, v2
-; GFX10-NEXT: v_add_nc_u32_e32 v4, 4, v4
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX10-NEXT: buffer_load_dword v5, v3, s[4:7], 0 offen
+; GFX10-NEXT: v_add_nc_u32_e32 v1, -1, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v3, 4, v3
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v3, v5, v3
+; GFX10-NEXT: v_add_nc_u32_e32 v4, v5, v4
; GFX10-NEXT: s_cbranch_vccnz .LBB4_2
; GFX10-NEXT: ; %bb.3: ; %.preheader._crit_edge
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX10-NEXT: s_or_b32 s2, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
; GFX10-NEXT: s_mov_b32 s2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 043e69abaeef2d..a2c762d044b3f4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -48,24 +48,24 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN-NEXT: s_buffer_load_dword s2, s[4:7], 0x0
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s4, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s2, 56
-; GCN-NEXT: s_cselect_b32 s2, 1, 0
+; GCN-NEXT: s_cselect_b32 s4, 1, 0
; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_mov_b32 s2, 1
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_cbranch_scc0 .LBB2_2
; GCN-NEXT: ; %bb.1: ; %.one
; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s4, 0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: .LBB2_2: ; %Flow
-; GCN-NEXT: s_xor_b32 s2, s4, 1
+; GCN-NEXT: s_xor_b32 s2, s2, 1
; GCN-NEXT: s_and_b32 s2, s2, 1
; GCN-NEXT: s_cmp_lg_u32 s2, 0
; GCN-NEXT: s_cbranch_scc1 .LBB2_4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
index a4d5fe4ffa5a75..4d4da869d7507e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
@@ -36,11 +36,11 @@ define amdgpu_cs void @memmove_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src
; LOOP-NEXT: s_andn2_saveexec_b64 s[0:1], s[4:5]
; LOOP-NEXT: s_cbranch_execz .LBB0_6
; LOOP-NEXT: ; %bb.4: ; %copy_backwards
-; LOOP-NEXT: s_mov_b32 s0, -4
; LOOP-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; LOOP-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; LOOP-NEXT: v_add_i32_e32 v2, vcc, 3, v2
; LOOP-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; LOOP-NEXT: s_mov_b32 s0, -4
; LOOP-NEXT: s_mov_b32 s6, 0
; LOOP-NEXT: s_mov_b32 s7, 0xf000
; LOOP-NEXT: s_mov_b64 s[4:5], 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index da9601a8998c2b..36bac87889cacd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -31,9 +31,9 @@ define amdgpu_kernel void @localize_constants(i1 %cond) {
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: .LBB0_2: ; %Flow
; GFX9-NEXT: s_xor_b32 s0, s0, 1
; GFX9-NEXT: s_and_b32 s0, s0, 1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 3add708d1a6394..887c43f5fce59e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -186,12 +186,12 @@ declare i32 @llvm.amdgcn.readfirstlane(i32)
define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-LABEL: s_udiv_i64:
; CHECK: ; %bb.0:
+; CHECK-NEXT: s_or_b64 s[4:5], s[0:1], s[2:3]
+; CHECK-NEXT: s_mov_b32 s6, 0
+; CHECK-NEXT: s_mov_b32 s7, -1
+; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
+; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[4:5], 0
; CHECK-NEXT: s_mov_b32 s4, 1
-; CHECK-NEXT: s_or_b64 s[6:7], s[0:1], s[2:3]
-; CHECK-NEXT: s_mov_b32 s8, 0
-; CHECK-NEXT: s_mov_b32 s9, -1
-; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9]
-; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[6:7], 0
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s2
; CHECK-NEXT: s_cbranch_vccz .LBB1_2
; CHECK-NEXT: ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 12df4b7c7fc33d..5c6bb6dea16466 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -183,12 +183,12 @@ declare i32 @llvm.amdgcn.readfirstlane(i32)
define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-LABEL: s_urem_i64:
; CHECK: ; %bb.0:
+; CHECK-NEXT: s_or_b64 s[4:5], s[0:1], s[2:3]
+; CHECK-NEXT: s_mov_b32 s6, 0
+; CHECK-NEXT: s_mov_b32 s7, -1
+; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
+; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[4:5], 0
; CHECK-NEXT: s_mov_b32 s4, 1
-; CHECK-NEXT: s_or_b64 s[6:7], s[0:1], s[2:3]
-; CHECK-NEXT: s_mov_b32 s8, 0
-; CHECK-NEXT: s_mov_b32 s9, -1
-; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9]
-; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[6:7], 0
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s2
; CHECK-NEXT: s_cbranch_vccz .LBB1_2
; CHECK-NEXT: ; %bb.1:
>From 2795d19477aa322d6ad6abdb54ed43879fb9d718 Mon Sep 17 00:00:00 2001
From: Amara Emerson <amara at apple.com>
Date: Wed, 10 Jan 2024 06:25:38 -0800
Subject: [PATCH 2/3] Address comments & add a GPhi wrapper to simplify PHI
iteration code.
---
.../CodeGen/GlobalISel/GenericMachineInstrs.h | 18 +++++++++++++++++
.../llvm/CodeGen/GlobalISel/Localizer.h | 2 ++
llvm/lib/CodeGen/GlobalISel/Localizer.cpp | 20 +++++++++----------
3 files changed, 29 insertions(+), 11 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index 6ab1d4550c51ca..14885d5f9d08ee 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -558,6 +558,24 @@ class GVecReduce : public GenericMachineInstr {
}
};
+/// Represents a G_PHI.
+class GPhi : public GenericMachineInstr {
+public:
+ /// Returns the number of incoming values.
+ unsigned getNumIncomingValues() const { return (getNumOperands() - 1) / 2; }
+ /// Returns the I'th incoming vreg.
+ Register getIncomingValue(unsigned I) {
+ return getOperand(I * 2 + 1).getReg();
+ }
+ /// Returns the I'th incoming basic block.
+ MachineBasicBlock *getIncomingBlock(unsigned I) {
+ return getOperand(I * 2 + 2).getMBB();
+ }
+
+ static bool classof(const MachineInstr *MI) {
+ return MI->getOpcode() == TargetOpcode::G_PHI;
+ }
+};
} // namespace llvm
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h b/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h
index 3f4165acb85be0..4fbff4d10f8ab5 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h
@@ -67,6 +67,8 @@ class Localizer : public MachineFunctionPass {
typedef SmallSetVector<MachineInstr *, 32> LocalizedSetVecT;
+ /// If \p Op is a reg operand of a PHI, return the number of total
+ /// operands in the PHI that are the same as \p Op, including itself.
unsigned getNumPhiUses(MachineOperand &Op) const;
/// Do inter-block localization from the entry block.
diff --git a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
index a83c57e627ca55..75e2d92e98c07a 100644
--- a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
@@ -13,6 +13,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
@@ -59,17 +60,15 @@ bool Localizer::isLocalUse(MachineOperand &MOUse, const MachineInstr &Def,
}
unsigned Localizer::getNumPhiUses(MachineOperand &Op) const {
- MachineInstr *MI = Op.getParent();
- if (!MI->isPHI())
+ auto *MI = dyn_cast<GPhi>(&*Op.getParent());
+ if (!MI)
return 0;
Register SrcReg = Op.getReg();
unsigned NumUses = 0;
- for (unsigned Idx = 1; Idx < MI->getNumOperands(); Idx += 2) {
- auto &MO = MI->getOperand(Idx);
- if (&MO != &Op && MO.isReg() && MO.getReg() == SrcReg)
+ for (unsigned I = 0, NumVals = MI->getNumIncomingValues(); I < NumVals; ++I)
+ if (MI->getIncomingValue(I) == SrcReg)
++NumUses;
- }
return NumUses;
}
@@ -171,17 +170,16 @@ bool Localizer::localizeIntraBlock(LocalizedSetVecT &LocalizedInstrs) {
// may still benefit from sinking, especially since the value might be live
// across a call.
if (Users.empty()) {
- // Make sure we don't sink in between
- // two terminator sequences by scanning forward, not backward.
+ // Make sure we don't sink in between two terminator sequences by scanning
+ // forward, not backward.
II = MBB.getFirstTerminatorForward();
- LLVM_DEBUG(dbgs() << "Only phi users: moving " << *MI << " to the end\n");
+ LLVM_DEBUG(dbgs() << "Only phi users: moving inst to end: " << *MI);
} else {
++II;
while (II != MBB.end() && !Users.count(&*II))
++II;
assert(II != MBB.end() && "Didn't find the user in the MBB");
- LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *II
- << '\n');
+ LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *II);
}
MI->removeFromParent();
>From 2b46890eec04e8f8b181e559d967fd25c916c904 Mon Sep 17 00:00:00 2001
From: Amara Emerson <amara at apple.com>
Date: Thu, 11 Jan 2024 02:56:08 -0800
Subject: [PATCH 3/3] braces
---
llvm/lib/CodeGen/GlobalISel/Localizer.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
index 75e2d92e98c07a..ae58e135931f42 100644
--- a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
@@ -66,9 +66,10 @@ unsigned Localizer::getNumPhiUses(MachineOperand &Op) const {
Register SrcReg = Op.getReg();
unsigned NumUses = 0;
- for (unsigned I = 0, NumVals = MI->getNumIncomingValues(); I < NumVals; ++I)
+ for (unsigned I = 0, NumVals = MI->getNumIncomingValues(); I < NumVals; ++I) {
if (MI->getIncomingValue(I) == SrcReg)
++NumUses;
+ }
return NumUses;
}
More information about the llvm-commits
mailing list