[llvm] [AMDGPU][True16] added Pre-RA hint to improve copy elimination (PR #103366)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 18 07:20:03 PST 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/103366
>From 889b4cb3616b8acb49275cd62b4a9f9e04a75140 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Tue, 13 Aug 2024 12:32:10 -0400
Subject: [PATCH] [AMDGPU][True16] add PreRA hint to improve elimination for
16bit and 32bit register copy
---
.../Target/AMDGPU/GCNPreRAOptimizations.cpp | 43 ++++++++++++
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 67 +++++++++++++++++++
llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 12 ++++
llvm/test/CodeGen/AMDGPU/fadd.f16.ll | 4 +-
4 files changed, 123 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 0f008f70a6c3d..1a00b5a846b2f 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -22,12 +22,22 @@
/// although the same shall be possible with other register classes and
/// instructions if necessary.
///
+/// This pass also adds register allocation hints to COPY.
+/// The hints will be post-processed by SIRegisterInfo::getRegAllocationHints.
+/// When using True16, we often see COPY moving a 16-bit value between a VGPR_32
+/// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
+/// the VGPR_32, the COPY can be completely eliminated.
+///
//===----------------------------------------------------------------------===//
#include "GCNPreRAOptimizations.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
@@ -253,5 +263,38 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
Changed |= processReg(Reg);
}
+ if (!ST.useRealTrue16Insts())
+ return Changed;
+
+ // Add RA hints to improve True16 COPY elimination.
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ if (MI.getOpcode() != AMDGPU::COPY)
+ continue;
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ if (Dst.isVirtual() &&
+ MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+ Src.isPhysical() &&
+ TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
+ if (Src.isVirtual() &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
+ Dst.isPhysical() &&
+ TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
+ if (!Dst.isVirtual() || !Src.isVirtual())
+ continue;
+ if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
+ MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
+ MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
+ }
+ if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
+ }
+ }
+
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 71c720ed09b5f..8caec32aacc8c 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3652,6 +3652,73 @@ const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
}
+/// Translate the True16 pairing hint recorded on \p VirtReg into concrete
+/// physical-register suggestions appended to \p Hints.
+///
+/// The hint stored in MachineRegisterInfo is a (type, register) pair. For the
+/// two AMDGPURI types the register is the other operand of a COPY between a
+/// VGPR_16 and a VGPR_32:
+///  - AMDGPURI::Size32: suggest the VGPR_32 whose lo16 sub-register is the
+///    register already chosen for the paired 16-bit value.
+///  - AMDGPURI::Size16: suggest the lo16 sub-register of the register already
+///    chosen for the paired 32-bit value; if the pair has no physical register
+///    yet, suggest every non-reserved, non-hi16 VGPR_16 in \p Order.
+///
+/// \param VirtReg virtual register being allocated.
+/// \param Order   candidate physical registers, in allocation order.
+/// \param Hints   output list that suggestions are appended to.
+/// \param MF      function being compiled; supplies MachineRegisterInfo.
+/// \param VRM     current virtual-to-physical map, may be null.
+/// \param Matrix  forwarded unchanged to the generic implementation.
+/// \returns false for the AMDGPURI hint types; for any other hint type, the
+///          result of TargetRegisterInfo::getRegAllocationHints.
+bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
+                                           ArrayRef<MCPhysReg> Order,
+                                           SmallVectorImpl<MCPhysReg> &Hints,
+                                           const MachineFunction &MF,
+                                           const VirtRegMap *VRM,
+                                           const LiveRegMatrix *Matrix) const {
+
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg);
+
+  switch (Hint.first) {
+  case AMDGPURI::Size32: {
+    Register Paired = Hint.second;
+    assert(Paired);
+    Register PairedPhys;
+    if (Paired.isPhysical()) {
+      PairedPhys =
+          getMatchingSuperReg(Paired, AMDGPU::lo16, &AMDGPU::VGPR_32RegClass);
+    } else if (VRM && VRM->hasPhys(Paired)) {
+      PairedPhys = getMatchingSuperReg(VRM->getPhys(Paired), AMDGPU::lo16,
+                                       &AMDGPU::VGPR_32RegClass);
+    }
+
+    // Prefer the paired physreg. getMatchingSuperReg only yields a register
+    // when Paired is the lo16 half of a VGPR_32, so no separate isLo check is
+    // needed; PairedPhys stays unset otherwise and nothing is suggested.
+    if (PairedPhys)
+      Hints.push_back(PairedPhys);
+    return false;
+  }
+  case AMDGPURI::Size16: {
+    Register Paired = Hint.second;
+    assert(Paired);
+    Register PairedPhys;
+    // This object is the SIRegisterInfo, so query sub-registers directly
+    // instead of going back through the subtarget.
+    if (Paired.isPhysical()) {
+      PairedPhys = getSubReg(Paired, AMDGPU::lo16);
+    } else if (VRM && VRM->hasPhys(Paired)) {
+      PairedPhys = getSubReg(VRM->getPhys(Paired), AMDGPU::lo16);
+    }
+
+    // First prefer the paired physreg.
+    if (PairedPhys) {
+      Hints.push_back(PairedPhys);
+      return false;
+    }
+
+    // Add all the lo16 physregs.
+    // When the Paired operand has not yet been assigned a physreg it is
+    // better to try putting VirtReg in a lo16 register, because possibly
+    // later Paired can be assigned to the overlapping register and the COPY
+    // can be eliminated.
+    // PairedPhys is unset on this path, so there is no register to exclude
+    // from the candidates beyond the hi16 halves.
+    for (MCPhysReg PhysReg : Order) {
+      if (AMDGPU::isHi16Reg(PhysReg, *this))
+        continue;
+      if (AMDGPU::VGPR_16RegClass.contains(PhysReg) &&
+          !MRI.isReserved(PhysReg))
+        Hints.push_back(PhysReg);
+    }
+    return false;
+  }
+  default:
+    // Not one of our hint types: defer to the generic handling and pass every
+    // argument through, including Matrix.
+    return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
+                                                     VRM, Matrix);
+  }
+}
+
MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
// Not a callee saved register.
return AMDGPU::SGPR30_SGPR31;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index a434efb70d052..43288652e3aa7 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -29,6 +29,13 @@ class LiveRegUnits;
class RegisterBank;
struct SGPRSpillBuilder;
+/// Register allocation hint types. Helps eliminate unneeded COPY with True16
+namespace AMDGPURI {
+/* Hint-type tags passed as the Type argument of
+   MachineRegisterInfo::setRegAllocationHint and decoded again in
+   SIRegisterInfo::getRegAllocationHints.
+   Size16: set on a VGPR_16 virtual register; the hint register is the
+           VGPR_32 on the other side of the COPY.
+   Size32: set on a VGPR_32 virtual register; the hint register is the
+           VGPR_16 on the other side of the COPY. */
+enum { Size16 = 1, Size32 = 2 };
+
+} // end namespace AMDGPURI
+
class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
private:
const GCNSubtarget &ST;
@@ -329,6 +336,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
unsigned getRegPressureSetLimit(const MachineFunction &MF,
unsigned Idx) const override;
+ bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF, const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const override;
+
const int *getRegUnitPressureSets(unsigned RegUnit) const override;
MCRegister getReturnAddressReg(const MachineFunction &MF) const;
diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
index a94f27a0332c7..fc3624cdfe118 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -76,9 +76,7 @@ define amdgpu_kernel void @fadd_f16(
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-SDAG-NEXT: s_endpgm
;
More information about the llvm-commits
mailing list