[llvm] Co-issue packed instructions by unpacking (PR #151704)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 4 14:47:38 PDT 2025
================
@@ -260,38 +589,46 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
Changed |= processReg(Reg);
}
- if (!ST.useRealTrue16Insts())
- return Changed;
-
// Add RA hints to improve True16 COPY elimination.
- for (const MachineBasicBlock &MBB : MF) {
- for (const MachineInstr &MI : MBB) {
- if (MI.getOpcode() != AMDGPU::COPY)
- continue;
- Register Dst = MI.getOperand(0).getReg();
- Register Src = MI.getOperand(1).getReg();
- if (Dst.isVirtual() &&
- MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
- Src.isPhysical() &&
- TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
- MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
- if (Src.isVirtual() &&
- MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
- Dst.isPhysical() &&
- TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
- MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
- if (!Dst.isVirtual() || !Src.isVirtual())
- continue;
- if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
- MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
- MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
- MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
+ // Unpack packed instructions to overlap MFMAs. This allows the compiler to co-issue unpacked instructions with MFMA
+ for (MachineBasicBlock &MBB : MF) {
+ DenseSet<MachineInstr *> instrsToUnpack;
+ for (MachineInstr &MI : MBB) {
+ if (SIInstrInfo::isMFMA(MI)){
+ createListOfPackedInstr(MI, instrsToUnpack);
+ }
+ if (ST.useRealTrue16Insts()){
+ if (MI.getOpcode() != AMDGPU::COPY)
+ continue;
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ if (Dst.isVirtual() &&
+ MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+ Src.isPhysical() &&
+ TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
+ if (Src.isVirtual() &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
+ Dst.isPhysical() &&
+ TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
+ if (!Dst.isVirtual() || !Src.isVirtual())
+ continue;
+ if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
+ MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
+ MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
+ }
+ if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
}
- if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
- MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
- MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
+ }
+
+ if (!instrsToUnpack.empty()) {
----------------
bcahoon wrote:
Why do you create instrsToUnpack at the top of the loop, but do the insertion at the bottom?
https://github.com/llvm/llvm-project/pull/151704
More information about the llvm-commits mailing list