[llvm-branch-commits] [llvm] AMDGPU: Handle rewriting VGPR MFMA to AGPR with subregister copies (PR #153019)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Aug 11 07:39:36 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
<details>
<summary>Changes</summary>
This should address the case where the MFMA result isn't fully used,
which results in partial (subregister) copy bundles from the MFMA result.
---
Patch is 56.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/153019.diff
6 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp (+45-61)
- (modified) llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir (-91)
- (modified) llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir (+89-8)
- (modified) llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir (+18-18)
- (modified) llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir (+38-38)
- (modified) llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll (+36-62)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index a8dfdbe5dd494..5206f32ec99e5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -14,8 +14,6 @@
/// MFMA opcode.
///
/// TODO:
-/// - Handle SplitKit partial copy bundles, and not just full copy instructions
-///
/// - Update LiveIntervals incrementally instead of recomputing from scratch
///
//===----------------------------------------------------------------------===//
@@ -37,6 +35,7 @@ using namespace llvm;
namespace {
class AMDGPURewriteAGPRCopyMFMAImpl {
+ MachineFunction &MF;
const GCNSubtarget &ST;
const SIInstrInfo &TII;
const SIRegisterInfo &TRI;
@@ -53,7 +52,7 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
AMDGPURewriteAGPRCopyMFMAImpl(MachineFunction &MF, VirtRegMap &VRM,
LiveRegMatrix &LRM, LiveIntervals &LIS,
const RegisterClassInfo &RegClassInfo)
- : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
+ : MF(MF), ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM),
LIS(LIS), RegClassInfo(RegClassInfo) {}
@@ -71,26 +70,26 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
///
/// \p RewriteRegs will accumulate the set of register used by those MFMAs
/// that need to have the register classes adjusted.
- const TargetRegisterClass *recomputeRegClassExceptRewritable(
- Register Reg, const TargetRegisterClass *OldRC,
- const TargetRegisterClass *NewRC,
- SmallVectorImpl<MachineInstr *> &RewriteCandidates,
+ bool recomputeRegClassExceptRewritable(
+ Register Reg, SmallVectorImpl<MachineInstr *> &RewriteCandidates,
SmallSetVector<Register, 4> &RewriteRegs) const;
bool run(MachineFunction &MF) const;
};
-const TargetRegisterClass *
-AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
- Register StartReg, const TargetRegisterClass *OldRC,
- const TargetRegisterClass *NewRC,
- SmallVectorImpl<MachineInstr *> &RewriteCandidates,
+bool AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
+ Register StartReg, SmallVectorImpl<MachineInstr *> &RewriteCandidates,
SmallSetVector<Register, 4> &RewriteRegs) const {
SmallVector<Register, 8> Worklist = {StartReg};
// Recursively visit all transitive MFMA users
while (!Worklist.empty()) {
Register Reg = Worklist.pop_back_val();
+ const TargetRegisterClass *OldRC = MRI.getRegClass(Reg);
+
+ // Inflate to the equivalent AV_* class.
+ const TargetRegisterClass *NewRC = TRI.getLargestLegalSuperClass(OldRC, MF);
+
// Accumulate constraints from all uses.
for (MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) {
// Apply the effect of the given operand to NewRC.
@@ -101,23 +100,40 @@ AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
// either AGPR or VGPR in src0/src1, so don't bother checking the
// constraint effects of the individual operands.
if (isRewriteCandidate(*MI)) {
- for (AMDGPU::OpName OpName :
- {AMDGPU::OpName::vdst, AMDGPU::OpName::src2}) {
- const MachineOperand *Op = TII.getNamedOperand(*MI, OpName);
+ const MachineOperand *VDst =
+ TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
+ const MachineOperand *Src2 =
+ TII.getNamedOperand(*MI, AMDGPU::OpName::src2);
+ for (const MachineOperand *Op : {VDst, Src2}) {
if (!Op->isReg())
continue;
Register OtherReg = Op->getReg();
- if (OtherReg != Reg) {
- if (RewriteRegs.insert(OtherReg))
- Worklist.push_back(OtherReg);
- }
+ if (OtherReg.isPhysical())
+ return false;
+
+ if (OtherReg != Reg && RewriteRegs.insert(OtherReg))
+ Worklist.push_back(OtherReg);
}
- LLVM_DEBUG(dbgs() << "Ignoring effects of " << *MI);
+ if (!is_contained(RewriteCandidates, MI)) {
+ LLVM_DEBUG({
+ Register VDstPhysReg = VRM.getPhys(VDst->getReg());
+ dbgs() << "Attempting to replace VGPR MFMA with AGPR version:"
+ << " Dst=[" << printReg(VDst->getReg()) << " => "
+ << printReg(VDstPhysReg, &TRI);
+
+ if (Src2->isReg()) {
+ Register Src2PhysReg = VRM.getPhys(Src2->getReg());
+ dbgs() << ", Src2=[" << printReg(Src2->getReg(), &TRI) << " => "
+ << printReg(Src2PhysReg, &TRI);
+ }
+
+ dbgs() << "]: " << MI;
+ });
- if (!is_contained(RewriteCandidates, MI))
RewriteCandidates.push_back(MI);
+ }
continue;
}
@@ -126,13 +142,14 @@ AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, &TII, &TRI);
if (!NewRC || NewRC == OldRC) {
LLVM_DEBUG(dbgs() << "User of " << printReg(Reg, &TRI)
- << " cannot be reassigned to AGPR: " << *MI);
- return nullptr;
+ << " cannot be reassigned to "
+ << TRI.getRegClassName(NewRC) << ": " << *MI);
+ return false;
}
}
}
- return NewRC;
+ return true;
}
/// Attempt to reassign the registers in \p InterferingRegs to be AGPRs, with a
@@ -225,10 +242,7 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
for (VNInfo *VNI : LI.vnis()) {
MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def);
-
- // TODO: Handle SplitKit produced copy bundles for partially defined
- // registers.
- if (!DefMI || !DefMI->isFullCopy())
+ if (!DefMI || !DefMI->isCopy())
continue;
Register MFMADstReg = DefMI->getOperand(1).getReg();
@@ -241,34 +255,6 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
if (!MFMA || !isRewriteCandidate(*MFMA))
continue;
- MachineOperand *Src2 = TII.getNamedOperand(*MFMA, AMDGPU::OpName::src2);
- Register Src2Reg;
- if (Src2->isReg()) {
- Src2Reg = Src2->getReg();
- if (!Src2Reg.isVirtual())
- continue;
- }
-
- // FIXME: getMinimalPhysRegClass returns a nonsense AV_* subclass instead
- // of an AGPR or VGPR subclass, so we can't simply use the result on the
- // assignment.
-
- LLVM_DEBUG({
- dbgs() << "Attempting to replace VGPR MFMA with AGPR version:"
- << " Dst=[" << printReg(VReg) << " => "
- << printReg(PhysReg, &TRI);
-
- if (Src2Reg) {
- Register Src2PhysReg = VRM.getPhys(Src2Reg);
- dbgs() << ", Src2=[" << printReg(Src2Reg, &TRI) << " => "
- << printReg(Src2PhysReg, &TRI);
- }
-
- dbgs() << "]: " << *MFMA;
- });
-
- const TargetRegisterClass *DstVirtRegRC = MRI.getRegClass(MFMADstReg);
-
// src2 and dst have the same physical class constraint; try to preserve
// the original src2 subclass if one were to exist.
SmallVector<MachineInstr *, 4> RewriteCandidates = {MFMA};
@@ -287,11 +273,9 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
//
// Note recomputeRegClassExceptRewritable will consider the constraints of
// this MFMA's src2 as well as the src2/dst of any transitive MFMA users.
- const TargetRegisterClass *DstExceptRC =
- recomputeRegClassExceptRewritable(MFMADstReg, DstVirtRegRC, VirtRegRC,
- RewriteCandidates, RewriteRegs);
- if (!DstExceptRC) {
- LLVM_DEBUG(dbgs() << "Could not recompute the regclass of "
+ if (!recomputeRegClassExceptRewritable(MFMADstReg, RewriteCandidates,
+ RewriteRegs)) {
+ LLVM_DEBUG(dbgs() << "Could not recompute the regclass of dst reg "
<< printReg(MFMADstReg, &TRI) << '\n');
continue;
}
diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir
index 3103d635200c6..45c185b6154ea 100644
--- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir
@@ -20,10 +20,6 @@
ret void
}
- define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg() #0 {
- ret void
- }
-
define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_first() #1 {
ret void
}
@@ -420,93 +416,6 @@ body: |
...
-# Non-mac variant, src2 is the same VGPR, but a different subregister.
----
-name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
-tracksRegLiveness: true
-machineFunctionInfo:
- isEntryFunction: true
- stackPtrOffsetReg: '$sgpr32'
- occupancy: 10
- sgprForEXECCopy: '$sgpr100_sgpr101'
-body: |
- ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
- ; CHECK: bb.0:
- ; CHECK-NEXT: successors: %bb.1(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
- ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
- ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
- ; CHECK-NEXT: renamable $vgpr18_vgpr19 = COPY killed renamable $sgpr0_sgpr1
- ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
- ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
- ; CHECK-NEXT: liveins: $vcc, $vgpr18_vgpr19
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- ; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
- ; CHECK-NEXT: S_BRANCH %bb.2
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31:0x00000000FFFFFFFF
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
- ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
- ; CHECK-NEXT: S_ENDPGM 0
- bb.0:
- S_NOP 0, implicit-def $agpr0
- renamable $sgpr0 = S_MOV_B32 0
- undef %0.sub8:vreg_1024_align2 = V_MOV_B32_e32 0, implicit $exec
- renamable $sgpr1 = COPY renamable $sgpr0
- %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
- renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
- %0.sub9:vreg_1024_align2 = COPY %0.sub8
-
- bb.1:
- liveins: $vcc
-
- undef %0.sub0_sub1:vreg_1024_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- %0.sub16_sub17:vreg_1024_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- %0.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:vreg_1024_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0.sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31, 0, 0, 0, implicit $mode, implicit $exec
- S_CBRANCH_VCCNZ %bb.1, implicit $vcc
- S_BRANCH %bb.2
-
- bb.2:
- ; No VGPRs available for %0
- S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
- S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
- S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
- S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
- S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
- S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
- S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
- %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
- GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
- GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
- GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
- S_ENDPGM 0
-
-...
-
# There isn't an assignable AGPR around the first MFMA.
---
name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_first
diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
index 3de86da766af7..735c7e081b21a 100644
--- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
@@ -1116,11 +1116,8 @@ body: |
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
- ; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $agpr0_agpr1
; CHECK-NEXT: renamable $vgpr2_vgpr3 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
- ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 killed $vgpr2_vgpr3, $vgpr2_vgpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: renamable $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY renamable $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_mac_e64 killed $vgpr2_vgpr3, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
@@ -1202,10 +1199,8 @@ body: |
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
; CHECK-NEXT: renamable $vgpr0_vgpr1 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
- ; CHECK-NEXT: renamable $vgpr18_vgpr19 = COPY killed renamable $agpr0_agpr1
- ; CHECK-NEXT: early-clobber renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_vgprcd_e64 killed $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: renamable $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY renamable $vgpr2_vgpr3_vgpr4_vgpr5
+ ; CHECK-NEXT: renamable $agpr16_agpr17 = COPY killed renamable $agpr0_agpr1
+ ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 killed $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
@@ -1957,3 +1952,89 @@ body: |
S_ENDPGM 0
...
+
+# Non-mac variant, src2 is the same VGPR, but a different subregister.
+---
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0,...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/153019
More information about the llvm-branch-commits
mailing list