[llvm-branch-commits] [llvm] AMDGPU: Handle rewriting VGPR MFMAs with immediate src2 (PR #150566)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sat Jul 26 16:58:08 PDT 2025
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/150566
>From 8a0bba305ae33515acd347395761ed8ddb82ea3f Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 25 Jul 2025 10:23:31 +0900
Subject: [PATCH] AMDGPU: Handle rewriting VGPR MFMAs with immediate src2
This can follow the simpler tied operand handling path.
---
.../AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp | 26 +++---
...class-vgpr-mfma-to-agpr-negative-tests.mir | 87 -------------------
...class-vgpr-mfma-to-av-with-load-source.mir | 82 +++++++++++++++++
3 files changed, 96 insertions(+), 99 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index dd87b196a24ef..4754596cf3bb9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -206,19 +206,19 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
continue;
MachineOperand *Src2 = TII.getNamedOperand(*MFMA, AMDGPU::OpName::src2);
- if (!Src2->isReg())
- continue;
-
- Register Src2Reg = Src2->getReg();
- if (!Src2Reg.isVirtual())
- continue;
+ Register Src2Reg;
+ if (Src2->isReg()) {
+ Src2Reg = Src2->getReg();
+ if (!Src2Reg.isVirtual())
+ continue;
+ }
// FIXME: getMinimalPhysRegClass returns a nonsense AV_* subclass instead
// of an AGPR or VGPR subclass, so we can't simply use the result on the
// assignment.
LLVM_DEBUG({
- Register Src2PhysReg = VRM.getPhys(Src2->getReg());
+ Register Src2PhysReg = VRM.getPhys(Src2Reg);
dbgs() << "Attempting to replace VGPR MFMA with AGPR version:"
<< " Dst=[" << printReg(VReg) << " => "
<< printReg(PhysReg, &TRI) << "], Src2=["
@@ -226,7 +226,7 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
<< printReg(Src2PhysReg, &TRI) << "]: " << *MFMA;
});
- const TargetRegisterClass *DstVirtRegRC = MRI.getRegClass(Src2->getReg());
+ const TargetRegisterClass *DstVirtRegRC = MRI.getRegClass(MFMADstReg);
const TargetRegisterClass *NewDstConstraintRC =
TII.getRegClass(TII.get(AGPROp), 0, &TRI, MF);
const TargetRegisterClass *NewSrc2ConstraintRC = NewDstConstraintRC;
@@ -236,8 +236,8 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
MF) &&
"expected src2 and dst to have same class constraint");
- const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2Reg);
-
+ // src2 and dst have the same physical class constraint; try to preserve
+ // the original src2 subclass if one were to exist.
SmallVector<MachineInstr *, 4> DstRewriteCandidates;
// We've found av = COPY (MFMA), and need to verify that we can trivially
@@ -267,8 +267,10 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
// If the inputs are tied and the same register, we can shortcut and
// directly replace the register.
- if (Src2->getReg() != MFMADstReg ||
- Src2->getSubReg() != DefMI->getOperand(1).getSubReg()) {
+ if (Src2Reg && (Src2Reg != MFMADstReg ||
+ Src2->getSubReg() != DefMI->getOperand(1).getSubReg())) {
+ const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2Reg);
+
// If src2 and dst are different registers, we need to also reassign the
// input to an available AGPR if it is compatible with all other uses.
//
diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir
index b4716a293284a..c9347f5321f3e 100644
--- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir
@@ -16,10 +16,6 @@
ret void
}
- define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_imm_src2() #0 {
- ret void
- }
-
define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_physreg_src2() #0 {
ret void
}
@@ -345,89 +341,6 @@ body: |
...
-# Non-mac variant, src2 is an immediate.
----
-name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_imm_src2
-tracksRegLiveness: true
-machineFunctionInfo:
- isEntryFunction: true
- stackPtrOffsetReg: '$sgpr32'
- occupancy: 10
- sgprForEXECCopy: '$sgpr100_sgpr101'
-body: |
- ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_imm_src2
- ; CHECK: bb.0:
- ; CHECK-NEXT: successors: %bb.1(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
- ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
- ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
- ; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr0_sgpr1
- ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
- ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
- ; CHECK-NEXT: liveins: $vcc, $vgpr0_vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: early-clobber renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
- ; CHECK-NEXT: S_BRANCH %bb.2
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: liveins: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17:0x00000000FFFFFFFF
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
- ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
- ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
- ; CHECK-NEXT: S_ENDPGM 0
- bb.0:
- S_NOP 0, implicit-def $agpr0
- renamable $sgpr0 = S_MOV_B32 0
- undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
- renamable $sgpr1 = COPY renamable $sgpr0
- %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
- renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
- %0.sub9:vreg_512_align2 = COPY %0.sub8
-
- bb.1:
- liveins: $vcc
-
- %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, 0, 0, 0, 0, implicit $mode, implicit $exec
- S_CBRANCH_VCCNZ %bb.1, implicit $vcc
- S_BRANCH %bb.2
-
- bb.2:
- ; No VGPRs available for %0
- S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
- S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
- S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
- S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
- S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
- S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
- S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
- %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
- GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
- GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
- GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
- S_ENDPGM 0
-
-...
-
# Non-mac variant, src2 is a physical register
---
name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_physreg_src2
diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
index 8cce41360ab82..8deaac4943990 100644
--- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
@@ -1183,3 +1183,85 @@ body: |
S_ENDPGM 0
...
+
+# Non-mac variant, src2 is an immediate.
+---
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_imm_src2
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_imm_src2
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr0_sgpr1
+ ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $vcc, $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ S_NOP 0, implicit-def $agpr0
+ renamable $sgpr0 = S_MOV_B32 0
+ undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr1 = COPY renamable $sgpr0
+ %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
+ renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ %0.sub9:vreg_512_align2 = COPY %0.sub8
+
+ bb.1:
+ liveins: $vcc
+
+ %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ ; No VGPRs available for %0
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ S_ENDPGM 0
+
+...
More information about the llvm-branch-commits
mailing list