[llvm] [AMDGPU] Generate COPY for each use-constraint instead of constraining the register class (PR #182104)
Chinmay Deshpande via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 18 11:00:54 PST 2026
https://github.com/chinmaydd created https://github.com/llvm/llvm-project/pull/182104
Follow-up to https://github.com/llvm/llvm-project/pull/181909
From ac0c3c8deb7c0eb42beea47bbf997b4f70b1c723 Mon Sep 17 00:00:00 2001
From: Chinmay Deshpande <chdeshpa at amd.com>
Date: Wed, 18 Feb 2026 13:58:33 -0500
Subject: [PATCH] [AMDGPU] Generate COPY for each use-constraint instead of
constraining the register class
Change-Id: I80cc6ce1a55036c0744515d06c26b4255a62a8ae
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 49 ++++++++++---------
.../fix-sgpr-copies-wmma-scale-lo256.mir | 10 ++--
2 files changed, 32 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d5a049bb10515..944bdfce688ac 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8319,26 +8319,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
if (const TargetRegisterClass *CommonRC =
RI.getCommonSubClass(NewDstRC, SrcRC)) {
- // Also intersect with VGPR-compatible operand register class
- // constraints from user instructions. This preserves restricted
- // register classes (e.g., VGPR_32_Lo256 for WMMA scale operands) that
- // would otherwise be lost when an SGPR is replaced with a VGPR.
- // Constraints incompatible with VGPRs (e.g., SALU instructions
- // requiring SReg_32) are skipped because those users will be converted
- // to VALU by the worklist.
- for (const MachineOperand &UseMO : MRI.use_operands(DstReg)) {
- const MachineInstr *UseMI = UseMO.getParent();
- if (UseMI == &Inst)
- continue;
- unsigned OpIdx = UseMI->getOperandNo(&UseMO);
- if (const TargetRegisterClass *OpRC =
- getRegClass(UseMI->getDesc(), OpIdx)) {
- if (const TargetRegisterClass *Narrowed =
- RI.getCommonSubClass(CommonRC, OpRC))
- CommonRC = Narrowed;
- }
- }
-
// Instead of creating a copy where src and dst are the same register
// class, we just replace all uses of dst with src. These kinds of
// copies interfere with the heuristics MachineSink uses to decide
@@ -8354,10 +8334,33 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
llvm_unreachable("failed to constrain register");
Inst.eraseFromParent();
- // Legalize t16 operand since replaceReg is called after addUsersToVALU
- for (MachineOperand &MO :
+
+ const TargetRegisterClass *NewDstRegRC = MRI.getRegClass(NewDstReg);
+ for (MachineOperand &UseMO :
make_early_inc_range(MRI.use_operands(NewDstReg))) {
- legalizeOperandsVALUt16(*MO.getParent(), MRI);
+ MachineInstr &UseMI = *UseMO.getParent();
+
+ // Legalize t16 operands since replaceReg is called after
+ // addUsersToVALU.
+ legalizeOperandsVALUt16(UseMI, MRI);
+
+ // If a user operand requires a narrower register class than
+ // NewDstReg (e.g., VGPR_32_Lo256 for WMMA scale operands), emit
+ // a COPY to a new register with the correct class.
+ unsigned OpIdx = UseMI.getOperandNo(&UseMO);
+ const TargetRegisterClass *OpRC =
+ getRegClass(UseMI.getDesc(), OpIdx);
+ if (!OpRC)
+ continue;
+ const TargetRegisterClass *NarrowRC =
+ RI.getCommonSubClass(NewDstRegRC, OpRC);
+ if (!NarrowRC || NarrowRC == NewDstRegRC)
+ continue;
+ Register CopyReg = MRI.createVirtualRegister(NarrowRC);
+ BuildMI(*UseMI.getParent(), &UseMI, UseMI.getDebugLoc(),
+ get(AMDGPU::COPY), CopyReg)
+ .addReg(NewDstReg);
+ UseMO.setReg(CopyReg);
}
return;
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wmma-scale-lo256.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wmma-scale-lo256.mir
index 4cead3056d808..d4b645cdbeb7f 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wmma-scale-lo256.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wmma-scale-lo256.mir
@@ -19,12 +19,13 @@ body: |
; CHECK-LABEL: name: wmma_scale_copy_vgpr_to_sgpr
; CHECK: liveins: $vgpr0, $sgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32_lo256 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
- ; CHECK-NEXT: early-clobber %6:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[COPY]], [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32_lo256 = COPY [[COPY]]
+ ; CHECK-NEXT: early-clobber %6:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[COPY1]], [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: S_ENDPGM 0
%0:vgpr_32 = COPY $vgpr0
%1:sreg_32 = COPY %0
@@ -47,12 +48,13 @@ body: |
; CHECK-LABEL: name: wmma_scale_copy_vgpr_to_sgpr_src1
; CHECK: liveins: $vgpr0, $sgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32_lo256 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
- ; CHECK-NEXT: early-clobber %6:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32_lo256 = COPY [[COPY]]
+ ; CHECK-NEXT: early-clobber %6:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[COPY1]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: S_ENDPGM 0
%0:vgpr_32 = COPY $vgpr0
%1:sreg_32 = COPY %0
More information about the llvm-commits
mailing list