[llvm] AMDGPU: Stop special casing aligned VGPR targets in operand folding (PR #155559)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 2 08:38:15 PDT 2025
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/155559
From 9b44120bd568dbe09a8753525deb9b0eed72a1b7 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 27 Aug 2025 11:50:21 +0900
Subject: [PATCH 1/3] AMDGPU: Stop special casing aligned VGPR targets in
operand folding
Perform a register class constraint check when applying the fold.
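Roughly, the new check picks a register class that satisfies both the
instruction operand's constraint and any subregister indices involved,
then constrains the folded-in register to it. A condensed sketch of the
updateOperand() hunk below (pickConstraintRC is an illustrative helper
name for this sketch, not something the patch adds; error handling is
trimmed):

  // Choose the class the folded-in register must be constrained to.
  // OpRC is the class required by the use operand; Old is the operand
  // being replaced and New is the register being folded in.
  static const TargetRegisterClass *
  pickConstraintRC(const SIRegisterInfo &TRI, const TargetRegisterClass *OpRC,
                   const TargetRegisterClass *OldRC, unsigned OldSubReg,
                   const TargetRegisterClass *NewRC, unsigned NewSubReg) {
    if (NewSubReg && OldSubReg) {
      unsigned PreA, PreB;
      return TRI.getCommonSuperRegClass(OpRC, OldSubReg, NewRC, NewSubReg,
                                        PreA, PreB);
    }
    if (OldSubReg)
      return TRI.getMatchingSuperRegClass(OldRC, OpRC, OldSubReg);
    if (NewSubReg)
      return TRI.getMatchingSuperRegClass(NewRC, OpRC, NewSubReg);
    return OpRC; // No subregisters involved; the operand class suffices.
  }

The fold is then kept only if MRI->constrainRegClass() to that class
succeeds, which subsumes the needsAlignedVGPRs() special case removed from
foldOperand() below.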
---
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 59 +++++++++++--------
.../CodeGen/AMDGPU/si-fold-aligned-agprs.mir | 15 ++---
.../CodeGen/AMDGPU/si-fold-aligned-vgprs.mir | 15 ++---
3 files changed, 47 insertions(+), 42 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 3979e1e0c44aa..a116b57c85a88 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -173,6 +173,7 @@ struct FoldCandidate {
class SIFoldOperandsImpl {
public:
+ MachineFunction *MF;
MachineRegisterInfo *MRI;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
@@ -705,6 +706,36 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
}
MachineOperand *New = Fold.Def.OpToFold;
+
+ // Verify the register is compatible with the operand.
+ if (const TargetRegisterClass *OpRC =
+ TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI, *MF)) {
+ const TargetRegisterClass *OldRC = MRI->getRegClass(Old.getReg());
+ const TargetRegisterClass *NewRC = MRI->getRegClass(New->getReg());
+ unsigned NewSubReg = New->getSubReg();
+ unsigned OldSubReg = Old.getSubReg();
+
+ const TargetRegisterClass *ConstrainRC = OpRC;
+ if (NewSubReg && OldSubReg) {
+ unsigned PreA, PreB;
+ ConstrainRC = TRI->getCommonSuperRegClass(OpRC, OldSubReg, NewRC,
+ NewSubReg, PreA, PreB);
+ } else if (OldSubReg) {
+ ConstrainRC = TRI->getMatchingSuperRegClass(OldRC, OpRC, OldSubReg);
+ } else if (NewSubReg) {
+ ConstrainRC = TRI->getMatchingSuperRegClass(NewRC, OpRC, NewSubReg);
+ }
+
+ if (!ConstrainRC)
+ return false;
+
+ if (!MRI->constrainRegClass(New->getReg(), ConstrainRC)) {
+ LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI)
+ << " to " << TRI->getRegClassName(ConstrainRC) << '\n');
+ return false;
+ }
+ }
+
// Rework once the VS_16 register class is updated to include proper
// 16-bit SGPRs instead of 32-bit ones.
if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
@@ -1429,30 +1460,9 @@ void SIFoldOperandsImpl::foldOperand(
return;
}
- if (!FoldingImmLike) {
- if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
- // Don't fold if OpToFold doesn't hold an aligned register.
- const TargetRegisterClass *RC =
- TRI->getRegClassForReg(*MRI, OpToFold.getReg());
- assert(RC);
- if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
- unsigned SubReg = OpToFold.getSubReg();
- if (const TargetRegisterClass *SubRC =
- TRI->getSubRegisterClass(RC, SubReg))
- RC = SubRC;
- }
-
- if (!RC || !TRI->isProperlyAlignedRC(*RC))
- return;
- }
-
- tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
-
- // FIXME: We could try to change the instruction from 64-bit to 32-bit
- // to enable more folding opportunities. The shrink operands pass
- // already does this.
- return;
- }
+ // FIXME: We could try to change the instruction from 64-bit to 32-bit
+ // to enable more folding opportunities. The shrink operands pass
+ // already does this.
tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
}
@@ -2747,6 +2757,7 @@ bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
}
bool SIFoldOperandsImpl::run(MachineFunction &MF) {
+ this->MF = &MF;
MRI = &MF.getRegInfo();
ST = &MF.getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir
index a0ea04b1b9c0f..8326862706a02 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir
@@ -31,9 +31,8 @@ body: |
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
- ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_64 = IMPLICIT_DEF
- ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY killed [[DEF]]
- ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_64_align2 = IMPLICIT_DEF
+ ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]], 0, 0, implicit $exec
%0:vreg_64_align2 = COPY $vgpr0_vgpr1
%1:areg_64 = IMPLICIT_DEF
%2:areg_64_align2 = COPY killed %1
@@ -105,9 +104,8 @@ body: |
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
- ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_96 = IMPLICIT_DEF
- ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY killed [[DEF]]
- ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_96_align2 = IMPLICIT_DEF
+ ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[DEF]], 0, 0, implicit $exec
%0:vreg_64_align2 = COPY $vgpr0_vgpr1
%1:areg_96 = IMPLICIT_DEF
%2:areg_96_align2 = COPY killed %1
@@ -234,9 +232,8 @@ body: |
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
- ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_128 = IMPLICIT_DEF
- ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY killed [[DEF]]
- ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+ ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[DEF]], 0, 0, implicit $exec
%0:vreg_64_align2 = COPY $vgpr0_vgpr1
%1:areg_128 = IMPLICIT_DEF
%2:areg_128_align2 = COPY killed %1
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
index a54c0accce783..9dd025a3da086 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
@@ -46,9 +46,8 @@ body: |
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
- ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
- ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[DEF]]
- ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]], 0, 0, implicit $exec
%0:vreg_64_align2 = COPY $vgpr0_vgpr1
%1:vreg_64 = IMPLICIT_DEF
%2:vreg_64_align2 = COPY killed %1
@@ -148,9 +147,8 @@ body: |
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
- ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_96 = IMPLICIT_DEF
- ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vreg_96_align2 = COPY killed [[DEF]]
- ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_96_align2 = IMPLICIT_DEF
+ ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[DEF]], 0, 0, implicit $exec
%0:vreg_64_align2 = COPY $vgpr0_vgpr1
%1:vreg_96 = IMPLICIT_DEF
%2:vreg_96_align2 = COPY killed %1
@@ -326,9 +324,8 @@ body: |
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
- ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
- ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY killed [[DEF]]
- ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+ ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[DEF]], 0, 0, implicit $exec
%0:vreg_64_align2 = COPY $vgpr0_vgpr1
%1:vreg_128 = IMPLICIT_DEF
%2:vreg_128_align2 = COPY killed %1
From f5a6c5751553312dc0f8008d22c66863dd302ae4 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 29 Aug 2025 20:58:03 +0900
Subject: [PATCH 2/3] Add rocFFT regression test
---
.../CodeGen/AMDGPU/si-fold-aligned-vgprs.mir | 49 +++++++++++++++++++
1 file changed, 49 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
index 9dd025a3da086..7d5493bb7b538 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
@@ -331,3 +331,52 @@ body: |
%2:vreg_128_align2 = COPY killed %1
GLOBAL_STORE_DWORDX4 %0, %2, 0, 0, implicit $exec
...
+
+# Make sure the alignment requirement is respected for VS_64 operand
+# uses.
+---
+name: aligned_vgpr_vs_64_constraint
+tracksRegLiveness: true
+isSSA: true
+body: |
+ bb.0.entry:
+ liveins: $vgpr0, $sgpr8_sgpr9
+
+ ; GFX908-LABEL: name: aligned_vgpr_regression
+ ; GFX908: liveins: $vgpr0, $sgpr8_sgpr9
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+ ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX908-NEXT: [[GLOBAL_LOAD_DWORDX3_SADDR:%[0-9]+]]:vreg_96_align2 = GLOBAL_LOAD_DWORDX3_SADDR [[COPY]], [[COPY1]], 16, 0, implicit $exec :: (load (s96), align 4, addrspace 1)
+ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX3_SADDR]].sub0
+ ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+ ; GFX908-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, killed [[REG_SEQUENCE]], 0, [[GLOBAL_LOAD_DWORDX3_SADDR]].sub1_sub2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX908-NEXT: DS_WRITE_B64_gfx9 [[V_MOV_B32_e32_]], killed [[V_PK_ADD_F32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3)
+ ; GFX908-NEXT: S_ENDPGM 0
+ ;
+ ; GFX90A-LABEL: name: aligned_vgpr_regression
+ ; GFX90A: liveins: $vgpr0, $sgpr8_sgpr9
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[GLOBAL_LOAD_DWORDX3_SADDR:%[0-9]+]]:vreg_96_align2 = GLOBAL_LOAD_DWORDX3_SADDR [[COPY]], [[COPY1]], 16, 0, implicit $exec :: (load (s96), align 4, addrspace 1)
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX3_SADDR]].sub0
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY killed [[GLOBAL_LOAD_DWORDX3_SADDR]].sub1_sub2
+ ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+ ; GFX90A-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, killed [[REG_SEQUENCE]], 0, killed [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: DS_WRITE_B64_gfx9 [[V_MOV_B32_e32_]], killed [[V_PK_ADD_F32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ %0:sgpr_64 = COPY $sgpr8_sgpr9
+ %1:vgpr_32 = COPY $vgpr0
+ %2:vreg_96_align2 = GLOBAL_LOAD_DWORDX3_SADDR %0, %1, 16, 0, implicit $exec :: (load (s96), align 4, addrspace 1)
+ %3:vgpr_32 = COPY %2.sub0
+ %4:vreg_64_align2 = COPY killed %2.sub1_sub2
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %6:vreg_64_align2 = REG_SEQUENCE %3, %subreg.sub0, %5, %subreg.sub1
+ %7:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, killed %6, 0, killed %4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ DS_WRITE_B64_gfx9 %5, killed %7, 0, 0, implicit $exec :: (store (s64), addrspace 3)
+ S_ENDPGM 0
+
+...
From 1b865475e8c1bcf956e25399779f98c6345e407a Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 3 Sep 2025 00:37:50 +0900
Subject: [PATCH 3/3] fix test
---
llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
index 7d5493bb7b538..5f9b71c0c2198 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
@@ -342,7 +342,7 @@ body: |
bb.0.entry:
liveins: $vgpr0, $sgpr8_sgpr9
- ; GFX908-LABEL: name: aligned_vgpr_regression
+ ; GFX908-LABEL: name: aligned_vgpr_vs_64_constraint
; GFX908: liveins: $vgpr0, $sgpr8_sgpr9
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
@@ -355,7 +355,7 @@ body: |
; GFX908-NEXT: DS_WRITE_B64_gfx9 [[V_MOV_B32_e32_]], killed [[V_PK_ADD_F32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3)
; GFX908-NEXT: S_ENDPGM 0
;
- ; GFX90A-LABEL: name: aligned_vgpr_regression
+ ; GFX90A-LABEL: name: aligned_vgpr_vs_64_constraint
; GFX90A: liveins: $vgpr0, $sgpr8_sgpr9
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9