[llvm] [AMDGPU] Optimize away v_readfirstlane_b32 on SGPR input (PR #151033)
Josh Hutton via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 28 16:46:02 PDT 2025
https://github.com/JoshHuttonCode updated https://github.com/llvm/llvm-project/pull/151033
>From 3af615df7eb3538c031a91b551a64273ed20c364 Mon Sep 17 00:00:00 2001
From: Josh Hutton <joshhuttonemail at gmail.com>
Date: Tue, 22 Jul 2025 12:39:38 -0700
Subject: [PATCH 1/2] [AMDGPU] Optimize away v_readfirstlane_b32 on SGPR input
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 104 ++++++-
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 4 +
.../AMDGPU/constant-address-space-32bit.ll | 2 +-
...way-v_readfirstlane_b32-on-sgpr-source.mir | 271 ++++++++++++++++++
4 files changed, 374 insertions(+), 7 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/optimize-away-v_readfirstlane_b32-on-sgpr-source.mir
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2aa6b4e82f9d5..72d2d7300ec6d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6382,6 +6382,67 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
legalizeOpWithMove(MI, VOP3Idx[2]);
}
+// Recursively check to see if the ultimate source of a readfirstlane is SGPR.
+// If it is, readfirstlane can be omitted, and the source of the value can be
+// used directly.
+Register SIInstrInfo::checkIsSourceSGPR(const MachineOperand &MO,
+ const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *TRI,
+ int MaxDepth) const {
+ if (MaxDepth == 0)
+ return Register();
+
+ Register PotentialSGPR = MO.getReg();
+
+ // While we could return a physical SGPR source, we would need to guarantee it
+ // has not been redefined.
+ if (PotentialSGPR.isPhysical())
+ return Register();
+
+ assert(MRI.hasOneDef(PotentialSGPR));
+
+ MachineInstr *MI = MRI.getVRegDef(PotentialSGPR);
+ auto MIOpc = MI->getOpcode();
+ const TargetRegisterClass *RC = MRI.getRegClass(PotentialSGPR);
+
+ if (RI.hasSGPRs(RC))
+ return PotentialSGPR;
+
+ switch (MIOpc) {
+ case AMDGPU::COPY: {
+ MachineOperand CopySource = MI->getOperand(1);
+ return checkIsSourceSGPR(CopySource, MRI, TRI, MaxDepth - 1);
+ }
+ case AMDGPU::REG_SEQUENCE: {
+ unsigned SubRegToFind = MO.getSubReg();
+ unsigned SubRegOperandIndex = 2;
+ unsigned CopySourceIndex = 0;
+
+ // Since subregs may be listed out of order, we need to
+ // loop over operands to find the subreg we are looking for.
+ while (SubRegOperandIndex < MI->getNumOperands()) {
+ assert(MI->isOperandSubregIdx(SubRegOperandIndex));
+
+ unsigned SubRegIndex = MI->getOperand(SubRegOperandIndex).getImm();
+ if (SubRegIndex == SubRegToFind) {
+ CopySourceIndex = SubRegOperandIndex - 1;
+ break;
+ }
+
+ SubRegOperandIndex += 2;
+ }
+
+ if (SubRegOperandIndex >= MI->getNumOperands())
+ return Register();
+
+ MachineOperand CopySource = MI->getOperand(CopySourceIndex);
+ return checkIsSourceSGPR(CopySource, MRI, TRI, MaxDepth - 1);
+ }
+ default:
+ return Register();
+ }
+}
+
Register SIInstrInfo::readlaneVGPRToSGPR(
Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
const TargetRegisterClass *DstRC /*=nullptr*/) const {
@@ -6410,12 +6471,43 @@ Register SIInstrInfo::readlaneVGPRToSGPR(
}
SmallVector<Register, 8> SRegs;
- for (unsigned i = 0; i < SubRegs; ++i) {
- Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
- get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
- .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
- SRegs.push_back(SGPR);
+ const MachineInstr *CopySourceInstr = MRI.getVRegDef(SrcReg);
+
+ if (CopySourceInstr->getOpcode() != AMDGPU::REG_SEQUENCE) {
+ for (unsigned i = 0; i < SubRegs; ++i) {
+ Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
+ get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
+ .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
+ SRegs.push_back(SGPR);
+ }
+ } else {
+ SRegs.resize(SubRegs);
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ for (unsigned i = 0; i < SubRegs; ++i) {
+ unsigned SubregOperandIndex = 2 + 2 * i;
+ assert(SubregOperandIndex < CopySourceInstr->getNumOperands());
+ assert(CopySourceInstr->isOperandSubregIdx(SubregOperandIndex));
+
+ MachineOperand RegSeqSrcOperand =
+ CopySourceInstr->getOperand(SubregOperandIndex - 1);
+ Register SGPRSource = checkIsSourceSGPR(RegSeqSrcOperand, MRI, TRI);
+
+ unsigned SubRegIndex =
+ CopySourceInstr->getOperand(SubregOperandIndex).getImm();
+ unsigned SubRegChannel = RI.getChannelFromSubReg(SubRegIndex);
+
+ if (SGPRSource) {
+ SRegs[SubRegChannel] = SGPRSource;
+ } else {
+ Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
+ get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
+ .addReg(SrcReg, 0, SubRegIndex);
+ SRegs[SubRegChannel] = SGPR;
+ }
+ }
}
MachineInstrBuilder MIB =
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e042b59eb0f04..3e2c681805a3f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -186,6 +186,10 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
bool resultDependsOnExec(const MachineInstr &MI) const;
+ Register checkIsSourceSGPR(const MachineOperand &MO,
+ const MachineRegisterInfo &MRI,
+ const SIRegisterInfo *TRI, int MaxDepth = 6) const;
+
protected:
/// If the specific machine instruction is a instruction that moves/copies
/// value from one register to another register return destination and source
diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
index 52ccfe8ba3bfb..384e1c5a4cc50 100644
--- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
@@ -300,8 +300,8 @@ define amdgpu_vs float @load_addr_no_fold(ptr addrspace(6) inreg noalias %p0) #0
}
; GCN-LABEL: {{^}}vgpr_arg_src:
-; GCN: v_readfirstlane_b32 s[[READLANE:[0-9]+]], v0
; GCN: s_mov_b32 s[[ZERO:[0-9]+]]
+; GCN: v_readfirstlane_b32 s[[READLANE:[0-9]+]], v0
; GCN: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[[[READLANE]]:[[ZERO]]]
define amdgpu_vs float @vgpr_arg_src(ptr addrspace(6) %arg) {
main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-away-v_readfirstlane_b32-on-sgpr-source.mir b/llvm/test/CodeGen/AMDGPU/optimize-away-v_readfirstlane_b32-on-sgpr-source.mir
new file mode 100644
index 0000000000000..4eff26fccddd1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/optimize-away-v_readfirstlane_b32-on-sgpr-source.mir
@@ -0,0 +1,271 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s
+---
+name: v_readfirstlane_b32_omitted
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: v_readfirstlane_b32_omitted
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[DEF]], %subreg.sub1
+ ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:vgpr_32 = COPY $vgpr0
+ %1:sreg_32 = IMPLICIT_DEF
+ %2:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1
+ %3:sreg_64 = S_QUADMASK_B64 %2
+ S_ENDPGM 0
+
+...
+
+---
+name: v_readfirstlane_b32_omitted_switched_subregs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: v_readfirstlane_b32_omitted_switched_subregs
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub1, killed [[COPY1]], %subreg.sub0
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[DEF]], %subreg.sub0, [[V_READFIRSTLANE_B32_]], %subreg.sub1
+ ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 killed [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:vgpr_32 = COPY $vgpr0
+ %1:sreg_32 = IMPLICIT_DEF
+ %2:sreg_64 = REG_SEQUENCE %0, %subreg.sub1, killed %1, %subreg.sub0
+ %3:sreg_64 = S_QUADMASK_B64 killed %2
+ S_ENDPGM 0
+
+...
+
+---
+name: v_readfirstlane_b32_phys_vgpr_and_sgpr
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr0
+
+ ; CHECK-LABEL: name: v_readfirstlane_b32_phys_vgpr_and_sgpr
+ ; CHECK: liveins: $vgpr0, $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]], implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:vgpr_32 = COPY $vgpr0
+ %1:sreg_32 = COPY $sgpr0
+ %2:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1
+ %3:sreg_64 = S_QUADMASK_B64 %2
+ S_ENDPGM 0
+
+...
+
+---
+name: v_readfirstlane_b32_both_vgpr
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: v_readfirstlane_b32_both_vgpr
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+ ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+ %2:sreg_32 = IMPLICIT_DEF
+ %3:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1
+ %4:sreg_64 = S_QUADMASK_B64 %3
+ S_ENDPGM 0
+
+...
+
+---
+name: v_readfirstlane_b32_both_sgpr
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; CHECK-LABEL: name: v_readfirstlane_b32_both_sgpr
+ ; CHECK: liveins: $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
+ ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 [[REG_SEQUENCE]]
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sreg_32 = IMPLICIT_DEF
+ %3:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1
+ %4:sreg_64 = S_QUADMASK_B64 %3
+ S_ENDPGM 0
+
+...
+
+---
+name: v_readfirstlane_b32_recursive_look_through_copies_and_reg_sequence_for_vgpr
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: v_readfirstlane_b32_recursive_look_through_copies_and_reg_sequence_for_vgpr
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub0, [[COPY1]], %subreg.sub2, [[COPY1]], %subreg.sub3
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[REG_SEQUENCE]].sub3, %subreg.sub1, [[REG_SEQUENCE]].sub1, %subreg.sub0
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE1]].sub1, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[DEF]], %subreg.sub0, [[V_READFIRSTLANE_B32_]], %subreg.sub1
+ ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 killed [[REG_SEQUENCE2]]
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:vgpr_32 = COPY $vgpr0
+ %1:sreg_32 = IMPLICIT_DEF
+ %2:vgpr_32 = COPY %0
+ %3:sgpr_128 = REG_SEQUENCE killed %1, %subreg.sub1, killed %2, %subreg.sub0, killed %2, %subreg.sub2, killed %2, %subreg.sub3
+ %4:sreg_64 = REG_SEQUENCE killed %3.sub3, %subreg.sub1, %3.sub1, %subreg.sub0
+ %5:sreg_64 = S_QUADMASK_B64 killed %4
+ S_ENDPGM 0
+
+...
+
+---
+name: v_readfirstlane_b32_recursive_look_through_copies_and_reg_sequence_for_sgpr
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_readfirstlane_b32_recursive_look_through_copies_and_reg_sequence_for_sgpr
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY [[COPY]]
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[DEF]], %subreg.sub1, killed [[COPY1]], %subreg.sub0, killed [[COPY1]], %subreg.sub2, killed [[COPY1]], %subreg.sub3
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[REG_SEQUENCE]].sub3, %subreg.sub1, [[REG_SEQUENCE]].sub1, %subreg.sub0
+ ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 killed [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sreg_32 = IMPLICIT_DEF
+ %2:sgpr_32 = COPY %0
+ %3:sgpr_128 = REG_SEQUENCE killed %1, %subreg.sub1, killed %2, %subreg.sub0, killed %2, %subreg.sub2, killed %2, %subreg.sub3
+ %4:sreg_64 = REG_SEQUENCE killed %3.sub3, %subreg.sub1, %3.sub1, %subreg.sub0
+ %5:sreg_64 = S_QUADMASK_B64 killed %4
+ S_ENDPGM 0
+
+...
+
+---
+name: v_readfirstlane_b32_undef_subreg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: v_readfirstlane_b32_undef_subreg
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY3]], %subreg.sub1, [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub2
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[REG_SEQUENCE]].sub3, %subreg.sub1, [[REG_SEQUENCE]].sub1, %subreg.sub0
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE1]].sub1, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[DEF]], %subreg.sub0, [[V_READFIRSTLANE_B32_]], %subreg.sub1
+ ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 killed [[REG_SEQUENCE2]]
+ ; CHECK-NEXT: S_ENDPGM 0
+ %2:vgpr_32 = COPY $vgpr2
+ %1:vgpr_32 = COPY $vgpr1
+ %0:vgpr_32 = COPY $vgpr0
+ %3:sreg_32 = IMPLICIT_DEF
+ %4:sreg_32 = COPY %1
+ %5:sgpr_128 = REG_SEQUENCE killed %3, %subreg.sub1, killed %1, %subreg.sub0, killed %2, %subreg.sub2
+ %6:sreg_64 = REG_SEQUENCE killed %5.sub3, %subreg.sub1, %5.sub1, %subreg.sub0
+ %7:sreg_64 = S_QUADMASK_B64 killed %6
+ S_ENDPGM 0
+
+...
+
+---
+name: v_readfirstlane_b32_depth_limit
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: v_readfirstlane_b32_depth_limit
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+ ; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:vgpr_32 = COPY $vgpr0
+ %1:sreg_32 = IMPLICIT_DEF
+ %2:vgpr_32 = COPY %1
+ %3:vgpr_32 = COPY %2
+ %4:vgpr_32 = COPY %3
+ %5:vgpr_32 = COPY %4
+ %6:vgpr_32 = COPY %5
+ %7:vgpr_32 = COPY %6
+ %8:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %7, %subreg.sub1
+ %9:sreg_64 = S_QUADMASK_B64 %8
+ S_ENDPGM 0
>From b304bc99eb4f9637860a80ba7997be76e4cdbce6 Mon Sep 17 00:00:00 2001
From: Josh Hutton <joshhuttonemail at gmail.com>
Date: Mon, 28 Jul 2025 16:24:23 -0700
Subject: [PATCH 2/2] Update checkIsSourceSGPR to be iterative, address
comments
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 104 ++++++++++--------
...way-v_readfirstlane_b32-on-sgpr-source.mir | 8 +-
2 files changed, 62 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 72d2d7300ec6d..98a511a5d84a2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6382,65 +6382,74 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
legalizeOpWithMove(MI, VOP3Idx[2]);
}
-// Recursively check to see if the ultimate source of a readfirstlane is SGPR.
-// If it is, readfirstlane can be omitted, and the source of the value can be
-// used directly.
+// Check to see if the ultimate source of a readfirstlane is SGPR. If it is,
+// readfirstlane can be omitted, and the source of the value can be used
+// directly.
Register SIInstrInfo::checkIsSourceSGPR(const MachineOperand &MO,
const MachineRegisterInfo &MRI,
const SIRegisterInfo *TRI,
int MaxDepth) const {
- if (MaxDepth == 0)
- return Register();
+ SmallVector<MachineOperand, 1> Worklist;
+ Worklist.push_back(MO);
- Register PotentialSGPR = MO.getReg();
+ while (!Worklist.empty() && MaxDepth > 0) {
+ MachineOperand MOperand = Worklist.pop_back_val();
- // While we could return a physical SGPR source, we would need to guarantee it
- // has not been redefined.
- if (PotentialSGPR.isPhysical())
- return Register();
+ Register PotentialSGPR = MOperand.getReg();
+
+ // While we could return a physical SGPR source, we would need to guarantee
+ // it has not been redefined.
+ if (PotentialSGPR.isPhysical())
+ return Register();
- assert(MRI.hasOneDef(PotentialSGPR));
+ assert(MRI.hasOneDef(PotentialSGPR));
- MachineInstr *MI = MRI.getVRegDef(PotentialSGPR);
- auto MIOpc = MI->getOpcode();
- const TargetRegisterClass *RC = MRI.getRegClass(PotentialSGPR);
+ MachineInstr *MI = MRI.getVRegDef(PotentialSGPR);
+ unsigned MIOpc = MI->getOpcode();
+ const TargetRegisterClass *RC = MRI.getRegClass(PotentialSGPR);
- if (RI.hasSGPRs(RC))
- return PotentialSGPR;
+ if (RI.hasSGPRs(RC))
+ return PotentialSGPR;
- switch (MIOpc) {
- case AMDGPU::COPY: {
- MachineOperand CopySource = MI->getOperand(1);
- return checkIsSourceSGPR(CopySource, MRI, TRI, MaxDepth - 1);
- }
- case AMDGPU::REG_SEQUENCE: {
- unsigned SubRegToFind = MO.getSubReg();
- unsigned SubRegOperandIndex = 2;
- unsigned CopySourceIndex = 0;
+ switch (MIOpc) {
+ case AMDGPU::COPY: {
+ MachineOperand CopySource = MI->getOperand(1);
+ Worklist.push_back(CopySource);
+ break;
+ }
+ case AMDGPU::REG_SEQUENCE: {
+ unsigned SubRegToFind = MOperand.getSubReg();
+ unsigned SubRegOperandIndex = 2;
+ unsigned CopySourceIndex = 0;
- // Since subregs may be listed out of order, we need to
- // loop over operands to find the subreg we are looking for.
- while (SubRegOperandIndex < MI->getNumOperands()) {
- assert(MI->isOperandSubregIdx(SubRegOperandIndex));
+ // Since subregs may be listed out of order, we need to
+ // loop over operands to find the subreg we are looking for.
+ while (SubRegOperandIndex < MI->getNumOperands()) {
+ assert(MI->isOperandSubregIdx(SubRegOperandIndex));
- unsigned SubRegIndex = MI->getOperand(SubRegOperandIndex).getImm();
- if (SubRegIndex == SubRegToFind) {
- CopySourceIndex = SubRegOperandIndex - 1;
- break;
+ unsigned SubRegIndex = MI->getOperand(SubRegOperandIndex).getImm();
+ if (SubRegIndex == SubRegToFind) {
+ CopySourceIndex = SubRegOperandIndex - 1;
+ break;
+ }
+
+ SubRegOperandIndex += 2;
}
- SubRegOperandIndex += 2;
- }
+ if (SubRegOperandIndex >= MI->getNumOperands())
+ return Register();
- if (SubRegOperandIndex >= MI->getNumOperands())
+ MachineOperand CopySource = MI->getOperand(CopySourceIndex);
+ Worklist.push_back(CopySource);
+ break;
+ }
+ default:
return Register();
-
- MachineOperand CopySource = MI->getOperand(CopySourceIndex);
- return checkIsSourceSGPR(CopySource, MRI, TRI, MaxDepth - 1);
- }
- default:
- return Register();
+ }
+ --MaxDepth;
}
+
+ return Register();
}
Register SIInstrInfo::readlaneVGPRToSGPR(
@@ -6471,6 +6480,8 @@ Register SIInstrInfo::readlaneVGPRToSGPR(
}
SmallVector<Register, 8> SRegs;
+ SmallVector<unsigned, 8> SRegIndices;
+
const MachineInstr *CopySourceInstr = MRI.getVRegDef(SrcReg);
if (CopySourceInstr->getOpcode() != AMDGPU::REG_SEQUENCE) {
@@ -6480,9 +6491,9 @@ Register SIInstrInfo::readlaneVGPRToSGPR(
get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
.addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
SRegs.push_back(SGPR);
+ SRegIndices.push_back(i);
}
} else {
- SRegs.resize(SubRegs);
const SIRegisterInfo *TRI = ST.getRegisterInfo();
for (unsigned i = 0; i < SubRegs; ++i) {
@@ -6499,14 +6510,15 @@ Register SIInstrInfo::readlaneVGPRToSGPR(
unsigned SubRegChannel = RI.getChannelFromSubReg(SubRegIndex);
if (SGPRSource) {
- SRegs[SubRegChannel] = SGPRSource;
+ SRegs.push_back(SGPRSource);
} else {
Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
.addReg(SrcReg, 0, SubRegIndex);
- SRegs[SubRegChannel] = SGPR;
+ SRegs.push_back(SGPR);
}
+ SRegIndices.push_back(SubRegChannel);
}
}
@@ -6515,7 +6527,7 @@ Register SIInstrInfo::readlaneVGPRToSGPR(
get(AMDGPU::REG_SEQUENCE), DstReg);
for (unsigned i = 0; i < SubRegs; ++i) {
MIB.addReg(SRegs[i]);
- MIB.addImm(RI.getSubRegFromChannel(i));
+ MIB.addImm(RI.getSubRegFromChannel(SRegIndices[i]));
}
return DstReg;
}
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-away-v_readfirstlane_b32-on-sgpr-source.mir b/llvm/test/CodeGen/AMDGPU/optimize-away-v_readfirstlane_b32-on-sgpr-source.mir
index 4eff26fccddd1..fa2c1d7763262 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-away-v_readfirstlane_b32-on-sgpr-source.mir
+++ b/llvm/test/CodeGen/AMDGPU/optimize-away-v_readfirstlane_b32-on-sgpr-source.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s
---
name: v_readfirstlane_b32_omitted
tracksRegLiveness: true
@@ -43,7 +43,7 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub1, killed [[COPY1]], %subreg.sub0
; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[DEF]], %subreg.sub0, [[V_READFIRSTLANE_B32_]], %subreg.sub1
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub1, [[DEF]], %subreg.sub0
; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 killed [[REG_SEQUENCE1]]
; CHECK-NEXT: S_ENDPGM 0
%0:vgpr_32 = COPY $vgpr0
@@ -156,7 +156,7 @@ body: |
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub0, [[COPY1]], %subreg.sub2, [[COPY1]], %subreg.sub3
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[REG_SEQUENCE]].sub3, %subreg.sub1, [[REG_SEQUENCE]].sub1, %subreg.sub0
; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE1]].sub1, implicit $exec
- ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[DEF]], %subreg.sub0, [[V_READFIRSTLANE_B32_]], %subreg.sub1
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub1, [[DEF]], %subreg.sub0
; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 killed [[REG_SEQUENCE2]]
; CHECK-NEXT: S_ENDPGM 0
%0:vgpr_32 = COPY $vgpr0
@@ -217,7 +217,7 @@ body: |
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY3]], %subreg.sub1, [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub2
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[REG_SEQUENCE]].sub3, %subreg.sub1, [[REG_SEQUENCE]].sub1, %subreg.sub0
; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE1]].sub1, implicit $exec
- ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[DEF]], %subreg.sub0, [[V_READFIRSTLANE_B32_]], %subreg.sub1
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub1, [[DEF]], %subreg.sub0
; CHECK-NEXT: [[S_QUADMASK_B64_:%[0-9]+]]:sreg_64 = S_QUADMASK_B64 killed [[REG_SEQUENCE2]]
; CHECK-NEXT: S_ENDPGM 0
%2:vgpr_32 = COPY $vgpr2
More information about the llvm-commits
mailing list