[llvm] [AMDGPU] SIPeepholeSDWA: Add REG_SEQUENCE support (PR #133087)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 28 08:06:39 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Frederik Harwath (frederik-h)
<details>
<summary>Changes</summary>
The REG_SEQUENCE instruction represents a copy of several registers to the subregisters of a target register. The "si-peephole-sdwa" pass currently cannot handle such instructions as operands of machine instructions that are considered for the conversion to SDWA.
This commit extends the SDWASrcOperand implementation to allow it to treat the use of a subregister of a REG_SEQUENCE in an SDWASrcOperand as if the source register of the copy had been used directly.
Fixes #<!-- -->130102
---
Full diff: https://github.com/llvm/llvm-project/pull/133087.diff
2 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp (+143-16)
- (added) llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir (+55)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 22f23e4c94e2d..d46d0995b8fa8 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -26,6 +26,9 @@
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
#include <optional>
using namespace llvm;
@@ -278,11 +281,16 @@ static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
}
}
+static bool isSameReg(const MachineOperand &Op, Register Reg) {
+ return Op.isReg() && Op.getReg() == Reg;
+}
+
+static bool isSameReg(const MachineOperand &Op, Register Reg, unsigned SubReg) {
+ return isSameReg(Op, Reg) && Op.getSubReg() == SubReg;
+}
+
static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
- return LHS.isReg() &&
- RHS.isReg() &&
- LHS.getReg() == RHS.getReg() &&
- LHS.getSubReg() == RHS.getSubReg();
+ return RHS.isReg() && isSameReg(LHS, RHS.getReg(), RHS.getSubReg());
}
static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
@@ -382,6 +390,97 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
return Mods;
}
+// The following functions are helpers for dealing with REG_SEQUENCE
+// instructions. Those instructions are used to represent copies to
+// subregisters in SSA form.
+//
+// This pass should be able to peak through REG_SEQUENCE
+// instructions. An access to a subregister of a register defined
+// by a REG_SEQUENCE should be handled as if the register
+// that is being copied to the subregister was accessed.
+// Consider the following example:
+// %1:vgpr_32 = V_ADD_U32_e64 %0, 10, 0
+// %2:vgpr_32 = V_ADD_U32_e64 %0, 20, 0
+// %3:sreg_32 = S_MOV_B32 255
+// %4:vgpr_32 = V_AND_B32_e64 %2, %3
+// %5:vgpr_32, %6:sreg_64_xexec = V_ADD_CO_U32_e64 %1, %4, 0
+//
+// The V_ADD_CO_U32_e64 instructions will be combined with the
+// V_AND_B32_e64 into an SDWA instruction.
+//
+// If one or more of the operands of V_ADD_CO_U32_e64 are accessed
+// through the subregisters of a REG_SEQUENCE as in the following
+// variation of the previous example, the optimization should still be
+// able to proceed in the same way:
+//
+// [...]
+// %4:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %3, %subreg.sub1
+// %5:sreg_32 = S_MOV_B32 255
+// %6:vgpr_32 = V_AND_B32_e64 %2, %5
+// %7:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %3, %subreg.sub1
+// %8:vgpr_32, %9:sreg_64_xexec = V_ADD_CO_U32_e64 %4.sub0, %7.sub0, 0
+//
+// To this end, the SDWASrcOperand implementation uses the following
+// functions to find out the register that is used as the source of
+// the subregister value and it uses this register directly instead of
+// the REG_SEQUENCE subregister.
+
+/// Return the subregister of the REG_SEQUENCE \p RegSeq
+/// which is copied from \p Op, i.e. the operand following
+/// \p Op in the operands of \p RegSeq, or nullopt if the
+/// the \p Op is not an operand of \p RegSeq.
+static std::optional<unsigned> regSequenceFindSubreg(const MachineInstr &RegSeq,
+ Register Reg) {
+ if (!RegSeq.isRegSequence())
+ return {};
+
+ auto *End = RegSeq.operands_end();
+ // Operand pair at indices (i+1, i+2) is (register, subregister)
+ for (auto *It = RegSeq.operands_begin() + 1; It != End; It += 2) {
+ if (isSameReg(*It, Reg))
+ return (It + 1)->getImm();
+ }
+
+ return {};
+}
+
+/// Return the single use of \p RegSeq which accesses the subregister
+/// that copies from \p Reg. Returns nullptr if \p Reg is not used by
+/// exactly one operand of \p RegSeq.
+static MachineInstr *regSequenceFindSingleSubregUse(MachineInstr &RegSeq,
+ Register Reg,
+ MachineRegisterInfo *MRI) {
+ Register SeqReg = RegSeq.getOperand(0).getReg();
+ unsigned SubReg = *regSequenceFindSubreg(RegSeq, Reg);
+
+ MachineInstr *SingleUse = nullptr;
+ for (MachineInstr &UseMI : MRI->use_nodbg_instructions(SeqReg))
+ for (auto &Op : UseMI.operands())
+ if (Op.isReg() && Op.getReg() == SeqReg && Op.getSubReg() == SubReg) {
+ if (SingleUse)
+ return nullptr;
+ SingleUse = &UseMI;
+ }
+
+ return SingleUse;
+}
+
+/// If \p MI uses operand \p Reg and \p is defined by a copy-like
+/// instruction (currently, only REG_SEQUENCE is supported), this
+/// returns the instruction which defines the source register of the
+/// copy.
+static MachineInstr *findUseSrc(MachineInstr &MI, MachineOperand &Reg,
+ MachineRegisterInfo *MRI) {
+ assert(Reg.isReg());
+
+ // TODO Handle other copy-like ops?
+ if (!MI.isRegSequence())
+ return &MI;
+
+ MachineInstr *Use = regSequenceFindSingleSubregUse(MI, Reg.getReg(), MRI);
+ return Use;
+}
+
MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
const GCNSubtarget &ST,
SDWAOperandsMap *PotentialMatches) {
@@ -391,12 +490,14 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
if (!Reg->isReg() || !Reg->isDef())
return nullptr;
- for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
- // Check that all instructions that use Reg can be converted
- if (!isConvertibleToSDWA(UseMI, ST, TII) ||
- !canCombineSelections(UseMI, TII))
+ // Check that all instructions that use Reg can be converted
+ for (MachineInstr &UseMI :
+ getMRI()->use_nodbg_instructions(Reg->getReg())) {
+ MachineInstr *SrcMI = findUseSrc(UseMI, *Reg, getMRI());
+ if (!SrcMI || !isConvertibleToSDWA(*SrcMI, ST, TII) ||
+ !canCombineSelections(*SrcMI, TII))
return nullptr;
-
+ }
// Now that it's guaranteed all uses are legal, iterate over the uses again
// to add them for later conversion.
for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
@@ -404,8 +505,8 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
assert(isSameReg(UseMO, *Reg));
SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
- MachineInstr *UseMI = UseMO.getParent();
- potentialMatchesMap[UseMI].push_back(this);
+ MachineInstr *UseSrcMI = findUseSrc(*UseMO.getParent(), *Reg, getMRI());
+ potentialMatchesMap[UseSrcMI].push_back(this);
}
return nullptr;
}
@@ -417,10 +518,36 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
return nullptr;
MachineInstr *Parent = PotentialMO->getParent();
+ if (Parent->isRegSequence()) {
+ Parent = regSequenceFindSingleSubregUse(
+ *Parent, getReplacedOperand()->getReg(), getMRI());
+ return Parent && canCombineSelections(*Parent, TII) ? Parent : nullptr;
+ }
return canCombineSelections(*Parent, TII) ? Parent : nullptr;
}
+/// Returns true if \p RHS is either the same register as LHS or the
+/// defining instruction of \p LHS is a REG_SEQUENCE in which \p
+/// RHS occurs as the operand for the register that corresponds to the
+/// subregister of LHS.
+static bool isSameRegOrCopy(const MachineOperand &LHS,
+ const MachineOperand &RHS,
+ const MachineRegisterInfo *MRI) {
+ if (isSameReg(LHS, RHS))
+ return true;
+
+ const MachineOperand *Def = findSingleRegDef(&LHS, MRI);
+ const MachineInstr *MI = Def ? Def->getParent() : nullptr;
+
+ // TODO Handle other copy-like instructions?
+ if (!MI || !MI->isRegSequence())
+ return false;
+
+ auto SubReg = regSequenceFindSubreg(*MI, RHS.getReg());
+ return SubReg && LHS.getSubReg() == SubReg;
+}
+
bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
switch (MI.getOpcode()) {
case AMDGPU::V_CVT_F32_FP8_sdwa:
@@ -439,14 +566,13 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
MachineOperand *SrcMods =
TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
assert(Src && (Src->isReg() || Src->isImm()));
- if (!isSameReg(*Src, *getReplacedOperand())) {
+ if (!isSameRegOrCopy(*Src, *getReplacedOperand(), getMRI())) {
// If this is not src0 then it could be src1
Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
- if (!Src ||
- !isSameReg(*Src, *getReplacedOperand())) {
+ if (!Src || !isSameRegOrCopy(*Src, *getReplacedOperand(), getMRI())) {
// It's possible this Src is a tied operand for
// UNUSED_PRESERVE, in which case we can either
// abandon the peephole attempt, or if legal we can
@@ -486,13 +612,14 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
- !isSameReg(*Src, *getReplacedOperand())) {
+ !isSameRegOrCopy(*Src, *getReplacedOperand(), getMRI())) {
// In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
// src2. This is not allowed.
return false;
}
- assert(isSameReg(*Src, *getReplacedOperand()) &&
+ MachineOperand &ReplacedOp = *getReplacedOperand();
+ assert(isSameRegOrCopy(*Src, ReplacedOp, getMRI()) &&
(IsPreserveSrc || (SrcSel && SrcMods)));
}
copyRegOperand(*Src, *getTargetOperand());
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir
new file mode 100644
index 0000000000000..604e4188525f1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir
@@ -0,0 +1,55 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=si-peephole-sdwa -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=si-peephole-sdwa -o - %s | FileCheck -check-prefix=ASM %s
+---
+name: sdwa_reg_sequence
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; ASM-LABEL: ; %bb.0:
+ ; ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+ ; ASM-NEXT: v_add_u32_e32 v1, 10, v0
+ ; ASM-NEXT: v_add_u32_e32 v0, 20, v0
+ ; ASM-NEXT: v_add_co_u32_sdwa v0, vcc, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+ ; ASM-NEXT: v_addc_co_u32_e64 v1, s[0:1], 0, 0, vcc
+ ; ASM-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
+ ; ASM-NEXT: s_endpgm
+
+ ; CHECK-LABEL: name: sdwa_reg_sequence
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], 10, 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], 20, 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_U32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 255
+ ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_ADD_U32_e64_1]], killed [[S_MOV_B32_]], implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+ ; CHECK-NEXT: [[V_ADD_CO_U32_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_sdwa 0, [[REG_SEQUENCE]].sub0, 0, [[V_ADD_U32_e64_1]], 0, 6, 0, 6, 0, implicit-def $vcc, implicit $exec
+ ; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 0, 0, $vcc, 0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_sdwa]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY1]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: .1.entry:
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = V_ADD_U32_e64 %0, 10, 0, implicit $exec
+ %2:vgpr_32 = V_ADD_U32_e64 %0, 20, 0, implicit $exec
+ %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %4:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %3, %subreg.sub1
+ %5:sreg_32 = S_MOV_B32 255
+ %6:vgpr_32 = V_AND_B32_e64 killed %2, killed %5, implicit $exec
+ %7:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %3, %subreg.sub1
+ %8:vgpr_32, %9:sreg_64_xexec = V_ADD_CO_U32_e64 %4.sub0, %7.sub0, 0, implicit $exec
+ %10:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 0, 0, killed %9, 0, implicit $exec
+ %12:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %10, %subreg.sub1
+ %13:sreg_64 = IMPLICIT_DEF
+ %14:vreg_64 = COPY %13
+ GLOBAL_STORE_DWORDX2 killed %14, killed %12, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ S_ENDPGM 0
+...
``````````
</details>
https://github.com/llvm/llvm-project/pull/133087
More information about the llvm-commits
mailing list