[llvm] [AMDGPU] SIPeepholeSDWA: Add REG_SEQUENCE support (PR #133087)
Frederik Harwath via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 11 09:07:29 PDT 2025
https://github.com/frederik-h updated https://github.com/llvm/llvm-project/pull/133087
>From 243157eb5f2437c6c8e6f9cecbbdc173a2e8c363 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Tue, 25 Mar 2025 10:48:33 -0400
Subject: [PATCH 1/9] [AMDGPU] SIPeepholeSDWA: Add REG_SEQUENCE support
The REG_SEQUENCE instruction represents a copy of several
registers to the subregisters of a target register.
The si-peephole-sdwa pass currently cannot handle
such instructions as operands of machine instructions
that are considered for the conversion to SDWA.
This commit extends the SDWASrcOperand implementation
to allow it to treat the use of a subregister of a
REG_SEQUENCE in an SDWASrcOperand as if the source
register of the copy had been used directly.
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 159 ++++++++++++++++--
.../AMDGPU/sdwa-peephole-reg-sequence.mir | 55 ++++++
2 files changed, 198 insertions(+), 16 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 22f23e4c94e2d..d46d0995b8fa8 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -26,6 +26,9 @@
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
#include <optional>
using namespace llvm;
@@ -278,11 +281,16 @@ static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
}
}
+static bool isSameReg(const MachineOperand &Op, Register Reg) {
+ return Op.isReg() && Op.getReg() == Reg;
+}
+
+static bool isSameReg(const MachineOperand &Op, Register Reg, unsigned SubReg) {
+ return isSameReg(Op, Reg) && Op.getSubReg() == SubReg;
+}
+
static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
- return LHS.isReg() &&
- RHS.isReg() &&
- LHS.getReg() == RHS.getReg() &&
- LHS.getSubReg() == RHS.getSubReg();
+ return RHS.isReg() && isSameReg(LHS, RHS.getReg(), RHS.getSubReg());
}
static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
@@ -382,6 +390,97 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
return Mods;
}
+// The following functions are helpers for dealing with REG_SEQUENCE
+// instructions. Those instructions are used to represent copies to
+// subregisters in SSA form.
+//
+// This pass should be able to peek through REG_SEQUENCE
+// instructions. An access to a subregister of a register defined
+// by a REG_SEQUENCE should be handled as if the register
+// that is being copied to the subregister was accessed.
+// Consider the following example:
+// %1:vgpr_32 = V_ADD_U32_e64 %0, 10, 0
+// %2:vgpr_32 = V_ADD_U32_e64 %0, 20, 0
+// %3:sreg_32 = S_MOV_B32 255
+// %4:vgpr_32 = V_AND_B32_e64 %2, %3
+// %5:vgpr_32, %6:sreg_64_xexec = V_ADD_CO_U32_e64 %1, %4, 0
+//
+// The V_ADD_CO_U32_e64 instruction will be combined with the
+// V_AND_B32_e64 into an SDWA instruction.
+//
+// If one or more of the operands of V_ADD_CO_U32_e64 are accessed
+// through the subregisters of a REG_SEQUENCE as in the following
+// variation of the previous example, the optimization should still be
+// able to proceed in the same way:
+//
+// [...]
+// %4:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %3, %subreg.sub1
+// %5:sreg_32 = S_MOV_B32 255
+// %6:vgpr_32 = V_AND_B32_e64 %2, %5
+// %7:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %3, %subreg.sub1
+// %8:vgpr_32, %9:sreg_64_xexec = V_ADD_CO_U32_e64 %4.sub0, %7.sub0, 0
+//
+// To this end, the SDWASrcOperand implementation uses the following
+// functions to find out the register that is used as the source of
+// the subregister value and it uses this register directly instead of
+// the REG_SEQUENCE subregister.
+
+/// Return the subregister of the REG_SEQUENCE \p RegSeq
+/// which is copied from \p Reg, i.e. the operand following
+/// \p Reg in the operands of \p RegSeq, or nullopt if
+/// \p Reg is not an operand of \p RegSeq.
+static std::optional<unsigned> regSequenceFindSubreg(const MachineInstr &RegSeq,
+ Register Reg) {
+ if (!RegSeq.isRegSequence())
+ return {};
+
+ auto *End = RegSeq.operands_end();
+ // Operand pair at indices (i+1, i+2) is (register, subregister)
+ for (auto *It = RegSeq.operands_begin() + 1; It != End; It += 2) {
+ if (isSameReg(*It, Reg))
+ return (It + 1)->getImm();
+ }
+
+ return {};
+}
+
+/// Return the single use of \p RegSeq which accesses the subregister
+/// that copies from \p Reg. Returns nullptr if \p Reg is not used by
+/// exactly one operand of \p RegSeq.
+static MachineInstr *regSequenceFindSingleSubregUse(MachineInstr &RegSeq,
+ Register Reg,
+ MachineRegisterInfo *MRI) {
+ Register SeqReg = RegSeq.getOperand(0).getReg();
+ unsigned SubReg = *regSequenceFindSubreg(RegSeq, Reg);
+
+ MachineInstr *SingleUse = nullptr;
+ for (MachineInstr &UseMI : MRI->use_nodbg_instructions(SeqReg))
+ for (auto &Op : UseMI.operands())
+ if (Op.isReg() && Op.getReg() == SeqReg && Op.getSubReg() == SubReg) {
+ if (SingleUse)
+ return nullptr;
+ SingleUse = &UseMI;
+ }
+
+ return SingleUse;
+}
+
+/// If \p MI uses operand \p Reg and \p is defined by a copy-like
+/// instruction (currently, only REG_SEQUENCE is supported), this
+/// returns the instruction which defines the source register of the
+/// copy.
+static MachineInstr *findUseSrc(MachineInstr &MI, MachineOperand &Reg,
+ MachineRegisterInfo *MRI) {
+ assert(Reg.isReg());
+
+ // TODO Handle other copy-like ops?
+ if (!MI.isRegSequence())
+ return &MI;
+
+ MachineInstr *Use = regSequenceFindSingleSubregUse(MI, Reg.getReg(), MRI);
+ return Use;
+}
+
MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
const GCNSubtarget &ST,
SDWAOperandsMap *PotentialMatches) {
@@ -391,12 +490,14 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
if (!Reg->isReg() || !Reg->isDef())
return nullptr;
- for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
- // Check that all instructions that use Reg can be converted
- if (!isConvertibleToSDWA(UseMI, ST, TII) ||
- !canCombineSelections(UseMI, TII))
+ // Check that all instructions that use Reg can be converted
+ for (MachineInstr &UseMI :
+ getMRI()->use_nodbg_instructions(Reg->getReg())) {
+ MachineInstr *SrcMI = findUseSrc(UseMI, *Reg, getMRI());
+ if (!SrcMI || !isConvertibleToSDWA(*SrcMI, ST, TII) ||
+ !canCombineSelections(*SrcMI, TII))
return nullptr;
-
+ }
// Now that it's guaranteed all uses are legal, iterate over the uses again
// to add them for later conversion.
for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
@@ -404,8 +505,8 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
assert(isSameReg(UseMO, *Reg));
SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
- MachineInstr *UseMI = UseMO.getParent();
- potentialMatchesMap[UseMI].push_back(this);
+ MachineInstr *UseSrcMI = findUseSrc(*UseMO.getParent(), *Reg, getMRI());
+ potentialMatchesMap[UseSrcMI].push_back(this);
}
return nullptr;
}
@@ -417,10 +518,36 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
return nullptr;
MachineInstr *Parent = PotentialMO->getParent();
+ if (Parent->isRegSequence()) {
+ Parent = regSequenceFindSingleSubregUse(
+ *Parent, getReplacedOperand()->getReg(), getMRI());
+ return Parent && canCombineSelections(*Parent, TII) ? Parent : nullptr;
+ }
return canCombineSelections(*Parent, TII) ? Parent : nullptr;
}
+/// Returns true if either \p RHS is the same register as \p LHS or
+/// the defining instruction of \p LHS is a REG_SEQUENCE in which \p
+/// RHS occurs as the operand for the register that corresponds to the
+/// subregister of \p LHS.
+static bool isSameRegOrCopy(const MachineOperand &LHS,
+ const MachineOperand &RHS,
+ const MachineRegisterInfo *MRI) {
+ if (isSameReg(LHS, RHS))
+ return true;
+
+ const MachineOperand *Def = findSingleRegDef(&LHS, MRI);
+ const MachineInstr *MI = Def ? Def->getParent() : nullptr;
+
+ // TODO Handle other copy-like instructions?
+ if (!MI || !MI->isRegSequence())
+ return false;
+
+ auto SubReg = regSequenceFindSubreg(*MI, RHS.getReg());
+ return SubReg && LHS.getSubReg() == SubReg;
+}
+
bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
switch (MI.getOpcode()) {
case AMDGPU::V_CVT_F32_FP8_sdwa:
@@ -439,14 +566,13 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
MachineOperand *SrcMods =
TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
assert(Src && (Src->isReg() || Src->isImm()));
- if (!isSameReg(*Src, *getReplacedOperand())) {
+ if (!isSameRegOrCopy(*Src, *getReplacedOperand(), getMRI())) {
// If this is not src0 then it could be src1
Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
- if (!Src ||
- !isSameReg(*Src, *getReplacedOperand())) {
+ if (!Src || !isSameRegOrCopy(*Src, *getReplacedOperand(), getMRI())) {
// It's possible this Src is a tied operand for
// UNUSED_PRESERVE, in which case we can either
// abandon the peephole attempt, or if legal we can
@@ -486,13 +612,14 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
- !isSameReg(*Src, *getReplacedOperand())) {
+ !isSameRegOrCopy(*Src, *getReplacedOperand(), getMRI())) {
// In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
// src2. This is not allowed.
return false;
}
- assert(isSameReg(*Src, *getReplacedOperand()) &&
+ MachineOperand &ReplacedOp = *getReplacedOperand();
+ assert(isSameRegOrCopy(*Src, ReplacedOp, getMRI()) &&
(IsPreserveSrc || (SrcSel && SrcMods)));
}
copyRegOperand(*Src, *getTargetOperand());
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir
new file mode 100644
index 0000000000000..604e4188525f1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir
@@ -0,0 +1,55 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=si-peephole-sdwa -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=si-peephole-sdwa -o - %s | FileCheck -check-prefix=ASM %s
+---
+name: sdwa_reg_sequence
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; ASM-LABEL: ; %bb.0:
+ ; ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+ ; ASM-NEXT: v_add_u32_e32 v1, 10, v0
+ ; ASM-NEXT: v_add_u32_e32 v0, 20, v0
+ ; ASM-NEXT: v_add_co_u32_sdwa v0, vcc, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+ ; ASM-NEXT: v_addc_co_u32_e64 v1, s[0:1], 0, 0, vcc
+ ; ASM-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
+ ; ASM-NEXT: s_endpgm
+
+ ; CHECK-LABEL: name: sdwa_reg_sequence
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], 10, 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], 20, 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_U32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 255
+ ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_ADD_U32_e64_1]], killed [[S_MOV_B32_]], implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+ ; CHECK-NEXT: [[V_ADD_CO_U32_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_sdwa 0, [[REG_SEQUENCE]].sub0, 0, [[V_ADD_U32_e64_1]], 0, 6, 0, 6, 0, implicit-def $vcc, implicit $exec
+ ; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 0, 0, $vcc, 0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_sdwa]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY1]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: .1.entry:
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = V_ADD_U32_e64 %0, 10, 0, implicit $exec
+ %2:vgpr_32 = V_ADD_U32_e64 %0, 20, 0, implicit $exec
+ %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %4:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %3, %subreg.sub1
+ %5:sreg_32 = S_MOV_B32 255
+ %6:vgpr_32 = V_AND_B32_e64 killed %2, killed %5, implicit $exec
+ %7:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %3, %subreg.sub1
+ %8:vgpr_32, %9:sreg_64_xexec = V_ADD_CO_U32_e64 %4.sub0, %7.sub0, 0, implicit $exec
+ %10:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 0, 0, killed %9, 0, implicit $exec
+ %12:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %10, %subreg.sub1
+ %13:sreg_64 = IMPLICIT_DEF
+ %14:vreg_64 = COPY %13
+ GLOBAL_STORE_DWORDX2 killed %14, killed %12, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ S_ENDPGM 0
+...
>From d6fbc1abe6a8fb8760aed399becb2064a408ca69 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Tue, 1 Apr 2025 05:17:37 -0400
Subject: [PATCH 2/9] Add MachineRegisterInfo::getOneNonDBGUse{,r}
---
llvm/include/llvm/CodeGen/MachineRegisterInfo.h | 13 ++++++++++---
llvm/lib/CodeGen/MachineRegisterInfo.cpp | 11 +++++++++++
2 files changed, 21 insertions(+), 3 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 1c465741cb462..ee4ab29586bca 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -587,16 +587,23 @@ class MachineRegisterInfo {
/// use of the specified register.
bool hasOneNonDBGUse(Register RegNo) const;
- /// hasOneNonDBGUse - Return true if there is exactly one non-Debug
+ /// hasOneNonDBGUser - Return true if there is exactly one non-Debug
/// instruction using the specified register. Said instruction may have
/// multiple uses.
bool hasOneNonDBGUser(Register RegNo) const;
-
- /// hasAtMostUses - Return true if the given register has at most \p MaxUsers
+ /// hasAtMostUserInstrs - Return true if the given register has at most \p MaxUsers
/// non-debug user instructions.
bool hasAtMostUserInstrs(Register Reg, unsigned MaxUsers) const;
+ /// getOneNonDBGUse - Return the unique non-Debug use of \p RegNo,
+ /// or nullptr if the number of such operands is unequal to one.
+ MachineOperand *getOneNonDBGUse(Register RegNo) const;
+
+ /// getOneNonDBGUser - Return the unique non-Debug instruction using \p RegNo
+ /// or nullptr if the number of such instructions is unequal to one.
+ MachineInstr *getOneNonDBGUser(Register RegNo) const;
+
/// replaceRegWith - Replace all instances of FromReg with ToReg in the
/// machine function. This is like llvm-level X->replaceAllUsesWith(Y),
/// except that it also changes any definitions of the register as well.
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index 937f63f6c5e00..8936358ee67d1 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -438,6 +438,17 @@ bool MachineRegisterInfo::hasAtMostUserInstrs(Register Reg,
MaxUsers);
}
+MachineOperand *MachineRegisterInfo::getOneNonDBGUse(Register RegNo) const {
+ auto RegNoDbgOperands = use_nodbg_operands(RegNo);
+ return hasSingleElement(RegNoDbgOperands) ? &*RegNoDbgOperands.begin()
+ : nullptr;
+}
+
+MachineInstr *MachineRegisterInfo::getOneNonDBGUser(Register RegNo) const {
+ auto RegNoDbgUsers = use_nodbg_instructions(RegNo);
+ return hasSingleElement(RegNoDbgUsers) ? &*RegNoDbgUsers.begin() : nullptr;
+}
+
/// clearKillFlags - Iterate over all the uses of the given register and
/// clear the kill flag from the MachineOperand. This function is used by
/// optimization passes which extend register lifetimes and need only
>From 856edd54d8b13d5e6fb9901e23c14fcc9b3168ce Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Tue, 1 Apr 2025 10:03:06 -0400
Subject: [PATCH 3/9] Review changes
- Revert changes to isSameReg function (not necessary)
- Use new MRI->getOneNonDBGUser instead of checking for single use
manually
- Remove findUseSrc
- Split test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir into two
tests and fix RUN line and checks.
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 78 +++++++------------
...e.mir => sdwa-peephole-reg-sequence-1.mir} | 14 +---
.../AMDGPU/sdwa-peephole-reg-sequence-2.mir | 33 ++++++++
3 files changed, 65 insertions(+), 60 deletions(-)
rename llvm/test/CodeGen/AMDGPU/{sdwa-peephole-reg-sequence.mir => sdwa-peephole-reg-sequence-1.mir} (80%)
create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence-2.mir
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index d46d0995b8fa8..d18a0ff671ab8 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -281,16 +281,9 @@ static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
}
}
-static bool isSameReg(const MachineOperand &Op, Register Reg) {
- return Op.isReg() && Op.getReg() == Reg;
-}
-
-static bool isSameReg(const MachineOperand &Op, Register Reg, unsigned SubReg) {
- return isSameReg(Op, Reg) && Op.getSubReg() == SubReg;
-}
-
static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
- return RHS.isReg() && isSameReg(LHS, RHS.getReg(), RHS.getSubReg());
+ return LHS.isReg() && RHS.isReg() && LHS.getReg() == RHS.getReg() &&
+ LHS.getSubReg() == RHS.getSubReg();
}
static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
@@ -437,48 +430,29 @@ static std::optional<unsigned> regSequenceFindSubreg(const MachineInstr &RegSeq,
auto *End = RegSeq.operands_end();
// Operand pair at indices (i+1, i+2) is (register, subregister)
for (auto *It = RegSeq.operands_begin() + 1; It != End; It += 2) {
- if (isSameReg(*It, Reg))
+ if (It->getReg() == Reg)
return (It + 1)->getImm();
}
return {};
}
-/// Return the single use of \p RegSeq which accesses the subregister
+/// Return the single user of \p RegSeq which accesses the subregister
/// that copies from \p Reg. Returns nullptr if \p Reg is not used by
/// exactly one operand of \p RegSeq.
-static MachineInstr *regSequenceFindSingleSubregUse(MachineInstr &RegSeq,
- Register Reg,
- MachineRegisterInfo *MRI) {
+static MachineInstr *regSequenceFindSingleSubregUser(MachineInstr &RegSeq,
+ Register Reg,
+ MachineRegisterInfo *MRI) {
Register SeqReg = RegSeq.getOperand(0).getReg();
unsigned SubReg = *regSequenceFindSubreg(RegSeq, Reg);
- MachineInstr *SingleUse = nullptr;
- for (MachineInstr &UseMI : MRI->use_nodbg_instructions(SeqReg))
- for (auto &Op : UseMI.operands())
- if (Op.isReg() && Op.getReg() == SeqReg && Op.getSubReg() == SubReg) {
- if (SingleUse)
- return nullptr;
- SingleUse = &UseMI;
- }
-
- return SingleUse;
-}
-
-/// If \p MI uses operand \p Reg and \p is defined by a copy-like
-/// instruction (currently, only REG_SEQUENCE is supported), this
-/// returns the instruction which defines the source register of the
-/// copy.
-static MachineInstr *findUseSrc(MachineInstr &MI, MachineOperand &Reg,
- MachineRegisterInfo *MRI) {
- assert(Reg.isReg());
+ MachineInstr *User = MRI->getOneNonDBGUser(SeqReg);
+ if (User)
+ for (auto &Op : User->operands())
+ if (Op.isReg() && Op.getReg() == SeqReg && Op.getSubReg() == SubReg)
+ return User;
- // TODO Handle other copy-like ops?
- if (!MI.isRegSequence())
- return &MI;
-
- MachineInstr *Use = regSequenceFindSingleSubregUse(MI, Reg.getReg(), MRI);
- return Use;
+ return nullptr;
}
MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
@@ -486,26 +460,34 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
SDWAOperandsMap *PotentialMatches) {
if (PotentialMatches != nullptr) {
// Fill out the map for all uses if all can be converted
- MachineOperand *Reg = getReplacedOperand();
- if (!Reg->isReg() || !Reg->isDef())
+ MachineOperand *Op = getReplacedOperand();
+ if (!Op->isReg() || !Op->isDef())
return nullptr;
+ Register Reg = Op->getReg();
+ MachineRegisterInfo *MRI = getMRI();
// Check that all instructions that use Reg can be converted
- for (MachineInstr &UseMI :
- getMRI()->use_nodbg_instructions(Reg->getReg())) {
- MachineInstr *SrcMI = findUseSrc(UseMI, *Reg, getMRI());
+ for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg)) {
+ MachineInstr *SrcMI =
+ UseMI.isRegSequence()
+ ? regSequenceFindSingleSubregUser(UseMI, Reg, MRI)
+ : &UseMI;
if (!SrcMI || !isConvertibleToSDWA(*SrcMI, ST, TII) ||
!canCombineSelections(*SrcMI, TII))
return nullptr;
}
// Now that it's guaranteed all uses are legal, iterate over the uses again
// to add them for later conversion.
- for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
+ for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg)) {
// Should not get a subregister here
- assert(isSameReg(UseMO, *Reg));
+ assert(isSameReg(UseMO, *Op));
SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
- MachineInstr *UseSrcMI = findUseSrc(*UseMO.getParent(), *Reg, getMRI());
+ MachineInstr *Parent = UseMO.getParent();
+ MachineInstr *UseSrcMI =
+ Parent->isRegSequence()
+ ? regSequenceFindSingleSubregUser(*Parent, Reg, MRI)
+ : Parent;
potentialMatchesMap[UseSrcMI].push_back(this);
}
return nullptr;
@@ -519,7 +501,7 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
MachineInstr *Parent = PotentialMO->getParent();
if (Parent->isRegSequence()) {
- Parent = regSequenceFindSingleSubregUse(
+ Parent = regSequenceFindSingleSubregUser(
*Parent, getReplacedOperand()->getReg(), getMRI());
return Parent && canCombineSelections(*Parent, TII) ? Parent : nullptr;
}
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence-1.mir
similarity index 80%
rename from llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir
rename to llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence-1.mir
index 604e4188525f1..40676a03554ae 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence-1.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=si-peephole-sdwa -o - %s | FileCheck %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=si-peephole-sdwa -o - %s | FileCheck -check-prefix=ASM %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=si-peephole-sdwa -stop-after=si-peephole-sdwa -o - %s | FileCheck %s
+
---
name: sdwa_reg_sequence
tracksRegLiveness: true
@@ -8,14 +8,6 @@ body: |
bb.0:
liveins: $vgpr0
- ; ASM-LABEL: ; %bb.0:
- ; ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
- ; ASM-NEXT: v_add_u32_e32 v1, 10, v0
- ; ASM-NEXT: v_add_u32_e32 v0, 20, v0
- ; ASM-NEXT: v_add_co_u32_sdwa v0, vcc, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
- ; ASM-NEXT: v_addc_co_u32_e64 v1, s[0:1], 0, 0, vcc
- ; ASM-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
- ; ASM-NEXT: s_endpgm
; CHECK-LABEL: name: sdwa_reg_sequence
; CHECK: liveins: $vgpr0
@@ -35,8 +27,6 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY1]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: .1.entry:
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = V_ADD_U32_e64 %0, 10, 0, implicit $exec
%2:vgpr_32 = V_ADD_U32_e64 %0, 20, 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence-2.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence-2.mir
new file mode 100644
index 0000000000000..b3de7e5d0cf00
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence-2.mir
@@ -0,0 +1,33 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=si-peephole-sdwa -o - %s | FileCheck %s
+
+---
+name: sdwa_reg_sequence
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: ; %bb.0:
+ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+ ; CHECK-NEXT: v_add_u32_e32 v1, 10, v0
+ ; CHECK-NEXT: v_add_u32_e32 v0, 20, v0
+ ; CHECK-NEXT: v_add_co_u32_sdwa v0, vcc, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+ ; CHECK-NEXT: v_addc_co_u32_e64 v1, s[0:1], 0, 0, vcc
+ ; CHECK-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
+ ; CHECK-NEXT: s_endpgm
+ bb.0:
+ liveins: $vgpr0
+
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = V_ADD_U32_e64 %0, 10, 0, implicit $exec
+ %2:vgpr_32 = V_ADD_U32_e64 %0, 20, 0, implicit $exec
+ %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %4:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %3, %subreg.sub1
+ %5:sreg_32 = S_MOV_B32 255
+ %6:vgpr_32 = V_AND_B32_e64 killed %2, killed %5, implicit $exec
+ %7:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %3, %subreg.sub1
+ %8:vgpr_32, %9:sreg_64_xexec = V_ADD_CO_U32_e64 %4.sub0, %7.sub0, 0, implicit $exec
+ %10:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 0, 0, killed %9, 0, implicit $exec
+ %12:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %10, %subreg.sub1
+ %13:sreg_64 = IMPLICIT_DEF
+ %14:vreg_64 = COPY %13
+ GLOBAL_STORE_DWORDX2 killed %14, killed %12, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ S_ENDPGM 0
+...
>From f13cfb3efbde5ab0aba9936e1f9dad60201db7fa Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Tue, 1 Apr 2025 10:12:05 -0400
Subject: [PATCH 4/9] fixup! Add MachineRegisterInfo::getOneNonDBGUse{,r}
---
llvm/include/llvm/CodeGen/MachineRegisterInfo.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index ee4ab29586bca..e73c6ae8aaa96 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -592,8 +592,8 @@ class MachineRegisterInfo {
/// multiple uses.
bool hasOneNonDBGUser(Register RegNo) const;
- /// hasAtMostUserInstrs - Return true if the given register has at most \p MaxUsers
- /// non-debug user instructions.
+ /// hasAtMostUserInstrs - Return true if the given register has at most \p
+ /// MaxUsers non-debug user instructions.
bool hasAtMostUserInstrs(Register Reg, unsigned MaxUsers) const;
/// getOneNonDBGUse - Return the unique non-Debug use of \p RegNo,
>From ca31d029fd192735b334d51332462830744d9eca Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Thu, 10 Apr 2025 14:22:12 -0400
Subject: [PATCH 5/9] Simplify SDWASrcOperand::potentialToConvert
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 19 +++++++------------
1 file changed, 7 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index d18a0ff671ab8..acbdeefeeec17 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -467,6 +467,7 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
MachineRegisterInfo *MRI = getMRI();
// Check that all instructions that use Reg can be converted
+ SmallVector<MachineInstr *, 4> Uses;
for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg)) {
MachineInstr *SrcMI =
UseMI.isRegSequence()
@@ -475,21 +476,15 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
if (!SrcMI || !isConvertibleToSDWA(*SrcMI, ST, TII) ||
!canCombineSelections(*SrcMI, TII))
return nullptr;
+
+ Uses.push_back(SrcMI);
}
// Now that it's guaranteed all uses are legal, iterate over the uses again
// to add them for later conversion.
- for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg)) {
- // Should not get a subregister here
- assert(isSameReg(UseMO, *Op));
-
- SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
- MachineInstr *Parent = UseMO.getParent();
- MachineInstr *UseSrcMI =
- Parent->isRegSequence()
- ? regSequenceFindSingleSubregUser(*Parent, Reg, MRI)
- : Parent;
- potentialMatchesMap[UseSrcMI].push_back(this);
- }
+ auto &PM = *PotentialMatches;
+ for (auto *Use : Uses)
+ PM[Use].push_back(this);
+
return nullptr;
}
>From 59bee7a3abc1ff4a0b908aa129a3fc2793b0a13b Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Mon, 11 Aug 2025 09:08:05 -0400
Subject: [PATCH 6/9] Add new test for multiple reg uses and move all tests
---
.../AMDGPU/sdwa-peephole-reg-sequence-1.mir | 45 ------
.../AMDGPU/sdwa-peephole-reg-sequence-2.mir | 33 -----
.../AMDGPU/sdwa-peephole-reg-sequence.mir | 133 ++++++++++++++++++
3 files changed, 133 insertions(+), 78 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence-1.mir
delete mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence-2.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence-1.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence-1.mir
deleted file mode 100644
index 40676a03554ae..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence-1.mir
+++ /dev/null
@@ -1,45 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=si-peephole-sdwa -stop-after=si-peephole-sdwa -o - %s | FileCheck %s
-
----
-name: sdwa_reg_sequence
-tracksRegLiveness: true
-body: |
- bb.0:
- liveins: $vgpr0
-
-
- ; CHECK-LABEL: name: sdwa_reg_sequence
- ; CHECK: liveins: $vgpr0
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], 10, 0, implicit $exec
- ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], 20, 0, implicit $exec
- ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_U32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 255
- ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_ADD_U32_e64_1]], killed [[S_MOV_B32_]], implicit $exec
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
- ; CHECK-NEXT: [[V_ADD_CO_U32_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_sdwa 0, [[REG_SEQUENCE]].sub0, 0, [[V_ADD_U32_e64_1]], 0, 6, 0, 6, 0, implicit-def $vcc, implicit $exec
- ; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 0, 0, $vcc, 0, implicit $exec
- ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_sdwa]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[DEF]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY1]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
- ; CHECK-NEXT: S_ENDPGM 0
- %0:vgpr_32 = COPY $vgpr0
- %1:vgpr_32 = V_ADD_U32_e64 %0, 10, 0, implicit $exec
- %2:vgpr_32 = V_ADD_U32_e64 %0, 20, 0, implicit $exec
- %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- %4:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %3, %subreg.sub1
- %5:sreg_32 = S_MOV_B32 255
- %6:vgpr_32 = V_AND_B32_e64 killed %2, killed %5, implicit $exec
- %7:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %3, %subreg.sub1
- %8:vgpr_32, %9:sreg_64_xexec = V_ADD_CO_U32_e64 %4.sub0, %7.sub0, 0, implicit $exec
- %10:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 0, 0, killed %9, 0, implicit $exec
- %12:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %10, %subreg.sub1
- %13:sreg_64 = IMPLICIT_DEF
- %14:vreg_64 = COPY %13
- GLOBAL_STORE_DWORDX2 killed %14, killed %12, 0, 0, implicit $exec :: (store (s64), addrspace 1)
- S_ENDPGM 0
-...
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence-2.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence-2.mir
deleted file mode 100644
index b3de7e5d0cf00..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence-2.mir
+++ /dev/null
@@ -1,33 +0,0 @@
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=si-peephole-sdwa -o - %s | FileCheck %s
-
----
-name: sdwa_reg_sequence
-tracksRegLiveness: true
-body: |
- ; CHECK-LABEL: ; %bb.0:
- ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
- ; CHECK-NEXT: v_add_u32_e32 v1, 10, v0
- ; CHECK-NEXT: v_add_u32_e32 v0, 20, v0
- ; CHECK-NEXT: v_add_co_u32_sdwa v0, vcc, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
- ; CHECK-NEXT: v_addc_co_u32_e64 v1, s[0:1], 0, 0, vcc
- ; CHECK-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
- ; CHECK-NEXT: s_endpgm
- bb.0:
- liveins: $vgpr0
-
- %0:vgpr_32 = COPY $vgpr0
- %1:vgpr_32 = V_ADD_U32_e64 %0, 10, 0, implicit $exec
- %2:vgpr_32 = V_ADD_U32_e64 %0, 20, 0, implicit $exec
- %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- %4:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %3, %subreg.sub1
- %5:sreg_32 = S_MOV_B32 255
- %6:vgpr_32 = V_AND_B32_e64 killed %2, killed %5, implicit $exec
- %7:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %3, %subreg.sub1
- %8:vgpr_32, %9:sreg_64_xexec = V_ADD_CO_U32_e64 %4.sub0, %7.sub0, 0, implicit $exec
- %10:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 0, 0, killed %9, 0, implicit $exec
- %12:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %10, %subreg.sub1
- %13:sreg_64 = IMPLICIT_DEF
- %14:vreg_64 = COPY %13
- GLOBAL_STORE_DWORDX2 killed %14, killed %12, 0, 0, implicit $exec :: (store (s64), addrspace 1)
- S_ENDPGM 0
-...
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir
new file mode 100644
index 0000000000000..c212771cd273a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir
@@ -0,0 +1,133 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=si-peephole-sdwa < %s | FileCheck %s
+
+---
+name: sdwa_reg_sequence
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+
+ ; CHECK-LABEL: name: sdwa_reg_sequence
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], 10, 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], 20, 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_U32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 255
+ ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_ADD_U32_e64_1]], killed [[S_MOV_B32_]], implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+ ; CHECK-NEXT: [[V_ADD_CO_U32_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_sdwa 0, [[REG_SEQUENCE]].sub0, 0, [[V_ADD_U32_e64_1]], 0, 6, 0, 6, 0, implicit-def $vcc, implicit $exec
+ ; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 0, 0, $vcc, 0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_sdwa]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY1]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = V_ADD_U32_e64 %0, 10, 0, implicit $exec
+ %2:vgpr_32 = V_ADD_U32_e64 %0, 20, 0, implicit $exec
+ %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %4:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %3, %subreg.sub1
+ %5:sreg_32 = S_MOV_B32 255
+ %6:vgpr_32 = V_AND_B32_e64 killed %2, killed %5, implicit $exec
+ %7:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %3, %subreg.sub1
+ %8:vgpr_32, %9:sreg_64_xexec = V_ADD_CO_U32_e64 %4.sub0, %7.sub0, 0, implicit $exec
+ %10:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 0, 0, killed %9, 0, implicit $exec
+ %12:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %10, %subreg.sub1
+ %13:sreg_64 = IMPLICIT_DEF
+ %14:vreg_64 = COPY %13
+ GLOBAL_STORE_DWORDX2 killed %14, killed %12, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ S_ENDPGM 0
+...
+
+---
+name: sdwa_reg_sequence_composed_subregs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1_vgpr2
+
+ ; CHECK-LABEL: name: sdwa_reg_sequence_composed_subregs
+ ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
+ ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]].sub0, 10, 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]].sub1, 20, 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_U32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[REG_SEQUENCE]].sub1, %subreg.sub1
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 255
+ ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_ADD_U32_e64_1]], killed [[S_MOV_B32_]], implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+ ; CHECK-NEXT: [[V_ADD_CO_U32_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_sdwa 0, [[REG_SEQUENCE1]].sub1, 0, [[V_ADD_U32_e64_1]], 0, 6, 0, 6, 0, implicit-def $vcc, implicit $exec
+ ; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 0, 0, $vcc, 0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_sdwa]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:vreg_64 = COPY $vgpr1_vgpr2
+ %1:vgpr_32 = V_ADD_U32_e64 %0.sub0, 10, 0, implicit $exec
+ %2:vgpr_32 = V_ADD_U32_e64 %0.sub1, 20, 0, implicit $exec
+ %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %4:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %3, %subreg.sub1
+ %5:vreg_64 = REG_SEQUENCE %0.sub0, %subreg.sub0, %4.sub1, %subreg.sub1
+ %6:sreg_32 = S_MOV_B32 255
+ %7:vgpr_32 = V_AND_B32_e64 killed %2, killed %6, implicit $exec
+ %8:vreg_64 = REG_SEQUENCE %7, %subreg.sub0, %3, %subreg.sub1
+ %9:vgpr_32, %10:sreg_64_xexec = V_ADD_CO_U32_e64 %5.sub1, %8.sub0, 0, implicit $exec
+ %11:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 0, 0, killed %10, 0, implicit $exec
+ %13:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %11, %subreg.sub1
+ %15:vreg_64 = COPY %13
+ GLOBAL_STORE_DWORDX2 killed %15, killed %13, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ S_ENDPGM 0
+...
+
+
+---
+name: sdwa_reg_sequence_multiple_uses
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+
+ ; CHECK-LABEL: name: sdwa_reg_sequence_multiple_uses
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], 10, 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], 20, 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_U32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 255
+ ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[V_ADD_U32_e64_1]], killed [[S_MOV_B32_]], implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+ ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[REG_SEQUENCE]].sub0, [[REG_SEQUENCE1]].sub0, 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 0, 0, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_AND_B32_e64_]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 killed [[COPY1]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = V_ADD_U32_e64 %0, 10, 0, implicit $exec
+ %2:vgpr_32 = V_ADD_U32_e64 %0, 20, 0, implicit $exec
+ %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %4:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %3, %subreg.sub1
+ %5:sreg_32 = S_MOV_B32 255
+ %6:vgpr_32 = V_AND_B32_e64 killed %2, killed %5, implicit $exec
+ %7:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %3, %subreg.sub1
+ %8:vgpr_32, %9:sreg_64_xexec = V_ADD_CO_U32_e64 %4.sub0, %7.sub0, 0, implicit $exec
+ %10:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 0, 0, killed %9, 0, implicit $exec
+ %12:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %10, %subreg.sub1
+ %13:sreg_64 = IMPLICIT_DEF
+ %14:vreg_64 = COPY %13
+ %15:vgpr_32 = COPY %6
+ GLOBAL_STORE_DWORDX2 killed %14, killed %12, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ S_ENDPGM 0
+...
>From ef4f797f04000bbe2e1326bb35c4c11b52d95249 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Mon, 11 Aug 2025 09:42:49 -0400
Subject: [PATCH 7/9] Add some comments
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 1b19f36f52db7..18602aa80d082 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -423,6 +423,10 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
/// which is copied from \p Op, i.e. the operand following
/// \p Op in the operands of \p RegSeq, or nullopt if the
/// the \p Op is not an operand of \p RegSeq.
+///
+/// Example:
+/// For the instruction REG_SEQUENCE %1, %subreg.sub0, %2, %subreg.sub1,
+/// return %subreg.sub0 for \p Reg = %1 and %subreg.sub1 for \p Reg = %2.
static std::optional<unsigned> regSequenceFindSubreg(const MachineInstr &RegSeq,
Register Reg) {
if (!RegSeq.isRegSequence())
@@ -441,6 +445,18 @@ static std::optional<unsigned> regSequenceFindSubreg(const MachineInstr &RegSeq,
/// Return the single user of \p RegSeq which accesses the subregister
/// that copies from \p Reg. Returns nullptr if \p Reg is not used by
/// exactly one operand of \p RegSeq.
+///
+/// Example:
+/// %0:vgpr_32 = IMPLICIT_DEF
+/// %1:vgpr_32 = IMPLICIT_DEF
+/// %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
+/// %3:vgpr_32, %4:sreg_64_xexec = V_ADD_CO_U32_e64 %2.sub0, 2, 0, implicit
+/// $exec
+/// [...]
+///
+/// If \p RegSeq is the MI defining %2 and \p Reg = %0, the function
+/// returns the instruction defining %3, provided that %2 has no other
+/// uses. For any other register, it returns nullptr.
static MachineInstr *regSequenceFindSingleSubregUser(MachineInstr &RegSeq,
Register Reg,
MachineRegisterInfo *MRI) {
@@ -470,6 +486,10 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
// Check that all instructions that use Reg can be converted
SmallVector<MachineInstr *, 4> Uses;
for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg)) {
+ // Allow for indirect uses through REG_SEQUENCE instructions:
+ // consider the user (which is assumed to be unique) of the
+ // subregister defined by Reg in UseMI as the user of Reg
+ // instead of UseMI if UseMI is a REG_SEQUENCE.
MachineInstr *SrcMI =
UseMI.isRegSequence()
? regSequenceFindSingleSubregUser(UseMI, Reg, MRI)
>From 90c23d989c1f38f0702becf22a8a14347a0e1eee Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Mon, 11 Aug 2025 11:39:37 -0400
Subject: [PATCH 8/9] Remove unused includes
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 3 ---
1 file changed, 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 18602aa80d082..71217e2c58b39 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -26,9 +26,6 @@
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/ErrorHandling.h"
#include <optional>
using namespace llvm;
>From 973c536f29cc59d03979412c2e922bcf584987e9 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Mon, 11 Aug 2025 11:49:14 -0400
Subject: [PATCH 9/9] fixup! Add new test for multiple reg uses and move all
tests
---
llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir
index c212771cd273a..b3a4418830daf 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-reg-sequence.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=si-peephole-sdwa < %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - %s | FileCheck %s
---
name: sdwa_reg_sequence
More information about the llvm-commits
mailing list