[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Fix isExtractHiElt when selecting fma_mix (PR #102130)
Petar Avramovic via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Aug 6 05:07:49 PDT 2024
https://github.com/petar-avramovic created https://github.com/llvm/llvm-project/pull/102130
isExtractHiElt should return new source register instead of returning
instruction that defines it. Src = MI.getOperand(0).getReg() is not
correct when MI(for example G_UNMERGE_VALUES) defines multiple registers.
Refactor existing code to work with source registers only.
>From b6c03d1785bd713344c6b578869a0b36fc6473e3 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Tue, 6 Aug 2024 13:50:35 +0200
Subject: [PATCH] AMDGPU/GlobalISel: Fix isExtractHiElt when selecting fma_mix
isExtractHiElt should return new source register instead of returning
instruction that defines it. Src = MI.getOperand(0).getReg() is not
correct when MI(for example G_UNMERGE_VALUES) defines multiple registers.
Refactor existing code to work with source registers only.
---
.../AMDGPU/AMDGPUInstructionSelector.cpp | 164 ++++++++----------
.../Target/AMDGPU/AMDGPUInstructionSelector.h | 2 +-
.../GlobalISel/combine-fma-add-ext-fma.ll | 8 +-
3 files changed, 74 insertions(+), 100 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 73f3921b2ff4c..f78699f88de56 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1372,8 +1372,8 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
MachineInstrBuilder SelectedMI;
MachineOperand &LHS = I.getOperand(2);
MachineOperand &RHS = I.getOperand(3);
- auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
- auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
+ auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
+ auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
Register Src0Reg =
copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
Register Src1Reg =
@@ -2467,14 +2467,48 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
return false;
}
+static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
+ return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
+}
+
+static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
+ Register BitcastSrc;
+ if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
+ Reg = BitcastSrc;
+ return Reg;
+}
+
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
Register &Out) {
+ Register Trunc;
+ if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
+ return false;
+
Register LShlSrc;
- if (mi_match(In, MRI,
- m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) {
- Out = LShlSrc;
+ Register Cst;
+ if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
+ Cst = stripCopy(Cst, MRI);
+ if (mi_match(Cst, MRI, m_SpecificICst(16))) {
+ Out = stripBitCast(LShlSrc, MRI);
+ return true;
+ }
+ }
+
+ MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
+ if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
+ return false;
+
+ assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
+ LLT::fixed_vector(2, 16));
+
+ ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
+ assert(Mask.size() == 2);
+
+ if (Mask[0] == 1 && Mask[1] <= 1) {
+ Out = Shuffle->getOperand(0).getReg();
return true;
}
+
return false;
}
@@ -3550,11 +3584,8 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
}
-std::pair<Register, unsigned>
-AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
- bool IsCanonicalizing,
- bool AllowAbs, bool OpSel) const {
- Register Src = Root.getReg();
+std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
+ Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
unsigned Mods = 0;
MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
@@ -3617,7 +3648,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
return {{
[=](MachineInstrBuilder &MIB) {
@@ -3633,7 +3664,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
/*IsCanonicalizing=*/true,
/*AllowAbs=*/false);
@@ -3660,7 +3691,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
return {{
[=](MachineInstrBuilder &MIB) {
@@ -3675,7 +3706,8 @@ AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false);
+ std::tie(Src, Mods) =
+ selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
return {{
[=](MachineInstrBuilder &MIB) {
@@ -3689,8 +3721,9 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true,
- /*AllowAbs=*/false);
+ std::tie(Src, Mods) =
+ selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
+ /*AllowAbs=*/false);
return {{
[=](MachineInstrBuilder &MIB) {
@@ -4016,7 +4049,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
// FIXME: Handle op_sel
return {{
@@ -4029,7 +4062,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
/*IsCanonicalizing=*/true,
/*AllowAbs=*/false,
/*OpSel=*/false);
@@ -4047,7 +4080,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
/*IsCanonicalizing=*/true,
/*AllowAbs=*/false,
/*OpSel=*/true);
@@ -5229,59 +5262,6 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
[=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
}
-// Variant of stripBitCast that returns the instruction instead of a
-// MachineOperand.
-static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
- if (MI->getOpcode() == AMDGPU::G_BITCAST)
- return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
- return MI;
-}
-
-// Figure out if this is really an extract of the high 16-bits of a dword,
-// returns nullptr if it isn't.
-static MachineInstr *isExtractHiElt(MachineInstr *Inst,
- MachineRegisterInfo &MRI) {
- Inst = stripBitCast(Inst, MRI);
-
- if (Inst->getOpcode() != AMDGPU::G_TRUNC)
- return nullptr;
-
- MachineInstr *TruncOp =
- getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI);
- TruncOp = stripBitCast(TruncOp, MRI);
-
- // G_LSHR x, (G_CONSTANT i32 16)
- if (TruncOp->getOpcode() == AMDGPU::G_LSHR) {
- auto SrlAmount = getIConstantVRegValWithLookThrough(
- TruncOp->getOperand(2).getReg(), MRI);
- if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) {
- MachineInstr *SrlOp =
- getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
- return stripBitCast(SrlOp, MRI);
- }
- }
-
- // G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
- // 1, 0 swaps the low/high 16 bits.
- // 1, 1 sets the high 16 bits to be the same as the low 16.
- // in any case, it selects the high elts.
- if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
- assert(MRI.getType(TruncOp->getOperand(0).getReg()) ==
- LLT::fixed_vector(2, 16));
-
- ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask();
- assert(Mask.size() == 2);
-
- if (Mask[0] == 1 && Mask[1] <= 1) {
- MachineInstr *LHS =
- getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
- return stripBitCast(LHS, MRI);
- }
- }
-
- return nullptr;
-}
-
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
bool &Matched) const {
@@ -5289,37 +5269,34 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
-
- MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
- if (MI->getOpcode() == AMDGPU::G_FPEXT) {
- MachineOperand *MO = &MI->getOperand(1);
- Src = MO->getReg();
- MI = getDefIgnoringCopies(Src, *MRI);
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
+ if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
assert(MRI->getType(Src) == LLT::scalar(16));
- // See through bitcasts.
- // FIXME: Would be nice to use stripBitCast here.
- if (MI->getOpcode() == AMDGPU::G_BITCAST) {
- MO = &MI->getOperand(1);
- Src = MO->getReg();
- MI = getDefIgnoringCopies(Src, *MRI);
- }
+ // Only change Src if src modifier could be gained. In such cases new Src
+ // could be sgpr but this does not violate constant bus restriction for
+ // instruction that is being selected.
+ // Note: Src is not changed when there is only a simple sgpr to vgpr copy
+ // since this could violate constant bus restriction.
+ Register PeekSrc = stripCopy(Src, *MRI);
const auto CheckAbsNeg = [&]() {
// Be careful about folding modifiers if we already have an abs. fneg is
// applied last, so we don't want to apply an earlier fneg.
if ((Mods & SISrcMods::ABS) == 0) {
unsigned ModsTmp;
- std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO);
- MI = getDefIgnoringCopies(Src, *MRI);
+ std::tie(PeekSrc, ModsTmp) = selectVOP3ModsImpl(PeekSrc);
- if ((ModsTmp & SISrcMods::NEG) != 0)
+ if ((ModsTmp & SISrcMods::NEG) != 0) {
Mods ^= SISrcMods::NEG;
+ Src = PeekSrc;
+ }
- if ((ModsTmp & SISrcMods::ABS) != 0)
+ if ((ModsTmp & SISrcMods::ABS) != 0) {
Mods |= SISrcMods::ABS;
+ Src = PeekSrc;
+ }
}
};
@@ -5332,12 +5309,9 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
Mods |= SISrcMods::OP_SEL_1;
- if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) {
+ if (isExtractHiElt(*MRI, PeekSrc, PeekSrc)) {
+ Src = PeekSrc;
Mods |= SISrcMods::OP_SEL_0;
- MI = ExtractHiEltMI;
- MO = &MI->getOperand(0);
- Src = MO->getReg();
-
CheckAbsNeg();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 7fff7d2685e7f..69806b240cf2b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -150,7 +150,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const;
bool selectSBarrierLeave(MachineInstr &I) const;
- std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root,
+ std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src,
bool IsCanonicalizing = true,
bool AllowAbs = true,
bool OpSel = false) const;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll
index e910c2eca2ced..b2b433167fe4d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll
@@ -446,28 +446,28 @@ define amdgpu_ps float @test_matching_source_from_unmerge(ptr addrspace(3) %aptr
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: ds_read_b64 v[2:3], v0
; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: test_matching_source_from_unmerge:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: ds_read_b64 v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_matching_source_from_unmerge:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: ds_read_b64 v[2:3], v0
; GFX10-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_matching_source_from_unmerge:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: ds_read_b64 v[2:3], v0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
%a = load <4 x half>, ptr addrspace(3) %aptr, align 16
More information about the llvm-branch-commits
mailing list