[llvm] r319662 - [AMDGPU] SDWA: add support for PRESERVE into SDWA peephole.
Sam Kolton via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 4 08:22:32 PST 2017
Author: skolton
Date: Mon Dec 4 08:22:32 2017
New Revision: 319662
URL: http://llvm.org/viewvc/llvm-project?rev=319662&view=rev
Log:
[AMDGPU] SDWA: add support for PRESERVE into SDWA peephole.
Summary:
Reviewers: arsenm, vpykhtin, rampitec
Subscribers: kzhuravl, wdng, nhaehnle, mgorny, yaxunl, dstuttard, tpr, t-tye
Differential Revision: https://reviews.llvm.org/D37817
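In short, the peephole now recognizes a v_or_b32 that merges the results of
two instructions writing disjoint parts of a register, e.g. (adapted from the
comments in SIPeepholeSDWA.cpp below; per the in-tree TODO, the second
instruction currently has to be in SDWA form as well):

  v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
  v_add_f16_e32 v3, v1, v2
  v_or_b32_e32 v4, v0, v3

The SDWA instruction is rewritten to write v4 directly with
dst_unused:UNUSED_PRESERVE, tying its destination to an implicit use of v3 so
the untouched bits survive, and the v_or_b32 is removed.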
Added:
llvm/trunk/test/CodeGen/AMDGPU/sdwa-preserve.mir
Modified:
llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
llvm/trunk/test/CodeGen/AMDGPU/fabs.f16.ll
llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
llvm/trunk/test/CodeGen/AMDGPU/fneg.f16.ll
llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=319662&r1=319661&r2=319662&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Mon Dec 4 08:22:32 2017
@@ -2687,6 +2687,28 @@ bool SIInstrInfo::verifyInstruction(cons
}
}
}
+
+ const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
+ if (DstUnused && DstUnused->isImm() &&
+ DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
+ const MachineOperand &Dst = MI.getOperand(DstIdx);
+ if (!Dst.isReg() || !Dst.isTied()) {
+ ErrInfo = "Dst register should have tied register";
+ return false;
+ }
+
+ const MachineOperand &TiedMO =
+ MI.getOperand(MI.findTiedOperandIdx(DstIdx));
+ if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
+ ErrInfo =
+ "Dst register should be tied to implicit use of preserved register";
+ return false;
+ } else if (TargetRegisterInfo::isPhysicalRegister(TiedMO.getReg()) &&
+ Dst.getReg() != TiedMO.getReg()) {
+ ErrInfo = "Dst register should use same physical register as preserved";
+ return false;
+ }
+ }
}
// Verify VOP*
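The verifier change above pins down the operand shape that the peephole
(below) emits for UNUSED_PRESERVE. Roughly, in abbreviated MIR (illustrative
syntax, not verbatim verifier output):

  %dst:vgpr_32 = V_ADD_F16_sdwa ..., dst_unused:UNUSED_PRESERVE, ..., implicit %preserved (tied to %dst)

That is, the destination must be a register tied to an implicit use of the
preserved value, and when physical registers are involved both must name the
same register.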
Modified: llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp?rev=319662&r1=319661&r2=319662&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp Mon Dec 4 08:22:32 2017
@@ -61,6 +61,7 @@ STATISTIC(NumSDWAInstructionsPeepholed,
namespace {
class SDWAOperand;
+class SDWADstOperand;
class SIPeepholeSDWA : public MachineFunctionPass {
public:
@@ -86,6 +87,7 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
void matchSDWAOperands(MachineFunction &MF);
+ std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const;
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const;
@@ -122,6 +124,11 @@ public:
MachineRegisterInfo *getMRI() const {
return &getParentInst()->getParent()->getParent()->getRegInfo();
}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ virtual void print(raw_ostream& OS) const = 0;
+ void dump() const { print(dbgs()); }
+#endif
};
using namespace AMDGPU::SDWA;
@@ -137,8 +144,8 @@ public:
SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
bool Sext_ = false)
- : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
- Neg(Neg_), Sext(Sext_) {}
+ : SDWAOperand(TargetOp, ReplacedOp),
+ SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}
MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
@@ -150,6 +157,10 @@ public:
uint64_t getSrcMods(const SIInstrInfo *TII,
const MachineOperand *SrcOp) const;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void print(raw_ostream& OS) const override;
+#endif
};
class SDWADstOperand : public SDWAOperand {
@@ -158,15 +169,39 @@ private:
DstUnused DstUn;
public:
+
SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
- : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
+ : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
SdwaSel getDstSel() const { return DstSel; }
DstUnused getDstUnused() const { return DstUn; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void print(raw_ostream& OS) const override;
+#endif
+};
+
+class SDWADstPreserveOperand : public SDWADstOperand {
+private:
+ MachineOperand *Preserve;
+
+public:
+ SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
+ MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
+ : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
+ Preserve(PreserveOp) {}
+
+ bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
+
+ MachineOperand *getPreservedOperand() const { return Preserve; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void print(raw_ostream& OS) const override;
+#endif
};
} // end anonymous namespace
@@ -181,7 +216,8 @@ FunctionPass *llvm::createSIPeepholeSDWA
return new SIPeepholeSDWA();
}
-#ifndef NDEBUG
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream& operator<<(raw_ostream &OS, const SdwaSel &Sel) {
switch(Sel) {
case BYTE_0: OS << "BYTE_0"; break;
@@ -204,20 +240,33 @@ static raw_ostream& operator<<(raw_ostre
return OS;
}
-static raw_ostream& operator<<(raw_ostream &OS, const SDWASrcOperand &Src) {
- OS << "SDWA src: " << *Src.getTargetOperand()
- << " src_sel:" << Src.getSrcSel()
- << " abs:" << Src.getAbs() << " neg:" << Src.getNeg()
- << " sext:" << Src.getSext() << '\n';
+static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
+ Operand.print(OS);
return OS;
}
-static raw_ostream& operator<<(raw_ostream &OS, const SDWADstOperand &Dst) {
- OS << "SDWA dst: " << *Dst.getTargetOperand()
- << " dst_sel:" << Dst.getDstSel()
- << " dst_unused:" << Dst.getDstUnused() << '\n';
- return OS;
+LLVM_DUMP_METHOD
+void SDWASrcOperand::print(raw_ostream& OS) const {
+ OS << "SDWA src: " << *getTargetOperand()
+ << " src_sel:" << getSrcSel()
+ << " abs:" << getAbs() << " neg:" << getNeg()
+ << " sext:" << getSext() << '\n';
+}
+
+LLVM_DUMP_METHOD
+void SDWADstOperand::print(raw_ostream& OS) const {
+ OS << "SDWA dst: " << *getTargetOperand()
+ << " dst_sel:" << getDstSel()
+ << " dst_unused:" << getDstUnused() << '\n';
+}
+
+LLVM_DUMP_METHOD
+void SDWADstPreserveOperand::print(raw_ostream& OS) const {
+ OS << "SDWA preserve dst: " << *getTargetOperand()
+ << " dst_sel:" << getDstSel()
+ << " preserve:" << *getPreservedOperand() << '\n';
}
+
#endif
static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
@@ -239,23 +288,43 @@ static bool isSameReg(const MachineOpera
LHS.getSubReg() == RHS.getSubReg();
}
-static bool isSubregOf(const MachineOperand &SubReg,
- const MachineOperand &SuperReg,
- const TargetRegisterInfo *TRI) {
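+// Return a use operand of Reg if every use of Reg is a full-register use
+// inside a single instruction; otherwise return nullptr.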
+static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
+ const MachineRegisterInfo *MRI) {
+ if (!Reg->isReg() || !Reg->isDef())
+ return nullptr;
+
+ MachineOperand *ResMO = nullptr;
+ for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
+ // If there exist use of subreg of Reg then return nullptr
+ if (!isSameReg(UseMO, *Reg))
+ return nullptr;
- if (!SuperReg.isReg() || !SubReg.isReg())
- return false;
+ // Check that there is only one instruction that uses Reg
+ if (!ResMO) {
+ ResMO = &UseMO;
+ } else if (ResMO->getParent() != UseMO.getParent()) {
+ return nullptr;
+ }
+ }
- if (isSameReg(SuperReg, SubReg))
- return true;
+ return ResMO;
+}
- if (SuperReg.getReg() != SubReg.getReg())
- return false;
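+// Return the def operand of Reg in its unique (SSA) defining instruction,
+// or nullptr if there is no such instruction.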
+static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
+ const MachineRegisterInfo *MRI) {
+ if (!Reg->isReg())
+ return nullptr;
- LaneBitmask SuperMask = TRI->getSubRegIndexLaneMask(SuperReg.getSubReg());
- LaneBitmask SubMask = TRI->getSubRegIndexLaneMask(SubReg.getSubReg());
- SuperMask |= ~SubMask;
- return SuperMask.all();
+ MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
+ if (!DefInstr)
+ return nullptr;
+
+ for (auto &DefMO : DefInstr->defs()) {
+ if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
+ return &DefMO;
+ }
+
+ llvm_unreachable("invalid reg");
}
uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
@@ -286,30 +355,11 @@ uint64_t SDWASrcOperand::getSrcMods(cons
MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
// For an SDWA src operand, the potential instruction is one that uses the
// register defined by the parent instruction
- MachineRegisterInfo *MRI = getMRI();
- MachineOperand *Replaced = getReplacedOperand();
- assert(Replaced->isReg());
-
- MachineInstr *PotentialMI = nullptr;
- for (MachineOperand &PotentialMO : MRI->use_operands(Replaced->getReg())) {
- // If this is use of another subreg of dst reg then do nothing
- if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
- continue;
-
- // If there exist use of superreg of dst then we should not combine this
- // opernad
- if (!isSameReg(PotentialMO, *Replaced))
- return nullptr;
+ MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
+ if (!PotentialMO)
+ return nullptr;
- // Check that PotentialMI is only instruction that uses dst reg
- if (PotentialMI == nullptr) {
- PotentialMI = PotentialMO.getParent();
- } else if (PotentialMI != PotentialMO.getParent()) {
- return nullptr;
- }
- }
-
- return PotentialMI;
+ return PotentialMO->getParent();
}
bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
@@ -331,7 +381,7 @@ bool SDWASrcOperand::convertToSDWA(Machi
if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
- !isSameReg(*Src, *getReplacedOperand())) {
+ !isSameReg(*Src, *getReplacedOperand())) {
// In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
// src2. This is not allowed.
return false;
@@ -351,29 +401,18 @@ MachineInstr *SDWADstOperand::potentialT
// that this operand uses
MachineRegisterInfo *MRI = getMRI();
MachineInstr *ParentMI = getParentInst();
- MachineOperand *Replaced = getReplacedOperand();
- assert(Replaced->isReg());
- for (MachineOperand &PotentialMO : MRI->def_operands(Replaced->getReg())) {
- if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
- continue;
-
- if (!isSameReg(*Replaced, PotentialMO))
+ MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
+ if (!PotentialMO)
+ return nullptr;
+
+ // Check that ParentMI is the only instruction that uses replaced register
+ for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
+ if (&UseInst != ParentMI)
return nullptr;
-
- // Check that ParentMI is the only instruction that uses replaced register
- for (MachineOperand &UseMO : MRI->use_operands(PotentialMO.getReg())) {
- if (isSubregOf(UseMO, PotentialMO, MRI->getTargetRegisterInfo()) &&
- UseMO.getParent() != ParentMI) {
- return nullptr;
- }
- }
-
- // Due to SSA this should be onle def of replaced register, so return it
- return PotentialMO.getParent();
}
- return nullptr;
+ return PotentialMO->getParent();
}
bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
@@ -404,6 +443,36 @@ bool SDWADstOperand::convertToSDWA(Machi
return true;
}
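+
+// Fold MI into the v_or_b32 recorded as this operand's parent instruction:
+// MI is moved down to the v_or_b32, takes over its destination register,
+// keeps the other operand's bits via dst_unused:UNUSED_PRESERVE and a tied
+// implicit use, and the v_or_b32 itself is erased.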
+bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
+ const SIInstrInfo *TII) {
+ // MI should be moved right before v_or_b32.
+ // For this we should clear all kill flags on uses of MI src-operands, or
+ // else we can encounter a problem with a use of a killed operand.
+ for (MachineOperand &MO : MI.uses()) {
+ if (!MO.isReg())
+ continue;
+ getMRI()->clearKillFlags(MO.getReg());
+ }
+
+ // Move MI before v_or_b32
+ auto MBB = MI.getParent();
+ MBB->remove(&MI);
+ MBB->insert(getParentInst(), &MI);
+
+ // Add Implicit use of preserved register
+ MachineInstrBuilder MIB(*MBB->getParent(), MI);
+ MIB.addReg(getPreservedOperand()->getReg(),
+ RegState::ImplicitKill,
+ getPreservedOperand()->getSubReg());
+
+ // Tie dst to implicit use
+ MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
+ MI.getNumOperands() - 1);
+
+ // Convert MI as any other SDWADstOperand and remove v_or_b32
+ return SDWADstOperand::convertToSDWA(MI, TII);
+}
+
Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
if (Op.isImm()) {
return Op.getImm();
@@ -431,195 +500,316 @@ Optional<int64_t> SIPeepholeSDWA::foldTo
return None;
}
-void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
- for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : MBB) {
- unsigned Opcode = MI.getOpcode();
- switch (Opcode) {
- case AMDGPU::V_LSHRREV_B32_e32:
- case AMDGPU::V_ASHRREV_I32_e32:
- case AMDGPU::V_LSHLREV_B32_e32:
- case AMDGPU::V_LSHRREV_B32_e64:
- case AMDGPU::V_ASHRREV_I32_e64:
- case AMDGPU::V_LSHLREV_B32_e64: {
- // from: v_lshrrev_b32_e32 v1, 16/24, v0
- // to SDWA src:v0 src_sel:WORD_1/BYTE_3
-
- // from: v_ashrrev_i32_e32 v1, 16/24, v0
- // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
-
- // from: v_lshlrev_b32_e32 v1, 16/24, v0
- // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
- MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
- auto Imm = foldToImm(*Src0);
- if (!Imm)
- break;
-
- if (*Imm != 16 && *Imm != 24)
- break;
-
- MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
- MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
- if (TRI->isPhysicalRegister(Src1->getReg()) ||
- TRI->isPhysicalRegister(Dst->getReg()))
- break;
-
- if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
- Opcode == AMDGPU::V_LSHLREV_B32_e64) {
- auto SDWADst = make_unique<SDWADstOperand>(
- Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
- DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
- SDWAOperands[&MI] = std::move(SDWADst);
- ++NumSDWAPatternsFound;
- } else {
- auto SDWASrc = make_unique<SDWASrcOperand>(
- Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
- Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
- Opcode != AMDGPU::V_LSHRREV_B32_e64);
- DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
- SDWAOperands[&MI] = std::move(SDWASrc);
- ++NumSDWAPatternsFound;
- }
- break;
- }
+std::unique_ptr<SDWAOperand>
+SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::V_LSHRREV_B32_e32:
+ case AMDGPU::V_ASHRREV_I32_e32:
+ case AMDGPU::V_LSHLREV_B32_e32:
+ case AMDGPU::V_LSHRREV_B32_e64:
+ case AMDGPU::V_ASHRREV_I32_e64:
+ case AMDGPU::V_LSHLREV_B32_e64: {
+ // from: v_lshrrev_b32_e32 v1, 16/24, v0
+ // to SDWA src:v0 src_sel:WORD_1/BYTE_3
+
+ // from: v_ashrrev_i32_e32 v1, 16/24, v0
+ // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
+
+ // from: v_lshlrev_b32_e32 v1, 16/24, v0
+ // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ auto Imm = foldToImm(*Src0);
+ if (!Imm)
+ break;
+
+ if (*Imm != 16 && *Imm != 24)
+ break;
+
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ if (TRI->isPhysicalRegister(Src1->getReg()) ||
+ TRI->isPhysicalRegister(Dst->getReg()))
+ break;
+
+ if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
+ Opcode == AMDGPU::V_LSHLREV_B32_e64) {
+ return make_unique<SDWADstOperand>(
+ Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
+ } else {
+ return make_unique<SDWASrcOperand>(
+ Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
+ Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
+ Opcode != AMDGPU::V_LSHRREV_B32_e64);
+ }
+ break;
+ }
- case AMDGPU::V_LSHRREV_B16_e32:
- case AMDGPU::V_ASHRREV_I16_e32:
- case AMDGPU::V_LSHLREV_B16_e32:
- case AMDGPU::V_LSHRREV_B16_e64:
- case AMDGPU::V_ASHRREV_I16_e64:
- case AMDGPU::V_LSHLREV_B16_e64: {
- // from: v_lshrrev_b16_e32 v1, 8, v0
- // to SDWA src:v0 src_sel:BYTE_1
-
- // from: v_ashrrev_i16_e32 v1, 8, v0
- // to SDWA src:v0 src_sel:BYTE_1 sext:1
-
- // from: v_lshlrev_b16_e32 v1, 8, v0
- // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
- MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
- auto Imm = foldToImm(*Src0);
- if (!Imm || *Imm != 8)
- break;
-
- MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
- MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
-
- if (TRI->isPhysicalRegister(Src1->getReg()) ||
- TRI->isPhysicalRegister(Dst->getReg()))
- break;
-
- if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
- Opcode == AMDGPU::V_LSHLREV_B16_e64) {
- auto SDWADst =
- make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
- DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
- SDWAOperands[&MI] = std::move(SDWADst);
- ++NumSDWAPatternsFound;
- } else {
- auto SDWASrc = make_unique<SDWASrcOperand>(
- Src1, Dst, BYTE_1, false, false,
- Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
- Opcode != AMDGPU::V_LSHRREV_B16_e64);
- DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
- SDWAOperands[&MI] = std::move(SDWASrc);
- ++NumSDWAPatternsFound;
- }
- break;
- }
+ case AMDGPU::V_LSHRREV_B16_e32:
+ case AMDGPU::V_ASHRREV_I16_e32:
+ case AMDGPU::V_LSHLREV_B16_e32:
+ case AMDGPU::V_LSHRREV_B16_e64:
+ case AMDGPU::V_ASHRREV_I16_e64:
+ case AMDGPU::V_LSHLREV_B16_e64: {
+ // from: v_lshrrev_b16_e32 v1, 8, v0
+ // to SDWA src:v0 src_sel:BYTE_1
+
+ // from: v_ashrrev_i16_e32 v1, 8, v0
+ // to SDWA src:v0 src_sel:BYTE_1 sext:1
+
+ // from: v_lshlrev_b16_e32 v1, 8, v0
+ // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ auto Imm = foldToImm(*Src0);
+ if (!Imm || *Imm != 8)
+ break;
+
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+ if (TRI->isPhysicalRegister(Src1->getReg()) ||
+ TRI->isPhysicalRegister(Dst->getReg()))
+ break;
+
+ if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
+ Opcode == AMDGPU::V_LSHLREV_B16_e64) {
+ return make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
+ } else {
+ return make_unique<SDWASrcOperand>(
+ Src1, Dst, BYTE_1, false, false,
+ Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
+ Opcode != AMDGPU::V_LSHRREV_B16_e64);
+ }
+ break;
+ }
- case AMDGPU::V_BFE_I32:
- case AMDGPU::V_BFE_U32: {
- // e.g.:
- // from: v_bfe_u32 v1, v0, 8, 8
- // to SDWA src:v0 src_sel:BYTE_1
-
- // offset | width | src_sel
- // ------------------------
- // 0 | 8 | BYTE_0
- // 0 | 16 | WORD_0
- // 0 | 32 | DWORD ?
- // 8 | 8 | BYTE_1
- // 16 | 8 | BYTE_2
- // 16 | 16 | WORD_1
- // 24 | 8 | BYTE_3
-
- MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
- auto Offset = foldToImm(*Src1);
- if (!Offset)
- break;
-
- MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
- auto Width = foldToImm(*Src2);
- if (!Width)
- break;
-
- SdwaSel SrcSel = DWORD;
-
- if (*Offset == 0 && *Width == 8)
- SrcSel = BYTE_0;
- else if (*Offset == 0 && *Width == 16)
- SrcSel = WORD_0;
- else if (*Offset == 0 && *Width == 32)
- SrcSel = DWORD;
- else if (*Offset == 8 && *Width == 8)
- SrcSel = BYTE_1;
- else if (*Offset == 16 && *Width == 8)
- SrcSel = BYTE_2;
- else if (*Offset == 16 && *Width == 16)
- SrcSel = WORD_1;
- else if (*Offset == 24 && *Width == 8)
- SrcSel = BYTE_3;
- else
- break;
-
- MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
- MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
-
- if (TRI->isPhysicalRegister(Src0->getReg()) ||
- TRI->isPhysicalRegister(Dst->getReg()))
- break;
-
- auto SDWASrc = make_unique<SDWASrcOperand>(
- Src0, Dst, SrcSel, false, false,
- Opcode != AMDGPU::V_BFE_U32);
- DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
- SDWAOperands[&MI] = std::move(SDWASrc);
- ++NumSDWAPatternsFound;
+ case AMDGPU::V_BFE_I32:
+ case AMDGPU::V_BFE_U32: {
+ // e.g.:
+ // from: v_bfe_u32 v1, v0, 8, 8
+ // to SDWA src:v0 src_sel:BYTE_1
+
+ // offset | width | src_sel
+ // ------------------------
+ // 0 | 8 | BYTE_0
+ // 0 | 16 | WORD_0
+ // 0 | 32 | DWORD ?
+ // 8 | 8 | BYTE_1
+ // 16 | 8 | BYTE_2
+ // 16 | 16 | WORD_1
+ // 24 | 8 | BYTE_3
+
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ auto Offset = foldToImm(*Src1);
+ if (!Offset)
+ break;
+
+ MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+ auto Width = foldToImm(*Src2);
+ if (!Width)
+ break;
+
+ SdwaSel SrcSel = DWORD;
+
+ if (*Offset == 0 && *Width == 8)
+ SrcSel = BYTE_0;
+ else if (*Offset == 0 && *Width == 16)
+ SrcSel = WORD_0;
+ else if (*Offset == 0 && *Width == 32)
+ SrcSel = DWORD;
+ else if (*Offset == 8 && *Width == 8)
+ SrcSel = BYTE_1;
+ else if (*Offset == 16 && *Width == 8)
+ SrcSel = BYTE_2;
+ else if (*Offset == 16 && *Width == 16)
+ SrcSel = WORD_1;
+ else if (*Offset == 24 && *Width == 8)
+ SrcSel = BYTE_3;
+ else
+ break;
+
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+ if (TRI->isPhysicalRegister(Src0->getReg()) ||
+ TRI->isPhysicalRegister(Dst->getReg()))
+ break;
+
+ return make_unique<SDWASrcOperand>(
+ Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32);
+ }
+
+ case AMDGPU::V_AND_B32_e32:
+ case AMDGPU::V_AND_B32_e64: {
+ // e.g.:
+ // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
+ // to SDWA src:v0 src_sel:WORD_0/BYTE_0
+
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ auto ValSrc = Src1;
+ auto Imm = foldToImm(*Src0);
+
+ if (!Imm) {
+ Imm = foldToImm(*Src1);
+ ValSrc = Src0;
+ }
+
+ if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
+ break;
+
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+ if (TRI->isPhysicalRegister(Src1->getReg()) ||
+ TRI->isPhysicalRegister(Dst->getReg()))
+ break;
+
+ return make_unique<SDWASrcOperand>(
+ ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
+ }
+
+ case AMDGPU::V_OR_B32_e32:
+ case AMDGPU::V_OR_B32_e64: {
+ // Patterns for dst_unused:UNUSED_PRESERVE.
+ // e.g., from:
+ // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
+ //   src0_sel:WORD_1 src1_sel:WORD_1
+ // v_add_f16_e32 v3, v1, v2
+ // v_or_b32_e32 v4, v0, v3
+ // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3
+
+ // Check if one of the operands of v_or_b32 is an SDWA instruction
+ using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>;
+ auto CheckOROperandsForSDWA =
+ [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
+ if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
+ return CheckRetType(None);
+
+ MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
+ if (!Op1Def)
+ return CheckRetType(None);
+
+ MachineInstr *Op1Inst = Op1Def->getParent();
+ if (!TII->isSDWA(*Op1Inst))
+ return CheckRetType(None);
+
+ MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
+ if (!Op2Def)
+ return CheckRetType(None);
+
+ return CheckRetType(std::make_pair(Op1Def, Op2Def));
+ };
+
+ MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ assert(OrSDWA && OrOther);
+ auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
+ if (!Res) {
+ OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ assert(OrSDWA && OrOther);
+ Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
+ if (!Res)
break;
- }
- case AMDGPU::V_AND_B32_e32:
- case AMDGPU::V_AND_B32_e64: {
- // e.g.:
- // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
- // to SDWA src:v0 src_sel:WORD_0/BYTE_0
-
- MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
- MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
- auto ValSrc = Src1;
- auto Imm = foldToImm(*Src0);
-
- if (!Imm) {
- Imm = foldToImm(*Src1);
- ValSrc = Src0;
- }
-
- if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
- break;
-
- MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
-
- if (TRI->isPhysicalRegister(Src1->getReg()) ||
- TRI->isPhysicalRegister(Dst->getReg()))
- break;
-
- auto SDWASrc = make_unique<SDWASrcOperand>(
- ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
- DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
- SDWAOperands[&MI] = std::move(SDWASrc);
+ }
+
+ MachineOperand *OrSDWADef = Res->first;
+ MachineOperand *OrOtherDef = Res->second;
+ assert(OrSDWADef && OrOtherDef);
+
+ MachineInstr *SDWAInst = OrSDWADef->getParent();
+ MachineInstr *OtherInst = OrOtherDef->getParent();
+
+ // Check that OtherInst is actually bitwise compatible with SDWAInst, i.e.
+ // their destination patterns don't overlap. A compatible instruction can be
+ // either a regular instruction with compatible bitness or an SDWA
+ // instruction with the correct dst_sel
+ // SDWAInst | OtherInst bitness / OtherInst dst_sel
+ // -----------------------------------------------------
+ // DWORD | no / no
+ // WORD_0 | no / BYTE_2/3, WORD_1
+ // WORD_1 | 8/16-bit instructions / BYTE_0/1, WORD_0
+ // BYTE_0 | no / BYTE_1/2/3, WORD_1
+ // BYTE_1 | 8-bit / BYTE_0/2/3, WORD_1
+ // BYTE_2 | 8/16-bit / BYTE_0/1/3, WORD_0
+ // BYTE_3 | 8/16/24-bit / BYTE_0/1/2, WORD_0
+ // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
+ // but v_add_f32 is not.
+
+ // TODO: add support for non-SDWA instructions as OtherInst.
+ // For now this only works with SDWA instructions. For regular instructions
+ // there is no way to determine if the instruction writes only 8/16/24 bits
+ // of the full register size, and all registers are at least 32 bits wide.
+ if (!TII->isSDWA(*OtherInst))
+ break;
+
+ SdwaSel DstSel = static_cast<SdwaSel>(
+ TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
+ SdwaSel OtherDstSel = static_cast<SdwaSel>(
+ TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));
+
+ bool DstSelAgree = false;
+ switch (DstSel) {
+ case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
+ (OtherDstSel == BYTE_3) ||
+ (OtherDstSel == WORD_1));
+ break;
+ case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
+ (OtherDstSel == BYTE_1) ||
+ (OtherDstSel == WORD_0));
+ break;
+ case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
+ (OtherDstSel == BYTE_2) ||
+ (OtherDstSel == BYTE_3) ||
+ (OtherDstSel == WORD_1));
+ break;
+ case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
+ (OtherDstSel == BYTE_2) ||
+ (OtherDstSel == BYTE_3) ||
+ (OtherDstSel == WORD_1));
+ break;
+ case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
+ (OtherDstSel == BYTE_1) ||
+ (OtherDstSel == BYTE_3) ||
+ (OtherDstSel == WORD_0));
+ break;
+ case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
+ (OtherDstSel == BYTE_1) ||
+ (OtherDstSel == BYTE_2) ||
+ (OtherDstSel == WORD_0));
+ break;
+ default: DstSelAgree = false;
+ }
+
+ if (!DstSelAgree)
+ break;
+
+ // Also OtherInst dst_unused should be UNUSED_PAD
+ DstUnused OtherDstUnused = static_cast<DstUnused>(
+ TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
+ if (OtherDstUnused != DstUnused::UNUSED_PAD)
+ break;
+
+ // Create DstPreserveOperand
+ MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ assert(OrDst && OrDst->isReg());
+
+ return make_unique<SDWADstPreserveOperand>(
+ OrDst, OrSDWADef, OrOtherDef, DstSel);
+
+ }
+ }
+
+ return std::unique_ptr<SDWAOperand>(nullptr);
+}
+
+void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (auto Operand = matchSDWAOperand(MI)) {
+ DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
+ SDWAOperands[&MI] = std::move(Operand);
++NumSDWAPatternsFound;
- break;
- }
}
}
}
@@ -627,12 +817,16 @@ void SIPeepholeSDWA::matchSDWAOperands(M
bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
const SISubtarget &ST) const {
+ // Check if this is already an SDWA instruction
+ unsigned Opc = MI.getOpcode();
+ if (TII->isSDWA(Opc))
+ return true;
+
// Check if this instruction has opcode that supports SDWA
- int Opc = MI.getOpcode();
if (AMDGPU::getSDWAOp(Opc) == -1)
Opc = AMDGPU::getVOPe32(Opc);
- if (Opc == -1 || AMDGPU::getSDWAOp(Opc) == -1)
+ if (AMDGPU::getSDWAOp(Opc) == -1)
return false;
if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
@@ -665,9 +859,15 @@ bool SIPeepholeSDWA::isConvertibleToSDWA
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
const SDWAOperandsVector &SDWAOperands) {
// Convert to sdwa
- int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
- if (SDWAOpcode == -1)
- SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(MI.getOpcode()));
+ int SDWAOpcode;
+ unsigned Opcode = MI.getOpcode();
+ if (TII->isSDWA(Opcode)) {
+ SDWAOpcode = Opcode;
+ } else {
+ SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
+ if (SDWAOpcode == -1)
+ SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
+ }
assert(SDWAOpcode != -1);
const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
@@ -743,25 +943,44 @@ bool SIPeepholeSDWA::convertToSDWA(Machi
}
}
- // Initialize dst_sel if present
+ // Copy dst_sel if present, initialize otherwise if needed
if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) {
- SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+ MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
+ if (DstSel) {
+ SDWAInst.add(*DstSel);
+ } else {
+ SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+ }
}
- // Initialize dst_unused if present
+ // Copy dst_unused if present, initialize otherwise if needed
if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) {
- SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
+ MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
+ if (DstUnused) {
+ SDWAInst.add(*DstUnused);
+ } else {
+ SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
+ }
}
- // Initialize src0_sel
+ // Copy src0_sel if present, initialize otherwise
assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
- SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
-
+ MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
+ if (Src0Sel) {
+ SDWAInst.add(*Src0Sel);
+ } else {
+ SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+ }
- // Initialize src1_sel if present
+ // Copy src1_sel if present, initialize otherwise if needed
if (Src1) {
assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
- SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+ MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
+ if (Src1Sel) {
+ SDWAInst.add(*Src1Sel);
+ } else {
+ SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+ }
}
// Apply all sdwa operand patterns
@@ -800,7 +1019,7 @@ bool SIPeepholeSDWA::convertToSDWA(Machi
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const {
const MCInstrDesc &Desc = TII->get(MI.getOpcode());
unsigned ConstantBusCount = 0;
- for (MachineOperand &Op: MI.explicit_uses()) {
+ for (MachineOperand &Op : MI.explicit_uses()) {
if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
continue;
@@ -838,27 +1057,35 @@ bool SIPeepholeSDWA::runOnMachineFunctio
TII = ST.getInstrInfo();
// Find all SDWA operands in MF.
- matchSDWAOperands(MF);
+ bool Changed = false;
+ bool Ret = false;
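+ // Iterate to a fixed point: each conversion round can enable new matches,
+ // notably the v_or_b32 preserve pattern, which only fires once its inputs
+ // are defined by SDWA instructions.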
+ do {
+ matchSDWAOperands(MF);
+
+ for (const auto &OperandPair : SDWAOperands) {
+ const auto &Operand = OperandPair.second;
+ MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
+ if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
+ PotentialMatches[PotentialMI].push_back(Operand.get());
+ }
+ }
- for (const auto &OperandPair : SDWAOperands) {
- const auto &Operand = OperandPair.second;
- MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
- if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
- PotentialMatches[PotentialMI].push_back(Operand.get());
+ for (auto &PotentialPair : PotentialMatches) {
+ MachineInstr &PotentialMI = *PotentialPair.first;
+ convertToSDWA(PotentialMI, PotentialPair.second);
}
- }
- for (auto &PotentialPair : PotentialMatches) {
- MachineInstr &PotentialMI = *PotentialPair.first;
- convertToSDWA(PotentialMI, PotentialPair.second);
- }
+ PotentialMatches.clear();
+ SDWAOperands.clear();
+
+ Changed = !ConvertedInstructions.empty();
- PotentialMatches.clear();
- SDWAOperands.clear();
+ if (Changed)
+ Ret = true;
- bool Ret = !ConvertedInstructions.empty();
- while (!ConvertedInstructions.empty())
- legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
+ while (!ConvertedInstructions.empty())
+ legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
+ } while (Changed);
return Ret;
}
Modified: llvm/trunk/test/CodeGen/AMDGPU/fabs.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fabs.f16.ll?rev=319662&r1=319661&r2=319662&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fabs.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fabs.f16.ll Mon Dec 4 08:22:32 2017
@@ -127,8 +127,7 @@ define amdgpu_kernel void @fabs_free_v2f
; CI: v_mul_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
; CI: v_cvt_f16_f32
-; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
-; VI: v_mul_f16_sdwa v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI: v_mul_f16_sdwa v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_mul_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]]
Modified: llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll?rev=319662&r1=319661&r2=319662&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll Mon Dec 4 08:22:32 2017
@@ -207,7 +207,7 @@ define amdgpu_kernel void @test_fold_can
}
; GCN-LABEL: {{^}}v_test_canonicalize_var_v2f16:
-; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_e32 [[REG1:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
; VI-NOT: v_and_b32
@@ -246,7 +246,7 @@ define amdgpu_kernel void @v_test_canoni
; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_v2f16:
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
-; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_e32 [[REG1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_or_b32
@@ -266,8 +266,7 @@ define amdgpu_kernel void @v_test_canoni
; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16:
; VI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}}
-; VI: v_lshrrev_b32_e32 [[FNEGHI:v[0-9]+]], 16, [[FNEG]]
-; VI-DAG: v_max_f16_sdwa [[REG1:v[0-9]+]], [[FNEG]], [[FNEGHI]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-DAG: v_max_f16_sdwa [[REG1:v[0-9]+]], [[FNEG]], [[FNEG]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_e32 [[REG0:v[0-9]+]], [[FNEG]], [[FNEG]]
; VI-NOT: 0xffff
Modified: llvm/trunk/test/CodeGen/AMDGPU/fneg.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fneg.f16.ll?rev=319662&r1=319661&r2=319662&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fneg.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fneg.f16.ll Mon Dec 4 08:22:32 2017
@@ -116,8 +116,7 @@ define amdgpu_kernel void @fneg_free_v2f
; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI: v_cvt_f16_f32
-; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
-; VI: v_mul_f16_sdwa v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI: v_mul_f16_sdwa v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_mul_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,0] neg_hi:[1,0]{{$}}
Modified: llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir?rev=319662&r1=319661&r2=319662&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir Mon Dec 4 08:22:32 2017
@@ -148,13 +148,13 @@ body: |
# GCN-LABEL: {{^}}name: vop2_instructions
-# VI: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit %exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit %exec
# VI: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit %exec
# VI: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit %exec
# VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 6, 1, implicit %exec
# VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit %exec
-# GFX9: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit %exec
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit %exec
# GFX9: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit %exec
# GFX9: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit %exec
# GFX9: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit %exec
Added: llvm/trunk/test/CodeGen/AMDGPU/sdwa-preserve.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/sdwa-preserve.mir?rev=319662&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sdwa-preserve.mir (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/sdwa-preserve.mir Mon Dec 4 08:22:32 2017
@@ -0,0 +1,56 @@
+# RUN: llc -march=amdgcn -mcpu=fiji -start-before=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=SDWA %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -start-before=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=SDWA %s
+
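+# The shift/and/or packing sequence below should collapse into two SDWA
+# instructions, the second writing its half with dst_unused:UNUSED_PRESERVE
+# so that the final v_or_b32 is no longer needed.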
+# SDWA-LABEL: {{^}}add_f16_u32_preserve
+
+# SDWA: flat_load_dword [[FIRST:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]
+# SDWA: flat_load_dword [[SECOND:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]
+
+# SDWA: v_mul_f32_sdwa [[RES:v[0-9]+]], [[FIRST]], [[SECOND]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_3
+# SDWA: v_add_f16_sdwa [[RES:v[0-9]+]], [[FIRST]], [[SECOND]] dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 src1_sel:WORD_1
+
+# SDWA: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], [[RES]]
+
+---
+name: add_f16_u32_preserve
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vreg_64 }
+ - { id: 1, class: vreg_64 }
+ - { id: 2, class: sreg_64 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: vgpr_32 }
+ - { id: 5, class: vgpr_32 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: vgpr_32 }
+ - { id: 8, class: vgpr_32 }
+ - { id: 9, class: vgpr_32 }
+ - { id: 10, class: vgpr_32 }
+ - { id: 11, class: vgpr_32 }
+ - { id: 12, class: vgpr_32 }
+ - { id: 13, class: vgpr_32 }
+body: |
+ bb.0:
+ liveins: %vgpr0_vgpr1, %vgpr2_vgpr3, %sgpr30_sgpr31
+
+ %2 = COPY %sgpr30_sgpr31
+ %1 = COPY %vgpr2_vgpr3
+ %0 = COPY %vgpr0_vgpr1
+ %3 = FLAT_LOAD_DWORD %0, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4)
+ %4 = FLAT_LOAD_DWORD %1, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4)
+
+ %5 = V_AND_B32_e32 65535, %3, implicit %exec
+ %6 = V_LSHRREV_B32_e64 16, %4, implicit %exec
+ %7 = V_BFE_U32 %3, 8, 8, implicit %exec
+ %8 = V_LSHRREV_B32_e32 24, %4, implicit %exec
+
+ %9 = V_ADD_F16_e64 0, %5, 0, %6, 0, 0, implicit %exec
+ %10 = V_LSHLREV_B16_e64 8, %9, implicit %exec
+ %11 = V_MUL_F32_e64 0, %7, 0, %8, 0, 0, implicit %exec
+ %12 = V_LSHLREV_B32_e64 16, %11, implicit %exec
+
+ %13 = V_OR_B32_e64 %10, %12, implicit %exec
+
+ FLAT_STORE_DWORD %0, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4)
+ %sgpr30_sgpr31 = COPY %2
+ S_SETPC_B64_return %sgpr30_sgpr31