[llvm] [AMDGPU] Implement vop3p complex pattern optimization for gisel (PR #130234)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 2 17:16:58 PDT 2025
https://github.com/Shoreshen updated https://github.com/llvm/llvm-project/pull/130234
>From 556f7ff7836e4d884c64bc87bcef80d1687ccf86 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Fri, 7 Mar 2025 12:27:45 +0800
Subject: [PATCH 01/19] Implement vop3p complex pattern optimization for gisel
---
.../AMDGPU/AMDGPUInstructionSelector.cpp | 381 ++++++++++++++++--
.../Target/AMDGPU/AMDGPUInstructionSelector.h | 4 +-
.../AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll | 3 +-
.../AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll | 24 +-
.../AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll | 6 +-
.../AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll | 12 +-
.../AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll | 36 +-
.../AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll | 12 +-
.../AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll | 12 +-
llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 10 +-
llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll | 11 +-
llvm/test/lit.cfg.py | 2 +-
12 files changed, 400 insertions(+), 113 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 441fb5730a6d8..0dc47b957bdac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4282,30 +4282,346 @@ AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
}};
}
-std::pair<Register, unsigned>
-AMDGPUInstructionSelector::selectVOP3PModsImpl(
- Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
- unsigned Mods = 0;
- MachineInstr *MI = MRI.getVRegDef(Src);
+enum srcStatus {
+ IS_SAME,
+ IS_UPPER_HALF,
+ IS_LOWER_HALF,
+ IS_NEG,
+ IS_UPPER_HALF_NEG,
+ IS_LOWER_HALF_NEG,
+ LAST_STAT = IS_LOWER_HALF_NEG
+};
+
+bool isTruncHalf(MachineInstr *MI, const MachineRegisterInfo &MRI) {
+ assert(MI->getOpcode() == AMDGPU::G_TRUNC);
+ unsigned dstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
+ unsigned srcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
+ return dstSize * 2 == srcSize;
+}
+
+bool isLshrHalf(MachineInstr *MI, const MachineRegisterInfo &MRI) {
+ assert(MI->getOpcode() == AMDGPU::G_LSHR);
+ Register ShiftSrc;
+ std::optional<ValueAndVReg> ShiftAmt;
+ if (mi_match(MI->getOperand(0).getReg(), MRI,
+ m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
+ unsigned srcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
+ unsigned shift = ShiftAmt->Value.getZExtValue();
+ return shift * 2 == srcSize;
+ }
+ return false;
+}
- if (MI->getOpcode() == AMDGPU::G_FNEG &&
- // It's possible to see an f32 fneg here, but unlikely.
- // TODO: Treat f32 fneg as only high bit.
- MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
- Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
- Src = MI->getOperand(1).getReg();
- MI = MRI.getVRegDef(Src);
+bool isShlHalf(MachineInstr *MI, const MachineRegisterInfo &MRI) {
+ assert(MI->getOpcode() == AMDGPU::G_SHL);
+ Register ShiftSrc;
+ std::optional<ValueAndVReg> ShiftAmt;
+ if (mi_match(MI->getOperand(0).getReg(), MRI,
+ m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
+ unsigned srcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
+ unsigned shift = ShiftAmt->Value.getZExtValue();
+ return shift * 2 == srcSize;
+ }
+ return false;
+}
+
+bool retOpStat(MachineOperand *Op, int stat,
+ std::pair<MachineOperand *, int> &curr) {
+ if ((Op->isReg() && !(Op->getReg().isPhysical())) || Op->isImm() ||
+ Op->isCImm() || Op->isFPImm()) {
+ curr = {Op, stat};
+ return true;
+ }
+ return false;
+}
+
+bool calcNextStatus(std::pair<MachineOperand *, int> &curr,
+ const MachineRegisterInfo &MRI) {
+ if (!curr.first->isReg()) {
+ return false;
+ }
+ MachineInstr *MI = nullptr;
+
+ if (!curr.first->isDef()) {
+ // MRI.getVRegDef would loop forever if handed a def operand's register, so only call it for use operands; for defs, take the parent instruction directly.
+ MI = MRI.getVRegDef(curr.first->getReg());
+ } else {
+ MI = curr.first->getParent();
+ }
+ if (!MI) {
+ return false;
+ }
+
+ unsigned Opc = MI->getOpcode();
+
+ // Handle general Opc cases
+ switch (Opc) {
+ case AMDGPU::G_BITCAST:
+ case AMDGPU::G_CONSTANT:
+ case AMDGPU::G_FCONSTANT:
+ case AMDGPU::COPY:
+ return retOpStat(&MI->getOperand(1), curr.second, curr);
+ case AMDGPU::G_FNEG:
+ // Negation toggles by half the enum range: XXXX + 3 = XXXX_NEG, (XXXX_NEG + 3) mod 6 = XXXX
+ return retOpStat(&MI->getOperand(1),
+ (curr.second + ((LAST_STAT + 1) / 2)) % (LAST_STAT + 1),
+ curr);
+ }
+
+ // Calc next stat from current stat
+ switch (curr.second) {
+ case IS_SAME:
+ switch (Opc) {
+ case AMDGPU::G_TRUNC: {
+ if (isTruncHalf(MI, MRI)) {
+ return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, curr);
+ }
+ break;
+ }
+ }
+ break;
+ case IS_NEG:
+ switch (Opc) {
+ case AMDGPU::G_TRUNC: {
+ if (isTruncHalf(MI, MRI)) {
+ return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, curr);
+ }
+ break;
+ }
+ }
+ break;
+ case IS_UPPER_HALF:
+ switch (Opc) {
+ case AMDGPU::G_SHL: {
+ if (isShlHalf(MI, MRI)) {
+ return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, curr);
+ }
+ break;
+ }
+ }
+ break;
+ case IS_LOWER_HALF:
+ switch (Opc) {
+ case AMDGPU::G_LSHR: {
+ if (isLshrHalf(MI, MRI)) {
+ return retOpStat(&MI->getOperand(1), IS_UPPER_HALF, curr);
+ }
+ break;
+ }
+ }
+ break;
+ case IS_UPPER_HALF_NEG:
+ switch (Opc) {
+ case AMDGPU::G_SHL: {
+ if (isShlHalf(MI, MRI)) {
+ return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, curr);
+ }
+ break;
+ }
+ }
+ break;
+ case IS_LOWER_HALF_NEG:
+ switch (Opc) {
+ case AMDGPU::G_LSHR: {
+ if (isLshrHalf(MI, MRI)) {
+ return retOpStat(&MI->getOperand(1), IS_UPPER_HALF_NEG, curr);
+ }
+ break;
+ }
+ }
+ break;
+ }
+ return false;
+}
+
+std::vector<std::pair<MachineOperand *, int>>
+getSrcStats(MachineOperand *Op, const MachineRegisterInfo &MRI,
+ bool onlyLastSameOrNeg = false, int maxDepth = 6) {
+ int depth = 0;
+ std::pair<MachineOperand *, int> curr = {Op, IS_SAME};
+ std::vector<std::pair<MachineOperand *, int>> statList;
+
+ while (true) {
+ depth++;
+ if (depth > maxDepth) {
+ break;
+ }
+ bool ret = calcNextStatus(curr, MRI);
+ if (!ret || (onlyLastSameOrNeg &&
+ (curr.second != IS_SAME && curr.second != IS_NEG))) {
+ break;
+ } else if (!onlyLastSameOrNeg) {
+ statList.push_back(curr);
+ }
}
+ if (onlyLastSameOrNeg) {
+ statList.push_back(curr);
+ }
+ return statList;
+}
- // TODO: Handle G_FSUB 0 as fneg
+bool isInlinableConstant(MachineOperand *Op, const SIInstrInfo &TII) {
+ bool a = TII.isInlineConstant(*Op);
+ switch (Op->getType()) {
+ case MachineOperand::MachineOperandType::MO_Immediate:
+ return TII.isInlineConstant(*Op);
+ case MachineOperand::MachineOperandType::MO_CImmediate:
+ return TII.isInlineConstant(Op->getCImm()->getValue());
+ case MachineOperand::MachineOperandType::MO_FPImmediate:
+ return TII.isInlineConstant(Op->getFPImm()->getValueAPF());
+ }
+ return false;
+}
- // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
- (void)IsDOT; // DOTs do not use OPSEL on gfx942+, check ST.hasDOTOpSelHazard()
+bool isSameBitWidth(MachineOperand *Op1, MachineOperand *Op2,
+ const MachineRegisterInfo &MRI) {
+ unsigned width1 = MRI.getType(Op1->getReg()).getSizeInBits();
+ unsigned width2 = MRI.getType(Op2->getReg()).getSizeInBits();
+ return width1 == width2;
+}
+bool isSameOperand(MachineOperand *Op1, MachineOperand *Op2) {
+ if (Op1->isReg()) {
+ if (Op2->isReg()) {
+ return Op1->getReg() == Op2->getReg();
+ }
+ return false;
+ }
+ return Op1->isIdenticalTo(*Op2);
+}
+
+bool validToPack(int HiStat, int LoStat, unsigned int &Mods,
+ MachineOperand *newOp, MachineOperand *RootOp,
+ const SIInstrInfo &TII, const MachineRegisterInfo &MRI) {
+ if (newOp->isReg()) {
+ if (isSameBitWidth(newOp, RootOp, MRI)) {
+ // IS_LOWER_HALF remain 0
+ if (HiStat == IS_UPPER_HALF_NEG) {
+ Mods ^= SISrcMods::NEG_HI;
+ Mods |= SISrcMods::OP_SEL_1;
+ } else if (HiStat == IS_UPPER_HALF) {
+ Mods |= SISrcMods::OP_SEL_1;
+ } else if (HiStat == IS_LOWER_HALF_NEG) {
+ Mods ^= SISrcMods::NEG_HI;
+ }
+ if (LoStat == IS_UPPER_HALF_NEG) {
+ Mods ^= SISrcMods::NEG;
+ Mods |= SISrcMods::OP_SEL_0;
+ } else if (LoStat == IS_UPPER_HALF) {
+ Mods |= SISrcMods::OP_SEL_0;
+ } else if (LoStat == IS_UPPER_HALF_NEG) {
+ Mods |= SISrcMods::NEG;
+ }
+ return true;
+ }
+ } else {
+ if ((HiStat == IS_SAME || HiStat == IS_NEG) &&
+ (LoStat == IS_SAME || LoStat == IS_NEG) &&
+ isInlinableConstant(newOp, TII)) {
+ if (HiStat == IS_NEG) {
+ Mods ^= SISrcMods::NEG_HI;
+ }
+ if (LoStat == IS_NEG) {
+ Mods ^= SISrcMods::NEG;
+ }
+ // opsel = opsel_hi = 0, since the upper half and lower half are both
+ // the same as the target inlinable constant
+ return true;
+ }
+ }
+ return false;
+}
+
+std::pair<MachineOperand *, unsigned>
+AMDGPUInstructionSelector::selectVOP3PModsImpl(MachineOperand *Op,
+ const MachineRegisterInfo &MRI,
+ bool IsDOT) const {
+ unsigned Mods = 0;
+ MachineOperand *RootOp = Op;
+ std::pair<MachineOperand *, int> stat = getSrcStats(Op, MRI, true)[0];
+ if (!stat.first->isReg()) {
+ Mods |= SISrcMods::OP_SEL_1;
+ return {Op, Mods};
+ }
+ if (stat.second == IS_NEG) {
+ Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
+ }
+ Op = stat.first;
+ MachineInstr *MI = MRI.getVRegDef(Op->getReg());
+ if (MI->getOpcode() == AMDGPU::G_BUILD_VECTOR && MI->getNumOperands() == 3 &&
+ (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
+ std::vector<std::pair<MachineOperand *, int>> statList_Hi;
+ std::vector<std::pair<MachineOperand *, int>> statList_Lo;
+ statList_Hi = getSrcStats(&MI->getOperand(2), MRI);
+ if (statList_Hi.size() != 0) {
+ statList_Lo = getSrcStats(&MI->getOperand(1), MRI);
+ if (statList_Lo.size() != 0) {
+ for (int i = statList_Hi.size() - 1; i >= 0; i--) {
+ for (int j = statList_Lo.size() - 1; j >= 0; j--) {
+ if (isSameOperand(statList_Hi[i].first, statList_Lo[j].first)) {
+ if (validToPack(statList_Hi[i].second, statList_Lo[j].second,
+ Mods, statList_Hi[i].first, RootOp, TII, MRI)) {
+ return {statList_Hi[i].first, Mods};
+ }
+ }
+ }
+ }
+ }
+ }
+ }
// Packed instructions do not have abs modifiers.
Mods |= SISrcMods::OP_SEL_1;
- return std::pair(Src, Mods);
+ return {Op, Mods};
+}
+
+int64_t getAllKindImm(MachineOperand *Op) {
+ switch (Op->getType()) {
+ case MachineOperand::MachineOperandType::MO_Immediate:
+ return Op->getImm();
+ case MachineOperand::MachineOperandType::MO_CImmediate:
+ return Op->getCImm()->getSExtValue();
+ break;
+ case MachineOperand::MachineOperandType::MO_FPImmediate:
+ return Op->getFPImm()->getValueAPF().bitcastToAPInt().getSExtValue();
+ break;
+ }
+ llvm_unreachable("not an imm type");
+}
+
+bool checkRB(MachineOperand *Op, int RBNo, const AMDGPURegisterBankInfo &RBI,
+ const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) {
+ const RegisterBank *RB = RBI.getRegBank(Op->getReg(), MRI, TRI);
+ return RB->getID() == RBNo;
+}
+
+MachineOperand *getVReg(MachineOperand *newOp, MachineOperand *RootOp,
+ const AMDGPURegisterBankInfo &RBI,
+ MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) {
+ // RootOp can only be VGPR or SGPR (some hand written cases such as
+ // inst-select-ashr.v2s16.mir::ashr_v2s16_vs)
+ if (checkRB(RootOp, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
+ checkRB(newOp, AMDGPU::VGPRRegBankID, RBI, MRI, TRI)) {
+ return newOp;
+ }
+ MachineInstr *MI = MRI.getVRegDef(RootOp->getReg());
+ if (MI->getOpcode() == AMDGPU::COPY &&
+ isSameOperand(newOp, &MI->getOperand(1))) {
+ // RootOp is VGPR, newOp is not VGPR, but RootOp = COPY newOp
+ return RootOp;
+ }
+
+ const TargetRegisterClass *DstRC =
+ TRI.getConstrainedRegClassForOperand(*RootOp, MRI);
+ Register dstReg = MRI.createVirtualRegister(DstRC);
+
+ MachineIRBuilder B(*RootOp->getParent());
+ MachineInstrBuilder MIB =
+ B.buildInstr(AMDGPU::COPY).addDef(dstReg).addUse(newOp->getReg());
+
+ // only accept VGPR
+ return &MIB->getOperand(0);
}
InstructionSelector::ComplexRendererFns
@@ -4313,13 +4629,17 @@ AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
MachineRegisterInfo &MRI
= Root.getParent()->getParent()->getParent()->getRegInfo();
- Register Src;
- unsigned Mods;
- std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
-
+ std::pair<MachineOperand *, unsigned> res = selectVOP3PModsImpl(&Root, MRI);
+ if (!(res.first->isReg())) {
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(res.first)); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods
+ }};
+ }
+ res.first = getVReg(res.first, &Root, RBI, MRI, TRI);
return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(res.first->getReg()); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods
}};
}
@@ -4328,13 +4648,18 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
MachineRegisterInfo &MRI
= Root.getParent()->getParent()->getParent()->getRegInfo();
- Register Src;
- unsigned Mods;
- std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
-
+ std::pair<MachineOperand *, unsigned> res =
+ selectVOP3PModsImpl(&Root, MRI, true);
+ if (!(res.first->isReg())) {
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(res.first)); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods
+ }};
+ }
+ res.first = getVReg(res.first, &Root, RBI, MRI, TRI);
return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(res.first->getReg()); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods
}};
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index cc7552868a056..2af4f55403acc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -187,8 +187,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
ComplexRendererFns selectVOP3NoMods(MachineOperand &Root) const;
- std::pair<Register, unsigned>
- selectVOP3PModsImpl(Register Src, const MachineRegisterInfo &MRI,
+ std::pair<MachineOperand *, unsigned>
+ selectVOP3PModsImpl(MachineOperand *Op, const MachineRegisterInfo &MRI,
bool IsDOT = false) const;
InstructionSelector::ComplexRendererFns
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
index 1d9514c58ab9c..2243c57cf37ac 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
@@ -68,8 +68,7 @@ define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) {
; GFX906-LABEL: v_fdot2_neg_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
%neg.c = fneg float %c
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
index e2dab03e410aa..7d6cfac52714e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
@@ -248,8 +248,7 @@ define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
; GFX906-LABEL: v_sdot2_fnegf32_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_fnegf32_c:
@@ -263,8 +262,7 @@ define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
; GFX10-LABEL: v_sdot2_fnegf32_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.c = fneg float %c
%cast.neg.c = bitcast float %neg.c to i32
@@ -276,8 +274,7 @@ define i32 @v_sdot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
; GFX906-LABEL: v_sdot2_fnegv2f16_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_fnegv2f16_c:
@@ -291,8 +288,7 @@ define i32 @v_sdot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
; GFX10-LABEL: v_sdot2_fnegv2f16_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.c = fneg <2 x half> %c
%cast.neg.c = bitcast <2 x half> %neg.c to i32
@@ -304,8 +300,7 @@ define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_shuffle10_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16
-; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_shuffle10_a:
@@ -319,8 +314,7 @@ define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX10-LABEL: v_sdot2_shuffle10_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
-; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
@@ -331,8 +325,7 @@ define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_shuffle10_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16
-; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_shuffle10_b:
@@ -346,8 +339,7 @@ define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX10-LABEL: v_sdot2_shuffle10_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16
-; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
index 06560afee3c9a..d6ef48e25cafb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
@@ -91,8 +91,7 @@ define i32 @v_sdot4_fnegf32_a(float %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_sdot4_fnegf32_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2
+; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot4_fnegf32_a:
@@ -112,8 +111,7 @@ define i32 @v_sdot4_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_sdot4_fnegv2f16_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2
+; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot4_fnegv2f16_a:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
index 0d729351f65a7..d2aa47df81cbe 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
@@ -47,15 +47,13 @@ define i32 @v_sdot8_fnegf32_a(float %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_sdot8_fnegf32_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
+; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot8_fnegf32_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
+; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg float %a
%cast.neg.a = bitcast float %neg.a to i32
@@ -67,15 +65,13 @@ define i32 @v_sdot8_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_sdot8_fnegv2f16_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
+; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot8_fnegv2f16_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
+; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg <2 x half> %a
%cast.neg.a = bitcast <2 x half> %neg.a to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
index 3acff52874dd9..347644826fd0c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
@@ -235,22 +235,19 @@ define i32 @v_udot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
; GFX906-LABEL: v_udot2_fnegf32_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_fnegf32_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_fnegf32_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.c = fneg float %c
%cast.neg.c = bitcast float %neg.c to i32
@@ -262,22 +259,19 @@ define i32 @v_udot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
; GFX906-LABEL: v_udot2_fnegv2f16_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_fnegv2f16_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_fnegv2f16_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.c = fneg <2 x half> %c
%cast.neg.c = bitcast <2 x half> %neg.c to i32
@@ -289,22 +283,19 @@ define i32 @v_udot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_udot2_shuffle10_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16
-; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_shuffle10_a:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_alignbit_b32 v0, v0, v0, 16
-; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_shuffle10_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
-; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
@@ -315,22 +306,19 @@ define i32 @v_udot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_udot2_shuffle10_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16
-; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_shuffle10_b:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_alignbit_b32 v1, v1, v1, 16
-; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_shuffle10_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16
-; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
index b14af9e043e09..7ad0404942feb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
@@ -112,15 +112,13 @@ define i32 @v_udot4_fnegf32_a(float %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot4_fnegf32_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
+; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_udot4_fnegf32_a:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX10PLUS-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
+; GFX10PLUS-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg float %a
%cast.neg.a = bitcast float %neg.a to i32
@@ -132,15 +130,13 @@ define i32 @v_udot4_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot4_fnegv2f16_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
+; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_udot4_fnegv2f16_a:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX10PLUS-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
+; GFX10PLUS-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg <2 x half> %a
%cast.neg.a = bitcast <2 x half> %neg.a to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
index a664c8aa508ef..52763bbc24e40 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
@@ -48,15 +48,13 @@ define i32 @v_udot8_fnegf32_a(float %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot8_fnegf32_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
+; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_udot8_fnegf32_a:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX10PLUS-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
+; GFX10PLUS-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg float %a
%cast.neg.a = bitcast float %neg.a to i32
@@ -68,15 +66,13 @@ define i32 @v_udot8_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot8_fnegv2f16_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
+; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_udot8_fnegv2f16_a:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX10PLUS-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
+; GFX10PLUS-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg <2 x half> %a
%cast.neg.a = bitcast <2 x half> %neg.a to i32
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 9b03a72fd826d..0577ba9b233be 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -87,7 +87,7 @@ define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) {
; GCN-LABEL: {{^}}fadd_v2_v_lit_splat:
; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0 op_sel_hi:[1,0]{{$}}
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0{{$}}
+; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0 op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -308,7 +308,7 @@ define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) {
; GCN-LABEL: {{^}}fmul_v2_v_lit_splat:
; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
; PACKED-SDAG: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0 op_sel_hi:[1,0]{{$}}
-; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0{{$}}
+; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0 op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -432,7 +432,7 @@ define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) {
; GCN-LABEL: {{^}}fma_v2_v_lit_splat:
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0
; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0 op_sel_hi:[1,0,0]{{$}}
-; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0{{$}}
+; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0 op_sel_hi:[1,0,0]{{$}}
define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -556,8 +556,8 @@ bb:
; PACKED-SDAG: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0
; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0{{$}}
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0{{$}}
+; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 op_sel_hi:[1,0]{{$}}
+; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
bb:
%i12 = fadd <2 x float> zeroinitializer, %arg
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
index 3420596da2aac..c6349bcbcdbf1 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
@@ -477,9 +477,8 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half>
;
; GFX9-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-GISEL-NEXT: v_pk_add_f16 v0, s2, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-GISEL-NEXT: v_pk_add_f16 v0, s2, v0 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-GISEL-NEXT: ; return to shader part epilog
;
; GFX8-SDAG-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
@@ -519,8 +518,7 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half>
;
; GFX10-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000
-; GFX10-GISEL-NEXT: v_pk_add_f16 v0, s2, s0
+; GFX10-GISEL-NEXT: v_pk_add_f16 v0, s2, s3 neg_lo:[0,1] neg_hi:[0,1]
; GFX10-GISEL-NEXT: ; return to shader part epilog
;
; GFX10PLUS-SDAG-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
@@ -535,8 +533,7 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half>
;
; GFX10PLUS-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX10PLUS-GISEL: ; %bb.0:
-; GFX10PLUS-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000
-; GFX10PLUS-GISEL-NEXT: v_pk_add_f16 v0, s2, s0
+; GFX10PLUS-GISEL-NEXT: v_pk_add_f16 v0, s2, s3 neg_lo:[0,1] neg_hi:[0,1]
; GFX10PLUS-GISEL-NEXT: ; return to shader part epilog
%val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret <2 x half> %val
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index aad7a088551b2..50921879cd1f2 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -466,7 +466,7 @@ def have_cxx_shared_library():
print("could not exec llvm-readobj")
return False
- readobj_out = readobj_cmd.stdout.read().decode("ascii")
+ readobj_out = readobj_cmd.stdout.read().decode("utf-8")
readobj_cmd.wait()
regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)")
>From 58464a3a24d90211f784f54f816095378a7569dc Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Fri, 7 Mar 2025 12:37:51 +0800
Subject: [PATCH 02/19] fix lit file
---
llvm/test/lit.cfg.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 50921879cd1f2..aad7a088551b2 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -466,7 +466,7 @@ def have_cxx_shared_library():
print("could not exec llvm-readobj")
return False
- readobj_out = readobj_cmd.stdout.read().decode("utf-8")
+ readobj_out = readobj_cmd.stdout.read().decode("ascii")
readobj_cmd.wait()
regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)")
>From daae1aeaefe49f5cbb14facf8c4535e431ab741a Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Mon, 10 Mar 2025 14:16:30 +0800
Subject: [PATCH 03/19] fix comments
---
.../AMDGPU/AMDGPUInstructionSelector.cpp | 98 +++++++++----------
.../Target/AMDGPU/AMDGPUInstructionSelector.h | 4 +-
2 files changed, 50 insertions(+), 52 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 0dc47b957bdac..00d538f55a3cf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4292,14 +4292,15 @@ enum srcStatus {
LAST_STAT = IS_LOWER_HALF_NEG
};
-bool isTruncHalf(MachineInstr *MI, const MachineRegisterInfo &MRI) {
+static bool isTruncHalf(const MachineInstr *MI,
+ const MachineRegisterInfo &MRI) {
assert(MI->getOpcode() == AMDGPU::G_TRUNC);
unsigned dstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
unsigned srcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
return dstSize * 2 == srcSize;
}
-bool isLshrHalf(MachineInstr *MI, const MachineRegisterInfo &MRI) {
+static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
assert(MI->getOpcode() == AMDGPU::G_LSHR);
Register ShiftSrc;
std::optional<ValueAndVReg> ShiftAmt;
@@ -4312,7 +4313,7 @@ bool isLshrHalf(MachineInstr *MI, const MachineRegisterInfo &MRI) {
return false;
}
-bool isShlHalf(MachineInstr *MI, const MachineRegisterInfo &MRI) {
+static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
assert(MI->getOpcode() == AMDGPU::G_SHL);
Register ShiftSrc;
std::optional<ValueAndVReg> ShiftAmt;
@@ -4325,8 +4326,8 @@ bool isShlHalf(MachineInstr *MI, const MachineRegisterInfo &MRI) {
return false;
}
-bool retOpStat(MachineOperand *Op, int stat,
- std::pair<MachineOperand *, int> &curr) {
+static bool retOpStat(const MachineOperand *Op, int stat,
+ std::pair<const MachineOperand *, int> &curr) {
if ((Op->isReg() && !(Op->getReg().isPhysical())) || Op->isImm() ||
Op->isCImm() || Op->isFPImm()) {
curr = {Op, stat};
@@ -4335,15 +4336,14 @@ bool retOpStat(MachineOperand *Op, int stat,
return false;
}
-bool calcNextStatus(std::pair<MachineOperand *, int> &curr,
- const MachineRegisterInfo &MRI) {
+static bool calcNextStatus(std::pair<const MachineOperand *, int> &curr,
+ const MachineRegisterInfo &MRI) {
if (!curr.first->isReg()) {
return false;
}
- MachineInstr *MI = nullptr;
+ const MachineInstr *MI = nullptr;
if (!curr.first->isDef()) {
- // MRI.getVRegDef falls into infinite loop if use define reg
MI = MRI.getVRegDef(curr.first->getReg());
} else {
MI = curr.first->getParent();
@@ -4434,12 +4434,12 @@ bool calcNextStatus(std::pair<MachineOperand *, int> &curr,
return false;
}
-std::vector<std::pair<MachineOperand *, int>>
-getSrcStats(MachineOperand *Op, const MachineRegisterInfo &MRI,
+SmallVector<std::pair<const MachineOperand *, int>>
+getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI,
bool onlyLastSameOrNeg = false, int maxDepth = 6) {
int depth = 0;
- std::pair<MachineOperand *, int> curr = {Op, IS_SAME};
- std::vector<std::pair<MachineOperand *, int>> statList;
+ std::pair<const MachineOperand *, int> curr = {Op, IS_SAME};
+ SmallVector<std::pair<const MachineOperand *, int>> statList;
while (true) {
depth++;
@@ -4460,27 +4460,23 @@ getSrcStats(MachineOperand *Op, const MachineRegisterInfo &MRI,
return statList;
}
-bool isInlinableConstant(MachineOperand *Op, const SIInstrInfo &TII) {
- bool a = TII.isInlineConstant(*Op);
- switch (Op->getType()) {
- case MachineOperand::MachineOperandType::MO_Immediate:
- return TII.isInlineConstant(*Op);
- case MachineOperand::MachineOperandType::MO_CImmediate:
- return TII.isInlineConstant(Op->getCImm()->getValue());
- case MachineOperand::MachineOperandType::MO_FPImmediate:
- return TII.isInlineConstant(Op->getFPImm()->getValueAPF());
+static bool isInlinableConstant(const MachineOperand &Op,
+ const SIInstrInfo &TII) {
+ if (Op.isFPImm()) {
+ return TII.isInlineConstant(Op.getFPImm()->getValueAPF());
}
return false;
}
-bool isSameBitWidth(MachineOperand *Op1, MachineOperand *Op2,
- const MachineRegisterInfo &MRI) {
+static bool isSameBitWidth(const MachineOperand *Op1, const MachineOperand *Op2,
+ const MachineRegisterInfo &MRI) {
unsigned width1 = MRI.getType(Op1->getReg()).getSizeInBits();
unsigned width2 = MRI.getType(Op2->getReg()).getSizeInBits();
return width1 == width2;
}
-bool isSameOperand(MachineOperand *Op1, MachineOperand *Op2) {
+static bool isSameOperand(const MachineOperand *Op1,
+ const MachineOperand *Op2) {
if (Op1->isReg()) {
if (Op2->isReg()) {
return Op1->getReg() == Op2->getReg();
@@ -4490,9 +4486,10 @@ bool isSameOperand(MachineOperand *Op1, MachineOperand *Op2) {
return Op1->isIdenticalTo(*Op2);
}
-bool validToPack(int HiStat, int LoStat, unsigned int &Mods,
- MachineOperand *newOp, MachineOperand *RootOp,
- const SIInstrInfo &TII, const MachineRegisterInfo &MRI) {
+static bool validToPack(int HiStat, int LoStat, unsigned int &Mods,
+ const MachineOperand *newOp,
+ const MachineOperand *RootOp, const SIInstrInfo &TII,
+ const MachineRegisterInfo &MRI) {
if (newOp->isReg()) {
if (isSameBitWidth(newOp, RootOp, MRI)) {
// IS_LOWER_HALF remain 0
@@ -4517,7 +4514,7 @@ bool validToPack(int HiStat, int LoStat, unsigned int &Mods,
} else {
if ((HiStat == IS_SAME || HiStat == IS_NEG) &&
(LoStat == IS_SAME || LoStat == IS_NEG) &&
- isInlinableConstant(newOp, TII)) {
+ isInlinableConstant(*newOp, TII)) {
if (HiStat == IS_NEG) {
Mods ^= SISrcMods::NEG_HI;
}
@@ -4532,13 +4529,13 @@ bool validToPack(int HiStat, int LoStat, unsigned int &Mods,
return false;
}
-std::pair<MachineOperand *, unsigned>
-AMDGPUInstructionSelector::selectVOP3PModsImpl(MachineOperand *Op,
+std::pair<const MachineOperand *, unsigned>
+AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op,
const MachineRegisterInfo &MRI,
bool IsDOT) const {
unsigned Mods = 0;
- MachineOperand *RootOp = Op;
- std::pair<MachineOperand *, int> stat = getSrcStats(Op, MRI, true)[0];
+ const MachineOperand *RootOp = Op;
+ std::pair<const MachineOperand *, int> stat = getSrcStats(Op, MRI, true)[0];
if (!stat.first->isReg()) {
Mods |= SISrcMods::OP_SEL_1;
return {Op, Mods};
@@ -4550,8 +4547,8 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(MachineOperand *Op,
MachineInstr *MI = MRI.getVRegDef(Op->getReg());
if (MI->getOpcode() == AMDGPU::G_BUILD_VECTOR && MI->getNumOperands() == 3 &&
(!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
- std::vector<std::pair<MachineOperand *, int>> statList_Hi;
- std::vector<std::pair<MachineOperand *, int>> statList_Lo;
+ SmallVector<std::pair<const MachineOperand *, int>> statList_Hi;
+ SmallVector<std::pair<const MachineOperand *, int>> statList_Lo;
statList_Hi = getSrcStats(&MI->getOperand(2), MRI);
if (statList_Hi.size() != 0) {
statList_Lo = getSrcStats(&MI->getOperand(1), MRI);
@@ -4575,30 +4572,29 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(MachineOperand *Op,
return {Op, Mods};
}
-int64_t getAllKindImm(MachineOperand *Op) {
+int64_t getAllKindImm(const MachineOperand *Op) {
switch (Op->getType()) {
case MachineOperand::MachineOperandType::MO_Immediate:
return Op->getImm();
case MachineOperand::MachineOperandType::MO_CImmediate:
return Op->getCImm()->getSExtValue();
- break;
case MachineOperand::MachineOperandType::MO_FPImmediate:
return Op->getFPImm()->getValueAPF().bitcastToAPInt().getSExtValue();
- break;
}
llvm_unreachable("not an imm type");
}
-bool checkRB(MachineOperand *Op, int RBNo, const AMDGPURegisterBankInfo &RBI,
- const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) {
+bool checkRB(const MachineOperand *Op, int RBNo,
+ const AMDGPURegisterBankInfo &RBI, const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) {
const RegisterBank *RB = RBI.getRegBank(Op->getReg(), MRI, TRI);
return RB->getID() == RBNo;
}
-MachineOperand *getVReg(MachineOperand *newOp, MachineOperand *RootOp,
- const AMDGPURegisterBankInfo &RBI,
- MachineRegisterInfo &MRI,
- const TargetRegisterInfo &TRI) {
+const MachineOperand *
+getVReg(const MachineOperand *newOp, const MachineOperand *RootOp,
+ const AMDGPURegisterBankInfo &RBI, MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI, const SIInstrInfo &TII) {
// RootOp can only be VGPR or SGPR (some hand written cases such as
// inst-select-ashr.v2s16.mir::ashr_v2s16_vs)
if (checkRB(RootOp, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
@@ -4612,13 +4608,14 @@ MachineOperand *getVReg(MachineOperand *newOp, MachineOperand *RootOp,
return RootOp;
}
+ MachineBasicBlock *BB = MI->getParent();
const TargetRegisterClass *DstRC =
TRI.getConstrainedRegClassForOperand(*RootOp, MRI);
Register dstReg = MRI.createVirtualRegister(DstRC);
- MachineIRBuilder B(*RootOp->getParent());
MachineInstrBuilder MIB =
- B.buildInstr(AMDGPU::COPY).addDef(dstReg).addUse(newOp->getReg());
+ BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), dstReg)
+ .addReg(newOp->getReg());
// only accept VGPR
return &MIB->getOperand(0);
@@ -4629,14 +4626,15 @@ AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
MachineRegisterInfo &MRI
= Root.getParent()->getParent()->getParent()->getRegInfo();
- std::pair<MachineOperand *, unsigned> res = selectVOP3PModsImpl(&Root, MRI);
+ std::pair<const MachineOperand *, unsigned> res =
+ selectVOP3PModsImpl(&Root, MRI);
if (!(res.first->isReg())) {
return {{
[=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(res.first)); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods
}};
}
- res.first = getVReg(res.first, &Root, RBI, MRI, TRI);
+ res.first = getVReg(res.first, &Root, RBI, MRI, TRI, TII);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(res.first->getReg()); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods
@@ -4648,7 +4646,7 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
MachineRegisterInfo &MRI
= Root.getParent()->getParent()->getParent()->getRegInfo();
- std::pair<MachineOperand *, unsigned> res =
+ std::pair<const MachineOperand *, unsigned> res =
selectVOP3PModsImpl(&Root, MRI, true);
if (!(res.first->isReg())) {
return {{
@@ -4656,7 +4654,7 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
[=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods
}};
}
- res.first = getVReg(res.first, &Root, RBI, MRI, TRI);
+ res.first = getVReg(res.first, &Root, RBI, MRI, TRI, TII);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(res.first->getReg()); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 2af4f55403acc..dd172edfdf216 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -187,8 +187,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
ComplexRendererFns selectVOP3NoMods(MachineOperand &Root) const;
- std::pair<MachineOperand *, unsigned>
- selectVOP3PModsImpl(MachineOperand *Op, const MachineRegisterInfo &MRI,
+ std::pair<const MachineOperand *, unsigned>
+ selectVOP3PModsImpl(const MachineOperand *Op, const MachineRegisterInfo &MRI,
bool IsDOT = false) const;
InstructionSelector::ComplexRendererFns
>From 2e587f5fbcc23f6574c4f6f7b86974f0c6352ca4 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Wed, 12 Mar 2025 11:18:17 +0800
Subject: [PATCH 04/19] fix comments
---
.../AMDGPU/AMDGPUInstructionSelector.cpp | 122 ++++++++----------
llvm/test/lit.cfg.py | 2 +-
2 files changed, 58 insertions(+), 66 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 00d538f55a3cf..622b1bd3f5bf5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4289,19 +4289,23 @@ enum srcStatus {
IS_NEG,
IS_UPPER_HALF_NEG,
IS_LOWER_HALF_NEG,
- LAST_STAT = IS_LOWER_HALF_NEG
+ INVALID
};
static bool isTruncHalf(const MachineInstr *MI,
const MachineRegisterInfo &MRI) {
- assert(MI->getOpcode() == AMDGPU::G_TRUNC);
+ if (MI->getOpcode() != AMDGPU::G_TRUNC) {
+ return false;
+ }
unsigned dstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
unsigned srcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
return dstSize * 2 == srcSize;
}
static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
- assert(MI->getOpcode() == AMDGPU::G_LSHR);
+ if (MI->getOpcode() != AMDGPU::G_LSHR) {
+ return false;
+ }
Register ShiftSrc;
std::optional<ValueAndVReg> ShiftAmt;
if (mi_match(MI->getOperand(0).getReg(), MRI,
@@ -4314,7 +4318,9 @@ static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
}
static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
- assert(MI->getOpcode() == AMDGPU::G_SHL);
+ if (MI->getOpcode() != AMDGPU::G_SHL) {
+ return false;
+ }
Register ShiftSrc;
std::optional<ValueAndVReg> ShiftAmt;
if (mi_match(MI->getOperand(0).getReg(), MRI,
@@ -4326,8 +4332,11 @@ static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
return false;
}
-static bool retOpStat(const MachineOperand *Op, int stat,
- std::pair<const MachineOperand *, int> &curr) {
+static bool retOpStat(const MachineOperand *Op, srcStatus stat,
+ std::pair<const MachineOperand *, srcStatus> &curr) {
+ if (stat == INVALID) {
+ return false;
+ }
if ((Op->isReg() && !(Op->getReg().isPhysical())) || Op->isImm() ||
Op->isCImm() || Op->isFPImm()) {
curr = {Op, stat};
@@ -4336,7 +4345,25 @@ static bool retOpStat(const MachineOperand *Op, int stat,
return false;
}
-static bool calcNextStatus(std::pair<const MachineOperand *, int> &curr,
+srcStatus getNegStatus(srcStatus S) {
+ switch (S) {
+ case IS_SAME:
+ return IS_NEG;
+ case IS_UPPER_HALF:
+ return IS_UPPER_HALF_NEG;
+ case IS_LOWER_HALF:
+ return IS_LOWER_HALF_NEG;
+ case IS_NEG:
+ return IS_SAME;
+ case IS_UPPER_HALF_NEG:
+ return IS_UPPER_HALF;
+ case IS_LOWER_HALF_NEG:
+ return IS_LOWER_HALF;
+ }
+ return INVALID;
+}
+
+static bool calcNextStatus(std::pair<const MachineOperand *, srcStatus> &curr,
const MachineRegisterInfo &MRI) {
if (!curr.first->isReg()) {
return false;
@@ -4363,92 +4390,56 @@ static bool calcNextStatus(std::pair<const MachineOperand *, int> &curr,
return retOpStat(&MI->getOperand(1), curr.second, curr);
case AMDGPU::G_FNEG:
// XXXX + 3 = XXXX_NEG, (XXXX_NEG + 3) mod 3 = XXXX
- return retOpStat(&MI->getOperand(1),
- (curr.second + ((LAST_STAT + 1) / 2)) % (LAST_STAT + 1),
- curr);
+ return retOpStat(&MI->getOperand(1), getNegStatus(curr.second), curr);
}
// Calc next stat from current stat
switch (curr.second) {
case IS_SAME:
- switch (Opc) {
- case AMDGPU::G_TRUNC: {
- if (isTruncHalf(MI, MRI)) {
- return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, curr);
- }
- break;
- }
+ if (isTruncHalf(MI, MRI)) {
+ return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, curr);
}
break;
case IS_NEG:
- switch (Opc) {
- case AMDGPU::G_TRUNC: {
- if (isTruncHalf(MI, MRI)) {
- return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, curr);
- }
- break;
- }
+ if (isTruncHalf(MI, MRI)) {
+ return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, curr);
}
break;
case IS_UPPER_HALF:
- switch (Opc) {
- case AMDGPU::G_SHL: {
- if (isShlHalf(MI, MRI)) {
- return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, curr);
- }
- break;
- }
+ if (isShlHalf(MI, MRI)) {
+ return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, curr);
}
break;
case IS_LOWER_HALF:
- switch (Opc) {
- case AMDGPU::G_LSHR: {
- if (isLshrHalf(MI, MRI)) {
- return retOpStat(&MI->getOperand(1), IS_UPPER_HALF, curr);
- }
- break;
- }
+ if (isLshrHalf(MI, MRI)) {
+ return retOpStat(&MI->getOperand(1), IS_UPPER_HALF, curr);
}
break;
case IS_UPPER_HALF_NEG:
- switch (Opc) {
- case AMDGPU::G_SHL: {
- if (isShlHalf(MI, MRI)) {
- return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, curr);
- }
- break;
- }
+ if (isShlHalf(MI, MRI)) {
+ return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, curr);
}
break;
case IS_LOWER_HALF_NEG:
- switch (Opc) {
- case AMDGPU::G_LSHR: {
- if (isLshrHalf(MI, MRI)) {
- return retOpStat(&MI->getOperand(1), IS_UPPER_HALF_NEG, curr);
- }
- break;
- }
+ if (isLshrHalf(MI, MRI)) {
+ return retOpStat(&MI->getOperand(1), IS_UPPER_HALF_NEG, curr);
}
break;
}
return false;
}
-SmallVector<std::pair<const MachineOperand *, int>>
+SmallVector<std::pair<const MachineOperand *, srcStatus>>
getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI,
bool onlyLastSameOrNeg = false, int maxDepth = 6) {
int depth = 0;
- std::pair<const MachineOperand *, int> curr = {Op, IS_SAME};
- SmallVector<std::pair<const MachineOperand *, int>> statList;
+ std::pair<const MachineOperand *, srcStatus> curr = {Op, IS_SAME};
+ SmallVector<std::pair<const MachineOperand *, srcStatus>> statList;
- while (true) {
+ while (depth <= maxDepth && calcNextStatus(curr, MRI)) {
depth++;
- if (depth > maxDepth) {
- break;
- }
- bool ret = calcNextStatus(curr, MRI);
- if (!ret || (onlyLastSameOrNeg &&
- (curr.second != IS_SAME && curr.second != IS_NEG))) {
+ if ((onlyLastSameOrNeg &&
+ (curr.second != IS_SAME && curr.second != IS_NEG))) {
break;
} else if (!onlyLastSameOrNeg) {
statList.push_back(curr);
@@ -4535,7 +4526,8 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op,
bool IsDOT) const {
unsigned Mods = 0;
const MachineOperand *RootOp = Op;
- std::pair<const MachineOperand *, int> stat = getSrcStats(Op, MRI, true)[0];
+ std::pair<const MachineOperand *, srcStatus> stat =
+ getSrcStats(Op, MRI, true)[0];
if (!stat.first->isReg()) {
Mods |= SISrcMods::OP_SEL_1;
return {Op, Mods};
@@ -4547,8 +4539,8 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op,
MachineInstr *MI = MRI.getVRegDef(Op->getReg());
if (MI->getOpcode() == AMDGPU::G_BUILD_VECTOR && MI->getNumOperands() == 3 &&
(!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
- SmallVector<std::pair<const MachineOperand *, int>> statList_Hi;
- SmallVector<std::pair<const MachineOperand *, int>> statList_Lo;
+ SmallVector<std::pair<const MachineOperand *, srcStatus>> statList_Hi;
+ SmallVector<std::pair<const MachineOperand *, srcStatus>> statList_Lo;
statList_Hi = getSrcStats(&MI->getOperand(2), MRI);
if (statList_Hi.size() != 0) {
statList_Lo = getSrcStats(&MI->getOperand(1), MRI);
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index aad7a088551b2..50921879cd1f2 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -466,7 +466,7 @@ def have_cxx_shared_library():
print("could not exec llvm-readobj")
return False
- readobj_out = readobj_cmd.stdout.read().decode("ascii")
+ readobj_out = readobj_cmd.stdout.read().decode("utf-8")
readobj_cmd.wait()
regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)")
>From c6c4b3e92063adae31e67fbad6d64f6f77f71324 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Wed, 12 Mar 2025 16:39:19 +0800
Subject: [PATCH 05/19] fix comments
---
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 10 +++-------
llvm/test/lit.cfg.py | 2 +-
2 files changed, 4 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 622b1bd3f5bf5..59ccb1b7ed236 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4288,8 +4288,7 @@ enum srcStatus {
IS_LOWER_HALF,
IS_NEG,
IS_UPPER_HALF_NEG,
- IS_LOWER_HALF_NEG,
- INVALID
+ IS_LOWER_HALF_NEG
};
static bool isTruncHalf(const MachineInstr *MI,
@@ -4334,9 +4333,6 @@ static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
static bool retOpStat(const MachineOperand *Op, srcStatus stat,
std::pair<const MachineOperand *, srcStatus> &curr) {
- if (stat == INVALID) {
- return false;
- }
if ((Op->isReg() && !(Op->getReg().isPhysical())) || Op->isImm() ||
Op->isCImm() || Op->isFPImm()) {
curr = {Op, stat};
@@ -4360,7 +4356,7 @@ srcStatus getNegStatus(srcStatus S) {
case IS_LOWER_HALF_NEG:
return IS_LOWER_HALF;
}
- return INVALID;
+ llvm_unreachable("unexpected srcStatus");
}
static bool calcNextStatus(std::pair<const MachineOperand *, srcStatus> &curr,
@@ -4477,7 +4473,7 @@ static bool isSameOperand(const MachineOperand *Op1,
return Op1->isIdenticalTo(*Op2);
}
-static bool validToPack(int HiStat, int LoStat, unsigned int &Mods,
+static bool validToPack(srcStatus HiStat, srcStatus LoStat, unsigned int &Mods,
const MachineOperand *newOp,
const MachineOperand *RootOp, const SIInstrInfo &TII,
const MachineRegisterInfo &MRI) {
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 50921879cd1f2..aad7a088551b2 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -466,7 +466,7 @@ def have_cxx_shared_library():
print("could not exec llvm-readobj")
return False
- readobj_out = readobj_cmd.stdout.read().decode("utf-8")
+ readobj_out = readobj_cmd.stdout.read().decode("ascii")
readobj_cmd.wait()
regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)")
>From 6378180cc336aef22a0c256f26321c4cecedae24 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Mon, 17 Mar 2025 11:10:42 +0800
Subject: [PATCH 06/19] fix comments and test case
---
.../AMDGPU/AMDGPUInstructionSelector.cpp | 216 +++++++++---------
llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll | 3 +-
2 files changed, 105 insertions(+), 114 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 58355c6ee7f43..6ffb5bc4a788a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4293,7 +4293,7 @@ AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
}};
}
-enum srcStatus {
+enum SrcStatus {
IS_SAME,
IS_UPPER_HALF,
IS_LOWER_HALF,
@@ -4304,55 +4304,55 @@ enum srcStatus {
static bool isTruncHalf(const MachineInstr *MI,
const MachineRegisterInfo &MRI) {
- if (MI->getOpcode() != AMDGPU::G_TRUNC) {
+ if (MI->getOpcode() != AMDGPU::G_TRUNC)
return false;
- }
- unsigned dstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
- unsigned srcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
- return dstSize * 2 == srcSize;
+
+ unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
+ unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
+ return DstSize * 2 == SrcSize;
}
static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
- if (MI->getOpcode() != AMDGPU::G_LSHR) {
+ if (MI->getOpcode() != AMDGPU::G_LSHR)
return false;
- }
+
Register ShiftSrc;
std::optional<ValueAndVReg> ShiftAmt;
if (mi_match(MI->getOperand(0).getReg(), MRI,
m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
- unsigned srcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
- unsigned shift = ShiftAmt->Value.getZExtValue();
- return shift * 2 == srcSize;
+ unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
+ unsigned Shift = ShiftAmt->Value.getZExtValue();
+ return Shift * 2 == SrcSize;
}
return false;
}
static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
- if (MI->getOpcode() != AMDGPU::G_SHL) {
+ if (MI->getOpcode() != AMDGPU::G_SHL)
return false;
- }
+
Register ShiftSrc;
std::optional<ValueAndVReg> ShiftAmt;
if (mi_match(MI->getOperand(0).getReg(), MRI,
m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
- unsigned srcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
- unsigned shift = ShiftAmt->Value.getZExtValue();
- return shift * 2 == srcSize;
+ unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
+ unsigned Shift = ShiftAmt->Value.getZExtValue();
+ return Shift * 2 == SrcSize;
}
return false;
}
-static bool retOpStat(const MachineOperand *Op, srcStatus stat,
- std::pair<const MachineOperand *, srcStatus> &curr) {
+static bool retOpStat(const MachineOperand *Op, SrcStatus Stat,
+ std::pair<const MachineOperand *, SrcStatus> &Curr) {
if ((Op->isReg() && !(Op->getReg().isPhysical())) || Op->isImm() ||
- Op->isCImm() || Op->isFPImm()) {
- curr = {Op, stat};
- return true;
- }
+ Op->isCImm() || Op->isFPImm())
+ Curr = {Op, Stat};
+ return true;
+
return false;
}
-srcStatus getNegStatus(srcStatus S) {
+SrcStatus getNegStatus(SrcStatus S) {
switch (S) {
case IS_SAME:
return IS_NEG;
@@ -4367,24 +4367,23 @@ srcStatus getNegStatus(srcStatus S) {
case IS_LOWER_HALF_NEG:
return IS_LOWER_HALF;
}
- llvm_unreachable("unexpected srcStatus");
+ llvm_unreachable("unexpected SrcStatus");
}
-static bool calcNextStatus(std::pair<const MachineOperand *, srcStatus> &curr,
+static bool calcNextStatus(std::pair<const MachineOperand *, SrcStatus> &Curr,
const MachineRegisterInfo &MRI) {
- if (!curr.first->isReg()) {
+ if (!Curr.first->isReg())
return false;
- }
+
const MachineInstr *MI = nullptr;
- if (!curr.first->isDef()) {
- MI = MRI.getVRegDef(curr.first->getReg());
+ if (!Curr.first->isDef()) {
+ MI = MRI.getVRegDef(Curr.first->getReg());
} else {
- MI = curr.first->getParent();
+ MI = Curr.first->getParent();
}
- if (!MI) {
+ if (!MI)
return false;
- }
unsigned Opc = MI->getOpcode();
@@ -4394,83 +4393,77 @@ static bool calcNextStatus(std::pair<const MachineOperand *, srcStatus> &curr,
case AMDGPU::G_CONSTANT:
case AMDGPU::G_FCONSTANT:
case AMDGPU::COPY:
- return retOpStat(&MI->getOperand(1), curr.second, curr);
+ return retOpStat(&MI->getOperand(1), Curr.second, Curr);
case AMDGPU::G_FNEG:
// XXXX + 3 = XXXX_NEG, (XXXX_NEG + 3) mod 3 = XXXX
- return retOpStat(&MI->getOperand(1), getNegStatus(curr.second), curr);
+ return retOpStat(&MI->getOperand(1), getNegStatus(Curr.second), Curr);
}
- // Calc next stat from current stat
- switch (curr.second) {
+ // Calc next Stat from current Stat
+ switch (Curr.second) {
case IS_SAME:
- if (isTruncHalf(MI, MRI)) {
- return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, curr);
- }
+ if (isTruncHalf(MI, MRI))
+ return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, Curr);
break;
case IS_NEG:
- if (isTruncHalf(MI, MRI)) {
- return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, curr);
- }
+ if (isTruncHalf(MI, MRI))
+ return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, Curr);
break;
case IS_UPPER_HALF:
- if (isShlHalf(MI, MRI)) {
- return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, curr);
- }
+ if (isShlHalf(MI, MRI))
+ return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, Curr);
break;
case IS_LOWER_HALF:
- if (isLshrHalf(MI, MRI)) {
- return retOpStat(&MI->getOperand(1), IS_UPPER_HALF, curr);
- }
+ if (isLshrHalf(MI, MRI))
+ return retOpStat(&MI->getOperand(1), IS_UPPER_HALF, Curr);
break;
case IS_UPPER_HALF_NEG:
- if (isShlHalf(MI, MRI)) {
- return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, curr);
- }
+ if (isShlHalf(MI, MRI))
+ return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, Curr);
break;
case IS_LOWER_HALF_NEG:
- if (isLshrHalf(MI, MRI)) {
- return retOpStat(&MI->getOperand(1), IS_UPPER_HALF_NEG, curr);
- }
+ if (isLshrHalf(MI, MRI))
+ return retOpStat(&MI->getOperand(1), IS_UPPER_HALF_NEG, Curr);
break;
}
return false;
}
-SmallVector<std::pair<const MachineOperand *, srcStatus>>
+SmallVector<std::pair<const MachineOperand *, SrcStatus>>
getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI,
bool onlyLastSameOrNeg = false, int maxDepth = 6) {
int depth = 0;
- std::pair<const MachineOperand *, srcStatus> curr = {Op, IS_SAME};
- SmallVector<std::pair<const MachineOperand *, srcStatus>> statList;
+ std::pair<const MachineOperand *, SrcStatus> Curr = {Op, IS_SAME};
+ SmallVector<std::pair<const MachineOperand *, SrcStatus>> Statlist;
- while (depth <= maxDepth && calcNextStatus(curr, MRI)) {
+ while (depth <= maxDepth && calcNextStatus(Curr, MRI)) {
depth++;
if ((onlyLastSameOrNeg &&
- (curr.second != IS_SAME && curr.second != IS_NEG))) {
+ (Curr.second != IS_SAME && Curr.second != IS_NEG))) {
break;
} else if (!onlyLastSameOrNeg) {
- statList.push_back(curr);
+ Statlist.push_back(Curr);
}
}
if (onlyLastSameOrNeg) {
- statList.push_back(curr);
+ Statlist.push_back(Curr);
}
- return statList;
+ return Statlist;
}
static bool isInlinableConstant(const MachineOperand &Op,
const SIInstrInfo &TII) {
- if (Op.isFPImm()) {
+ if (Op.isFPImm())
return TII.isInlineConstant(Op.getFPImm()->getValueAPF());
- }
+
return false;
}
static bool isSameBitWidth(const MachineOperand *Op1, const MachineOperand *Op2,
const MachineRegisterInfo &MRI) {
- unsigned width1 = MRI.getType(Op1->getReg()).getSizeInBits();
- unsigned width2 = MRI.getType(Op2->getReg()).getSizeInBits();
- return width1 == width2;
+ unsigned Width1 = MRI.getType(Op1->getReg()).getSizeInBits();
+ unsigned Width2 = MRI.getType(Op2->getReg()).getSizeInBits();
+ return Width1 == Width2;
}
static bool isSameOperand(const MachineOperand *Op1,
@@ -4484,12 +4477,12 @@ static bool isSameOperand(const MachineOperand *Op1,
return Op1->isIdenticalTo(*Op2);
}
-static bool validToPack(srcStatus HiStat, srcStatus LoStat, unsigned int &Mods,
- const MachineOperand *newOp,
+static bool validToPack(SrcStatus HiStat, SrcStatus LoStat, unsigned int &Mods,
+ const MachineOperand *NewOp,
const MachineOperand *RootOp, const SIInstrInfo &TII,
const MachineRegisterInfo &MRI) {
- if (newOp->isReg()) {
- if (isSameBitWidth(newOp, RootOp, MRI)) {
+ if (NewOp->isReg()) {
+ if (isSameBitWidth(NewOp, RootOp, MRI)) {
// IS_LOWER_HALF remain 0
if (HiStat == IS_UPPER_HALF_NEG) {
Mods ^= SISrcMods::NEG_HI;
@@ -4512,7 +4505,7 @@ static bool validToPack(srcStatus HiStat, srcStatus LoStat, unsigned int &Mods,
} else {
if ((HiStat == IS_SAME || HiStat == IS_NEG) &&
(LoStat == IS_SAME || LoStat == IS_NEG) &&
- isInlinableConstant(*newOp, TII)) {
+ isInlinableConstant(*NewOp, TII)) {
if (HiStat == IS_NEG) {
Mods ^= SISrcMods::NEG_HI;
}
@@ -4533,31 +4526,31 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op,
bool IsDOT) const {
unsigned Mods = 0;
const MachineOperand *RootOp = Op;
- std::pair<const MachineOperand *, srcStatus> stat =
+ std::pair<const MachineOperand *, SrcStatus> Stat =
getSrcStats(Op, MRI, true)[0];
- if (!stat.first->isReg()) {
+ if (!Stat.first->isReg()) {
Mods |= SISrcMods::OP_SEL_1;
return {Op, Mods};
}
- if (stat.second == IS_NEG) {
+ if (Stat.second == IS_NEG) {
Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
}
- Op = stat.first;
+ Op = Stat.first;
MachineInstr *MI = MRI.getVRegDef(Op->getReg());
if (MI->getOpcode() == AMDGPU::G_BUILD_VECTOR && MI->getNumOperands() == 3 &&
(!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
- SmallVector<std::pair<const MachineOperand *, srcStatus>> statList_Hi;
- SmallVector<std::pair<const MachineOperand *, srcStatus>> statList_Lo;
- statList_Hi = getSrcStats(&MI->getOperand(2), MRI);
- if (statList_Hi.size() != 0) {
- statList_Lo = getSrcStats(&MI->getOperand(1), MRI);
- if (statList_Lo.size() != 0) {
- for (int i = statList_Hi.size() - 1; i >= 0; i--) {
- for (int j = statList_Lo.size() - 1; j >= 0; j--) {
- if (isSameOperand(statList_Hi[i].first, statList_Lo[j].first)) {
- if (validToPack(statList_Hi[i].second, statList_Lo[j].second,
- Mods, statList_Hi[i].first, RootOp, TII, MRI)) {
- return {statList_Hi[i].first, Mods};
+ SmallVector<std::pair<const MachineOperand *, SrcStatus>> Statlist_Hi;
+ SmallVector<std::pair<const MachineOperand *, SrcStatus>> Statlist_Lo;
+ Statlist_Hi = getSrcStats(&MI->getOperand(2), MRI);
+ if (Statlist_Hi.size() != 0) {
+ Statlist_Lo = getSrcStats(&MI->getOperand(1), MRI);
+ if (Statlist_Lo.size() != 0) {
+ for (int i = Statlist_Hi.size() - 1; i >= 0; i--) {
+ for (int j = Statlist_Lo.size() - 1; j >= 0; j--) {
+ if (isSameOperand(Statlist_Hi[i].first, Statlist_Lo[j].first)) {
+ if (validToPack(Statlist_Hi[i].second, Statlist_Lo[j].second,
+ Mods, Statlist_Hi[i].first, RootOp, TII, MRI)) {
+ return {Statlist_Hi[i].first, Mods};
}
}
}
@@ -4591,21 +4584,20 @@ bool checkRB(const MachineOperand *Op, int RBNo,
}
const MachineOperand *
-getVReg(const MachineOperand *newOp, const MachineOperand *RootOp,
+getVReg(const MachineOperand *NewOp, const MachineOperand *RootOp,
const AMDGPURegisterBankInfo &RBI, MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI, const SIInstrInfo &TII) {
// RootOp can only be VGPR or SGPR (some hand written cases such as
// inst-select-ashr.v2s16.mir::ashr_v2s16_vs)
if (checkRB(RootOp, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
- checkRB(newOp, AMDGPU::VGPRRegBankID, RBI, MRI, TRI)) {
- return newOp;
- }
+ checkRB(NewOp, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
+ return NewOp;
+
MachineInstr *MI = MRI.getVRegDef(RootOp->getReg());
if (MI->getOpcode() == AMDGPU::COPY &&
- isSameOperand(newOp, &MI->getOperand(1))) {
- // RootOp is VGPR, newOp is not VGPR, but RootOp = COPY newOp
+ isSameOperand(NewOp, &MI->getOperand(1)))
+ // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp
return RootOp;
- }
MachineBasicBlock *BB = MI->getParent();
const TargetRegisterClass *DstRC =
@@ -4614,7 +4606,7 @@ getVReg(const MachineOperand *newOp, const MachineOperand *RootOp,
MachineInstrBuilder MIB =
BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), dstReg)
- .addReg(newOp->getReg());
+ .addReg(NewOp->getReg());
// only accept VGPR
return &MIB->getOperand(0);
@@ -4625,18 +4617,18 @@ AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
MachineRegisterInfo &MRI
= Root.getParent()->getParent()->getParent()->getRegInfo();
- std::pair<const MachineOperand *, unsigned> res =
+ std::pair<const MachineOperand *, unsigned> Res =
selectVOP3PModsImpl(&Root, MRI);
- if (!(res.first->isReg())) {
+ if (!(Res.first->isReg()))
return {{
- [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(res.first)); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(Res.first)); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Res.second); } // src_mods
}};
- }
- res.first = getVReg(res.first, &Root, RBI, MRI, TRI, TII);
+
+ Res.first = getVReg(Res.first, &Root, RBI, MRI, TRI, TII);
return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(res.first->getReg()); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Res.first->getReg()); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Res.second); } // src_mods
}};
}
@@ -4645,18 +4637,18 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
MachineRegisterInfo &MRI
= Root.getParent()->getParent()->getParent()->getRegInfo();
- std::pair<const MachineOperand *, unsigned> res =
+ std::pair<const MachineOperand *, unsigned> Res =
selectVOP3PModsImpl(&Root, MRI, true);
- if (!(res.first->isReg())) {
+ if (!(Res.first->isReg()))
return {{
- [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(res.first)); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(Res.first)); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Res.second); } // src_mods
}};
- }
- res.first = getVReg(res.first, &Root, RBI, MRI, TRI, TII);
+
+ Res.first = getVReg(Res.first, &Root, RBI, MRI, TRI, TII);
return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(res.first->getReg()); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Res.first->getReg()); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Res.second); } // src_mods
}};
}
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
index b561fa86f452c..c766ad2c418fd 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
@@ -824,8 +824,7 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half>
;
; GFX11-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000
-; GFX11-GISEL-NEXT: v_pk_add_f16 v0, s2, s0
+; GFX11-GISEL-NEXT: v_pk_add_f16 v0, s2, s3 neg_lo:[0,1] neg_hi:[0,1]
; GFX11-GISEL-NEXT: ; return to shader part epilog
; GFX10PLUS-SDAG-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX10PLUS-SDAG: ; %bb.0:
>From b0feaff09679e869330f818be2efb550483aaeb2 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Tue, 18 Mar 2025 09:37:06 +0800
Subject: [PATCH 07/19] fix comments
---
.../AMDGPU/AMDGPUInstructionSelector.cpp | 63 ++++++++++---------
1 file changed, 35 insertions(+), 28 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 6ffb5bc4a788a..51735cf8daff4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4445,9 +4445,8 @@ getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI,
Statlist.push_back(Curr);
}
}
- if (onlyLastSameOrNeg) {
+ if (onlyLastSameOrNeg)
Statlist.push_back(Curr);
- }
return Statlist;
}
@@ -4469,9 +4468,8 @@ static bool isSameBitWidth(const MachineOperand *Op1, const MachineOperand *Op2,
static bool isSameOperand(const MachineOperand *Op1,
const MachineOperand *Op2) {
if (Op1->isReg()) {
- if (Op2->isReg()) {
+ if (Op2->isReg())
return Op1->getReg() == Op2->getReg();
- }
return false;
}
return Op1->isIdenticalTo(*Op2);
@@ -4506,12 +4504,10 @@ static bool validToPack(SrcStatus HiStat, SrcStatus LoStat, unsigned int &Mods,
if ((HiStat == IS_SAME || HiStat == IS_NEG) &&
(LoStat == IS_SAME || LoStat == IS_NEG) &&
isInlinableConstant(*NewOp, TII)) {
- if (HiStat == IS_NEG) {
+ if (HiStat == IS_NEG)
Mods ^= SISrcMods::NEG_HI;
- }
- if (LoStat == IS_NEG) {
+ if (LoStat == IS_NEG)
Mods ^= SISrcMods::NEG;
- }
// opsel = opsel_hi = 0, since the upper half and lower half both
// the same as the target inlinable constant
return true;
@@ -4532,29 +4528,40 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op,
Mods |= SISrcMods::OP_SEL_1;
return {Op, Mods};
}
- if (Stat.second == IS_NEG) {
+ if (Stat.second == IS_NEG)
Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
- }
+
Op = Stat.first;
MachineInstr *MI = MRI.getVRegDef(Op->getReg());
- if (MI->getOpcode() == AMDGPU::G_BUILD_VECTOR && MI->getNumOperands() == 3 &&
- (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
- SmallVector<std::pair<const MachineOperand *, SrcStatus>> Statlist_Hi;
- SmallVector<std::pair<const MachineOperand *, SrcStatus>> Statlist_Lo;
- Statlist_Hi = getSrcStats(&MI->getOperand(2), MRI);
- if (Statlist_Hi.size() != 0) {
- Statlist_Lo = getSrcStats(&MI->getOperand(1), MRI);
- if (Statlist_Lo.size() != 0) {
- for (int i = Statlist_Hi.size() - 1; i >= 0; i--) {
- for (int j = Statlist_Lo.size() - 1; j >= 0; j--) {
- if (isSameOperand(Statlist_Hi[i].first, Statlist_Lo[j].first)) {
- if (validToPack(Statlist_Hi[i].second, Statlist_Lo[j].second,
- Mods, Statlist_Hi[i].first, RootOp, TII, MRI)) {
- return {Statlist_Hi[i].first, Mods};
- }
- }
- }
- }
+
+ if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
+ (IsDOT && Subtarget->hasDOTOpSelHazard())) {
+ Mods |= SISrcMods::OP_SEL_1;
+ return {Op, Mods};
+ }
+
+ SmallVector<std::pair<const MachineOperand *, SrcStatus>> Statlist_Hi;
+ Statlist_Hi = getSrcStats(&MI->getOperand(2), MRI);
+
+ if (Statlist_Hi.size() == 0) {
+ Mods |= SISrcMods::OP_SEL_1;
+ return {Op, Mods};
+ }
+
+ SmallVector<std::pair<const MachineOperand *, SrcStatus>> Statlist_Lo;
+ Statlist_Lo = getSrcStats(&MI->getOperand(1), MRI);
+
+ if (Statlist_Lo.size() == 0) {
+ Mods |= SISrcMods::OP_SEL_1;
+ return {Op, Mods};
+ }
+
+ for (int i = Statlist_Hi.size() - 1; i >= 0; i--) {
+ for (int j = Statlist_Lo.size() - 1; j >= 0; j--) {
+ if (isSameOperand(Statlist_Hi[i].first, Statlist_Lo[j].first)) {
+ if (validToPack(Statlist_Hi[i].second, Statlist_Lo[j].second, Mods,
+ Statlist_Hi[i].first, RootOp, TII, MRI))
+ return {Statlist_Hi[i].first, Mods};
}
}
}
>From 53370d8b98879a55bef1094b13a7e18195a315e6 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Tue, 18 Mar 2025 09:41:17 +0800
Subject: [PATCH 08/19] fix conflict
---
llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 2003 ++++++++++++++++++++---
llvm/test/lit.cfg.py | 2 +-
2 files changed, 1811 insertions(+), 194 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 954c05e63542f..28a995e74f7ab 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -1,13 +1,34 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-GISEL %s
-
-; GCN-LABEL: {{^}}fadd_v2_vv:
-; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; PACKED: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX900 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s
+
define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) {
+; GFX900-LABEL: fadd_v2_vv:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_add_f32_e32 v1, v1, v1
+; GFX900-NEXT: v_add_f32_e32 v0, v0, v0
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-LABEL: fadd_v2_vv:
+; PACKED: ; %bb.0:
+; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-NEXT: s_waitcnt vmcnt(0)
+; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[0:1]
+; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -16,10 +37,30 @@ define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) {
ret void
}
-; GCN-LABEL: {{^}}fadd_v2_vs:
-; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; PACKED: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
+; GFX900-LABEL: fadd_v2_vs:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_add_f32_e32 v1, s3, v1
+; GFX900-NEXT: v_add_f32_e32 v0, s2, v0
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-LABEL: fadd_v2_vs:
+; PACKED: ; %bb.0:
+; PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-NEXT: s_waitcnt vmcnt(0)
+; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -28,10 +69,49 @@ define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
ret void
}
-; GCN-LABEL: {{^}}fadd_v4_vs:
-; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; PACKED-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
+; GFX900-LABEL: fadd_v4_vs:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_add_f32_e32 v3, s3, v3
+; GFX900-NEXT: v_add_f32_e32 v2, s2, v2
+; GFX900-NEXT: v_add_f32_e32 v1, s1, v1
+; GFX900-NEXT: v_add_f32_e32 v0, s0, v0
+; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fadd_v4_vs:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[2:3]
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fadd_v4_vs:
+; PACKED-GISEL: ; %bb.0:
+; PACKED-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[0:1]
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[2:3]
+; PACKED-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
%load = load <4 x float>, ptr addrspace(1) %gep, align 16
@@ -40,10 +120,163 @@ define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
ret void
}
-; GCN-LABEL: {{^}}fadd_v32_vs:
-; GFX900-COUNT-32: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; PACKED-COUNT-16: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
+; GFX900-LABEL: fadd_v32_vs:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
+; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
+; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
+; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
+; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
+; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
+; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
+; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
+; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
+; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX900-NEXT: v_add_f32_e32 v4, s43, v4
+; GFX900-NEXT: v_add_f32_e32 v3, s42, v3
+; GFX900-NEXT: v_add_f32_e32 v2, s41, v2
+; GFX900-NEXT: v_add_f32_e32 v1, s40, v1
+; GFX900-NEXT: s_waitcnt vmcnt(6)
+; GFX900-NEXT: v_add_f32_e32 v8, s39, v8
+; GFX900-NEXT: v_add_f32_e32 v7, s38, v7
+; GFX900-NEXT: v_add_f32_e32 v6, s37, v6
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_add_f32_e32 v32, s19, v32
+; GFX900-NEXT: v_add_f32_e32 v31, s18, v31
+; GFX900-NEXT: v_add_f32_e32 v30, s17, v30
+; GFX900-NEXT: v_add_f32_e32 v29, s16, v29
+; GFX900-NEXT: v_add_f32_e32 v5, s36, v5
+; GFX900-NEXT: v_add_f32_e32 v12, s51, v12
+; GFX900-NEXT: v_add_f32_e32 v11, s50, v11
+; GFX900-NEXT: v_add_f32_e32 v10, s49, v10
+; GFX900-NEXT: v_add_f32_e32 v9, s48, v9
+; GFX900-NEXT: v_add_f32_e32 v16, s47, v16
+; GFX900-NEXT: v_add_f32_e32 v15, s46, v15
+; GFX900-NEXT: v_add_f32_e32 v14, s45, v14
+; GFX900-NEXT: v_add_f32_e32 v13, s44, v13
+; GFX900-NEXT: v_add_f32_e32 v20, s15, v20
+; GFX900-NEXT: v_add_f32_e32 v19, s14, v19
+; GFX900-NEXT: v_add_f32_e32 v18, s13, v18
+; GFX900-NEXT: v_add_f32_e32 v17, s12, v17
+; GFX900-NEXT: v_add_f32_e32 v24, s11, v24
+; GFX900-NEXT: v_add_f32_e32 v23, s10, v23
+; GFX900-NEXT: v_add_f32_e32 v22, s9, v22
+; GFX900-NEXT: v_add_f32_e32 v21, s8, v21
+; GFX900-NEXT: v_add_f32_e32 v28, s23, v28
+; GFX900-NEXT: v_add_f32_e32 v27, s22, v27
+; GFX900-NEXT: v_add_f32_e32 v26, s21, v26
+; GFX900-NEXT: v_add_f32_e32 v25, s20, v25
+; GFX900-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96
+; GFX900-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112
+; GFX900-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64
+; GFX900-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80
+; GFX900-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32
+; GFX900-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48
+; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1]
+; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fadd_v32_vs:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:32
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[40:41]
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[42:43]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6)
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], s[38:39]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(5)
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[8:9], v[8:9], s[48:49]
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[10:11], v[10:11], s[50:51]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(4)
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[16:17], v[16:17], s[44:45]
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[18:19], v[18:19], s[46:47]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[28:29], v[28:29], s[16:17]
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[30:31], v[30:31], s[18:19]
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], s[12:13]
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], s[14:15]
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], s[10:11]
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[24:25], v[24:25], s[20:21]
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[26:27], v[26:27], s[22:23]
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], s[36:37]
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], s[8:9]
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:64
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1]
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fadd_v32_vs:
+; PACKED-GISEL: ; %bb.0:
+; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[36:37]
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[38:39]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6)
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[4:5], v[4:5], s[40:41]
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[6:7], v[6:7], s[42:43]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(5)
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[8:9], v[8:9], s[44:45]
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[10:11], v[10:11], s[46:47]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(4)
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[12:13], v[12:13], s[48:49]
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[14:15], v[14:15], s[50:51]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(3)
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[16:17], v[16:17], s[8:9]
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[18:19], v[18:19], s[10:11]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(2)
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[20:21], v[20:21], s[12:13]
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[22:23], v[22:23], s[14:15]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(1)
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[24:25], v[24:25], s[16:17]
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[26:27], v[26:27], s[18:19]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[28:29], v[28:29], s[20:21]
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[30:31], v[30:31], s[22:23]
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
%load = load <32 x float>, ptr addrspace(1) %gep, align 128
@@ -53,13 +286,45 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
}
; FIXME: GISel does not use op_sel for splat constants.
-
-; GCN-LABEL: {{^}}fadd_v2_v_imm:
-; PACKED: s_mov_b32 s[[K:[0-9]+]], 0x42c80000
-; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}}
-; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) {
+; GFX900-LABEL: fadd_v2_v_imm:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_add_f32_e32 v1, 0x42c80000, v1
+; GFX900-NEXT: v_add_f32_e32 v0, 0x42c80000, v0
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fadd_v2_v_imm:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-SDAG-NEXT: s_mov_b32 s2, 0x42c80000
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fadd_v2_v_imm:
+; PACKED-GISEL: ; %bb.0:
+; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
+; PACKED-GISEL-NEXT: s_mov_b32 s3, s2
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -68,11 +333,43 @@ define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) {
ret void
}
-; GCN-LABEL: {{^}}fadd_v2_v_v_splat:
-; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v0
-; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1] op_sel_hi:[1,0]{{$}}
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1]{{$}}
define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) {
+; GFX900-LABEL: fadd_v2_v_v_splat:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_add_f32_e32 v2, v2, v0
+; GFX900-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fadd_v2_v_v_splat:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[2:3], v[0:1] op_sel_hi:[1,0]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fadd_v2_v_v_splat:
+; PACKED-GISEL: ; %bb.0:
+; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v1, v0
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[2:3], v[0:1]
+; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -84,11 +381,42 @@ define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) {
ret void
}
-; GCN-LABEL: {{^}}fadd_v2_v_lit_splat:
-; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
-; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0 op_sel_hi:[1,0]{{$}}
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0 op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) {
+; GFX900-LABEL: fadd_v2_v_lit_splat:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fadd_v2_v_lit_splat:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], 1.0 op_sel_hi:[1,0]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fadd_v2_v_lit_splat:
+; PACKED-GISEL: ; %bb.0:
+; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 1.0
+; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -97,12 +425,31 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) {
ret void
}
-; GCN-LABEL: {{^}}fadd_v2_v_lit_hi0:
-; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
-; PACKED-DAG: s_mov_b64 [[K:s\[[0-9:]+\]]], 0x3f800000
-; PACKED: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], [[K]]
define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) {
+; GFX900-LABEL: fadd_v2_v_lit_hi0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_add_f32_e32 v1, 0, v1
+; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-LABEL: fadd_v2_v_lit_hi0:
+; PACKED: ; %bb.0:
+; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-NEXT: s_mov_b64 s[2:3], 0x3f800000
+; PACKED-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-NEXT: s_waitcnt vmcnt(0)
+; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -111,13 +458,32 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) {
ret void
}
-; GCN-LABEL: {{^}}fadd_v2_v_lit_lo0:
-; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
-; PACKED-DAG: s_mov_b32 s[[LO:[0-9]+]], 0
-; PACKED-DAG: s_mov_b32 s[[HI:[0-9]+]], 1.0
-; PACKED: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[LO]]:[[HI]]]{{$}}
define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) {
+; GFX900-LABEL: fadd_v2_v_lit_lo0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX900-NEXT: v_add_f32_e32 v0, 0, v0
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-LABEL: fadd_v2_v_lit_lo0:
+; PACKED: ; %bb.0:
+; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-NEXT: s_mov_b32 s2, 0
+; PACKED-NEXT: s_mov_b32 s3, 1.0
+; PACKED-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-NEXT: s_waitcnt vmcnt(0)
+; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -126,13 +492,32 @@ define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) {
ret void
}
-; GCN-LABEL: {{^}}fadd_v2_v_unfoldable_lit:
-; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
-; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
-; PACKED-DAG: s_mov_b32 s{{[0-9]+}}, 1.0
-; PACKED-DAG: s_mov_b32 s{{[0-9]+}}, 2.0
-; PACKED: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
+; GFX900-LABEL: fadd_v2_v_unfoldable_lit:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_add_f32_e32 v1, 2.0, v1
+; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-LABEL: fadd_v2_v_unfoldable_lit:
+; PACKED: ; %bb.0:
+; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-NEXT: s_mov_b32 s2, 1.0
+; PACKED-NEXT: s_mov_b32 s3, 2.0
+; PACKED-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-NEXT: s_waitcnt vmcnt(0)
+; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -142,12 +527,47 @@ define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
}
; FIXME: Fold fneg into v_pk_add_f32 with Global ISel.
-
-; GCN-LABEL: {{^}}fadd_v2_v_fneg:
-; GFX900-COUNT-2: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v2_v_fneg(ptr addrspace(1) %a, float %x) {
+; GFX900-LABEL: fadd_v2_v_fneg:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_subrev_f32_e32 v1, s2, v1
+; GFX900-NEXT: v_subrev_f32_e32 v0, s2, v0
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fadd_v2_v_fneg:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fadd_v2_v_fneg:
+; PACKED-GISEL: ; %bb.0:
+; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -s2, -s2
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, v2
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -159,12 +579,47 @@ define amdgpu_kernel void @fadd_v2_v_fneg(ptr addrspace(1) %a, float %x) {
ret void
}
-; GCN-LABEL: {{^}}fadd_v2_v_fneg_lo:
-; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1]{{$}}
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v2_v_fneg_lo(ptr addrspace(1) %a, float %x) {
+; GFX900-LABEL: fadd_v2_v_fneg_lo:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_add_f32_e32 v1, s2, v1
+; GFX900-NEXT: v_subrev_f32_e32 v0, s2, v0
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fadd_v2_v_fneg_lo:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fadd_v2_v_fneg_lo:
+; PACKED-GISEL: ; %bb.0:
+; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, s2
+; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -s2, -s2
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -176,12 +631,47 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo(ptr addrspace(1) %a, float %x) {
ret void
}
-; GCN-LABEL: {{^}}fadd_v2_v_fneg_hi:
-; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_hi:[0,1]{{$}}
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v2_v_fneg_hi(ptr addrspace(1) %a, float %x) {
+; GFX900-LABEL: fadd_v2_v_fneg_hi:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_subrev_f32_e32 v1, s2, v1
+; GFX900-NEXT: v_add_f32_e32 v0, s2, v0
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fadd_v2_v_fneg_hi:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_hi:[0,1]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fadd_v2_v_fneg_hi:
+; PACKED-GISEL: ; %bb.0:
+; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; PACKED-GISEL-NEXT: v_max_f32_e64 v3, -s2, -s2
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -193,12 +683,44 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi(ptr addrspace(1) %a, float %x) {
ret void
}
-; GCN-LABEL: {{^}}fadd_v2_v_fneg_lo2:
-; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] neg_lo:[0,1]{{$}}
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v2_v_fneg_lo2(ptr addrspace(1) %a, float %x, float %y) {
+; GFX900-LABEL: fadd_v2_v_fneg_lo2:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_add_f32_e32 v1, s3, v1
+; GFX900-NEXT: v_subrev_f32_e32 v0, s2, v0
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fadd_v2_v_fneg_lo2:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] neg_lo:[0,1]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fadd_v2_v_fneg_lo2:
+; PACKED-GISEL: ; %bb.0:
+; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -s2, -s2
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, s3
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -210,12 +732,44 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo2(ptr addrspace(1) %a, float %x, flo
ret void
}
-; GCN-LABEL: {{^}}fadd_v2_v_fneg_hi2:
-; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1]{{$}}
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v2_v_fneg_hi2(ptr addrspace(1) %a, float %x, float %y) {
+; GFX900-LABEL: fadd_v2_v_fneg_hi2:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_subrev_f32_e32 v1, s2, v1
+; GFX900-NEXT: v_add_f32_e32 v0, s3, v0
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fadd_v2_v_fneg_hi2:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fadd_v2_v_fneg_hi2:
+; PACKED-GISEL: ; %bb.0:
+; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; PACKED-GISEL-NEXT: v_max_f32_e64 v3, -s2, -s2
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -227,10 +781,30 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi2(ptr addrspace(1) %a, float %x, flo
ret void
}
-; GCN-LABEL: {{^}}fmul_v2_vv:
-; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; PACKED: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
define amdgpu_kernel void @fmul_v2_vv(ptr addrspace(1) %a) {
+; GFX900-LABEL: fmul_v2_vv:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_mul_f32_e32 v1, v1, v1
+; GFX900-NEXT: v_mul_f32_e32 v0, v0, v0
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-LABEL: fmul_v2_vv:
+; PACKED: ; %bb.0:
+; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-NEXT: s_waitcnt vmcnt(0)
+; PACKED-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[0:1]
+; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -239,10 +813,30 @@ define amdgpu_kernel void @fmul_v2_vv(ptr addrspace(1) %a) {
ret void
}
-; GCN-LABEL: {{^}}fmul_v2_vs:
-; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; PACKED: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
+; GFX900-LABEL: fmul_v2_vs:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_mul_f32_e32 v1, s3, v1
+; GFX900-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-LABEL: fmul_v2_vs:
+; PACKED: ; %bb.0:
+; PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-NEXT: s_waitcnt vmcnt(0)
+; PACKED-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
+; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -251,10 +845,49 @@ define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
ret void
}
-; GCN-LABEL: {{^}}fmul_v4_vs:
-; GFX900-COUNT-4: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; PACKED-COUNT-2: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
+; GFX900-LABEL: fmul_v4_vs:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_mul_f32_e32 v3, s3, v3
+; GFX900-NEXT: v_mul_f32_e32 v2, s2, v2
+; GFX900-NEXT: v_mul_f32_e32 v1, s1, v1
+; GFX900-NEXT: v_mul_f32_e32 v0, s0, v0
+; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fmul_v4_vs:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[2:3]
+; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fmul_v4_vs:
+; PACKED-GISEL: ; %bb.0:
+; PACKED-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[0:1]
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[2:3]
+; PACKED-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
%load = load <4 x float>, ptr addrspace(1) %gep, align 16
@@ -263,10 +896,163 @@ define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
ret void
}
-; GCN-LABEL: {{^}}fmul_v32_vs:
-; GFX900-COUNT-32: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; PACKED-COUNT-16: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
+; GFX900-LABEL: fmul_v32_vs:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
+; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
+; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
+; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
+; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
+; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
+; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
+; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
+; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
+; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX900-NEXT: v_mul_f32_e32 v4, s43, v4
+; GFX900-NEXT: v_mul_f32_e32 v3, s42, v3
+; GFX900-NEXT: v_mul_f32_e32 v2, s41, v2
+; GFX900-NEXT: v_mul_f32_e32 v1, s40, v1
+; GFX900-NEXT: s_waitcnt vmcnt(6)
+; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8
+; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7
+; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_mul_f32_e32 v32, s19, v32
+; GFX900-NEXT: v_mul_f32_e32 v31, s18, v31
+; GFX900-NEXT: v_mul_f32_e32 v30, s17, v30
+; GFX900-NEXT: v_mul_f32_e32 v29, s16, v29
+; GFX900-NEXT: v_mul_f32_e32 v5, s36, v5
+; GFX900-NEXT: v_mul_f32_e32 v12, s51, v12
+; GFX900-NEXT: v_mul_f32_e32 v11, s50, v11
+; GFX900-NEXT: v_mul_f32_e32 v10, s49, v10
+; GFX900-NEXT: v_mul_f32_e32 v9, s48, v9
+; GFX900-NEXT: v_mul_f32_e32 v16, s47, v16
+; GFX900-NEXT: v_mul_f32_e32 v15, s46, v15
+; GFX900-NEXT: v_mul_f32_e32 v14, s45, v14
+; GFX900-NEXT: v_mul_f32_e32 v13, s44, v13
+; GFX900-NEXT: v_mul_f32_e32 v20, s15, v20
+; GFX900-NEXT: v_mul_f32_e32 v19, s14, v19
+; GFX900-NEXT: v_mul_f32_e32 v18, s13, v18
+; GFX900-NEXT: v_mul_f32_e32 v17, s12, v17
+; GFX900-NEXT: v_mul_f32_e32 v24, s11, v24
+; GFX900-NEXT: v_mul_f32_e32 v23, s10, v23
+; GFX900-NEXT: v_mul_f32_e32 v22, s9, v22
+; GFX900-NEXT: v_mul_f32_e32 v21, s8, v21
+; GFX900-NEXT: v_mul_f32_e32 v28, s23, v28
+; GFX900-NEXT: v_mul_f32_e32 v27, s22, v27
+; GFX900-NEXT: v_mul_f32_e32 v26, s21, v26
+; GFX900-NEXT: v_mul_f32_e32 v25, s20, v25
+; GFX900-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96
+; GFX900-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112
+; GFX900-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64
+; GFX900-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80
+; GFX900-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32
+; GFX900-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48
+; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1]
+; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fmul_v32_vs:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:32
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[40:41]
+; PACKED-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[42:43]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6)
+; PACKED-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], s[38:39]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(5)
+; PACKED-SDAG-NEXT: v_pk_mul_f32 v[8:9], v[8:9], s[48:49]
+; PACKED-SDAG-NEXT: v_pk_mul_f32 v[10:11], v[10:11], s[50:51]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(4)
+; PACKED-SDAG-NEXT: v_pk_mul_f32 v[16:17], v[16:17], s[44:45]
+; PACKED-SDAG-NEXT: v_pk_mul_f32 v[18:19], v[18:19], s[46:47]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_mul_f32 v[28:29], v[28:29], s[16:17]
+; PACKED-SDAG-NEXT: v_pk_mul_f32 v[30:31], v[30:31], s[18:19]
+; PACKED-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], s[12:13]
+; PACKED-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], s[14:15]
+; PACKED-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], s[10:11]
+; PACKED-SDAG-NEXT: v_pk_mul_f32 v[24:25], v[24:25], s[20:21]
+; PACKED-SDAG-NEXT: v_pk_mul_f32 v[26:27], v[26:27], s[22:23]
+; PACKED-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], s[36:37]
+; PACKED-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], s[8:9]
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:64
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1]
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fmul_v32_vs:
+; PACKED-GISEL: ; %bb.0:
+; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[36:37]
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[38:39]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6)
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[4:5], v[4:5], s[40:41]
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[6:7], v[6:7], s[42:43]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(5)
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[8:9], v[8:9], s[44:45]
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[10:11], v[10:11], s[46:47]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(4)
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[12:13], v[12:13], s[48:49]
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[14:15], v[14:15], s[50:51]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(3)
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[16:17], v[16:17], s[8:9]
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[18:19], v[18:19], s[10:11]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(2)
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[20:21], v[20:21], s[12:13]
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[22:23], v[22:23], s[14:15]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(1)
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[24:25], v[24:25], s[16:17]
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[26:27], v[26:27], s[18:19]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[28:29], v[28:29], s[20:21]
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[30:31], v[30:31], s[22:23]
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
%load = load <32 x float>, ptr addrspace(1) %gep, align 128
@@ -275,12 +1061,45 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
ret void
}
-; GCN-LABEL: {{^}}fmul_v2_v_imm:
-; PACKED: s_mov_b32 s[[K:[0-9]+]], 0x42c80000
-; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}}
-; PACKED-SDAG: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
-; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) {
+; GFX900-LABEL: fmul_v2_v_imm:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_mul_f32_e32 v1, 0x42c80000, v1
+; GFX900-NEXT: v_mul_f32_e32 v0, 0x42c80000, v0
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fmul_v2_v_imm:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-SDAG-NEXT: s_mov_b32 s2, 0x42c80000
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fmul_v2_v_imm:
+; PACKED-GISEL: ; %bb.0:
+; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
+; PACKED-GISEL-NEXT: s_mov_b32 s3, s2
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
+; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -289,11 +1108,43 @@ define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) {
ret void
}
-; GCN-LABEL: {{^}}fmul_v2_v_v_splat:
-; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v0
-; PACKED-SDAG: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1] op_sel_hi:[1,0]{{$}}
-; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1]{{$}}
define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) {
+; GFX900-LABEL: fmul_v2_v_v_splat:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_mul_f32_e32 v2, v2, v0
+; GFX900-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fmul_v2_v_v_splat:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[2:3], v[0:1] op_sel_hi:[1,0]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fmul_v2_v_v_splat:
+; PACKED-GISEL: ; %bb.0:
+; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v1, v0
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[2:3], v[0:1]
+; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -305,11 +1156,42 @@ define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) {
ret void
}
-; GCN-LABEL: {{^}}fmul_v2_v_lit_splat:
-; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
-; PACKED-SDAG: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0 op_sel_hi:[1,0]{{$}}
-; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0 op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) {
+; GFX900-LABEL: fmul_v2_v_lit_splat:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_mul_f32_e32 v1, 4.0, v1
+; GFX900-NEXT: v_mul_f32_e32 v0, 4.0, v0
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fmul_v2_v_lit_splat:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], 4.0 op_sel_hi:[1,0]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fmul_v2_v_lit_splat:
+; PACKED-GISEL: ; %bb.0:
+; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], 4.0
+; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -318,13 +1200,32 @@ define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) {
ret void
}
-; GCN-LABEL: {{^}}fmul_v2_v_unfoldable_lit:
-; GFX900-DAG: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
-; GFX900-DAG: v_mul_f32_e32 v{{[0-9]+}}, 0x40400000, v{{[0-9]+}}
-; PACKED-DAG: s_mov_b32 s{{[0-9]+}}, 4.0
-; PACKED-DAG: s_mov_b32 s{{[0-9]+}}, 0x40400000
-; PACKED: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
+; GFX900-LABEL: fmul_v2_v_unfoldable_lit:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_mul_f32_e32 v1, 0x40400000, v1
+; GFX900-NEXT: v_mul_f32_e32 v0, 4.0, v0
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-LABEL: fmul_v2_v_unfoldable_lit:
+; PACKED: ; %bb.0:
+; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-NEXT: s_mov_b32 s2, 4.0
+; PACKED-NEXT: s_mov_b32 s3, 0x40400000
+; PACKED-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-NEXT: s_waitcnt vmcnt(0)
+; PACKED-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
+; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -333,11 +1234,47 @@ define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
ret void
}
-; GCN-LABEL: {{^}}fmul_v2_v_fneg:
-; GFX900-COUNT-2: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -s{{[0-9]+}}
-; PACKED-SDAG: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
-; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fmul_v2_v_fneg(ptr addrspace(1) %a, float %x) {
+; GFX900-LABEL: fmul_v2_v_fneg:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_mul_f32_e64 v1, v1, -s2
+; GFX900-NEXT: v_mul_f32_e64 v0, v0, -s2
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fmul_v2_v_fneg:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fmul_v2_v_fneg:
+; PACKED-GISEL: ; %bb.0:
+; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -s2, -s2
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, v2
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
+; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -349,10 +1286,30 @@ define amdgpu_kernel void @fmul_v2_v_fneg(ptr addrspace(1) %a, float %x) {
ret void
}
-; GCN-LABEL: {{^}}fma_v2_vv:
-; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; PACKED: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
define amdgpu_kernel void @fma_v2_vv(ptr addrspace(1) %a) {
+; GFX900-LABEL: fma_v2_vv:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_fma_f32 v1, v1, v1, v1
+; GFX900-NEXT: v_fma_f32 v0, v0, v0, v0
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-LABEL: fma_v2_vv:
+; PACKED: ; %bb.0:
+; PACKED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-NEXT: s_waitcnt vmcnt(0)
+; PACKED-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[0:1], v[0:1]
+; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -361,10 +1318,30 @@ define amdgpu_kernel void @fma_v2_vv(ptr addrspace(1) %a) {
ret void
}
-; GCN-LABEL: {{^}}fma_v2_vs:
-; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; PACKED: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fma_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
+; GFX900-LABEL: fma_v2_vs:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_fma_f32 v1, v1, s3, s3
+; GFX900-NEXT: v_fma_f32 v0, v0, s2, s2
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-LABEL: fma_v2_vs:
+; PACKED: ; %bb.0:
+; PACKED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; PACKED-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-NEXT: s_waitcnt vmcnt(0)
+; PACKED-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[2:3]
+; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -373,10 +1350,49 @@ define amdgpu_kernel void @fma_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
ret void
}
-; GCN-LABEL: {{^}}fma_v4_vs:
-; GFX900-COUNT-4: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; PACKED-COUNT-2: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
+; GFX900-LABEL: fma_v4_vs:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_fma_f32 v3, v3, s3, s3
+; GFX900-NEXT: v_fma_f32 v2, v2, s2, s2
+; GFX900-NEXT: v_fma_f32 v1, v1, s1, s1
+; GFX900-NEXT: v_fma_f32 v0, v0, s0, s0
+; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fma_v4_vs:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[2:3], s[2:3]
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[0:1], s[0:1]
+; PACKED-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fma_v4_vs:
+; PACKED-GISEL: ; %bb.0:
+; PACKED-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[0:1], s[0:1]
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[2:3], s[2:3]
+; PACKED-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
%load = load <4 x float>, ptr addrspace(1) %gep, align 16
@@ -385,10 +1401,163 @@ define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
ret void
}
-; GCN-LABEL: {{^}}fma_v32_vs:
-; GFX900-COUNT-32: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; PACKED-COUNT-16: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
+; GFX900-LABEL: fma_v32_vs:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
+; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1]
+; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
+; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
+; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
+; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
+; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
+; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
+; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
+; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX900-NEXT: v_fma_f32 v4, v4, s43, s43
+; GFX900-NEXT: v_fma_f32 v3, v3, s42, s42
+; GFX900-NEXT: v_fma_f32 v2, v2, s41, s41
+; GFX900-NEXT: v_fma_f32 v1, v1, s40, s40
+; GFX900-NEXT: s_waitcnt vmcnt(6)
+; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39
+; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38
+; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_fma_f32 v32, v32, s19, s19
+; GFX900-NEXT: v_fma_f32 v31, v31, s18, s18
+; GFX900-NEXT: v_fma_f32 v30, v30, s17, s17
+; GFX900-NEXT: v_fma_f32 v29, v29, s16, s16
+; GFX900-NEXT: v_fma_f32 v5, v5, s36, s36
+; GFX900-NEXT: v_fma_f32 v12, v12, s51, s51
+; GFX900-NEXT: v_fma_f32 v11, v11, s50, s50
+; GFX900-NEXT: v_fma_f32 v10, v10, s49, s49
+; GFX900-NEXT: v_fma_f32 v9, v9, s48, s48
+; GFX900-NEXT: v_fma_f32 v16, v16, s47, s47
+; GFX900-NEXT: v_fma_f32 v15, v15, s46, s46
+; GFX900-NEXT: v_fma_f32 v14, v14, s45, s45
+; GFX900-NEXT: v_fma_f32 v13, v13, s44, s44
+; GFX900-NEXT: v_fma_f32 v20, v20, s15, s15
+; GFX900-NEXT: v_fma_f32 v19, v19, s14, s14
+; GFX900-NEXT: v_fma_f32 v18, v18, s13, s13
+; GFX900-NEXT: v_fma_f32 v17, v17, s12, s12
+; GFX900-NEXT: v_fma_f32 v24, v24, s11, s11
+; GFX900-NEXT: v_fma_f32 v23, v23, s10, s10
+; GFX900-NEXT: v_fma_f32 v22, v22, s9, s9
+; GFX900-NEXT: v_fma_f32 v21, v21, s8, s8
+; GFX900-NEXT: v_fma_f32 v28, v28, s23, s23
+; GFX900-NEXT: v_fma_f32 v27, v27, s22, s22
+; GFX900-NEXT: v_fma_f32 v26, v26, s21, s21
+; GFX900-NEXT: v_fma_f32 v25, v25, s20, s20
+; GFX900-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96
+; GFX900-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112
+; GFX900-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64
+; GFX900-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80
+; GFX900-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32
+; GFX900-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48
+; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1]
+; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fma_v32_vs:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1]
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:32
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[40:41], s[40:41]
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[42:43], s[42:43]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6)
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], s[38:39], s[38:39]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(5)
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[8:9], v[8:9], s[48:49], s[48:49]
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[10:11], v[10:11], s[50:51], s[50:51]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(4)
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[16:17], v[16:17], s[44:45], s[44:45]
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[18:19], v[18:19], s[46:47], s[46:47]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], s[16:17], s[16:17]
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[30:31], v[30:31], s[18:19], s[18:19]
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], s[12:13], s[12:13]
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[22:23], v[22:23], s[14:15], s[14:15]
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], s[10:11], s[10:11]
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[24:25], v[24:25], s[20:21], s[20:21]
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[26:27], v[26:27], s[22:23], s[22:23]
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], s[36:37], s[36:37]
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], s[8:9], s[8:9]
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:64
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1]
+; PACKED-SDAG-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fma_v32_vs:
+; PACKED-GISEL: ; %bb.0:
+; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[36:37], s[36:37]
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[38:39], s[38:39]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6)
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[4:5], v[4:5], s[40:41], s[40:41]
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[6:7], v[6:7], s[42:43], s[42:43]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(5)
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[8:9], v[8:9], s[44:45], s[44:45]
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[10:11], v[10:11], s[46:47], s[46:47]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(4)
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[12:13], v[12:13], s[48:49], s[48:49]
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[14:15], v[14:15], s[50:51], s[50:51]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(3)
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[16:17], v[16:17], s[8:9], s[8:9]
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[18:19], v[18:19], s[10:11], s[10:11]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(2)
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[20:21], v[20:21], s[12:13], s[12:13]
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[22:23], v[22:23], s[14:15], s[14:15]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(1)
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[24:25], v[24:25], s[16:17], s[16:17]
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[26:27], v[26:27], s[18:19], s[18:19]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[28:29], v[28:29], s[20:21], s[20:21]
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[30:31], v[30:31], s[22:23], s[22:23]
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
%load = load <32 x float>, ptr addrspace(1) %gep, align 128
@@ -397,14 +1566,34 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
ret void
}
-; GCN-LABEL: {{^}}fma_v2_v_imm:
-; GCN-DAG: s_mov_b32 s[[K1:[0-9]+]], 0x42c80000
-; GFX900-DAG: v_mov_b32_e32 v[[K2:[0-9]+]], 0x43480000
-; PACKED-SDAG-DAG: v_mov_b32_e32 v[[K2:[0-9]+]], 0x43480000
-; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s[[K1]], v[[K2]]
-; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K1]]:{{[0-9:]+}}], v[[[K2]]:{{[0-9:]+}}] op_sel_hi:[1,0,0]{{$}}
-; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K1]]:{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) {
+; GFX900-LABEL: fma_v2_v_imm:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_mov_b32 s2, 0x42c80000
+; GFX900-NEXT: v_mov_b32_e32 v3, 0x43480000
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_fma_f32 v1, v1, s2, v3
+; GFX900-NEXT: v_fma_f32 v0, v0, s2, v3
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fma_v2_v_imm:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, 0x43480000
+; PACKED-SDAG-NEXT: s_mov_b32 s2, 0x42c80000
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] op_sel_hi:[1,0,0]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -413,11 +1602,43 @@ define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) {
ret void
}
-; GCN-LABEL: {{^}}fma_v2_v_v_splat:
-; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v0, v0
-; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1], v[0:1] op_sel_hi:[1,0,0]{{$}}
-; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1], v[0:1]{{$}}
define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) {
+; GFX900-LABEL: fma_v2_v_v_splat:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_fma_f32 v2, v2, v0, v0
+; GFX900-NEXT: v_fma_f32 v1, v1, v0, v0
+; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fma_v2_v_v_splat:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[0:1], v[0:1] op_sel_hi:[1,0,0]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fma_v2_v_v_splat:
+; PACKED-GISEL: ; %bb.0:
+; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v1, v0
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[0:1], v[0:1]
+; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -429,11 +1650,42 @@ define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) {
ret void
}
-; GCN-LABEL: {{^}}fma_v2_v_lit_splat:
-; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0
-; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0 op_sel_hi:[1,0,0]{{$}}
-; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0 op_sel_hi:[1,0,0]{{$}}
define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
+; GFX900-LABEL: fma_v2_v_lit_splat:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_fma_f32 v1, v1, 4.0, 1.0
+; GFX900-NEXT: v_fma_f32 v0, v0, 4.0, 1.0
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fma_v2_v_lit_splat:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], 4.0, 1.0 op_sel_hi:[1,0,0]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fma_v2_v_lit_splat:
+; PACKED-GISEL: ; %bb.0:
+; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], 4.0, 1.0
+; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -442,15 +1694,35 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
ret void
}
-; GCN-LABEL: {{^}}fma_v2_v_unfoldable_lit:
-; GCN-DAG: s_mov_b32 s{{[0-9]+}}, 0x40400000
-; GFX900-DAG: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0
-; GFX900-DAG: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, 2.0
-; PACKED-SDAG-DAG: s_mov_b32 s{{[0-9]+}}, 4.0
-; PACKED-SDAG-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0
-; PACKED-SDAG-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
-; PACKED: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
+; GFX900-LABEL: fma_v2_v_unfoldable_lit:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_mov_b32 s2, 0x40400000
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_fma_f32 v1, v1, s2, 2.0
+; GFX900-NEXT: v_fma_f32 v0, v0, 4.0, 1.0
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fma_v2_v_unfoldable_lit:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; PACKED-SDAG-NEXT: s_mov_b32 s2, 4.0
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, 1.0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, 2.0
+; PACKED-SDAG-NEXT: s_mov_b32 s3, 0x40400000
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -459,11 +1731,47 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
ret void
}
-; GCN-LABEL: {{^}}fma_v2_v_fneg:
-; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -s{{[0-9]+}}, -s{{[0-9]+}}
-; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0,0] neg_lo:[0,1,1] neg_hi:[0,1,1]{{$}}
-; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fma_v2_v_fneg(ptr addrspace(1) %a, float %x) {
+; GFX900-LABEL: fma_v2_v_fneg:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_fma_f32 v1, v1, -s2, -s2
+; GFX900-NEXT: v_fma_f32 v0, v0, -s2, -s2
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fma_v2_v_fneg:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[2:3] op_sel_hi:[1,0,0] neg_lo:[0,1,1] neg_hi:[0,1,1]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fma_v2_v_fneg:
+; PACKED-GISEL: ; %bb.0:
+; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -s2, -s2
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, v2
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3]
+; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -475,11 +1783,51 @@ define amdgpu_kernel void @fma_v2_v_fneg(ptr addrspace(1) %a, float %x) {
ret void
}
-; GCN-LABEL: {{^}}add_vector_neg_bitcast_scalar_lo:
-; GFX900-COUNT-2: v_sub_f32_e32
-; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) {
+; GFX900-LABEL: add_vector_neg_bitcast_scalar_lo:
+; GFX900: ; %bb.0: ; %bb
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s2
+; GFX900-NEXT: v_mov_b32_e32 v2, s3
+; GFX900-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
+; GFX900-NEXT: ds_read_b32 v2, v2
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX900-NEXT: v_sub_f32_e32 v0, v0, v2
+; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: add_vector_neg_bitcast_scalar_lo:
+; PACKED-SDAG: ; %bb.0: ; %bb
+; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s3
+; PACKED-SDAG-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
+; PACKED-SDAG-NEXT: ds_read_b32 v2, v2
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: add_vector_neg_bitcast_scalar_lo:
+; PACKED-GISEL: ; %bb.0: ; %bb
+; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; PACKED-GISEL-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
+; PACKED-GISEL-NEXT: ds_read_b32 v2, v2
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: v_max_f32_e64 v2, -v2, -v2
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, v2
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT: s_endpgm
bb:
%vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 4
%scalar0 = load volatile float, ptr addrspace(3) %arg2, align 4
@@ -493,11 +1841,59 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo_scalar_hi:
-; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
-; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
-; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) {
+; GFX900-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi:
+; GFX900: ; %bb.0: ; %bb
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, s2
+; GFX900-NEXT: v_mov_b32_e32 v4, s3
+; GFX900-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
+; GFX900-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
+; GFX900-NEXT: ds_read_b32 v5, v4
+; GFX900-NEXT: ds_read_b32 v4, v4 offset:8
+; GFX900-NEXT: s_waitcnt lgkmcnt(1)
+; GFX900-NEXT: v_fma_f32 v0, v0, v2, -v5
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_fma_f32 v1, v1, v3, -v4
+; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi:
+; PACKED-SDAG: ; %bb.0: ; %bb
+; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v6, 0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v5, s3
+; PACKED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
+; PACKED-SDAG-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
+; PACKED-SDAG-NEXT: ds_read_b32 v4, v5
+; PACKED-SDAG-NEXT: ds_read_b32 v5, v5 offset:8
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi:
+; PACKED-GISEL: ; %bb.0: ; %bb
+; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v5, s3
+; PACKED-GISEL-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
+; PACKED-GISEL-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
+; PACKED-GISEL-NEXT: ds_read_b32 v4, v5
+; PACKED-GISEL-NEXT: ds_read_b32 v5, v5 offset:8
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[4:5], 1.0, v[4:5] op_sel_hi:[0,1]
+; PACKED-GISEL-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
+; PACKED-GISEL-NEXT: v_xor_b32_e32 v5, 0x80000000, v5
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5]
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1
%arg2.gep = getelementptr inbounds float, ptr addrspace(3) %arg2, i32 2
@@ -517,11 +1913,51 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}shuffle_add_f32:
-; GFX900-COUNT-2: v_add_f32_e32
-; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0]{{$}}
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @shuffle_add_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
+; GFX900-LABEL: shuffle_add_f32:
+; GFX900: ; %bb.0: ; %bb
+; GFX900-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, s0
+; GFX900-NEXT: ds_read_b64 v[0:1], v2
+; GFX900-NEXT: ds_read_b64 v[2:3], v2 offset:8
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX900-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: shuffle_add_f32:
+; PACKED-SDAG: ; %bb.0: ; %bb
+; PACKED-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; PACKED-SDAG-NEXT: ds_read_b64 v[0:1], v2
+; PACKED-SDAG-NEXT: ds_read_b64 v[2:3], v2 offset:8
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: shuffle_add_f32:
+; PACKED-GISEL: ; %bb.0: ; %bb
+; PACKED-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; PACKED-GISEL-NEXT: ds_read_b64 v[0:1], v2
+; PACKED-GISEL-NEXT: ds_read_b64 v[2:3], v2 offset:8
+; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v4, v3
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5]
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT: s_endpgm
bb:
%vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 8
%lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1
@@ -532,11 +1968,61 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}shuffle_neg_add_f32:
-; GFX900-COUNT-2: v_sub_f32_e32
-; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @shuffle_neg_add_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
+; GFX900-LABEL: shuffle_neg_add_f32:
+; GFX900: ; %bb.0: ; %bb
+; GFX900-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, s0
+; GFX900-NEXT: ds_read_b64 v[0:1], v2
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: ds_read_b32 v3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: ds_read_b64 v[2:3], v2 offset:8
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX900-NEXT: v_sub_f32_e32 v0, v0, v3
+; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: shuffle_neg_add_f32:
+; PACKED-SDAG: ; %bb.0: ; %bb
+; PACKED-SDAG-NEXT: s_load_dword s0, s[4:5], 0x2c
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; PACKED-SDAG-NEXT: ds_read_b64 v[0:1], v2
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: ds_read_b32 v3, v0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: ds_read_b64 v[2:3], v2 offset:8
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: shuffle_neg_add_f32:
+; PACKED-GISEL: ; %bb.0: ; %bb
+; PACKED-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; PACKED-GISEL-NEXT: ds_read_b64 v[0:1], v2
+; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: ds_read_b32 v3, v0
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: ds_read_b64 v[2:3], v2 offset:8
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1]
+; PACKED-GISEL-NEXT: v_xor_b32_e32 v5, 0x80000000, v2
+; PACKED-GISEL-NEXT: v_xor_b32_e32 v4, 0x80000000, v3
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5]
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT: s_endpgm
bb:
%vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 8
%lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1
@@ -549,16 +2035,26 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}fadd_fadd_fsub_0:
-; GFX900: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0
-; GFX900: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-
-; PACKED-SDAG: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0
-; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 op_sel_hi:[1,0]{{$}}
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
+; GFX900-LABEL: fadd_fadd_fsub_0:
+; GFX900: ; %bb.0: ; %bb
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_add_f32_e64 v0, s1, 0
+; GFX900-NEXT: v_add_f32_e32 v1, 0, v0
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fadd_fadd_fsub_0:
+; PACKED-SDAG: ; %bb.0: ; %bb
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: v_add_f32_e64 v0, s1, 0
+; PACKED-SDAG-NEXT: v_add_f32_e32 v1, 0, v0
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; PACKED-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
bb:
%i12 = fadd <2 x float> zeroinitializer, %arg
%shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
@@ -569,16 +2065,36 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}fadd_fadd_fsub:
-; GFX900: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; GFX900: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-
-; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
-
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
-; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, ptr addrspace(1) %ptr) {
+; GFX900-LABEL: fadd_fadd_fsub:
+; GFX900: ; %bb.0: ; %bb
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s3
+; GFX900-NEXT: v_add_f32_e32 v0, s1, v0
+; GFX900-NEXT: v_mov_b32_e32 v1, s2
+; GFX900-NEXT: v_add_f32_e32 v3, s2, v0
+; GFX900-NEXT: v_sub_f32_e32 v0, s0, v1
+; GFX900-NEXT: v_subrev_f32_e32 v1, s3, v3
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fadd_fadd_fsub:
+; PACKED-SDAG: ; %bb.0: ; %bb
+; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s3
+; PACKED-SDAG-NEXT: v_add_f32_e32 v0, s1, v0
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], s[2:3], v[0:1] op_sel_hi:[1,0]
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, v0
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[2:3], s[2:3] neg_lo:[0,1] neg_hi:[0,1]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7]
+; PACKED-SDAG-NEXT: s_endpgm
bb:
%i12 = fadd <2 x float> %arg, %arg1
%shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
@@ -589,11 +2105,48 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}fadd_shuffle_v4:
-; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; PACKED-SDAG-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
-; PACKED-GISEL-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_shuffle_v4(ptr addrspace(1) %arg) {
+; GFX900-LABEL: fadd_shuffle_v4:
+; GFX900: ; %bb.0: ; %bb
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_add_f32_e32 v3, v3, v0
+; GFX900-NEXT: v_add_f32_e32 v2, v2, v0
+; GFX900-NEXT: v_add_f32_e32 v1, v1, v0
+; GFX900-NEXT: v_add_f32_e32 v0, v0, v0
+; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fadd_shuffle_v4:
+; PACKED-SDAG: ; %bb.0: ; %bb
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[0:1] op_sel_hi:[1,0]
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[0:1] op_sel_hi:[1,0]
+; PACKED-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fadd_shuffle_v4:
+; PACKED-GISEL: ; %bb.0: ; %bb
+; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v6, 4, v0
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v4, v0
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v5, v0
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5]
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5]
+; PACKED-GISEL-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1]
+; PACKED-GISEL-NEXT: s_endpgm
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
@@ -604,12 +2157,44 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}fneg_v2f32_vec:
-; GFX900-COUNT-2: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
-; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 neg_lo:[1,1] neg_hi:[1,1]{{$}}
-; PACKED-GISEL-COUNT-2: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
-; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], 1.0, v[{{[0-9:]+}}] op_sel_hi:[0,1]{{$}}
define amdgpu_kernel void @fneg_v2f32_vec(ptr addrspace(1) %a) {
+; GFX900-LABEL: fneg_v2f32_vec:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX900-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fneg_v2f32_vec:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-SDAG-NEXT: s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0 neg_lo:[1,1] neg_hi:[1,1]
+; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fneg_v2f32_vec:
+; PACKED-GISEL: ; %bb.0:
+; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; PACKED-GISEL-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1]
+; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -618,9 +2203,41 @@ define amdgpu_kernel void @fneg_v2f32_vec(ptr addrspace(1) %a) {
ret void
}
-; GCN-LABEL: {{^}}fneg_v2f32_scalar:
-; GCN-COUNT-2: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x) {
+; GFX900-LABEL: fneg_v2f32_scalar:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: s_xor_b32 s3, s3, 0x80000000
+; GFX900-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX900-NEXT: v_mov_b32_e32 v0, s2
+; GFX900-NEXT: v_mov_b32_e32 v1, s3
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; PACKED-SDAG-LABEL: fneg_v2f32_scalar:
+; PACKED-SDAG: ; %bb.0:
+; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT: s_xor_b32 s3, s3, 0x80000000
+; PACKED-SDAG-NEXT: s_xor_b32 s2, s2, 0x80000000
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: s_endpgm
+;
+; PACKED-GISEL-LABEL: fneg_v2f32_scalar:
+; PACKED-GISEL: ; %bb.0:
+; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT: s_xor_b32 s2, s2, 0x80000000
+; PACKED-GISEL-NEXT: s_xor_b32 s3, s3, 0x80000000
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, s[2:3] op_sel_hi:[0,1]
+; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT: s_endpgm
%fneg = fsub <2 x float> <float -0.0, float -0.0>, %x
store <2 x float> %fneg, ptr addrspace(1) %a, align 8
ret void
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index aad7a088551b2..50921879cd1f2 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -466,7 +466,7 @@ def have_cxx_shared_library():
print("could not exec llvm-readobj")
return False
- readobj_out = readobj_cmd.stdout.read().decode("ascii")
+ readobj_out = readobj_cmd.stdout.read().decode("utf-8")
readobj_cmd.wait()
regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)")
>From 3f178d21be053c26859291801a53291d2ceb4c9c Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Tue, 18 Mar 2025 09:41:53 +0800
Subject: [PATCH 09/19] fix lit
---
llvm/test/lit.cfg.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 50921879cd1f2..aad7a088551b2 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -466,7 +466,7 @@ def have_cxx_shared_library():
print("could not exec llvm-readobj")
return False
- readobj_out = readobj_cmd.stdout.read().decode("utf-8")
+ readobj_out = readobj_cmd.stdout.read().decode("ascii")
readobj_cmd.wait()
regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)")
>From 79b89922430478452a3571aaab1e1cf6ee075837 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Tue, 18 Mar 2025 13:37:00 +0800
Subject: [PATCH 10/19] fix comments
---
.../AMDGPU/AMDGPUInstructionSelector.cpp | 84 +++++++++----------
1 file changed, 42 insertions(+), 42 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 9722ba3c7b203..48d5b8b4cdfca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4362,9 +4362,10 @@ static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
static bool retOpStat(const MachineOperand *Op, SrcStatus Stat,
std::pair<const MachineOperand *, SrcStatus> &Curr) {
if ((Op->isReg() && !(Op->getReg().isPhysical())) || Op->isImm() ||
- Op->isCImm() || Op->isFPImm())
+ Op->isCImm() || Op->isFPImm()) {
Curr = {Op, Stat};
- return true;
+ return true;
+ }
return false;
}
@@ -4394,11 +4395,11 @@ static bool calcNextStatus(std::pair<const MachineOperand *, SrcStatus> &Curr,
const MachineInstr *MI = nullptr;
- if (!Curr.first->isDef()) {
+ if (!Curr.first->isDef())
MI = MRI.getVRegDef(Curr.first->getReg());
- } else {
+ else
MI = Curr.first->getParent();
- }
+
if (!MI)
return false;
@@ -4448,31 +4449,28 @@ static bool calcNextStatus(std::pair<const MachineOperand *, SrcStatus> &Curr,
SmallVector<std::pair<const MachineOperand *, SrcStatus>>
getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI,
- bool onlyLastSameOrNeg = false, int maxDepth = 6) {
- int depth = 0;
+ bool OnlyLastSameOrNeg = false, int MaxDepth = 6) {
+ int Depth = 0;
std::pair<const MachineOperand *, SrcStatus> Curr = {Op, IS_SAME};
SmallVector<std::pair<const MachineOperand *, SrcStatus>> Statlist;
- while (depth <= maxDepth && calcNextStatus(Curr, MRI)) {
- depth++;
- if ((onlyLastSameOrNeg &&
- (Curr.second != IS_SAME && Curr.second != IS_NEG))) {
+ while (Depth <= MaxDepth && calcNextStatus(Curr, MRI)) {
+ Depth++;
+ if ((OnlyLastSameOrNeg &&
+ (Curr.second != IS_SAME && Curr.second != IS_NEG)))
break;
- } else if (!onlyLastSameOrNeg) {
+
+ if (!OnlyLastSameOrNeg)
Statlist.push_back(Curr);
- }
}
- if (onlyLastSameOrNeg)
+ if (OnlyLastSameOrNeg)
Statlist.push_back(Curr);
return Statlist;
}
static bool isInlinableConstant(const MachineOperand &Op,
const SIInstrInfo &TII) {
- if (Op.isFPImm())
- return TII.isInlineConstant(Op.getFPImm()->getValueAPF());
-
- return false;
+ return Op.isFPImm() && TII.isInlineConstant(Op.getFPImm()->getValueAPF());
}
static bool isSameBitWidth(const MachineOperand *Op1, const MachineOperand *Op2,
@@ -4492,10 +4490,10 @@ static bool isSameOperand(const MachineOperand *Op1,
return Op1->isIdenticalTo(*Op2);
}
-static bool validToPack(SrcStatus HiStat, SrcStatus LoStat, unsigned int &Mods,
- const MachineOperand *NewOp,
- const MachineOperand *RootOp, const SIInstrInfo &TII,
- const MachineRegisterInfo &MRI) {
+static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat,
+ unsigned int &Mods, const MachineOperand *NewOp,
+ const MachineOperand *RootOp, const SIInstrInfo &TII,
+ const MachineRegisterInfo &MRI) {
if (NewOp->isReg()) {
if (isSameBitWidth(NewOp, RootOp, MRI)) {
// IS_LOWER_HALF remain 0
@@ -4557,29 +4555,28 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op,
return {Op, Mods};
}
- SmallVector<std::pair<const MachineOperand *, SrcStatus>> Statlist_Hi;
- Statlist_Hi = getSrcStats(&MI->getOperand(2), MRI);
+ SmallVector<std::pair<const MachineOperand *, SrcStatus>> StatlistHi;
+ StatlistHi = getSrcStats(&MI->getOperand(2), MRI);
- if (Statlist_Hi.size() == 0) {
+ if (StatlistHi.size() == 0) {
Mods |= SISrcMods::OP_SEL_1;
return {Op, Mods};
}
- SmallVector<std::pair<const MachineOperand *, SrcStatus>> Statlist_Lo;
- Statlist_Lo = getSrcStats(&MI->getOperand(1), MRI);
+ SmallVector<std::pair<const MachineOperand *, SrcStatus>> StatlistLo;
+ StatlistLo = getSrcStats(&MI->getOperand(1), MRI);
- if (Statlist_Lo.size() == 0) {
+ if (StatlistLo.size() == 0) {
Mods |= SISrcMods::OP_SEL_1;
return {Op, Mods};
}
- for (int i = Statlist_Hi.size() - 1; i >= 0; i--) {
- for (int j = Statlist_Lo.size() - 1; j >= 0; j--) {
- if (isSameOperand(Statlist_Hi[i].first, Statlist_Lo[j].first)) {
- if (validToPack(Statlist_Hi[i].second, Statlist_Lo[j].second, Mods,
- Statlist_Hi[i].first, RootOp, TII, MRI))
- return {Statlist_Hi[i].first, Mods};
- }
+ for (int i = StatlistHi.size() - 1; i >= 0; i--) {
+ for (int j = StatlistLo.size() - 1; j >= 0; j--) {
+ if (isSameOperand(StatlistHi[i].first, StatlistLo[j].first) &&
+ isValidToPack(StatlistHi[i].second, StatlistLo[j].second, Mods,
+ StatlistHi[i].first, RootOp, TII, MRI))
+ return {StatlistHi[i].first, Mods};
}
}
// Packed instructions do not have abs modifiers.
@@ -4596,13 +4593,15 @@ int64_t getAllKindImm(const MachineOperand *Op) {
return Op->getCImm()->getSExtValue();
case MachineOperand::MachineOperandType::MO_FPImmediate:
return Op->getFPImm()->getValueAPF().bitcastToAPInt().getSExtValue();
+ default:
+ llvm_unreachable("not an imm type");
}
- llvm_unreachable("not an imm type");
}
-bool checkRB(const MachineOperand *Op, int RBNo,
- const AMDGPURegisterBankInfo &RBI, const MachineRegisterInfo &MRI,
- const TargetRegisterInfo &TRI) {
+static bool checkRB(const MachineOperand *Op, unsigned int RBNo,
+ const AMDGPURegisterBankInfo &RBI,
+ const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) {
const RegisterBank *RB = RBI.getRegBank(Op->getReg(), MRI, TRI);
return RB->getID() == RBNo;
}
@@ -4619,17 +4618,18 @@ getVReg(const MachineOperand *NewOp, const MachineOperand *RootOp,
MachineInstr *MI = MRI.getVRegDef(RootOp->getReg());
if (MI->getOpcode() == AMDGPU::COPY &&
- isSameOperand(NewOp, &MI->getOperand(1)))
+ isSameOperand(NewOp, &MI->getOperand(1))) {
// RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp
return RootOp;
+ }
MachineBasicBlock *BB = MI->getParent();
const TargetRegisterClass *DstRC =
TRI.getConstrainedRegClassForOperand(*RootOp, MRI);
- Register dstReg = MRI.createVirtualRegister(DstRC);
+ Register DstReg = MRI.createVirtualRegister(DstRC);
MachineInstrBuilder MIB =
- BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), dstReg)
+ BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
.addReg(NewOp->getReg());
// only accept VGPR
>From 136da478585e4963f081a991b5dcfab55c884879 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Tue, 18 Mar 2025 17:20:56 +0800
Subject: [PATCH 11/19] fix comments
---
.../AMDGPU/AMDGPUInstructionSelector.cpp | 95 ++++++++++---------
1 file changed, 49 insertions(+), 46 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 48d5b8b4cdfca..e63abe667842f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4310,7 +4310,7 @@ AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
}};
}
-enum SrcStatus {
+enum class SrcStatus {
IS_SAME,
IS_UPPER_HALF,
IS_LOWER_HALF,
@@ -4372,18 +4372,18 @@ static bool retOpStat(const MachineOperand *Op, SrcStatus Stat,
SrcStatus getNegStatus(SrcStatus S) {
switch (S) {
- case IS_SAME:
- return IS_NEG;
- case IS_UPPER_HALF:
- return IS_UPPER_HALF_NEG;
- case IS_LOWER_HALF:
- return IS_LOWER_HALF_NEG;
- case IS_NEG:
- return IS_SAME;
- case IS_UPPER_HALF_NEG:
- return IS_UPPER_HALF;
- case IS_LOWER_HALF_NEG:
- return IS_LOWER_HALF;
+ case SrcStatus::IS_SAME:
+ return SrcStatus::IS_NEG;
+ case SrcStatus::IS_UPPER_HALF:
+ return SrcStatus::IS_UPPER_HALF_NEG;
+ case SrcStatus::IS_LOWER_HALF:
+ return SrcStatus::IS_LOWER_HALF_NEG;
+ case SrcStatus::IS_NEG:
+ return SrcStatus::IS_SAME;
+ case SrcStatus::IS_UPPER_HALF_NEG:
+ return SrcStatus::IS_UPPER_HALF;
+ case SrcStatus::IS_LOWER_HALF_NEG:
+ return SrcStatus::IS_LOWER_HALF;
}
llvm_unreachable("unexpected SrcStatus");
}
@@ -4405,7 +4405,7 @@ static bool calcNextStatus(std::pair<const MachineOperand *, SrcStatus> &Curr,
unsigned Opc = MI->getOpcode();
- // Handle general Opc cases
+ // Handle general Opc cases.
switch (Opc) {
case AMDGPU::G_BITCAST:
case AMDGPU::G_CONSTANT:
@@ -4413,35 +4413,38 @@ static bool calcNextStatus(std::pair<const MachineOperand *, SrcStatus> &Curr,
case AMDGPU::COPY:
return retOpStat(&MI->getOperand(1), Curr.second, Curr);
case AMDGPU::G_FNEG:
- // XXXX + 3 = XXXX_NEG, (XXXX_NEG + 3) mod 3 = XXXX
return retOpStat(&MI->getOperand(1), getNegStatus(Curr.second), Curr);
+ default:
+ break;
}
- // Calc next Stat from current Stat
+ // Calc next Stat from current Stat.
switch (Curr.second) {
- case IS_SAME:
+ case SrcStatus::IS_SAME:
if (isTruncHalf(MI, MRI))
- return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, Curr);
+ return retOpStat(&MI->getOperand(1), SrcStatus::IS_LOWER_HALF, Curr);
break;
- case IS_NEG:
+ case SrcStatus::IS_NEG:
if (isTruncHalf(MI, MRI))
- return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, Curr);
+ return retOpStat(&MI->getOperand(1), SrcStatus::IS_LOWER_HALF_NEG, Curr);
break;
- case IS_UPPER_HALF:
+ case SrcStatus::IS_UPPER_HALF:
if (isShlHalf(MI, MRI))
- return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, Curr);
+ return retOpStat(&MI->getOperand(1), SrcStatus::IS_LOWER_HALF, Curr);
break;
- case IS_LOWER_HALF:
+ case SrcStatus::IS_LOWER_HALF:
if (isLshrHalf(MI, MRI))
- return retOpStat(&MI->getOperand(1), IS_UPPER_HALF, Curr);
+ return retOpStat(&MI->getOperand(1), SrcStatus::IS_UPPER_HALF, Curr);
break;
- case IS_UPPER_HALF_NEG:
+ case SrcStatus::IS_UPPER_HALF_NEG:
if (isShlHalf(MI, MRI))
- return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, Curr);
+ return retOpStat(&MI->getOperand(1), SrcStatus::IS_LOWER_HALF_NEG, Curr);
break;
- case IS_LOWER_HALF_NEG:
+ case SrcStatus::IS_LOWER_HALF_NEG:
if (isLshrHalf(MI, MRI))
- return retOpStat(&MI->getOperand(1), IS_UPPER_HALF_NEG, Curr);
+ return retOpStat(&MI->getOperand(1), SrcStatus::IS_UPPER_HALF_NEG, Curr);
+ break;
+ default:
break;
}
return false;
@@ -4451,13 +4454,13 @@ SmallVector<std::pair<const MachineOperand *, SrcStatus>>
getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI,
bool OnlyLastSameOrNeg = false, int MaxDepth = 6) {
int Depth = 0;
- std::pair<const MachineOperand *, SrcStatus> Curr = {Op, IS_SAME};
+ std::pair<const MachineOperand *, SrcStatus> Curr = {Op, SrcStatus::IS_SAME};
SmallVector<std::pair<const MachineOperand *, SrcStatus>> Statlist;
while (Depth <= MaxDepth && calcNextStatus(Curr, MRI)) {
Depth++;
- if ((OnlyLastSameOrNeg &&
- (Curr.second != IS_SAME && Curr.second != IS_NEG)))
+ if ((OnlyLastSameOrNeg && (Curr.second != SrcStatus::IS_SAME &&
+ Curr.second != SrcStatus::IS_NEG)))
break;
if (!OnlyLastSameOrNeg)
@@ -4496,35 +4499,35 @@ static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat,
const MachineRegisterInfo &MRI) {
if (NewOp->isReg()) {
if (isSameBitWidth(NewOp, RootOp, MRI)) {
- // IS_LOWER_HALF remain 0
- if (HiStat == IS_UPPER_HALF_NEG) {
+ // SrcStatus::IS_LOWER_HALF remain 0.
+ if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
Mods ^= SISrcMods::NEG_HI;
Mods |= SISrcMods::OP_SEL_1;
- } else if (HiStat == IS_UPPER_HALF) {
+ } else if (HiStat == SrcStatus::IS_UPPER_HALF) {
Mods |= SISrcMods::OP_SEL_1;
- } else if (HiStat == IS_LOWER_HALF_NEG) {
+ } else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG) {
Mods ^= SISrcMods::NEG_HI;
}
- if (LoStat == IS_UPPER_HALF_NEG) {
+ if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
Mods ^= SISrcMods::NEG;
Mods |= SISrcMods::OP_SEL_0;
- } else if (LoStat == IS_UPPER_HALF) {
+ } else if (LoStat == SrcStatus::IS_UPPER_HALF) {
Mods |= SISrcMods::OP_SEL_0;
- } else if (LoStat == IS_UPPER_HALF_NEG) {
+    } else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG) {
Mods |= SISrcMods::NEG;
}
return true;
}
} else {
- if ((HiStat == IS_SAME || HiStat == IS_NEG) &&
- (LoStat == IS_SAME || LoStat == IS_NEG) &&
+ if ((HiStat == SrcStatus::IS_SAME || HiStat == SrcStatus::IS_NEG) &&
+ (LoStat == SrcStatus::IS_SAME || LoStat == SrcStatus::IS_NEG) &&
isInlinableConstant(*NewOp, TII)) {
- if (HiStat == IS_NEG)
+ if (HiStat == SrcStatus::IS_NEG)
Mods ^= SISrcMods::NEG_HI;
- if (LoStat == IS_NEG)
+ if (LoStat == SrcStatus::IS_NEG)
Mods ^= SISrcMods::NEG;
// opsel = opsel_hi = 0, since the upper half and lower half both
- // the same as the target inlinable constant
+ // the same as the target inlinable constant.
return true;
}
}
@@ -4543,7 +4546,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op,
Mods |= SISrcMods::OP_SEL_1;
return {Op, Mods};
}
- if (Stat.second == IS_NEG)
+ if (Stat.second == SrcStatus::IS_NEG)
Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
Op = Stat.first;
@@ -4611,7 +4614,7 @@ getVReg(const MachineOperand *NewOp, const MachineOperand *RootOp,
const AMDGPURegisterBankInfo &RBI, MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI, const SIInstrInfo &TII) {
// RootOp can only be VGPR or SGPR (some hand written cases such as
- // inst-select-ashr.v2s16.mir::ashr_v2s16_vs)
+ // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
if (checkRB(RootOp, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
checkRB(NewOp, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
return NewOp;
@@ -4619,7 +4622,7 @@ getVReg(const MachineOperand *NewOp, const MachineOperand *RootOp,
MachineInstr *MI = MRI.getVRegDef(RootOp->getReg());
if (MI->getOpcode() == AMDGPU::COPY &&
isSameOperand(NewOp, &MI->getOperand(1))) {
- // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp
+ // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
return RootOp;
}
>From fc7c927a4b56afb1cc3a1431ba87cbd185242aee Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Mon, 24 Mar 2025 10:57:20 +0800
Subject: [PATCH 12/19] Block for root type other than 2 x Type
---
.../AMDGPU/AMDGPUInstructionSelector.cpp | 206 ++++++++++++++++--
.../AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll | 3 +-
.../AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll | 12 +-
.../AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll | 6 +-
.../AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll | 12 +-
.../AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll | 18 +-
.../AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll | 12 +-
.../AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll | 12 +-
8 files changed, 237 insertions(+), 44 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 0a5d0dede02e0..cb34986d8c77e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4318,9 +4318,12 @@ enum class SrcStatus {
IS_SAME,
IS_UPPER_HALF,
IS_LOWER_HALF,
- IS_NEG,
+ IS_HI_NEG,
+ IS_LO_NEG,
+ IS_BOTH_NEG,
IS_UPPER_HALF_NEG,
- IS_LOWER_HALF_NEG
+ IS_LOWER_HALF_NEG,
+ INVALID
};
static bool isTruncHalf(const MachineInstr *MI,
@@ -4365,8 +4368,9 @@ static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
static bool retOpStat(const MachineOperand *Op, SrcStatus Stat,
std::pair<const MachineOperand *, SrcStatus> &Curr) {
- if ((Op->isReg() && !(Op->getReg().isPhysical())) || Op->isImm() ||
- Op->isCImm() || Op->isFPImm()) {
+ if (Stat != SrcStatus::INVALID &&
+ ((Op->isReg() && !(Op->getReg().isPhysical())) || Op->isImm() ||
+ Op->isCImm() || Op->isFPImm())) {
Curr = {Op, Stat};
return true;
}
@@ -4374,20 +4378,164 @@ static bool retOpStat(const MachineOperand *Op, SrcStatus Stat,
return false;
}
-SrcStatus getNegStatus(SrcStatus S) {
+// 0 = Vector of 2,
+// 1 = Scalar
+// -1 = neither of them
+static int isVectorOfTwoOrScalar(const MachineOperand *Op,
+ const MachineRegisterInfo &MRI) {
+ if (!Op->isReg() || Op->getReg().isPhysical())
+ return -1;
+ LLT OpTy = MRI.getType(Op->getReg());
+ if (OpTy.isScalar())
+ return 1;
+ if (OpTy.isVector() && OpTy.getNumElements() == 2)
+ return 0;
+ return -1;
+}
+
+SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S,
+ const MachineRegisterInfo &MRI) {
+ int NegType = isVectorOfTwoOrScalar(Op, MRI);
+ if (NegType != 0 && NegType != 1)
+ return SrcStatus::INVALID;
+
switch (S) {
case SrcStatus::IS_SAME:
- return SrcStatus::IS_NEG;
+ if (NegType == 0) {
+ // Vector of 2:
+ // [SrcHi, SrcLo] = [CurrHi, CurrLo]
+ // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
+ // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
+ // [SrcHi, SrcLo] = [-OpHi, -OpLo]
+ return SrcStatus::IS_BOTH_NEG;
+ } else if (NegType == 1) {
+ // Scalar:
+ // [SrcHi, SrcLo] = [CurrHi, CurrLo]
+ // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
+ // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
+ // [SrcHi, SrcLo] = [-OpHi, OpLo]
+ return SrcStatus::IS_HI_NEG;
+ }
+ break;
+ case SrcStatus::IS_HI_NEG:
+ if (NegType == 0) {
+ // Vector of 2:
+ // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
+ // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
+ // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
+ // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
+ return SrcStatus::IS_LO_NEG;
+ } else if (NegType == 1) {
+ // Scalar:
+ // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
+ // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
+ // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
+ // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
+ return SrcStatus::IS_SAME;
+ }
+ break;
+ case SrcStatus::IS_LO_NEG:
+ if (NegType == 0) {
+ // Vector of 2:
+ // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
+ // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
+ // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
+ // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
+ return SrcStatus::IS_HI_NEG;
+ } else if (NegType == 1) {
+ // Scalar:
+ // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
+ // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
+ // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
+ // [SrcHi, SrcLo] = [-OpHi, -OpLo]
+ return SrcStatus::IS_BOTH_NEG;
+ }
+ break;
+ case SrcStatus::IS_BOTH_NEG:
+ if (NegType == 0) {
+ // Vector of 2:
+ // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
+ // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
+ // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
+ // [SrcHi, SrcLo] = [OpHi, OpLo]
+ return SrcStatus::IS_SAME;
+ } else if (NegType == 1) {
+ // Scalar:
+ // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
+ // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
+ // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
+ // [SrcHi, SrcLo] = [OpHi, -OpLo]
+ return SrcStatus::IS_LO_NEG;
+ }
+ break;
case SrcStatus::IS_UPPER_HALF:
+ // Vector of 2:
+ // Src = CurrUpper
+ // Curr = [CurrUpper, CurrLower]
+ // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
+ // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
+ // Src = -OpUpper
+ //
+ // Scalar:
+ // Src = CurrUpper
+ // Curr = [CurrUpper, CurrLower]
+ // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
+ // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
+ // Src = -OpUpper
return SrcStatus::IS_UPPER_HALF_NEG;
case SrcStatus::IS_LOWER_HALF:
- return SrcStatus::IS_LOWER_HALF_NEG;
- case SrcStatus::IS_NEG:
- return SrcStatus::IS_SAME;
+ if (NegType == 0) {
+ // Vector of 2:
+ // Src = CurrLower
+ // Curr = [CurrUpper, CurrLower]
+ // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
+ // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
+ // Src = -OpLower
+ return SrcStatus::IS_LOWER_HALF_NEG;
+ } else if (NegType == 1) {
+ // Scalar:
+ // Src = CurrLower
+ // Curr = [CurrUpper, CurrLower]
+ // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
+ // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
+ // Src = OpLower
+ return SrcStatus::IS_LOWER_HALF;
+ }
+ break;
case SrcStatus::IS_UPPER_HALF_NEG:
+ // Vector of 2:
+ // Src = -CurrUpper
+ // Curr = [CurrUpper, CurrLower]
+ // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
+ // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
+ // Src = -(-OpUpper) = OpUpper
+ //
+ // Scalar:
+ // Src = -CurrUpper
+ // Curr = [CurrUpper, CurrLower]
+ // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
+ // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
+ // Src = -(-OpUpper) = OpUpper
return SrcStatus::IS_UPPER_HALF;
case SrcStatus::IS_LOWER_HALF_NEG:
- return SrcStatus::IS_LOWER_HALF;
+ if (NegType == 0) {
+ // Vector of 2:
+ // Src = -CurrLower
+ // Curr = [CurrUpper, CurrLower]
+ // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
+ // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
+ // Src = -(-OpLower) = OpLower
+ return SrcStatus::IS_LOWER_HALF_NEG;
+ } else if (NegType == 1) {
+ // Scalar:
+ // Src = -CurrLower
+ // Curr = [CurrUpper, CurrLower]
+ // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
+ // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
+ // Src = -OpLower
+ return SrcStatus::IS_LOWER_HALF;
+ }
+ break;
}
llvm_unreachable("unexpected SrcStatus");
}
@@ -4417,7 +4565,8 @@ static bool calcNextStatus(std::pair<const MachineOperand *, SrcStatus> &Curr,
case AMDGPU::COPY:
return retOpStat(&MI->getOperand(1), Curr.second, Curr);
case AMDGPU::G_FNEG:
- return retOpStat(&MI->getOperand(1), getNegStatus(Curr.second), Curr);
+ return retOpStat(&MI->getOperand(1),
+ getNegStatus(Curr.first, Curr.second, MRI), Curr);
default:
break;
}
@@ -4428,9 +4577,16 @@ static bool calcNextStatus(std::pair<const MachineOperand *, SrcStatus> &Curr,
if (isTruncHalf(MI, MRI))
return retOpStat(&MI->getOperand(1), SrcStatus::IS_LOWER_HALF, Curr);
break;
- case SrcStatus::IS_NEG:
- if (isTruncHalf(MI, MRI))
+ case SrcStatus::IS_HI_NEG:
+ if (isTruncHalf(MI, MRI)) {
+ // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
+ // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
+ // = [OpLowerHi, OpLowerLo]
+ // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
+ // = [-OpLowerHi, OpLowerLo]
+ // = -OpLower
return retOpStat(&MI->getOperand(1), SrcStatus::IS_LOWER_HALF_NEG, Curr);
+ }
break;
case SrcStatus::IS_UPPER_HALF:
if (isShlHalf(MI, MRI))
@@ -4464,7 +4620,9 @@ getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI,
while (Depth <= MaxDepth && calcNextStatus(Curr, MRI)) {
Depth++;
if ((OnlyLastSameOrNeg && (Curr.second != SrcStatus::IS_SAME &&
- Curr.second != SrcStatus::IS_NEG)))
+ Curr.second != SrcStatus::IS_HI_NEG &&
+ Curr.second != SrcStatus::IS_LO_NEG &&
+ Curr.second != SrcStatus::IS_BOTH_NEG)))
break;
if (!OnlyLastSameOrNeg)
@@ -4523,12 +4681,12 @@ static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat,
return true;
}
} else {
- if ((HiStat == SrcStatus::IS_SAME || HiStat == SrcStatus::IS_NEG) &&
- (LoStat == SrcStatus::IS_SAME || LoStat == SrcStatus::IS_NEG) &&
+ if ((HiStat == SrcStatus::IS_SAME || HiStat == SrcStatus::IS_HI_NEG) &&
+ (LoStat == SrcStatus::IS_SAME || LoStat == SrcStatus::IS_HI_NEG) &&
isInlinableConstant(*NewOp, TII)) {
- if (HiStat == SrcStatus::IS_NEG)
+ if (HiStat == SrcStatus::IS_HI_NEG)
Mods ^= SISrcMods::NEG_HI;
- if (LoStat == SrcStatus::IS_NEG)
+ if (LoStat == SrcStatus::IS_HI_NEG)
Mods ^= SISrcMods::NEG;
// opsel = opsel_hi = 0, since the upper half and lower half both
// the same as the target inlinable constant.
@@ -4543,6 +4701,12 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op,
const MachineRegisterInfo &MRI,
bool IsDOT) const {
unsigned Mods = 0;
+  // No modification if the Root type is not of the form <2 x Type>.
+ if (isVectorOfTwoOrScalar(Op, MRI) != 0) {
+ Mods |= SISrcMods::OP_SEL_1;
+ return {Op, Mods};
+ }
+
const MachineOperand *RootOp = Op;
std::pair<const MachineOperand *, SrcStatus> Stat =
getSrcStats(Op, MRI, true)[0];
@@ -4550,8 +4714,12 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op,
Mods |= SISrcMods::OP_SEL_1;
return {Op, Mods};
}
- if (Stat.second == SrcStatus::IS_NEG)
+ if (Stat.second == SrcStatus::IS_BOTH_NEG)
Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
+ else if (Stat.second == SrcStatus::IS_HI_NEG)
+ Mods ^= SISrcMods::NEG_HI;
+ else if (Stat.second == SrcStatus::IS_LO_NEG)
+ Mods ^= SISrcMods::NEG;
Op = Stat.first;
MachineInstr *MI = MRI.getVRegDef(Op->getReg());
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
index 2243c57cf37ac..1d9514c58ab9c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
@@ -68,7 +68,8 @@ define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) {
; GFX906-LABEL: v_fdot2_neg_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
%neg.c = fneg float %c
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
index edb09e0ad646b..8f0ae8c47098a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
@@ -248,7 +248,8 @@ define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
; GFX906-LABEL: v_sdot2_fnegf32_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_fnegf32_c:
@@ -262,7 +263,8 @@ define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
; GFX10-LABEL: v_sdot2_fnegf32_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX10-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.c = fneg float %c
%cast.neg.c = bitcast float %neg.c to i32
@@ -274,7 +276,8 @@ define i32 @v_sdot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
; GFX906-LABEL: v_sdot2_fnegv2f16_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX906-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_fnegv2f16_c:
@@ -288,7 +291,8 @@ define i32 @v_sdot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
; GFX10-LABEL: v_sdot2_fnegv2f16_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX10-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.c = fneg <2 x half> %c
%cast.neg.c = bitcast <2 x half> %neg.c to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
index d6ef48e25cafb..06560afee3c9a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
@@ -91,7 +91,8 @@ define i32 @v_sdot4_fnegf32_a(float %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_sdot4_fnegf32_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot4_fnegf32_a:
@@ -111,7 +112,8 @@ define i32 @v_sdot4_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_sdot4_fnegv2f16_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot4_fnegv2f16_a:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
index d2aa47df81cbe..0d729351f65a7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
@@ -47,13 +47,15 @@ define i32 @v_sdot8_fnegf32_a(float %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_sdot8_fnegf32_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot8_fnegf32_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX10-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg float %a
%cast.neg.a = bitcast float %neg.a to i32
@@ -65,13 +67,15 @@ define i32 @v_sdot8_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_sdot8_fnegv2f16_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot8_fnegv2f16_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg <2 x half> %a
%cast.neg.a = bitcast <2 x half> %neg.a to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
index b7cf49bbfab20..287a009ca1405 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
@@ -235,19 +235,22 @@ define i32 @v_udot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
; GFX906-LABEL: v_udot2_fnegf32_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_fnegf32_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX908-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_fnegf32_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX10-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.c = fneg float %c
%cast.neg.c = bitcast float %neg.c to i32
@@ -259,19 +262,22 @@ define i32 @v_udot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
; GFX906-LABEL: v_udot2_fnegv2f16_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX906-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_fnegv2f16_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX908-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_fnegv2f16_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX10-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.c = fneg <2 x half> %c
%cast.neg.c = bitcast <2 x half> %neg.c to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
index 7ad0404942feb..b14af9e043e09 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
@@ -112,13 +112,15 @@ define i32 @v_udot4_fnegf32_a(float %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot4_fnegf32_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_udot4_fnegf32_a:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX10PLUS-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX10PLUS-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg float %a
%cast.neg.a = bitcast float %neg.a to i32
@@ -130,13 +132,15 @@ define i32 @v_udot4_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot4_fnegv2f16_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_udot4_fnegv2f16_a:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX10PLUS-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX10PLUS-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg <2 x half> %a
%cast.neg.a = bitcast <2 x half> %neg.a to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
index 52763bbc24e40..a664c8aa508ef 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
@@ -48,13 +48,15 @@ define i32 @v_udot8_fnegf32_a(float %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot8_fnegf32_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_udot8_fnegf32_a:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX10PLUS-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX10PLUS-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg float %a
%cast.neg.a = bitcast float %neg.a to i32
@@ -66,13 +68,15 @@ define i32 @v_udot8_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot8_fnegv2f16_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_udot8_fnegv2f16_a:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX10PLUS-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX10PLUS-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%neg.a = fneg <2 x half> %a
%cast.neg.a = bitcast <2 x half> %neg.a to i32
>From bc51bf48144423540958effc4a7487d735787add Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Mon, 24 Mar 2025 18:03:43 +0800
Subject: [PATCH 13/19] fix comments
---
.../AMDGPU/AMDGPUInstructionSelector.cpp | 85 +++++++--
.../CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll | 163 ++++++++++++++++++
2 files changed, 229 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index cb34986d8c77e..f43203f437484 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4318,12 +4318,16 @@ enum class SrcStatus {
IS_SAME,
IS_UPPER_HALF,
IS_LOWER_HALF,
+ IS_UPPER_HALF_NEG,
+ IS_LOWER_HALF_NEG,
IS_HI_NEG,
IS_LO_NEG,
IS_BOTH_NEG,
- IS_UPPER_HALF_NEG,
- IS_LOWER_HALF_NEG,
- INVALID
+ INVALID,
+ NEG_START = IS_UPPER_HALF_NEG,
+ NEG_END = IS_BOTH_NEG,
+ HALF_START = IS_UPPER_HALF,
+ HALF_END = IS_LOWER_HALF_NEG
};
static bool isTruncHalf(const MachineInstr *MI,
@@ -4525,7 +4529,7 @@ SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S,
// [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
// [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
// Src = -(-OpLower) = OpLower
- return SrcStatus::IS_LOWER_HALF_NEG;
+ return SrcStatus::IS_LOWER_HALF;
} else if (NegType == 1) {
// Scalar:
// Src = -CurrLower
@@ -4533,7 +4537,7 @@ SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S,
// [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
// [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
// Src = -OpLower
- return SrcStatus::IS_LOWER_HALF;
+ return SrcStatus::IS_LOWER_HALF_NEG;
}
break;
}
@@ -4610,26 +4614,69 @@ static bool calcNextStatus(std::pair<const MachineOperand *, SrcStatus> &Curr,
return false;
}
+struct {
+ unsigned int HasNeg : 1;
+ unsigned int HasOpsel : 1;
+} StatOptions;
+
+static bool checkOptions(SrcStatus Stat) {
+ if (!StatOptions.HasNeg &&
+ (Stat >= SrcStatus::NEG_START || Stat <= SrcStatus::NEG_END)) {
+ return false;
+ }
+ if (!StatOptions.HasOpsel &&
+ (Stat >= SrcStatus::HALF_START || Stat >= SrcStatus::HALF_END)) {
+ return false;
+ }
+ return true;
+}
+
+void setUpOptions(const MachineOperand *RootOp,
+ const MachineRegisterInfo &MRI) {
+ const MachineInstr *MI = RootOp->getParent();
+ unsigned Opc = MI->getOpcode();
+
+ if (Opc < TargetOpcode::GENERIC_OP_END) {
+ // Keep same for gerneric op
+ StatOptions.HasNeg = 1;
+ } else if (Opc == TargetOpcode::G_INTRINSIC) {
+ Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
+ // Only float point intrinsic has neg & neg_hi bits
+ if (IntrinsicID == Intrinsic::amdgcn_fdot2)
+ StatOptions.HasNeg = 1;
+ else
+ StatOptions.HasNeg = 0;
+ } else
+ StatOptions.HasNeg = 0;
+
+ // Assume all complex pattern of VOP3P has opsel
+ StatOptions.HasOpsel = 1;
+}
+
SmallVector<std::pair<const MachineOperand *, SrcStatus>>
getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI,
bool OnlyLastSameOrNeg = false, int MaxDepth = 6) {
int Depth = 0;
std::pair<const MachineOperand *, SrcStatus> Curr = {Op, SrcStatus::IS_SAME};
- SmallVector<std::pair<const MachineOperand *, SrcStatus>> Statlist;
+ SmallVector<std::pair<const MachineOperand *, SrcStatus>, 4> Statlist;
+
+ if (OnlyLastSameOrNeg)
+ Statlist.push_back(Curr);
while (Depth <= MaxDepth && calcNextStatus(Curr, MRI)) {
Depth++;
- if ((OnlyLastSameOrNeg && (Curr.second != SrcStatus::IS_SAME &&
- Curr.second != SrcStatus::IS_HI_NEG &&
- Curr.second != SrcStatus::IS_LO_NEG &&
- Curr.second != SrcStatus::IS_BOTH_NEG)))
- break;
-
- if (!OnlyLastSameOrNeg)
- Statlist.push_back(Curr);
+ if (checkOptions(Curr.second)) {
+ if (OnlyLastSameOrNeg && (Curr.second == SrcStatus::IS_SAME ||
+ Curr.second == SrcStatus::IS_HI_NEG ||
+ Curr.second == SrcStatus::IS_LO_NEG ||
+ Curr.second == SrcStatus::IS_BOTH_NEG))
+ Statlist[0] = Curr;
+
+ if (!OnlyLastSameOrNeg)
+ Statlist.push_back(Curr);
+ }
}
- if (OnlyLastSameOrNeg)
- Statlist.push_back(Curr);
+
return Statlist;
}
@@ -4648,9 +4695,7 @@ static bool isSameBitWidth(const MachineOperand *Op1, const MachineOperand *Op2,
static bool isSameOperand(const MachineOperand *Op1,
const MachineOperand *Op2) {
if (Op1->isReg()) {
- if (Op2->isReg())
- return Op1->getReg() == Op2->getReg();
- return false;
+ return Op2->isReg() && Op1->getReg() == Op2->getReg();
}
return Op1->isIdenticalTo(*Op2);
}
@@ -4707,6 +4752,8 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op,
return {Op, Mods};
}
+ setUpOptions(Op, MRI);
+
const MachineOperand *RootOp = Op;
std::pair<const MachineOperand *, SrcStatus> Stat =
getSrcStats(Op, MRI, true)[0];
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
index 543f8e413abd8..9f0641b715d36 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
@@ -106,6 +106,169 @@ define <2 x half> @v_fmul_v2f16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b)
ret <2 x half> %mul
}
+define <2 x half> @v_fmul_v2f16_partial_neg(<2 x half> %a, <2 x half> %b) {
+; GFX9-LABEL: v_fmul_v2f16_partial_neg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v1, v0 neg_hi:[1,0]
+; GFX9-NEXT: v_pk_mul_f16 v0, v1, v0 neg_lo:[1,0]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v2f16_partial_neg:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v1
+; GFX8-NEXT: v_mul_f16_e32 v3, v1, v0
+; GFX8-NEXT: v_mul_f16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX8-NEXT: v_mul_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v2f16_partial_neg:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_mul_f16 v0, v1, v0 neg_hi:[1,0]
+; GFX10-NEXT: v_pk_mul_f16 v0, v1, v0 neg_lo:[1,0]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %b1 = bitcast <2 x half> %b to float
+ %b2 = fneg float %b1
+ %b3 = bitcast float %b2 to <2 x half>
+ %b4 = fneg <2 x half> %b3
+ %mul1 = fmul <2 x half> %b3, %a
+ %mul2 = fmul <2 x half> %b4, %mul1
+ ret <2 x half> %mul2
+}
+
+define <2 x half> @fmul_v2_half_neg_hi(<2 x half> %a, <2 x half> %b) #0 {
+; GFX9-LABEL: fmul_v2_half_neg_hi:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 neg_hi:[0,1]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: fmul_v2_half_neg_hi:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
+; GFX8-NEXT: v_mul_f16_e32 v1, v0, v1
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_v2_half_neg_hi:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 neg_hi:[0,1]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %b1 = bitcast <2 x half> %b to float
+ %b2 = fneg float %b1
+ %b3 = bitcast float %b2 to <2 x half>
+ %b4 = extractelement <2 x half> %b3, i64 1
+ %tmp = insertelement <2 x half> poison, half %b4, i64 0
+ %k = shufflevector <2 x half> %tmp, <2 x half> %b, <2 x i32> <i32 2, i32 0>
+ %mul = fmul <2 x half> %a, %k
+ ret <2 x half> %mul
+}
+
+define <2 x half> @fmul_v2_half_neg_hi1(<2 x half> %a, <2 x half> %b) #0 {
+; GFX9-LABEL: fmul_v2_half_neg_hi1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: fmul_v2_half_neg_hi1:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
+; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX8-NEXT: v_mul_f16_e32 v1, v0, v1
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_v2_half_neg_hi1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %b1 = bitcast <2 x half> %b to float
+ %b2 = fneg float %b1
+ %b3 = bitcast float %b2 to <2 x half>
+ %b4 = fneg <2 x half> %b3
+ %b5 = extractelement <2 x half> %b4, i64 1
+ %tmp = insertelement <2 x half> poison, half %b5, i64 0
+ %k = shufflevector <2 x half> %tmp, <2 x half> %b, <2 x i32> <i32 2, i32 0>
+ %mul = fmul <2 x half> %a, %k
+ ret <2 x half> %mul
+}
+
+define <2 x half> @fmul_v2_half_neg_lo(<2 x half> %a, <2 x half> %b) #0 {
+; GFX9-LABEL: fmul_v2_half_neg_lo:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 op_sel_hi:[1,0] neg_hi:[0,1]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: fmul_v2_half_neg_lo:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
+; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX8-NEXT: v_mul_f16_e32 v1, v0, v1
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_v2_half_neg_lo:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 op_sel_hi:[1,0] neg_hi:[0,1]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %b1 = bitcast <2 x half> %b to float
+ %b2 = fneg float %b1
+ %b3 = bitcast float %b2 to <2 x half>
+ %b4 = fneg <2 x half> %b3
+ %b5 = extractelement <2 x half> %b4, i64 0
+ %tmp = insertelement <2 x half> poison, half %b5, i64 0
+ %k = shufflevector <2 x half> %tmp, <2 x half> %b, <2 x i32> <i32 2, i32 0>
+ %mul = fmul <2 x half> %a, %k
+ ret <2 x half> %mul
+}
+
+define <2 x half> @fmul_v2_half_neg_lo1(<2 x half> %a, <2 x half> %b) #0 {
+; GFX9-LABEL: fmul_v2_half_neg_lo1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 op_sel_hi:[1,0]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: fmul_v2_half_neg_lo1:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
+; GFX8-NEXT: v_mul_f16_e32 v1, v0, v1
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_v2_half_neg_lo1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 op_sel_hi:[1,0]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %b1 = bitcast <2 x half> %b to float
+ %b2 = fneg float %b1
+ %b3 = bitcast float %b2 to <2 x half>
+ %b4 = extractelement <2 x half> %b3, i64 0
+ %tmp = insertelement <2 x half> poison, half %b4, i64 0
+ %k = shufflevector <2 x half> %tmp, <2 x half> %b, <2 x i32> <i32 2, i32 0>
+ %mul = fmul <2 x half> %a, %k
+ ret <2 x half> %mul
+}
+
define <3 x half> @v_fmul_v3f16(<3 x half> %a, <3 x half> %b) {
; GFX9-LABEL: v_fmul_v3f16:
; GFX9: ; %bb.0:
>From 47840d7832636f2d393d9d848ff86aaafd3a8dc7 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Wed, 26 Mar 2025 10:37:45 +0800
Subject: [PATCH 14/19] fix comments
---
.../AMDGPU/AMDGPUInstructionSelector.cpp | 194 ++++++++++--------
1 file changed, 106 insertions(+), 88 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index f43203f437484..c6bd40dc51fcb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4382,37 +4382,36 @@ static bool retOpStat(const MachineOperand *Op, SrcStatus Stat,
return false;
}
-// 0 = Vector of 2,
-// 1 = Scalar
-// -1 = non of them
-static int isVectorOfTwoOrScalar(const MachineOperand *Op,
- const MachineRegisterInfo &MRI) {
+enum class TypeClass { VECTOR_OF_TWO, SCALAR, NON_OF_LISTED };
+
+static TypeClass isVectorOfTwoOrScalar(const MachineOperand *Op,
+ const MachineRegisterInfo &MRI) {
if (!Op->isReg() || Op->getReg().isPhysical())
- return -1;
+ return TypeClass::NON_OF_LISTED;
LLT OpTy = MRI.getType(Op->getReg());
if (OpTy.isScalar())
- return 1;
+ return TypeClass::SCALAR;
if (OpTy.isVector() && OpTy.getNumElements() == 2)
- return 0;
- return -1;
+ return TypeClass::VECTOR_OF_TWO;
+ return TypeClass::NON_OF_LISTED;
}
SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S,
const MachineRegisterInfo &MRI) {
- int NegType = isVectorOfTwoOrScalar(Op, MRI);
- if (NegType != 0 && NegType != 1)
+ TypeClass NegType = isVectorOfTwoOrScalar(Op, MRI);
+ if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
return SrcStatus::INVALID;
switch (S) {
case SrcStatus::IS_SAME:
- if (NegType == 0) {
+ if (NegType == TypeClass::VECTOR_OF_TWO) {
// Vector of 2:
// [SrcHi, SrcLo] = [CurrHi, CurrLo]
// [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
// [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
// [SrcHi, SrcLo] = [-OpHi, -OpLo]
return SrcStatus::IS_BOTH_NEG;
- } else if (NegType == 1) {
+ } else if (NegType == TypeClass::SCALAR) {
// Scalar:
// [SrcHi, SrcLo] = [CurrHi, CurrLo]
// [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
@@ -4422,14 +4421,14 @@ SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S,
}
break;
case SrcStatus::IS_HI_NEG:
- if (NegType == 0) {
+ if (NegType == TypeClass::VECTOR_OF_TWO) {
// Vector of 2:
// [SrcHi, SrcLo] = [-CurrHi, CurrLo]
// [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
// [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
// [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
return SrcStatus::IS_LO_NEG;
- } else if (NegType == 1) {
+ } else if (NegType == TypeClass::SCALAR) {
// Scalar:
// [SrcHi, SrcLo] = [-CurrHi, CurrLo]
// [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
@@ -4439,14 +4438,14 @@ SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S,
}
break;
case SrcStatus::IS_LO_NEG:
- if (NegType == 0) {
+ if (NegType == TypeClass::VECTOR_OF_TWO) {
// Vector of 2:
// [SrcHi, SrcLo] = [CurrHi, -CurrLo]
// [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
// [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
// [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
return SrcStatus::IS_HI_NEG;
- } else if (NegType == 1) {
+ } else if (NegType == TypeClass::SCALAR) {
// Scalar:
// [SrcHi, SrcLo] = [CurrHi, -CurrLo]
// [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
@@ -4456,14 +4455,14 @@ SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S,
}
break;
case SrcStatus::IS_BOTH_NEG:
- if (NegType == 0) {
+ if (NegType == TypeClass::VECTOR_OF_TWO) {
// Vector of 2:
// [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
// [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
// [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
// [SrcHi, SrcLo] = [OpHi, OpLo]
return SrcStatus::IS_SAME;
- } else if (NegType == 1) {
+ } else if (NegType == TypeClass::SCALAR) {
// Scalar:
// [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
// [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
@@ -4488,7 +4487,7 @@ SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S,
// Src = -OpUpper
return SrcStatus::IS_UPPER_HALF_NEG;
case SrcStatus::IS_LOWER_HALF:
- if (NegType == 0) {
+ if (NegType == TypeClass::VECTOR_OF_TWO) {
// Vector of 2:
// Src = CurrLower
// Curr = [CurrUpper, CurrLower]
@@ -4496,7 +4495,7 @@ SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S,
// [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
// Src = -OpLower
return SrcStatus::IS_LOWER_HALF_NEG;
- } else if (NegType == 1) {
+ } else if (NegType == TypeClass::SCALAR) {
// Scalar:
// Src = CurrLower
// Curr = [CurrUpper, CurrLower]
@@ -4522,7 +4521,7 @@ SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S,
// Src = -(-OpUpper) = OpUpper
return SrcStatus::IS_UPPER_HALF;
case SrcStatus::IS_LOWER_HALF_NEG:
- if (NegType == 0) {
+ if (NegType == TypeClass::VECTOR_OF_TWO) {
// Vector of 2:
// Src = -CurrLower
// Curr = [CurrUpper, CurrLower]
@@ -4530,7 +4529,7 @@ SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S,
// [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
// Src = -(-OpLower) = OpLower
return SrcStatus::IS_LOWER_HALF;
- } else if (NegType == 1) {
+ } else if (NegType == TypeClass::SCALAR) {
// Scalar:
// Src = -CurrLower
// Curr = [CurrUpper, CurrLower]
@@ -4615,8 +4614,8 @@ static bool calcNextStatus(std::pair<const MachineOperand *, SrcStatus> &Curr,
}
struct {
- unsigned int HasNeg : 1;
- unsigned int HasOpsel : 1;
+ bool HasNeg;
+ bool HasOpsel;
} StatOptions;
static bool checkOptions(SrcStatus Stat) {
@@ -4653,33 +4652,44 @@ void setUpOptions(const MachineOperand *RootOp,
StatOptions.HasOpsel = 1;
}
-SmallVector<std::pair<const MachineOperand *, SrcStatus>>
+static SmallVector<std::pair<const MachineOperand *, SrcStatus>>
getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI,
- bool OnlyLastSameOrNeg = false, int MaxDepth = 6) {
+ int MaxDepth = 6) {
int Depth = 0;
std::pair<const MachineOperand *, SrcStatus> Curr = {Op, SrcStatus::IS_SAME};
SmallVector<std::pair<const MachineOperand *, SrcStatus>, 4> Statlist;
- if (OnlyLastSameOrNeg)
- Statlist.push_back(Curr);
-
while (Depth <= MaxDepth && calcNextStatus(Curr, MRI)) {
Depth++;
if (checkOptions(Curr.second)) {
- if (OnlyLastSameOrNeg && (Curr.second == SrcStatus::IS_SAME ||
- Curr.second == SrcStatus::IS_HI_NEG ||
- Curr.second == SrcStatus::IS_LO_NEG ||
- Curr.second == SrcStatus::IS_BOTH_NEG))
- Statlist[0] = Curr;
-
- if (!OnlyLastSameOrNeg)
- Statlist.push_back(Curr);
+ Statlist.push_back(Curr);
}
}
return Statlist;
}
+static std::pair<const MachineOperand *, SrcStatus>
+getLastSameOrNeg(const MachineOperand *Op, const MachineRegisterInfo &MRI,
+ int MaxDepth = 6) {
+ int Depth = 0;
+ std::pair<const MachineOperand *, SrcStatus> Curr = {Op, SrcStatus::IS_SAME};
+ std::pair<const MachineOperand *, SrcStatus> LastSameOrNeg = Curr;
+
+ while (Depth <= MaxDepth && calcNextStatus(Curr, MRI)) {
+ Depth++;
+ if (checkOptions(Curr.second)) {
+ if (Curr.second == SrcStatus::IS_SAME ||
+ Curr.second == SrcStatus::IS_HI_NEG ||
+ Curr.second == SrcStatus::IS_LO_NEG ||
+ Curr.second == SrcStatus::IS_BOTH_NEG)
+ LastSameOrNeg = Curr;
+ }
+ }
+
+ return LastSameOrNeg;
+}
+
static bool isInlinableConstant(const MachineOperand &Op,
const SIInstrInfo &TII) {
return Op.isFPImm() && TII.isInlineConstant(Op.getFPImm()->getValueAPF());
@@ -4700,41 +4710,51 @@ static bool isSameOperand(const MachineOperand *Op1,
return Op1->isIdenticalTo(*Op2);
}
+unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
+ // SrcStatus::IS_LOWER_HALF remain 0.
+ if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
+ Mods ^= SISrcMods::NEG_HI;
+ Mods |= SISrcMods::OP_SEL_1;
+ } else if (HiStat == SrcStatus::IS_UPPER_HALF)
+ Mods |= SISrcMods::OP_SEL_1;
+ else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
+ Mods ^= SISrcMods::NEG_HI;
+ else if (HiStat == SrcStatus::IS_HI_NEG)
+ Mods ^= SISrcMods::NEG_HI;
+
+ if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
+ Mods ^= SISrcMods::NEG;
+ Mods |= SISrcMods::OP_SEL_0;
+ } else if (LoStat == SrcStatus::IS_UPPER_HALF)
+ Mods |= SISrcMods::OP_SEL_0;
+ else if (LoStat == SrcStatus::IS_UPPER_HALF_NEG)
+ Mods |= SISrcMods::NEG;
+ else if (LoStat == SrcStatus::IS_HI_NEG)
+ Mods ^= SISrcMods::NEG;
+
+ return Mods;
+}
+
static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat,
unsigned int &Mods, const MachineOperand *NewOp,
const MachineOperand *RootOp, const SIInstrInfo &TII,
const MachineRegisterInfo &MRI) {
if (NewOp->isReg()) {
- if (isSameBitWidth(NewOp, RootOp, MRI)) {
- // SrcStatus::IS_LOWER_HALF remain 0.
- if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
- Mods ^= SISrcMods::NEG_HI;
- Mods |= SISrcMods::OP_SEL_1;
- } else if (HiStat == SrcStatus::IS_UPPER_HALF) {
- Mods |= SISrcMods::OP_SEL_1;
- } else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG) {
- Mods ^= SISrcMods::NEG_HI;
- }
- if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
- Mods ^= SISrcMods::NEG;
- Mods |= SISrcMods::OP_SEL_0;
- } else if (LoStat == SrcStatus::IS_UPPER_HALF) {
- Mods |= SISrcMods::OP_SEL_0;
- } else if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
- Mods |= SISrcMods::NEG;
- }
+ if (isSameBitWidth(NewOp, RootOp, MRI) &&
+ (HiStat == SrcStatus::IS_UPPER_HALF ||
+ HiStat == SrcStatus::IS_UPPER_HALF_NEG ||
+ HiStat == SrcStatus::IS_LOWER_HALF ||
+ HiStat == SrcStatus::IS_LOWER_HALF_NEG) &&
+ (LoStat == SrcStatus::IS_UPPER_HALF ||
+ LoStat == SrcStatus::IS_UPPER_HALF_NEG ||
+ LoStat == SrcStatus::IS_LOWER_HALF ||
+ LoStat == SrcStatus::IS_LOWER_HALF_NEG)) {
return true;
}
} else {
if ((HiStat == SrcStatus::IS_SAME || HiStat == SrcStatus::IS_HI_NEG) &&
(LoStat == SrcStatus::IS_SAME || LoStat == SrcStatus::IS_HI_NEG) &&
isInlinableConstant(*NewOp, TII)) {
- if (HiStat == SrcStatus::IS_HI_NEG)
- Mods ^= SISrcMods::NEG_HI;
- if (LoStat == SrcStatus::IS_HI_NEG)
- Mods ^= SISrcMods::NEG;
- // opsel = opsel_hi = 0, since the upper half and lower half both
- // the same as the target inlinable constant.
return true;
}
}
@@ -4742,21 +4762,20 @@ static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat,
}
std::pair<const MachineOperand *, unsigned>
-AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op,
+AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *RootOp,
const MachineRegisterInfo &MRI,
bool IsDOT) const {
unsigned Mods = 0;
+ const MachineOperand *Op = RootOp;
// No modification if Root type is not form of <2 x Type>
- if (isVectorOfTwoOrScalar(Op, MRI) != 0) {
+ if (isVectorOfTwoOrScalar(Op, MRI) != TypeClass::VECTOR_OF_TWO) {
Mods |= SISrcMods::OP_SEL_1;
return {Op, Mods};
}
setUpOptions(Op, MRI);
- const MachineOperand *RootOp = Op;
- std::pair<const MachineOperand *, SrcStatus> Stat =
- getSrcStats(Op, MRI, true)[0];
+ std::pair<const MachineOperand *, SrcStatus> Stat = getLastSameOrNeg(Op, MRI);
if (!Stat.first->isReg()) {
Mods |= SISrcMods::OP_SEL_1;
return {Op, Mods};
@@ -4777,16 +4796,16 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op,
return {Op, Mods};
}
- SmallVector<std::pair<const MachineOperand *, SrcStatus>> StatlistHi;
- StatlistHi = getSrcStats(&MI->getOperand(2), MRI);
+ SmallVector<std::pair<const MachineOperand *, SrcStatus>> StatlistHi =
+ getSrcStats(&MI->getOperand(2), MRI);
if (StatlistHi.size() == 0) {
Mods |= SISrcMods::OP_SEL_1;
return {Op, Mods};
}
- SmallVector<std::pair<const MachineOperand *, SrcStatus>> StatlistLo;
- StatlistLo = getSrcStats(&MI->getOperand(1), MRI);
+ SmallVector<std::pair<const MachineOperand *, SrcStatus>> StatlistLo =
+ getSrcStats(&MI->getOperand(1), MRI);
if (StatlistLo.size() == 0) {
Mods |= SISrcMods::OP_SEL_1;
@@ -4798,7 +4817,8 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *Op,
if (isSameOperand(StatlistHi[i].first, StatlistLo[j].first) &&
isValidToPack(StatlistHi[i].second, StatlistLo[j].second, Mods,
StatlistHi[i].first, RootOp, TII, MRI))
- return {StatlistHi[i].first, Mods};
+ return {StatlistHi[i].first,
+ updateMods(StatlistHi[i].second, StatlistLo[j].second, Mods)};
}
}
// Packed instructions do not have abs modifiers.
@@ -4863,18 +4883,17 @@ AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
MachineRegisterInfo &MRI
= Root.getParent()->getParent()->getParent()->getRegInfo();
- std::pair<const MachineOperand *, unsigned> Res =
- selectVOP3PModsImpl(&Root, MRI);
- if (!(Res.first->isReg()))
+ auto [Op, Mods] = selectVOP3PModsImpl(&Root, MRI);
+ if (!(Op->isReg()))
return {{
- [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(Res.first)); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Res.second); } // src_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(Op)); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
}};
- Res.first = getVReg(Res.first, &Root, RBI, MRI, TRI, TII);
+ Op = getVReg(Op, &Root, RBI, MRI, TRI, TII);
return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(Res.first->getReg()); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Res.second); } // src_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Op->getReg()); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
}};
}
@@ -4883,18 +4902,17 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
MachineRegisterInfo &MRI
= Root.getParent()->getParent()->getParent()->getRegInfo();
- std::pair<const MachineOperand *, unsigned> Res =
- selectVOP3PModsImpl(&Root, MRI, true);
- if (!(Res.first->isReg()))
+ auto [Op, Mods] = selectVOP3PModsImpl(&Root, MRI, true);
+ if (!(Op->isReg()))
return {{
- [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(Res.first)); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Res.second); } // src_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(Op)); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
}};
- Res.first = getVReg(Res.first, &Root, RBI, MRI, TRI, TII);
+ Op = getVReg(Op, &Root, RBI, MRI, TRI, TII);
return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(Res.first->getReg()); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Res.second); } // src_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Op->getReg()); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
}};
}
>From 6fe41472c046a58e7a55cd84a49ad64475ec9987 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Wed, 26 Mar 2025 11:15:54 +0800
Subject: [PATCH 15/19] fix comments
---
.../AMDGPU/AMDGPUInstructionSelector.cpp | 57 ++++++++++---------
1 file changed, 31 insertions(+), 26 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index c6bd40dc51fcb..18b8bdd54c612 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4370,16 +4370,17 @@ static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
return false;
}
-static bool retOpStat(const MachineOperand *Op, SrcStatus Stat,
- std::pair<const MachineOperand *, SrcStatus> &Curr) {
+std::optional<std::pair<const MachineOperand *, SrcStatus>>
+retOpStat(const MachineOperand *Op, SrcStatus Stat,
+ std::pair<const MachineOperand *, SrcStatus> &Curr) {
if (Stat != SrcStatus::INVALID &&
((Op->isReg() && !(Op->getReg().isPhysical())) || Op->isImm() ||
Op->isCImm() || Op->isFPImm())) {
- Curr = {Op, Stat};
- return true;
+ return std::optional<std::pair<const MachineOperand *, SrcStatus>>(
+ {Op, Stat});
}
- return false;
+ return std::nullopt;
}
enum class TypeClass { VECTOR_OF_TWO, SCALAR, NON_OF_LISTED };
@@ -4543,10 +4544,11 @@ SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S,
llvm_unreachable("unexpected SrcStatus");
}
-static bool calcNextStatus(std::pair<const MachineOperand *, SrcStatus> &Curr,
- const MachineRegisterInfo &MRI) {
+std::optional<std::pair<const MachineOperand *, SrcStatus>>
+calcNextStatus(std::pair<const MachineOperand *, SrcStatus> Curr,
+ const MachineRegisterInfo &MRI) {
if (!Curr.first->isReg())
- return false;
+ return std::nullopt;
const MachineInstr *MI = nullptr;
@@ -4556,7 +4558,7 @@ static bool calcNextStatus(std::pair<const MachineOperand *, SrcStatus> &Curr,
MI = Curr.first->getParent();
if (!MI)
- return false;
+ return std::nullopt;
unsigned Opc = MI->getOpcode();
@@ -4610,7 +4612,7 @@ static bool calcNextStatus(std::pair<const MachineOperand *, SrcStatus> &Curr,
default:
break;
}
- return false;
+ return std::nullopt;
}
struct {
@@ -4656,14 +4658,15 @@ static SmallVector<std::pair<const MachineOperand *, SrcStatus>>
getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI,
int MaxDepth = 6) {
int Depth = 0;
- std::pair<const MachineOperand *, SrcStatus> Curr = {Op, SrcStatus::IS_SAME};
+ auto Curr = calcNextStatus({Op, SrcStatus::IS_SAME}, MRI);
SmallVector<std::pair<const MachineOperand *, SrcStatus>, 4> Statlist;
- while (Depth <= MaxDepth && calcNextStatus(Curr, MRI)) {
+ while (Depth <= MaxDepth && Curr.has_value()) {
Depth++;
- if (checkOptions(Curr.second)) {
- Statlist.push_back(Curr);
+ if (checkOptions(Curr.value().second)) {
+ Statlist.push_back(Curr.value());
}
+ Curr = calcNextStatus(Curr.value(), MRI);
}
return Statlist;
@@ -4673,25 +4676,27 @@ static std::pair<const MachineOperand *, SrcStatus>
getLastSameOrNeg(const MachineOperand *Op, const MachineRegisterInfo &MRI,
int MaxDepth = 6) {
int Depth = 0;
- std::pair<const MachineOperand *, SrcStatus> Curr = {Op, SrcStatus::IS_SAME};
- std::pair<const MachineOperand *, SrcStatus> LastSameOrNeg = Curr;
+ std::pair<const MachineOperand *, SrcStatus> LastSameOrNeg = {
+ Op, SrcStatus::IS_SAME};
+ auto Curr = calcNextStatus(LastSameOrNeg, MRI);
- while (Depth <= MaxDepth && calcNextStatus(Curr, MRI)) {
+ while (Depth <= MaxDepth && Curr.has_value()) {
Depth++;
- if (checkOptions(Curr.second)) {
- if (Curr.second == SrcStatus::IS_SAME ||
- Curr.second == SrcStatus::IS_HI_NEG ||
- Curr.second == SrcStatus::IS_LO_NEG ||
- Curr.second == SrcStatus::IS_BOTH_NEG)
- LastSameOrNeg = Curr;
+ if (checkOptions(Curr.value().second)) {
+ if (Curr.value().second == SrcStatus::IS_SAME ||
+ Curr.value().second == SrcStatus::IS_HI_NEG ||
+ Curr.value().second == SrcStatus::IS_LO_NEG ||
+ Curr.value().second == SrcStatus::IS_BOTH_NEG)
+ LastSameOrNeg = Curr.value();
}
+ Curr = calcNextStatus(Curr.value(), MRI);
}
return LastSameOrNeg;
}
-static bool isInlinableConstant(const MachineOperand &Op,
- const SIInstrInfo &TII) {
+static bool isInlinableFPConstant(const MachineOperand &Op,
+ const SIInstrInfo &TII) {
return Op.isFPImm() && TII.isInlineConstant(Op.getFPImm()->getValueAPF());
}
@@ -4754,7 +4759,7 @@ static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat,
} else {
if ((HiStat == SrcStatus::IS_SAME || HiStat == SrcStatus::IS_HI_NEG) &&
(LoStat == SrcStatus::IS_SAME || LoStat == SrcStatus::IS_HI_NEG) &&
- isInlinableConstant(*NewOp, TII)) {
+ isInlinableFPConstant(*NewOp, TII)) {
return true;
}
}
>From 3b7f377d17525647ab0583cfec56eb95514ca14e Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Wed, 26 Mar 2025 20:24:56 +0800
Subject: [PATCH 16/19] fix comments
---
.../AMDGPU/AMDGPUInstructionSelector.cpp | 60 +++++++++----------
llvm/test/lit.cfg.py | 2 +-
2 files changed, 28 insertions(+), 34 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 18b8bdd54c612..ee99276be907e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4370,7 +4370,7 @@ static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
return false;
}
-std::optional<std::pair<const MachineOperand *, SrcStatus>>
+static std::optional<std::pair<const MachineOperand *, SrcStatus>>
retOpStat(const MachineOperand *Op, SrcStatus Stat,
std::pair<const MachineOperand *, SrcStatus> &Curr) {
if (Stat != SrcStatus::INVALID &&
@@ -4397,8 +4397,8 @@ static TypeClass isVectorOfTwoOrScalar(const MachineOperand *Op,
return TypeClass::NON_OF_LISTED;
}
-SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S,
- const MachineRegisterInfo &MRI) {
+static SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S,
+ const MachineRegisterInfo &MRI) {
TypeClass NegType = isVectorOfTwoOrScalar(Op, MRI);
if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
return SrcStatus::INVALID;
@@ -4544,7 +4544,7 @@ SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S,
llvm_unreachable("unexpected SrcStatus");
}
-std::optional<std::pair<const MachineOperand *, SrcStatus>>
+static std::optional<std::pair<const MachineOperand *, SrcStatus>>
calcNextStatus(std::pair<const MachineOperand *, SrcStatus> Curr,
const MachineRegisterInfo &MRI) {
if (!Curr.first->isReg())
@@ -4632,26 +4632,26 @@ static bool checkOptions(SrcStatus Stat) {
return true;
}
-void setUpOptions(const MachineOperand *RootOp,
- const MachineRegisterInfo &MRI) {
+static void setUpOptions(const MachineOperand *RootOp,
+ const MachineRegisterInfo &MRI) {
const MachineInstr *MI = RootOp->getParent();
unsigned Opc = MI->getOpcode();
if (Opc < TargetOpcode::GENERIC_OP_END) {
// Keep same for gerneric op
- StatOptions.HasNeg = 1;
+ StatOptions.HasNeg = true;
} else if (Opc == TargetOpcode::G_INTRINSIC) {
Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
// Only float point intrinsic has neg & neg_hi bits
if (IntrinsicID == Intrinsic::amdgcn_fdot2)
- StatOptions.HasNeg = 1;
+ StatOptions.HasNeg = true;
else
- StatOptions.HasNeg = 0;
+ StatOptions.HasNeg = false;
} else
- StatOptions.HasNeg = 0;
+ StatOptions.HasNeg = false;
// Assume all complex pattern of VOP3P has opsel
- StatOptions.HasOpsel = 1;
+ StatOptions.HasOpsel = true;
}
static SmallVector<std::pair<const MachineOperand *, SrcStatus>>
@@ -4715,7 +4715,7 @@ static bool isSameOperand(const MachineOperand *Op1,
return Op1->isIdenticalTo(*Op2);
}
-unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
+static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
// SrcStatus::IS_LOWER_HALF remain 0.
if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
Mods ^= SISrcMods::NEG_HI;
@@ -4732,7 +4732,7 @@ unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
Mods |= SISrcMods::OP_SEL_0;
} else if (LoStat == SrcStatus::IS_UPPER_HALF)
Mods |= SISrcMods::OP_SEL_0;
- else if (LoStat == SrcStatus::IS_UPPER_HALF_NEG)
+ else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
Mods |= SISrcMods::NEG;
else if (LoStat == SrcStatus::IS_HI_NEG)
Mods ^= SISrcMods::NEG;
@@ -4741,28 +4741,22 @@ unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
}
static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat,
- unsigned int &Mods, const MachineOperand *NewOp,
+ const MachineOperand *NewOp,
const MachineOperand *RootOp, const SIInstrInfo &TII,
const MachineRegisterInfo &MRI) {
if (NewOp->isReg()) {
- if (isSameBitWidth(NewOp, RootOp, MRI) &&
- (HiStat == SrcStatus::IS_UPPER_HALF ||
- HiStat == SrcStatus::IS_UPPER_HALF_NEG ||
- HiStat == SrcStatus::IS_LOWER_HALF ||
- HiStat == SrcStatus::IS_LOWER_HALF_NEG) &&
- (LoStat == SrcStatus::IS_UPPER_HALF ||
- LoStat == SrcStatus::IS_UPPER_HALF_NEG ||
- LoStat == SrcStatus::IS_LOWER_HALF ||
- LoStat == SrcStatus::IS_LOWER_HALF_NEG)) {
- return true;
- }
- } else {
- if ((HiStat == SrcStatus::IS_SAME || HiStat == SrcStatus::IS_HI_NEG) &&
- (LoStat == SrcStatus::IS_SAME || LoStat == SrcStatus::IS_HI_NEG) &&
- isInlinableFPConstant(*NewOp, TII)) {
- return true;
- }
- }
+ auto IsHalfState = [](SrcStatus S) {
+ return S == SrcStatus::IS_UPPER_HALF ||
+ S == SrcStatus::IS_UPPER_HALF_NEG ||
+ S == SrcStatus::IS_LOWER_HALF || S == SrcStatus::IS_LOWER_HALF_NEG;
+ };
+ return isSameBitWidth(NewOp, RootOp, MRI) && IsHalfState(LoStat) &&
+ IsHalfState(HiStat);
+ } else
+ return ((HiStat == SrcStatus::IS_SAME || HiStat == SrcStatus::IS_HI_NEG) &&
+ (LoStat == SrcStatus::IS_SAME || LoStat == SrcStatus::IS_HI_NEG) &&
+ isInlinableFPConstant(*NewOp, TII));
+
return false;
}
@@ -4820,7 +4814,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *RootOp,
for (int i = StatlistHi.size() - 1; i >= 0; i--) {
for (int j = StatlistLo.size() - 1; j >= 0; j--) {
if (isSameOperand(StatlistHi[i].first, StatlistLo[j].first) &&
- isValidToPack(StatlistHi[i].second, StatlistLo[j].second, Mods,
+ isValidToPack(StatlistHi[i].second, StatlistLo[j].second,
StatlistHi[i].first, RootOp, TII, MRI))
return {StatlistHi[i].first,
updateMods(StatlistHi[i].second, StatlistLo[j].second, Mods)};
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index aad7a088551b2..50921879cd1f2 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -466,7 +466,7 @@ def have_cxx_shared_library():
print("could not exec llvm-readobj")
return False
- readobj_out = readobj_cmd.stdout.read().decode("ascii")
+ readobj_out = readobj_cmd.stdout.read().decode("utf-8")
readobj_cmd.wait()
regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)")
>From d7de92f5b2dd791e34596e4381105dcf88252228 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Wed, 26 Mar 2025 20:25:23 +0800
Subject: [PATCH 17/19] fix lit
---
llvm/test/lit.cfg.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 50921879cd1f2..aad7a088551b2 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -466,7 +466,7 @@ def have_cxx_shared_library():
print("could not exec llvm-readobj")
return False
- readobj_out = readobj_cmd.stdout.read().decode("utf-8")
+ readobj_out = readobj_cmd.stdout.read().decode("ascii")
readobj_cmd.wait()
regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)")
>From 45ed99410f46c748cd957297cda9c03d07311884 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Wed, 26 Mar 2025 20:45:31 +0800
Subject: [PATCH 18/19] avoid global variable
---
.../AMDGPU/AMDGPUInstructionSelector.cpp | 81 +++++++++----------
1 file changed, 40 insertions(+), 41 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index ee99276be907e..d619be62194e6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4615,55 +4615,53 @@ calcNextStatus(std::pair<const MachineOperand *, SrcStatus> Curr,
return std::nullopt;
}
-struct {
+class statOptions {
+private:
bool HasNeg;
bool HasOpsel;
-} StatOptions;
-static bool checkOptions(SrcStatus Stat) {
- if (!StatOptions.HasNeg &&
- (Stat >= SrcStatus::NEG_START || Stat <= SrcStatus::NEG_END)) {
- return false;
+public:
+ statOptions(const MachineOperand *RootOp, const MachineRegisterInfo &MRI) {
+ const MachineInstr *MI = RootOp->getParent();
+ unsigned Opc = MI->getOpcode();
+ HasNeg = false;
+ HasOpsel = false;
+ if (Opc < TargetOpcode::GENERIC_OP_END) {
+ // Keep same for gerneric op
+ HasNeg = true;
+ } else if (Opc == TargetOpcode::G_INTRINSIC) {
+ Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
+ // Only float point intrinsic has neg & neg_hi bits
+ if (IntrinsicID == Intrinsic::amdgcn_fdot2)
+ HasNeg = true;
+ }
+
+ // Assume all complex pattern of VOP3P has opsel
+ HasOpsel = true;
}
- if (!StatOptions.HasOpsel &&
- (Stat >= SrcStatus::HALF_START || Stat >= SrcStatus::HALF_END)) {
- return false;
+ bool checkOptions(SrcStatus Stat) const {
+ if (!HasNeg &&
+ (Stat >= SrcStatus::NEG_START || Stat <= SrcStatus::NEG_END)) {
+ return false;
+ }
+ if (!HasOpsel &&
+ (Stat >= SrcStatus::HALF_START || Stat >= SrcStatus::HALF_END)) {
+ return false;
+ }
+ return true;
}
- return true;
-}
-
-static void setUpOptions(const MachineOperand *RootOp,
- const MachineRegisterInfo &MRI) {
- const MachineInstr *MI = RootOp->getParent();
- unsigned Opc = MI->getOpcode();
-
- if (Opc < TargetOpcode::GENERIC_OP_END) {
- // Keep same for gerneric op
- StatOptions.HasNeg = true;
- } else if (Opc == TargetOpcode::G_INTRINSIC) {
- Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
- // Only float point intrinsic has neg & neg_hi bits
- if (IntrinsicID == Intrinsic::amdgcn_fdot2)
- StatOptions.HasNeg = true;
- else
- StatOptions.HasNeg = false;
- } else
- StatOptions.HasNeg = false;
-
- // Assume all complex pattern of VOP3P has opsel
- StatOptions.HasOpsel = true;
-}
+};
static SmallVector<std::pair<const MachineOperand *, SrcStatus>>
getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI,
- int MaxDepth = 6) {
+ statOptions StatOptions, int MaxDepth = 6) {
int Depth = 0;
auto Curr = calcNextStatus({Op, SrcStatus::IS_SAME}, MRI);
SmallVector<std::pair<const MachineOperand *, SrcStatus>, 4> Statlist;
while (Depth <= MaxDepth && Curr.has_value()) {
Depth++;
- if (checkOptions(Curr.value().second)) {
+ if (StatOptions.checkOptions(Curr.value().second)) {
Statlist.push_back(Curr.value());
}
Curr = calcNextStatus(Curr.value(), MRI);
@@ -4674,7 +4672,7 @@ getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI,
static std::pair<const MachineOperand *, SrcStatus>
getLastSameOrNeg(const MachineOperand *Op, const MachineRegisterInfo &MRI,
- int MaxDepth = 6) {
+ statOptions StatOptions, int MaxDepth = 6) {
int Depth = 0;
std::pair<const MachineOperand *, SrcStatus> LastSameOrNeg = {
Op, SrcStatus::IS_SAME};
@@ -4682,7 +4680,7 @@ getLastSameOrNeg(const MachineOperand *Op, const MachineRegisterInfo &MRI,
while (Depth <= MaxDepth && Curr.has_value()) {
Depth++;
- if (checkOptions(Curr.value().second)) {
+ if (StatOptions.checkOptions(Curr.value().second)) {
if (Curr.value().second == SrcStatus::IS_SAME ||
Curr.value().second == SrcStatus::IS_HI_NEG ||
Curr.value().second == SrcStatus::IS_LO_NEG ||
@@ -4772,9 +4770,10 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *RootOp,
return {Op, Mods};
}
- setUpOptions(Op, MRI);
+ statOptions StatOptions(Op, MRI);
- std::pair<const MachineOperand *, SrcStatus> Stat = getLastSameOrNeg(Op, MRI);
+ std::pair<const MachineOperand *, SrcStatus> Stat =
+ getLastSameOrNeg(Op, MRI, StatOptions);
if (!Stat.first->isReg()) {
Mods |= SISrcMods::OP_SEL_1;
return {Op, Mods};
@@ -4796,7 +4795,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *RootOp,
}
SmallVector<std::pair<const MachineOperand *, SrcStatus>> StatlistHi =
- getSrcStats(&MI->getOperand(2), MRI);
+ getSrcStats(&MI->getOperand(2), MRI, StatOptions);
if (StatlistHi.size() == 0) {
Mods |= SISrcMods::OP_SEL_1;
@@ -4804,7 +4803,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *RootOp,
}
SmallVector<std::pair<const MachineOperand *, SrcStatus>> StatlistLo =
- getSrcStats(&MI->getOperand(1), MRI);
+ getSrcStats(&MI->getOperand(1), MRI, StatOptions);
if (StatlistLo.size() == 0) {
Mods |= SISrcMods::OP_SEL_1;
>From a792c1d91bf2e5e88cda8321d674112aea1652af Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Fri, 28 Mar 2025 11:28:04 +0800
Subject: [PATCH 19/19] fix comments
---
.../AMDGPU/AMDGPUInstructionSelector.cpp | 90 +++++++++----------
.../Target/AMDGPU/AMDGPUInstructionSelector.h | 2 +
2 files changed, 47 insertions(+), 45 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index d619be62194e6..d46f74cb5004d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4319,8 +4319,11 @@ enum class SrcStatus {
IS_UPPER_HALF,
IS_LOWER_HALF,
IS_UPPER_HALF_NEG,
+ // This means current op = [op_upper, op_lower] and src = -op_lower
IS_LOWER_HALF_NEG,
IS_HI_NEG,
+ // This means current op = [op_upper, op_lower] and src = [op_upper,
+ // -op_lower]
IS_LO_NEG,
IS_BOTH_NEG,
INVALID,
@@ -4383,18 +4386,18 @@ retOpStat(const MachineOperand *Op, SrcStatus Stat,
return std::nullopt;
}
-enum class TypeClass { VECTOR_OF_TWO, SCALAR, NON_OF_LISTED };
+enum class TypeClass { VECTOR_OF_TWO, SCALAR, NONE_OF_LISTED };
static TypeClass isVectorOfTwoOrScalar(const MachineOperand *Op,
const MachineRegisterInfo &MRI) {
if (!Op->isReg() || Op->getReg().isPhysical())
- return TypeClass::NON_OF_LISTED;
+ return TypeClass::NONE_OF_LISTED;
LLT OpTy = MRI.getType(Op->getReg());
if (OpTy.isScalar())
return TypeClass::SCALAR;
if (OpTy.isVector() && OpTy.getNumElements() == 2)
return TypeClass::VECTOR_OF_TWO;
- return TypeClass::NON_OF_LISTED;
+ return TypeClass::NONE_OF_LISTED;
}
static SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S,
@@ -4615,19 +4618,19 @@ calcNextStatus(std::pair<const MachineOperand *, SrcStatus> Curr,
return std::nullopt;
}
-class statOptions {
+class searchOptions {
private:
- bool HasNeg;
- bool HasOpsel;
+ bool HasNeg = false;
+ // Assume all complex pattern of VOP3P has opsel
+ bool HasOpsel = true;
public:
- statOptions(const MachineOperand *RootOp, const MachineRegisterInfo &MRI) {
+ searchOptions(const MachineOperand *RootOp, const MachineRegisterInfo &MRI) {
const MachineInstr *MI = RootOp->getParent();
unsigned Opc = MI->getOpcode();
- HasNeg = false;
- HasOpsel = false;
+
if (Opc < TargetOpcode::GENERIC_OP_END) {
- // Keep same for gerneric op
+ // Keep same for generic op
HasNeg = true;
} else if (Opc == TargetOpcode::G_INTRINSIC) {
Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
@@ -4635,9 +4638,6 @@ class statOptions {
if (IntrinsicID == Intrinsic::amdgcn_fdot2)
HasNeg = true;
}
-
- // Assume all complex pattern of VOP3P has opsel
- HasOpsel = true;
}
bool checkOptions(SrcStatus Stat) const {
if (!HasNeg &&
@@ -4654,14 +4654,14 @@ class statOptions {
static SmallVector<std::pair<const MachineOperand *, SrcStatus>>
getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI,
- statOptions StatOptions, int MaxDepth = 6) {
+ searchOptions SearchOptions, int MaxDepth = 6) {
int Depth = 0;
auto Curr = calcNextStatus({Op, SrcStatus::IS_SAME}, MRI);
SmallVector<std::pair<const MachineOperand *, SrcStatus>, 4> Statlist;
while (Depth <= MaxDepth && Curr.has_value()) {
Depth++;
- if (StatOptions.checkOptions(Curr.value().second)) {
+ if (SearchOptions.checkOptions(Curr.value().second)) {
Statlist.push_back(Curr.value());
}
Curr = calcNextStatus(Curr.value(), MRI);
@@ -4672,7 +4672,7 @@ getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI,
static std::pair<const MachineOperand *, SrcStatus>
getLastSameOrNeg(const MachineOperand *Op, const MachineRegisterInfo &MRI,
- statOptions StatOptions, int MaxDepth = 6) {
+ searchOptions SearchOptions, int MaxDepth = 6) {
int Depth = 0;
std::pair<const MachineOperand *, SrcStatus> LastSameOrNeg = {
Op, SrcStatus::IS_SAME};
@@ -4680,7 +4680,7 @@ getLastSameOrNeg(const MachineOperand *Op, const MachineRegisterInfo &MRI,
while (Depth <= MaxDepth && Curr.has_value()) {
Depth++;
- if (StatOptions.checkOptions(Curr.value().second)) {
+ if (SearchOptions.checkOptions(Curr.value().second)) {
if (Curr.value().second == SrcStatus::IS_SAME ||
Curr.value().second == SrcStatus::IS_HI_NEG ||
Curr.value().second == SrcStatus::IS_LO_NEG ||
@@ -4770,10 +4770,10 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *RootOp,
return {Op, Mods};
}
- statOptions StatOptions(Op, MRI);
+ searchOptions SearchOptions(Op, MRI);
std::pair<const MachineOperand *, SrcStatus> Stat =
- getLastSameOrNeg(Op, MRI, StatOptions);
+ getLastSameOrNeg(Op, MRI, SearchOptions);
if (!Stat.first->isReg()) {
Mods |= SISrcMods::OP_SEL_1;
return {Op, Mods};
@@ -4795,7 +4795,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *RootOp,
}
SmallVector<std::pair<const MachineOperand *, SrcStatus>> StatlistHi =
- getSrcStats(&MI->getOperand(2), MRI, StatOptions);
+ getSrcStats(&MI->getOperand(2), MRI, SearchOptions);
if (StatlistHi.size() == 0) {
Mods |= SISrcMods::OP_SEL_1;
@@ -4803,7 +4803,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *RootOp,
}
SmallVector<std::pair<const MachineOperand *, SrcStatus>> StatlistLo =
- getSrcStats(&MI->getOperand(1), MRI, StatOptions);
+ getSrcStats(&MI->getOperand(1), MRI, SearchOptions);
if (StatlistLo.size() == 0) {
Mods |= SISrcMods::OP_SEL_1;
@@ -4846,10 +4846,17 @@ static bool checkRB(const MachineOperand *Op, unsigned int RBNo,
return RB->getID() == RBNo;
}
-const MachineOperand *
-getVReg(const MachineOperand *NewOp, const MachineOperand *RootOp,
- const AMDGPURegisterBankInfo &RBI, MachineRegisterInfo &MRI,
- const TargetRegisterInfo &TRI, const SIInstrInfo &TII) {
+// This function is used to get the correct register bank for returned reg
+// Assume:
+// 1. VOP3P is always legal for VGPR
+// 2. RootOp's regbank is legal
+// Thus
+// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR
+// 2. If RootOp is VGPR, then NewOp must be VGPR
+static const MachineOperand *
+getLegalRegBank(const MachineOperand *NewOp, const MachineOperand *RootOp,
+ const AMDGPURegisterBankInfo &RBI, MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI, const SIInstrInfo &TII) {
// RootOp can only be VGPR or SGPR (some hand written cases such as
// inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
if (checkRB(RootOp, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
@@ -4877,18 +4884,18 @@ getVReg(const MachineOperand *NewOp, const MachineOperand *RootOp,
}
InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
- MachineRegisterInfo &MRI
- = Root.getParent()->getParent()->getParent()->getRegInfo();
-
- auto [Op, Mods] = selectVOP3PModsImpl(&Root, MRI);
+AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
+ bool IsDOT) const {
+ MachineRegisterInfo &MRI =
+ Root.getParent()->getParent()->getParent()->getRegInfo();
+ auto [Op, Mods] = selectVOP3PModsImpl(&Root, MRI, IsDOT);
if (!(Op->isReg()))
return {{
[=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(Op)); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
}};
- Op = getVReg(Op, &Root, RBI, MRI, TRI, TII);
+ Op = getLegalRegBank(Op, &Root, RBI, MRI, TRI, TII);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(Op->getReg()); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
@@ -4896,22 +4903,15 @@ AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
- MachineRegisterInfo &MRI
- = Root.getParent()->getParent()->getParent()->getRegInfo();
+AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
- auto [Op, Mods] = selectVOP3PModsImpl(&Root, MRI, true);
- if (!(Op->isReg()))
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(Op)); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
- }};
+ return selectVOP3PRetHelper(Root);
+}
- Op = getVReg(Op, &Root, RBI, MRI, TRI, TII);
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(Op->getReg()); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
- }};
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
+
+ return selectVOP3PRetHelper(Root, true);
}
InstructionSelector::ComplexRendererFns
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index dd172edfdf216..d77fab99d7251 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -190,6 +190,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
std::pair<const MachineOperand *, unsigned>
selectVOP3PModsImpl(const MachineOperand *Op, const MachineRegisterInfo &MRI,
bool IsDOT = false) const;
+ InstructionSelector::ComplexRendererFns
+ selectVOP3PRetHelper(MachineOperand &Root, bool IsDOT = false) const;
InstructionSelector::ComplexRendererFns
selectVOP3PMods(MachineOperand &Root) const;
More information about the llvm-commits
mailing list