[llvm-branch-commits] [llvm] 9f06475 - Revert "[AMDGPU] Implement vop3p complex pattern optmization for gisel (#130234)"
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Apr 17 20:13:56 PDT 2025
Author: Shoreshen
Date: 2025-04-18T11:13:53+08:00
New Revision: 9f0647536e893f7ba104dda2ab8895abd6a83c0c
URL: https://github.com/llvm/llvm-project/commit/9f0647536e893f7ba104dda2ab8895abd6a83c0c
DIFF: https://github.com/llvm/llvm-project/commit/9f0647536e893f7ba104dda2ab8895abd6a83c0c.diff
LOG: Revert "[AMDGPU] Implement vop3p complex pattern optmization for gisel (#130234)"
This reverts commit a04580f71b98bdb12100da66c9975e9a1001b4d6.
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 87af467ac8f1e..6ef7505ec6f62 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4319,598 +4319,60 @@ AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
}};
}
-enum class SrcStatus {
- IS_SAME,
- IS_UPPER_HALF,
- IS_LOWER_HALF,
- IS_UPPER_HALF_NEG,
- // This means current op = [op_upper, op_lower] and src = -op_lower.
- IS_LOWER_HALF_NEG,
- IS_HI_NEG,
- // This means current op = [op_upper, op_lower] and src = [op_upper,
- // -op_lower].
- IS_LO_NEG,
- IS_BOTH_NEG,
- INVALID,
- NEG_START = IS_UPPER_HALF_NEG,
- NEG_END = IS_BOTH_NEG,
- HALF_START = IS_UPPER_HALF,
- HALF_END = IS_LOWER_HALF_NEG
-};
-
-static bool isTruncHalf(const MachineInstr *MI,
- const MachineRegisterInfo &MRI) {
- if (MI->getOpcode() != AMDGPU::G_TRUNC)
- return false;
-
- unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
- unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
- return DstSize * 2 == SrcSize;
-}
-
-static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
- if (MI->getOpcode() != AMDGPU::G_LSHR)
- return false;
-
- Register ShiftSrc;
- std::optional<ValueAndVReg> ShiftAmt;
- if (mi_match(MI->getOperand(0).getReg(), MRI,
- m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
- unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
- unsigned Shift = ShiftAmt->Value.getZExtValue();
- return Shift * 2 == SrcSize;
- }
- return false;
-}
-
-static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
- if (MI->getOpcode() != AMDGPU::G_SHL)
- return false;
-
- Register ShiftSrc;
- std::optional<ValueAndVReg> ShiftAmt;
- if (mi_match(MI->getOperand(0).getReg(), MRI,
- m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
- unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
- unsigned Shift = ShiftAmt->Value.getZExtValue();
- return Shift * 2 == SrcSize;
- }
- return false;
-}
-
-static std::optional<std::pair<const MachineOperand *, SrcStatus>>
-retOpStat(const MachineOperand *Op, SrcStatus Stat,
- std::pair<const MachineOperand *, SrcStatus> &Curr) {
- if (Stat != SrcStatus::INVALID &&
- ((Op->isReg() && !(Op->getReg().isPhysical())) || Op->isImm() ||
- Op->isCImm() || Op->isFPImm())) {
- return std::optional<std::pair<const MachineOperand *, SrcStatus>>(
- {Op, Stat});
- }
-
- return std::nullopt;
-}
-
-enum class TypeClass { VECTOR_OF_TWO, SCALAR, NONE_OF_LISTED };
-
-static TypeClass isVectorOfTwoOrScalar(const MachineOperand *Op,
- const MachineRegisterInfo &MRI) {
- if (!Op->isReg() || Op->getReg().isPhysical())
- return TypeClass::NONE_OF_LISTED;
- LLT OpTy = MRI.getType(Op->getReg());
- if (OpTy.isScalar())
- return TypeClass::SCALAR;
- if (OpTy.isVector() && OpTy.getNumElements() == 2)
- return TypeClass::VECTOR_OF_TWO;
- return TypeClass::NONE_OF_LISTED;
-}
-
-static SrcStatus getNegStatus(const MachineOperand *Op, SrcStatus S,
- const MachineRegisterInfo &MRI) {
- TypeClass NegType = isVectorOfTwoOrScalar(Op, MRI);
- if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
- return SrcStatus::INVALID;
-
- switch (S) {
- case SrcStatus::IS_SAME:
- if (NegType == TypeClass::VECTOR_OF_TWO) {
- // Vector of 2:
- // [SrcHi, SrcLo] = [CurrHi, CurrLo]
- // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
- // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
- // [SrcHi, SrcLo] = [-OpHi, -OpLo]
- return SrcStatus::IS_BOTH_NEG;
- } else if (NegType == TypeClass::SCALAR) {
- // Scalar:
- // [SrcHi, SrcLo] = [CurrHi, CurrLo]
- // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
- // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
- // [SrcHi, SrcLo] = [-OpHi, OpLo]
- return SrcStatus::IS_HI_NEG;
- }
- break;
- case SrcStatus::IS_HI_NEG:
- if (NegType == TypeClass::VECTOR_OF_TWO) {
- // Vector of 2:
- // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
- // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
- // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
- // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
- return SrcStatus::IS_LO_NEG;
- } else if (NegType == TypeClass::SCALAR) {
- // Scalar:
- // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
- // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
- // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
- // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
- return SrcStatus::IS_SAME;
- }
- break;
- case SrcStatus::IS_LO_NEG:
- if (NegType == TypeClass::VECTOR_OF_TWO) {
- // Vector of 2:
- // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
- // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
- // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
- // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
- return SrcStatus::IS_HI_NEG;
- } else if (NegType == TypeClass::SCALAR) {
- // Scalar:
- // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
- // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
- // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
- // [SrcHi, SrcLo] = [-OpHi, -OpLo]
- return SrcStatus::IS_BOTH_NEG;
- }
- break;
- case SrcStatus::IS_BOTH_NEG:
- if (NegType == TypeClass::VECTOR_OF_TWO) {
- // Vector of 2:
- // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
- // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
- // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
- // [SrcHi, SrcLo] = [OpHi, OpLo]
- return SrcStatus::IS_SAME;
- } else if (NegType == TypeClass::SCALAR) {
- // Scalar:
- // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
- // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
- // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
- // [SrcHi, SrcLo] = [OpHi, -OpLo]
- return SrcStatus::IS_LO_NEG;
- }
- break;
- case SrcStatus::IS_UPPER_HALF:
- // Vector of 2:
- // Src = CurrUpper
- // Curr = [CurrUpper, CurrLower]
- // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
- // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
- // Src = -OpUpper
- //
- // Scalar:
- // Src = CurrUpper
- // Curr = [CurrUpper, CurrLower]
- // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
- // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
- // Src = -OpUpper
- return SrcStatus::IS_UPPER_HALF_NEG;
- case SrcStatus::IS_LOWER_HALF:
- if (NegType == TypeClass::VECTOR_OF_TWO) {
- // Vector of 2:
- // Src = CurrLower
- // Curr = [CurrUpper, CurrLower]
- // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
- // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
- // Src = -OpLower
- return SrcStatus::IS_LOWER_HALF_NEG;
- } else if (NegType == TypeClass::SCALAR) {
- // Scalar:
- // Src = CurrLower
- // Curr = [CurrUpper, CurrLower]
- // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
- // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
- // Src = OpLower
- return SrcStatus::IS_LOWER_HALF;
- }
- break;
- case SrcStatus::IS_UPPER_HALF_NEG:
- // Vector of 2:
- // Src = -CurrUpper
- // Curr = [CurrUpper, CurrLower]
- // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
- // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
- // Src = -(-OpUpper) = OpUpper
- //
- // Scalar:
- // Src = -CurrUpper
- // Curr = [CurrUpper, CurrLower]
- // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
- // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
- // Src = -(-OpUpper) = OpUpper
- return SrcStatus::IS_UPPER_HALF;
- case SrcStatus::IS_LOWER_HALF_NEG:
- if (NegType == TypeClass::VECTOR_OF_TWO) {
- // Vector of 2:
- // Src = -CurrLower
- // Curr = [CurrUpper, CurrLower]
- // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
- // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
- // Src = -(-OpLower) = OpLower
- return SrcStatus::IS_LOWER_HALF;
- } else if (NegType == TypeClass::SCALAR) {
- // Scalar:
- // Src = -CurrLower
- // Curr = [CurrUpper, CurrLower]
- // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
- // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
- // Src = -OpLower
- return SrcStatus::IS_LOWER_HALF_NEG;
- }
- break;
- default:
- llvm_unreachable("unexpected SrcStatus");
- }
-}
-
-static std::optional<std::pair<const MachineOperand *, SrcStatus>>
-calcNextStatus(std::pair<const MachineOperand *, SrcStatus> Curr,
- const MachineRegisterInfo &MRI) {
- if (!Curr.first->isReg())
- return std::nullopt;
-
- const MachineInstr *MI = Curr.first->isDef()
- ? Curr.first->getParent()
- : MRI.getVRegDef(Curr.first->getReg());
-
- unsigned Opc = MI->getOpcode();
-
- // Handle general Opc cases.
- switch (Opc) {
- case AMDGPU::G_BITCAST:
- case AMDGPU::G_CONSTANT:
- case AMDGPU::G_FCONSTANT:
- case AMDGPU::COPY:
- return retOpStat(&MI->getOperand(1), Curr.second, Curr);
- case AMDGPU::G_FNEG:
- return retOpStat(&MI->getOperand(1),
- getNegStatus(Curr.first, Curr.second, MRI), Curr);
- default:
- break;
- }
-
- // Calc next Stat from current Stat.
- switch (Curr.second) {
- case SrcStatus::IS_SAME:
- if (isTruncHalf(MI, MRI))
- return retOpStat(&MI->getOperand(1), SrcStatus::IS_LOWER_HALF, Curr);
- break;
- case SrcStatus::IS_HI_NEG:
- if (isTruncHalf(MI, MRI)) {
- // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
- // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
- // = [OpLowerHi, OpLowerLo]
- // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
- // = [-OpLowerHi, OpLowerLo]
- // = -OpLower
- return retOpStat(&MI->getOperand(1), SrcStatus::IS_LOWER_HALF_NEG, Curr);
- }
- break;
- case SrcStatus::IS_UPPER_HALF:
- if (isShlHalf(MI, MRI))
- return retOpStat(&MI->getOperand(1), SrcStatus::IS_LOWER_HALF, Curr);
- break;
- case SrcStatus::IS_LOWER_HALF:
- if (isLshrHalf(MI, MRI))
- return retOpStat(&MI->getOperand(1), SrcStatus::IS_UPPER_HALF, Curr);
- break;
- case SrcStatus::IS_UPPER_HALF_NEG:
- if (isShlHalf(MI, MRI))
- return retOpStat(&MI->getOperand(1), SrcStatus::IS_LOWER_HALF_NEG, Curr);
- break;
- case SrcStatus::IS_LOWER_HALF_NEG:
- if (isLshrHalf(MI, MRI))
- return retOpStat(&MI->getOperand(1), SrcStatus::IS_UPPER_HALF_NEG, Curr);
- break;
- default:
- break;
- }
- return std::nullopt;
-}
-
-class searchOptions {
-private:
- bool HasNeg = false;
- // Assume all complex pattern of VOP3P has opsel.
- bool HasOpsel = true;
-
-public:
- searchOptions(const MachineOperand *RootOp, const MachineRegisterInfo &MRI) {
- const MachineInstr *MI = RootOp->getParent();
- unsigned Opc = MI->getOpcode();
-
- if (Opc < TargetOpcode::GENERIC_OP_END) {
- // Keep same for generic op.
- HasNeg = true;
- } else if (Opc == TargetOpcode::G_INTRINSIC) {
- Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
- // Only float point intrinsic has neg & neg_hi bits.
- if (IntrinsicID == Intrinsic::amdgcn_fdot2)
- HasNeg = true;
- }
- }
- bool checkOptions(SrcStatus Stat) const {
- if (!HasNeg &&
- (Stat >= SrcStatus::NEG_START || Stat <= SrcStatus::NEG_END)) {
- return false;
- }
- if (!HasOpsel &&
- (Stat >= SrcStatus::HALF_START || Stat >= SrcStatus::HALF_END)) {
- return false;
- }
- return true;
- }
-};
-
-static SmallVector<std::pair<const MachineOperand *, SrcStatus>>
-getSrcStats(const MachineOperand *Op, const MachineRegisterInfo &MRI,
- searchOptions SearchOptions, int MaxDepth = 6) {
- int Depth = 0;
- auto Curr = calcNextStatus({Op, SrcStatus::IS_SAME}, MRI);
- SmallVector<std::pair<const MachineOperand *, SrcStatus>, 4> Statlist;
-
- while (Depth <= MaxDepth && Curr.has_value()) {
- Depth++;
- if (SearchOptions.checkOptions(Curr.value().second))
- Statlist.push_back(Curr.value());
- Curr = calcNextStatus(Curr.value(), MRI);
- }
-
- return Statlist;
-}
-
-static std::pair<const MachineOperand *, SrcStatus>
-getLastSameOrNeg(const MachineOperand *Op, const MachineRegisterInfo &MRI,
- searchOptions SearchOptions, int MaxDepth = 6) {
- int Depth = 0;
- std::pair<const MachineOperand *, SrcStatus> LastSameOrNeg = {
- Op, SrcStatus::IS_SAME};
- auto Curr = calcNextStatus(LastSameOrNeg, MRI);
-
- while (Depth <= MaxDepth && Curr.has_value()) {
- Depth++;
- if (SearchOptions.checkOptions(Curr.value().second)) {
- if (Curr.value().second == SrcStatus::IS_SAME ||
- Curr.value().second == SrcStatus::IS_HI_NEG ||
- Curr.value().second == SrcStatus::IS_LO_NEG ||
- Curr.value().second == SrcStatus::IS_BOTH_NEG)
- LastSameOrNeg = Curr.value();
- }
- Curr = calcNextStatus(Curr.value(), MRI);
- }
-
- return LastSameOrNeg;
-}
-
-static bool isInlinableFPConstant(const MachineOperand &Op,
- const SIInstrInfo &TII) {
- return Op.isFPImm() && TII.isInlineConstant(Op.getFPImm()->getValueAPF());
-}
-
-static bool isSameBitWidth(const MachineOperand *Op1, const MachineOperand *Op2,
- const MachineRegisterInfo &MRI) {
- unsigned Width1 = MRI.getType(Op1->getReg()).getSizeInBits();
- unsigned Width2 = MRI.getType(Op2->getReg()).getSizeInBits();
- return Width1 == Width2;
-}
-
-static bool isSameOperand(const MachineOperand *Op1,
- const MachineOperand *Op2) {
- if (Op1->isReg())
- return Op2->isReg() && Op1->getReg() == Op2->getReg();
-
- return Op1->isIdenticalTo(*Op2);
-}
-
-static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
- // SrcStatus::IS_LOWER_HALF remain 0.
- if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
- Mods ^= SISrcMods::NEG_HI;
- Mods |= SISrcMods::OP_SEL_1;
- } else if (HiStat == SrcStatus::IS_UPPER_HALF)
- Mods |= SISrcMods::OP_SEL_1;
- else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
- Mods ^= SISrcMods::NEG_HI;
- else if (HiStat == SrcStatus::IS_HI_NEG)
- Mods ^= SISrcMods::NEG_HI;
-
- if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
- Mods ^= SISrcMods::NEG;
- Mods |= SISrcMods::OP_SEL_0;
- } else if (LoStat == SrcStatus::IS_UPPER_HALF)
- Mods |= SISrcMods::OP_SEL_0;
- else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
- Mods |= SISrcMods::NEG;
- else if (LoStat == SrcStatus::IS_HI_NEG)
- Mods ^= SISrcMods::NEG;
-
- return Mods;
-}
-
-static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat,
- const MachineOperand *NewOp,
- const MachineOperand *RootOp, const SIInstrInfo &TII,
- const MachineRegisterInfo &MRI) {
- if (NewOp->isReg()) {
- auto IsHalfState = [](SrcStatus S) {
- return S == SrcStatus::IS_UPPER_HALF ||
- S == SrcStatus::IS_UPPER_HALF_NEG ||
- S == SrcStatus::IS_LOWER_HALF || S == SrcStatus::IS_LOWER_HALF_NEG;
- };
- return isSameBitWidth(NewOp, RootOp, MRI) && IsHalfState(LoStat) &&
- IsHalfState(HiStat);
- } else
- return ((HiStat == SrcStatus::IS_SAME || HiStat == SrcStatus::IS_HI_NEG) &&
- (LoStat == SrcStatus::IS_SAME || LoStat == SrcStatus::IS_HI_NEG) &&
- isInlinableFPConstant(*NewOp, TII));
-
- return false;
-}
-
-std::pair<const MachineOperand *, unsigned>
-AMDGPUInstructionSelector::selectVOP3PModsImpl(const MachineOperand *RootOp,
- const MachineRegisterInfo &MRI,
- bool IsDOT) const {
+std::pair<Register, unsigned>
+AMDGPUInstructionSelector::selectVOP3PModsImpl(
+ Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
unsigned Mods = 0;
- const MachineOperand *Op = RootOp;
- // No modification if Root type is not form of <2 x Type>.
- if (isVectorOfTwoOrScalar(Op, MRI) != TypeClass::VECTOR_OF_TWO) {
- Mods |= SISrcMods::OP_SEL_1;
- return {Op, Mods};
- }
-
- searchOptions SearchOptions(Op, MRI);
+ MachineInstr *MI = MRI.getVRegDef(Src);
- std::pair<const MachineOperand *, SrcStatus> Stat =
- getLastSameOrNeg(Op, MRI, SearchOptions);
- if (!Stat.first->isReg()) {
- Mods |= SISrcMods::OP_SEL_1;
- return {Op, Mods};
- }
- if (Stat.second == SrcStatus::IS_BOTH_NEG)
+ if (MI->getOpcode() == AMDGPU::G_FNEG &&
+ // It's possible to see an f32 fneg here, but unlikely.
+ // TODO: Treat f32 fneg as only high bit.
+ MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
- else if (Stat.second == SrcStatus::IS_HI_NEG)
- Mods ^= SISrcMods::NEG_HI;
- else if (Stat.second == SrcStatus::IS_LO_NEG)
- Mods ^= SISrcMods::NEG;
-
- Op = Stat.first;
- MachineInstr *MI = MRI.getVRegDef(Op->getReg());
-
- if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
- (IsDOT && Subtarget->hasDOTOpSelHazard())) {
- Mods |= SISrcMods::OP_SEL_1;
- return {Op, Mods};
- }
-
- SmallVector<std::pair<const MachineOperand *, SrcStatus>> StatlistHi =
- getSrcStats(&MI->getOperand(2), MRI, SearchOptions);
-
- if (StatlistHi.size() == 0) {
- Mods |= SISrcMods::OP_SEL_1;
- return {Op, Mods};
+ Src = MI->getOperand(1).getReg();
+ MI = MRI.getVRegDef(Src);
}
- SmallVector<std::pair<const MachineOperand *, SrcStatus>> StatlistLo =
- getSrcStats(&MI->getOperand(1), MRI, SearchOptions);
+ // TODO: Handle G_FSUB 0 as fneg
- if (StatlistLo.size() == 0) {
- Mods |= SISrcMods::OP_SEL_1;
- return {Op, Mods};
- }
+ // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
+ (void)IsDOT; // DOTs do not use OPSEL on gfx942+, check ST.hasDOTOpSelHazard()
- for (int I = StatlistHi.size() - 1; I >= 0; I--) {
- for (int J = StatlistLo.size() - 1; J >= 0; J--) {
- if (isSameOperand(StatlistHi[I].first, StatlistLo[J].first) &&
- isValidToPack(StatlistHi[I].second, StatlistLo[J].second,
- StatlistHi[I].first, RootOp, TII, MRI))
- return {StatlistHi[I].first,
- updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)};
- }
- }
// Packed instructions do not have abs modifiers.
Mods |= SISrcMods::OP_SEL_1;
- return {Op, Mods};
-}
-
-int64_t getAllKindImm(const MachineOperand *Op) {
- switch (Op->getType()) {
- case MachineOperand::MachineOperandType::MO_Immediate:
- return Op->getImm();
- case MachineOperand::MachineOperandType::MO_CImmediate:
- return Op->getCImm()->getSExtValue();
- case MachineOperand::MachineOperandType::MO_FPImmediate:
- return Op->getFPImm()->getValueAPF().bitcastToAPInt().getSExtValue();
- default:
- llvm_unreachable("not an imm type");
- }
-}
-
-static bool checkRB(const MachineOperand *Op, unsigned int RBNo,
- const AMDGPURegisterBankInfo &RBI,
- const MachineRegisterInfo &MRI,
- const TargetRegisterInfo &TRI) {
- const RegisterBank *RB = RBI.getRegBank(Op->getReg(), MRI, TRI);
- return RB->getID() == RBNo;
-}
-
-// This function is used to get the correct register bank for returned reg.
-// Assume:
-// 1. VOP3P is always legal for VGPR.
-// 2. RootOp's regbank is legal.
-// Thus
-// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
-// 2. If RootOp is VGPR, then NewOp must be VGPR.
-static const MachineOperand *
-getLegalRegBank(const MachineOperand *NewOp, const MachineOperand *RootOp,
- const AMDGPURegisterBankInfo &RBI, MachineRegisterInfo &MRI,
- const TargetRegisterInfo &TRI, const SIInstrInfo &TII) {
- // RootOp can only be VGPR or SGPR (some hand written cases such as.
- // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
- if (checkRB(RootOp, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
- checkRB(NewOp, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
- return NewOp;
-
- MachineInstr *MI = MRI.getVRegDef(RootOp->getReg());
- if (MI->getOpcode() == AMDGPU::COPY &&
- isSameOperand(NewOp, &MI->getOperand(1))) {
- // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
- return RootOp;
- }
-
- MachineBasicBlock *BB = MI->getParent();
- const TargetRegisterClass *DstRC =
- TRI.getConstrainedRegClassForOperand(*RootOp, MRI);
- Register DstReg = MRI.createVirtualRegister(DstRC);
-
- MachineInstrBuilder MIB =
- BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
- .addReg(NewOp->getReg());
-
- // only accept VGPR.
- return &MIB->getOperand(0);
+ return std::pair(Src, Mods);
}
InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
- bool IsDOT) const {
- MachineRegisterInfo &MRI =
- Root.getParent()->getParent()->getParent()->getRegInfo();
- auto [Op, Mods] = selectVOP3PModsImpl(&Root, MRI, IsDOT);
- if (!(Op->isReg()))
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(Op)); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
- }};
+AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
+ MachineRegisterInfo &MRI
+ = Root.getParent()->getParent()->getParent()->getRegInfo();
+
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
- Op = getLegalRegBank(Op, &Root, RBI, MRI, TRI, TII);
return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(Op->getReg()); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
}};
}
-InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
-
- return selectVOP3PRetHelper(Root);
-}
-
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
+ MachineRegisterInfo &MRI
+ = Root.getParent()->getParent()->getParent()->getRegInfo();
+
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
- return selectVOP3PRetHelper(Root, true);
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ }};
}
InstructionSelector::ComplexRendererFns
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index a224f39f88996..6c3f3026e877a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -187,11 +187,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
ComplexRendererFns selectVOP3NoMods(MachineOperand &Root) const;
- std::pair<const MachineOperand *, unsigned>
- selectVOP3PModsImpl(const MachineOperand *Op, const MachineRegisterInfo &MRI,
+ std::pair<Register, unsigned>
+ selectVOP3PModsImpl(Register Src, const MachineRegisterInfo &MRI,
bool IsDOT = false) const;
- InstructionSelector::ComplexRendererFns
- selectVOP3PRetHelper(MachineOperand &Root, bool IsDOT = false) const;
InstructionSelector::ComplexRendererFns
selectVOP3PMods(MachineOperand &Root) const;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
index 534b454775502..543f8e413abd8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
@@ -106,169 +106,6 @@ define <2 x half> @v_fmul_v2f16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b)
ret <2 x half> %mul
}
-define <2 x half> @v_fmul_v2f16_partial_neg(<2 x half> %a, <2 x half> %b) {
-; GFX9-LABEL: v_fmul_v2f16_partial_neg:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_mul_f16 v0, v1, v0 neg_hi:[1,0]
-; GFX9-NEXT: v_pk_mul_f16 v0, v1, v0 neg_lo:[1,0]
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmul_v2f16_partial_neg:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v1
-; GFX8-NEXT: v_mul_f16_e32 v3, v1, v0
-; GFX8-NEXT: v_mul_f16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_mul_f16_e32 v1, v2, v3
-; GFX8-NEXT: v_mul_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fmul_v2f16_partial_neg:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_mul_f16 v0, v1, v0 neg_hi:[1,0]
-; GFX10-NEXT: v_pk_mul_f16 v0, v1, v0 neg_lo:[1,0]
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %b1 = bitcast <2 x half> %b to float
- %b2 = fneg float %b1
- %b3 = bitcast float %b2 to <2 x half>
- %b4 = fneg <2 x half> %b3
- %mul1 = fmul <2 x half> %b3, %a
- %mul2 = fmul <2 x half> %b4, %mul1
- ret <2 x half> %mul2
-}
-
-define <2 x half> @fmul_v2_half_neg_hi(<2 x half> %a, <2 x half> %b) #0 {
-; GFX9-LABEL: fmul_v2_half_neg_hi:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 neg_hi:[0,1]
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: fmul_v2_half_neg_hi:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
-; GFX8-NEXT: v_mul_f16_e32 v1, v0, v1
-; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: fmul_v2_half_neg_hi:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 neg_hi:[0,1]
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %b1 = bitcast <2 x half> %b to float
- %b2 = fneg float %b1
- %b3 = bitcast float %b2 to <2 x half>
- %b4 = extractelement <2 x half> %b3, i64 1
- %tmp = insertelement <2 x half> poison, half %b4, i64 0
- %k = shufflevector <2 x half> %tmp, <2 x half> %b, <2 x i32> <i32 2, i32 0>
- %mul = fmul <2 x half> %a, %k
- ret <2 x half> %mul
-}
-
-define <2 x half> @fmul_v2_half_neg_hi1(<2 x half> %a, <2 x half> %b) #0 {
-; GFX9-LABEL: fmul_v2_half_neg_hi1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: fmul_v2_half_neg_hi1:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
-; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; GFX8-NEXT: v_mul_f16_e32 v1, v0, v1
-; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: fmul_v2_half_neg_hi1:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %b1 = bitcast <2 x half> %b to float
- %b2 = fneg float %b1
- %b3 = bitcast float %b2 to <2 x half>
- %b4 = fneg <2 x half> %b3
- %b5 = extractelement <2 x half> %b4, i64 1
- %tmp = insertelement <2 x half> poison, half %b5, i64 0
- %k = shufflevector <2 x half> %tmp, <2 x half> %b, <2 x i32> <i32 2, i32 0>
- %mul = fmul <2 x half> %a, %k
- ret <2 x half> %mul
-}
-
-define <2 x half> @fmul_v2_half_neg_lo(<2 x half> %a, <2 x half> %b) #0 {
-; GFX9-LABEL: fmul_v2_half_neg_lo:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 op_sel_hi:[1,0] neg_hi:[0,1]
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: fmul_v2_half_neg_lo:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
-; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; GFX8-NEXT: v_mul_f16_e32 v1, v0, v1
-; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: fmul_v2_half_neg_lo:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 op_sel_hi:[1,0] neg_hi:[0,1]
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %b1 = bitcast <2 x half> %b to float
- %b2 = fneg float %b1
- %b3 = bitcast float %b2 to <2 x half>
- %b4 = fneg <2 x half> %b3
- %b5 = extractelement <2 x half> %b4, i64 0
- %tmp = insertelement <2 x half> poison, half %b5, i64 0
- %k = shufflevector <2 x half> %tmp, <2 x half> %b, <2 x i32> <i32 2, i32 0>
- %mul = fmul <2 x half> %a, %k
- ret <2 x half> %mul
-}
-
-define <2 x half> @fmul_v2_half_neg_lo1(<2 x half> %a, <2 x half> %b) #0 {
-; GFX9-LABEL: fmul_v2_half_neg_lo1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 op_sel_hi:[1,0]
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: fmul_v2_half_neg_lo1:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
-; GFX8-NEXT: v_mul_f16_e32 v1, v0, v1
-; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: fmul_v2_half_neg_lo1:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 op_sel_hi:[1,0]
-; GFX10-NEXT: s_setpc_b64 s[30:31]
- %b1 = bitcast <2 x half> %b to float
- %b2 = fneg float %b1
- %b3 = bitcast float %b2 to <2 x half>
- %b4 = extractelement <2 x half> %b3, i64 0
- %tmp = insertelement <2 x half> poison, half %b4, i64 0
- %k = shufflevector <2 x half> %tmp, <2 x half> %b, <2 x i32> <i32 2, i32 0>
- %mul = fmul <2 x half> %a, %k
- ret <2 x half> %mul
-}
-
define <3 x half> @v_fmul_v3f16(<3 x half> %a, <3 x half> %b) {
; GFX9-LABEL: v_fmul_v3f16:
; GFX9: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
index 8f0ae8c47098a..744a5b7feb48d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
@@ -304,7 +304,8 @@ define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_shuffle10_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
+; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16
+; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_shuffle10_a:
@@ -318,7 +319,8 @@ define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX10-LABEL: v_sdot2_shuffle10_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
+; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
+; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%shuf.a = shufflevector <2 x i16> %a, <2 x i16> poison, <2 x i32> <i32 1, i32 0>
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
@@ -329,7 +331,8 @@ define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_shuffle10_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
+; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16
+; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_shuffle10_b:
@@ -343,7 +346,8 @@ define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX10-LABEL: v_sdot2_shuffle10_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16
+; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%shuf.b = shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> <i32 1, i32 0>
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
index 287a009ca1405..9e623494a5a04 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
@@ -289,19 +289,22 @@ define i32 @v_udot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_udot2_shuffle10_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
+; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16
+; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_shuffle10_a:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
+; GFX908-NEXT: v_alignbit_b32 v0, v0, v0, 16
+; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_shuffle10_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
+; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
+; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%shuf.a = shufflevector <2 x i16> %a, <2 x i16> poison, <2 x i32> <i32 1, i32 0>
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
@@ -312,19 +315,22 @@ define i32 @v_udot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_udot2_shuffle10_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
+; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16
+; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_shuffle10_b:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
+; GFX908-NEXT: v_alignbit_b32 v1, v1, v1, 16
+; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_shuffle10_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16
+; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%shuf.b = shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> <i32 1, i32 0>
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
index 141b86a24c1c4..2f1dfa11fd34d 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
@@ -742,8 +742,9 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half>
;
; GFX9-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-GISEL-NEXT: v_pk_add_f16 v0, s2, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-GISEL-NEXT: v_pk_add_f16 v0, s2, v0
; GFX9-GISEL-NEXT: ; return to shader part epilog
;
; GFX8-SDAG-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
@@ -783,7 +784,8 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half>
;
; GFX10-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: v_pk_add_f16 v0, s2, s3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000
+; GFX10-GISEL-NEXT: v_pk_add_f16 v0, s2, s0
; GFX10-GISEL-NEXT: ; return to shader part epilog
;
; GFX11-SDAG-TRUE16-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
@@ -808,7 +810,8 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half>
;
; GFX11-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: v_pk_add_f16 v0, s2, s3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX11-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000
+; GFX11-GISEL-NEXT: v_pk_add_f16 v0, s2, s0
; GFX11-GISEL-NEXT: ; return to shader part epilog
; GFX10PLUS-SDAG-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX10PLUS-SDAG: ; %bb.0:
@@ -821,7 +824,8 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half>
; GFX10PLUS-SDAG-NEXT: ; return to shader part epilog
; GFX10PLUS-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX10PLUS-GISEL: ; %bb.0:
-; GFX10PLUS-GISEL-NEXT: v_pk_add_f16 v0, s2, s3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10PLUS-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000
+; GFX10PLUS-GISEL-NEXT: v_pk_add_f16 v0, s2, s0
; GFX10PLUS-GISEL-NEXT: ; return to shader part epilog
%val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret <2 x half> %val
More information about the llvm-branch-commits
mailing list