[llvm-branch-commits] [llvm] 14a4448 - Revert "Revert "[AMDGPU] Re-apply: Implement vop3p complex pattern optmizatio…"
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Jul 3 18:43:54 PDT 2025
Author: Shoreshen
Date: 2025-07-04T09:43:51+08:00
New Revision: 14a4448afc8e65b8610d78fc66f2695a691a25b3
URL: https://github.com/llvm/llvm-project/commit/14a4448afc8e65b8610d78fc66f2695a691a25b3
DIFF: https://github.com/llvm/llvm-project/commit/14a4448afc8e65b8610d78fc66f2695a691a25b3.diff
LOG: Revert "Revert "[AMDGPU] Re-apply: Implement vop3p complex pattern optmizatio…"
This reverts commit 5b8304d6b90c42f2d3cf918e5e0f935767866e64.
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
llvm/test/CodeGen/AMDGPU/packed-fp32.ll
llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index b632b16f5c198..fd679a9933cf0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4327,60 +4327,591 @@ AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
}};
}
-std::pair<Register, unsigned>
-AMDGPUInstructionSelector::selectVOP3PModsImpl(
- Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
+enum class SrcStatus { // Relation between the value being searched for (src) and the current op on its def chain.
+ IS_SAME, // src = current op.
+ IS_UPPER_HALF, // current op = [op_upper, op_lower] and src = op_upper.
+ IS_LOWER_HALF, // current op = [op_upper, op_lower] and src = op_lower.
+ IS_UPPER_HALF_NEG, // current op = [op_upper, op_lower] and src = -op_upper.
+ // This means current op = [op_upper, op_lower] and src = -op_lower.
+ IS_LOWER_HALF_NEG,
+ IS_HI_NEG, // current op = [op_upper, op_lower] and src = [-op_upper, op_lower].
+ // This means current op = [op_upper, op_lower] and src = [op_upper,
+ // -op_lower].
+ IS_LO_NEG,
+ IS_BOTH_NEG, // current op = [op_upper, op_lower] and src = [-op_upper, -op_lower].
+ INVALID, // Returned by getNegStatus when the negated type is neither a scalar nor a 2-element vector.
+ NEG_START = IS_UPPER_HALF_NEG, // First status of the contiguous range that SearchOptions::checkOptions treats as needing a neg bit.
+ NEG_END = IS_BOTH_NEG, // Last status of that range.
+ HALF_START = IS_UPPER_HALF, // First status of the contiguous range that SearchOptions::checkOptions treats as needing opsel.
+ HALF_END = IS_LOWER_HALF_NEG // Last status of that range; both ranges depend on the enumerator order above.
+};
+/// Returns true when \p MI is a G_TRUNC whose result is exactly half as wide
+/// as its input, such as `%reg0:n = G_TRUNC %reg1:2n`.
+static bool isTruncHalf(const MachineInstr *MI,
+                        const MachineRegisterInfo &MRI) {
+  if (MI->getOpcode() == AMDGPU::G_TRUNC) {
+    const unsigned OutBits =
+        MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
+    const unsigned InBits =
+        MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
+    return OutBits * 2 == InBits;
+  }
+  return false;
+}
+
+/// Returns true when \p MI is a logical shift right by a constant equal to
+/// half of the shifted operand's width, such as
+/// `%reg0:2n = G_LSHR %reg1:2n, CONST(n)`.
+static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
+  if (MI->getOpcode() != AMDGPU::G_LSHR)
+    return false;
+
+  Register Shifted;
+  std::optional<ValueAndVReg> Amount;
+  const bool HasConstAmount =
+      mi_match(MI->getOperand(0).getReg(), MRI,
+               m_GLShr(m_Reg(Shifted), m_GCst(Amount)));
+  if (!HasConstAmount)
+    return false;
+
+  const unsigned OperandBits =
+      MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
+  const unsigned ShiftBits = Amount->Value.getZExtValue();
+  return ShiftBits * 2 == OperandBits;
+}
+
+/// Returns true when \p MI is a shift left by a constant equal to half of the
+/// shifted operand's width, such as `%reg0:2n = G_SHL %reg1:2n, CONST(n)`.
+static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
+  if (MI->getOpcode() != AMDGPU::G_SHL)
+    return false;
+
+  Register Shifted;
+  std::optional<ValueAndVReg> Amount;
+  const bool HasConstAmount =
+      mi_match(MI->getOperand(0).getReg(), MRI,
+               m_GShl(m_Reg(Shifted), m_GCst(Amount)));
+  if (!HasConstAmount)
+    return false;
+
+  const unsigned OperandBits =
+      MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
+  const unsigned ShiftBits = Amount->Value.getZExtValue();
+  return ShiftBits * 2 == OperandBits;
+}
+
+/// Returns true when \p MI has the shape
+/// `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`: a G_UNMERGE_VALUES with
+/// exactly three operands, the first two being definitions and the last one
+/// not. Only the operand layout is inspected; \p MRI is not consulted.
+static bool isUnmergeHalf(const MachineInstr *MI,
+                          const MachineRegisterInfo &MRI) {
+  if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
+    return false;
+  if (MI->getNumOperands() != 3)
+    return false;
+
+  const bool DefinesTwo =
+      MI->getOperand(0).isDef() && MI->getOperand(1).isDef();
+  return DefinesTwo && !MI->getOperand(2).isDef();
+}
+
+enum class TypeClass { VECTOR_OF_TWO, SCALAR, NONE_OF_LISTED }; // Coarse shape of a register's LLT, as computed by isVectorOfTwoOrScalar.
+
+/// Classifies the LLT of \p Reg as a scalar, a vector with exactly two
+/// elements, or neither of the two.
+static TypeClass isVectorOfTwoOrScalar(Register Reg,
+                                       const MachineRegisterInfo &MRI) {
+  const LLT Ty = MRI.getType(Reg);
+  if (Ty.isScalar())
+    return TypeClass::SCALAR;
+
+  const bool IsPair = Ty.isVector() && Ty.getNumElements() == 2;
+  return IsPair ? TypeClass::VECTOR_OF_TWO : TypeClass::NONE_OF_LISTED;
+}
+
+/// Folds one G_FNEG into the status \p S.
+///
+/// \p Reg is the register produced by the G_FNEG (the "current" op) and \p S
+/// says how the searched source relates to it. The returned status says how
+/// the same source relates to the G_FNEG's operand (Op).
+///
+/// The type of \p Reg decides what the negation does to the two halves:
+///  - vector of two: [CurrHi, CurrLo] = [-OpHi, -OpLo]
+///  - scalar:        [CurrHi, CurrLo] = [-OpHi,  OpLo]
+///
+/// \returns SrcStatus::INVALID when \p Reg is neither a scalar nor a vector
+/// of two elements. Every other path returns from inside the switch, so
+/// control can never run off the end of the function.
+static SrcStatus getNegStatus(Register Reg, SrcStatus S,
+                              const MachineRegisterInfo &MRI) {
+  const TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI);
+  if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
+    return SrcStatus::INVALID;
+
+  // Only VECTOR_OF_TWO and SCALAR remain, so a single flag selects the result.
+  const bool IsVec = NegType == TypeClass::VECTOR_OF_TWO;
+
+  switch (S) {
+  case SrcStatus::IS_SAME:
+    // [SrcHi, SrcLo] = [CurrHi, CurrLo]
+    //   vector: = [-OpHi, -OpLo]
+    //   scalar: = [-OpHi, OpLo]
+    return IsVec ? SrcStatus::IS_BOTH_NEG : SrcStatus::IS_HI_NEG;
+  case SrcStatus::IS_HI_NEG:
+    // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
+    //   vector: = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
+    //   scalar: = [-(-OpHi), OpLo] = [OpHi, OpLo]
+    return IsVec ? SrcStatus::IS_LO_NEG : SrcStatus::IS_SAME;
+  case SrcStatus::IS_LO_NEG:
+    // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
+    //   vector: = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
+    //   scalar: = [-OpHi, -OpLo]
+    return IsVec ? SrcStatus::IS_HI_NEG : SrcStatus::IS_BOTH_NEG;
+  case SrcStatus::IS_BOTH_NEG:
+    // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
+    //   vector: = [OpHi, OpLo]
+    //   scalar: = [OpHi, -OpLo]
+    return IsVec ? SrcStatus::IS_SAME : SrcStatus::IS_LO_NEG;
+  case SrcStatus::IS_UPPER_HALF:
+    // Src = CurrUpper, and CurrUpper = -OpUpper for both vector and scalar.
+    return SrcStatus::IS_UPPER_HALF_NEG;
+  case SrcStatus::IS_LOWER_HALF:
+    // Src = CurrLower
+    //   vector: CurrLower = -OpLower, so Src = -OpLower
+    //   scalar: CurrLower = OpLower, so Src = OpLower
+    return IsVec ? SrcStatus::IS_LOWER_HALF_NEG : SrcStatus::IS_LOWER_HALF;
+  case SrcStatus::IS_UPPER_HALF_NEG:
+    // Src = -CurrUpper = -(-OpUpper) = OpUpper for both vector and scalar.
+    return SrcStatus::IS_UPPER_HALF;
+  case SrcStatus::IS_LOWER_HALF_NEG:
+    // Src = -CurrLower
+    //   vector: = -(-OpLower) = OpLower
+    //   scalar: = -OpLower
+    return IsVec ? SrcStatus::IS_LOWER_HALF : SrcStatus::IS_LOWER_HALF_NEG;
+  default:
+    llvm_unreachable("unexpected SrcStatus");
+  }
+}
+
+/// Moves one step up the def chain of \p Curr.first.
+///
+/// \returns the register feeding the defining instruction together with the
+/// status describing how the searched source relates to that register, or
+/// std::nullopt when the defining instruction cannot be looked through for
+/// the current status.
+static std::optional<std::pair<Register, SrcStatus>>
+calcNextStatus(std::pair<Register, SrcStatus> Curr,
+               const MachineRegisterInfo &MRI) {
+  using StatPair = std::pair<Register, SrcStatus>;
+
+  const MachineInstr *Def = MRI.getVRegDef(Curr.first);
+  const SrcStatus CurStat = Curr.second;
+
+  // Successor built from operand OpIdx of the defining instruction.
+  auto StepTo = [Def](unsigned OpIdx, SrcStatus Stat) {
+    return std::optional<StatPair>(
+        StatPair(Def->getOperand(OpIdx).getReg(), Stat));
+  };
+
+  // Opcodes that are handled the same way for every status.
+  const unsigned Opc = Def->getOpcode();
+  if (Opc == AMDGPU::G_BITCAST)
+    return StepTo(1, CurStat);
+  if (Opc == AMDGPU::COPY) {
+    // Do not look through copies from physical registers.
+    if (Def->getOperand(1).getReg().isPhysical())
+      return std::nullopt;
+    return StepTo(1, CurStat);
+  }
+  if (Opc == AMDGPU::G_FNEG) {
+    const SrcStatus Negated = getNegStatus(Curr.first, CurStat, MRI);
+    if (Negated == SrcStatus::INVALID)
+      return std::nullopt;
+    return StepTo(1, Negated);
+  }
+
+  // Status-dependent steps.
+  switch (CurStat) {
+  case SrcStatus::IS_SAME:
+  case SrcStatus::IS_HI_NEG: {
+    // For IS_HI_NEG on a truncated value:
+    //   [SrcHi, SrcLo] = [-CurrHi, CurrLo], Curr = trunc [OpUpper, OpLower]
+    //   = OpLower, hence Src = [-OpLowerHi, OpLowerLo] = -OpLower.
+    const bool Neg = CurStat == SrcStatus::IS_HI_NEG;
+    const SrcStatus Lower =
+        Neg ? SrcStatus::IS_LOWER_HALF_NEG : SrcStatus::IS_LOWER_HALF;
+    const SrcStatus Upper =
+        Neg ? SrcStatus::IS_UPPER_HALF_NEG : SrcStatus::IS_UPPER_HALF;
+
+    if (isTruncHalf(Def, MRI))
+      return StepTo(1, Lower);
+    if (isUnmergeHalf(Def, MRI)) {
+      // The first result of the unmerge maps to the lower half, the second
+      // to the upper half.
+      const bool IsFirstResult = Curr.first == Def->getOperand(0).getReg();
+      return StepTo(2, IsFirstResult ? Lower : Upper);
+    }
+    break;
+  }
+  case SrcStatus::IS_UPPER_HALF:
+  case SrcStatus::IS_UPPER_HALF_NEG:
+    // The upper half of (x << half) is the lower half of x.
+    if (isShlHalf(Def, MRI))
+      return StepTo(1, CurStat == SrcStatus::IS_UPPER_HALF
+                           ? SrcStatus::IS_LOWER_HALF
+                           : SrcStatus::IS_LOWER_HALF_NEG);
+    break;
+  case SrcStatus::IS_LOWER_HALF:
+  case SrcStatus::IS_LOWER_HALF_NEG:
+    // The lower half of (x >> half) is the upper half of x.
+    if (isLshrHalf(Def, MRI))
+      return StepTo(1, CurStat == SrcStatus::IS_LOWER_HALF
+                           ? SrcStatus::IS_UPPER_HALF
+                           : SrcStatus::IS_UPPER_HALF_NEG);
+    break;
+  default:
+    break;
+  }
+  return std::nullopt;
+}
+
+/// This is used to control valid status that current MI supports. For example,
+/// non floating point intrinsic such as @llvm.amdgcn.sdot2 does not support NEG
+/// bit on VOP3P.
+/// The class can be further extended to recognize support on SEL, NEG, ABS bit
+/// for different MI on different arch
+class SearchOptions {
+private:
+ bool HasNeg = false; // Whether statuses in [NEG_START, NEG_END] are accepted by checkOptions.
+ // Assume all complex pattern of VOP3P have opsel.
+ bool HasOpsel = true; // Whether statuses in [HALF_START, HALF_END] are accepted by checkOptions.
+
+public:
+ SearchOptions(Register Reg, const MachineRegisterInfo &MRI) { // Derives the options from the instruction that defines Reg.
+ const MachineInstr *MI = MRI.getVRegDef(Reg);
+ unsigned Opc = MI->getOpcode();
+
+ if (Opc < TargetOpcode::GENERIC_OP_END) {
+ // Keep same for generic op.
+ HasNeg = true;
+ } else if (Opc == TargetOpcode::G_INTRINSIC) { // NOTE(review): G_INTRINSIC looks like a generic opcode, so the check above may make this branch unreachable — confirm.
+ Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
+ // Only floating-point intrinsics have neg & neg_hi bits.
+ if (IntrinsicID == Intrinsic::amdgcn_fdot2)
+ HasNeg = true;
+ }
+ }
+ bool checkOptions(SrcStatus Stat) const { // Returns false when Stat needs a modifier kind that is disabled.
+ if (!HasNeg &&
+ (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
+ return false;
+ }
+ if (!HasOpsel &&
+ (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
+ return false;
+ }
+ return true;
+ }
+};
+
+/// Walks up the def chain of \p Reg with calcNextStatus and collects every
+/// visited (register, status) pair that \p SO accepts, nearest first.
+///
+/// The walk stops when calcNextStatus yields nothing or after the depth
+/// counter has run from 0 through \p MaxDepth inclusive.
+static SmallVector<std::pair<Register, SrcStatus>>
+getSrcStats(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO,
+            int MaxDepth = 3) {
+  SmallVector<std::pair<Register, SrcStatus>> Collected;
+  auto Step = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI);
+
+  for (int Depth = 0; Depth <= MaxDepth && Step.has_value(); ++Depth) {
+    if (SO.checkOptions(Step->second))
+      Collected.push_back(*Step);
+    Step = calcNextStatus(*Step, MRI);
+  }
+
+  return Collected;
+}
+
+/// Walks up the def chain of \p Reg with calcNextStatus and returns the
+/// farthest visited (register, status) pair that \p SO accepts and whose
+/// status is IS_SAME, IS_HI_NEG, IS_LO_NEG or IS_BOTH_NEG.
+///
+/// Falls back to {\p Reg, IS_SAME} when no such pair is found. The walk stops
+/// when calcNextStatus yields nothing or after the depth counter has run from
+/// 0 through \p MaxDepth inclusive.
+static std::pair<Register, SrcStatus>
+getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO,
+                 int MaxDepth = 3) {
+  auto IsSameOrNeg = [](SrcStatus S) {
+    return S == SrcStatus::IS_SAME || S == SrcStatus::IS_HI_NEG ||
+           S == SrcStatus::IS_LO_NEG || S == SrcStatus::IS_BOTH_NEG;
+  };
+
+  std::pair<Register, SrcStatus> Best = {Reg, SrcStatus::IS_SAME};
+  auto Step = calcNextStatus(Best, MRI);
+
+  for (int Depth = 0; Depth <= MaxDepth && Step.has_value(); ++Depth) {
+    const SrcStatus Stat = Step->second;
+    if (SO.checkOptions(Stat) && IsSameOrNeg(Stat))
+      Best = *Step;
+    Step = calcNextStatus(*Step, MRI);
+  }
+
+  return Best;
+}
+
+/// Returns true when the LLTs of \p Reg1 and \p Reg2 have the same size in
+/// bits.
+static bool isSameBitWidth(Register Reg1, Register Reg2,
+                           const MachineRegisterInfo &MRI) {
+  const unsigned FirstBits = MRI.getType(Reg1).getSizeInBits();
+  const unsigned SecondBits = MRI.getType(Reg2).getSizeInBits();
+  return FirstBits == SecondBits;
+}
+
+/// Folds the statuses of the two packed elements into source modifier bits.
+///
+/// \p HiStat and \p LoStat say which half of the chosen source register the
+/// high and the low element come from, and whether that half is negated.
+/// \p Mods holds the modifiers accumulated so far; it may already carry NEG
+/// or NEG_HI, so every negation found here toggles the matching bit rather
+/// than setting it, letting a double negation cancel out.
+///
+/// \returns \p Mods with OP_SEL_1 / OP_SEL_0 set when the high / low element
+/// reads the upper half, and NEG_HI / NEG toggled when it is negated.
+static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
+  // SrcStatus::IS_LOWER_HALF remain 0.
+  if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
+    Mods ^= SISrcMods::NEG_HI;
+    Mods |= SISrcMods::OP_SEL_1;
+  } else if (HiStat == SrcStatus::IS_UPPER_HALF) {
+    Mods |= SISrcMods::OP_SEL_1;
+  } else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG) {
+    Mods ^= SISrcMods::NEG_HI;
+  } else if (HiStat == SrcStatus::IS_HI_NEG) {
+    Mods ^= SISrcMods::NEG_HI;
+  }
+
+  if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
+    Mods ^= SISrcMods::NEG;
+    Mods |= SISrcMods::OP_SEL_0;
+  } else if (LoStat == SrcStatus::IS_UPPER_HALF) {
+    Mods |= SISrcMods::OP_SEL_0;
+  } else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG) {
+    // Toggle like every other negation: NEG may already be set on entry, and
+    // OR-ing it in would leave a doubly negated low half marked as negated.
+    Mods ^= SISrcMods::NEG;
+  } else if (LoStat == SrcStatus::IS_HI_NEG) {
+    Mods ^= SISrcMods::NEG;
+  }
+
+  return Mods;
+}
+
+/// Returns true when \p NewReg can stand in for \p RootReg as a packed
+/// operand: both registers have the same bit width and both \p LoStat and
+/// \p HiStat select a half (negated or not) of \p NewReg. \p TII is unused.
+static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
+                          Register RootReg, const SIInstrInfo &TII,
+                          const MachineRegisterInfo &MRI) {
+  auto SelectsHalf = [](SrcStatus S) {
+    switch (S) {
+    case SrcStatus::IS_UPPER_HALF:
+    case SrcStatus::IS_UPPER_HALF_NEG:
+    case SrcStatus::IS_LOWER_HALF:
+    case SrcStatus::IS_LOWER_HALF_NEG:
+      return true;
+    default:
+      return false;
+    }
+  };
+
+  if (!isSameBitWidth(NewReg, RootReg, MRI))
+    return false;
+  return SelectsHalf(LoStat) && SelectsHalf(HiStat);
+}
+
+std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
+ Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const { // Returns the register to use as the packed source together with its SISrcMods bits.
unsigned Mods = 0;
- MachineInstr *MI = MRI.getVRegDef(Src);
+ // No modification if Root type is not form of <2 x Type>.
+ if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
+ Mods |= SISrcMods::OP_SEL_1;
+ return {RootReg, Mods};
+ }
+
+ SearchOptions SO(RootReg, MRI);
+
+ std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(RootReg, MRI, SO); // Farthest register up the chain that is the root value itself or a per-element negation of it.
- if (MI->getOpcode() == AMDGPU::G_FNEG &&
- // It's possible to see an f32 fneg here, but unlikely.
- // TODO: Treat f32 fneg as only high bit.
- MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
+ if (Stat.second == SrcStatus::IS_BOTH_NEG)
Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
- Src = MI->getOperand(1).getReg();
- MI = MRI.getVRegDef(Src);
+ else if (Stat.second == SrcStatus::IS_HI_NEG)
+ Mods ^= SISrcMods::NEG_HI;
+ else if (Stat.second == SrcStatus::IS_LO_NEG)
+ Mods ^= SISrcMods::NEG;
+
+ MachineInstr *MI = MRI.getVRegDef(Stat.first);
+
+ if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 || // Element-wise matching below needs a two-element G_BUILD_VECTOR.
+ (IsDOT && Subtarget->hasDOTOpSelHazard())) {
+ Mods |= SISrcMods::OP_SEL_1;
+ return {Stat.first, Mods};
}
- // TODO: Handle G_FSUB 0 as fneg
+ SmallVector<std::pair<Register, SrcStatus>> StatlistHi =
+ getSrcStats(MI->getOperand(2).getReg(), MRI, SO); // Operand 2 of the build vector is treated as the high element.
- // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
- (void)IsDOT; // DOTs do not use OPSEL on gfx942+, check ST.hasDOTOpSelHazard()
+ if (StatlistHi.empty()) {
+ Mods |= SISrcMods::OP_SEL_1;
+ return {Stat.first, Mods};
+ }
+ SmallVector<std::pair<Register, SrcStatus>> StatlistLo =
+ getSrcStats(MI->getOperand(1).getReg(), MRI, SO); // Operand 1 of the build vector is treated as the low element.
+
+ if (StatlistLo.empty()) {
+ Mods |= SISrcMods::OP_SEL_1;
+ return {Stat.first, Mods};
+ }
+
+ for (int I = StatlistHi.size() - 1; I >= 0; I--) { // Search from the farthest candidates first for a register both elements come from.
+ for (int J = StatlistLo.size() - 1; J >= 0; J--) {
+ if (StatlistHi[I].first == StatlistLo[J].first &&
+ isValidToPack(StatlistHi[I].second, StatlistLo[J].second,
+ StatlistHi[I].first, RootReg, TII, MRI))
+ return {StatlistHi[I].first,
+ updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)};
+ }
+ }
// Packed instructions do not have abs modifiers.
Mods |= SISrcMods::OP_SEL_1;
- return std::pair(Src, Mods);
+ return {Stat.first, Mods};
}
-InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
- MachineRegisterInfo &MRI
- = Root.getParent()->getParent()->getParent()->getRegInfo();
+/// Reads an integer value out of an immediate-like operand.
+///
+/// Plain immediates are returned as is, ConstantInt immediates are
+/// sign-extended, and floating-point immediates are returned as the
+/// sign-extended bit pattern of their value. Any other operand kind is
+/// unreachable.
+int64_t getAllKindImm(const MachineOperand *Op) {
+  using MOT = MachineOperand::MachineOperandType;
+
+  const MOT Kind = Op->getType();
+  if (Kind == MOT::MO_Immediate)
+    return Op->getImm();
+  if (Kind == MOT::MO_CImmediate)
+    return Op->getCImm()->getSExtValue();
+  if (Kind == MOT::MO_FPImmediate)
+    return Op->getFPImm()->getValueAPF().bitcastToAPInt().getSExtValue();
+
+  llvm_unreachable("not an imm type");
+}
- Register Src;
+/// Returns true when the register bank that \p RBI reports for \p Reg has the
+/// ID \p RBNo.
+static bool checkRB(Register Reg, unsigned int RBNo,
+                    const AMDGPURegisterBankInfo &RBI,
+                    const MachineRegisterInfo &MRI,
+                    const TargetRegisterInfo &TRI) {
+  return RBI.getRegBank(Reg, MRI, TRI)->getID() == RBNo;
+}
+
+// Picks a register with a legal register bank for the operand that replaces
+// the root operand.
+// Assumptions:
+//   1. VOP3P is always legal for VGPR.
+//   2. The register bank of the root operand is legal.
+// Consequently:
+//   1. If the root operand is SGPR, the new operand may be SGPR or VGPR.
+//   2. If the root operand is VGPR, the new operand must be VGPR.
+static Register getLegalRegBank(Register NewReg, Register RootReg,
+                                const AMDGPURegisterBankInfo &RBI,
+                                MachineRegisterInfo &MRI,
+                                const TargetRegisterInfo &TRI,
+                                const SIInstrInfo &TII) {
+  // The root operand can only be VGPR or SGPR (SGPR appears in some hand
+  // written cases such as inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
+  const bool RootIsSGPR =
+      checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI);
+  if (RootIsSGPR || checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
+    return NewReg;
+
+  // The root is VGPR and the new register is not. When the root is already
+  // `COPY NewReg`, the root itself can be used.
+  MachineInstr *RootDef = MRI.getVRegDef(RootReg);
+  const bool RootCopiesNew = RootDef->getOpcode() == AMDGPU::COPY &&
+                             NewReg == RootDef->getOperand(1).getReg();
+  if (RootCopiesNew)
+    return RootReg;
+
+  // Otherwise copy the new register into a clone of the root register,
+  // inserted right before the root's defining instruction. Only VGPR is
+  // accepted here.
+  const Register CopyDst = MRI.cloneVirtualRegister(RootReg);
+  BuildMI(*RootDef->getParent(), RootDef, RootDef->getDebugLoc(),
+          TII.get(AMDGPU::COPY), CopyDst)
+      .addReg(NewReg);
+  return CopyDst;
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
+ bool IsDOT) const { // Shared implementation behind selectVOP3PMods and selectVOP3PModsDOT.
+ MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+ Register Reg;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
+ std::tie(Reg, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, IsDOT);
+ Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII); // May insert a COPY so the chosen register has a bank that is legal for the root operand.
return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, // source register
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
}};
}
InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
- MachineRegisterInfo &MRI
- = Root.getParent()->getParent()->getParent()->getRegInfo();
+AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
- Register Src;
- unsigned Mods;
- std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
+ return selectVOP3PRetHelper(Root); // IsDOT takes its declared default of false.
+}
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
- }};
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
+
+ return selectVOP3PRetHelper(Root, true); // IsDOT = true: op_sel matching is skipped when the subtarget reports hasDOTOpSelHazard().
}
InstructionSelector::ComplexRendererFns
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 6c3f3026e877a..8e9e573147a86 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -188,8 +188,10 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
ComplexRendererFns selectVOP3NoMods(MachineOperand &Root) const;
std::pair<Register, unsigned>
- selectVOP3PModsImpl(Register Src, const MachineRegisterInfo &MRI,
+ selectVOP3PModsImpl(Register RootReg, const MachineRegisterInfo &MRI,
bool IsDOT = false) const;
+ InstructionSelector::ComplexRendererFns
+ selectVOP3PRetHelper(MachineOperand &Root, bool IsDOT = false) const;
InstructionSelector::ComplexRendererFns
selectVOP3PMods(MachineOperand &Root) const;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
index 543f8e413abd8..e03aa18d3147f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
@@ -106,6 +106,104 @@ define <2 x half> @v_fmul_v2f16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b)
ret <2 x half> %mul
}
+define <2 x half> @v_fmul_v2f16_partial_neg(<2 x half> %a, <2 x half> %b) {
+; GFX9-LABEL: v_fmul_v2f16_partial_neg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v1, v0 neg_hi:[1,0]
+; GFX9-NEXT: v_pk_mul_f16 v0, v1, v0 neg_lo:[1,0]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v2f16_partial_neg:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v1
+; GFX8-NEXT: v_mul_f16_e32 v3, v1, v0
+; GFX8-NEXT: v_mul_f16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v1, v2, v3
+; GFX8-NEXT: v_mul_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v2f16_partial_neg:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_mul_f16 v0, v1, v0 neg_hi:[1,0]
+; GFX10-NEXT: v_pk_mul_f16 v0, v1, v0 neg_lo:[1,0]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %b1 = bitcast <2 x half> %b to float
+ %b2 = fneg float %b1
+ %b3 = bitcast float %b2 to <2 x half>
+ %b4 = fneg <2 x half> %b3
+ %mul1 = fmul <2 x half> %b3, %a
+ %mul2 = fmul <2 x half> %b4, %mul1
+ ret <2 x half> %mul2
+}
+
+define <2 x half> @fmul_v2_half_neg_hi(<2 x half> %a, <2 x half> %b) #0 {
+; GFX9-LABEL: fmul_v2_half_neg_hi:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 neg_hi:[0,1]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: fmul_v2_half_neg_hi:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
+; GFX8-NEXT: v_mul_f16_e32 v1, v0, v1
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_v2_half_neg_hi:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 neg_hi:[0,1]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %b1 = bitcast <2 x half> %b to float
+ %b2 = fneg float %b1
+ %b3 = bitcast float %b2 to <2 x half>
+ %b4 = extractelement <2 x half> %b3, i64 1
+ %tmp = insertelement <2 x half> poison, half %b4, i64 0
+ %k = shufflevector <2 x half> %tmp, <2 x half> %b, <2 x i32> <i32 2, i32 0>
+ %mul = fmul <2 x half> %a, %k
+ ret <2 x half> %mul
+}
+
+
+define <2 x half> @fmul_v2_half_neg_lo1(<2 x half> %a, <2 x half> %b) #0 {
+; GFX9-LABEL: fmul_v2_half_neg_lo1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 op_sel_hi:[1,0]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: fmul_v2_half_neg_lo1:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
+; GFX8-NEXT: v_mul_f16_e32 v1, v0, v1
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fmul_v2_half_neg_lo1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 op_sel_hi:[1,0]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %b1 = bitcast <2 x half> %b to float
+ %b2 = fneg float %b1
+ %b3 = bitcast float %b2 to <2 x half>
+ %b4 = extractelement <2 x half> %b3, i64 0
+ %tmp = insertelement <2 x half> poison, half %b4, i64 0
+ %k = shufflevector <2 x half> %tmp, <2 x half> %b, <2 x i32> <i32 2, i32 0>
+ %mul = fmul <2 x half> %a, %k
+ ret <2 x half> %mul
+}
+
define <3 x half> @v_fmul_v3f16(<3 x half> %a, <3 x half> %b) {
; GFX9-LABEL: v_fmul_v3f16:
; GFX9: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
index 744a5b7feb48d..8f0ae8c47098a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
@@ -304,8 +304,7 @@ define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_shuffle10_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16
-; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_shuffle10_a:
@@ -319,8 +318,7 @@ define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX10-LABEL: v_sdot2_shuffle10_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
-; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%shuf.a = shufflevector <2 x i16> %a, <2 x i16> poison, <2 x i32> <i32 1, i32 0>
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
@@ -331,8 +329,7 @@ define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_shuffle10_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16
-; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_shuffle10_b:
@@ -346,8 +343,7 @@ define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX10-LABEL: v_sdot2_shuffle10_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16
-; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%shuf.b = shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> <i32 1, i32 0>
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
index 9e623494a5a04..287a009ca1405 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
@@ -289,22 +289,19 @@ define i32 @v_udot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_udot2_shuffle10_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16
-; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_shuffle10_a:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_alignbit_b32 v0, v0, v0, 16
-; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_shuffle10_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
-; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%shuf.a = shufflevector <2 x i16> %a, <2 x i16> poison, <2 x i32> <i32 1, i32 0>
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
@@ -315,22 +312,19 @@ define i32 @v_udot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_udot2_shuffle10_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16
-; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_shuffle10_b:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_alignbit_b32 v1, v1, v1, 16
-; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_shuffle10_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16
-; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%shuf.b = shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> <i32 1, i32 0>
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index bef38c1a65ef8..0e1e5e4c4987c 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -1982,9 +1982,7 @@ define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspa
; PACKED-GISEL-NEXT: ds_read_b32 v5, v5 offset:8
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: v_pk_mul_f32 v[4:5], 1.0, v[4:5] op_sel_hi:[0,1]
-; PACKED-GISEL-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
-; PACKED-GISEL-NEXT: v_xor_b32_e32 v5, 0x80000000, v5
-; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5]
+; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
@@ -2046,9 +2044,7 @@ define amdgpu_kernel void @shuffle_add_f32(ptr addrspace(1) %out, ptr addrspace(
; PACKED-GISEL-NEXT: ds_read_b64 v[2:3], v2 offset:8
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; PACKED-GISEL-NEXT: v_mov_b32_e32 v4, v3
-; PACKED-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5]
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0]
; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
@@ -2110,12 +2106,8 @@ define amdgpu_kernel void @shuffle_neg_add_f32(ptr addrspace(1) %out, ptr addrsp
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: ds_read_b64 v[2:3], v2 offset:8
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; PACKED-GISEL-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; PACKED-GISEL-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
-; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1]
-; PACKED-GISEL-NEXT: v_mov_b32_e32 v4, v3
-; PACKED-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5]
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1]
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0]
; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
@@ -2351,9 +2343,7 @@ define amdgpu_kernel void @fneg_v2f32_vec(ptr addrspace(1) %a) {
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
-; PACKED-GISEL-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; PACKED-GISEL-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1]
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1]
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -2394,9 +2384,7 @@ define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x
; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; PACKED-GISEL-NEXT: s_xor_b32 s2, s2, 0x80000000
-; PACKED-GISEL-NEXT: s_xor_b32 s3, s3, 0x80000000
-; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, s[2:3] op_sel_hi:[0,1]
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, s[2:3] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1]
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
%fneg = fsub <2 x float> <float -0.0, float -0.0>, %x
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
index 2f1dfa11fd34d..141b86a24c1c4 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
@@ -742,9 +742,8 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half>
;
; GFX9-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-GISEL-NEXT: v_pk_add_f16 v0, s2, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-GISEL-NEXT: v_pk_add_f16 v0, s2, v0 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-GISEL-NEXT: ; return to shader part epilog
;
; GFX8-SDAG-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
@@ -784,8 +783,7 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half>
;
; GFX10-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000
-; GFX10-GISEL-NEXT: v_pk_add_f16 v0, s2, s0
+; GFX10-GISEL-NEXT: v_pk_add_f16 v0, s2, s3 neg_lo:[0,1] neg_hi:[0,1]
; GFX10-GISEL-NEXT: ; return to shader part epilog
;
; GFX11-SDAG-TRUE16-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
@@ -810,8 +808,7 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half>
;
; GFX11-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000
-; GFX11-GISEL-NEXT: v_pk_add_f16 v0, s2, s0
+; GFX11-GISEL-NEXT: v_pk_add_f16 v0, s2, s3 neg_lo:[0,1] neg_hi:[0,1]
; GFX11-GISEL-NEXT: ; return to shader part epilog
; GFX10PLUS-SDAG-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX10PLUS-SDAG: ; %bb.0:
@@ -824,8 +821,7 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half>
; GFX10PLUS-SDAG-NEXT: ; return to shader part epilog
; GFX10PLUS-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX10PLUS-GISEL: ; %bb.0:
-; GFX10PLUS-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000
-; GFX10PLUS-GISEL-NEXT: v_pk_add_f16 v0, s2, s0
+; GFX10PLUS-GISEL-NEXT: v_pk_add_f16 v0, s2, s3 neg_lo:[0,1] neg_hi:[0,1]
; GFX10PLUS-GISEL-NEXT: ; return to shader part epilog
%val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret <2 x half> %val
More information about the llvm-branch-commits
mailing list