[llvm] 40f35ce - [AMDGPU] gfx11 VOP3P instruction MC support
Joe Nash via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 8 11:00:00 PDT 2022
Author: Joe Nash
Date: 2022-06-08T13:32:01-04:00
New Revision: 40f35cef894a4f899d1a0a31dd9600b9ce5e769b
URL: https://github.com/llvm/llvm-project/commit/40f35cef894a4f899d1a0a31dd9600b9ce5e769b
DIFF: https://github.com/llvm/llvm-project/commit/40f35cef894a4f899d1a0a31dd9600b9ce5e769b.diff
LOG: [AMDGPU] gfx11 VOP3P instruction MC support
Includes dpp versions of VOP3P instructions.
Patch 18/N for upstreaming of AMDGPU gfx11 architecture
Depends on D126917
Reviewed By: rampitec, #amdgpu
Differential Revision: https://reviews.llvm.org/D126978
Added:
llvm/test/MC/AMDGPU/gfx11_vop3p.s
Modified:
llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
llvm/lib/Target/AMDGPU/SIInstrInfo.td
llvm/lib/Target/AMDGPU/VOP3PInstructions.td
llvm/lib/Target/AMDGPU/VOPInstructions.td
llvm/test/MC/AMDGPU/gfx11_asm_dpp.s
llvm/test/MC/AMDGPU/gfx11_err.s
llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_all.txt
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index b2ed4b48f39e3..5cfd2dc83aa00 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -8255,6 +8255,11 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
if (OpIdx == -1)
break;
+ int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
+
+ if (ModIdx == -1)
+ continue;
+
uint32_t ModVal = 0;
if ((OpSel & (1 << J)) != 0)
@@ -8269,8 +8274,6 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
if ((NegHi & (1 << J)) != 0)
ModVal |= SISrcMods::NEG_HI;
- int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
-
Inst.getOperand(ModIdx).setImm(Inst.getOperand(ModIdx).getImm() | ModVal);
}
}
@@ -8636,7 +8639,9 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, bo
if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod) != -1) {
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
}
- if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1) {
+ if (Desc.TSFlags & SIInstrFlags::VOP3P)
+ cvtVOP3P(Inst, Operands, OptionalIdx);
+ else if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1) {
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOpSel);
}
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 4f6ebe7fdc44d..d312e13bc7d08 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -441,8 +441,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
MI = MCInst(); // clear
Res = tryDecodeInst(DecoderTableDPPGFX1196, MI, DecW,
Address);
- if (Res)
+ if (Res) {
+ if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P)
+ convertVOP3PDPPInst(MI);
break;
+ }
}
// Reinitialize Bytes
Bytes = Bytes_.slice(0, MaxInstBytesNum);
@@ -729,18 +732,20 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
unsigned Opc = MI.getOpcode();
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
-
- // Insert dummy unused src modifiers.
- if (MI.getNumOperands() < DescNumOps &&
- AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1)
- insertNamedMCOperand(MI, MCOperand::createImm(0),
- AMDGPU::OpName::src0_modifiers);
-
- if (MI.getNumOperands() < DescNumOps &&
- AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers) != -1)
- insertNamedMCOperand(MI, MCOperand::createImm(0),
- AMDGPU::OpName::src1_modifiers);
-
+ if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) {
+ convertVOP3PDPPInst(MI);
+ } else {
+ // Insert dummy unused src modifiers.
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1)
+ insertNamedMCOperand(MI, MCOperand::createImm(0),
+ AMDGPU::OpName::src0_modifiers);
+
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers) != -1)
+ insertNamedMCOperand(MI, MCOperand::createImm(0),
+ AMDGPU::OpName::src1_modifiers);
+ }
return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail;
}
@@ -882,6 +887,56 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
return MCDisassembler::Success;
}
+// Opsel and neg bits live both in srcN_modifiers and in the standalone op_sel,
+// op_sel_hi, neg_lo and neg_hi operands. The autogen decoder only adds them to
+// srcN_modifiers, so rebuild the standalone operands here. Returns Success.
+DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const {
+ unsigned Opc = MI.getOpcode();
+ unsigned DescNumOps = MCII->get(Opc).getNumOperands();
+ // Add a zero placeholder for vdst_in when MI is still short of operands.
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in) != -1)
+ insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vdst_in);
+ // Bit J of each mask below is taken from the modifiers of source operand J.
+ const int ModOps[] = {AMDGPU::OpName::src0_modifiers,
+ AMDGPU::OpName::src1_modifiers,
+ AMDGPU::OpName::src2_modifiers};
+ unsigned OpSel = 0;
+ unsigned OpSelHi = 0;
+ unsigned NegLo = 0;
+ unsigned NegHi = 0;
+ for (int J = 0; J < 3; ++J) {
+ int OpIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
+ if (OpIdx == -1)
+ break; // Opcode has no srcJ_modifiers: stop scanning further sources.
+ unsigned Val = MI.getOperand(OpIdx).getImm();
+
+ OpSel |= !!(Val & SISrcMods::OP_SEL_0) << J;
+ OpSelHi |= !!(Val & SISrcMods::OP_SEL_1) << J;
+ NegLo |= !!(Val & SISrcMods::NEG) << J;
+ NegHi |= !!(Val & SISrcMods::NEG_HI) << J;
+ }
+ // Insert each standalone operand the opcode names while MI is still short.
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1)
+ insertNamedMCOperand(MI, MCOperand::createImm(OpSel),
+ AMDGPU::OpName::op_sel);
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi) != -1)
+ insertNamedMCOperand(MI, MCOperand::createImm(OpSelHi),
+ AMDGPU::OpName::op_sel_hi);
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo) != -1)
+ insertNamedMCOperand(MI, MCOperand::createImm(NegLo),
+ AMDGPU::OpName::neg_lo);
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi) != -1)
+ insertNamedMCOperand(MI, MCOperand::createImm(NegHi),
+ AMDGPU::OpName::neg_hi);
+
+ return MCDisassembler::Success;
+}
+
DecodeStatus AMDGPUDisassembler::convertFMAanyK(MCInst &MI,
int ImmLitIdx) const {
assert(HasLiteral && "Should have decoded a literal");
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 6011d1bfe186e..8704bd0de47d9 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -162,6 +162,7 @@ class AMDGPUDisassembler : public MCDisassembler {
DecodeStatus convertSDWAInst(MCInst &MI) const;
DecodeStatus convertDPP8Inst(MCInst &MI) const;
DecodeStatus convertMIMGInst(MCInst &MI) const;
+ DecodeStatus convertVOP3PDPPInst(MCInst &MI) const;
MCOperand decodeOperand_VGPR_32(unsigned Val) const;
MCOperand decodeOperand_VRegOrLds_32(unsigned Val) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 4084b01e0a026..7ce2a90073256 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1496,6 +1496,7 @@ def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">;
def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">;
def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">;
+def DotIUVOP3PMods : ComplexPattern<untyped, 1, "SelectDotIUVOP3PMods">;
def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index aecd40f411ec1..c9f0ff0ae0f54 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -10,19 +10,33 @@
// VOP3P Classes
//===----------------------------------------------------------------------===//
+class VOP3P_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
+ bit HasDPP = 0> : VOP3_Profile<P, Features> {
+ let IsVOP3P = 1;
+ let HasExtVOP3DPP = HasDPP;
+ // We do not want to print src modifiers for vop3p because the bits are
+ // overloaded in meaning and the logic in printOperandAndFPInputMods is
+ // wrong for vop3p
+ let AsmVOP3DPPBase = AsmVOP3P;
+}
+
// Used for FMA_MIX* and MAD_MIX* insts
// Their operands are only sort of f16 operands. Depending on
// op_sel_hi, these may be interpreted as f32. The inline immediate
// values are really f16 converted to f32, so we treat these as f16
// operands.
class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
- bit useTiedOutput = 0> : VOP3_Profile<P, Features> {
+ bit useTiedOutput = 0> : VOP3P_Profile<P, Features, 1> {
bit UseTiedOutput = useTiedOutput;
dag srcs =
(ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0,
FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);
+ dag dpp_srcs =
+ (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0,
+ FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
+ FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);
// FIXME: clampmod0 misbehaves with the non-default vdst_in
// following it. For now workaround this by requiring clamp
@@ -35,8 +49,10 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
// We use Ins64 because that is the one which populates InOperandList
// due to the logic in class VOP3_Pseudo
let Ins64 = !con(srcs, mods);
+ let InsVOP3Base = !con(dpp_srcs, mods);
let Asm64 =
"$vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp";
+ let AsmVOP3DPPBase = Asm64;
}
multiclass VOP3PInst<string OpName, VOPProfile P,
@@ -45,6 +61,13 @@ multiclass VOP3PInst<string OpName, VOPProfile P,
!if (P.HasModifiers,
getVOP3PModPat<P, node, IsDOT, IsDOT>.ret,
getVOP3Pat<P, node>.ret)>;
+ let SubtargetPredicate = isGFX11Plus in {
+ if P.HasExtVOP3DPP then
+ def _dpp : VOP3_DPP_Pseudo<OpName, P> {
+ let VOP3P = 1;
+ let PseudoInstr = OpName #"_dpp";
+ }
+ } // end SubtargetPredicate = isGFX11Plus
}
// Non-packed instructions that use the VOP3P encoding.
@@ -54,36 +77,45 @@ multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> {
let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", "");
let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", "");
}
+ let SubtargetPredicate = isGFX11Plus in {
+ if P.HasExtVOP3DPP then
+ def _dpp : VOP3_DPP_Pseudo<OpName, P> {
+ let VOP3P = 1;
+ let PseudoInstr = OpName#"_dpp";
+ let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", "");
+ let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", "");
+ }
+ } // end SubtargetPredicate = isGFX11Plus
}
let isCommutable = 1 in {
-defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
-defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
+defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
+defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
let FPDPRounding = 1 in {
-defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>;
-defm V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fadd>;
-defm V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fmul>;
+defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>;
+defm V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, any_fadd>;
+defm V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, any_fmul>;
} // End FPDPRounding = 1
-defm V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
-defm V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;
+defm V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
+defm V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;
-defm V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
-defm V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-defm V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>;
+defm V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, add>;
+defm V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>>;
+defm V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, mul>;
-defm V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>;
-defm V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umin>;
-defm V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smax>;
-defm V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>;
+defm V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, smin>;
+defm V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, umin>;
+defm V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, smax>;
+defm V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, umax>;
}
-defm V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-defm V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>;
+defm V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>>;
+defm V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, sub>;
-defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, clshl_rev_16>;
-defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, cashr_rev_16>;
-defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, clshr_rev_16>;
+defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, clshl_rev_16>;
+defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, cashr_rev_16>;
+defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, clshr_rev_16>;
let SubtargetPredicate = HasVOP3PInsts in {
@@ -296,34 +328,63 @@ let IsDOT = 1 in {
let SubtargetPredicate = HasDot2Insts in {
defm V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16",
- VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2, 1>;
+ VOP3P_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2, 1>;
defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
- VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>;
+ VOP3P_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>;
} // End SubtargetPredicate = HasDot2Insts
let SubtargetPredicate = HasDot7Insts in {
defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
- VOP3_Profile<VOP_F32_V2F16_V2F16_F32>,
+ VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR, /*HasDPP*/ 1>,
AMDGPUfdot2, 1/*ExplicitClamp*/>;
defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
- VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
+ VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4",
- VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>;
+ VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>;
} // End SubtargetPredicate = HasDot7Insts
let SubtargetPredicate = HasDot1Insts in {
defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8",
- VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
+ VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
defm V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4",
- VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>;
+ VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>;
} // End SubtargetPredicate = HasDot1Insts
+
+let SubtargetPredicate = HasDot8Insts in {
+
+defm V_DOT2_F32_BF16 : VOP3PInst<"v_dot2_f32_bf16",
+ VOP3P_Profile<VOP_F32_V2I16_V2I16_F32, VOP3_REGULAR, /*HasDPP*/ 1>,
+ null_frag, 1>;
+
+} // End SubtargetPredicate = HasDot8Insts
+
} // End let IsDOT = 1
+multiclass VOP3PDOTIUInst <string OpName, SDPatternOperator intrinsic_node> {
+ let IsDOT = 1 in
+ defm NAME : VOP3PInst<OpName, VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>,
+ null_frag, 1>;
+ // Dot-iu instructions consider input as signed if imod neg bits are set. Thus
+ // Dot-iu Intrinsics have extra operands and require separate codegen pattern.
+ def : GCNPat < (intrinsic_node (DotIUVOP3PMods i32:$src0_mods), i32:$src0,
+ (DotIUVOP3PMods i32:$src1_mods), i32:$src1,
+ i32:$src2, (i1 timm:$clamp)),
+ (!cast<Instruction>(NAME) $src0_mods, i32:$src0,
+ $src1_mods, i32:$src1,
+ (i32 8), i32:$src2, i1:$clamp)
+ >;
+}
+
+let SubtargetPredicate = HasDot8Insts in {
+defm V_DOT4_I32_IU8 : VOP3PDOTIUInst<"v_dot4_i32_iu8", null_frag>;
+defm V_DOT8_I32_IU4 : VOP3PDOTIUInst<"v_dot8_i32_iu4", null_frag>;
+} // End SubtargetPredicate = HasDot8Insts
+
def : UDot2Pat<V_DOT2_U32_U16>;
def : SDot2Pat<V_DOT2_I32_I16>;
@@ -364,18 +425,18 @@ def VDst_256 : VOPDstOperand<VReg_256>;
def VDst_512 : VOPDstOperand<VReg_512>;
def VDst_1024 : VOPDstOperand<VReg_1024>;
-def VOPProfileAccRead : VOP3_Profile<VOP_I32_I32, VOP3_MAI> {
+def VOPProfileAccRead : VOP3P_Profile<VOP_I32_I32, VOP3_MAI> {
let Src0RC64 = ARegSrc_32;
}
-def VOPProfileAccWrite : VOP3_Profile<VOP_I32_I32, VOP3_MAI> {
+def VOPProfileAccWrite : VOP3P_Profile<VOP_I32_I32, VOP3_MAI> {
let DstRC = ADst_32;
let Src0RC64 = VCSrc_b32;
}
class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC,
RegisterOperand SrcABRC = AVSrc_32>
- : VOP3_Profile<P, VOP3_MAI> {
+ : VOP3P_Profile<P, VOP3_MAI> {
let DstRC = _DstRC;
let Src0RC64 = SrcABRC;
let Src1RC64 = SrcABRC;
@@ -386,7 +447,9 @@ class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC
let HasOMod = 0;
let HasModifiers = 0;
let Asm64 = "$vdst, $src0, $src1, $src2$cbsz$abid$blgp";
+ let AsmVOP3DPPBase = Asm64;
let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, cbsz:$cbsz, abid:$abid, blgp:$blgp);
+ let InsVOP3Base = Ins64;
// Dst and SrcC cannot partially overlap if SrcC/Dst is bigger than 4 VGPRs.
// We then create two versions of the instruction: with tied dst and src2
// and with the earlyclobber flag on the dst. This is stricter than the
@@ -601,10 +664,10 @@ def MAIInstInfoTable : GenericTable {
}
let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1 in {
- defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fma>;
- defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>;
- defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>;
- defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>;
+ defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fma>;
+ defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>;
+ defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>;
+ defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>;
} // End SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1
def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">;
@@ -614,6 +677,72 @@ def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">;
// Begin Real Encodings
//===----------------------------------------------------------------------===//
+class VOP3P_DPP16<bits<7> op, VOP_DPP_Pseudo ps, int subtarget,
+ string opName = ps.OpName>
+ : VOP3P_DPP<op, opName, ps.Pfl, 1>, SIMCInstr<ps.PseudoInstr, subtarget> {
+ let hasSideEffects = ps.hasSideEffects;
+ let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let Uses = ps.Uses;
+ let AssemblerPredicate = HasDPP16;
+ let SubtargetPredicate = HasDPP16;
+ let OtherPredicates = ps.OtherPredicates;
+}
+
+class VOP3P_DPP8_Base<bits<7> op, VOP_Pseudo ps, string opName = ps.OpName>
+ : VOP3P_DPP8<op, opName, ps.Pfl> {
+ let hasSideEffects = ps.hasSideEffects;
+ let Defs = ps.Defs;
+ let SchedRW = ps.SchedRW;
+ let Uses = ps.Uses;
+ let OtherPredicates = ps.OtherPredicates;
+}
+
+//===----------------------------------------------------------------------===//
+// GFX11.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX11Plus,
+ DecoderNamespace = "GFX11" in {
+
+ multiclass VOP3P_Real_gfx11<bits<7> op, string backing_ps_name = NAME,
+ string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
+ def _gfx11 : VOP3P_Real<!cast<VOP3P_Pseudo>(backing_ps_name),
+ SIEncodingFamily.GFX11, asmName>,
+ VOP3Pe_gfx11<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl>;
+ }
+
+ multiclass VOP3P_Real_dpp_gfx11<bits<7> op, string backing_ps_name = NAME,
+ string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
+ defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name);
+ def _dpp_gfx11
+ : VOP3P_DPP16<op, !cast<VOP_DPP_Pseudo>(backing_ps_name #"_dpp"),
+ SIEncodingFamily.GFX11> {
+ let AsmString = asmName #ps.Pfl.AsmVOP3DPP16;
+ let DecoderNamespace = "DPPGFX11";
+ }
+ }
+
+ multiclass VOP3P_Real_dpp8_gfx11<bits<7> op, string backing_ps_name = NAME,
+ string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
+ defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name);
+ def _dpp8_gfx11 : VOP3P_DPP8_Base<op, ps> {
+ let AsmString = asmName #ps.Pfl.AsmVOP3DPP8;
+ let DecoderNamespace = "DPP8GFX11";
+ }
+ }
+
+ multiclass VOP3P_Realtriple_gfx11<bits<7> op, string backing_ps_name = NAME,
+ string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic>
+ : VOP3P_Real_gfx11<op, backing_ps_name, asmName>,
+ VOP3P_Real_dpp_gfx11<op, backing_ps_name, asmName>,
+ VOP3P_Real_dpp8_gfx11<op, backing_ps_name, asmName>;
+} // End AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11"
+
+defm V_DOT4_I32_IU8 : VOP3P_Real_gfx11 <0x16>;
+defm V_DOT8_I32_IU4 : VOP3P_Real_gfx11 <0x18>;
+defm V_DOT2_F32_BF16 : VOP3P_Real_gfx11 <0x1a>;
+
//===----------------------------------------------------------------------===//
// GFX8 (VI)
//===----------------------------------------------------------------------===//
@@ -841,35 +970,41 @@ let SubtargetPredicate = HasPackedFP32Ops in {
// GFX10.
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10", VOP3P = 1 in {
+let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10", VOP3P = 1 in {
multiclass VOP3P_Real_gfx10<bits<7> op> {
def _gfx10 : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.GFX10>,
VOP3Pe_gfx10 <op, !cast<VOP3P_Pseudo>(NAME).Pfl>;
}
-} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10", VOP3P = 1
-
-defm V_PK_MAD_I16 : VOP3P_Real_gfx10<0x00>;
-defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10<0x01>;
-defm V_PK_ADD_I16 : VOP3P_Real_gfx10<0x02>;
-defm V_PK_SUB_I16 : VOP3P_Real_gfx10<0x03>;
-defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10<0x04>;
-defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10<0x05>;
-defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10<0x06>;
-defm V_PK_MAX_I16 : VOP3P_Real_gfx10<0x07>;
-defm V_PK_MIN_I16 : VOP3P_Real_gfx10<0x08>;
-defm V_PK_MAD_U16 : VOP3P_Real_gfx10<0x09>;
-defm V_PK_ADD_U16 : VOP3P_Real_gfx10<0x0a>;
-defm V_PK_SUB_U16 : VOP3P_Real_gfx10<0x0b>;
-defm V_PK_MAX_U16 : VOP3P_Real_gfx10<0x0c>;
-defm V_PK_MIN_U16 : VOP3P_Real_gfx10<0x0d>;
-defm V_PK_FMA_F16 : VOP3P_Real_gfx10<0x0e>;
-defm V_PK_ADD_F16 : VOP3P_Real_gfx10<0x0f>;
-defm V_PK_MUL_F16 : VOP3P_Real_gfx10<0x10>;
-defm V_PK_MIN_F16 : VOP3P_Real_gfx10<0x11>;
-defm V_PK_MAX_F16 : VOP3P_Real_gfx10<0x12>;
-defm V_FMA_MIX_F32 : VOP3P_Real_gfx10<0x20>;
-defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10<0x21>;
-defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10<0x22>;
+} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10", VOP3P = 1
+
+multiclass VOP3P_Real_gfx10_gfx11<bits<7> op>
+ : VOP3P_Real_gfx10<op>, VOP3P_Real_gfx11<op>;
+
+multiclass VOP3P_Real_gfx10_gfx11_Triple<bits<7> op>
+ : VOP3P_Real_gfx10<op>, VOP3P_Realtriple_gfx11<op>;
+
+defm V_PK_MAD_I16 : VOP3P_Real_gfx10_gfx11<0x00>;
+defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10_gfx11<0x01>;
+defm V_PK_ADD_I16 : VOP3P_Real_gfx10_gfx11<0x02>;
+defm V_PK_SUB_I16 : VOP3P_Real_gfx10_gfx11<0x03>;
+defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10_gfx11<0x04>;
+defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10_gfx11<0x05>;
+defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10_gfx11<0x06>;
+defm V_PK_MAX_I16 : VOP3P_Real_gfx10_gfx11<0x07>;
+defm V_PK_MIN_I16 : VOP3P_Real_gfx10_gfx11<0x08>;
+defm V_PK_MAD_U16 : VOP3P_Real_gfx10_gfx11<0x09>;
+defm V_PK_ADD_U16 : VOP3P_Real_gfx10_gfx11<0x0a>;
+defm V_PK_SUB_U16 : VOP3P_Real_gfx10_gfx11<0x0b>;
+defm V_PK_MAX_U16 : VOP3P_Real_gfx10_gfx11<0x0c>;
+defm V_PK_MIN_U16 : VOP3P_Real_gfx10_gfx11<0x0d>;
+defm V_PK_FMA_F16 : VOP3P_Real_gfx10_gfx11<0x0e>;
+defm V_PK_ADD_F16 : VOP3P_Real_gfx10_gfx11<0x0f>;
+defm V_PK_MUL_F16 : VOP3P_Real_gfx10_gfx11<0x10>;
+defm V_PK_MIN_F16 : VOP3P_Real_gfx10_gfx11<0x11>;
+defm V_PK_MAX_F16 : VOP3P_Real_gfx10_gfx11<0x12>;
+defm V_FMA_MIX_F32 : VOP3P_Real_gfx10_gfx11_Triple <0x20>;
+defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x21>;
+defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x22>;
let SubtargetPredicate = HasDot2Insts in {
@@ -880,9 +1015,9 @@ defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x15>;
let SubtargetPredicate = HasDot7Insts in {
-defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x13>;
-defm V_DOT4_U32_U8 : VOP3P_Real_gfx10 <0x17>;
-defm V_DOT8_U32_U4 : VOP3P_Real_gfx10 <0x19>;
+defm V_DOT2_F32_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x13>;
+defm V_DOT4_U32_U8 : VOP3P_Real_gfx10_gfx11 <0x17>;
+defm V_DOT8_U32_U4 : VOP3P_Real_gfx10_gfx11 <0x19>;
} // End SubtargetPredicate = HasDot7Insts
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 8413fd5181108..d82a5d3831659 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -316,7 +316,6 @@ class VOP3be <VOPProfile P> : Enc64 {
class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 {
bits<8> vdst;
- // neg, neg_hi, op_sel put in srcN_modifiers
bits<4> src0_modifiers;
bits<9> src0;
bits<4> src1_modifiers;
@@ -412,6 +411,8 @@ class VOP3Pe_gfx10 <bits<7> op, VOPProfile P> : VOP3Pe<op, P> {
let Inst{31-23} = 0x198; //encoding
}
+class VOP3Pe_gfx11<bits<7> op, VOPProfile P> : VOP3Pe_gfx10<op, P>;
+
class VOP3be_gfx6_gfx7<bits<9> op, VOPProfile p> : VOP3be<p> {
let Inst{25-17} = op;
}
@@ -705,6 +706,39 @@ class VOP3_DPPe_Common<bits<10> op, VOPProfile P> : VOP3_DPPe_Common_Base<op, P>
let Inst{58-50} = !if(P.HasSrc2, src2, 0);
}
+class VOP3P_DPPe_Common_Base<bits<7> op, VOPProfile P> : Enc96 {
+ bits<4> src0_modifiers;
+ bits<4> src1_modifiers;
+ bits<4> src2_modifiers;
+ bits<1> clamp;
+ // srcN_modifiers bits as used below: {0} neg (lo), {1} neg_hi, {2} op_sel, {3} op_sel_hi.
+ let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0
+ let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1
+ let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2
+ let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0)
+ let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1)
+ let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2)
+ let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, ?); // op_sel_hi(2); left unset (?) otherwise
+ let Inst{15} = !if(P.HasClamp, clamp{0}, 0); // clamp, or 0 if the profile has none
+ let Inst{22-16} = op; // 7-bit opcode
+ let Inst{31-23} = 0x198; // encoding
+ let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, ?); // op_sel_hi(0); left unset (?) otherwise
+ let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, ?); // op_sel_hi(1); left unset (?) otherwise
+ let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo) src0
+ let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo) src1
+ let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo) src2
+}
+
+class VOP3P_DPPe_Common<bits<7> op, VOPProfile P> : VOP3P_DPPe_Common_Base<op, P> {
+ bits<8> vdst;
+ bits<9> src1;
+ bits<9> src2;
+
+ let Inst{7-0} = vdst;
+ let Inst{49-41} = !if(P.HasSrc1, src1, 0);
+ let Inst{58-50} = !if(P.HasSrc2, src2, 0);
+}
+
class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
dag Ins = P.InsDPP, string asmOps = P.AsmDPP> :
InstSI <P.OutsDPP, Ins, OpName#asmOps, pattern>,
@@ -847,6 +881,25 @@ class VOP3_DPP <bits<10> op, string OpName, VOPProfile P, bit IsDPP16,
let Inst{95-92} = row_mask;
}
+class VOP3P_DPP <bits<7> op, string OpName, VOPProfile P, bit IsDPP16,
+ dag InsDPP = !if(IsDPP16, P.InsVOP3DPP16, P.InsVOP3DPP),
+ string AsmDPP = !if(IsDPP16, P.AsmVOP3DPP16, P.AsmVOP3DPP)> :
+ VOP3_DPP_Base<OpName, P, IsDPP16, InsDPP, AsmDPP>, VOP3P_DPPe_Common<op, P>,
+ VOP3_DPPe_Fields {
+ // VOP3P-flagged DPP encoding; IsDPP16 selects the DPP16 ins/asm defaults above.
+ let VOP3P = 1;
+
+ let Inst{40-32} = 0xfa; // fixed value here; VOP3P_DPP8 encodes fi in these bits instead
+ let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); // low 8 bits of src0, or 0 without src0
+ let Inst{80-72} = dpp_ctrl;
+ let Inst{82} = !if(IsDPP16, fi, ?); // fi is only encoded for DPP16; left unset (?) otherwise
+ let Inst{83} = bound_ctrl;
+
+ // Inst{87-84} ignored by hw
+ let Inst{91-88} = bank_mask;
+ let Inst{95-92} = row_mask;
+}
+
class VOP_DPP8e<VOPProfile P> : Enc64 {
bits<8> src0;
bits<24> dpp8;
@@ -905,6 +958,16 @@ class VOP3_DPP8<bits<10> op, string OpName, VOPProfile P> :
let Inst{95-72} = dpp8{23-0};
}
+class VOP3P_DPP8<bits<7> op, string OpName, VOPProfile P> :
+ VOP3_DPP8_Base<OpName, P>, VOP3P_DPPe_Common<op, P>,
+ VOP3_DPP8e_Fields {
+ // VOP3P-flagged DPP8 encoding: common VOP3P DPP bits plus the dpp8 field.
+ let VOP3P = 1;
+ let Inst{40-32} = fi; // NOTE(review): presumably a DPP8Mode FI_0/FI_1 value (0xE9/0xEA) -- confirm
+ let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); // low 8 bits of src0, or 0 without src0
+ let Inst{95-72} = dpp8{23-0}; // full 24-bit dpp8 field
+}
+
def DPP8Mode {
int FI_0 = 0xE9;
int FI_1 = 0xEA;
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_dpp.s b/llvm/test/MC/AMDGPU/gfx11_asm_dpp.s
index 9791f5019d47f..a55b847f44988 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_dpp.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_dpp.s
@@ -88,6 +88,23 @@ v_subrev_co_u32_e64_dpp v5, vcc_lo, v1, v2 dpp8:[2,2,2,2,4,4,4,4]
// W32: encoding: [0x05,0x6a,0x02,0xd7,0xe9,0x04,0x02,0x00,0x01,0x92,0x44,0x92]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error
+; VOP3P
+v_fma_mix_f32 v0, v1, v2, v3 dpp8:[2,2,2,2,4,4,4,4]
+// GFX11: encoding: [0x00,0x00,0x20,0xcc,0xe9,0x04,0x0e,0x04,0x01,0x92,0x44,0x92]
+
+v_fma_mix_f32 v0, v1, v2, v3 clamp dpp8:[2,2,2,2,4,4,4,4] fi:1
+// GFX11: encoding: [0x00,0x80,0x20,0xcc,0xea,0x04,0x0e,0x04,0x01,0x92,0x44,0x92]
+
+v_fma_mixlo_f16 v0, abs(v1), -v2, abs(v3) dpp8:[2,2,2,2,4,4,4,4]
+// GFX11: encoding: [0x00,0x05,0x21,0xcc,0xe9,0x04,0x0e,0x44,0x01,0x92,0x44,0x92]
+
+; For test purpose only. OP_SEL has to be set to all 0 and OP_SEL_HI has to be set to all 1
+v_fma_mixlo_f16 v0, abs(v1), -v2, abs(v3) op_sel:[1,0,0] op_sel_hi:[1,0,0] dpp8:[2,2,2,2,4,4,4,4]
+// GFX11: encoding: [0x00,0x0d,0x21,0xcc,0xe9,0x04,0x0e,0x4c,0x01,0x92,0x44,0x92]
+
+v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: encoding: [0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05]
+
; DPP
; VOP1->3
@@ -191,3 +208,16 @@ v_add_co_u32_e64_dpp v243, vcc, v243, v2 clamp quad_perm:[1,2,3,1] bank_mask: 0x
// W64: encoding: [0xf3,0xea,0x00,0xd7,0xfa,0x04,0x02,0x00,0xf3,0x79,0x04,0xf5]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error
+
+; VOP3P
+v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,0] neg_hi:[0,0,0] quad_perm:[2,2,3,1] bound_ctrl:0 fi:1
+// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x0c,0xff]
+
+v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] bank_mask:0xe
+// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe]
+
+v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,0,0] row_ror:7 bank_mask:0x1 bound_ctrl:0
+// GFX11: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 row_ror:7 row_mask:0xf bank_mask:0x1 bound_ctrl:1 ; encoding: [0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x09,0xf1]
+
+v_fma_mixhi_f16 v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp quad_perm:[0,2,3,1] row_mask:0x0
+// GFX11: v_fma_mixhi_f16_e64_dpp v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp quad_perm:[0,2,3,1] row_mask:0x0 bank_mask:0xf ; encoding: [0x00,0xc0,0x22,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x78,0x00,0x0f]
diff --git a/llvm/test/MC/AMDGPU/gfx11_err.s b/llvm/test/MC/AMDGPU/gfx11_err.s
index d8e1187ed79ae..d15935a6eca4a 100644
--- a/llvm/test/MC/AMDGPU/gfx11_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_err.s
@@ -44,3 +44,20 @@ v_add3_u32_e64_dpp v5, v1, v2, 49812340 dpp8:[7,6,5,4,3,2,1,0]
v_add3_u32_e64_dpp v5, v1, s1, v0 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+// On GFX11, v_dot8_i32_i4 is a valid SP3 alias for v_dot8_i32_iu4.
+// However, we intentionally leave it unimplemented because on other
+// processors v_dot8_i32_i4 denotes an instruction of a different
+// behaviour, which is considered potentially dangerous.
+v_dot8_i32_i4 v0, v1, v2, v3
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+// On GFX11, v_dot4_i32_i8 is a valid SP3 alias for v_dot4_i32_iu8.
+// However, we intentionally leave it unimplemented because on other
+// processors v_dot4_i32_i8 denotes an instruction of a different
+// behaviour, which is considered potentially dangerous.
+v_dot4_i32_i8 v0, v1, v2, v3
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_dot4c_i32_i8 v0, v1, v2
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx11_vop3p.s b/llvm/test/MC/AMDGPU/gfx11_vop3p.s
new file mode 100644
index 0000000000000..dc56baa77e1fd
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx11_vop3p.s
@@ -0,0 +1,213 @@
+// RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1100 %s | FileCheck --check-prefix=GFX11 %s
+
+v_pk_fma_f16 v8, v0, s0, v1 clamp
+// GFX11: encoding: [0x08,0xc0,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+
+v_pk_add_u16 v1, v2, v3 clamp
+// GFX11: encoding: [0x01,0xc0,0x0a,0xcc,0x02,0x07,0x02,0x18]
+
+v_pk_min_i16 v0, v1, v2 clamp
+// GFX11: encoding: [0x00,0xc0,0x08,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_mul_lo_u16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x01,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_add_i16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x02,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_sub_i16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x03,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_lshlrev_b16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x04,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_lshrrev_b16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x05,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_ashrrev_i16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x06,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_max_i16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x07,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_min_i16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x08,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_add_u16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x0a,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_max_u16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x0c,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_min_u16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x0d,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_fma_f16 v0, v1, v2, v3
+// GFX11: encoding: [0x00,0x40,0x0e,0xcc,0x01,0x05,0x0e,0x1c]
+
+v_pk_add_f16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x0f,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_mul_f16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x10,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_min_f16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x11,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_max_f16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x12,0xcc,0x01,0x05,0x02,0x18]
+
+//
+// Test op_sel/op_sel_hi
+//
+
+v_pk_add_u16 v1, v2, v3
+// GFX11: encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,0]
+// GFX11: v_pk_add_u16 v1, v2, v3 ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel_hi:[1,1]
+// GFX11: v_pk_add_u16 v1, v2, v3 ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
+// GFX11: v_pk_add_u16 v1, v2, v3 ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,0]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,0] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x00]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,0] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x00]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[1,0]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] ; encoding: [0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,1]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] ; encoding: [0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[1,1]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,1] ; encoding: [0x01,0x58,0x0a,0xcc,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,1]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,1] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x10]
+
+v_pk_add_u16 v1, v2, v3 op_sel_hi:[1,0]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel_hi:[1,0] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x08]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,1] ; encoding: [0x01,0x58,0x0a,0xcc,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0] ; encoding: [0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x08]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1] ; encoding: [0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x10]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x10]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0] ; encoding: [0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x08]
+
+//
+// Test src2 op_sel/op_sel_hi
+//
+v_pk_fma_f16 v8, v0, s0, v1
+// GFX11: encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,0,0] neg_hi:[0,0,0]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[1,1,1] neg_lo:[0,0,0] neg_hi:[0,0,0]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[1,1,1]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[0,0,0]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x0e,0xcc,0x00,0x01,0x04,0x04]
+
+v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,1] op_sel_hi:[0,0,1]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x0e,0xcc,0x00,0x01,0x04,0x04]
+
+//
+// Test neg_lo/neg_hi
+//
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0xfc]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,1,1]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] neg_hi:[1,1,1]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x0e,0xcc,0x00,0x01,0x04,0xfc]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,0,0]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x3c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,1,0]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x5c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,0,1]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x9c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,0,0]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,1,0]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,0,1]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+
+//
+// DOT
+//
+v_dot4_i32_iu8 v3, v4, v5, v6
+// GFX11: v_dot4_i32_iu8 v3, v4, v5, v6 ; encoding: [0x03,0x40,0x16,0xcc,0x04,0x0b,0x1a,0x1c]
+
+v_dot4_i32_iu8 v3, v4, v5, 0xf neg_lo:[1,1]
+// GFX11: v_dot4_i32_iu8 v3, v4, v5, 15 neg_lo:[1,1,0] ; encoding: [0x03,0x40,0x16,0xcc,0x04,0x0b,0x3e,0x7a]
+
+v_dot4_u32_u8 v3, v4, v5, v6
+// GFX11: v_dot4_u32_u8 v3, v4, v5, v6 ; encoding: [0x03,0x40,0x17,0xcc,0x04,0x0b,0x1a,0x1c]
+
+v_dot4_i32_iu8 v3, v4, v5, 0xf
+// GFX11: v_dot4_i32_iu8 v3, v4, v5, 15 ; encoding: [0x03,0x40,0x16,0xcc,0x04,0x0b,0x3e,0x1a]
+
+v_dot8_i32_iu4 v3, v4, v5, 0xf neg_lo:[1,0]
+// GFX11: v_dot8_i32_iu4 v3, v4, v5, 15 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x18,0xcc,0x04,0x0b,0x3e,0x3a]
+
+v_dot8_i32_iu4 v3, v4, v5, v0 neg_lo:[0,0]
+// GFX11: v_dot8_i32_iu4 v3, v4, v5, v0 ; encoding: [0x03,0x40,0x18,0xcc,0x04,0x0b,0x02,0x1c]
+
+v_dot8_u32_u4 v0, v1, v2, v3
+// GFX11: v_dot8_u32_u4 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x19,0xcc,0x01,0x05,0x0e,0x1c]
+
+v_dot2_f32_f16 v0, v1, v2, v3
+// GFX11: v_dot2_f32_f16 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x13,0xcc,0x01,0x05,0x0e,0x1c]
+
+v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1]
+// GFX11: v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] ; encoding: [0x00,0x45,0x13,0xcc,0x01,0x05,0x0e,0x7c]
+
+v_dot2_f32_bf16 v0, v1, v2, v3
+// GFX11: v_dot2_f32_bf16 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x1a,0xcc,0x01,0x05,0x0e,0x1c]
+
+v_dot2_f32_bf16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1]
+// GFX11: v_dot2_f32_bf16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] ; encoding: [0x00,0x45,0x1a,0xcc,0x01,0x05,0x0e,0x3c]
+
+//
+// FMA_MIX
+//
+v_fma_mix_f32 v0, v1, v2, v3
+// GFX11: v_fma_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x00,0x20,0xcc,0x01,0x05,0x0e,0x04]
+
+v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,0,1]
+// GFX11: v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x00,0x20,0x20,0xcc,0x01,0x05,0x0e,0x04]
+
+v_fma_mixlo_f16 v0, abs(v1), -v2, abs(v3)
+// GFX11: v_fma_mixlo_f16 v0, |v1|, -v2, |v3| ; encoding: [0x00,0x05,0x21,0xcc,0x01,0x05,0x0e,0x44]
+
+v_fma_mixhi_f16 v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp
+// GFX11: v_fma_mixhi_f16 v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp ; encoding: [0x00,0xc0,0x22,0xcc,0x01,0x05,0x0e,0x1c]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_all.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_all.txt
index 3e61d3bb29a4f..875427aaaa9db 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_all.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_all.txt
@@ -14550,6 +14550,27 @@
# GFX11: v_dot2acc_f32_f16 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x04]
0x01,0x05,0x0a,0x04
+# GFX11: v_dot4_i32_iu8 v3, v4, v5, v6 ; encoding: [0x03,0x40,0x16,0xcc,0x04,0x0b,0x1a,0x1c]
+0x03,0x40,0x16,0xcc,0x04,0x0b,0x1a,0x1c
+
+# GFX11: v_dot4_i32_iu8 v3, v4, v5, 15 neg_lo:[1,1,0] ; encoding: [0x03,0x40,0x16,0xcc,0x04,0x0b,0x3e,0x7a]
+0x03,0x40,0x16,0xcc,0x04,0x0b,0x3e,0x7a
+
+# GFX11: v_dot4_i32_iu8 v3, v4, v5, 15 ; encoding: [0x03,0x40,0x16,0xcc,0x04,0x0b,0x3e,0x1a]
+0x03,0x40,0x16,0xcc,0x04,0x0b,0x3e,0x1a
+
+# GFX11: v_dot4_u32_u8 v3, v4, v5, v6 ; encoding: [0x03,0x40,0x17,0xcc,0x04,0x0b,0x1a,0x1c]
+0x03,0x40,0x17,0xcc,0x04,0x0b,0x1a,0x1c
+
+# GFX11: v_dot8_i32_iu4 v3, v4, v5, 15 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x18,0xcc,0x04,0x0b,0x3e,0x3a]
+0x03,0x40,0x18,0xcc,0x04,0x0b,0x3e,0x3a
+
+# GFX11: v_dot8_i32_iu4 v3, v4, v5, v0 ; encoding: [0x03,0x40,0x18,0xcc,0x04,0x0b,0x02,0x1c]
+0x03,0x40,0x18,0xcc,0x04,0x0b,0x02,0x1c
+
+# GFX11: v_dot8_u32_u4 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x19,0xcc,0x01,0x05,0x0e,0x1c]
+0x00,0x40,0x19,0xcc,0x01,0x05,0x0e,0x1c
+
# GFX11: v_exp_f32_e32 v255, v1 ; encoding: [0x01,0x4b,0xfe,0x7f]
0x01,0x4b,0xfe,0x7f
@@ -23246,6 +23267,132 @@
# GFX11: v_xor3_b32 v5, vcc_lo, v2, v3 ; encoding: [0x05,0x00,0x40,0xd6,0x6a,0x04,0x0e,0x04]
0x05,0x00,0x40,0xd6,0x6a,0x04,0x0e,0x04
+# GFX11: v_pk_add_f16 v0, v1, v2 ; encoding: [0x00,0x40,0x0f,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x0f,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_add_i16 v0, v1, v2 ; encoding: [0x00,0x40,0x02,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x02,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_add_u16 v0, v1, v2 ; encoding: [0x00,0x40,0x0a,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x0a,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_add_u16 v1, v2, v3 clamp ; encoding: [0x01,0xc0,0x0a,0xcc,0x02,0x07,0x02,0x18]
+0x01,0xc0,0x0a,0xcc,0x02,0x07,0x02,0x18
+
+# GFX11: v_pk_add_u16 v1, v2, v3 ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x18]
+0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x18
+
+# GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] ; encoding: [0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x18]
+0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x18
+
+# GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1] ; encoding: [0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x10]
+0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x10
+
+# GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0] ; encoding: [0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x08]
+0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x08
+
+# GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] ; encoding: [0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x18]
+0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x18
+
+# GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x10]
+0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x10
+
+# GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0] ; encoding: [0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x08]
+0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x08
+
+# GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,1] ; encoding: [0x01,0x58,0x0a,0xcc,0x02,0x07,0x02,0x18]
+0x01,0x58,0x0a,0xcc,0x02,0x07,0x02,0x18
+
+# GFX11: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,0] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x00]
+0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x00
+
+# GFX11: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,1] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x10]
+0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x10
+
+# GFX11: v_pk_add_u16 v1, v2, v3 op_sel_hi:[1,0] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x08]
+0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x08
+
+# GFX11: v_pk_ashrrev_i16 v0, v1, v2 ; encoding: [0x00,0x40,0x06,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x06,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_fma_f16 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x0e,0xcc,0x01,0x05,0x0e,0x1c]
+0x00,0x40,0x0e,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 clamp ; encoding: [0x08,0xc0,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+0x08,0xc0,0x0e,0xcc,0x00,0x01,0x04,0x1c
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x1c
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+0x08,0x44,0x0e,0xcc,0x00,0x01,0x04,0x1c
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+0x08,0x42,0x0e,0xcc,0x00,0x01,0x04,0x1c
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+0x08,0x41,0x0e,0xcc,0x00,0x01,0x04,0x1c
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+0x08,0x47,0x0e,0xcc,0x00,0x01,0x04,0x1c
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x9c]
+0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x9c
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x5c]
+0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x5c
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x3c]
+0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x3c
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0xfc]
+0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0xfc
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x0e,0xcc,0x00,0x01,0x04,0xfc]
+0x08,0x47,0x0e,0xcc,0x00,0x01,0x04,0xfc
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x0e,0xcc,0x00,0x01,0x04,0x04]
+0x08,0x60,0x0e,0xcc,0x00,0x01,0x04,0x04
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x0e,0xcc,0x00,0x01,0x04,0x04]
+0x08,0x00,0x0e,0xcc,0x00,0x01,0x04,0x04
+
+# GFX11: v_pk_lshlrev_b16 v0, v1, v2 ; encoding: [0x00,0x40,0x04,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x04,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_lshrrev_b16 v0, v1, v2 ; encoding: [0x00,0x40,0x05,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x05,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_max_f16 v0, v1, v2 ; encoding: [0x00,0x40,0x12,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x12,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_max_i16 v0, v1, v2 ; encoding: [0x00,0x40,0x07,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x07,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_max_u16 v0, v1, v2 ; encoding: [0x00,0x40,0x0c,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x0c,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_min_f16 v0, v1, v2 ; encoding: [0x00,0x40,0x11,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x11,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_min_i16 v0, v1, v2 clamp ; encoding: [0x00,0xc0,0x08,0xcc,0x01,0x05,0x02,0x18]
+0x00,0xc0,0x08,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_min_i16 v0, v1, v2 ; encoding: [0x00,0x40,0x08,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x08,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_min_u16 v0, v1, v2 ; encoding: [0x00,0x40,0x0d,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x0d,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_mul_f16 v0, v1, v2 ; encoding: [0x00,0x40,0x10,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x10,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_mul_lo_u16 v0, v1, v2 ; encoding: [0x00,0x40,0x01,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x01,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_sub_i16 v0, v1, v2 ; encoding: [0x00,0x40,0x03,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x03,0xcc,0x01,0x05,0x02,0x18
+
# W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0x00]
# W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0x00]
0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0x00
@@ -24560,6 +24707,69 @@
# GFX11: v_xnor_b32_e64_dpp v8, v5, v2 quad_perm:[1,0,2,3] row_mask:0x1 bank_mask:0x0 ; encoding: [0x08,0x00,0x1e,0xd5,0xfa,0x04,0x02,0x00,0x05,0xe1,0x00,0x10]
0x08,0x00,0x1e,0xd5,0xfa,0x04,0x02,0x00,0x05,0xe1,0x00,0x10
+# GFX11: v_dot2_f32_f16 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x13,0xcc,0x01,0x05,0x0e,0x1c]
+0x00,0x40,0x13,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX11: v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] ; encoding: [0x00,0x45,0x13,0xcc,0x01,0x05,0x0e,0x7c]
+0x00,0x45,0x13,0xcc,0x01,0x05,0x0e,0x7c
+
+# GFX11: v_dot2_f32_bf16 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x1a,0xcc,0x01,0x05,0x0e,0x1c]
+0x00,0x40,0x1a,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX11: v_dot2_f32_bf16 v0, v1, v2, v3 neg_lo:[1,0,0] neg_hi:[1,0,1] ; encoding: [0x00,0x45,0x1a,0xcc,0x01,0x05,0x0e,0x3c]
+0x00,0x45,0x1a,0xcc,0x01,0x05,0x0e,0x3c
+
+# GFX11: v_fma_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x00,0x20,0xcc,0x01,0x05,0x0e,0x04]
+0x00,0x00,0x20,0xcc,0x01,0x05,0x0e,0x04
+
+# GFX11: v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x00,0x20,0x20,0xcc,0x01,0x05,0x0e,0x04]
+0x00,0x20,0x20,0xcc,0x01,0x05,0x0e,0x04
+
+# GFX11: v_fma_mixhi_f16 v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp ; encoding: [0x00,0xc0,0x22,0xcc,0x01,0x05,0x0e,0x1c]
+0x00,0xc0,0x22,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX11: v_fma_mixlo_f16 v0, |v1|, -v2, |v3| ; encoding: [0x00,0x05,0x21,0xcc,0x01,0x05,0x0e,0x44]
+0x00,0x05,0x21,0xcc,0x01,0x05,0x0e,0x44
+
+# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05]
+0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05
+
+# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe]
+0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe
+
+# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x0c,0xff]
+0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x0c,0xff
+
+# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x00,0x00]
+0xfa,0x04,0x0a,0x04,0x01,0xe4,0x00,0x00
+
+# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x04,0x00]
+0xfa,0x04,0x0a,0x04,0x01,0xe4,0x04,0x00
+
+# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x04,0x01,0x77,0x39,0x05]
+0xe9,0x04,0x0a,0x04,0x01,0x77,0x39,0x05
+
+# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0a,0x04,0x01,0x77,0x39,0x05]
+0xea,0x04,0x0a,0x04,0x01,0x77,0x39,0x05
+
+# GFX11: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 clamp dpp8:[2,2,2,2,4,4,4,4] fi:1 ; encoding: [0x00,0x80,0x20,0xcc,0xea,0x04,0x0e,0x04,0x01,0x92,0x44,0x92]
+0x00,0x80,0x20,0xcc,0xea,0x04,0x0e,0x04,0x01,0x92,0x44,0x92
+
+# GFX11: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 dpp8:[2,2,2,2,4,4,4,4] ; encoding: [0x00,0x00,0x20,0xcc,0xe9,0x04,0x0e,0x04,0x01,0x92,0x44,0x92]
+0x00,0x00,0x20,0xcc,0xe9,0x04,0x0e,0x04,0x01,0x92,0x44,0x92
+
+# GFX11: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 row_ror:7 row_mask:0xf bank_mask:0x1 bound_ctrl:1 ; encoding: [0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x09,0xf1]
+0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x09,0xf1
+
+# GFX11: v_fma_mixhi_f16_e64_dpp v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp quad_perm:[0,2,3,1] row_mask:0x0 bank_mask:0xf ; encoding: [0x00,0xc0,0x22,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x78,0x00,0x0f]
+0x00,0xc0,0x22,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x78,0x00,0x0f
+
+# GFX11: v_fma_mixlo_f16_e64_dpp v0, |v1|, -v2, |v3| dpp8:[2,2,2,2,4,4,4,4] ; encoding: [0x00,0x05,0x21,0xcc,0xe9,0x04,0x0e,0x44,0x01,0x92,0x44,0x92]
+0x00,0x05,0x21,0xcc,0xe9,0x04,0x0e,0x44,0x01,0x92,0x44,0x92
+
+# GFX11: v_fma_mixlo_f16_e64_dpp v0, |v1|, -v2, |v3| op_sel:[1,0,0] op_sel_hi:[1,0,0] dpp8:[2,2,2,2,4,4,4,4] ; encoding: [0x00,0x0d,0x21,0xcc,0xe9,0x04,0x0e,0x4c,0x01,0x92,0x44,0x92]
+0x00,0x0d,0x21,0xcc,0xe9,0x04,0x0e,0x4c,0x01,0x92,0x44,0x92
+
# GFX11: v_permlane64_b32 v5, v1 ; encoding: [0x01,0xcf,0x0a,0x7e]
0x01,0xcf,0x0a,0x7e
More information about the llvm-commits
mailing list