[llvm] 40f35ce - [AMDGPU] gfx11 VOP3P instruction MC support

Wed Jun 8 11:00:00 PDT 2022

Author: Joe Nash
Date: 2022-06-08T13:32:01-04:00
New Revision: 40f35cef894a4f899d1a0a31dd9600b9ce5e769b

URL: https://github.com/llvm/llvm-project/commit/40f35cef894a4f899d1a0a31dd9600b9ce5e769b
DIFF: https://github.com/llvm/llvm-project/commit/40f35cef894a4f899d1a0a31dd9600b9ce5e769b.diff

LOG: [AMDGPU] gfx11 VOP3P instruction MC support

Includes dpp versions of VOP3P instructions.

Patch 18/N for upstreaming of AMDGPU gfx11 architecture

Depends on D126917

Reviewed By: rampitec, #amdgpu

Differential Revision: https://reviews.llvm.org/D126978

Added: 
    llvm/test/MC/AMDGPU/gfx11_vop3p.s

Modified: 
    llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
    llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
    llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
    llvm/lib/Target/AMDGPU/SIInstrInfo.td
    llvm/lib/Target/AMDGPU/VOP3PInstructions.td
    llvm/lib/Target/AMDGPU/VOPInstructions.td
    llvm/test/MC/AMDGPU/gfx11_asm_dpp.s
    llvm/test/MC/AMDGPU/gfx11_err.s
    llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_all.txt

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index b2ed4b48f39e3..5cfd2dc83aa00 100644

--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -8255,6 +8255,11 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
     if (OpIdx == -1)
       break;
 
+    int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
+
+    if (ModIdx == -1)
+      continue;
+
     uint32_t ModVal = 0;
 
     if ((OpSel & (1 << J)) != 0)
@@ -8269,8 +8274,6 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
     if ((NegHi & (1 << J)) != 0)
       ModVal |= SISrcMods::NEG_HI;
 
-    int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
-
     Inst.getOperand(ModIdx).setImm(Inst.getOperand(ModIdx).getImm() | ModVal);
   }
 }
@@ -8636,7 +8639,9 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, bo
   if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod) != -1) {
     addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
   }
-  if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1) {
+  if (Desc.TSFlags & SIInstrFlags::VOP3P)
+    cvtVOP3P(Inst, Operands, OptionalIdx);
+  else if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1) {
     addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOpSel);
   }
 

diff  --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 4f6ebe7fdc44d..d312e13bc7d08 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -441,8 +441,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
       MI = MCInst(); // clear
       Res = tryDecodeInst(DecoderTableDPPGFX1196, MI, DecW,
                                           Address);
-      if (Res)
+      if (Res) {
+        if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P)
+          convertVOP3PDPPInst(MI);
         break;
+      }
     }
     // Reinitialize Bytes
     Bytes = Bytes_.slice(0, MaxInstBytesNum);
@@ -729,18 +732,20 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
 DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
   unsigned Opc = MI.getOpcode();
   unsigned DescNumOps = MCII->get(Opc).getNumOperands();
-
-  // Insert dummy unused src modifiers.
-  if (MI.getNumOperands() < DescNumOps &&
-      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1)
-    insertNamedMCOperand(MI, MCOperand::createImm(0),
-                         AMDGPU::OpName::src0_modifiers);
-
-  if (MI.getNumOperands() < DescNumOps &&
-      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers) != -1)
-    insertNamedMCOperand(MI, MCOperand::createImm(0),
-                         AMDGPU::OpName::src1_modifiers);
-
+  if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) {
+    convertVOP3PDPPInst(MI);
+  } else {
+    // Insert dummy unused src modifiers.
+    if (MI.getNumOperands() < DescNumOps &&
+        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1)
+      insertNamedMCOperand(MI, MCOperand::createImm(0),
+                           AMDGPU::OpName::src0_modifiers);
+
+    if (MI.getNumOperands() < DescNumOps &&
+        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers) != -1)
+      insertNamedMCOperand(MI, MCOperand::createImm(0),
+                           AMDGPU::OpName::src1_modifiers);
+  }
   return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail;
 }
 
@@ -882,6 +887,56 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
   return MCDisassembler::Success;
 }
 
+// VOP3P DPP keeps op_sel/neg bits both in srcN_modifiers and in standalone
+// operands. The autogenerated decoder only fills srcN_modifiers, so insert the
+// missing vdst_in (imm 0), op_sel, op_sel_hi, neg_lo and neg_hi operands here.
+DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const {
+  unsigned Opc = MI.getOpcode();
+  unsigned DescNumOps = MCII->get(Opc).getNumOperands();
+
+  if (MI.getNumOperands() < DescNumOps &&
+      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in) != -1)
+    insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vdst_in);
+
+  const int ModOps[] = {AMDGPU::OpName::src0_modifiers,
+                        AMDGPU::OpName::src1_modifiers,
+                        AMDGPU::OpName::src2_modifiers};
+  unsigned OpSel = 0;
+  unsigned OpSelHi = 0;
+  unsigned NegLo = 0;
+  unsigned NegHi = 0;
+  for (int J = 0; J < 3; ++J) {
+    int OpIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
+    if (OpIdx == -1)
+      break; // Stop at the first source without a modifiers operand.
+    unsigned Val = MI.getOperand(OpIdx).getImm();
+
+    OpSel |= !!(Val & SISrcMods::OP_SEL_0) << J; // Bit J describes source J.
+    OpSelHi |= !!(Val & SISrcMods::OP_SEL_1) << J;
+    NegLo |= !!(Val & SISrcMods::NEG) << J;
+    NegHi |= !!(Val & SISrcMods::NEG_HI) << J;
+  }
+
+  if (MI.getNumOperands() < DescNumOps &&
+      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1)
+    insertNamedMCOperand(MI, MCOperand::createImm(OpSel),
+                         AMDGPU::OpName::op_sel);
+  if (MI.getNumOperands() < DescNumOps &&
+      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi) != -1)
+    insertNamedMCOperand(MI, MCOperand::createImm(OpSelHi),
+                         AMDGPU::OpName::op_sel_hi);
+  if (MI.getNumOperands() < DescNumOps &&
+      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo) != -1)
+    insertNamedMCOperand(MI, MCOperand::createImm(NegLo),
+                         AMDGPU::OpName::neg_lo);
+  if (MI.getNumOperands() < DescNumOps &&
+      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi) != -1)
+    insertNamedMCOperand(MI, MCOperand::createImm(NegHi),
+                         AMDGPU::OpName::neg_hi);
+
+  return MCDisassembler::Success;
+}
+
 DecodeStatus AMDGPUDisassembler::convertFMAanyK(MCInst &MI,
                                                 int ImmLitIdx) const {
   assert(HasLiteral && "Should have decoded a literal");

diff  --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 6011d1bfe186e..8704bd0de47d9 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -162,6 +162,7 @@ class AMDGPUDisassembler : public MCDisassembler {
   DecodeStatus convertSDWAInst(MCInst &MI) const;
   DecodeStatus convertDPP8Inst(MCInst &MI) const;
   DecodeStatus convertMIMGInst(MCInst &MI) const;
+  DecodeStatus convertVOP3PDPPInst(MCInst &MI) const;
 
   MCOperand decodeOperand_VGPR_32(unsigned Val) const;
   MCOperand decodeOperand_VRegOrLds_32(unsigned Val) const;

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 4084b01e0a026..7ce2a90073256 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1496,6 +1496,7 @@ def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">;
 def VOP3PMods  : ComplexPattern<untyped, 2, "SelectVOP3PMods">;
 
 def VOP3PModsDOT  : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">;
+def DotIUVOP3PMods  : ComplexPattern<untyped, 1, "SelectDotIUVOP3PMods">;
 
 def VOP3OpSel  : ComplexPattern<untyped, 2, "SelectVOP3OpSel">;
 

diff  --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index aecd40f411ec1..c9f0ff0ae0f54 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -10,19 +10,33 @@
 // VOP3P Classes
 //===----------------------------------------------------------------------===//
 
+class VOP3P_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
+                    bit HasDPP = 0> : VOP3_Profile<P, Features> {
+  let IsVOP3P = 1;
+  let HasExtVOP3DPP = HasDPP;
+  // We do not want to print src modifiers for vop3p because the bits are
+  // overloaded in meaning and the logic in printOperandAndFPInputMods is
+  // wrong for vop3p
+  let AsmVOP3DPPBase = AsmVOP3P;
+}
+
 // Used for FMA_MIX* and MAD_MIX* insts
 // Their operands are only sort of f16 operands. Depending on
 // op_sel_hi, these may be interpreted as f32. The inline immediate
 // values are really f16 converted to f32, so we treat these as f16
 // operands.
 class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
-                    bit useTiedOutput = 0> : VOP3_Profile<P, Features> {
+                    bit useTiedOutput = 0> : VOP3P_Profile<P, Features, 1> {
     bit UseTiedOutput = useTiedOutput;
 
     dag srcs =
           (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0,
                FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
                FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);
+    dag dpp_srcs =
+          (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0,
+               FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
+               FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);
 
            // FIXME: clampmod0 misbehaves with the non-default vdst_in
            // following it. For now workaround this by requiring clamp
@@ -35,8 +49,10 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
     // We use Ins64 because that is the one which populates InOperandList
     // due to the logic in class VOP3_Pseudo
     let Ins64 = !con(srcs, mods);
+    let InsVOP3Base = !con(dpp_srcs, mods);
     let Asm64 =
       "$vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp";
+    let AsmVOP3DPPBase = Asm64;
 }
 
 multiclass VOP3PInst<string OpName, VOPProfile P,
@@ -45,6 +61,13 @@ multiclass VOP3PInst<string OpName, VOPProfile P,
                           !if (P.HasModifiers,
                                getVOP3PModPat<P, node, IsDOT, IsDOT>.ret,
                                getVOP3Pat<P, node>.ret)>;
+  let SubtargetPredicate = isGFX11Plus in {
+  if P.HasExtVOP3DPP then
+    def _dpp : VOP3_DPP_Pseudo<OpName, P> {
+      let VOP3P = 1;
+      let PseudoInstr = OpName #"_dpp";
+    }
+  } // end SubtargetPredicate = isGFX11Plus
 }
 
 // Non-packed instructions that use the VOP3P encoding.
@@ -54,36 +77,45 @@ multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> {
     let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", "");
     let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", "");
   }
+  let SubtargetPredicate = isGFX11Plus in {
+    if P.HasExtVOP3DPP then
+      def _dpp : VOP3_DPP_Pseudo<OpName, P> {
+        let VOP3P = 1;
+        let PseudoInstr = OpName#"_dpp";
+        let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", "");
+        let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", "");
+      }
+  } // end SubtargetPredicate = isGFX11Plus
 }
 
 let isCommutable = 1 in {
-defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
-defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
+defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
+defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
 
 let FPDPRounding = 1 in {
-defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>;
-defm V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fadd>;
-defm V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fmul>;
+defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>;
+defm V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, any_fadd>;
+defm V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, any_fmul>;
 } // End FPDPRounding = 1
-defm V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
-defm V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;
+defm V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
+defm V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;
 
-defm V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
-defm V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-defm V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>;
+defm V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, add>;
+defm V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>>;
+defm V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, mul>;
 
-defm V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>;
-defm V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umin>;
-defm V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smax>;
-defm V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>;
+defm V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, smin>;
+defm V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, umin>;
+defm V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, smax>;
+defm V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, umax>;
 }
 
-defm V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-defm V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>;
+defm V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>>;
+defm V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, sub>;
 
-defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, clshl_rev_16>;
-defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, cashr_rev_16>;
-defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, clshr_rev_16>;
+defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, clshl_rev_16>;
+defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, cashr_rev_16>;
+defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, clshr_rev_16>;
 
 
 let SubtargetPredicate = HasVOP3PInsts in {
@@ -296,34 +328,63 @@ let IsDOT = 1 in {
 let SubtargetPredicate = HasDot2Insts in {
 
 defm V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16",
-  VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2, 1>;
+  VOP3P_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2, 1>;
 defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
-  VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>;
+  VOP3P_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>;
 
 } // End SubtargetPredicate = HasDot2Insts
 
 let SubtargetPredicate = HasDot7Insts in {
 
 defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
-  VOP3_Profile<VOP_F32_V2F16_V2F16_F32>,
+  VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR, /*HasDPP*/ 1>,
   AMDGPUfdot2, 1/*ExplicitClamp*/>;
 defm V_DOT4_U32_U8  : VOP3PInst<"v_dot4_u32_u8",
-  VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
+  VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
 defm V_DOT8_U32_U4  : VOP3PInst<"v_dot8_u32_u4",
-  VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>;
+  VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>;
 
 } // End SubtargetPredicate = HasDot7Insts
 
 let SubtargetPredicate = HasDot1Insts in {
 
 defm V_DOT4_I32_I8  : VOP3PInst<"v_dot4_i32_i8",
-  VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
+  VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
 defm V_DOT8_I32_I4  : VOP3PInst<"v_dot8_i32_i4",
-  VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>;
+  VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>;
 
 } // End SubtargetPredicate = HasDot1Insts
+
+let SubtargetPredicate = HasDot8Insts  in {
+
+defm V_DOT2_F32_BF16 : VOP3PInst<"v_dot2_f32_bf16",
+  VOP3P_Profile<VOP_F32_V2I16_V2I16_F32, VOP3_REGULAR, /*HasDPP*/ 1>,
+  null_frag, 1>;
+
+} // End SubtargetPredicate = HasDot8Insts
+
 } // End let IsDOT = 1
 
+multiclass VOP3PDOTIUInst <string OpName, SDPatternOperator intrinsic_node> {
+  let IsDOT = 1 in
+  defm NAME : VOP3PInst<OpName, VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>,
+                        null_frag, 1>;
+  // Dot-iu instructions consider input as signed if imod neg bits are set. Thus
+  // Dot-iu Intrinsics have extra operands and require separate codegen pattern.
+  def : GCNPat < (intrinsic_node (DotIUVOP3PMods i32:$src0_mods), i32:$src0,
+                                 (DotIUVOP3PMods i32:$src1_mods), i32:$src1,
+                                 i32:$src2, (i1 timm:$clamp)),
+                 (!cast<Instruction>(NAME) $src0_mods, i32:$src0,
+                                           $src1_mods, i32:$src1,
+                                           (i32 8), i32:$src2, i1:$clamp)
+  >;
+}
+
+let SubtargetPredicate = HasDot8Insts  in {
+defm V_DOT4_I32_IU8 : VOP3PDOTIUInst<"v_dot4_i32_iu8", null_frag>;
+defm V_DOT8_I32_IU4 : VOP3PDOTIUInst<"v_dot8_i32_iu4", null_frag>;
+} // End SubtargetPredicate = HasDot8Insts
+
 def : UDot2Pat<V_DOT2_U32_U16>;
 def : SDot2Pat<V_DOT2_I32_I16>;
 
@@ -364,18 +425,18 @@ def VDst_256  : VOPDstOperand<VReg_256>;
 def VDst_512  : VOPDstOperand<VReg_512>;
 def VDst_1024 : VOPDstOperand<VReg_1024>;
 
-def VOPProfileAccRead : VOP3_Profile<VOP_I32_I32, VOP3_MAI> {
+def VOPProfileAccRead : VOP3P_Profile<VOP_I32_I32, VOP3_MAI> {
   let Src0RC64 = ARegSrc_32;
 }
 
-def VOPProfileAccWrite : VOP3_Profile<VOP_I32_I32, VOP3_MAI> {
+def VOPProfileAccWrite : VOP3P_Profile<VOP_I32_I32, VOP3_MAI> {
   let DstRC = ADst_32;
   let Src0RC64 = VCSrc_b32;
 }
 
 class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC,
                     RegisterOperand SrcABRC = AVSrc_32>
-  : VOP3_Profile<P, VOP3_MAI> {
+  : VOP3P_Profile<P, VOP3_MAI> {
   let DstRC = _DstRC;
   let Src0RC64 = SrcABRC;
   let Src1RC64 = SrcABRC;
@@ -386,7 +447,9 @@ class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC
   let HasOMod = 0;
   let HasModifiers = 0;
   let Asm64 = "$vdst, $src0, $src1, $src2$cbsz$abid$blgp";
+  let AsmVOP3DPPBase = Asm64;
   let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, cbsz:$cbsz, abid:$abid, blgp:$blgp);
+  let InsVOP3Base = Ins64;
   // Dst and SrcC cannot partially overlap if SrcC/Dst is bigger than 4 VGPRs.
   // We then create two versions of the instruction: with tied dst and src2
   // and with the earlyclobber flag on the dst. This is stricter than the
@@ -601,10 +664,10 @@ def MAIInstInfoTable : GenericTable {
 }
 
 let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1 in {
-  defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fma>;
-  defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>;
-  defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>;
-  defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>;
+  defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fma>;
+  defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>;
+  defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>;
+  defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>;
 } // End SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1
 
 def : MnemonicAlias<"v_accvgpr_read",  "v_accvgpr_read_b32">;
@@ -614,6 +677,72 @@ def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">;
 // Begin Real Encodings
 //===----------------------------------------------------------------------===//
 
+class VOP3P_DPP16<bits<7> op, VOP_DPP_Pseudo ps, int subtarget,
+                  string opName = ps.OpName>
+    : VOP3P_DPP<op, opName, ps.Pfl, 1>, SIMCInstr<ps.PseudoInstr, subtarget> {
+  let hasSideEffects = ps.hasSideEffects;
+  let Defs = ps.Defs;
+  let SchedRW = ps.SchedRW;
+  let Uses = ps.Uses;
+  let AssemblerPredicate = HasDPP16;
+  let SubtargetPredicate = HasDPP16;
+  let OtherPredicates = ps.OtherPredicates;
+}
+
+class VOP3P_DPP8_Base<bits<7> op, VOP_Pseudo ps, string opName = ps.OpName>
+    : VOP3P_DPP8<op, opName, ps.Pfl> {
+  let hasSideEffects = ps.hasSideEffects;
+  let Defs = ps.Defs;
+  let SchedRW = ps.SchedRW;
+  let Uses = ps.Uses;
+  let OtherPredicates = ps.OtherPredicates;
+}
+
+//===----------------------------------------------------------------------===//
+// GFX11.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX11Plus,
+    DecoderNamespace = "GFX11" in {
+
+  multiclass VOP3P_Real_gfx11<bits<7> op, string backing_ps_name = NAME,
+                       string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
+    def _gfx11 : VOP3P_Real<!cast<VOP3P_Pseudo>(backing_ps_name),
+                            SIEncodingFamily.GFX11, asmName>,
+                 VOP3Pe_gfx11<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl>;
+  }
+
+  multiclass VOP3P_Real_dpp_gfx11<bits<7> op, string backing_ps_name = NAME,
+                       string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
+    defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name);
+    def _dpp_gfx11
+        : VOP3P_DPP16<op, !cast<VOP_DPP_Pseudo>(backing_ps_name #"_dpp"),
+                      SIEncodingFamily.GFX11> {
+      let AsmString = asmName #ps.Pfl.AsmVOP3DPP16;
+      let DecoderNamespace = "DPPGFX11";
+    }
+  }
+
+  multiclass VOP3P_Real_dpp8_gfx11<bits<7> op, string backing_ps_name = NAME,
+                       string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
+    defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name);
+    def _dpp8_gfx11 : VOP3P_DPP8_Base<op, ps> {
+      let AsmString = asmName #ps.Pfl.AsmVOP3DPP8;
+      let DecoderNamespace = "DPP8GFX11";
+    }
+  }
+
+  multiclass VOP3P_Realtriple_gfx11<bits<7> op, string backing_ps_name = NAME,
+                        string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic>
+      : VOP3P_Real_gfx11<op, backing_ps_name, asmName>,
+        VOP3P_Real_dpp_gfx11<op, backing_ps_name, asmName>,
+        VOP3P_Real_dpp8_gfx11<op, backing_ps_name, asmName>;
+} // End AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11"
+
+defm V_DOT4_I32_IU8              : VOP3P_Real_gfx11 <0x16>;
+defm V_DOT8_I32_IU4              : VOP3P_Real_gfx11 <0x18>;
+defm V_DOT2_F32_BF16             : VOP3P_Real_gfx11 <0x1a>;
+
 //===----------------------------------------------------------------------===//
 // GFX8 (VI)
 //===----------------------------------------------------------------------===//
@@ -841,35 +970,41 @@ let SubtargetPredicate = HasPackedFP32Ops in {
 // GFX10.
 //===----------------------------------------------------------------------===//
 
-let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10", VOP3P = 1 in {
+let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10", VOP3P = 1 in {
   multiclass VOP3P_Real_gfx10<bits<7> op> {
     def _gfx10 : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.GFX10>,
                  VOP3Pe_gfx10 <op, !cast<VOP3P_Pseudo>(NAME).Pfl>;
   }
-} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10", VOP3P = 1
-
-defm V_PK_MAD_I16     : VOP3P_Real_gfx10<0x00>;
-defm V_PK_MUL_LO_U16  : VOP3P_Real_gfx10<0x01>;
-defm V_PK_ADD_I16     : VOP3P_Real_gfx10<0x02>;
-defm V_PK_SUB_I16     : VOP3P_Real_gfx10<0x03>;
-defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10<0x04>;
-defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10<0x05>;
-defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10<0x06>;
-defm V_PK_MAX_I16     : VOP3P_Real_gfx10<0x07>;
-defm V_PK_MIN_I16     : VOP3P_Real_gfx10<0x08>;
-defm V_PK_MAD_U16     : VOP3P_Real_gfx10<0x09>;
-defm V_PK_ADD_U16     : VOP3P_Real_gfx10<0x0a>;
-defm V_PK_SUB_U16     : VOP3P_Real_gfx10<0x0b>;
-defm V_PK_MAX_U16     : VOP3P_Real_gfx10<0x0c>;
-defm V_PK_MIN_U16     : VOP3P_Real_gfx10<0x0d>;
-defm V_PK_FMA_F16     : VOP3P_Real_gfx10<0x0e>;
-defm V_PK_ADD_F16     : VOP3P_Real_gfx10<0x0f>;
-defm V_PK_MUL_F16     : VOP3P_Real_gfx10<0x10>;
-defm V_PK_MIN_F16     : VOP3P_Real_gfx10<0x11>;
-defm V_PK_MAX_F16     : VOP3P_Real_gfx10<0x12>;
-defm V_FMA_MIX_F32    : VOP3P_Real_gfx10<0x20>;
-defm V_FMA_MIXLO_F16  : VOP3P_Real_gfx10<0x21>;
-defm V_FMA_MIXHI_F16  : VOP3P_Real_gfx10<0x22>;
+} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10", VOP3P = 1
+
+multiclass VOP3P_Real_gfx10_gfx11<bits<7> op>
+  : VOP3P_Real_gfx10<op>, VOP3P_Real_gfx11<op>;
+
+multiclass VOP3P_Real_gfx10_gfx11_Triple<bits<7> op>
+  : VOP3P_Real_gfx10<op>, VOP3P_Realtriple_gfx11<op>;
+
+defm V_PK_MAD_I16     : VOP3P_Real_gfx10_gfx11<0x00>;
+defm V_PK_MUL_LO_U16  : VOP3P_Real_gfx10_gfx11<0x01>;
+defm V_PK_ADD_I16     : VOP3P_Real_gfx10_gfx11<0x02>;
+defm V_PK_SUB_I16     : VOP3P_Real_gfx10_gfx11<0x03>;
+defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10_gfx11<0x04>;
+defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10_gfx11<0x05>;
+defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10_gfx11<0x06>;
+defm V_PK_MAX_I16     : VOP3P_Real_gfx10_gfx11<0x07>;
+defm V_PK_MIN_I16     : VOP3P_Real_gfx10_gfx11<0x08>;
+defm V_PK_MAD_U16     : VOP3P_Real_gfx10_gfx11<0x09>;
+defm V_PK_ADD_U16     : VOP3P_Real_gfx10_gfx11<0x0a>;
+defm V_PK_SUB_U16     : VOP3P_Real_gfx10_gfx11<0x0b>;
+defm V_PK_MAX_U16     : VOP3P_Real_gfx10_gfx11<0x0c>;
+defm V_PK_MIN_U16     : VOP3P_Real_gfx10_gfx11<0x0d>;
+defm V_PK_FMA_F16     : VOP3P_Real_gfx10_gfx11<0x0e>;
+defm V_PK_ADD_F16     : VOP3P_Real_gfx10_gfx11<0x0f>;
+defm V_PK_MUL_F16     : VOP3P_Real_gfx10_gfx11<0x10>;
+defm V_PK_MIN_F16     : VOP3P_Real_gfx10_gfx11<0x11>;
+defm V_PK_MAX_F16     : VOP3P_Real_gfx10_gfx11<0x12>;
+defm V_FMA_MIX_F32    : VOP3P_Real_gfx10_gfx11_Triple <0x20>;
+defm V_FMA_MIXLO_F16  : VOP3P_Real_gfx10_gfx11_Triple <0x21>;
+defm V_FMA_MIXHI_F16  : VOP3P_Real_gfx10_gfx11_Triple <0x22>;
 
 let SubtargetPredicate = HasDot2Insts in {
 
@@ -880,9 +1015,9 @@ defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x15>;
 
 let SubtargetPredicate = HasDot7Insts in {
 
-defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x13>;
-defm V_DOT4_U32_U8  : VOP3P_Real_gfx10 <0x17>;
-defm V_DOT8_U32_U4  : VOP3P_Real_gfx10 <0x19>;
+defm V_DOT2_F32_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x13>;
+defm V_DOT4_U32_U8  : VOP3P_Real_gfx10_gfx11 <0x17>;
+defm V_DOT8_U32_U4  : VOP3P_Real_gfx10_gfx11 <0x19>;
 
 } // End SubtargetPredicate = HasDot7Insts
 

diff  --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 8413fd5181108..d82a5d3831659 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -316,7 +316,6 @@ class VOP3be <VOPProfile P> : Enc64 {
 
 class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 {
   bits<8> vdst;
-  // neg, neg_hi, op_sel put in srcN_modifiers
   bits<4> src0_modifiers;
   bits<9> src0;
   bits<4> src1_modifiers;
@@ -412,6 +411,8 @@ class VOP3Pe_gfx10 <bits<7> op, VOPProfile P> : VOP3Pe<op, P> {
   let Inst{31-23} = 0x198; //encoding
 }
 
+class VOP3Pe_gfx11<bits<7> op, VOPProfile P> : VOP3Pe_gfx10<op, P>;
+
 class VOP3be_gfx6_gfx7<bits<9> op, VOPProfile p> : VOP3be<p> {
   let Inst{25-17} = op;
 }
@@ -705,6 +706,39 @@ class VOP3_DPPe_Common<bits<10> op, VOPProfile P> : VOP3_DPPe_Common_Base<op, P>
   let Inst{58-50} = !if(P.HasSrc2, src2, 0);
 }
 
+class VOP3P_DPPe_Common_Base<bits<7> op, VOPProfile P> : Enc96 {
+  bits<4> src0_modifiers;
+  bits<4> src1_modifiers;
+  bits<4> src2_modifiers;
+  bits<1> clamp;
+
+  let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0
+  let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1
+  let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2
+  let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0)
+  let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1)
+  let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2)
+  let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, ?); // op_sel_hi(2)
+  let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
+  let Inst{22-16} = op;
+  let Inst{31-23} = 0x198; // encoding
+  let Inst{59}    = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, ?); // op_sel_hi(0)
+  let Inst{60}    = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, ?); // op_sel_hi(1)
+  let Inst{61}    = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg_lo src0
+  let Inst{62}    = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg_lo src1
+  let Inst{63}    = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg_lo src2
+}
+
+class VOP3P_DPPe_Common<bits<7> op, VOPProfile P> : VOP3P_DPPe_Common_Base<op, P> {
+  bits<8> vdst;
+  bits<9> src1;
+  bits<9> src2;
+
+  let Inst{7-0} = vdst;
+  let Inst{49-41} = !if(P.HasSrc1, src1, 0);
+  let Inst{58-50} = !if(P.HasSrc2, src2, 0);
+}
+
 class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
   dag Ins = P.InsDPP, string asmOps = P.AsmDPP> :
   InstSI <P.OutsDPP, Ins, OpName#asmOps, pattern>,
@@ -847,6 +881,25 @@ class VOP3_DPP <bits<10> op, string OpName, VOPProfile P, bit IsDPP16,
   let Inst{95-92} = row_mask;
 }
 
+class VOP3P_DPP <bits<7> op, string OpName, VOPProfile P, bit IsDPP16,
+               dag InsDPP = !if(IsDPP16, P.InsVOP3DPP16, P.InsVOP3DPP),
+               string AsmDPP = !if(IsDPP16, P.AsmVOP3DPP16, P.AsmVOP3DPP)> :
+  VOP3_DPP_Base<OpName, P, IsDPP16, InsDPP, AsmDPP>, VOP3P_DPPe_Common<op, P>,
+  VOP3_DPPe_Fields {
+
+  let VOP3P = 1;
+
+  let Inst{40-32} = 0xfa;
+  let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0);
+  let Inst{80-72} = dpp_ctrl;
+  let Inst{82}    = !if(IsDPP16, fi, ?);
+  let Inst{83}    = bound_ctrl;
+
+  // Inst{87-84} ignored by hw
+  let Inst{91-88} = bank_mask;
+  let Inst{95-92} = row_mask;
+}
+
 class VOP_DPP8e<VOPProfile P> : Enc64 {
   bits<8> src0;
   bits<24> dpp8;
@@ -905,6 +958,16 @@ class VOP3_DPP8<bits<10> op, string OpName, VOPProfile P> :
   let Inst{95-72} = dpp8{23-0};
 }
 
+class VOP3P_DPP8<bits<7> op, string OpName, VOPProfile P> : // 96-bit DPP8 form of a VOP3P instruction
+  VOP3_DPP8_Base<OpName, P>, VOP3P_DPPe_Common<op, P>, // VOP3P_DPPe_Common supplies the shared VOP3P bit layout for the first 64 bits
+  VOP3_DPP8e_Fields {
+
+  let VOP3P = 1; // flag this instruction as VOP3P
+  let Inst{40-32} = fi; // fi value goes in the 9 bits just below src1; dpp8 tests in this patch show 0xe9 without fi and 0xea with fi:1
+  let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); // only the low 8 bits of src0 are encoded, in the third dword
+  let Inst{95-72} = dpp8{23-0}; // 24-bit dpp8 lane-select value fills the rest of the third dword
+}
+
 def DPP8Mode {
   int FI_0 = 0xE9;
   int FI_1 = 0xEA;

diff  --git a/llvm/test/MC/AMDGPU/gfx11_asm_dpp.s b/llvm/test/MC/AMDGPU/gfx11_asm_dpp.s
index 9791f5019d47f..a55b847f44988 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_dpp.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_dpp.s
@@ -88,6 +88,23 @@ v_subrev_co_u32_e64_dpp v5, vcc_lo, v1, v2 dpp8:[2,2,2,2,4,4,4,4]
 // W32: encoding: [0x05,0x6a,0x02,0xd7,0xe9,0x04,0x02,0x00,0x01,0x92,0x44,0x92]
 // W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error
 
+; VOP3P
+v_fma_mix_f32 v0, v1, v2, v3 dpp8:[2,2,2,2,4,4,4,4]
+// GFX11: encoding: [0x00,0x00,0x20,0xcc,0xe9,0x04,0x0e,0x04,0x01,0x92,0x44,0x92]
+
+v_fma_mix_f32 v0, v1, v2, v3 clamp dpp8:[2,2,2,2,4,4,4,4] fi:1
+// GFX11: encoding: [0x00,0x80,0x20,0xcc,0xea,0x04,0x0e,0x04,0x01,0x92,0x44,0x92]
+
+v_fma_mixlo_f16 v0, abs(v1), -v2, abs(v3) dpp8:[2,2,2,2,4,4,4,4]
+// GFX11: encoding: [0x00,0x05,0x21,0xcc,0xe9,0x04,0x0e,0x44,0x01,0x92,0x44,0x92]
+
+; For test purpose only. OP_SEL has to be set to all 0 and OP_SEL_HI has to be set to all 1
+v_fma_mixlo_f16 v0, abs(v1), -v2, abs(v3) op_sel:[1,0,0] op_sel_hi:[1,0,0] dpp8:[2,2,2,2,4,4,4,4]
+// GFX11: encoding: [0x00,0x0d,0x21,0xcc,0xe9,0x04,0x0e,0x4c,0x01,0x92,0x44,0x92]
+
+v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: encoding: [0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05]
+
 ; DPP
 
 ; VOP1->3
@@ -191,3 +208,16 @@ v_add_co_u32_e64_dpp v243, vcc, v243, v2 clamp quad_perm:[1,2,3,1] bank_mask: 0x
 // W64: encoding: [0xf3,0xea,0x00,0xd7,0xfa,0x04,0x02,0x00,0xf3,0x79,0x04,0xf5]
 // W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error
 
+
+; VOP3P
+v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,0] neg_hi:[0,0,0] quad_perm:[2,2,3,1] bound_ctrl:0 fi:1
+// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x0c,0xff]
+
+v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] bank_mask:0xe
+// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe]
+
+v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,0,0] row_ror:7 bank_mask:0x1 bound_ctrl:0
+// GFX11: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 row_ror:7 row_mask:0xf bank_mask:0x1 bound_ctrl:1 ; encoding: [0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x09,0xf1]
+
+v_fma_mixhi_f16 v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp quad_perm:[0,2,3,1] row_mask:0x0
+// GFX11: v_fma_mixhi_f16_e64_dpp v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp quad_perm:[0,2,3,1] row_mask:0x0 bank_mask:0xf ; encoding: [0x00,0xc0,0x22,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x78,0x00,0x0f]

diff  --git a/llvm/test/MC/AMDGPU/gfx11_err.s b/llvm/test/MC/AMDGPU/gfx11_err.s
index d8e1187ed79ae..d15935a6eca4a 100644
--- a/llvm/test/MC/AMDGPU/gfx11_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_err.s
@@ -44,3 +44,20 @@ v_add3_u32_e64_dpp v5, v1, v2, 49812340 dpp8:[7,6,5,4,3,2,1,0]
 
 v_add3_u32_e64_dpp v5, v1, s1, v0 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+// On GFX11, v_dot8_i32_i4 is a valid SP3 alias for v_dot8_i32_iu4.
+// However, we intentionally leave it unimplemented because on other
+// processors v_dot8_i32_i4 denotes an instruction of a different
+// behaviour, which is considered potentially dangerous.
+v_dot8_i32_i4 v0, v1, v2, v3
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+// On GFX11, v_dot4_i32_i8 is a valid SP3 alias for v_dot4_i32_iu8.
+// However, we intentionally leave it unimplemented because on other
+// processors v_dot4_i32_i8 denotes an instruction of a different
+// behaviour, which is considered potentially dangerous.
+v_dot4_i32_i8 v0, v1, v2, v3
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_dot4c_i32_i8 v0, v1, v2
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

diff  --git a/llvm/test/MC/AMDGPU/gfx11_vop3p.s b/llvm/test/MC/AMDGPU/gfx11_vop3p.s
new file mode 100644
index 0000000000000..dc56baa77e1fd
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx11_vop3p.s
@@ -0,0 +1,213 @@
+// RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1100 %s | FileCheck --check-prefix=GFX11 %s
+
+v_pk_fma_f16 v8, v0, s0, v1 clamp
+// GFX11: encoding: [0x08,0xc0,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+
+v_pk_add_u16 v1, v2, v3 clamp
+// GFX11: encoding: [0x01,0xc0,0x0a,0xcc,0x02,0x07,0x02,0x18]
+
+v_pk_min_i16 v0, v1, v2 clamp
+// GFX11: encoding: [0x00,0xc0,0x08,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_mul_lo_u16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x01,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_add_i16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x02,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_sub_i16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x03,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_lshlrev_b16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x04,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_lshrrev_b16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x05,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_ashrrev_i16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x06,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_max_i16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x07,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_min_i16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x08,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_add_u16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x0a,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_max_u16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x0c,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_min_u16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x0d,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_fma_f16 v0, v1, v2, v3
+// GFX11: encoding: [0x00,0x40,0x0e,0xcc,0x01,0x05,0x0e,0x1c]
+
+v_pk_add_f16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x0f,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_mul_f16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x10,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_min_f16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x11,0xcc,0x01,0x05,0x02,0x18]
+
+v_pk_max_f16 v0, v1, v2
+// GFX11: encoding: [0x00,0x40,0x12,0xcc,0x01,0x05,0x02,0x18]
+
+//
+// Test op_sel/op_sel_hi
+//
+
+v_pk_add_u16 v1, v2, v3
+// GFX11: encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,0]
+// GFX11: v_pk_add_u16 v1, v2, v3 ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel_hi:[1,1]
+// GFX11: v_pk_add_u16 v1, v2, v3 ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
+// GFX11: v_pk_add_u16 v1, v2, v3 ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,0]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,0] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x00]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,0] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x00]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[1,0]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] ; encoding: [0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,1]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] ; encoding: [0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[1,1]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,1] ; encoding: [0x01,0x58,0x0a,0xcc,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,1]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,1] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x10]
+
+v_pk_add_u16 v1, v2, v3 op_sel_hi:[1,0]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel_hi:[1,0] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x08]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,1] ; encoding: [0x01,0x58,0x0a,0xcc,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0] ; encoding: [0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x08]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1] ; encoding: [0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x10]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x10]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
+// GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0] ; encoding: [0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x08]
+
+//
+// Test src2 op_sel/op_sel_hi
+//
+v_pk_fma_f16 v8, v0, s0, v1
+// GFX11: encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,0,0] neg_hi:[0,0,0]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[1,1,1] neg_lo:[0,0,0] neg_hi:[0,0,0]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[1,1,1]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[0,0,0]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x0e,0xcc,0x00,0x01,0x04,0x04]
+
+v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,1] op_sel_hi:[0,0,1]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x0e,0xcc,0x00,0x01,0x04,0x04]
+
+//
+// Test neg_lo/neg_hi
+//
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0xfc]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,1,1]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] neg_hi:[1,1,1]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x0e,0xcc,0x00,0x01,0x04,0xfc]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,0,0]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x3c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,1,0]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x5c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,0,1]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x9c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,0,0]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,1,0]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,0,1]
+// GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+
+//
+// DOT
+//
+v_dot4_i32_iu8 v3, v4, v5, v6
+// GFX11: v_dot4_i32_iu8 v3, v4, v5, v6                ; encoding: [0x03,0x40,0x16,0xcc,0x04,0x0b,0x1a,0x1c]
+
+v_dot4_i32_iu8 v3, v4, v5, 0xf neg_lo:[1,1]
+// GFX11: v_dot4_i32_iu8 v3, v4, v5, 15 neg_lo:[1,1,0] ; encoding: [0x03,0x40,0x16,0xcc,0x04,0x0b,0x3e,0x7a]
+
+v_dot4_u32_u8 v3, v4, v5, v6
+// GFX11: v_dot4_u32_u8 v3, v4, v5, v6            ; encoding: [0x03,0x40,0x17,0xcc,0x04,0x0b,0x1a,0x1c]
+
+v_dot4_i32_iu8 v3, v4, v5, 0xf
+// GFX11: v_dot4_i32_iu8 v3, v4, v5, 15                ; encoding: [0x03,0x40,0x16,0xcc,0x04,0x0b,0x3e,0x1a]
+
+v_dot8_i32_iu4 v3, v4, v5, 0xf neg_lo:[1,0]
+// GFX11: v_dot8_i32_iu4 v3, v4, v5, 15 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x18,0xcc,0x04,0x0b,0x3e,0x3a]
+
+v_dot8_i32_iu4 v3, v4, v5, v0 neg_lo:[0,0]
+// GFX11: v_dot8_i32_iu4 v3, v4, v5, v0                ; encoding: [0x03,0x40,0x18,0xcc,0x04,0x0b,0x02,0x1c]
+
+v_dot8_u32_u4 v0, v1, v2, v3
+// GFX11: v_dot8_u32_u4 v0, v1, v2, v3            ; encoding: [0x00,0x40,0x19,0xcc,0x01,0x05,0x0e,0x1c]
+
+v_dot2_f32_f16 v0, v1, v2, v3
+// GFX11: v_dot2_f32_f16 v0, v1, v2, v3                ; encoding: [0x00,0x40,0x13,0xcc,0x01,0x05,0x0e,0x1c]
+
+v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1]
+// GFX11: v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] ; encoding: [0x00,0x45,0x13,0xcc,0x01,0x05,0x0e,0x7c]
+
+v_dot2_f32_bf16 v0, v1, v2, v3
+// GFX11: v_dot2_f32_bf16 v0, v1, v2, v3          ; encoding: [0x00,0x40,0x1a,0xcc,0x01,0x05,0x0e,0x1c]
+
+v_dot2_f32_bf16 v0, v1, v2, v3 neg_lo:[1,0,0] neg_hi:[1,0,1]
+// GFX11: v_dot2_f32_bf16 v0, v1, v2, v3 neg_lo:[1,0,0] neg_hi:[1,0,1] ; encoding: [0x00,0x45,0x1a,0xcc,0x01,0x05,0x0e,0x3c]
+
+//
+// FMA_MIX
+//
+v_fma_mix_f32 v0, v1, v2, v3
+// GFX11: v_fma_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x00,0x20,0xcc,0x01,0x05,0x0e,0x04]
+
+v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,0,1]
+// GFX11: v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x00,0x20,0x20,0xcc,0x01,0x05,0x0e,0x04]
+
+v_fma_mixlo_f16 v0, abs(v1), -v2, abs(v3)
+// GFX11: v_fma_mixlo_f16 v0, |v1|, -v2, |v3| ; encoding: [0x00,0x05,0x21,0xcc,0x01,0x05,0x0e,0x44]
+
+v_fma_mixhi_f16 v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp
+// GFX11: v_fma_mixhi_f16 v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp ; encoding: [0x00,0xc0,0x22,0xcc,0x01,0x05,0x0e,0x1c]

diff  --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_all.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_all.txt
index 3e61d3bb29a4f..875427aaaa9db 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_all.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_all.txt
@@ -14550,6 +14550,27 @@
 # GFX11: v_dot2acc_f32_f16 v5, v1, v2        ; encoding: [0x01,0x05,0x0a,0x04]
 0x01,0x05,0x0a,0x04
 
+# GFX11: v_dot4_i32_iu8 v3, v4, v5, v6                ; encoding: [0x03,0x40,0x16,0xcc,0x04,0x0b,0x1a,0x1c]
+0x03,0x40,0x16,0xcc,0x04,0x0b,0x1a,0x1c
+
+# GFX11: v_dot4_i32_iu8 v3, v4, v5, 15 neg_lo:[1,1,0] ; encoding: [0x03,0x40,0x16,0xcc,0x04,0x0b,0x3e,0x7a]
+0x03,0x40,0x16,0xcc,0x04,0x0b,0x3e,0x7a
+
+# GFX11: v_dot4_i32_iu8 v3, v4, v5, 15                ; encoding: [0x03,0x40,0x16,0xcc,0x04,0x0b,0x3e,0x1a]
+0x03,0x40,0x16,0xcc,0x04,0x0b,0x3e,0x1a
+
+# GFX11: v_dot4_u32_u8 v3, v4, v5, v6            ; encoding: [0x03,0x40,0x17,0xcc,0x04,0x0b,0x1a,0x1c]
+0x03,0x40,0x17,0xcc,0x04,0x0b,0x1a,0x1c
+
+# GFX11: v_dot8_i32_iu4 v3, v4, v5, 15 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x18,0xcc,0x04,0x0b,0x3e,0x3a]
+0x03,0x40,0x18,0xcc,0x04,0x0b,0x3e,0x3a
+
+# GFX11: v_dot8_i32_iu4 v3, v4, v5, v0                ; encoding: [0x03,0x40,0x18,0xcc,0x04,0x0b,0x02,0x1c]
+0x03,0x40,0x18,0xcc,0x04,0x0b,0x02,0x1c
+
+# GFX11: v_dot8_u32_u4 v0, v1, v2, v3            ; encoding: [0x00,0x40,0x19,0xcc,0x01,0x05,0x0e,0x1c]
+0x00,0x40,0x19,0xcc,0x01,0x05,0x0e,0x1c
+
 # GFX11: v_exp_f32_e32 v255, v1                  ; encoding: [0x01,0x4b,0xfe,0x7f]
 0x01,0x4b,0xfe,0x7f
 
@@ -23246,6 +23267,132 @@
 # GFX11: v_xor3_b32 v5, vcc_lo, v2, v3           ; encoding: [0x05,0x00,0x40,0xd6,0x6a,0x04,0x0e,0x04]
 0x05,0x00,0x40,0xd6,0x6a,0x04,0x0e,0x04
 
+# GFX11: v_pk_add_f16 v0, v1, v2                 ; encoding: [0x00,0x40,0x0f,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x0f,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_add_i16 v0, v1, v2                 ; encoding: [0x00,0x40,0x02,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x02,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_add_u16 v0, v1, v2                 ; encoding: [0x00,0x40,0x0a,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x0a,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_add_u16 v1, v2, v3 clamp           ; encoding: [0x01,0xc0,0x0a,0xcc,0x02,0x07,0x02,0x18]
+0x01,0xc0,0x0a,0xcc,0x02,0x07,0x02,0x18
+
+# GFX11: v_pk_add_u16 v1, v2, v3                 ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x18]
+0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x18
+
+# GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[0,1]    ; encoding: [0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x18]
+0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x18
+
+# GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1] ; encoding: [0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x10]
+0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x10
+
+# GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0] ; encoding: [0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x08]
+0x01,0x50,0x0a,0xcc,0x02,0x07,0x02,0x08
+
+# GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,0]    ; encoding: [0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x18]
+0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x18
+
+# GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x10]
+0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x10
+
+# GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0] ; encoding: [0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x08]
+0x01,0x48,0x0a,0xcc,0x02,0x07,0x02,0x08
+
+# GFX11: v_pk_add_u16 v1, v2, v3 op_sel:[1,1]    ; encoding: [0x01,0x58,0x0a,0xcc,0x02,0x07,0x02,0x18]
+0x01,0x58,0x0a,0xcc,0x02,0x07,0x02,0x18
+
+# GFX11: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,0] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x00]
+0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x00
+
+# GFX11: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,1] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x10]
+0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x10
+
+# GFX11: v_pk_add_u16 v1, v2, v3 op_sel_hi:[1,0] ; encoding: [0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x08]
+0x01,0x40,0x0a,0xcc,0x02,0x07,0x02,0x08
+
+# GFX11: v_pk_ashrrev_i16 v0, v1, v2             ; encoding: [0x00,0x40,0x06,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x06,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_fma_f16 v0, v1, v2, v3             ; encoding: [0x00,0x40,0x0e,0xcc,0x01,0x05,0x0e,0x1c]
+0x00,0x40,0x0e,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 clamp       ; encoding: [0x08,0xc0,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+0x08,0xc0,0x0e,0xcc,0x00,0x01,0x04,0x1c
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1             ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x1c
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+0x08,0x44,0x0e,0xcc,0x00,0x01,0x04,0x1c
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+0x08,0x42,0x0e,0xcc,0x00,0x01,0x04,0x1c
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+0x08,0x41,0x0e,0xcc,0x00,0x01,0x04,0x1c
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x0e,0xcc,0x00,0x01,0x04,0x1c]
+0x08,0x47,0x0e,0xcc,0x00,0x01,0x04,0x1c
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x9c]
+0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x9c
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x5c]
+0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x5c
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x3c]
+0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0x3c
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] ; encoding: [0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0xfc]
+0x08,0x40,0x0e,0xcc,0x00,0x01,0x04,0xfc
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x0e,0xcc,0x00,0x01,0x04,0xfc]
+0x08,0x47,0x0e,0xcc,0x00,0x01,0x04,0xfc
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x0e,0xcc,0x00,0x01,0x04,0x04]
+0x08,0x60,0x0e,0xcc,0x00,0x01,0x04,0x04
+
+# GFX11: v_pk_fma_f16 v8, v0, s0, v1 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x0e,0xcc,0x00,0x01,0x04,0x04]
+0x08,0x00,0x0e,0xcc,0x00,0x01,0x04,0x04
+
+# GFX11: v_pk_lshlrev_b16 v0, v1, v2             ; encoding: [0x00,0x40,0x04,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x04,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_lshrrev_b16 v0, v1, v2             ; encoding: [0x00,0x40,0x05,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x05,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_max_f16 v0, v1, v2                 ; encoding: [0x00,0x40,0x12,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x12,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_max_i16 v0, v1, v2                 ; encoding: [0x00,0x40,0x07,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x07,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_max_u16 v0, v1, v2                 ; encoding: [0x00,0x40,0x0c,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x0c,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_min_f16 v0, v1, v2                 ; encoding: [0x00,0x40,0x11,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x11,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_min_i16 v0, v1, v2 clamp           ; encoding: [0x00,0xc0,0x08,0xcc,0x01,0x05,0x02,0x18]
+0x00,0xc0,0x08,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_min_i16 v0, v1, v2                 ; encoding: [0x00,0x40,0x08,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x08,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_min_u16 v0, v1, v2                 ; encoding: [0x00,0x40,0x0d,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x0d,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_mul_f16 v0, v1, v2                 ; encoding: [0x00,0x40,0x10,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x10,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_mul_lo_u16 v0, v1, v2              ; encoding: [0x00,0x40,0x01,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x01,0xcc,0x01,0x05,0x02,0x18
+
+# GFX11: v_pk_sub_i16 v0, v1, v2                 ; encoding: [0x00,0x40,0x03,0xcc,0x01,0x05,0x02,0x18]
+0x00,0x40,0x03,0xcc,0x01,0x05,0x02,0x18
+
 # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0x00]
 # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0x00]
 0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0x00
@@ -24560,6 +24707,69 @@
 # GFX11: v_xnor_b32_e64_dpp v8, v5, v2 quad_perm:[1,0,2,3] row_mask:0x1 bank_mask:0x0 ; encoding: [0x08,0x00,0x1e,0xd5,0xfa,0x04,0x02,0x00,0x05,0xe1,0x00,0x10]
 0x08,0x00,0x1e,0xd5,0xfa,0x04,0x02,0x00,0x05,0xe1,0x00,0x10
 
+# GFX11: v_dot2_f32_f16 v0, v1, v2, v3           ; encoding: [0x00,0x40,0x13,0xcc,0x01,0x05,0x0e,0x1c]
+0x00,0x40,0x13,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX11: v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] ; encoding: [0x00,0x45,0x13,0xcc,0x01,0x05,0x0e,0x7c]
+0x00,0x45,0x13,0xcc,0x01,0x05,0x0e,0x7c
+
+# GFX11: v_dot2_f32_bf16 v0, v1, v2, v3          ; encoding: [0x00,0x40,0x1a,0xcc,0x01,0x05,0x0e,0x1c]
+0x00,0x40,0x1a,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX11: v_dot2_f32_bf16 v0, v1, v2, v3 neg_lo:[1,0,0] neg_hi:[1,0,1] ; encoding: [0x00,0x45,0x1a,0xcc,0x01,0x05,0x0e,0x3c]
+0x00,0x45,0x1a,0xcc,0x01,0x05,0x0e,0x3c
+
+# GFX11: v_fma_mix_f32 v0, v1, v2, v3            ; encoding: [0x00,0x00,0x20,0xcc,0x01,0x05,0x0e,0x04]
+0x00,0x00,0x20,0xcc,0x01,0x05,0x0e,0x04
+
+# GFX11: v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x00,0x20,0x20,0xcc,0x01,0x05,0x0e,0x04]
+0x00,0x20,0x20,0xcc,0x01,0x05,0x0e,0x04
+
+# GFX11: v_fma_mixhi_f16 v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp ; encoding: [0x00,0xc0,0x22,0xcc,0x01,0x05,0x0e,0x1c]
+0x00,0xc0,0x22,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX11: v_fma_mixlo_f16 v0, |v1|, -v2, |v3|     ; encoding: [0x00,0x05,0x21,0xcc,0x01,0x05,0x0e,0x44]
+0x00,0x05,0x21,0xcc,0x01,0x05,0x0e,0x44
+
+# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05]
+0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05
+
+# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe]
+0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe
+
+# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x0c,0xff]
+0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x0c,0xff
+
+# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x00,0x00]
+0xfa,0x04,0x0a,0x04,0x01,0xe4,0x00,0x00
+
+# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x04,0x00]
+0xfa,0x04,0x0a,0x04,0x01,0xe4,0x04,0x00
+
+# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x04,0x01,0x77,0x39,0x05]
+0xe9,0x04,0x0a,0x04,0x01,0x77,0x39,0x05
+
+# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0a,0x04,0x01,0x77,0x39,0x05]
+0xea,0x04,0x0a,0x04,0x01,0x77,0x39,0x05
+
+# GFX11: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 clamp dpp8:[2,2,2,2,4,4,4,4] fi:1 ; encoding: [0x00,0x80,0x20,0xcc,0xea,0x04,0x0e,0x04,0x01,0x92,0x44,0x92]
+0x00,0x80,0x20,0xcc,0xea,0x04,0x0e,0x04,0x01,0x92,0x44,0x92
+
+# GFX11: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 dpp8:[2,2,2,2,4,4,4,4] ; encoding: [0x00,0x00,0x20,0xcc,0xe9,0x04,0x0e,0x04,0x01,0x92,0x44,0x92]
+0x00,0x00,0x20,0xcc,0xe9,0x04,0x0e,0x04,0x01,0x92,0x44,0x92
+
+# GFX11: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 row_ror:7 row_mask:0xf bank_mask:0x1 bound_ctrl:1 ; encoding: [0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x09,0xf1]
+0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x09,0xf1
+
+# GFX11: v_fma_mixhi_f16_e64_dpp v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp quad_perm:[0,2,3,1] row_mask:0x0 bank_mask:0xf ; encoding: [0x00,0xc0,0x22,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x78,0x00,0x0f]
+0x00,0xc0,0x22,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x78,0x00,0x0f
+
+# GFX11: v_fma_mixlo_f16_e64_dpp v0, |v1|, -v2, |v3| dpp8:[2,2,2,2,4,4,4,4] ; encoding: [0x00,0x05,0x21,0xcc,0xe9,0x04,0x0e,0x44,0x01,0x92,0x44,0x92]
+0x00,0x05,0x21,0xcc,0xe9,0x04,0x0e,0x44,0x01,0x92,0x44,0x92
+
+# GFX11: v_fma_mixlo_f16_e64_dpp v0, |v1|, -v2, |v3| op_sel:[1,0,0] op_sel_hi:[1,0,0] dpp8:[2,2,2,2,4,4,4,4] ; encoding: [0x00,0x0d,0x21,0xcc,0xe9,0x04,0x0e,0x4c,0x01,0x92,0x44,0x92]
+0x00,0x0d,0x21,0xcc,0xe9,0x04,0x0e,0x4c,0x01,0x92,0x44,0x92
+
 # GFX11: v_permlane64_b32 v5, v1                 ; encoding: [0x01,0xcf,0x0a,0x7e]
 0x01,0xcf,0x0a,0x7e