[lldb] [clang] [compiler-rt] [flang] [lld] [llvm] [libcxx] [libc] [clang-tools-extra] AMDGPU/GFX12: Add new dot4 fp8/bf8 instructions (PR #77892)
Mariusz Sikora via cfe-commits
cfe-commits at lists.llvm.org
Thu Jan 18 02:07:20 PST 2024
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/77892
>From 628a3d2b42cdcbd903e0830ab7d631ea7dc422b9 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Wed, 10 Jan 2024 12:17:58 +0100
Subject: [PATCH 1/2] AMDGPU/GFX12: Add new dot4 fp8/bf8 instructions
Encoding is VOP3P. Tagged as deep/machine learning instructions.
i32 type (v4fp8 or v4bf8 packed in i32) is used for src0 and src1.
src0 and src1 have no src_modifiers. src2 is f32 and has src_modifiers:
f32 fneg(neg_lo[2]) and f32 fabs(neg_hi[2]).
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 +
.../builtins-amdgcn-dl-insts-err.cl | 5 +
.../builtins-amdgcn-dl-insts-gfx12.cl | 20 ++
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 19 ++
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 4 +
.../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 46 ++++
.../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 17 +-
llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 47 ++++
llvm/lib/Target/AMDGPU/VOPInstructions.td | 13 +-
.../CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll | 255 ++++++++++++++++++
llvm/test/MC/AMDGPU/gfx12_asm_vop3p.s | 120 +++++++++
llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s | 24 ++
.../MC/AMDGPU/gfx12_asm_vop3p_dpp16_err.s | 24 ++
llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s | 24 ++
.../test/MC/AMDGPU/gfx12_asm_vop3p_dpp8_err.s | 27 ++
llvm/test/MC/AMDGPU/gfx12_asm_vop3p_err.s | 133 +++++++++
.../Disassembler/AMDGPU/gfx12_dasm_vop3p.txt | 120 +++++++++
.../AMDGPU/gfx12_dasm_vop3p_dpp16.txt | 24 ++
.../AMDGPU/gfx12_dasm_vop3p_dpp8.txt | 24 ++
19 files changed, 938 insertions(+), 12 deletions(-)
create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll
create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16_err.s
create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8_err.s
create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_vop3p_err.s
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e562ef04a30194..1c1b9b2c9e9e8c 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -255,6 +255,10 @@ TARGET_BUILTIN(__builtin_amdgcn_sudot4, "iIbiIbiiIb", "nc", "dot8-insts")
TARGET_BUILTIN(__builtin_amdgcn_sdot8, "SiSiSiSiIb", "nc", "dot1-insts")
TARGET_BUILTIN(__builtin_amdgcn_udot8, "UiUiUiUiIb", "nc", "dot7-insts")
TARGET_BUILTIN(__builtin_amdgcn_sudot8, "iIbiIbiiIb", "nc", "dot8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_fp8_bf8, "fUiUif", "nc", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_bf8_fp8, "fUiUif", "nc", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_fp8_fp8, "fUiUif", "nc", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_bf8_bf8, "fUiUif", "nc", "gfx12-insts")
//===----------------------------------------------------------------------===//
// GFX10+ only builtins.
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
index 6573325150d958..1be47f71276208 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
@@ -49,4 +49,9 @@ kernel void builtins_amdgcn_dl_insts_err(
iOut[3] = __builtin_amdgcn_sudot8(false, A, true, B, C, false); // expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}}
iOut[4] = __builtin_amdgcn_sudot8(true, A, false, B, C, true); // expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}}
+
+ fOut[5] = __builtin_amdgcn_fdot4_f32_fp8_bf8(uiA, uiB, fC); // expected-error {{'__builtin_amdgcn_fdot4_f32_fp8_bf8' needs target feature gfx12-insts}}
+ fOut[6] = __builtin_amdgcn_fdot4_f32_bf8_fp8(uiA, uiB, fC); // expected-error {{'__builtin_amdgcn_fdot4_f32_bf8_fp8' needs target feature gfx12-insts}}
+ fOut[7] = __builtin_amdgcn_fdot4_f32_fp8_fp8(uiA, uiB, fC); // expected-error {{'__builtin_amdgcn_fdot4_f32_fp8_fp8' needs target feature gfx12-insts}}
+ fOut[8] = __builtin_amdgcn_fdot4_f32_bf8_bf8(uiA, uiB, fC); // expected-error {{'__builtin_amdgcn_fdot4_f32_bf8_bf8' needs target feature gfx12-insts}}
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl
new file mode 100644
index 00000000000000..31e10c0a5dc18c
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl
@@ -0,0 +1,20 @@
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s
+
+typedef unsigned int uint;
+
+// CHECK-LABEL: @builtins_amdgcn_dl_insts
+// CHECK: call float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %uiA, i32 %uiB, float %fC)
+// CHECK: call float @llvm.amdgcn.fdot4.f32.bf8.fp8(i32 %uiA, i32 %uiB, float %fC)
+// CHECK: call float @llvm.amdgcn.fdot4.f32.fp8.fp8(i32 %uiA, i32 %uiB, float %fC)
+// CHECK: call float @llvm.amdgcn.fdot4.f32.bf8.bf8(i32 %uiA, i32 %uiB, float %fC)
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+kernel void builtins_amdgcn_dl_insts_err(global float *fOut,
+ uint uiA, uint uiB, float fC) {
+ fOut[0] = __builtin_amdgcn_fdot4_f32_fp8_bf8(uiA, uiB, fC);
+ fOut[1] = __builtin_amdgcn_fdot4_f32_bf8_fp8(uiA, uiB, fC);
+ fOut[2] = __builtin_amdgcn_fdot4_f32_fp8_fp8(uiA, uiB, fC);
+ fOut[3] = __builtin_amdgcn_fdot4_f32_bf8_bf8(uiA, uiB, fC);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 255f5106e543fb..1ec695a8164d8b 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2696,6 +2696,25 @@ def int_amdgcn_udot8 :
ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
>;
+// f32 %r = llvm.amdgcn.fdot4.f32.type_a.type_b (v4type_a (as i32) %a, v4type_b (as i32) %b, f32 %c)
+// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
+class AMDGPU8bitFloatDot4Intrinsic :
+ ClangBuiltin<!subst("int", "__builtin", NAME)>,
+ DefaultAttrsIntrinsic<
+ [llvm_float_ty], // %r
+ [
+ llvm_i32_ty, // %a
+ llvm_i32_ty, // %b
+ llvm_float_ty, // %c
+ ],
+ [IntrNoMem, IntrSpeculatable]
+ >;
+
+def int_amdgcn_fdot4_f32_fp8_bf8 : AMDGPU8bitFloatDot4Intrinsic;
+def int_amdgcn_fdot4_f32_bf8_fp8 : AMDGPU8bitFloatDot4Intrinsic;
+def int_amdgcn_fdot4_f32_fp8_fp8 : AMDGPU8bitFloatDot4Intrinsic;
+def int_amdgcn_fdot4_f32_bf8_bf8 : AMDGPU8bitFloatDot4Intrinsic;
+
//===----------------------------------------------------------------------===//
// gfx908 intrinsics
// ===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index d42496ef09ee8f..0fbe9a2dff2aba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4471,6 +4471,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_fdot2_f32_bf16:
case Intrinsic::amdgcn_sudot4:
case Intrinsic::amdgcn_sudot8:
+ case Intrinsic::amdgcn_fdot4_f32_fp8_bf8:
+ case Intrinsic::amdgcn_fdot4_f32_bf8_fp8:
+ case Intrinsic::amdgcn_fdot4_f32_fp8_fp8:
+ case Intrinsic::amdgcn_fdot4_f32_bf8_bf8:
case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index ba79affe683d6f..bd68054589b112 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1688,6 +1688,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
bool validateMIMGD16(const MCInst &Inst);
bool validateMIMGMSAA(const MCInst &Inst);
bool validateOpSel(const MCInst &Inst);
+ bool validateNeg(const MCInst &Inst, int OpName);
bool validateDPP(const MCInst &Inst, const OperandVector &Operands);
bool validateVccOperand(unsigned Reg) const;
bool validateVOPLiteral(const MCInst &Inst, const OperandVector &Operands);
@@ -4357,6 +4358,41 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) {
return true;
}
+bool AMDGPUAsmParser::validateNeg(const MCInst &Inst, int OpName) {
+ assert(OpName == AMDGPU::OpName::neg_lo || OpName == AMDGPU::OpName::neg_hi);
+
+ const unsigned Opc = Inst.getOpcode();
+ uint64_t TSFlags = MII.get(Opc).TSFlags;
+
+ // v_dot4 fp8/bf8 neg_lo/neg_hi not allowed on src0 and src1 (allowed on src2)
+ if (!(TSFlags & SIInstrFlags::IsDOT))
+ return true;
+
+ int NegIdx = AMDGPU::getNamedOperandIdx(Opc, OpName);
+ if (NegIdx == -1)
+ return true;
+
+ unsigned Neg = Inst.getOperand(NegIdx).getImm();
+
+ // Some instructions have a neg_lo or neg_hi operand, but the neg modifier is
+ // allowed only on some of their src operands and not on the others.
+ // Conveniently, such instructions have no src_modifiers operand for the src
+ // operands that don't allow neg, because those also don't allow opsel.
+
+ int SrcMods[3] = {AMDGPU::OpName::src0_modifiers,
+ AMDGPU::OpName::src1_modifiers,
+ AMDGPU::OpName::src2_modifiers};
+
+ for (unsigned i = 0; i < 3; ++i) {
+ if (!AMDGPU::hasNamedOperand(Opc, SrcMods[i])) {
+ if (Neg & (1 << i))
+ return false;
+ }
+ }
+
+ return true;
+}
+
bool AMDGPUAsmParser::validateDPP(const MCInst &Inst,
const OperandVector &Operands) {
const unsigned Opc = Inst.getOpcode();
@@ -4834,6 +4870,16 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
"invalid op_sel operand");
return false;
}
+ if (!validateNeg(Inst, AMDGPU::OpName::neg_lo)) {
+ Error(getImmLoc(AMDGPUOperand::ImmTyNegLo, Operands),
+ "invalid neg_lo operand");
+ return false;
+ }
+ if (!validateNeg(Inst, AMDGPU::OpName::neg_hi)) {
+ Error(getImmLoc(AMDGPUOperand::ImmTyNegHi, Operands),
+ "invalid neg_hi operand");
+ return false;
+ }
if (!validateDPP(Inst, Operands)) {
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 6c7977e22599c6..e73e53aa270f91 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1260,14 +1260,19 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI,
int NumOps = 0;
int Ops[3];
- for (int OpName : { AMDGPU::OpName::src0_modifiers,
- AMDGPU::OpName::src1_modifiers,
- AMDGPU::OpName::src2_modifiers }) {
- int Idx = AMDGPU::getNamedOperandIdx(Opc, OpName);
- if (Idx == -1)
+ std::pair<int, int> MOps[] = {
+ {AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src0},
+ {AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src1},
+ {AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::src2}};
+ int DefaultValue = (Mod == SISrcMods::OP_SEL_1);
+
+ for (auto [SrcMod, Src] : MOps) {
+ if (!AMDGPU::hasNamedOperand(Opc, Src))
break;
- Ops[NumOps++] = MI->getOperand(Idx).getImm();
+ int ModIdx = AMDGPU::getNamedOperandIdx(Opc, SrcMod);
+ Ops[NumOps++] =
+ (ModIdx != -1) ? MI->getOperand(ModIdx).getImm() : DefaultValue;
}
const bool HasDstSel =
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 17921dcddc0e2a..faa00dff5ff00c 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -443,6 +443,48 @@ def : GCNPat < (int_amdgcn_sdot4 i32:$src0,
>;
} // End SubtargetPredicate = HasDot8Insts
+// Does not use opsel, no src_modifiers on src0 and src1.
+// src_modifiers on src2(f32) are f32 fneg(neg_lo[2]) and f32 fabs(neg_hi[2]).
+def VOP3P_DOTF8_Profile : VOP3P_Profile<VOPProfile <[f32, i32, i32, f32]>,
+ VOP3_PACKED, 1> {
+ let HasClamp = 0;
+ let HasOpSel = 0;
+ let HasOMod = 0;
+ let IsDOT = 1;
+ let HasSrc0Mods = 0;
+ let HasSrc1Mods = 0;
+ let HasSrc2Mods = 1;
+
+ let InsVOP3P = (ins VSrc_b32:$src0, VSrc_b32:$src1,
+ PackedF16InputMods:$src2_modifiers, VSrc_f32:$src2,
+ neg_lo0:$neg_lo, neg_hi0:$neg_hi);
+
+ let InsVOP3DPP8 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1,
+ PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2,
+ neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp8:$dpp8, FI:$fi);
+
+ let InsVOP3DPP16 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1,
+ PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2,
+ neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp_ctrl:$dpp_ctrl,
+ row_mask:$row_mask, bank_mask:$bank_mask,
+ bound_ctrl:$bound_ctrl, FI:$fi);
+}
+
+multiclass VOP3PDOTF8Inst <string OpName, SDPatternOperator intrinsic_node> {
+ defm NAME : VOP3PInst<OpName, VOP3P_DOTF8_Profile, null_frag, 1>;
+
+ let SubtargetPredicate = isGFX12Plus in
+ def : GCNPat <(intrinsic_node i32:$src0, i32:$src1,
+ (VOP3Mods f32:$src2, i32:$src2_modifiers)),
+ (!cast<Instruction>(NAME) i32:$src0, i32:$src1,
+ i32:$src2_modifiers, f32:$src2)>;
+}
+
+defm V_DOT4_F32_FP8_BF8 : VOP3PDOTF8Inst<"v_dot4_f32_fp8_bf8", int_amdgcn_fdot4_f32_fp8_bf8>;
+defm V_DOT4_F32_BF8_FP8 : VOP3PDOTF8Inst<"v_dot4_f32_bf8_fp8", int_amdgcn_fdot4_f32_bf8_fp8>;
+defm V_DOT4_F32_FP8_FP8 : VOP3PDOTF8Inst<"v_dot4_f32_fp8_fp8", int_amdgcn_fdot4_f32_fp8_fp8>;
+defm V_DOT4_F32_BF8_BF8 : VOP3PDOTF8Inst<"v_dot4_f32_bf8_bf8", int_amdgcn_fdot4_f32_bf8_bf8>;
+
def : UDot2Pat<V_DOT2_U32_U16>;
def : SDot2Pat<V_DOT2_I32_I16>;
@@ -1019,6 +1061,11 @@ defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_m
defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>;
defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>;
+defm V_DOT4_F32_FP8_BF8 : VOP3P_Realtriple<GFX12Gen, 0x24>;
+defm V_DOT4_F32_BF8_FP8 : VOP3P_Realtriple<GFX12Gen, 0x25>;
+defm V_DOT4_F32_FP8_FP8 : VOP3P_Realtriple<GFX12Gen, 0x26>;
+defm V_DOT4_F32_BF8_BF8 : VOP3P_Realtriple<GFX12Gen, 0x27>;
+
//===----------------------------------------------------------------------===//
// GFX11
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index c4b9e706309374..a10a2cfae018c3 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -169,6 +169,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> :
VOP3_Pseudo<opName, P, pattern, 1> {
let VOP3P = 1;
+ let IsDOT = P.IsDOT;
}
class VOP_Real<VOP_Pseudo ps> {
@@ -387,7 +388,7 @@ class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 {
let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1)
let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2)
- let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, ?); // op_sel_hi(2)
+ let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(2)
let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
@@ -396,8 +397,8 @@ class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 {
let Inst{40-32} = !if(P.HasSrc0, src0, 0);
let Inst{49-41} = !if(P.HasSrc1, src1, 0);
let Inst{58-50} = !if(P.HasSrc2, src2, 0);
- let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, ?); // op_sel_hi(0)
- let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, ?); // op_sel_hi(1)
+ let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(0)
+ let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(1)
let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo)
let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo)
let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo)
@@ -772,12 +773,12 @@ class VOP3P_DPPe_Common_Base<bits<7> op, VOPProfile P> : Enc96 {
let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0)
let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1)
let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2)
- let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, ?); // op_sel_hi(2)
+ let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(2)
let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
let Inst{22-16} = op;
let Inst{31-23} = 0x198; // encoding
- let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, ?); // op_sel_hi(0)
- let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, ?); // op_sel_hi(1)
+ let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(0)
+ let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(1)
let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo)
let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo)
let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll
new file mode 100644
index 00000000000000..97371f169c07ff
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll
@@ -0,0 +1,255 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+
+define float @test_amdgcn_fdot4_f32_fp8_bf8(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_bf8:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %ret = call float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %a, i32 %b, float %c)
+ ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_fp8_bf8_fabs(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_bf8_fabs:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %fabs.c = call float @llvm.fabs.f32(float %c)
+ %ret = call float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %a, i32 %b, float %fabs.c)
+ ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_fp8_bf8_fneg(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_bf8_fneg:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %fneg.c = fneg float %c
+ %ret = call float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %a, i32 %b, float %fneg.c)
+ ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_fp8_bf8_fabs_fneg(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_bf8_fabs_fneg:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %fneg.c = fneg float %c
+ %fabs.fneg.c = call float @llvm.fabs.f32(float %fneg.c)
+ %ret = call float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %a, i32 %b, float %fabs.fneg.c)
+ ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_fp8_bf8_fneg_fabs(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_bf8_fneg_fabs:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %fabs.c = call float @llvm.fabs.f32(float %c)
+ %fneg.fabs.c = fneg float %fabs.c
+ %ret = call float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %a, i32 %b, float %fneg.fabs.c)
+ ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_bf8_fp8(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_bf8_fp8:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %ret = call float @llvm.amdgcn.fdot4.f32.bf8.fp8(i32 %a, i32 %b, float %c)
+ ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_bf8_fp8_fabs(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_bf8_fp8_fabs:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %fabs.c = call float @llvm.fabs.f32(float %c)
+ %ret = call float @llvm.amdgcn.fdot4.f32.bf8.fp8(i32 %a, i32 %b, float %fabs.c)
+ ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_bf8_fp8_fneg(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_bf8_fp8_fneg:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %fneg.c = fneg float %c
+ %ret = call float @llvm.amdgcn.fdot4.f32.bf8.fp8(i32 %a, i32 %b, float %fneg.c)
+ ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_bf8_fp8_fabs_fneg(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_bf8_fp8_fabs_fneg:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %fneg.c = fneg float %c
+ %fabs.fneg.c = call float @llvm.fabs.f32(float %fneg.c)
+ %ret = call float @llvm.amdgcn.fdot4.f32.bf8.fp8(i32 %a, i32 %b, float %fabs.fneg.c)
+ ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_bf8_fp8_fneg_fabs(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_bf8_fp8_fneg_fabs:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %fabs.c = call float @llvm.fabs.f32(float %c)
+ %fneg.fabs.c = fneg float %fabs.c
+ %ret = call float @llvm.amdgcn.fdot4.f32.bf8.fp8(i32 %a, i32 %b, float %fneg.fabs.c)
+ ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_fp8_fp8(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_fp8:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %ret = call float @llvm.amdgcn.fdot4.f32.fp8.fp8(i32 %a, i32 %b, float %c)
+ ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_fp8_fp8_fabs(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_fp8_fabs:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %fabs.c = call float @llvm.fabs.f32(float %c)
+ %ret = call float @llvm.amdgcn.fdot4.f32.fp8.fp8(i32 %a, i32 %b, float %fabs.c)
+ ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_fp8_fp8_fneg(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_fp8_fneg:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %fneg.c = fneg float %c
+ %ret = call float @llvm.amdgcn.fdot4.f32.fp8.fp8(i32 %a, i32 %b, float %fneg.c)
+ ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_fp8_fp8_fabs_fneg(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_fp8_fabs_fneg:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %fneg.c = fneg float %c
+ %fabs.fneg.c = call float @llvm.fabs.f32(float %fneg.c)
+ %ret = call float @llvm.amdgcn.fdot4.f32.fp8.fp8(i32 %a, i32 %b, float %fabs.fneg.c)
+ ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_fp8_fp8_fneg_fabs(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_fp8_fneg_fabs:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %fabs.c = call float @llvm.fabs.f32(float %c)
+ %fneg.fabs.c = fneg float %fabs.c
+ %ret = call float @llvm.amdgcn.fdot4.f32.fp8.fp8(i32 %a, i32 %b, float %fneg.fabs.c)
+ ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_bf8_bf8(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_bf8_bf8:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %ret = call float @llvm.amdgcn.fdot4.f32.bf8.bf8(i32 %a, i32 %b, float %c)
+ ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_bf8_bf8_fabs(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_bf8_bf8_fabs:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %fabs.c = call float @llvm.fabs.f32(float %c)
+ %ret = call float @llvm.amdgcn.fdot4.f32.bf8.bf8(i32 %a, i32 %b, float %fabs.c)
+ ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_bf8_bf8_fneg(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_bf8_bf8_fneg:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %fneg.c = fneg float %c
+ %ret = call float @llvm.amdgcn.fdot4.f32.bf8.bf8(i32 %a, i32 %b, float %fneg.c)
+ ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_bf8_bf8_fabs_fneg(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_bf8_bf8_fabs_fneg:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %fneg.c = fneg float %c
+ %fabs.fneg.c = call float @llvm.fabs.f32(float %fneg.c)
+ %ret = call float @llvm.amdgcn.fdot4.f32.bf8.bf8(i32 %a, i32 %b, float %fabs.fneg.c)
+ ret float %ret
+}
+
+define float @test_amdgcn_fdot4_f32_bf8_bf8_fneg_fabs(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_fdot4_f32_bf8_bf8_fneg_fabs:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %fabs.c = call float @llvm.fabs.f32(float %c)
+ %fneg.fabs.c = fneg float %fabs.c
+ %ret = call float @llvm.amdgcn.fdot4.f32.bf8.bf8(i32 %a, i32 %b, float %fneg.fabs.c)
+ ret float %ret
+}
+
+declare float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %a, i32 %b, float %c)
+declare float @llvm.amdgcn.fdot4.f32.bf8.fp8(i32 %a, i32 %b, float %c)
+declare float @llvm.amdgcn.fdot4.f32.fp8.fp8(i32 %a, i32 %b, float %c)
+declare float @llvm.amdgcn.fdot4.f32.bf8.bf8(i32 %a, i32 %b, float %c)
+
+declare float @llvm.fabs.f32(float %a)
+
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p.s
index a8347fb7f08bd6..567fc61d0b932d 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p.s
@@ -1343,3 +1343,123 @@ v_pk_maximum_f16 v5, src_scc, vcc_lo op_sel:[1,0] op_sel_hi:[0,1] neg_lo:[0,0] n
v_pk_maximum_f16 v255, 0xfe0b, vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp
// GFX12: [0xff,0xd3,0x1e,0xcc,0xff,0xd6,0x00,0x68,0x0b,0xfe,0x00,0x00]
+
+v_dot4_f32_fp8_bf8 v0, v1, v2, v3
+// GFX12: v_dot4_f32_fp8_bf8 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x24,0xcc,0x01,0x05,0x0e,0x1c]
+
+v_dot4_f32_fp8_bf8 v0, v1, v2, v3 neg_lo:[0,0,1]
+// GFX12: v_dot4_f32_fp8_bf8 v0, v1, v2, v3 neg_lo:[0,0,1] ; encoding: [0x00,0x40,0x24,0xcc,0x01,0x05,0x0e,0x9c]
+
+v_dot4_f32_fp8_bf8 v0, v1, v2, v3 neg_hi:[0,0,1]
+// GFX12: v_dot4_f32_fp8_bf8 v0, v1, v2, v3 neg_hi:[0,0,1] ; encoding: [0x00,0x44,0x24,0xcc,0x01,0x05,0x0e,0x1c]
+
+v_dot4_f32_fp8_bf8 v0, s0, v2, v3
+// GFX12: v_dot4_f32_fp8_bf8 v0, s0, v2, v3 ; encoding: [0x00,0x40,0x24,0xcc,0x00,0x04,0x0e,0x1c]
+
+v_dot4_f32_fp8_bf8 v0, v1, s0, v3
+// GFX12: v_dot4_f32_fp8_bf8 v0, v1, s0, v3 ; encoding: [0x00,0x40,0x24,0xcc,0x01,0x01,0x0c,0x1c]
+
+v_dot4_f32_fp8_bf8 v0, v1, v2, s0
+// GFX12: v_dot4_f32_fp8_bf8 v0, v1, v2, s0 ; encoding: [0x00,0x40,0x24,0xcc,0x01,0x05,0x02,0x18]
+
+v_dot4_f32_fp8_bf8 v0, 1.0, v2, v3
+// GFX12: v_dot4_f32_fp8_bf8 v0, 1.0, v2, v3 ; encoding: [0x00,0x40,0x24,0xcc,0xf2,0x04,0x0e,0x1c]
+
+v_dot4_f32_fp8_bf8 v0, v1, 1.0, v3
+// GFX12: v_dot4_f32_fp8_bf8 v0, v1, 1.0, v3 ; encoding: [0x00,0x40,0x24,0xcc,0x01,0xe5,0x0d,0x1c]
+
+v_dot4_f32_fp8_bf8 v0, v1, v2, 1.0
+// GFX12: v_dot4_f32_fp8_bf8 v0, v1, v2, 1.0 ; encoding: [0x00,0x40,0x24,0xcc,0x01,0x05,0xca,0x1b]
+
+v_dot4_f32_fp8_bf8 v0, v1, v2, 1
+// GFX12: v_dot4_f32_fp8_bf8 v0, v1, v2, 1 ; encoding: [0x00,0x40,0x24,0xcc,0x01,0x05,0x06,0x1a]
+
+v_dot4_f32_bf8_fp8 v0, v1, v2, v3
+// GFX12: v_dot4_f32_bf8_fp8 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x25,0xcc,0x01,0x05,0x0e,0x1c]
+
+v_dot4_f32_bf8_fp8 v0, v1, v2, v3 neg_lo:[0,0,1]
+// GFX12: v_dot4_f32_bf8_fp8 v0, v1, v2, v3 neg_lo:[0,0,1] ; encoding: [0x00,0x40,0x25,0xcc,0x01,0x05,0x0e,0x9c]
+
+v_dot4_f32_bf8_fp8 v0, v1, v2, v3 neg_hi:[0,0,1]
+// GFX12: v_dot4_f32_bf8_fp8 v0, v1, v2, v3 neg_hi:[0,0,1] ; encoding: [0x00,0x44,0x25,0xcc,0x01,0x05,0x0e,0x1c]
+
+v_dot4_f32_bf8_fp8 v0, s0, v2, v3
+// GFX12: v_dot4_f32_bf8_fp8 v0, s0, v2, v3 ; encoding: [0x00,0x40,0x25,0xcc,0x00,0x04,0x0e,0x1c]
+
+v_dot4_f32_bf8_fp8 v0, v1, s0, v3
+// GFX12: v_dot4_f32_bf8_fp8 v0, v1, s0, v3 ; encoding: [0x00,0x40,0x25,0xcc,0x01,0x01,0x0c,0x1c]
+
+v_dot4_f32_bf8_fp8 v0, v1, v2, s0
+// GFX12: v_dot4_f32_bf8_fp8 v0, v1, v2, s0 ; encoding: [0x00,0x40,0x25,0xcc,0x01,0x05,0x02,0x18]
+
+v_dot4_f32_bf8_fp8 v0, 1.0, v2, v3
+// GFX12: v_dot4_f32_bf8_fp8 v0, 1.0, v2, v3 ; encoding: [0x00,0x40,0x25,0xcc,0xf2,0x04,0x0e,0x1c]
+
+v_dot4_f32_bf8_fp8 v0, v1, 1.0, v3
+// GFX12: v_dot4_f32_bf8_fp8 v0, v1, 1.0, v3 ; encoding: [0x00,0x40,0x25,0xcc,0x01,0xe5,0x0d,0x1c]
+
+v_dot4_f32_bf8_fp8 v0, v1, v2, 1.0
+// GFX12: v_dot4_f32_bf8_fp8 v0, v1, v2, 1.0 ; encoding: [0x00,0x40,0x25,0xcc,0x01,0x05,0xca,0x1b]
+
+v_dot4_f32_bf8_fp8 v0, v1, v2, 1
+// GFX12: v_dot4_f32_bf8_fp8 v0, v1, v2, 1 ; encoding: [0x00,0x40,0x25,0xcc,0x01,0x05,0x06,0x1a]
+
+v_dot4_f32_fp8_fp8 v0, v1, v2, v3
+// GFX12: v_dot4_f32_fp8_fp8 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x26,0xcc,0x01,0x05,0x0e,0x1c]
+
+v_dot4_f32_fp8_fp8 v0, v1, v2, v3 neg_lo:[0,0,1]
+// GFX12: v_dot4_f32_fp8_fp8 v0, v1, v2, v3 neg_lo:[0,0,1] ; encoding: [0x00,0x40,0x26,0xcc,0x01,0x05,0x0e,0x9c]
+
+v_dot4_f32_fp8_fp8 v0, v1, v2, v3 neg_hi:[0,0,1]
+// GFX12: v_dot4_f32_fp8_fp8 v0, v1, v2, v3 neg_hi:[0,0,1] ; encoding: [0x00,0x44,0x26,0xcc,0x01,0x05,0x0e,0x1c]
+
+v_dot4_f32_fp8_fp8 v0, s0, v2, v3
+// GFX12: v_dot4_f32_fp8_fp8 v0, s0, v2, v3 ; encoding: [0x00,0x40,0x26,0xcc,0x00,0x04,0x0e,0x1c]
+
+v_dot4_f32_fp8_fp8 v0, v1, s0, v3
+// GFX12: v_dot4_f32_fp8_fp8 v0, v1, s0, v3 ; encoding: [0x00,0x40,0x26,0xcc,0x01,0x01,0x0c,0x1c]
+
+v_dot4_f32_fp8_fp8 v0, v1, v2, s0
+// GFX12: v_dot4_f32_fp8_fp8 v0, v1, v2, s0 ; encoding: [0x00,0x40,0x26,0xcc,0x01,0x05,0x02,0x18]
+
+v_dot4_f32_fp8_fp8 v0, 1.0, v2, v3
+// GFX12: v_dot4_f32_fp8_fp8 v0, 1.0, v2, v3 ; encoding: [0x00,0x40,0x26,0xcc,0xf2,0x04,0x0e,0x1c]
+
+v_dot4_f32_fp8_fp8 v0, v1, 1.0, v3
+// GFX12: v_dot4_f32_fp8_fp8 v0, v1, 1.0, v3 ; encoding: [0x00,0x40,0x26,0xcc,0x01,0xe5,0x0d,0x1c]
+
+v_dot4_f32_fp8_fp8 v0, v1, v2, 1.0
+// GFX12: v_dot4_f32_fp8_fp8 v0, v1, v2, 1.0 ; encoding: [0x00,0x40,0x26,0xcc,0x01,0x05,0xca,0x1b]
+
+v_dot4_f32_fp8_fp8 v0, v1, v2, 1
+// GFX12: v_dot4_f32_fp8_fp8 v0, v1, v2, 1 ; encoding: [0x00,0x40,0x26,0xcc,0x01,0x05,0x06,0x1a]
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, v3
+// GFX12: v_dot4_f32_bf8_bf8 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x27,0xcc,0x01,0x05,0x0e,0x1c]
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, v3 neg_lo:[0,0,1]
+// GFX12: v_dot4_f32_bf8_bf8 v0, v1, v2, v3 neg_lo:[0,0,1] ; encoding: [0x00,0x40,0x27,0xcc,0x01,0x05,0x0e,0x9c]
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, v3 neg_hi:[0,0,1]
+// GFX12: v_dot4_f32_bf8_bf8 v0, v1, v2, v3 neg_hi:[0,0,1] ; encoding: [0x00,0x44,0x27,0xcc,0x01,0x05,0x0e,0x1c]
+
+v_dot4_f32_bf8_bf8 v0, s0, v2, v3
+// GFX12: v_dot4_f32_bf8_bf8 v0, s0, v2, v3 ; encoding: [0x00,0x40,0x27,0xcc,0x00,0x04,0x0e,0x1c]
+
+v_dot4_f32_bf8_bf8 v0, v1, s0, v3
+// GFX12: v_dot4_f32_bf8_bf8 v0, v1, s0, v3 ; encoding: [0x00,0x40,0x27,0xcc,0x01,0x01,0x0c,0x1c]
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, s0
+// GFX12: v_dot4_f32_bf8_bf8 v0, v1, v2, s0 ; encoding: [0x00,0x40,0x27,0xcc,0x01,0x05,0x02,0x18]
+
+v_dot4_f32_bf8_bf8 v0, 1.0, v2, v3
+// GFX12: v_dot4_f32_bf8_bf8 v0, 1.0, v2, v3 ; encoding: [0x00,0x40,0x27,0xcc,0xf2,0x04,0x0e,0x1c]
+
+v_dot4_f32_bf8_bf8 v0, v1, 1.0, v3
+// GFX12: v_dot4_f32_bf8_bf8 v0, v1, 1.0, v3 ; encoding: [0x00,0x40,0x27,0xcc,0x01,0xe5,0x0d,0x1c]
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, 1.0
+// GFX12: v_dot4_f32_bf8_bf8 v0, v1, v2, 1.0 ; encoding: [0x00,0x40,0x27,0xcc,0x01,0x05,0xca,0x1b]
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, 1
+// GFX12: v_dot4_f32_bf8_bf8 v0, v1, v2, 1 ; encoding: [0x00,0x40,0x27,0xcc,0x01,0x05,0x06,0x1a]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s
index 3b4bbff1efeb11..75bd1696e10bb7 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s
@@ -12,3 +12,27 @@ v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,0,0] row_ror:7 bank_mask:0x1 bound_ctrl:0
v_fma_mixhi_f16 v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp quad_perm:[0,2,3,1] row_mask:0x0
// GFX12: v_fma_mixhi_f16_e64_dpp v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp quad_perm:[0,2,3,1] row_mask:0x0 bank_mask:0xf ; encoding: [0x00,0xc0,0x22,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x78,0x00,0x0f]
+
+v_dot4_f32_fp8_bf8 v0, v1, v2, v3 quad_perm:[3,2,1,0]
+// GFX12: v_dot4_f32_fp8_bf8_e64_dpp v0, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x40,0x24,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x1b,0x00,0xff]
+
+v_dot4_f32_fp8_bf8 v0, v1, v2, v3 row_shr:15 row_mask:0x1 bank_mask:0x1 bound_ctrl:1 fi:1
+// GFX12: v_dot4_f32_fp8_bf8_e64_dpp v0, v1, v2, v3 row_shr:15 row_mask:0x1 bank_mask:0x1 bound_ctrl:1 fi:1 ; encoding: [0x00,0x40,0x24,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x1f,0x0d,0x11]
+
+v_dot4_f32_bf8_fp8 v0, v1, v2, v3 row_shl:15
+// GFX12: v_dot4_f32_bf8_fp8_e64_dpp v0, v1, v2, v3 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x40,0x25,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x0f,0x01,0xff]
+
+v_dot4_f32_bf8_fp8 v0, v1, v2, v3 row_ror:15 row_mask:0x1 bank_mask:0x1 bound_ctrl:1 fi:1
+// GFX12: v_dot4_f32_bf8_fp8_e64_dpp v0, v1, v2, v3 row_ror:15 row_mask:0x1 bank_mask:0x1 bound_ctrl:1 fi:1 ; encoding: [0x00,0x40,0x25,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x2f,0x0d,0x11]
+
+v_dot4_f32_fp8_fp8 v0, v1, v2, v3 row_mirror
+// GFX12: v_dot4_f32_fp8_fp8_e64_dpp v0, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x40,0x26,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x40,0x01,0xff]
+
+v_dot4_f32_fp8_fp8 v0, v1, v2, v3 row_half_mirror row_mask:0x1 bank_mask:0x1 bound_ctrl:1 fi:1
+// GFX12: v_dot4_f32_fp8_fp8_e64_dpp v0, v1, v2, v3 row_half_mirror row_mask:0x1 bank_mask:0x1 bound_ctrl:1 fi:1 ; encoding: [0x00,0x40,0x26,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x41,0x0d,0x11]
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, v3 row_share:15
+// GFX12: v_dot4_f32_bf8_bf8_e64_dpp v0, v1, v2, v3 row_share:15 row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x40,0x27,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x5f,0x01,0xff]
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, v3 row_xmask:15 row_mask:0x1 bank_mask:0x1 bound_ctrl:1 fi:1
+// GFX12: v_dot4_f32_bf8_bf8_e64_dpp v0, v1, v2, v3 row_xmask:15 row_mask:0x1 bank_mask:0x1 bound_ctrl:1 fi:1 ; encoding: [0x00,0x40,0x27,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x6f,0x0d,0x11]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16_err.s
new file mode 100644
index 00000000000000..b76754123207c0
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16_err.s
@@ -0,0 +1,24 @@
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefix=GFX12 --implicit-check-not=error: %s
+
+// check for error with sgpr or imm operands
+
+v_dot4_f32_fp8_bf8 v0, s0, v2, v3 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_dot4_f32_fp8_bf8 v0, v1, s0, v3 row_shr:15 row_mask:0x1 bank_mask:0x1 bound_ctrl:1 fi:1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_dot4_f32_bf8_fp8 v0, v1, v2, s0 row_shl:15
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_dot4_f32_bf8_fp8 v0, 1.0, v2, v3 row_ror:15 row_mask:0x1 bank_mask:0x1 bound_ctrl:1 fi:1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_dot4_f32_fp8_fp8 v0, v1, 1.0, v3 row_mirror
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_dot4_f32_fp8_fp8 v0, v1, v2, 1.0 row_half_mirror row_mask:0x1 bank_mask:0x1 bound_ctrl:1 fi:1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, 1 row_share:15
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s
index 71eea18bc13dc1..14cf169d4b4249 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s
@@ -16,3 +16,27 @@ v_fma_mixlo_f16 v0, abs(v1), -v2, abs(v3) op_sel:[1,0,0] op_sel_hi:[1,0,0] dpp8:
v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0]
// GFX12: encoding: [0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05]
+
+v_dot4_f32_fp8_bf8 v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7]
+// GFX12: v_dot4_f32_fp8_bf8_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x00,0x40,0x24,0xcc,0xe9,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa]
+
+v_dot4_f32_fp8_bf8 v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] fi:1
+// GFX12: v_dot4_f32_fp8_bf8_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] fi:1 ; encoding: [0x00,0x40,0x24,0xcc,0xea,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa]
+
+v_dot4_f32_bf8_fp8 v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7]
+// GFX12: v_dot4_f32_bf8_fp8_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x00,0x40,0x25,0xcc,0xe9,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa]
+
+v_dot4_f32_bf8_fp8 v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] fi:1
+// GFX12: v_dot4_f32_bf8_fp8_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] fi:1 ; encoding: [0x00,0x40,0x25,0xcc,0xea,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa]
+
+v_dot4_f32_fp8_fp8 v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7]
+// GFX12: v_dot4_f32_fp8_fp8_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x00,0x40,0x26,0xcc,0xe9,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa]
+
+v_dot4_f32_fp8_fp8 v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] fi:1
+// GFX12: v_dot4_f32_fp8_fp8_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] fi:1 ; encoding: [0x00,0x40,0x26,0xcc,0xea,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa]
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7]
+// GFX12: v_dot4_f32_bf8_bf8_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x00,0x40,0x27,0xcc,0xe9,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa]
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] fi:1
+// GFX12: v_dot4_f32_bf8_bf8_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] fi:1 ; encoding: [0x00,0x40,0x27,0xcc,0xea,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8_err.s
new file mode 100644
index 00000000000000..50d3b6aca41bc7
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8_err.s
@@ -0,0 +1,27 @@
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefix=GFX12 --implicit-check-not=error: %s
+
+// check for error with sgpr or imm operands
+
+v_dot4_f32_fp8_bf8 v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] row_mask:0x1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_dot4_f32_fp8_bf8 v0, s0, v2, v3 dpp8:[0,1,2,3,4,5,6,7]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_dot4_f32_bf8_fp8 v0, v1, s0, v3 dpp8:[0,1,2,3,4,5,6,7]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_dot4_f32_bf8_fp8 v0, v1, v2, s0 dpp8:[0,1,2,3,4,5,6,7]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_dot4_f32_fp8_fp8 v0, 1.0, v2, v3 dpp8:[0,1,2,3,4,5,6,7]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_dot4_f32_fp8_fp8 v0, v1, 1.0, v3 dpp8:[0,1,2,3,4,5,6,7]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, 1.0 dpp8:[0,1,2,3,4,5,6,7]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, 1 dpp8:[0,1,2,3,4,5,6,7]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_err.s
new file mode 100644
index 00000000000000..269989a26e287f
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_err.s
@@ -0,0 +1,133 @@
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefix=GFX12 --implicit-check-not=error: %s
+
+v_dot4_f32_fp8_bf8 v0, v1, v2, v3 clamp
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_dot4_f32_fp8_bf8 v0, v1, v2, v3 op_sel:[1,0,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_fp8_bf8 v0, v1, v2, v3 op_sel:[0,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_fp8_bf8 v0, v1, v2, v3 op_sel:[0,0,1]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_fp8_bf8 v0, v1, v2, v3 op_sel_hi:[0,1,1]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_fp8_bf8 v0, v1, v2, v3 op_sel_hi:[1,0,1]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_fp8_bf8 v0, v1, v2, v3 op_sel_hi:[1,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_fp8_bf8 v0, v1, v2, v3 neg_lo:[1,0,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_dot4_f32_fp8_bf8 v0, v1, v2, v3 neg_lo:[0,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_dot4_f32_fp8_bf8 v0, v1, v2, v3 neg_hi:[1,0,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_dot4_f32_fp8_bf8 v0, v1, v2, v3 neg_hi:[0,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_dot4_f32_bf8_fp8 v0, v1, v2, v3 clamp
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_dot4_f32_bf8_fp8 v0, v1, v2, v3 op_sel:[1,0,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_bf8_fp8 v0, v1, v2, v3 op_sel:[0,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_bf8_fp8 v0, v1, v2, v3 op_sel:[0,0,1]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_bf8_fp8 v0, v1, v2, v3 op_sel_hi:[0,1,1]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_bf8_fp8 v0, v1, v2, v3 op_sel_hi:[1,0,1]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_bf8_fp8 v0, v1, v2, v3 op_sel_hi:[1,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_bf8_fp8 v0, v1, v2, v3 neg_lo:[1,0,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_dot4_f32_bf8_fp8 v0, v1, v2, v3 neg_lo:[0,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_dot4_f32_bf8_fp8 v0, v1, v2, v3 neg_hi:[1,0,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_dot4_f32_bf8_fp8 v0, v1, v2, v3 neg_hi:[0,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_dot4_f32_fp8_fp8 v0, v1, v2, v3 clamp
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_dot4_f32_fp8_fp8 v0, v1, v2, v3 op_sel:[1,0,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_fp8_fp8 v0, v1, v2, v3 op_sel:[0,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_fp8_fp8 v0, v1, v2, v3 op_sel:[0,0,1]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_fp8_fp8 v0, v1, v2, v3 op_sel_hi:[0,1,1]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_fp8_fp8 v0, v1, v2, v3 op_sel_hi:[1,0,1]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_fp8_fp8 v0, v1, v2, v3 op_sel_hi:[1,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_fp8_fp8 v0, v1, v2, v3 neg_lo:[1,0,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_dot4_f32_fp8_fp8 v0, v1, v2, v3 neg_lo:[0,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_dot4_f32_fp8_fp8 v0, v1, v2, v3 neg_hi:[1,0,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_dot4_f32_fp8_fp8 v0, v1, v2, v3 neg_hi:[0,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, v3 clamp
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, v3 op_sel:[1,0,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, v3 op_sel:[0,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, v3 op_sel:[0,0,1]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, v3 op_sel_hi:[0,1,1]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, v3 op_sel_hi:[1,0,1]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, v3 op_sel_hi:[1,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, v3 neg_lo:[1,0,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, v3 neg_lo:[0,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, v3 neg_hi:[1,0,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_dot4_f32_bf8_bf8 v0, v1, v2, v3 neg_hi:[0,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p.txt
index 44d8995c5c4361..0f6debcbd9e304 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p.txt
@@ -1341,3 +1341,123 @@
# GFX12: v_pk_minimum_f16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x40,0x1d,0xcc,0x6a,0xf6,0x00,0x18]
0x05,0x40,0x1d,0xcc,0x6a,0xf6,0x00,0x18
+
+# GFX12: v_dot4_f32_fp8_bf8 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x24,0xcc,0x01,0x05,0x0e,0x1c]
+0x00,0x40,0x24,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX12: v_dot4_f32_fp8_bf8 v0, v1, v2, v3 neg_lo:[0,0,1] ; encoding: [0x00,0x40,0x24,0xcc,0x01,0x05,0x0e,0x9c]
+0x00,0x40,0x24,0xcc,0x01,0x05,0x0e,0x9c
+
+# GFX12: v_dot4_f32_fp8_bf8 v0, v1, v2, v3 neg_hi:[0,0,1] ; encoding: [0x00,0x44,0x24,0xcc,0x01,0x05,0x0e,0x1c]
+0x00,0x44,0x24,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX12: v_dot4_f32_fp8_bf8 v0, s1, v2, v3 ; encoding: [0x00,0x40,0x24,0xcc,0x01,0x04,0x0e,0x1c]
+0x00,0x40,0x24,0xcc,0x01,0x04,0x0e,0x1c
+
+# GFX12: v_dot4_f32_fp8_bf8 v0, v1, s2, v3 ; encoding: [0x00,0x40,0x24,0xcc,0x01,0x05,0x0c,0x1c]
+0x00,0x40,0x24,0xcc,0x01,0x05,0x0c,0x1c
+
+# GFX12: v_dot4_f32_fp8_bf8 v0, v1, v2, s3 ; encoding: [0x00,0x40,0x24,0xcc,0x01,0x05,0x0e,0x18]
+0x00,0x40,0x24,0xcc,0x01,0x05,0x0e,0x18
+
+# GFX12: v_dot4_f32_fp8_bf8 v0, 1.0, v2, v3 ; encoding: [0x00,0x40,0x24,0xcc,0xf2,0x04,0x0e,0x1c]
+0x00,0x40,0x24,0xcc,0xf2,0x04,0x0e,0x1c
+
+# GFX12: v_dot4_f32_fp8_bf8 v0, v1, 1.0, v3 ; encoding: [0x00,0x40,0x24,0xcc,0x01,0xe5,0x0d,0x1c]
+0x00,0x40,0x24,0xcc,0x01,0xe5,0x0d,0x1c
+
+# GFX12: v_dot4_f32_fp8_bf8 v0, v1, v2, 1.0 ; encoding: [0x00,0x40,0x24,0xcc,0x01,0x05,0xca,0x1b]
+0x00,0x40,0x24,0xcc,0x01,0x05,0xca,0x1b
+
+# GFX12: v_dot4_f32_fp8_bf8 v0, v1, v2, 1 ; encoding: [0x00,0x40,0x24,0xcc,0x01,0x05,0x06,0x1a]
+0x00,0x40,0x24,0xcc,0x01,0x05,0x06,0x1a
+
+# GFX12: v_dot4_f32_bf8_fp8 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x25,0xcc,0x01,0x05,0x0e,0x1c]
+0x00,0x40,0x25,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX12: v_dot4_f32_bf8_fp8 v0, v1, v2, v3 neg_lo:[0,0,1] ; encoding: [0x00,0x40,0x25,0xcc,0x01,0x05,0x0e,0x9c]
+0x00,0x40,0x25,0xcc,0x01,0x05,0x0e,0x9c
+
+# GFX12: v_dot4_f32_bf8_fp8 v0, v1, v2, v3 neg_hi:[0,0,1] ; encoding: [0x00,0x44,0x25,0xcc,0x01,0x05,0x0e,0x1c]
+0x00,0x44,0x25,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX12: v_dot4_f32_bf8_fp8 v0, s1, v2, v3 ; encoding: [0x00,0x40,0x25,0xcc,0x01,0x04,0x0e,0x1c]
+0x00,0x40,0x25,0xcc,0x01,0x04,0x0e,0x1c
+
+# GFX12: v_dot4_f32_bf8_fp8 v0, v1, s2, v3 ; encoding: [0x00,0x40,0x25,0xcc,0x01,0x05,0x0c,0x1c]
+0x00,0x40,0x25,0xcc,0x01,0x05,0x0c,0x1c
+
+# GFX12: v_dot4_f32_bf8_fp8 v0, v1, v2, s3 ; encoding: [0x00,0x40,0x25,0xcc,0x01,0x05,0x0e,0x18]
+0x00,0x40,0x25,0xcc,0x01,0x05,0x0e,0x18
+
+# GFX12: v_dot4_f32_bf8_fp8 v0, 1.0, v2, v3 ; encoding: [0x00,0x40,0x25,0xcc,0xf2,0x04,0x0e,0x1c]
+0x00,0x40,0x25,0xcc,0xf2,0x04,0x0e,0x1c
+
+# GFX12: v_dot4_f32_bf8_fp8 v0, v1, 1.0, v3 ; encoding: [0x00,0x40,0x25,0xcc,0x01,0xe5,0x0d,0x1c]
+0x00,0x40,0x25,0xcc,0x01,0xe5,0x0d,0x1c
+
+# GFX12: v_dot4_f32_bf8_fp8 v0, v1, v2, 1.0 ; encoding: [0x00,0x40,0x25,0xcc,0x01,0x05,0xca,0x1b]
+0x00,0x40,0x25,0xcc,0x01,0x05,0xca,0x1b
+
+# GFX12: v_dot4_f32_bf8_fp8 v0, v1, v2, 1 ; encoding: [0x00,0x40,0x25,0xcc,0x01,0x05,0x06,0x1a]
+0x00,0x40,0x25,0xcc,0x01,0x05,0x06,0x1a
+
+# GFX12: v_dot4_f32_fp8_fp8 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x26,0xcc,0x01,0x05,0x0e,0x1c]
+0x00,0x40,0x26,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX12: v_dot4_f32_fp8_fp8 v0, v1, v2, v3 neg_lo:[0,0,1] ; encoding: [0x00,0x40,0x26,0xcc,0x01,0x05,0x0e,0x9c]
+0x00,0x40,0x26,0xcc,0x01,0x05,0x0e,0x9c
+
+# GFX12: v_dot4_f32_fp8_fp8 v0, v1, v2, v3 neg_hi:[0,0,1] ; encoding: [0x00,0x44,0x26,0xcc,0x01,0x05,0x0e,0x1c]
+0x00,0x44,0x26,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX12: v_dot4_f32_fp8_fp8 v0, s1, v2, v3 ; encoding: [0x00,0x40,0x26,0xcc,0x01,0x04,0x0e,0x1c]
+0x00,0x40,0x26,0xcc,0x01,0x04,0x0e,0x1c
+
+# GFX12: v_dot4_f32_fp8_fp8 v0, v1, s2, v3 ; encoding: [0x00,0x40,0x26,0xcc,0x01,0x05,0x0c,0x1c]
+0x00,0x40,0x26,0xcc,0x01,0x05,0x0c,0x1c
+
+# GFX12: v_dot4_f32_fp8_fp8 v0, v1, v2, s3 ; encoding: [0x00,0x40,0x26,0xcc,0x01,0x05,0x0e,0x18]
+0x00,0x40,0x26,0xcc,0x01,0x05,0x0e,0x18
+
+# GFX12: v_dot4_f32_fp8_fp8 v0, 1.0, v2, v3 ; encoding: [0x00,0x40,0x26,0xcc,0xf2,0x04,0x0e,0x1c]
+0x00,0x40,0x26,0xcc,0xf2,0x04,0x0e,0x1c
+
+# GFX12: v_dot4_f32_fp8_fp8 v0, v1, 1.0, v3 ; encoding: [0x00,0x40,0x26,0xcc,0x01,0xe5,0x0d,0x1c]
+0x00,0x40,0x26,0xcc,0x01,0xe5,0x0d,0x1c
+
+# GFX12: v_dot4_f32_fp8_fp8 v0, v1, v2, 1.0 ; encoding: [0x00,0x40,0x26,0xcc,0x01,0x05,0xca,0x1b]
+0x00,0x40,0x26,0xcc,0x01,0x05,0xca,0x1b
+
+# GFX12: v_dot4_f32_fp8_fp8 v0, v1, v2, 1 ; encoding: [0x00,0x40,0x26,0xcc,0x01,0x05,0x06,0x1a]
+0x00,0x40,0x26,0xcc,0x01,0x05,0x06,0x1a
+
+# GFX12: v_dot4_f32_bf8_bf8 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x27,0xcc,0x01,0x05,0x0e,0x1c]
+0x00,0x40,0x27,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX12: v_dot4_f32_bf8_bf8 v0, v1, v2, v3 neg_lo:[0,0,1] ; encoding: [0x00,0x40,0x27,0xcc,0x01,0x05,0x0e,0x9c]
+0x00,0x40,0x27,0xcc,0x01,0x05,0x0e,0x9c
+
+# GFX12: v_dot4_f32_bf8_bf8 v0, v1, v2, v3 neg_hi:[0,0,1] ; encoding: [0x00,0x44,0x27,0xcc,0x01,0x05,0x0e,0x1c]
+0x00,0x44,0x27,0xcc,0x01,0x05,0x0e,0x1c
+
+# GFX12: v_dot4_f32_bf8_bf8 v0, s1, v2, v3 ; encoding: [0x00,0x40,0x27,0xcc,0x01,0x04,0x0e,0x1c]
+0x00,0x40,0x27,0xcc,0x01,0x04,0x0e,0x1c
+
+# GFX12: v_dot4_f32_bf8_bf8 v0, v1, s2, v3 ; encoding: [0x00,0x40,0x27,0xcc,0x01,0x05,0x0c,0x1c]
+0x00,0x40,0x27,0xcc,0x01,0x05,0x0c,0x1c
+
+# GFX12: v_dot4_f32_bf8_bf8 v0, v1, v2, s3 ; encoding: [0x00,0x40,0x27,0xcc,0x01,0x05,0x0e,0x18]
+0x00,0x40,0x27,0xcc,0x01,0x05,0x0e,0x18
+
+# GFX12: v_dot4_f32_bf8_bf8 v0, 1.0, v2, v3 ; encoding: [0x00,0x40,0x27,0xcc,0xf2,0x04,0x0e,0x1c]
+0x00,0x40,0x27,0xcc,0xf2,0x04,0x0e,0x1c
+
+# GFX12: v_dot4_f32_bf8_bf8 v0, v1, 1.0, v3 ; encoding: [0x00,0x40,0x27,0xcc,0x01,0xe5,0x0d,0x1c]
+0x00,0x40,0x27,0xcc,0x01,0xe5,0x0d,0x1c
+
+# GFX12: v_dot4_f32_bf8_bf8 v0, v1, v2, 1.0 ; encoding: [0x00,0x40,0x27,0xcc,0x01,0x05,0xca,0x1b]
+0x00,0x40,0x27,0xcc,0x01,0x05,0xca,0x1b
+
+# GFX12: v_dot4_f32_bf8_bf8 v0, v1, v2, 1 ; encoding: [0x00,0x40,0x27,0xcc,0x01,0x05,0x06,0x1a]
+0x00,0x40,0x27,0xcc,0x01,0x05,0x06,0x1a
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp16.txt
index 2b902878b87f66..52fd0530681cf1 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp16.txt
@@ -12,3 +12,27 @@
# GFX12: v_fma_mixhi_f16_e64_dpp v0, v1, v2, v3 op_sel_hi:[1,1,1] clamp quad_perm:[0,2,3,1] row_mask:0x0 bank_mask:0xf ; encoding: [0x00,0xc0,0x22,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x78,0x00,0x0f]
0x00,0xc0,0x22,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x78,0x00,0x0f
+
+# GFX12: v_dot4_f32_fp8_bf8_e64_dpp v0, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x40,0x24,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x1b,0x00,0xff]
+0x00,0x40,0x24,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x1b,0x00,0xff
+
+# GFX12: v_dot4_f32_fp8_bf8_e64_dpp v0, v1, v2, v3 row_shr:15 row_mask:0x1 bank_mask:0x1 bound_ctrl:1 fi:1 ; encoding: [0x00,0x40,0x24,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x1f,0x0d,0x11]
+0x00,0x40,0x24,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x1f,0x0d,0x11
+
+# GFX12: v_dot4_f32_bf8_fp8_e64_dpp v0, v1, v2, v3 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x40,0x25,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x0f,0x01,0xff]
+0x00,0x40,0x25,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x0f,0x01,0xff
+
+# GFX12: v_dot4_f32_bf8_fp8_e64_dpp v0, v1, v2, v3 row_ror:15 row_mask:0x1 bank_mask:0x1 bound_ctrl:1 fi:1 ; encoding: [0x00,0x40,0x25,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x2f,0x0d,0x11]
+0x00,0x40,0x25,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x2f,0x0d,0x11
+
+# GFX12: v_dot4_f32_fp8_fp8_e64_dpp v0, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x40,0x26,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x40,0x01,0xff]
+0x00,0x40,0x26,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x40,0x01,0xff
+
+# GFX12: v_dot4_f32_fp8_fp8_e64_dpp v0, v1, v2, v3 row_half_mirror row_mask:0x1 bank_mask:0x1 bound_ctrl:1 fi:1 ; encoding: [0x00,0x40,0x26,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x41,0x0d,0x11]
+0x00,0x40,0x26,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x41,0x0d,0x11
+
+# GFX12: v_dot4_f32_bf8_bf8_e64_dpp v0, v1, v2, v3 row_share:15 row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x40,0x27,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x5f,0x01,0xff]
+0x00,0x40,0x27,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x5f,0x01,0xff
+
+# GFX12: v_dot4_f32_bf8_bf8_e64_dpp v0, v1, v2, v3 row_xmask:15 row_mask:0x1 bank_mask:0x1 bound_ctrl:1 fi:1 ; encoding: [0x00,0x40,0x27,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x6f,0x0d,0x11]
+0x00,0x40,0x27,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x6f,0x0d,0x11
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp8.txt
index 474db1bdcc0e20..688212e51c427d 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp8.txt
@@ -15,3 +15,27 @@
# GFX12: v_fma_mixlo_f16_e64_dpp v0, |v1|, -v2, |v3| op_sel:[1,0,0] op_sel_hi:[1,0,0] dpp8:[2,2,2,2,4,4,4,4] ; encoding: [0x00,0x0d,0x21,0xcc,0xe9,0x04,0x0e,0x4c,0x01,0x92,0x44,0x92]
0x00,0x0d,0x21,0xcc,0xe9,0x04,0x0e,0x4c,0x01,0x92,0x44,0x92
+
+# GFX12: v_dot4_f32_fp8_bf8_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x00,0x40,0x24,0xcc,0xe9,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa]
+0x00,0x40,0x24,0xcc,0xe9,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa
+
+# GFX12: v_dot4_f32_fp8_bf8_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] fi:1 ; encoding: [0x00,0x40,0x24,0xcc,0xea,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa]
+0x00,0x40,0x24,0xcc,0xea,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa
+
+# GFX12: v_dot4_f32_bf8_fp8_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x00,0x40,0x25,0xcc,0xe9,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa]
+0x00,0x40,0x25,0xcc,0xe9,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa
+
+# GFX12: v_dot4_f32_bf8_fp8_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] fi:1 ; encoding: [0x00,0x40,0x25,0xcc,0xea,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa]
+0x00,0x40,0x25,0xcc,0xea,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa
+
+# GFX12: v_dot4_f32_fp8_fp8_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x00,0x40,0x26,0xcc,0xe9,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa]
+0x00,0x40,0x26,0xcc,0xe9,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa
+
+# GFX12: v_dot4_f32_fp8_fp8_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] fi:1 ; encoding: [0x00,0x40,0x26,0xcc,0xea,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa]
+0x00,0x40,0x26,0xcc,0xea,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa
+
+# GFX12: v_dot4_f32_bf8_bf8_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x00,0x40,0x27,0xcc,0xe9,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa]
+0x00,0x40,0x27,0xcc,0xe9,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa
+
+# GFX12: v_dot4_f32_bf8_bf8_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] fi:1 ; encoding: [0x00,0x40,0x27,0xcc,0xea,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa]
+0x00,0x40,0x27,0xcc,0xea,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa
>From 62d65a1e619c1327b78433f6a2bf5fd025157277 Mon Sep 17 00:00:00 2001
From: Mariusz Sikora <mariusz.sikora at amd.com>
Date: Tue, 16 Jan 2024 09:05:41 +0100
Subject: [PATCH 2/2] Rename fdot4 to dot4
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 8 +-
.../builtins-amdgcn-dl-insts-err.cl | 8 +-
.../builtins-amdgcn-dl-insts-gfx12.cl | 16 +--
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +-
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 8 +-
llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 8 +-
...n.fdot4.f32.ll => llvm.amdgcn.dot4.f32.ll} | 128 +++++++++---------
7 files changed, 92 insertions(+), 92 deletions(-)
rename llvm/test/CodeGen/AMDGPU/{llvm.amdgcn.fdot4.f32.ll => llvm.amdgcn.dot4.f32.ll} (55%)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 1c1b9b2c9e9e8c..60036ca089726b 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -255,10 +255,10 @@ TARGET_BUILTIN(__builtin_amdgcn_sudot4, "iIbiIbiiIb", "nc", "dot8-insts")
TARGET_BUILTIN(__builtin_amdgcn_sdot8, "SiSiSiSiIb", "nc", "dot1-insts")
TARGET_BUILTIN(__builtin_amdgcn_udot8, "UiUiUiUiIb", "nc", "dot7-insts")
TARGET_BUILTIN(__builtin_amdgcn_sudot8, "iIbiIbiiIb", "nc", "dot8-insts")
-TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_fp8_bf8, "fUiUif", "nc", "gfx12-insts")
-TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_bf8_fp8, "fUiUif", "nc", "gfx12-insts")
-TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_fp8_fp8, "fUiUif", "nc", "gfx12-insts")
-TARGET_BUILTIN(__builtin_amdgcn_fdot4_f32_bf8_bf8, "fUiUif", "nc", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_dot4_f32_fp8_bf8, "fUiUif", "nc", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_dot4_f32_bf8_fp8, "fUiUif", "nc", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_dot4_f32_fp8_fp8, "fUiUif", "nc", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_dot4_f32_bf8_bf8, "fUiUif", "nc", "gfx12-insts")
//===----------------------------------------------------------------------===//
// GFX10+ only builtins.
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
index 1be47f71276208..f5317683d0ff97 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
@@ -50,8 +50,8 @@ kernel void builtins_amdgcn_dl_insts_err(
iOut[3] = __builtin_amdgcn_sudot8(false, A, true, B, C, false); // expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}}
iOut[4] = __builtin_amdgcn_sudot8(true, A, false, B, C, true); // expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}}
- fOut[5] = __builtin_amdgcn_fdot4_f32_fp8_bf8(uiA, uiB, fC); // expected-error {{'__builtin_amdgcn_fdot4_f32_fp8_bf8' needs target feature gfx12-insts}}
- fOut[6] = __builtin_amdgcn_fdot4_f32_bf8_fp8(uiA, uiB, fC); // expected-error {{'__builtin_amdgcn_fdot4_f32_bf8_fp8' needs target feature gfx12-insts}}
- fOut[7] = __builtin_amdgcn_fdot4_f32_fp8_fp8(uiA, uiB, fC); // expected-error {{'__builtin_amdgcn_fdot4_f32_fp8_fp8' needs target feature gfx12-insts}}
- fOut[8] = __builtin_amdgcn_fdot4_f32_bf8_bf8(uiA, uiB, fC); // expected-error {{'__builtin_amdgcn_fdot4_f32_bf8_bf8' needs target feature gfx12-insts}}
+ fOut[5] = __builtin_amdgcn_dot4_f32_fp8_bf8(uiA, uiB, fC); // expected-error {{'__builtin_amdgcn_dot4_f32_fp8_bf8' needs target feature gfx12-insts}}
+ fOut[6] = __builtin_amdgcn_dot4_f32_bf8_fp8(uiA, uiB, fC); // expected-error {{'__builtin_amdgcn_dot4_f32_bf8_fp8' needs target feature gfx12-insts}}
+ fOut[7] = __builtin_amdgcn_dot4_f32_fp8_fp8(uiA, uiB, fC); // expected-error {{'__builtin_amdgcn_dot4_f32_fp8_fp8' needs target feature gfx12-insts}}
+ fOut[8] = __builtin_amdgcn_dot4_f32_bf8_bf8(uiA, uiB, fC); // expected-error {{'__builtin_amdgcn_dot4_f32_bf8_bf8' needs target feature gfx12-insts}}
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl
index 31e10c0a5dc18c..087883e9f56089 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx12.cl
@@ -5,16 +5,16 @@
typedef unsigned int uint;
// CHECK-LABEL: @builtins_amdgcn_dl_insts
-// CHECK: call float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %uiA, i32 %uiB, float %fC)
-// CHECK: call float @llvm.amdgcn.fdot4.f32.bf8.fp8(i32 %uiA, i32 %uiB, float %fC)
-// CHECK: call float @llvm.amdgcn.fdot4.f32.fp8.fp8(i32 %uiA, i32 %uiB, float %fC)
-// CHECK: call float @llvm.amdgcn.fdot4.f32.bf8.bf8(i32 %uiA, i32 %uiB, float %fC)
+// CHECK: call float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %uiA, i32 %uiB, float %fC)
+// CHECK: call float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %uiA, i32 %uiB, float %fC)
+// CHECK: call float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %uiA, i32 %uiB, float %fC)
+// CHECK: call float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %uiA, i32 %uiB, float %fC)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
kernel void builtins_amdgcn_dl_insts_err(global float *fOut,
uint uiA, uint uiB, float fC) {
- fOut[0] = __builtin_amdgcn_fdot4_f32_fp8_bf8(uiA, uiB, fC);
- fOut[1] = __builtin_amdgcn_fdot4_f32_bf8_fp8(uiA, uiB, fC);
- fOut[2] = __builtin_amdgcn_fdot4_f32_fp8_fp8(uiA, uiB, fC);
- fOut[3] = __builtin_amdgcn_fdot4_f32_bf8_bf8(uiA, uiB, fC);
+ fOut[0] = __builtin_amdgcn_dot4_f32_fp8_bf8(uiA, uiB, fC);
+ fOut[1] = __builtin_amdgcn_dot4_f32_bf8_fp8(uiA, uiB, fC);
+ fOut[2] = __builtin_amdgcn_dot4_f32_fp8_fp8(uiA, uiB, fC);
+ fOut[3] = __builtin_amdgcn_dot4_f32_bf8_bf8(uiA, uiB, fC);
}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 1ec695a8164d8b..1a5c0f90e9bcab 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2710,10 +2710,10 @@ class AMDGPU8bitFloatDot4Intrinsic :
[IntrNoMem, IntrSpeculatable]
>;
-def int_amdgcn_fdot4_f32_fp8_bf8 : AMDGPU8bitFloatDot4Intrinsic;
-def int_amdgcn_fdot4_f32_bf8_fp8 : AMDGPU8bitFloatDot4Intrinsic;
-def int_amdgcn_fdot4_f32_fp8_fp8 : AMDGPU8bitFloatDot4Intrinsic;
-def int_amdgcn_fdot4_f32_bf8_bf8 : AMDGPU8bitFloatDot4Intrinsic;
+def int_amdgcn_dot4_f32_fp8_bf8 : AMDGPU8bitFloatDot4Intrinsic;
+def int_amdgcn_dot4_f32_bf8_fp8 : AMDGPU8bitFloatDot4Intrinsic;
+def int_amdgcn_dot4_f32_fp8_fp8 : AMDGPU8bitFloatDot4Intrinsic;
+def int_amdgcn_dot4_f32_bf8_bf8 : AMDGPU8bitFloatDot4Intrinsic;
//===----------------------------------------------------------------------===//
// gfx908 intrinsics
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 0fbe9a2dff2aba..579ab0f7040c39 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4471,10 +4471,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_fdot2_f32_bf16:
case Intrinsic::amdgcn_sudot4:
case Intrinsic::amdgcn_sudot8:
- case Intrinsic::amdgcn_fdot4_f32_fp8_bf8:
- case Intrinsic::amdgcn_fdot4_f32_bf8_fp8:
- case Intrinsic::amdgcn_fdot4_f32_fp8_fp8:
- case Intrinsic::amdgcn_fdot4_f32_bf8_bf8:
+ case Intrinsic::amdgcn_dot4_f32_fp8_bf8:
+ case Intrinsic::amdgcn_dot4_f32_bf8_fp8:
+ case Intrinsic::amdgcn_dot4_f32_fp8_fp8:
+ case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index faa00dff5ff00c..587e0ff673f9c6 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -480,10 +480,10 @@ multiclass VOP3PDOTF8Inst <string OpName, SDPatternOperator intrinsic_node> {
i32:$src2_modifiers, f32:$src2)>;
}
-defm V_DOT4_F32_FP8_BF8 : VOP3PDOTF8Inst<"v_dot4_f32_fp8_bf8", int_amdgcn_fdot4_f32_fp8_bf8>;
-defm V_DOT4_F32_BF8_FP8 : VOP3PDOTF8Inst<"v_dot4_f32_bf8_fp8", int_amdgcn_fdot4_f32_bf8_fp8>;
-defm V_DOT4_F32_FP8_FP8 : VOP3PDOTF8Inst<"v_dot4_f32_fp8_fp8", int_amdgcn_fdot4_f32_fp8_fp8>;
-defm V_DOT4_F32_BF8_BF8 : VOP3PDOTF8Inst<"v_dot4_f32_bf8_bf8", int_amdgcn_fdot4_f32_bf8_bf8>;
+defm V_DOT4_F32_FP8_BF8 : VOP3PDOTF8Inst<"v_dot4_f32_fp8_bf8", int_amdgcn_dot4_f32_fp8_bf8>;
+defm V_DOT4_F32_BF8_FP8 : VOP3PDOTF8Inst<"v_dot4_f32_bf8_fp8", int_amdgcn_dot4_f32_bf8_fp8>;
+defm V_DOT4_F32_FP8_FP8 : VOP3PDOTF8Inst<"v_dot4_f32_fp8_fp8", int_amdgcn_dot4_f32_fp8_fp8>;
+defm V_DOT4_F32_BF8_BF8 : VOP3PDOTF8Inst<"v_dot4_f32_bf8_bf8", int_amdgcn_dot4_f32_bf8_bf8>;
def : UDot2Pat<V_DOT2_U32_U16>;
def : SDot2Pat<V_DOT2_I32_I16>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll
similarity index 55%
rename from llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll
rename to llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll
index 97371f169c07ff..f4a7b2024b5064 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot4.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll
@@ -2,43 +2,43 @@
; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
-define float @test_amdgcn_fdot4_f32_fp8_bf8(i32 %a, i32 %b, float %c) {
-; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_bf8:
+define float @test_amdgcn_dot4_f32_fp8_bf8(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_bf8:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
- %ret = call float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %a, i32 %b, float %c)
+ %ret = call float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %a, i32 %b, float %c)
ret float %ret
}
-define float @test_amdgcn_fdot4_f32_fp8_bf8_fabs(i32 %a, i32 %b, float %c) {
-; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_bf8_fabs:
+define float @test_amdgcn_dot4_f32_fp8_bf8_fabs(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_bf8_fabs:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
- %ret = call float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %a, i32 %b, float %fabs.c)
+ %ret = call float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %a, i32 %b, float %fabs.c)
ret float %ret
}
-define float @test_amdgcn_fdot4_f32_fp8_bf8_fneg(i32 %a, i32 %b, float %c) {
-; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_bf8_fneg:
+define float @test_amdgcn_dot4_f32_fp8_bf8_fneg(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_bf8_fneg:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1]
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fneg.c = fneg float %c
- %ret = call float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %a, i32 %b, float %fneg.c)
+ %ret = call float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %a, i32 %b, float %fneg.c)
ret float %ret
}
-define float @test_amdgcn_fdot4_f32_fp8_bf8_fabs_fneg(i32 %a, i32 %b, float %c) {
-; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_bf8_fabs_fneg:
+define float @test_amdgcn_dot4_f32_fp8_bf8_fabs_fneg(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_bf8_fabs_fneg:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
@@ -46,12 +46,12 @@ define float @test_amdgcn_fdot4_f32_fp8_bf8_fabs_fneg(i32 %a, i32 %b, float %c)
entry:
%fneg.c = fneg float %c
%fabs.fneg.c = call float @llvm.fabs.f32(float %fneg.c)
- %ret = call float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %a, i32 %b, float %fabs.fneg.c)
+ %ret = call float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %a, i32 %b, float %fabs.fneg.c)
ret float %ret
}
-define float @test_amdgcn_fdot4_f32_fp8_bf8_fneg_fabs(i32 %a, i32 %b, float %c) {
-; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_bf8_fneg_fabs:
+define float @test_amdgcn_dot4_f32_fp8_bf8_fneg_fabs(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_bf8_fneg_fabs:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
@@ -59,47 +59,47 @@ define float @test_amdgcn_fdot4_f32_fp8_bf8_fneg_fabs(i32 %a, i32 %b, float %c)
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
%fneg.fabs.c = fneg float %fabs.c
- %ret = call float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %a, i32 %b, float %fneg.fabs.c)
+ %ret = call float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %a, i32 %b, float %fneg.fabs.c)
ret float %ret
}
-define float @test_amdgcn_fdot4_f32_bf8_fp8(i32 %a, i32 %b, float %c) {
-; GFX12-LABEL: test_amdgcn_fdot4_f32_bf8_fp8:
+define float @test_amdgcn_dot4_f32_bf8_fp8(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_fp8:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
- %ret = call float @llvm.amdgcn.fdot4.f32.bf8.fp8(i32 %a, i32 %b, float %c)
+ %ret = call float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %a, i32 %b, float %c)
ret float %ret
}
-define float @test_amdgcn_fdot4_f32_bf8_fp8_fabs(i32 %a, i32 %b, float %c) {
-; GFX12-LABEL: test_amdgcn_fdot4_f32_bf8_fp8_fabs:
+define float @test_amdgcn_dot4_f32_bf8_fp8_fabs(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_fp8_fabs:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
- %ret = call float @llvm.amdgcn.fdot4.f32.bf8.fp8(i32 %a, i32 %b, float %fabs.c)
+ %ret = call float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %a, i32 %b, float %fabs.c)
ret float %ret
}
-define float @test_amdgcn_fdot4_f32_bf8_fp8_fneg(i32 %a, i32 %b, float %c) {
-; GFX12-LABEL: test_amdgcn_fdot4_f32_bf8_fp8_fneg:
+define float @test_amdgcn_dot4_f32_bf8_fp8_fneg(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_fp8_fneg:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1]
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fneg.c = fneg float %c
- %ret = call float @llvm.amdgcn.fdot4.f32.bf8.fp8(i32 %a, i32 %b, float %fneg.c)
+ %ret = call float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %a, i32 %b, float %fneg.c)
ret float %ret
}
-define float @test_amdgcn_fdot4_f32_bf8_fp8_fabs_fneg(i32 %a, i32 %b, float %c) {
-; GFX12-LABEL: test_amdgcn_fdot4_f32_bf8_fp8_fabs_fneg:
+define float @test_amdgcn_dot4_f32_bf8_fp8_fabs_fneg(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_fp8_fabs_fneg:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
@@ -107,12 +107,12 @@ define float @test_amdgcn_fdot4_f32_bf8_fp8_fabs_fneg(i32 %a, i32 %b, float %c)
entry:
%fneg.c = fneg float %c
%fabs.fneg.c = call float @llvm.fabs.f32(float %fneg.c)
- %ret = call float @llvm.amdgcn.fdot4.f32.bf8.fp8(i32 %a, i32 %b, float %fabs.fneg.c)
+ %ret = call float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %a, i32 %b, float %fabs.fneg.c)
ret float %ret
}
-define float @test_amdgcn_fdot4_f32_bf8_fp8_fneg_fabs(i32 %a, i32 %b, float %c) {
-; GFX12-LABEL: test_amdgcn_fdot4_f32_bf8_fp8_fneg_fabs:
+define float @test_amdgcn_dot4_f32_bf8_fp8_fneg_fabs(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_fp8_fneg_fabs:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
@@ -120,47 +120,47 @@ define float @test_amdgcn_fdot4_f32_bf8_fp8_fneg_fabs(i32 %a, i32 %b, float %c)
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
%fneg.fabs.c = fneg float %fabs.c
- %ret = call float @llvm.amdgcn.fdot4.f32.bf8.fp8(i32 %a, i32 %b, float %fneg.fabs.c)
+ %ret = call float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %a, i32 %b, float %fneg.fabs.c)
ret float %ret
}
-define float @test_amdgcn_fdot4_f32_fp8_fp8(i32 %a, i32 %b, float %c) {
-; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_fp8:
+define float @test_amdgcn_dot4_f32_fp8_fp8(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_fp8:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
- %ret = call float @llvm.amdgcn.fdot4.f32.fp8.fp8(i32 %a, i32 %b, float %c)
+ %ret = call float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %a, i32 %b, float %c)
ret float %ret
}
-define float @test_amdgcn_fdot4_f32_fp8_fp8_fabs(i32 %a, i32 %b, float %c) {
-; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_fp8_fabs:
+define float @test_amdgcn_dot4_f32_fp8_fp8_fabs(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_fp8_fabs:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
- %ret = call float @llvm.amdgcn.fdot4.f32.fp8.fp8(i32 %a, i32 %b, float %fabs.c)
+ %ret = call float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %a, i32 %b, float %fabs.c)
ret float %ret
}
-define float @test_amdgcn_fdot4_f32_fp8_fp8_fneg(i32 %a, i32 %b, float %c) {
-; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_fp8_fneg:
+define float @test_amdgcn_dot4_f32_fp8_fp8_fneg(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_fp8_fneg:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1]
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fneg.c = fneg float %c
- %ret = call float @llvm.amdgcn.fdot4.f32.fp8.fp8(i32 %a, i32 %b, float %fneg.c)
+ %ret = call float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %a, i32 %b, float %fneg.c)
ret float %ret
}
-define float @test_amdgcn_fdot4_f32_fp8_fp8_fabs_fneg(i32 %a, i32 %b, float %c) {
-; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_fp8_fabs_fneg:
+define float @test_amdgcn_dot4_f32_fp8_fp8_fabs_fneg(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_fp8_fabs_fneg:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
@@ -168,12 +168,12 @@ define float @test_amdgcn_fdot4_f32_fp8_fp8_fabs_fneg(i32 %a, i32 %b, float %c)
entry:
%fneg.c = fneg float %c
%fabs.fneg.c = call float @llvm.fabs.f32(float %fneg.c)
- %ret = call float @llvm.amdgcn.fdot4.f32.fp8.fp8(i32 %a, i32 %b, float %fabs.fneg.c)
+ %ret = call float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %a, i32 %b, float %fabs.fneg.c)
ret float %ret
}
-define float @test_amdgcn_fdot4_f32_fp8_fp8_fneg_fabs(i32 %a, i32 %b, float %c) {
-; GFX12-LABEL: test_amdgcn_fdot4_f32_fp8_fp8_fneg_fabs:
+define float @test_amdgcn_dot4_f32_fp8_fp8_fneg_fabs(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_dot4_f32_fp8_fp8_fneg_fabs:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
@@ -181,47 +181,47 @@ define float @test_amdgcn_fdot4_f32_fp8_fp8_fneg_fabs(i32 %a, i32 %b, float %c)
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
%fneg.fabs.c = fneg float %fabs.c
- %ret = call float @llvm.amdgcn.fdot4.f32.fp8.fp8(i32 %a, i32 %b, float %fneg.fabs.c)
+ %ret = call float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %a, i32 %b, float %fneg.fabs.c)
ret float %ret
}
-define float @test_amdgcn_fdot4_f32_bf8_bf8(i32 %a, i32 %b, float %c) {
-; GFX12-LABEL: test_amdgcn_fdot4_f32_bf8_bf8:
+define float @test_amdgcn_dot4_f32_bf8_bf8(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_bf8:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
- %ret = call float @llvm.amdgcn.fdot4.f32.bf8.bf8(i32 %a, i32 %b, float %c)
+ %ret = call float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %a, i32 %b, float %c)
ret float %ret
}
-define float @test_amdgcn_fdot4_f32_bf8_bf8_fabs(i32 %a, i32 %b, float %c) {
-; GFX12-LABEL: test_amdgcn_fdot4_f32_bf8_bf8_fabs:
+define float @test_amdgcn_dot4_f32_bf8_bf8_fabs(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_bf8_fabs:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
- %ret = call float @llvm.amdgcn.fdot4.f32.bf8.bf8(i32 %a, i32 %b, float %fabs.c)
+ %ret = call float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %a, i32 %b, float %fabs.c)
ret float %ret
}
-define float @test_amdgcn_fdot4_f32_bf8_bf8_fneg(i32 %a, i32 %b, float %c) {
-; GFX12-LABEL: test_amdgcn_fdot4_f32_bf8_bf8_fneg:
+define float @test_amdgcn_dot4_f32_bf8_bf8_fneg(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_bf8_fneg:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1]
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fneg.c = fneg float %c
- %ret = call float @llvm.amdgcn.fdot4.f32.bf8.bf8(i32 %a, i32 %b, float %fneg.c)
+ %ret = call float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %a, i32 %b, float %fneg.c)
ret float %ret
}
-define float @test_amdgcn_fdot4_f32_bf8_bf8_fabs_fneg(i32 %a, i32 %b, float %c) {
-; GFX12-LABEL: test_amdgcn_fdot4_f32_bf8_bf8_fabs_fneg:
+define float @test_amdgcn_dot4_f32_bf8_bf8_fabs_fneg(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_bf8_fabs_fneg:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
@@ -229,12 +229,12 @@ define float @test_amdgcn_fdot4_f32_bf8_bf8_fabs_fneg(i32 %a, i32 %b, float %c)
entry:
%fneg.c = fneg float %c
%fabs.fneg.c = call float @llvm.fabs.f32(float %fneg.c)
- %ret = call float @llvm.amdgcn.fdot4.f32.bf8.bf8(i32 %a, i32 %b, float %fabs.fneg.c)
+ %ret = call float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %a, i32 %b, float %fabs.fneg.c)
ret float %ret
}
-define float @test_amdgcn_fdot4_f32_bf8_bf8_fneg_fabs(i32 %a, i32 %b, float %c) {
-; GFX12-LABEL: test_amdgcn_fdot4_f32_bf8_bf8_fneg_fabs:
+define float @test_amdgcn_dot4_f32_bf8_bf8_fneg_fabs(i32 %a, i32 %b, float %c) {
+; GFX12-LABEL: test_amdgcn_dot4_f32_bf8_bf8_fneg_fabs:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
@@ -242,14 +242,14 @@ define float @test_amdgcn_fdot4_f32_bf8_bf8_fneg_fabs(i32 %a, i32 %b, float %c)
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
%fneg.fabs.c = fneg float %fabs.c
- %ret = call float @llvm.amdgcn.fdot4.f32.bf8.bf8(i32 %a, i32 %b, float %fneg.fabs.c)
+ %ret = call float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %a, i32 %b, float %fneg.fabs.c)
ret float %ret
}
-declare float @llvm.amdgcn.fdot4.f32.fp8.bf8(i32 %a, i32 %b, float %c)
-declare float @llvm.amdgcn.fdot4.f32.bf8.fp8(i32 %a, i32 %b, float %c)
-declare float @llvm.amdgcn.fdot4.f32.fp8.fp8(i32 %a, i32 %b, float %c)
-declare float @llvm.amdgcn.fdot4.f32.bf8.bf8(i32 %a, i32 %b, float %c)
+declare float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %a, i32 %b, float %c)
+declare float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %a, i32 %b, float %c)
+declare float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %a, i32 %b, float %c)
+declare float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %a, i32 %b, float %c)
declare float @llvm.fabs.f32(float %a)
More information about the cfe-commits mailing list