[clang] 9fa5a6b - [AMDGPU] Support for gfx940 fp8 conversions
Stanislav Mekhanoshin via cfe-commits
cfe-commits at lists.llvm.org
Mon Jul 18 11:48:59 PDT 2022
Author: Stanislav Mekhanoshin
Date: 2022-07-18T11:48:43-07:00
New Revision: 9fa5a6b7e8a292ec91b844a622836d2990ef5796
URL: https://github.com/llvm/llvm-project/commit/9fa5a6b7e8a292ec91b844a622836d2990ef5796
DIFF: https://github.com/llvm/llvm-project/commit/9fa5a6b7e8a292ec91b844a622836d2990ef5796.diff
LOG: [AMDGPU] Support for gfx940 fp8 conversions
Differential Revision: https://reviews.llvm.org/D129902
Added:
clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
Modified:
clang/include/clang/Basic/BuiltinsAMDGPU.def
clang/lib/Basic/Targets/AMDGPU.cpp
clang/test/CodeGenOpenCL/amdgpu-features.cl
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPU.td
llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
llvm/lib/Target/AMDGPU/GCNSubtarget.h
llvm/lib/Target/AMDGPU/SIInstrInfo.td
llvm/lib/Target/AMDGPU/VOP1Instructions.td
llvm/lib/Target/AMDGPU/VOP3Instructions.td
llvm/test/MC/AMDGPU/gfx940_asm_features.s
llvm/test/MC/AMDGPU/gfx940_err.s
llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt
Removed: (none)
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 68bcf546d177c..e9f25d783e596 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -346,5 +346,14 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x16_bf16, "V16fV4sV8sV16fiIiIi",
TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_16x16x64_i8, "V4iV2iV4iV4iiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_32x32x32_i8, "V16iV2iV4iV16iiIiIi", "nc", "mai-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_bf8, "fiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_fp8, "fiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_bf8, "V2fiIb", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_fp8, "V2fiIb", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_bf8_f32, "iffiIb", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-insts")
+
#undef BUILTIN
#undef TARGET_BUILTIN
diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp
index 50256d8e210c9..80f2601b0a245 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -250,6 +250,7 @@ bool AMDGPUTargetInfo::initFeatureMap(
break;
case GK_GFX940:
Features["gfx940-insts"] = true;
+ Features["fp8-insts"] = true;
LLVM_FALLTHROUGH;
case GK_GFX90A:
Features["gfx90a-insts"] = true;
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index cb3a3eff01f70..ff288e530d17f 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -64,7 +64,7 @@
// GFX909: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
// GFX90A: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst"
// GFX90C: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
-// GFX940: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst"
+// GFX940: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst"
// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl
new file mode 100644
index 0000000000000..56d757012a5e7
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl
@@ -0,0 +1,60 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940
+
+typedef float v2f __attribute__((ext_vector_type(2)));
+
+// CHECK-GFX940-LABEL: @test_cvt_f32_bf8
+// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0)
+void test_cvt_f32_bf8(global int* out, int a)
+{
+ *out = __builtin_amdgcn_cvt_f32_bf8(a, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_cvt_f32_fp8
+// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
+void test_cvt_f32_fp8(global int* out, int a)
+{
+ *out = __builtin_amdgcn_cvt_f32_fp8(a, 1);
+}
+
+// CHECK-GFX940-LABEL: @test_cvt_pk_f32_bf8
+// CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false)
+void test_cvt_pk_f32_bf8(global v2f* out, int a)
+{
+ *out = __builtin_amdgcn_cvt_pk_f32_bf8(a, false);
+}
+
+// CHECK-GFX940-LABEL: @test_cvt_pk_f32_fp8
+// CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true)
+void test_cvt_pk_f32_fp8(global v2f* out, int a)
+{
+ *out = __builtin_amdgcn_cvt_pk_f32_fp8(a, true);
+}
+
+// CHECK-GFX940-LABEL: @test_cvt_pk_bf8_f32
+// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false)
+void test_cvt_pk_bf8_f32(global int* out, int old, float a, float b)
+{
+ *out = __builtin_amdgcn_cvt_pk_bf8_f32(a, b, old, false);
+}
+
+// CHECK-GFX940-LABEL: @test_cvt_pk_fp8_f32
+// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true)
+void test_cvt_pk_fp8_f32(global int* out, int old, float a, float b)
+{
+ *out = __builtin_amdgcn_cvt_pk_fp8_f32(a, b, old, true);
+}
+
+// CHECK-GFX940-LABEL: @test_cvt_sr_bf8_f32
+// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2)
+void test_cvt_sr_bf8_f32(global int* out, int old, float a, int b)
+{
+ *out = __builtin_amdgcn_cvt_sr_bf8_f32(a, b, old, 2);
+}
+
+// CHECK-GFX940-LABEL: @test_cvt_sr_fp8_f32
+// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3)
+void test_cvt_sr_fp8_f32(global int* out, int old, float a, int b)
+{
+ *out = __builtin_amdgcn_cvt_sr_fp8_f32(a, b, old, 3);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index e81224d9b8906..2936e495bb2df 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2320,6 +2320,58 @@ def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty,
def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
+// llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3]
+// byte_sel selects byte from srcA.
+def int_amdgcn_cvt_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_bf8">,
+ Intrinsic<[llvm_float_ty],
+ [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
+
+// llvm.amdgcn.cvt.f32.fp8 float vdst, int srcA, imm byte_sel [0..3]
+def int_amdgcn_cvt_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_fp8">,
+ Intrinsic<[llvm_float_ty],
+ [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
+
+// llvm.amdgcn.cvt.pk.f32.bf8 float2 vdst, int srcA, imm word_sel
+// word_sel = 1 selects 2 high bytes, 0 selects 2 low bytes.
+def int_amdgcn_cvt_pk_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_bf8">,
+ Intrinsic<[llvm_v2f32_ty],
+ [llvm_i32_ty, llvm_i1_ty],
+ [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
+
+// llvm.amdgcn.cvt.pk.f32.fp8 float2 vdst, int srcA, imm word_sel.
+def int_amdgcn_cvt_pk_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_fp8">,
+ Intrinsic<[llvm_v2f32_ty],
+ [llvm_i32_ty, llvm_i1_ty],
+ [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
+
+// llvm.amdgcn.cvt.pk.bf8.f32 int vdst, float srcA, float srcB, int old, imm word_sel
+// word_sel = 1 selects 2 high bytes in the vdst, 0 selects 2 low bytes.
+def int_amdgcn_cvt_pk_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_bf8_f32">,
+ Intrinsic<[llvm_i32_ty],
+ [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
+ [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<3>>]>;
+
+// llvm.amdgcn.cvt.pk.fp8.f32 int vdst, float srcA, float srcB, int old, imm word_sel
+def int_amdgcn_cvt_pk_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f32">,
+ Intrinsic<[llvm_i32_ty],
+ [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
+ [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<3>>]>;
+
+// llvm.amdgcn.cvt.sr.bf8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
+// byte_sel selects byte to write into vdst.
+def int_amdgcn_cvt_sr_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f32">,
+ Intrinsic<[llvm_i32_ty],
+ [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<3>>]>;
+
+// llvm.amdgcn.cvt.sr.fp8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
+def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
+ Intrinsic<[llvm_i32_ty],
+ [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<3>>]>;
+
//===----------------------------------------------------------------------===//
// Special Intrinsics for backend internal use only. No frontend
// should emit calls to these.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 48b5814cd4820..49471478286f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -585,6 +585,12 @@ def FeatureMAIInsts : SubtargetFeature<"mai-insts",
"Has mAI instructions"
>;
+def FeatureFP8Insts : SubtargetFeature<"fp8-insts",
+ "HasFP8Insts",
+ "true",
+ "Has fp8 and bf8 instructions"
+>;
+
def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst",
"HasPkFmacF16Inst",
"true",
@@ -1124,6 +1130,7 @@ def FeatureISAVersion9_4_0 : FeatureSet<
Feature64BitDPP,
FeaturePackedFP32Ops,
FeatureMAIInsts,
+ FeatureFP8Insts,
FeaturePkFmacF16Inst,
FeatureAtomicFaddRtnInsts,
FeatureAtomicFaddNoRtnInsts,
@@ -1704,6 +1711,9 @@ def HasSMemTimeInst : Predicate<"Subtarget->hasSMemTimeInst()">,
def HasShaderCyclesRegister : Predicate<"Subtarget->hasShaderCyclesRegister()">,
AssemblerPredicate<(all_of FeatureShaderCyclesRegister)>;
+def HasFP8Insts : Predicate<"Subtarget->hasFP8Insts()">,
+ AssemblerPredicate<(all_of FeatureFP8Insts)>;
+
def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">,
AssemblerPredicate<(all_of FeaturePkFmacF16Inst)>;
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 8b2e51f836fc5..0a5feca0289c5 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -8257,6 +8257,12 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0;
+ if (Opc == AMDGPU::V_CVT_SR_BF8_F32_vi ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F32_vi) {
+ Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods
+ Inst.addOperand(Inst.getOperand(0));
+ }
+
if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in) != -1) {
assert(!IsPacked);
Inst.addOperand(Inst.getOperand(0));
@@ -9061,12 +9067,27 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
// v_nop_sdwa_sdwa_vi/gfx9 has no optional sdwa arguments
switch (BasicInstType) {
case SIInstrFlags::VOP1:
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
- if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0);
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+ AMDGPU::OpName::clamp) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTyClampSI, 0);
+ }
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+ AMDGPU::OpName::omod) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTyOModSI, 0);
+ }
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+ AMDGPU::OpName::dst_sel) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
+ }
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+ AMDGPU::OpName::dst_unused) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTySdwaDstUnused,
+ DstUnused::UNUSED_PRESERVE);
}
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
break;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index d269d0945f3b1..8352337bb20a7 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -145,6 +145,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasDot7Insts = false;
bool HasDot8Insts = false;
bool HasMAIInsts = false;
+ bool HasFP8Insts = false;
bool HasPkFmacF16Inst = false;
bool HasAtomicFaddRtnInsts = false;
bool HasAtomicFaddNoRtnInsts = false;
@@ -721,6 +722,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return HasMAIInsts;
}
+ bool hasFP8Insts() const {
+ return HasFP8Insts;
+ }
+
bool hasPkFmacF16Inst() const {
return HasPkFmacF16Inst;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 23afd6556bc9b..ffbd08514deba 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -324,7 +324,8 @@ class isFloatType<ValueType SrcVT> {
// XXX - do v2i16 instructions?
class isIntType<ValueType SrcVT> {
- bit ret = !or(!eq(SrcVT.Value, i16.Value),
+ bit ret = !or(!eq(SrcVT.Value, i8.Value),
+ !eq(SrcVT.Value, i16.Value),
!eq(SrcVT.Value, i32.Value),
!eq(SrcVT.Value, i64.Value),
!eq(SrcVT.Value, v4i16.Value),
@@ -1411,6 +1412,10 @@ class IntSDWAInputModsMatchClass <int opSize> : AsmOperandClass {
def Int16SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<16>;
def Int32SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<32>;
+def Bin32SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<32> {
+ let Name = "SDWAWithBin32InputMods";
+ let ParserMethod = "parseRegOrImm";
+}
class IntSDWAInputMods <IntSDWAInputModsMatchClass matchClass> :
InputMods <matchClass> {
@@ -1419,6 +1424,7 @@ class IntSDWAInputMods <IntSDWAInputModsMatchClass matchClass> :
def Int16SDWAInputMods : IntSDWAInputMods<Int16SDWAInputModsMatchClass>;
def Int32SDWAInputMods : IntSDWAInputMods<Int32SDWAInputModsMatchClass>;
+def Bin32SDWAInputMods : IntSDWAInputMods<Bin32SDWAInputModsMatchClass>;
def IntVRegInputModsMatchClass : AsmOperandClass {
let Name = "VRegWithIntInputMods";
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 1d374a9f90ba9..73e4eb8cdc240 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -499,6 +499,59 @@ let SubtargetPredicate = isGFX9Only in {
defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>;
} // End SubtargetPredicate = isGFX9Only
+class VOPProfile_Base_CVT_F32_F8<ValueType vt> : VOPProfileI2F <vt, i32> {
+ let HasExtSDWA = 1;
+ let HasExtSDWA9 = 1;
+ let HasExt = 1;
+ let DstRCSDWA = getVALUDstForVT<vt>.ret;
+ let InsSDWA = (ins Bin32SDWAInputMods:$src0_modifiers, Src0SDWA:$src0,
+ clampmod:$clamp, omod:$omod, src0_sel:$src0_sel);
+ let AsmSDWA = "$vdst, $src0_modifiers$clamp$omod $src0_sel"; // No dst_sel
+ let AsmSDWA9 = AsmSDWA;
+ let EmitDstSel = 0;
+}
+
+def VOPProfileCVT_F32_F8 : VOPProfile_Base_CVT_F32_F8 <f32>;
+def VOPProfileCVT_PK_F32_F8 : VOPProfile_Base_CVT_F32_F8 <v2f32>;
+
+let SubtargetPredicate = HasFP8Insts, mayRaiseFPException = 0,
+ SchedRW = [WriteFloatCvt] in {
+ defm V_CVT_F32_FP8 : VOP1Inst<"v_cvt_f32_fp8", VOPProfileCVT_F32_F8>;
+ defm V_CVT_F32_BF8 : VOP1Inst<"v_cvt_f32_bf8", VOPProfileCVT_F32_F8>;
+ defm V_CVT_PK_F32_FP8 : VOP1Inst<"v_cvt_pk_f32_fp8", VOPProfileCVT_PK_F32_F8>;
+ defm V_CVT_PK_F32_BF8 : VOP1Inst<"v_cvt_pk_f32_bf8", VOPProfileCVT_PK_F32_F8>;
+}
+
+class Cvt_F32_F8_Pat<SDPatternOperator node, int index,
+ VOP1_Pseudo inst_e32, VOP1_SDWA_Pseudo inst_sdwa> : GCNPat<
+ (f32 (node i32:$src, index)),
+ !if (index,
+ (inst_sdwa 0, $src, 0, 0, index),
+ (inst_e32 $src))
+>;
+
+foreach Index = [0, 1, 2, 3] in {
+ def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_fp8, Index,
+ V_CVT_F32_FP8_e32, V_CVT_F32_FP8_sdwa>;
+ def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_bf8, Index,
+ V_CVT_F32_BF8_e32, V_CVT_F32_BF8_sdwa>;
+}
+
+class Cvt_PK_F32_F8_Pat<SDPatternOperator node, int index,
+ VOP1_Pseudo inst_e32, VOP1_SDWA_Pseudo inst_sdwa> : GCNPat<
+ (v2f32 (node i32:$src, index)),
+ !if (index,
+ (inst_sdwa 0, $src, 0, 0, SDWA.WORD_1),
+ (inst_e32 $src))
+>;
+
+foreach Index = [0, -1] in {
+ def : Cvt_PK_F32_F8_Pat<int_amdgcn_cvt_pk_f32_fp8, Index,
+ V_CVT_PK_F32_FP8_e32, V_CVT_PK_F32_FP8_sdwa>;
+ def : Cvt_PK_F32_F8_Pat<int_amdgcn_cvt_pk_f32_bf8, Index,
+ V_CVT_PK_F32_BF8_e32, V_CVT_PK_F32_BF8_sdwa>;
+}
+
let SubtargetPredicate = isGFX10Plus in {
defm V_PIPEFLUSH : VOP1Inst<"v_pipeflush", VOP_NO_EXT<VOP_NONE>>;
@@ -1106,11 +1159,36 @@ multiclass VOP1_Real_gfx9 <bits<10> op> {
}
+multiclass VOP1_Real_NoDstSel_SDWA_gfx9 <bits<10> op> {
+ let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in {
+ defm NAME : VOP1_Real_e32e64_vi <op>;
+ }
+
+ foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
+ def _sdwa_gfx9 :
+ VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
+ let Inst{42-40} = 6;
+ }
+
+ foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_gfx9 :
+ VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
+ VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
+}
+
defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
let AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9" in
defm V_MOV_B64 : VOP1_Real_gfx9 <0x38>;
+let OtherPredicates = [HasFP8Insts] in {
+defm V_CVT_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x54>;
+defm V_CVT_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x55>;
+defm V_CVT_PK_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>;
+defm V_CVT_PK_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x57>;
+}
+
//===----------------------------------------------------------------------===//
// GFX10
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index dddd0aacc1409..7788f1d82273d 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -481,6 +481,30 @@ def shl_0_to_4 : PatFrag<
}];
}
+def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> {
+ let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
+ FP32InputMods:$src1_modifiers, Src1RC64:$src1,
+ VGPR_32:$vdst_in, op_sel0:$op_sel);
+ let HasClamp = 0;
+ let HasExtVOP3DPP = 0;
+}
+
+def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
+ VOP3_OPSEL> {
+ let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
+ FP32InputMods:$src1_modifiers, Src1RC64:$src1,
+ FP32InputMods:$src2_modifiers, VGPR_32:$src2,
+ op_sel0:$op_sel);
+ let HasClamp = 0;
+ let HasSrc2 = 0;
+ let HasSrc2Mods = 1;
+ let AsmVOP3OpSel = !subst(", $src2_modifiers", "",
+ getAsmVOP3OpSel<3, HasClamp,
+ HasSrc0FloatMods, HasSrc1FloatMods,
+ HasSrc2FloatMods>.ret);
+ let HasExtVOP3DPP = 0;
+}
+
let SubtargetPredicate = isGFX9Plus in {
let isCommutable = 1, isReMaterializable = 1 in {
defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -526,6 +550,43 @@ defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32
let SubtargetPredicate = isGFX940Plus in
defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", VOP3_Profile<VOP_I64_I64_I32_I64>>;
+let SubtargetPredicate = HasFP8Insts, mayRaiseFPException = 0,
+ SchedRW = [WriteFloatCvt] in {
+ let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
+ defm V_CVT_PK_FP8_F32 : VOP3Inst<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile>;
+ defm V_CVT_PK_BF8_F32 : VOP3Inst<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile>;
+ }
+
+ // These instructions have a non-standard use of op_sel. In particular, they
+ // use op_sel bits 2 and 3 while having only two sources. Therefore a dummy
+ // src2 is used to hold the op_sel value.
+ let Constraints = "$vdst = $src2", DisableEncoding = "$src2" in {
+ defm V_CVT_SR_FP8_F32 : VOP3Inst<"v_cvt_sr_fp8_f32", VOP3_CVT_SR_F8_F32_Profile>;
+ defm V_CVT_SR_BF8_F32 : VOP3Inst<"v_cvt_sr_bf8_f32", VOP3_CVT_SR_F8_F32_Profile>;
+ }
+}
+
+class Cvt_PK_F8_F32_Pat<SDPatternOperator node, int index, VOP3_Pseudo inst> : GCNPat<
+ (i32 (node f32:$src0, f32:$src1, i32:$old, index)),
+ (inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, !if(index, SRCMODS.OP_SEL_0, 0))
+>;
+
+class Cvt_SR_F8_F32_Pat<SDPatternOperator node, bits<2> index, VOP3_Pseudo inst> : GCNPat<
+ (i32 (node f32:$src0, i32:$src1, i32:$old, index)),
+ (inst !if(index{1}, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1,
+ !if(index{0}, SRCMODS.OP_SEL_0, 0), $old, !if(index{1}, SRCMODS.OP_SEL_0, 0))
+>;
+
+foreach Index = [0, -1] in {
+ def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_e64>;
+ def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_e64>;
+}
+
+foreach Index = [0, 1, 2, 3] in {
+ def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_fp8_f32, Index, V_CVT_SR_FP8_F32_e64>;
+ def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_bf8_f32, Index, V_CVT_SR_BF8_F32_e64>;
+}
+
class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat <
// This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions.
(ThreeOpFrag<op1, op2> i32:$src0, i32:$src1, i32:$src2),
@@ -1161,6 +1222,13 @@ multiclass VOP3OpSel_Real_gfx9<bits<10> op> {
VOP3OpSel_gfx9 <op, !cast<VOP_Pseudo>(NAME#"_e64").Pfl>;
}
+multiclass VOP3OpSel_Real_gfx9_forced_opsel2<bits<10> op> {
+ def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+ VOP3OpSel_gfx9 <op, !cast<VOP_Pseudo>(NAME#"_e64").Pfl> {
+ let Inst{13} = src2_modifiers{2}; // op_sel(2)
+ }
+}
+
multiclass VOP3Interp_Real_vi<bits<10> op> {
def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.VI>,
VOP3Interp_vi <op, !cast<VOP_Pseudo>(NAME).Pfl>;
@@ -1352,3 +1420,10 @@ defm V_CVT_PKNORM_I16_F16 : VOP3OpSel_Real_gfx9 <0x299>;
defm V_CVT_PKNORM_U16_F16 : VOP3OpSel_Real_gfx9 <0x29a>;
defm V_LSHL_ADD_U64 : VOP3_Real_vi <0x208>;
+
+let OtherPredicates = [HasFP8Insts] in {
+defm V_CVT_PK_FP8_F32 : VOP3OpSel_Real_gfx9 <0x2a2>;
+defm V_CVT_PK_BF8_F32 : VOP3OpSel_Real_gfx9 <0x2a3>;
+defm V_CVT_SR_FP8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a4>;
+defm V_CVT_SR_BF8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a5>;
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
new file mode 100644
index 0000000000000..6261a08bc6191
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
@@ -0,0 +1,190 @@
+; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32)
+declare float @llvm.amdgcn.cvt.f32.fp8(i32, i32)
+declare <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32, i1)
+declare <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32, i1)
+declare i32 @llvm.amdgcn.cvt.pk.bf8.f32(float, float, i32, i1)
+declare i32 @llvm.amdgcn.cvt.pk.fp8.f32(float, float, i32, i1)
+declare i32 @llvm.amdgcn.cvt.sr.bf8.f32(float, i32, i32, i32)
+declare i32 @llvm.amdgcn.cvt.sr.fp8.f32(float, i32, i32, i32)
+
+; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte0:
+; GCN: v_cvt_f32_bf8_e32 v0, v0{{$}}
+define float @test_cvt_f32_bf8_byte0(i32 %a) {
+ %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0)
+ ret float %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte1:
+; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_1
+define float @test_cvt_f32_bf8_byte1(i32 %a) {
+ %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 1)
+ ret float %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte2:
+; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_2
+define float @test_cvt_f32_bf8_byte2(i32 %a) {
+ %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 2)
+ ret float %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte3:
+; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_3
+define float @test_cvt_f32_bf8_byte3(i32 %a) {
+ %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 3)
+ ret float %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte0:
+; GCN: v_cvt_f32_fp8_e32 v0, v0{{$}}
+define float @test_cvt_f32_fp8_byte0(i32 %a) {
+ %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 0)
+ ret float %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte1:
+; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_1
+define float @test_cvt_f32_fp8_byte1(i32 %a) {
+ %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
+ ret float %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte2:
+; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_2
+define float @test_cvt_f32_fp8_byte2(i32 %a) {
+ %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 2)
+ ret float %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte3:
+; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_3
+define float @test_cvt_f32_fp8_byte3(i32 %a) {
+ %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 3)
+ ret float %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_pk_f32_bf8_word0:
+; GCN: v_cvt_pk_f32_bf8_e32 v[0:1], v0{{$}}
+define <2 x float> @test_cvt_pk_f32_bf8_word0(i32 %a) {
+ %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false)
+ ret <2 x float> %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_pk_f32_bf8_word1:
+; GCN: v_cvt_pk_f32_bf8_sdwa v[0:1], v0 src0_sel:WORD_1
+define <2 x float> @test_cvt_pk_f32_bf8_word1(i32 %a) {
+ %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 true)
+ ret <2 x float> %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_pk_f32_fp8_word0:
+; GCN: v_cvt_pk_f32_fp8_e32 v[0:1], v0{{$}}
+define <2 x float> @test_cvt_pk_f32_fp8_word0(i32 %a) {
+ %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 false)
+ ret <2 x float> %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_pk_f32_fp8_word1:
+; GCN: v_cvt_pk_f32_fp8_sdwa v[0:1], v0 src0_sel:WORD_1
+define <2 x float> @test_cvt_pk_f32_fp8_word1(i32 %a) {
+ %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true)
+ ret <2 x float> %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_pk_bf8_f32_word0:
+; GCN: v_cvt_pk_bf8_f32 v2, v0, v1{{$}}
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_pk_bf8_f32_word0(float %x, float %y, i32 %old) {
+ %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 false)
+ ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_pk_bf8_f32_word1:
+; GCN: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1]
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_pk_bf8_f32_word1(float %x, float %y, i32 %old) {
+ %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 true)
+ ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_pk_fp8_f32_word0:
+; GCN: v_cvt_pk_fp8_f32 v2, v0, v1{{$}}
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_pk_fp8_f32_word0(float %x, float %y, i32 %old) {
+ %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 false)
+ ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_pk_fp8_f32_word1:
+; GCN: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1]
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_pk_fp8_f32_word1(float %x, float %y, i32 %old) {
+ %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 true)
+ ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte0:
+; GCN: v_cvt_sr_bf8_f32 v2, v0, v1{{$}}
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_sr_bf8_f32_byte0(float %x, i32 %r, i32 %old) {
+ %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 0)
+ ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte1:
+; GCN: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0]
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_sr_bf8_f32_byte1(float %x, i32 %r, i32 %old) {
+ %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 1)
+ ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte2:
+; GCN: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1]
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_sr_bf8_f32_byte2(float %x, i32 %r, i32 %old) {
+ %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 2)
+ ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte3:
+; GCN: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1]
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_sr_bf8_f32_byte3(float %x, i32 %r, i32 %old) {
+ %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 3)
+ ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte0:
+; GCN: v_cvt_sr_fp8_f32 v2, v0, v1{{$}}
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_sr_fp8_f32_byte0(float %x, i32 %r, i32 %old) {
+ %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 0)
+ ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte1:
+; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0]
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_sr_fp8_f32_byte1(float %x, i32 %r, i32 %old) {
+ %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 1)
+ ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte2:
+; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1]
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_sr_fp8_f32_byte2(float %x, i32 %r, i32 %old) {
+ %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 2)
+ ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte3:
+; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1]
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_sr_fp8_f32_byte3(float %x, i32 %r, i32 %old) {
+ %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 3)
+ ret i32 %ret
+}
diff --git a/llvm/test/MC/AMDGPU/gfx940_asm_features.s b/llvm/test/MC/AMDGPU/gfx940_asm_features.s
index 28d0c0311a3c0..a000955deed5c 100644
--- a/llvm/test/MC/AMDGPU/gfx940_asm_features.s
+++ b/llvm/test/MC/AMDGPU/gfx940_asm_features.s
@@ -401,3 +401,211 @@ buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 sc1
// GFX10: error: instruction not supported on this GPU
// GFX940: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 sc1 ; encoding: [0x00,0x80,0x40,0xe1,0x00,0x04,0x02,0x03]
buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 sc1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_bf8_e32 v1, s3 ; encoding: [0x03,0xaa,0x02,0x7e]
+v_cvt_f32_bf8 v1, s3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_bf8_e32 v1, 3 ; encoding: [0x83,0xaa,0x02,0x7e]
+v_cvt_f32_bf8 v1, 3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_bf8_e32 v1, v3 ; encoding: [0x03,0xab,0x02,0x7e]
+v_cvt_f32_bf8 v1, v3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_bf8_sdwa v1, s3 src0_sel:BYTE_1 ; encoding: [0xf9,0xaa,0x02,0x7e,0x03,0x06,0x81,0x00]
+v_cvt_f32_bf8 v1, s3 src0_sel:BYTE_1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_bf8_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x02,0x7e,0x03,0x58,0x00,0xff]
+v_cvt_f32_bf8 v1, v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_bf8_e64 v1, s3 mul:2 ; encoding: [0x01,0x00,0x95,0xd1,0x03,0x00,0x00,0x08]
+v_cvt_f32_bf8 v1, s3 mul:2
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_bf8_sdwa v1, s3 clamp mul:2 src0_sel:BYTE_1 ; encoding: [0xf9,0xaa,0x02,0x7e,0x03,0x66,0x81,0x00]
+v_cvt_f32_bf8 v1, s3 clamp mul:2 src0_sel:BYTE_1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_bf8_e64 v1, s3 clamp ; encoding: [0x01,0x80,0x95,0xd1,0x03,0x00,0x00,0x00]
+v_cvt_f32_bf8 v1, s3 clamp
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_fp8_e32 v1, s3 ; encoding: [0x03,0xa8,0x02,0x7e]
+v_cvt_f32_fp8 v1, s3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_fp8_e32 v1, 3 ; encoding: [0x83,0xa8,0x02,0x7e]
+v_cvt_f32_fp8 v1, 3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xa9,0x02,0x7e]
+v_cvt_f32_fp8 v1, v3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_fp8_sdwa v1, s3 src0_sel:BYTE_1 ; encoding: [0xf9,0xa8,0x02,0x7e,0x03,0x06,0x81,0x00]
+v_cvt_f32_fp8 v1, s3 src0_sel:BYTE_1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_fp8_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x02,0x7e,0x03,0x58,0x00,0xff]
+v_cvt_f32_fp8 v1, v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_fp8_e64 v1, s3 mul:2 ; encoding: [0x01,0x00,0x94,0xd1,0x03,0x00,0x00,0x08]
+v_cvt_f32_fp8 v1, s3 mul:2
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_fp8_sdwa v1, s3 clamp mul:2 src0_sel:BYTE_1 ; encoding: [0xf9,0xa8,0x02,0x7e,0x03,0x66,0x81,0x00]
+v_cvt_f32_fp8 v1, s3 clamp mul:2 src0_sel:BYTE_1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_fp8_e64 v1, s3 clamp ; encoding: [0x01,0x80,0x94,0xd1,0x03,0x00,0x00,0x00]
+v_cvt_f32_fp8 v1, s3 clamp
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_fp8_sdwa v1, 3 src0_sel:BYTE_1 ; encoding: [0xf9,0xa8,0x02,0x7e,0x83,0x06,0x81,0x00]
+v_cvt_f32_fp8 v1, 3 src0_sel:BYTE_1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_bf8_e32 v[2:3], s3 ; encoding: [0x03,0xae,0x04,0x7e]
+v_cvt_pk_f32_bf8 v[2:3], s3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_bf8_e32 v[2:3], 3 ; encoding: [0x83,0xae,0x04,0x7e]
+v_cvt_pk_f32_bf8 v[2:3], 3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_bf8_e32 v[2:3], v3 ; encoding: [0x03,0xaf,0x04,0x7e]
+v_cvt_pk_f32_bf8 v[2:3], v3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_bf8_sdwa v[2:3], s3 src0_sel:WORD_1 ; encoding: [0xf9,0xae,0x04,0x7e,0x03,0x06,0x85,0x00]
+v_cvt_pk_f32_bf8 v[2:3], s3 src0_sel:WORD_1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_bf8_dpp v[0:1], v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x00,0x7e,0x03,0x58,0x00,0xff]
+v_cvt_pk_f32_bf8 v[0:1], v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_bf8_e64 v[2:3], s3 mul:2 ; encoding: [0x02,0x00,0x97,0xd1,0x03,0x00,0x00,0x08]
+v_cvt_pk_f32_bf8 v[2:3], s3 mul:2
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_bf8_sdwa v[2:3], s3 clamp mul:2 src0_sel:WORD_1 ; encoding: [0xf9,0xae,0x04,0x7e,0x03,0x66,0x85,0x00]
+v_cvt_pk_f32_bf8 v[2:3], s3 clamp mul:2 src0_sel:WORD_1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_bf8_e64 v[2:3], s3 clamp ; encoding: [0x02,0x80,0x97,0xd1,0x03,0x00,0x00,0x00]
+v_cvt_pk_f32_bf8 v[2:3], s3 clamp
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_fp8_e32 v[2:3], s3 ; encoding: [0x03,0xac,0x04,0x7e]
+v_cvt_pk_f32_fp8 v[2:3], s3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_fp8_e32 v[2:3], 3 ; encoding: [0x83,0xac,0x04,0x7e]
+v_cvt_pk_f32_fp8 v[2:3], 3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_fp8_e32 v[2:3], v3 ; encoding: [0x03,0xad,0x04,0x7e]
+v_cvt_pk_f32_fp8 v[2:3], v3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_fp8_sdwa v[2:3], s3 src0_sel:WORD_1 ; encoding: [0xf9,0xac,0x04,0x7e,0x03,0x06,0x85,0x00]
+v_cvt_pk_f32_fp8 v[2:3], s3 src0_sel:WORD_1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_fp8_sdwa v[2:3], 3 src0_sel:WORD_1 ; encoding: [0xf9,0xac,0x04,0x7e,0x83,0x06,0x85,0x00]
+v_cvt_pk_f32_fp8 v[2:3], 3 src0_sel:WORD_1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_fp8_dpp v[0:1], v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x00,0x7e,0x03,0x58,0x00,0xff]
+v_cvt_pk_f32_fp8 v[0:1], v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_fp8_e64 v[2:3], s3 mul:2 ; encoding: [0x02,0x00,0x96,0xd1,0x03,0x00,0x00,0x08]
+v_cvt_pk_f32_fp8 v[2:3], s3 mul:2
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_fp8_sdwa v[2:3], s3 clamp mul:2 src0_sel:WORD_1 ; encoding: [0xf9,0xac,0x04,0x7e,0x03,0x66,0x85,0x00]
+v_cvt_pk_f32_fp8 v[2:3], s3 clamp mul:2 src0_sel:WORD_1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_fp8_e64 v[2:3], s3 clamp ; encoding: [0x02,0x80,0x96,0xd1,0x03,0x00,0x00,0x00]
+v_cvt_pk_f32_fp8 v[2:3], s3 clamp
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0xa3,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_pk_bf8_f32 v1, v2, v3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0xa3,0xd2,0x02,0x07,0x02,0x20]
+v_cvt_pk_bf8_f32 v1, -v2, |v3|
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_bf8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0xa3,0xd2,0x02,0x06,0x01,0x00]
+v_cvt_pk_bf8_f32 v1, s2, 3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_bf8_f32 v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0xa3,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_pk_bf8_f32 v1, v2, v3 op_sel:[0,0,1]
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0xa2,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_pk_fp8_f32 v1, v2, v3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0xa2,0xd2,0x02,0x07,0x02,0x20]
+v_cvt_pk_fp8_f32 v1, -v2, |v3|
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_fp8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0xa2,0xd2,0x02,0x06,0x01,0x00]
+v_cvt_pk_fp8_f32 v1, s2, 3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_fp8_f32 v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0xa2,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_pk_fp8_f32 v1, v2, v3 op_sel:[0,0,1]
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_sr_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0xa5,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_sr_bf8_f32 v1, v2, v3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_sr_bf8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0xa5,0xd2,0x02,0x06,0x01,0x00]
+v_cvt_sr_bf8_f32 v1, s2, 3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_sr_bf8_f32 v1, v2, v3 op_sel:[0,0,1,1] ; encoding: [0x01,0x60,0xa5,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_sr_bf8_f32 v1, v2, v3 op_sel:[0,0,1,1]
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_sr_bf8_f32 v1, v2, v3 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0xa5,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_sr_bf8_f32 v1, v2, v3 op_sel:[0,0,0,1]
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_sr_bf8_f32 v1, -|s2|, v3 ; encoding: [0x01,0x01,0xa5,0xd2,0x02,0x06,0x02,0x20]
+v_cvt_sr_bf8_f32 v1, -|s2|, v3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_sr_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0xa4,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_sr_fp8_f32 v1, v2, v3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_sr_fp8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0xa4,0xd2,0x02,0x06,0x01,0x00]
+v_cvt_sr_fp8_f32 v1, s2, 3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_sr_fp8_f32 v1, v2, v3 op_sel:[0,0,1,1] ; encoding: [0x01,0x60,0xa4,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_sr_fp8_f32 v1, v2, v3 op_sel:[0,0,1,1]
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_sr_fp8_f32 v1, v2, v3 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0xa4,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_sr_fp8_f32 v1, v2, v3 op_sel:[0,0,0,1]
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_sr_fp8_f32 v1, -|s2|, v3 ; encoding: [0x01,0x01,0xa4,0xd2,0x02,0x06,0x02,0x20]
+v_cvt_sr_fp8_f32 v1, -|s2|, v3
diff --git a/llvm/test/MC/AMDGPU/gfx940_err.s b/llvm/test/MC/AMDGPU/gfx940_err.s
index da38629c5afc2..784680342d14a 100644
--- a/llvm/test/MC/AMDGPU/gfx940_err.s
+++ b/llvm/test/MC/AMDGPU/gfx940_err.s
@@ -72,6 +72,30 @@ buffer_wbl2 scc
v_dot2_u32_u16 v0, 1, v0, s2 op_sel:[0,1,0,1] op_sel_hi:[0,0,1,1]
// GFX940: error: invalid op_sel operand
+v_cvt_f32_fp8 v1, sext(v3) src0_sel:BYTE_1
+// GFX940: error: not a valid operand.
+
+v_cvt_pk_f32_bf8 v[2:3], sext(v3) src0_sel:BYTE_1
+// GFX940: error: not a valid operand.
+
+v_cvt_sr_bf8_f32 v1, v2, -v3
+// GFX940: error: not a valid operand.
+
+v_cvt_sr_fp8_f32 v1, v2, -v3
+// GFX940: error: not a valid operand.
+
+v_cvt_sr_fp8_f32 v1, v2, v3 clamp
+// GFX940: error: invalid operand for instruction
+
+v_cvt_sr_fp8_f32 v1, v2, v3 mul:2
+// GFX940: error: invalid operand for instruction
+
+v_cvt_pk_fp8_f32 v1, v2, v3 clamp
+// GFX940: error: invalid operand for instruction
+
+v_cvt_pk_fp8_f32 v1, v2, v3 mul:2
+// GFX940: error: invalid operand for instruction
+
s_getreg_b32 s1, hwreg(HW_REG_FLAT_SCR_LO)
// GFX940: error: specified hardware register is not supported on this GPU
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt
index 36b218a4a59b9..d937f8196b94c 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt
@@ -263,3 +263,159 @@
# GFX940: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 sc1 ; encoding: [0x00,0x80,0x40,0xe1,0x00,0x04,0x02,0x03]
0x00,0x80,0x40,0xe1,0x00,0x04,0x02,0x03
+
+# GFX940: v_cvt_f32_bf8_e32 v1, s3 ; encoding: [0x03,0xaa,0x02,0x7e]
+0x03,0xaa,0x02,0x7e
+
+# GFX940: v_cvt_f32_bf8_e32 v1, 3 ; encoding: [0x83,0xaa,0x02,0x7e]
+0x83,0xaa,0x02,0x7e
+
+# GFX940: v_cvt_f32_bf8_e32 v1, v3 ; encoding: [0x03,0xab,0x02,0x7e]
+0x03,0xab,0x02,0x7e
+
+# GFX940: v_cvt_f32_bf8_sdwa v1, s3 src0_sel:BYTE_1 ; encoding: [0xf9,0xaa,0x02,0x7e,0x03,0x06,0x81,0x00]
+0xf9,0xaa,0x02,0x7e,0x03,0x06,0x81,0x00
+
+# GFX940: v_cvt_f32_bf8_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x02,0x7e,0x03,0x58,0x00,0xff]
+0xfa,0xaa,0x02,0x7e,0x03,0x58,0x00,0xff
+
+# GFX940: v_cvt_f32_bf8_e64 v1, s3 mul:2 ; encoding: [0x01,0x00,0x95,0xd1,0x03,0x00,0x00,0x08]
+0x01,0x00,0x95,0xd1,0x03,0x00,0x00,0x08
+
+# GFX940: v_cvt_f32_bf8_sdwa v1, s3 clamp mul:2 src0_sel:BYTE_1 ; encoding: [0xf9,0xaa,0x02,0x7e,0x03,0x66,0x81,0x00]
+0xf9,0xaa,0x02,0x7e,0x03,0x66,0x81,0x00
+
+# GFX940: v_cvt_f32_bf8_e64 v1, s3 clamp ; encoding: [0x01,0x80,0x95,0xd1,0x03,0x00,0x00,0x00]
+0x01,0x80,0x95,0xd1,0x03,0x00,0x00,0x00
+
+# GFX940: v_cvt_f32_fp8_e32 v1, s3 ; encoding: [0x03,0xa8,0x02,0x7e]
+0x03,0xa8,0x02,0x7e
+
+# GFX940: v_cvt_f32_fp8_e32 v1, 3 ; encoding: [0x83,0xa8,0x02,0x7e]
+0x83,0xa8,0x02,0x7e
+
+# GFX940: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xa9,0x02,0x7e]
+0x03,0xa9,0x02,0x7e
+
+# GFX940: v_cvt_f32_fp8_sdwa v1, s3 src0_sel:BYTE_1 ; encoding: [0xf9,0xa8,0x02,0x7e,0x03,0x06,0x81,0x00]
+0xf9,0xa8,0x02,0x7e,0x03,0x06,0x81,0x00
+
+# GFX940: v_cvt_f32_fp8_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x02,0x7e,0x03,0x58,0x00,0xff]
+0xfa,0xa8,0x02,0x7e,0x03,0x58,0x00,0xff
+
+# GFX940: v_cvt_f32_fp8_e64 v1, s3 mul:2 ; encoding: [0x01,0x00,0x94,0xd1,0x03,0x00,0x00,0x08]
+0x01,0x00,0x94,0xd1,0x03,0x00,0x00,0x08
+
+# GFX940: v_cvt_f32_fp8_sdwa v1, s3 clamp mul:2 src0_sel:BYTE_1 ; encoding: [0xf9,0xa8,0x02,0x7e,0x03,0x66,0x81,0x00]
+0xf9,0xa8,0x02,0x7e,0x03,0x66,0x81,0x00
+
+# GFX940: v_cvt_f32_fp8_e64 v1, s3 clamp ; encoding: [0x01,0x80,0x94,0xd1,0x03,0x00,0x00,0x00]
+0x01,0x80,0x94,0xd1,0x03,0x00,0x00,0x00
+
+# GFX940: v_cvt_f32_fp8_sdwa v1, 3 src0_sel:BYTE_1 ; encoding: [0xf9,0xa8,0x02,0x7e,0x83,0x06,0x81,0x00]
+0xf9,0xa8,0x02,0x7e,0x83,0x06,0x81,0x00
+
+# GFX940: v_cvt_pk_f32_bf8_e32 v[2:3], s3 ; encoding: [0x03,0xae,0x04,0x7e]
+0x03,0xae,0x04,0x7e
+
+# GFX940: v_cvt_pk_f32_bf8_e32 v[2:3], 3 ; encoding: [0x83,0xae,0x04,0x7e]
+0x83,0xae,0x04,0x7e
+
+# GFX940: v_cvt_pk_f32_bf8_e32 v[2:3], v3 ; encoding: [0x03,0xaf,0x04,0x7e]
+0x03,0xaf,0x04,0x7e
+
+# GFX940: v_cvt_pk_f32_bf8_sdwa v[2:3], s3 src0_sel:WORD_1 ; encoding: [0xf9,0xae,0x04,0x7e,0x03,0x06,0x85,0x00]
+0xf9,0xae,0x04,0x7e,0x03,0x06,0x85,0x00
+
+# GFX940: v_cvt_pk_f32_bf8_dpp v[0:1], v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x00,0x7e,0x03,0x58,0x00,0xff]
+0xfa,0xae,0x00,0x7e,0x03,0x58,0x00,0xff
+
+# GFX940: v_cvt_pk_f32_bf8_e64 v[2:3], s3 mul:2 ; encoding: [0x02,0x00,0x97,0xd1,0x03,0x00,0x00,0x08]
+0x02,0x00,0x97,0xd1,0x03,0x00,0x00,0x08
+
+# GFX940: v_cvt_pk_f32_bf8_sdwa v[2:3], s3 clamp mul:2 src0_sel:WORD_1 ; encoding: [0xf9,0xae,0x04,0x7e,0x03,0x66,0x85,0x00]
+0xf9,0xae,0x04,0x7e,0x03,0x66,0x85,0x00
+
+# GFX940: v_cvt_pk_f32_bf8_e64 v[2:3], s3 clamp ; encoding: [0x02,0x80,0x97,0xd1,0x03,0x00,0x00,0x00]
+0x02,0x80,0x97,0xd1,0x03,0x00,0x00,0x00
+
+# GFX940: v_cvt_pk_f32_fp8_e32 v[2:3], s3 ; encoding: [0x03,0xac,0x04,0x7e]
+0x03,0xac,0x04,0x7e
+
+# GFX940: v_cvt_pk_f32_fp8_e32 v[2:3], 3 ; encoding: [0x83,0xac,0x04,0x7e]
+0x83,0xac,0x04,0x7e
+
+# GFX940: v_cvt_pk_f32_fp8_e32 v[2:3], v3 ; encoding: [0x03,0xad,0x04,0x7e]
+0x03,0xad,0x04,0x7e
+
+# GFX940: v_cvt_pk_f32_fp8_sdwa v[2:3], s3 src0_sel:WORD_1 ; encoding: [0xf9,0xac,0x04,0x7e,0x03,0x06,0x85,0x00]
+0xf9,0xac,0x04,0x7e,0x03,0x06,0x85,0x00
+
+# GFX940: v_cvt_pk_f32_fp8_sdwa v[2:3], 3 src0_sel:WORD_1 ; encoding: [0xf9,0xac,0x04,0x7e,0x83,0x06,0x85,0x00]
+0xf9,0xac,0x04,0x7e,0x83,0x06,0x85,0x00
+
+# GFX940: v_cvt_pk_f32_fp8_dpp v[0:1], v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x00,0x7e,0x03,0x58,0x00,0xff]
+0xfa,0xac,0x00,0x7e,0x03,0x58,0x00,0xff
+
+# GFX940: v_cvt_pk_f32_fp8_e64 v[2:3], s3 mul:2 ; encoding: [0x02,0x00,0x96,0xd1,0x03,0x00,0x00,0x08]
+0x02,0x00,0x96,0xd1,0x03,0x00,0x00,0x08
+
+# GFX940: v_cvt_pk_f32_fp8_sdwa v[2:3], s3 clamp mul:2 src0_sel:WORD_1 ; encoding: [0xf9,0xac,0x04,0x7e,0x03,0x66,0x85,0x00]
+0xf9,0xac,0x04,0x7e,0x03,0x66,0x85,0x00
+
+# GFX940: v_cvt_pk_f32_fp8_e64 v[2:3], s3 clamp ; encoding: [0x02,0x80,0x96,0xd1,0x03,0x00,0x00,0x00]
+0x02,0x80,0x96,0xd1,0x03,0x00,0x00,0x00
+
+# GFX940: v_cvt_pk_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0xa3,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x00,0xa3,0xd2,0x02,0x07,0x02,0x00
+
+# GFX940: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0xa3,0xd2,0x02,0x07,0x02,0x20]
+0x01,0x02,0xa3,0xd2,0x02,0x07,0x02,0x20
+
+# GFX940: v_cvt_pk_bf8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0xa3,0xd2,0x02,0x06,0x01,0x00]
+0x01,0x00,0xa3,0xd2,0x02,0x06,0x01,0x00
+
+# GFX940: v_cvt_pk_bf8_f32 v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0xa3,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x40,0xa3,0xd2,0x02,0x07,0x02,0x00
+
+# GFX940: v_cvt_pk_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0xa2,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x00,0xa2,0xd2,0x02,0x07,0x02,0x00
+
+# GFX940: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0xa2,0xd2,0x02,0x07,0x02,0x20]
+0x01,0x02,0xa2,0xd2,0x02,0x07,0x02,0x20
+
+# GFX940: v_cvt_pk_fp8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0xa2,0xd2,0x02,0x06,0x01,0x00]
+0x01,0x00,0xa2,0xd2,0x02,0x06,0x01,0x00
+
+# GFX940: v_cvt_pk_fp8_f32 v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0xa2,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x40,0xa2,0xd2,0x02,0x07,0x02,0x00
+
+# GFX940: v_cvt_sr_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0xa5,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x00,0xa5,0xd2,0x02,0x07,0x02,0x00
+
+# GFX940: v_cvt_sr_bf8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0xa5,0xd2,0x02,0x06,0x01,0x00]
+0x01,0x00,0xa5,0xd2,0x02,0x06,0x01,0x00
+
+# GFX940: v_cvt_sr_bf8_f32 v1, v2, v3 op_sel:[0,0,1,1] ; encoding: [0x01,0x60,0xa5,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x60,0xa5,0xd2,0x02,0x07,0x02,0x00
+
+# GFX940: v_cvt_sr_bf8_f32 v1, v2, v3 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0xa5,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x40,0xa5,0xd2,0x02,0x07,0x02,0x00
+
+# GFX940: v_cvt_sr_bf8_f32 v1, -|s2|, v3 ; encoding: [0x01,0x01,0xa5,0xd2,0x02,0x06,0x02,0x20]
+0x01,0x01,0xa5,0xd2,0x02,0x06,0x02,0x20
+
+# GFX940: v_cvt_sr_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0xa4,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x00,0xa4,0xd2,0x02,0x07,0x02,0x00
+
+# GFX940: v_cvt_sr_fp8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0xa4,0xd2,0x02,0x06,0x01,0x00]
+0x01,0x00,0xa4,0xd2,0x02,0x06,0x01,0x00
+
+# GFX940: v_cvt_sr_fp8_f32 v1, v2, v3 op_sel:[0,0,1,1] ; encoding: [0x01,0x60,0xa4,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x60,0xa4,0xd2,0x02,0x07,0x02,0x00
+
+# GFX940: v_cvt_sr_fp8_f32 v1, v2, v3 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0xa4,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x40,0xa4,0xd2,0x02,0x07,0x02,0x00
+
+# GFX940: v_cvt_sr_fp8_f32 v1, -|s2|, v3 ; encoding: [0x01,0x01,0xa4,0xd2,0x02,0x06,0x02,0x20]
+0x01,0x01,0xa4,0xd2,0x02,0x06,0x02,0x20
More information about the cfe-commits
mailing list