[clang] 9fa5a6b - [AMDGPU] Support for gfx940 fp8 conversions

Mon Jul 18 11:48:59 PDT 2022

Author: Stanislav Mekhanoshin
Date: 2022-07-18T11:48:43-07:00
New Revision: 9fa5a6b7e8a292ec91b844a622836d2990ef5796

URL: https://github.com/llvm/llvm-project/commit/9fa5a6b7e8a292ec91b844a622836d2990ef5796
DIFF: https://github.com/llvm/llvm-project/commit/9fa5a6b7e8a292ec91b844a622836d2990ef5796.diff

LOG: [AMDGPU] Support for gfx940 fp8 conversions

Differential Revision: https://reviews.llvm.org/D129902

Added: 
    clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll

Modified: 
    clang/include/clang/Basic/BuiltinsAMDGPU.def
    clang/lib/Basic/Targets/AMDGPU.cpp
    clang/test/CodeGenOpenCL/amdgpu-features.cl
    llvm/include/llvm/IR/IntrinsicsAMDGPU.td
    llvm/lib/Target/AMDGPU/AMDGPU.td
    llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
    llvm/lib/Target/AMDGPU/GCNSubtarget.h
    llvm/lib/Target/AMDGPU/SIInstrInfo.td
    llvm/lib/Target/AMDGPU/VOP1Instructions.td
    llvm/lib/Target/AMDGPU/VOP3Instructions.td
    llvm/test/MC/AMDGPU/gfx940_asm_features.s
    llvm/test/MC/AMDGPU/gfx940_err.s
    llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 68bcf546d177c..e9f25d783e596 100644

--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -346,5 +346,14 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x16_bf16, "V16fV4sV8sV16fiIiIi",
 TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_16x16x64_i8, "V4iV2iV4iV4iiIiIi", "nc", "mai-insts")
 TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_32x32x32_i8, "V16iV2iV4iV16iiIiIi", "nc", "mai-insts")
 
+TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_bf8, "fiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_fp8, "fiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_bf8, "V2fiIb", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_fp8, "V2fiIb", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_bf8_f32, "iffiIb", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", "fp8-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-insts")
+
 #undef BUILTIN
 #undef TARGET_BUILTIN

diff  --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp
index 50256d8e210c9..80f2601b0a245 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -250,6 +250,7 @@ bool AMDGPUTargetInfo::initFeatureMap(
       break;
     case GK_GFX940:
       Features["gfx940-insts"] = true;
+      Features["fp8-insts"] = true;
       LLVM_FALLTHROUGH;
     case GK_GFX90A:
       Features["gfx90a-insts"] = true;

diff  --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index cb3a3eff01f70..ff288e530d17f 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -64,7 +64,7 @@
 // GFX909: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
 // GFX90A: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst"
 // GFX90C: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
-// GFX940: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst"
+// GFX940: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst"
 // GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
 // GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
 // GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"

diff  --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl
new file mode 100644
index 0000000000000..56d757012a5e7
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl
@@ -0,0 +1,60 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940
+
+typedef float  v2f   __attribute__((ext_vector_type(2)));
+
+// CHECK-GFX940-LABEL: @test_cvt_f32_bf8
+// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0)
+void test_cvt_f32_bf8(global int* out, int a)
+{
+  *out = __builtin_amdgcn_cvt_f32_bf8(a, 0);
+}
+
+// CHECK-GFX940-LABEL: @test_cvt_f32_fp8
+// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
+void test_cvt_f32_fp8(global int* out, int a)
+{
+  *out = __builtin_amdgcn_cvt_f32_fp8(a, 1);
+}
+
+// CHECK-GFX940-LABEL: @test_cvt_pk_f32_bf8
+// CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false)
+void test_cvt_pk_f32_bf8(global v2f* out, int a)
+{
+  *out = __builtin_amdgcn_cvt_pk_f32_bf8(a, false);
+}
+
+// CHECK-GFX940-LABEL: @test_cvt_pk_f32_fp8
+// CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true)
+void test_cvt_pk_f32_fp8(global v2f* out, int a)
+{
+  *out = __builtin_amdgcn_cvt_pk_f32_fp8(a, true);
+}
+
+// CHECK-GFX940-LABEL: @test_cvt_pk_bf8_f32
+// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false)
+void test_cvt_pk_bf8_f32(global int* out, int old, float a, float b)
+{
+  *out = __builtin_amdgcn_cvt_pk_bf8_f32(a, b, old, false);
+}
+
+// CHECK-GFX940-LABEL: @test_cvt_pk_fp8_f32
+// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true)
+void test_cvt_pk_fp8_f32(global int* out, int old, float a, float b)
+{
+  *out = __builtin_amdgcn_cvt_pk_fp8_f32(a, b, old, true);
+}
+
+// CHECK-GFX940-LABEL: @test_cvt_sr_bf8_f32
+// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2)
+void test_cvt_sr_bf8_f32(global int* out, int old, float a, int b)
+{
+  *out = __builtin_amdgcn_cvt_sr_bf8_f32(a, b, old, 2);
+}
+
+// CHECK-GFX940-LABEL: @test_cvt_sr_fp8_f32
+// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3)
+void test_cvt_sr_fp8_f32(global int* out, int old, float a, int b)
+{
+  *out = __builtin_amdgcn_cvt_sr_fp8_f32(a, b, old, 3);
+}

diff  --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index e81224d9b8906..2936e495bb2df 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2320,6 +2320,58 @@ def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty,
 def int_amdgcn_smfmac_i32_16x16x64_i8   : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty,  llvm_v2i32_ty, llvm_v4i32_ty>;
 def int_amdgcn_smfmac_i32_32x32x32_i8   : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
 
+// llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3]
+// byte_sel selects byte from srcA.
+def int_amdgcn_cvt_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_bf8">,
+  Intrinsic<[llvm_float_ty],
+            [llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
+
+// llvm.amdgcn.cvt.f32.fp8 float vdst, int srcA, imm byte_sel [0..3]
+def int_amdgcn_cvt_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_fp8">,
+  Intrinsic<[llvm_float_ty],
+            [llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
+
+// llvm.amdgcn.cvt.pk.f32.bf8 float2 vdst, int srcA, imm word_sel
+// word_sel = 1 selects 2 high bytes, 0 selects 2 low bytes.
+def int_amdgcn_cvt_pk_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_bf8">,
+  Intrinsic<[llvm_v2f32_ty],
+            [llvm_i32_ty, llvm_i1_ty],
+            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
+
+// llvm.amdgcn.cvt.pk.f32.fp8 float2 vdst, int srcA, imm word_sel.
+def int_amdgcn_cvt_pk_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_fp8">,
+  Intrinsic<[llvm_v2f32_ty],
+            [llvm_i32_ty, llvm_i1_ty],
+            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
+
+// llvm.amdgcn.cvt.pk.bf8.f32 int vdst, float srcA, float srcB, int old, imm word_sel
+// word_sel = 1 selects 2 high bytes in the vdst, 0 selects 2 low bytes.
+def int_amdgcn_cvt_pk_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_bf8_f32">,
+  Intrinsic<[llvm_i32_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
+            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<3>>]>;
+
+// llvm.amdgcn.cvt.pk.fp8.f32 int vdst, float srcA, float srcB, int old, imm word_sel
+def int_amdgcn_cvt_pk_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f32">,
+  Intrinsic<[llvm_i32_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
+            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<3>>]>;
+
+// llvm.amdgcn.cvt.sr.bf8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
+// byte_sel selects byte to write into vdst.
+def int_amdgcn_cvt_sr_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f32">,
+  Intrinsic<[llvm_i32_ty],
+            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<3>>]>;
+
+// llvm.amdgcn.cvt.sr.fp8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
+def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
+  Intrinsic<[llvm_i32_ty],
+            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<3>>]>;
+
 //===----------------------------------------------------------------------===//
 // Special Intrinsics for backend internal use only. No frontend
 // should emit calls to these.

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 48b5814cd4820..49471478286f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -585,6 +585,12 @@ def FeatureMAIInsts : SubtargetFeature<"mai-insts",
   "Has mAI instructions"
 >;
 
+def FeatureFP8Insts : SubtargetFeature<"fp8-insts",
+  "HasFP8Insts",
+  "true",
+  "Has fp8 and bf8 instructions"
+>;
+
 def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst",
   "HasPkFmacF16Inst",
   "true",
@@ -1124,6 +1130,7 @@ def FeatureISAVersion9_4_0 : FeatureSet<
    Feature64BitDPP,
    FeaturePackedFP32Ops,
    FeatureMAIInsts,
+   FeatureFP8Insts,
    FeaturePkFmacF16Inst,
    FeatureAtomicFaddRtnInsts,
    FeatureAtomicFaddNoRtnInsts,
@@ -1704,6 +1711,9 @@ def HasSMemTimeInst : Predicate<"Subtarget->hasSMemTimeInst()">,
 def HasShaderCyclesRegister : Predicate<"Subtarget->hasShaderCyclesRegister()">,
   AssemblerPredicate<(all_of FeatureShaderCyclesRegister)>;
 
+def HasFP8Insts : Predicate<"Subtarget->hasFP8Insts()">,
+  AssemblerPredicate<(all_of FeatureFP8Insts)>;
+
 def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">,
   AssemblerPredicate<(all_of FeaturePkFmacF16Inst)>;
 

diff  --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 8b2e51f836fc5..0a5feca0289c5 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -8257,6 +8257,12 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
 
   const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0;
 
+  if (Opc == AMDGPU::V_CVT_SR_BF8_F32_vi ||
+      Opc == AMDGPU::V_CVT_SR_FP8_F32_vi) {
+    Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods
+    Inst.addOperand(Inst.getOperand(0));
+  }
+
   if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in) != -1) {
     assert(!IsPacked);
     Inst.addOperand(Inst.getOperand(0));
@@ -9061,12 +9067,27 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
     // v_nop_sdwa_sdwa_vi/gfx9 has no optional sdwa arguments
     switch (BasicInstType) {
     case SIInstrFlags::VOP1:
-      addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
-      if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
-        addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0);
+      if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+                                     AMDGPU::OpName::clamp) != -1) {
+        addOptionalImmOperand(Inst, Operands, OptionalIdx,
+                              AMDGPUOperand::ImmTyClampSI, 0);
+      }
+      if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+                                     AMDGPU::OpName::omod) != -1) {
+        addOptionalImmOperand(Inst, Operands, OptionalIdx,
+                              AMDGPUOperand::ImmTyOModSI, 0);
+      }
+      if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+                                     AMDGPU::OpName::dst_sel) != -1) {
+        addOptionalImmOperand(Inst, Operands, OptionalIdx,
+                              AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
+      }
+      if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+                                     AMDGPU::OpName::dst_unused) != -1) {
+        addOptionalImmOperand(Inst, Operands, OptionalIdx,
+                              AMDGPUOperand::ImmTySdwaDstUnused,
+                              DstUnused::UNUSED_PRESERVE);
       }
-      addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
-      addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE);
       addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
       break;
 

diff  --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index d269d0945f3b1..8352337bb20a7 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -145,6 +145,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasDot7Insts = false;
   bool HasDot8Insts = false;
   bool HasMAIInsts = false;
+  bool HasFP8Insts = false;
   bool HasPkFmacF16Inst = false;
   bool HasAtomicFaddRtnInsts = false;
   bool HasAtomicFaddNoRtnInsts = false;
@@ -721,6 +722,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return HasMAIInsts;
   }
 
+  bool hasFP8Insts() const {
+    return HasFP8Insts;
+  }
+
   bool hasPkFmacF16Inst() const {
     return HasPkFmacF16Inst;
   }

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 23afd6556bc9b..ffbd08514deba 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -324,7 +324,8 @@ class isFloatType<ValueType SrcVT> {
 
 // XXX - do v2i16 instructions?
 class isIntType<ValueType SrcVT> {
-  bit ret = !or(!eq(SrcVT.Value, i16.Value),
+  bit ret = !or(!eq(SrcVT.Value, i8.Value),
+                !eq(SrcVT.Value, i16.Value),
                 !eq(SrcVT.Value, i32.Value),
                 !eq(SrcVT.Value, i64.Value),
                 !eq(SrcVT.Value, v4i16.Value),
@@ -1411,6 +1412,10 @@ class IntSDWAInputModsMatchClass <int opSize> : AsmOperandClass {
 
 def Int16SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<16>;
 def Int32SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<32>;
+def Bin32SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<32> {
+  let Name = "SDWAWithBin32InputMods";
+  let ParserMethod = "parseRegOrImm";
+}
 
 class IntSDWAInputMods <IntSDWAInputModsMatchClass matchClass> :
   InputMods <matchClass> {
@@ -1419,6 +1424,7 @@ class IntSDWAInputMods <IntSDWAInputModsMatchClass matchClass> :
 
 def Int16SDWAInputMods : IntSDWAInputMods<Int16SDWAInputModsMatchClass>;
 def Int32SDWAInputMods : IntSDWAInputMods<Int32SDWAInputModsMatchClass>;
+def Bin32SDWAInputMods : IntSDWAInputMods<Bin32SDWAInputModsMatchClass>;
 
 def IntVRegInputModsMatchClass : AsmOperandClass {
   let Name = "VRegWithIntInputMods";

diff  --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 1d374a9f90ba9..73e4eb8cdc240 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -499,6 +499,59 @@ let SubtargetPredicate = isGFX9Only in {
   defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>;
 } // End SubtargetPredicate = isGFX9Only
 
+class VOPProfile_Base_CVT_F32_F8<ValueType vt> : VOPProfileI2F <vt, i32> {
+  let HasExtSDWA = 1;
+  let HasExtSDWA9 = 1;
+  let HasExt = 1;
+  let DstRCSDWA = getVALUDstForVT<vt>.ret;
+  let InsSDWA = (ins Bin32SDWAInputMods:$src0_modifiers, Src0SDWA:$src0,
+                     clampmod:$clamp, omod:$omod, src0_sel:$src0_sel);
+  let AsmSDWA = "$vdst, $src0_modifiers$clamp$omod $src0_sel"; // No dst_sel
+  let AsmSDWA9 = AsmSDWA;
+  let EmitDstSel = 0;
+}
+
+def VOPProfileCVT_F32_F8    : VOPProfile_Base_CVT_F32_F8 <f32>;
+def VOPProfileCVT_PK_F32_F8 : VOPProfile_Base_CVT_F32_F8 <v2f32>;
+
+let SubtargetPredicate = HasFP8Insts, mayRaiseFPException = 0,
+    SchedRW = [WriteFloatCvt] in {
+  defm V_CVT_F32_FP8    : VOP1Inst<"v_cvt_f32_fp8", VOPProfileCVT_F32_F8>;
+  defm V_CVT_F32_BF8    : VOP1Inst<"v_cvt_f32_bf8", VOPProfileCVT_F32_F8>;
+  defm V_CVT_PK_F32_FP8 : VOP1Inst<"v_cvt_pk_f32_fp8", VOPProfileCVT_PK_F32_F8>;
+  defm V_CVT_PK_F32_BF8 : VOP1Inst<"v_cvt_pk_f32_bf8", VOPProfileCVT_PK_F32_F8>;
+}
+
+class Cvt_F32_F8_Pat<SDPatternOperator node, int index,
+    VOP1_Pseudo inst_e32, VOP1_SDWA_Pseudo inst_sdwa> : GCNPat<
+    (f32 (node i32:$src, index)),
+    !if (index,
+         (inst_sdwa 0, $src, 0, 0, index),
+         (inst_e32 $src))
+>;
+
+foreach Index = [0, 1, 2, 3] in {
+  def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_fp8, Index,
+                       V_CVT_F32_FP8_e32, V_CVT_F32_FP8_sdwa>;
+  def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_bf8, Index,
+                       V_CVT_F32_BF8_e32, V_CVT_F32_BF8_sdwa>;
+}
+
+class Cvt_PK_F32_F8_Pat<SDPatternOperator node, int index,
+    VOP1_Pseudo inst_e32, VOP1_SDWA_Pseudo inst_sdwa> : GCNPat<
+    (v2f32 (node i32:$src, index)),
+    !if (index,
+         (inst_sdwa 0, $src, 0, 0, SDWA.WORD_1),
+         (inst_e32 $src))
+>;
+
+foreach Index = [0, -1] in {
+  def : Cvt_PK_F32_F8_Pat<int_amdgcn_cvt_pk_f32_fp8, Index,
+                          V_CVT_PK_F32_FP8_e32, V_CVT_PK_F32_FP8_sdwa>;
+  def : Cvt_PK_F32_F8_Pat<int_amdgcn_cvt_pk_f32_bf8, Index,
+                          V_CVT_PK_F32_BF8_e32, V_CVT_PK_F32_BF8_sdwa>;
+}
+
 let SubtargetPredicate = isGFX10Plus in {
   defm V_PIPEFLUSH        : VOP1Inst<"v_pipeflush", VOP_NO_EXT<VOP_NONE>>;
 
@@ -1106,11 +1159,36 @@ multiclass VOP1_Real_gfx9 <bits<10> op> {
 
 }
 
+multiclass VOP1_Real_NoDstSel_SDWA_gfx9 <bits<10> op> {
+  let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in {
+    defm NAME : VOP1_Real_e32e64_vi <op>;
+  }
+
+  foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
+  def _sdwa_gfx9 :
+    VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
+    VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
+      let Inst{42-40} = 6;
+    }
+
+  foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_gfx9 :
+      VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
+      VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
+}
+
 defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
 
 let AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9" in
 defm V_MOV_B64 : VOP1_Real_gfx9 <0x38>;
 
+let OtherPredicates = [HasFP8Insts] in {
+defm V_CVT_F32_FP8       : VOP1_Real_NoDstSel_SDWA_gfx9<0x54>;
+defm V_CVT_F32_BF8       : VOP1_Real_NoDstSel_SDWA_gfx9<0x55>;
+defm V_CVT_PK_F32_FP8    : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>;
+defm V_CVT_PK_F32_BF8    : VOP1_Real_NoDstSel_SDWA_gfx9<0x57>;
+}
+
 //===----------------------------------------------------------------------===//
 // GFX10
 //===----------------------------------------------------------------------===//

diff  --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index dddd0aacc1409..7788f1d82273d 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -481,6 +481,30 @@ def shl_0_to_4 : PatFrag<
   }];
 }
 
+def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> {
+  let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
+                          FP32InputMods:$src1_modifiers, Src1RC64:$src1,
+                          VGPR_32:$vdst_in, op_sel0:$op_sel);
+  let HasClamp = 0;
+  let HasExtVOP3DPP = 0;
+}
+
+def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
+                                              VOP3_OPSEL> {
+  let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
+                          FP32InputMods:$src1_modifiers, Src1RC64:$src1,
+                          FP32InputMods:$src2_modifiers, VGPR_32:$src2,
+                          op_sel0:$op_sel);
+  let HasClamp = 0;
+  let HasSrc2 = 0;
+  let HasSrc2Mods = 1;
+  let AsmVOP3OpSel = !subst(", $src2_modifiers", "",
+                            getAsmVOP3OpSel<3, HasClamp,
+                                            HasSrc0FloatMods, HasSrc1FloatMods,
+                                            HasSrc2FloatMods>.ret);
+  let HasExtVOP3DPP = 0;
+}
+
 let SubtargetPredicate = isGFX9Plus in {
 let isCommutable = 1, isReMaterializable = 1 in {
   defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -526,6 +550,43 @@ defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32
 let SubtargetPredicate = isGFX940Plus in
 defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", VOP3_Profile<VOP_I64_I64_I32_I64>>;
 
+let SubtargetPredicate = HasFP8Insts, mayRaiseFPException = 0,
+    SchedRW = [WriteFloatCvt] in {
+  let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
+    defm V_CVT_PK_FP8_F32 : VOP3Inst<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile>;
+    defm V_CVT_PK_BF8_F32 : VOP3Inst<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile>;
+  }
+
+  // These instructions have non-standard use of op_sel. In particular they are
+  // using op_sel bits 2 and 3 while only having two sources. Therefore a dummy
+  // src2 is used to hold the op_sel value.
+  let Constraints = "$vdst = $src2", DisableEncoding = "$src2" in {
+    defm V_CVT_SR_FP8_F32 : VOP3Inst<"v_cvt_sr_fp8_f32", VOP3_CVT_SR_F8_F32_Profile>;
+    defm V_CVT_SR_BF8_F32 : VOP3Inst<"v_cvt_sr_bf8_f32", VOP3_CVT_SR_F8_F32_Profile>;
+  }
+}
+
+class Cvt_PK_F8_F32_Pat<SDPatternOperator node, int index, VOP3_Pseudo inst> : GCNPat<
+    (i32 (node f32:$src0, f32:$src1, i32:$old, index)),
+    (inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, !if(index, SRCMODS.OP_SEL_0, 0))
+>;
+
+class Cvt_SR_F8_F32_Pat<SDPatternOperator node, bits<2> index, VOP3_Pseudo inst> : GCNPat<
+    (i32 (node f32:$src0, i32:$src1, i32:$old, index)),
+    (inst !if(index{1}, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1,
+          !if(index{0}, SRCMODS.OP_SEL_0, 0), $old, !if(index{1}, SRCMODS.OP_SEL_0, 0))
+>;
+
+foreach Index = [0, -1] in {
+  def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_e64>;
+  def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_e64>;
+}
+
+foreach Index = [0, 1, 2, 3] in {
+  def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_fp8_f32, Index, V_CVT_SR_FP8_F32_e64>;
+  def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_bf8_f32, Index, V_CVT_SR_BF8_F32_e64>;
+}
+
 class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat <
   // This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions.
   (ThreeOpFrag<op1, op2> i32:$src0, i32:$src1, i32:$src2),
@@ -1161,6 +1222,13 @@ multiclass VOP3OpSel_Real_gfx9<bits<10> op> {
             VOP3OpSel_gfx9 <op, !cast<VOP_Pseudo>(NAME#"_e64").Pfl>;
 }
 
+multiclass VOP3OpSel_Real_gfx9_forced_opsel2<bits<10> op> {
+  def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+            VOP3OpSel_gfx9 <op, !cast<VOP_Pseudo>(NAME#"_e64").Pfl> {
+    let Inst{13} = src2_modifiers{2}; // op_sel(2)
+  }
+}
+
 multiclass VOP3Interp_Real_vi<bits<10> op> {
   def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.VI>,
             VOP3Interp_vi <op, !cast<VOP_Pseudo>(NAME).Pfl>;
@@ -1352,3 +1420,10 @@ defm V_CVT_PKNORM_I16_F16 : VOP3OpSel_Real_gfx9 <0x299>;
 defm V_CVT_PKNORM_U16_F16 : VOP3OpSel_Real_gfx9 <0x29a>;
 
 defm V_LSHL_ADD_U64 : VOP3_Real_vi <0x208>;
+
+let OtherPredicates = [HasFP8Insts] in {
+defm V_CVT_PK_FP8_F32 : VOP3OpSel_Real_gfx9 <0x2a2>;
+defm V_CVT_PK_BF8_F32 : VOP3OpSel_Real_gfx9 <0x2a3>;
+defm V_CVT_SR_FP8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a4>;
+defm V_CVT_SR_BF8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a5>;
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
new file mode 100644
index 0000000000000..6261a08bc6191
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
@@ -0,0 +1,190 @@
+; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32)
+declare float @llvm.amdgcn.cvt.f32.fp8(i32, i32)
+declare <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32, i1)
+declare <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32, i1)
+declare i32 @llvm.amdgcn.cvt.pk.bf8.f32(float, float, i32, i1)
+declare i32 @llvm.amdgcn.cvt.pk.fp8.f32(float, float, i32, i1)
+declare i32 @llvm.amdgcn.cvt.sr.bf8.f32(float, i32, i32, i32)
+declare i32 @llvm.amdgcn.cvt.sr.fp8.f32(float, i32, i32, i32)
+
+; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte0:
+; GCN: v_cvt_f32_bf8_e32 v0, v0{{$}}
+define float @test_cvt_f32_bf8_byte0(i32 %a) {
+  %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0)
+  ret float %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte1:
+; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_1
+define float @test_cvt_f32_bf8_byte1(i32 %a) {
+  %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 1)
+  ret float %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte2:
+; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_2
+define float @test_cvt_f32_bf8_byte2(i32 %a) {
+  %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 2)
+  ret float %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte3:
+; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_3
+define float @test_cvt_f32_bf8_byte3(i32 %a) {
+  %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 3)
+  ret float %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte0:
+; GCN: v_cvt_f32_fp8_e32 v0, v0{{$}}
+define float @test_cvt_f32_fp8_byte0(i32 %a) {
+  %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 0)
+  ret float %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte1:
+; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_1
+define float @test_cvt_f32_fp8_byte1(i32 %a) {
+  %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
+  ret float %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte2:
+; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_2
+define float @test_cvt_f32_fp8_byte2(i32 %a) {
+  %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 2)
+  ret float %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte3:
+; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_3
+define float @test_cvt_f32_fp8_byte3(i32 %a) {
+  %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 3)
+  ret float %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_pk_f32_bf8_word0:
+; GCN: v_cvt_pk_f32_bf8_e32 v[0:1], v0{{$}}
+define <2 x float> @test_cvt_pk_f32_bf8_word0(i32 %a) {
+  %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false)
+  ret <2 x float> %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_pk_f32_bf8_word1:
+; GCN: v_cvt_pk_f32_bf8_sdwa v[0:1], v0 src0_sel:WORD_1
+define <2 x float> @test_cvt_pk_f32_bf8_word1(i32 %a) {
+  %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 true)
+  ret <2 x float> %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_pk_f32_fp8_word0:
+; GCN: v_cvt_pk_f32_fp8_e32 v[0:1], v0{{$}}
+define <2 x float> @test_cvt_pk_f32_fp8_word0(i32 %a) {
+  %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 false)
+  ret <2 x float> %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_pk_f32_fp8_word1:
+; GCN: v_cvt_pk_f32_fp8_sdwa v[0:1], v0 src0_sel:WORD_1
+define <2 x float> @test_cvt_pk_f32_fp8_word1(i32 %a) {
+  %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true)
+  ret <2 x float> %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_pk_bf8_f32_word0:
+; GCN: v_cvt_pk_bf8_f32 v2, v0, v1{{$}}
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_pk_bf8_f32_word0(float %x, float %y, i32 %old) {
+  %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 false)
+  ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_pk_bf8_f32_word1:
+; GCN: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1]
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_pk_bf8_f32_word1(float %x, float %y, i32 %old) {
+  %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 true)
+  ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_pk_fp8_f32_word0:
+; GCN: v_cvt_pk_fp8_f32 v2, v0, v1{{$}}
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_pk_fp8_f32_word0(float %x, float %y, i32 %old) {
+  %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 false)
+  ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_pk_fp8_f32_word1:
+; GCN: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1]
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_pk_fp8_f32_word1(float %x, float %y, i32 %old) {
+  %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 true)
+  ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte0:
+; GCN: v_cvt_sr_bf8_f32 v2, v0, v1{{$}}
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_sr_bf8_f32_byte0(float %x, i32 %r, i32 %old) {
+  %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 0)
+  ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte1:
+; GCN: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0]
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_sr_bf8_f32_byte1(float %x, i32 %r, i32 %old) {
+  %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 1)
+  ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte2:
+; GCN: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1]
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_sr_bf8_f32_byte2(float %x, i32 %r, i32 %old) {
+  %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 2)
+  ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte3:
+; GCN: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1]
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_sr_bf8_f32_byte3(float %x, i32 %r, i32 %old) {
+  %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 3)
+  ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte0:
+; GCN: v_cvt_sr_fp8_f32 v2, v0, v1{{$}}
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_sr_fp8_f32_byte0(float %x, i32 %r, i32 %old) {
+  %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 0)
+  ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte1:
+; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0]
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_sr_fp8_f32_byte1(float %x, i32 %r, i32 %old) {
+  %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 1)
+  ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte2:
+; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1]
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_sr_fp8_f32_byte2(float %x, i32 %r, i32 %old) {
+  %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 2)
+  ret i32 %ret
+}
+
+; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte3:
+; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1]
+; GCN: v_mov_b32_e32 v0, v2
+define i32 @test_cvt_sr_fp8_f32_byte3(float %x, i32 %r, i32 %old) {
+  %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 3)
+  ret i32 %ret
+}

diff  --git a/llvm/test/MC/AMDGPU/gfx940_asm_features.s b/llvm/test/MC/AMDGPU/gfx940_asm_features.s
index 28d0c0311a3c0..a000955deed5c 100644
--- a/llvm/test/MC/AMDGPU/gfx940_asm_features.s
+++ b/llvm/test/MC/AMDGPU/gfx940_asm_features.s
@@ -401,3 +401,211 @@ buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 sc1
 // GFX10:  error: instruction not supported on this GPU
 // GFX940: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 sc1 ; encoding: [0x00,0x80,0x40,0xe1,0x00,0x04,0x02,0x03]
 buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 sc1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_bf8_e32 v1, s3                ; encoding: [0x03,0xaa,0x02,0x7e]
+v_cvt_f32_bf8 v1, s3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_bf8_e32 v1, 3                 ; encoding: [0x83,0xaa,0x02,0x7e]
+v_cvt_f32_bf8 v1, 3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_bf8_e32 v1, v3                ; encoding: [0x03,0xab,0x02,0x7e]
+v_cvt_f32_bf8 v1, v3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_bf8_sdwa v1, s3 src0_sel:BYTE_1 ; encoding: [0xf9,0xaa,0x02,0x7e,0x03,0x06,0x81,0x00]
+v_cvt_f32_bf8 v1, s3 src0_sel:BYTE_1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_bf8_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x02,0x7e,0x03,0x58,0x00,0xff]
+v_cvt_f32_bf8 v1, v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_bf8_e64 v1, s3 mul:2          ; encoding: [0x01,0x00,0x95,0xd1,0x03,0x00,0x00,0x08]
+v_cvt_f32_bf8 v1, s3 mul:2
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_bf8_sdwa v1, s3 clamp mul:2 src0_sel:BYTE_1 ; encoding: [0xf9,0xaa,0x02,0x7e,0x03,0x66,0x81,0x00]
+v_cvt_f32_bf8 v1, s3 clamp mul:2 src0_sel:BYTE_1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_bf8_e64 v1, s3 clamp          ; encoding: [0x01,0x80,0x95,0xd1,0x03,0x00,0x00,0x00]
+v_cvt_f32_bf8 v1, s3 clamp
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_fp8_e32 v1, s3                ; encoding: [0x03,0xa8,0x02,0x7e]
+v_cvt_f32_fp8 v1, s3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_fp8_e32 v1, 3                 ; encoding: [0x83,0xa8,0x02,0x7e]
+v_cvt_f32_fp8 v1, 3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_fp8_e32 v1, v3                ; encoding: [0x03,0xa9,0x02,0x7e]
+v_cvt_f32_fp8 v1, v3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_fp8_sdwa v1, s3 src0_sel:BYTE_1 ; encoding: [0xf9,0xa8,0x02,0x7e,0x03,0x06,0x81,0x00]
+v_cvt_f32_fp8 v1, s3 src0_sel:BYTE_1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_fp8_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x02,0x7e,0x03,0x58,0x00,0xff]
+v_cvt_f32_fp8 v1, v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_fp8_e64 v1, s3 mul:2          ; encoding: [0x01,0x00,0x94,0xd1,0x03,0x00,0x00,0x08]
+v_cvt_f32_fp8 v1, s3 mul:2
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_fp8_sdwa v1, s3 clamp mul:2 src0_sel:BYTE_1 ; encoding: [0xf9,0xa8,0x02,0x7e,0x03,0x66,0x81,0x00]
+v_cvt_f32_fp8 v1, s3 clamp mul:2 src0_sel:BYTE_1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_fp8_e64 v1, s3 clamp          ; encoding: [0x01,0x80,0x94,0xd1,0x03,0x00,0x00,0x00]
+v_cvt_f32_fp8 v1, s3 clamp
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_f32_fp8_sdwa v1, 3 src0_sel:BYTE_1 ; encoding: [0xf9,0xa8,0x02,0x7e,0x83,0x06,0x81,0x00]
+v_cvt_f32_fp8 v1, 3 src0_sel:BYTE_1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_bf8_e32 v[2:3], s3         ; encoding: [0x03,0xae,0x04,0x7e]
+v_cvt_pk_f32_bf8 v[2:3], s3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_bf8_e32 v[2:3], 3          ; encoding: [0x83,0xae,0x04,0x7e]
+v_cvt_pk_f32_bf8 v[2:3], 3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_bf8_e32 v[2:3], v3         ; encoding: [0x03,0xaf,0x04,0x7e]
+v_cvt_pk_f32_bf8 v[2:3], v3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_bf8_sdwa v[2:3], s3 src0_sel:WORD_1 ; encoding: [0xf9,0xae,0x04,0x7e,0x03,0x06,0x85,0x00]
+v_cvt_pk_f32_bf8 v[2:3], s3 src0_sel:WORD_1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_bf8_dpp v[0:1], v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x00,0x7e,0x03,0x58,0x00,0xff]
+v_cvt_pk_f32_bf8 v[0:1], v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_bf8_e64 v[2:3], s3 mul:2   ; encoding: [0x02,0x00,0x97,0xd1,0x03,0x00,0x00,0x08]
+v_cvt_pk_f32_bf8 v[2:3], s3 mul:2
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_bf8_sdwa v[2:3], s3 clamp mul:2 src0_sel:WORD_1 ; encoding: [0xf9,0xae,0x04,0x7e,0x03,0x66,0x85,0x00]
+v_cvt_pk_f32_bf8 v[2:3], s3 clamp mul:2 src0_sel:WORD_1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_bf8_e64 v[2:3], s3 clamp   ; encoding: [0x02,0x80,0x97,0xd1,0x03,0x00,0x00,0x00]
+v_cvt_pk_f32_bf8 v[2:3], s3 clamp
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_fp8_e32 v[2:3], s3         ; encoding: [0x03,0xac,0x04,0x7e]
+v_cvt_pk_f32_fp8 v[2:3], s3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_fp8_e32 v[2:3], 3          ; encoding: [0x83,0xac,0x04,0x7e]
+v_cvt_pk_f32_fp8 v[2:3], 3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_fp8_e32 v[2:3], v3         ; encoding: [0x03,0xad,0x04,0x7e]
+v_cvt_pk_f32_fp8 v[2:3], v3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_fp8_sdwa v[2:3], s3 src0_sel:WORD_1 ; encoding: [0xf9,0xac,0x04,0x7e,0x03,0x06,0x85,0x00]
+v_cvt_pk_f32_fp8 v[2:3], s3 src0_sel:WORD_1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_fp8_sdwa v[2:3], 3 src0_sel:WORD_1 ; encoding: [0xf9,0xac,0x04,0x7e,0x83,0x06,0x85,0x00]
+v_cvt_pk_f32_fp8 v[2:3], 3 src0_sel:WORD_1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_fp8_dpp v[0:1], v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x00,0x7e,0x03,0x58,0x00,0xff]
+v_cvt_pk_f32_fp8 v[0:1], v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_fp8_e64 v[2:3], s3 mul:2   ; encoding: [0x02,0x00,0x96,0xd1,0x03,0x00,0x00,0x08]
+v_cvt_pk_f32_fp8 v[2:3], s3 mul:2
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_fp8_sdwa v[2:3], s3 clamp mul:2 src0_sel:WORD_1 ; encoding: [0xf9,0xac,0x04,0x7e,0x03,0x66,0x85,0x00]
+v_cvt_pk_f32_fp8 v[2:3], s3 clamp mul:2 src0_sel:WORD_1
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_f32_fp8_e64 v[2:3], s3 clamp   ; encoding: [0x02,0x80,0x96,0xd1,0x03,0x00,0x00,0x00]
+v_cvt_pk_f32_fp8 v[2:3], s3 clamp
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_bf8_f32 v1, v2, v3             ; encoding: [0x01,0x00,0xa3,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_pk_bf8_f32 v1, v2, v3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_bf8_f32 v1, -v2, |v3|          ; encoding: [0x01,0x02,0xa3,0xd2,0x02,0x07,0x02,0x20]
+v_cvt_pk_bf8_f32 v1, -v2, |v3|
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_bf8_f32 v1, s2, 3              ; encoding: [0x01,0x00,0xa3,0xd2,0x02,0x06,0x01,0x00]
+v_cvt_pk_bf8_f32 v1, s2, 3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_bf8_f32 v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0xa3,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_pk_bf8_f32 v1, v2, v3 op_sel:[0,0,1]
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_fp8_f32 v1, v2, v3             ; encoding: [0x01,0x00,0xa2,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_pk_fp8_f32 v1, v2, v3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_fp8_f32 v1, -v2, |v3|          ; encoding: [0x01,0x02,0xa2,0xd2,0x02,0x07,0x02,0x20]
+v_cvt_pk_fp8_f32 v1, -v2, |v3|
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_fp8_f32 v1, s2, 3              ; encoding: [0x01,0x00,0xa2,0xd2,0x02,0x06,0x01,0x00]
+v_cvt_pk_fp8_f32 v1, s2, 3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_pk_fp8_f32 v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0xa2,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_pk_fp8_f32 v1, v2, v3 op_sel:[0,0,1]
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_sr_bf8_f32 v1, v2, v3             ; encoding: [0x01,0x00,0xa5,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_sr_bf8_f32 v1, v2, v3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_sr_bf8_f32 v1, s2, 3              ; encoding: [0x01,0x00,0xa5,0xd2,0x02,0x06,0x01,0x00]
+v_cvt_sr_bf8_f32 v1, s2, 3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_sr_bf8_f32 v1, v2, v3 op_sel:[0,0,1,1] ; encoding: [0x01,0x60,0xa5,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_sr_bf8_f32 v1, v2, v3 op_sel:[0,0,1,1]
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_sr_bf8_f32 v1, v2, v3 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0xa5,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_sr_bf8_f32 v1, v2, v3 op_sel:[0,0,0,1]
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_sr_bf8_f32 v1, -|s2|, v3          ; encoding: [0x01,0x01,0xa5,0xd2,0x02,0x06,0x02,0x20]
+v_cvt_sr_bf8_f32 v1, -|s2|, v3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_sr_fp8_f32 v1, v2, v3             ; encoding: [0x01,0x00,0xa4,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_sr_fp8_f32 v1, v2, v3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_sr_fp8_f32 v1, s2, 3              ; encoding: [0x01,0x00,0xa4,0xd2,0x02,0x06,0x01,0x00]
+v_cvt_sr_fp8_f32 v1, s2, 3
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_sr_fp8_f32 v1, v2, v3 op_sel:[0,0,1,1] ; encoding: [0x01,0x60,0xa4,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_sr_fp8_f32 v1, v2, v3 op_sel:[0,0,1,1]
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_sr_fp8_f32 v1, v2, v3 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0xa4,0xd2,0x02,0x07,0x02,0x00]
+v_cvt_sr_fp8_f32 v1, v2, v3 op_sel:[0,0,0,1]
+
+// NOT-GFX940: error: instruction not supported on this GPU
+// GFX940: v_cvt_sr_fp8_f32 v1, -|s2|, v3          ; encoding: [0x01,0x01,0xa4,0xd2,0x02,0x06,0x02,0x20]
+v_cvt_sr_fp8_f32 v1, -|s2|, v3

diff  --git a/llvm/test/MC/AMDGPU/gfx940_err.s b/llvm/test/MC/AMDGPU/gfx940_err.s
index da38629c5afc2..784680342d14a 100644
--- a/llvm/test/MC/AMDGPU/gfx940_err.s
+++ b/llvm/test/MC/AMDGPU/gfx940_err.s
@@ -72,6 +72,30 @@ buffer_wbl2 scc
 v_dot2_u32_u16 v0, 1, v0, s2 op_sel:[0,1,0,1] op_sel_hi:[0,0,1,1]
 // GFX940: error: invalid op_sel operand
 
+v_cvt_f32_fp8 v1, sext(v3) src0_sel:BYTE_1
+// GFX940: error: not a valid operand.
+
+v_cvt_pk_f32_bf8 v[2:3], sext(v3) src0_sel:BYTE_1
+// GFX940: error: not a valid operand.
+
+v_cvt_sr_bf8_f32 v1, v2, -v3
+// GFX940: error: not a valid operand.
+
+v_cvt_sr_fp8_f32 v1, v2, -v3
+// GFX940: error: not a valid operand.
+
+v_cvt_sr_fp8_f32 v1, v2, v3 clamp
+// GFX940: error: invalid operand for instruction
+
+v_cvt_sr_fp8_f32 v1, v2, v3 mul:2
+// GFX940: error: invalid operand for instruction
+
+v_cvt_pk_fp8_f32 v1, v2, v3 clamp
+// GFX940: error: invalid operand for instruction
+
+v_cvt_pk_fp8_f32 v1, v2, v3 mul:2
+// GFX940: error: invalid operand for instruction
+
 s_getreg_b32 s1, hwreg(HW_REG_FLAT_SCR_LO)
 // GFX940: error: specified hardware register is not supported on this GPU
 

diff  --git a/llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt
index 36b218a4a59b9..d937f8196b94c 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt
@@ -263,3 +263,159 @@
 
 # GFX940: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 sc1 ; encoding: [0x00,0x80,0x40,0xe1,0x00,0x04,0x02,0x03]
 0x00,0x80,0x40,0xe1,0x00,0x04,0x02,0x03
+
+# GFX940: v_cvt_f32_bf8_e32 v1, s3                ; encoding: [0x03,0xaa,0x02,0x7e]
+0x03,0xaa,0x02,0x7e
+
+# GFX940: v_cvt_f32_bf8_e32 v1, 3                 ; encoding: [0x83,0xaa,0x02,0x7e]
+0x83,0xaa,0x02,0x7e
+
+# GFX940: v_cvt_f32_bf8_e32 v1, v3                ; encoding: [0x03,0xab,0x02,0x7e]
+0x03,0xab,0x02,0x7e
+
+# GFX940: v_cvt_f32_bf8_sdwa v1, s3 src0_sel:BYTE_1 ; encoding: [0xf9,0xaa,0x02,0x7e,0x03,0x06,0x81,0x00]
+0xf9,0xaa,0x02,0x7e,0x03,0x06,0x81,0x00
+
+# GFX940: v_cvt_f32_bf8_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xaa,0x02,0x7e,0x03,0x58,0x00,0xff]
+0xfa,0xaa,0x02,0x7e,0x03,0x58,0x00,0xff
+
+# GFX940: v_cvt_f32_bf8_e64 v1, s3 mul:2          ; encoding: [0x01,0x00,0x95,0xd1,0x03,0x00,0x00,0x08]
+0x01,0x00,0x95,0xd1,0x03,0x00,0x00,0x08
+
+# GFX940: v_cvt_f32_bf8_sdwa v1, s3 clamp mul:2 src0_sel:BYTE_1 ; encoding: [0xf9,0xaa,0x02,0x7e,0x03,0x66,0x81,0x00]
+0xf9,0xaa,0x02,0x7e,0x03,0x66,0x81,0x00
+
+# GFX940: v_cvt_f32_bf8_e64 v1, s3 clamp          ; encoding: [0x01,0x80,0x95,0xd1,0x03,0x00,0x00,0x00]
+0x01,0x80,0x95,0xd1,0x03,0x00,0x00,0x00
+
+# GFX940: v_cvt_f32_fp8_e32 v1, s3                ; encoding: [0x03,0xa8,0x02,0x7e]
+0x03,0xa8,0x02,0x7e
+
+# GFX940: v_cvt_f32_fp8_e32 v1, 3                 ; encoding: [0x83,0xa8,0x02,0x7e]
+0x83,0xa8,0x02,0x7e
+
+# GFX940: v_cvt_f32_fp8_e32 v1, v3                ; encoding: [0x03,0xa9,0x02,0x7e]
+0x03,0xa9,0x02,0x7e
+
+# GFX940: v_cvt_f32_fp8_sdwa v1, s3 src0_sel:BYTE_1 ; encoding: [0xf9,0xa8,0x02,0x7e,0x03,0x06,0x81,0x00]
+0xf9,0xa8,0x02,0x7e,0x03,0x06,0x81,0x00
+
+# GFX940: v_cvt_f32_fp8_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xa8,0x02,0x7e,0x03,0x58,0x00,0xff]
+0xfa,0xa8,0x02,0x7e,0x03,0x58,0x00,0xff
+
+# GFX940: v_cvt_f32_fp8_e64 v1, s3 mul:2          ; encoding: [0x01,0x00,0x94,0xd1,0x03,0x00,0x00,0x08]
+0x01,0x00,0x94,0xd1,0x03,0x00,0x00,0x08
+
+# GFX940: v_cvt_f32_fp8_sdwa v1, s3 clamp mul:2 src0_sel:BYTE_1 ; encoding: [0xf9,0xa8,0x02,0x7e,0x03,0x66,0x81,0x00]
+0xf9,0xa8,0x02,0x7e,0x03,0x66,0x81,0x00
+
+# GFX940: v_cvt_f32_fp8_e64 v1, s3 clamp          ; encoding: [0x01,0x80,0x94,0xd1,0x03,0x00,0x00,0x00]
+0x01,0x80,0x94,0xd1,0x03,0x00,0x00,0x00
+
+# GFX940: v_cvt_f32_fp8_sdwa v1, 3 src0_sel:BYTE_1 ; encoding: [0xf9,0xa8,0x02,0x7e,0x83,0x06,0x81,0x00]
+0xf9,0xa8,0x02,0x7e,0x83,0x06,0x81,0x00
+
+# GFX940: v_cvt_pk_f32_bf8_e32 v[2:3], s3         ; encoding: [0x03,0xae,0x04,0x7e]
+0x03,0xae,0x04,0x7e
+
+# GFX940: v_cvt_pk_f32_bf8_e32 v[2:3], 3          ; encoding: [0x83,0xae,0x04,0x7e]
+0x83,0xae,0x04,0x7e
+
+# GFX940: v_cvt_pk_f32_bf8_e32 v[2:3], v3         ; encoding: [0x03,0xaf,0x04,0x7e]
+0x03,0xaf,0x04,0x7e
+
+# GFX940: v_cvt_pk_f32_bf8_sdwa v[2:3], s3 src0_sel:WORD_1 ; encoding: [0xf9,0xae,0x04,0x7e,0x03,0x06,0x85,0x00]
+0xf9,0xae,0x04,0x7e,0x03,0x06,0x85,0x00
+
+# GFX940: v_cvt_pk_f32_bf8_dpp v[0:1], v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xae,0x00,0x7e,0x03,0x58,0x00,0xff]
+0xfa,0xae,0x00,0x7e,0x03,0x58,0x00,0xff
+
+# GFX940: v_cvt_pk_f32_bf8_e64 v[2:3], s3 mul:2   ; encoding: [0x02,0x00,0x97,0xd1,0x03,0x00,0x00,0x08]
+0x02,0x00,0x97,0xd1,0x03,0x00,0x00,0x08
+
+# GFX940: v_cvt_pk_f32_bf8_sdwa v[2:3], s3 clamp mul:2 src0_sel:WORD_1 ; encoding: [0xf9,0xae,0x04,0x7e,0x03,0x66,0x85,0x00]
+0xf9,0xae,0x04,0x7e,0x03,0x66,0x85,0x00
+
+# GFX940: v_cvt_pk_f32_bf8_e64 v[2:3], s3 clamp   ; encoding: [0x02,0x80,0x97,0xd1,0x03,0x00,0x00,0x00]
+0x02,0x80,0x97,0xd1,0x03,0x00,0x00,0x00
+
+# GFX940: v_cvt_pk_f32_fp8_e32 v[2:3], s3         ; encoding: [0x03,0xac,0x04,0x7e]
+0x03,0xac,0x04,0x7e
+
+# GFX940: v_cvt_pk_f32_fp8_e32 v[2:3], 3          ; encoding: [0x83,0xac,0x04,0x7e]
+0x83,0xac,0x04,0x7e
+
+# GFX940: v_cvt_pk_f32_fp8_e32 v[2:3], v3         ; encoding: [0x03,0xad,0x04,0x7e]
+0x03,0xad,0x04,0x7e
+
+# GFX940: v_cvt_pk_f32_fp8_sdwa v[2:3], s3 src0_sel:WORD_1 ; encoding: [0xf9,0xac,0x04,0x7e,0x03,0x06,0x85,0x00]
+0xf9,0xac,0x04,0x7e,0x03,0x06,0x85,0x00
+
+# GFX940: v_cvt_pk_f32_fp8_sdwa v[2:3], 3 src0_sel:WORD_1 ; encoding: [0xf9,0xac,0x04,0x7e,0x83,0x06,0x85,0x00]
+0xf9,0xac,0x04,0x7e,0x83,0x06,0x85,0x00
+
+# GFX940: v_cvt_pk_f32_fp8_dpp v[0:1], v3 quad_perm:[0,2,1,1] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xac,0x00,0x7e,0x03,0x58,0x00,0xff]
+0xfa,0xac,0x00,0x7e,0x03,0x58,0x00,0xff
+
+# GFX940: v_cvt_pk_f32_fp8_e64 v[2:3], s3 mul:2   ; encoding: [0x02,0x00,0x96,0xd1,0x03,0x00,0x00,0x08]
+0x02,0x00,0x96,0xd1,0x03,0x00,0x00,0x08
+
+# GFX940: v_cvt_pk_f32_fp8_sdwa v[2:3], s3 clamp mul:2 src0_sel:WORD_1 ; encoding: [0xf9,0xac,0x04,0x7e,0x03,0x66,0x85,0x00]
+0xf9,0xac,0x04,0x7e,0x03,0x66,0x85,0x00
+
+# GFX940: v_cvt_pk_f32_fp8_e64 v[2:3], s3 clamp   ; encoding: [0x02,0x80,0x96,0xd1,0x03,0x00,0x00,0x00]
+0x02,0x80,0x96,0xd1,0x03,0x00,0x00,0x00
+
+# GFX940: v_cvt_pk_bf8_f32 v1, v2, v3             ; encoding: [0x01,0x00,0xa3,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x00,0xa3,0xd2,0x02,0x07,0x02,0x00
+
+# GFX940: v_cvt_pk_bf8_f32 v1, -v2, |v3|          ; encoding: [0x01,0x02,0xa3,0xd2,0x02,0x07,0x02,0x20]
+0x01,0x02,0xa3,0xd2,0x02,0x07,0x02,0x20
+
+# GFX940: v_cvt_pk_bf8_f32 v1, s2, 3              ; encoding: [0x01,0x00,0xa3,0xd2,0x02,0x06,0x01,0x00]
+0x01,0x00,0xa3,0xd2,0x02,0x06,0x01,0x00
+
+# GFX940: v_cvt_pk_bf8_f32 v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0xa3,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x40,0xa3,0xd2,0x02,0x07,0x02,0x00
+
+# GFX940: v_cvt_pk_fp8_f32 v1, v2, v3             ; encoding: [0x01,0x00,0xa2,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x00,0xa2,0xd2,0x02,0x07,0x02,0x00
+
+# GFX940: v_cvt_pk_fp8_f32 v1, -v2, |v3|          ; encoding: [0x01,0x02,0xa2,0xd2,0x02,0x07,0x02,0x20]
+0x01,0x02,0xa2,0xd2,0x02,0x07,0x02,0x20
+
+# GFX940: v_cvt_pk_fp8_f32 v1, s2, 3              ; encoding: [0x01,0x00,0xa2,0xd2,0x02,0x06,0x01,0x00]
+0x01,0x00,0xa2,0xd2,0x02,0x06,0x01,0x00
+
+# GFX940: v_cvt_pk_fp8_f32 v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0xa2,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x40,0xa2,0xd2,0x02,0x07,0x02,0x00
+
+# GFX940: v_cvt_sr_bf8_f32 v1, v2, v3             ; encoding: [0x01,0x00,0xa5,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x00,0xa5,0xd2,0x02,0x07,0x02,0x00
+
+# GFX940: v_cvt_sr_bf8_f32 v1, s2, 3              ; encoding: [0x01,0x00,0xa5,0xd2,0x02,0x06,0x01,0x00]
+0x01,0x00,0xa5,0xd2,0x02,0x06,0x01,0x00
+
+# GFX940: v_cvt_sr_bf8_f32 v1, v2, v3 op_sel:[0,0,1,1] ; encoding: [0x01,0x60,0xa5,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x60,0xa5,0xd2,0x02,0x07,0x02,0x00
+
+# GFX940: v_cvt_sr_bf8_f32 v1, v2, v3 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0xa5,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x40,0xa5,0xd2,0x02,0x07,0x02,0x00
+
+# GFX940: v_cvt_sr_bf8_f32 v1, -|s2|, v3          ; encoding: [0x01,0x01,0xa5,0xd2,0x02,0x06,0x02,0x20]
+0x01,0x01,0xa5,0xd2,0x02,0x06,0x02,0x20
+
+# GFX940: v_cvt_sr_fp8_f32 v1, v2, v3             ; encoding: [0x01,0x00,0xa4,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x00,0xa4,0xd2,0x02,0x07,0x02,0x00
+
+# GFX940: v_cvt_sr_fp8_f32 v1, s2, 3              ; encoding: [0x01,0x00,0xa4,0xd2,0x02,0x06,0x01,0x00]
+0x01,0x00,0xa4,0xd2,0x02,0x06,0x01,0x00
+
+# GFX940: v_cvt_sr_fp8_f32 v1, v2, v3 op_sel:[0,0,1,1] ; encoding: [0x01,0x60,0xa4,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x60,0xa4,0xd2,0x02,0x07,0x02,0x00
+
+# GFX940: v_cvt_sr_fp8_f32 v1, v2, v3 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0xa4,0xd2,0x02,0x07,0x02,0x00]
+0x01,0x40,0xa4,0xd2,0x02,0x07,0x02,0x00
+
+# GFX940: v_cvt_sr_fp8_f32 v1, -|s2|, v3          ; encoding: [0x01,0x01,0xa4,0xd2,0x02,0x06,0x02,0x20]
+0x01,0x01,0xa4,0xd2,0x02,0x06,0x02,0x20