[llvm] [AMDGPU][True16][MC] Implement V_CVT_PK_F32_FP8/BF8 (PR #116106)

Wed Nov 13 13:01:05 PST 2024

https://github.com/Sisyph created https://github.com/llvm/llvm-project/pull/116106

This is a stacked PR; please review only the top commit. I will rebase once the dependent commits have landed.

>From 058ab9f384fbd90a49557ab7f99c82c042d30cf8 Mon Sep 17 00:00:00 2001
From: Joe Nash <joseph.nash at amd.com>
Date: Tue, 12 Nov 2024 18:08:19 -0500
Subject: [PATCH 1/2] [AMDGPU][True16][MC] Copy True16Predicate from pseudo to
 real in VOP1

This change is necessary for consistency and for an upcoming patch.
Also remove the extra whitespace and fix the wrong CHECK prefix for
v_swap_b16, both of which are affected by this change.
---
 llvm/lib/Target/AMDGPU/VOP1Instructions.td           | 3 ++-
 llvm/test/MC/AMDGPU/gfx12_asm_vop1.s                 | 6 +++---
 llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt | 4 ++--
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 6b50ed95931765..4d550644504a7e 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -76,6 +76,7 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo
   // copy relevant pseudo op flags
   let SubtargetPredicate = ps.SubtargetPredicate;
   let OtherPredicates    = ps.OtherPredicates;
+  let True16Predicate    = ps.True16Predicate;
   let AsmMatchConverter  = ps.AsmMatchConverter;
   let AsmVariantName     = ps.AsmVariantName;
   let Constraints        = ps.Constraints;
@@ -735,7 +736,7 @@ def VOP_SWAP_I16 : VOPProfile_True16<VOP_I16_I16> {
                      VOPSrcEncodedDstOperand_t16Lo128:$vdst1);
   let Ins32 = (ins VOPSrcEncodedDstOperand_t16Lo128:$src0,
                    VOPDstOperand_t16Lo128:$src1);
-  let Asm32 = " $vdst, $src0";
+  let Asm32 = "$vdst, $src0";
 }
 
 let SubtargetPredicate = isGFX11Plus in {
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
index e897d01aac7916..b9ee13dcad6e77 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
@@ -3555,13 +3555,13 @@ v_sqrt_f64 v[254:255], 0xaf123456
 // GFX12: v_sqrt_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x68,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 
 v_swap_b16 v5.l, v1.h
-// GFX12: v_swap_b16  v5.l, v1.h ; encoding: [0x81,0xcd,0x0a,0x7e]
+// GFX12: v_swap_b16 v5.l, v1.h ; encoding: [0x81,0xcd,0x0a,0x7e]
 
 v_swap_b16 v5.h, v1.l
-// GFX12: v_swap_b16  v5.h, v1.l ; encoding: [0x01,0xcd,0x0a,0x7f]
+// GFX12: v_swap_b16 v5.h, v1.l ; encoding: [0x01,0xcd,0x0a,0x7f]
 
 v_swap_b16 v127.l, v127.l
-// GFX12: v_swap_b16  v127.l, v127.l ; encoding: [0x7f,0xcd,0xfe,0x7e]
+// GFX12: v_swap_b16 v127.l, v127.l ; encoding: [0x7f,0xcd,0xfe,0x7e]
 
 v_swap_b32 v5, v1
 // GFX12: v_swap_b32 v5, v1 ; encoding: [0x01,0xcb,0x0a,0x7e]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
index ec44cf9ad12923..fc96cff9a6c655 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
@@ -3499,10 +3499,10 @@
 # GFX11: v_sqrt_f64_e32 v[254:255], 0xaf123456   ; encoding: [0xff,0x68,0xfc,0x7f,0x56,0x34,0x12,0xaf]
 0xff,0x68,0xfc,0x7f,0x56,0x34,0x12,0xaf
 
-# GFX11-TRUE16: v_swap_b16 v5.l, v1.h            ; encoding: [0x81,0xcd,0x0a,0x7e]
+# GFX11-REAL16: v_swap_b16 v5.l, v1.h                   ; encoding: [0x81,0xcd,0x0a,0x7e]
 0x81,0xcd,0x0a,0x7e
 
-# GFX11-TRUE16: v_swap_b16 v5.h, v1.l            ; encoding: [0x01,0xcd,0x0a,0x7f]
+# GFX11-REAL16: v_swap_b16 v5.h, v1.l                   ; encoding: [0x01,0xcd,0x0a,0x7f]
 0x01,0xcd,0x0a,0x7f
 
 # GFX11: v_swap_b32 v5, v1                       ; encoding: [0x01,0xcb,0x0a,0x7e]

>From c1d6107ace404e0d6332fe09d4f6e3ce14c06f51 Mon Sep 17 00:00:00 2001
From: Joe Nash <joseph.nash at amd.com>
Date: Tue, 12 Nov 2024 18:13:34 -0500
Subject: [PATCH 2/2] [AMDGPU][True16][MC] Implement V_CVT_PK_F32_FP8/BF8

Existing Fake16 versions of these instructions do not support op_sel on
the _e32 encoding, which leaves a hole in the disassembler support.
Implement the true16 version of the instructions in the MC layer.
---
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  6 ++-
 llvm/lib/Target/AMDGPU/VOP1Instructions.td    | 51 +++++++++++--------
 llvm/test/MC/AMDGPU/gfx12_asm_vop1.s          | 18 ++++---
 .../test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s | 12 +++++
 .../AMDGPU/gfx12_dasm_vop3_from_vop1.txt      | 18 ++++++-
 5 files changed, 73 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 0b228841ed27b2..4b93d6d36055b6 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -589,8 +589,10 @@ bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc) {
          Opc == AMDGPU::V_CVT_F32_FP8_e64_dpp_gfx12 ||
          Opc == AMDGPU::V_CVT_F32_BF8_e64_dpp8_gfx12 ||
          Opc == AMDGPU::V_CVT_F32_FP8_e64_dpp8_gfx12 ||
-         Opc == AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 ||
-         Opc == AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12;
+         Opc == AMDGPU::V_CVT_PK_F32_BF8_fake16_e64_gfx12 ||
+         Opc == AMDGPU::V_CVT_PK_F32_FP8_fake16_e64_gfx12 ||
+         Opc == AMDGPU::V_CVT_PK_F32_BF8_t16_e64_gfx12 ||
+         Opc == AMDGPU::V_CVT_PK_F32_FP8_t16_e64_gfx12;
 }
 
 bool isGenericAtomic(unsigned Opc) {
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 4d550644504a7e..c743eb43e3465c 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -634,17 +634,16 @@ let SubtargetPredicate = HasFP8ConversionInsts, OtherPredicates = [HasSDWA] in {
   }
 }
 
-
-// Similar to VOPProfile_Base_CVT_F32_F8, but for VOP3 instructions.
-def VOPProfile_Base_CVT_PK_F32_F8_OpSel : VOPProfile<[v2f32, i32, untyped, untyped]> {
-  let HasOpSel = 1;
-  let HasClamp = 0;
-  let HasOMod = 0;
-  let HasExtDPP = 0;
-  let HasExtVOP3DPP = 0;
-  let AsmVOP3Base = getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp,
-   HasOpSel, HasOMod, IsVOP3P, 0 /*HasModifiers*/, 0/*Src0HasMods*/, 0/*Src1HasMods*/,
-   0/*Src2HasMods*/, DstVT>.ret;
+let HasClamp = 0, HasOMod = 0, HasExtDPP = 0, HasExtVOP3DPP = 0,
+    HasOpSel = 1 in {
+  // Input modifiers are not supported
+  // NB: fake16 VOP1 does not support op_sel.
+  def VOPProfile_Base_CVT_PK_F32_F8_fake16 : VOPProfile_Fake16<VOPProfile<[v2f32, f16, untyped, untyped]>> {
+    let Src0Mod = IntT16InputMods<1/*IsFake16*/>;
+  }
+  def VOPProfile_Base_CVT_PK_F32_F8_t16 : VOPProfile_True16<VOPProfile<[v2f32, f16, untyped, untyped]>> {
+    let Src0Mod = IntT16InputMods<0/*IsFake16*/>;
+  }
 }
 
 class VOPProfile_Base_CVT_F_F8_ByteSel<ValueType DstVT> : VOPProfile<[DstVT, i32, untyped, untyped]> {
@@ -673,8 +672,15 @@ let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts],
     mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in {
   defm V_CVT_F32_FP8_OP_SEL    : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>;
   defm V_CVT_F32_BF8_OP_SEL    : VOP1Inst<"v_cvt_f32_bf8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>;
-  defm V_CVT_PK_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_fp8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>;
-  defm V_CVT_PK_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_bf8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>;
+
+  let True16Predicate = UseFakeTrue16Insts in {
+    defm V_CVT_PK_F32_FP8_fake16 : VOP1Inst<"v_cvt_pk_f32_fp8_fake16", VOPProfile_Base_CVT_PK_F32_F8_fake16>;
+    defm V_CVT_PK_F32_BF8_fake16 : VOP1Inst<"v_cvt_pk_f32_bf8_fake16", VOPProfile_Base_CVT_PK_F32_F8_fake16>;
+  }
+  let True16Predicate = UseRealTrue16Insts in {
+    defm V_CVT_PK_F32_FP8_t16 : VOP1Inst<"v_cvt_pk_f32_fp8_t16", VOPProfile_Base_CVT_PK_F32_F8_t16>;
+    defm V_CVT_PK_F32_BF8_t16 : VOP1Inst<"v_cvt_pk_f32_bf8_t16", VOPProfile_Base_CVT_PK_F32_F8_t16>;
+  }
 }
 
 class Cvt_F_F8_Pat_ByteSel<SDPatternOperator node, VOP3_Pseudo inst> : GCNPat<
@@ -698,9 +704,9 @@ class Cvt_PK_F32_F8_Pat_OpSel<SDPatternOperator node, int index,
 let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts] in {
   foreach Index = [0, -1] in {
     def : Cvt_PK_F32_F8_Pat_OpSel<int_amdgcn_cvt_pk_f32_fp8, Index,
-                                  V_CVT_PK_F32_FP8_e32, V_CVT_PK_F32_FP8_OP_SEL_e64>;
+                                  V_CVT_PK_F32_FP8_fake16_e32, V_CVT_PK_F32_FP8_fake16_e64>;
     def : Cvt_PK_F32_F8_Pat_OpSel<int_amdgcn_cvt_pk_f32_bf8, Index,
-                                  V_CVT_PK_F32_BF8_e32, V_CVT_PK_F32_BF8_OP_SEL_e64>;
+                                  V_CVT_PK_F32_BF8_fake16_e32, V_CVT_PK_F32_BF8_fake16_e64>;
   }
 }
 
@@ -954,13 +960,14 @@ multiclass VOP1_Real_NO_DPP_OP_SEL_with_name<GFXGen Gen, bits<9> op,
 defm V_CVT_F32_FP8      : VOP1_Real_FULL_with_name<GFX12Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;
 defm V_CVT_F32_BF8      : VOP1_Real_FULL_with_name<GFX12Gen, 0x06d, "V_CVT_F32_BF8_OP_SEL", "v_cvt_f32_bf8">;
 
-// Define VOP1 instructions using the pseudo instruction with its old profile and
-// VOP3 using the OpSel profile for the pseudo instruction.
-defm V_CVT_PK_F32_FP8   : VOP1_Real_e32_with_name<GFX12Gen, 0x06e, "V_CVT_PK_F32_FP8", "v_cvt_pk_f32_fp8">;
-defm V_CVT_PK_F32_FP8   : VOP3_Real_with_name<GFX12Gen, 0x1ee, "V_CVT_PK_F32_FP8_OP_SEL", "v_cvt_pk_f32_fp8">;
-
-defm V_CVT_PK_F32_BF8   : VOP1_Real_e32_with_name<GFX12Gen, 0x06f, "V_CVT_PK_F32_BF8", "v_cvt_pk_f32_bf8">;
-defm V_CVT_PK_F32_BF8   : VOP3_Real_with_name<GFX12Gen, 0x1ef, "V_CVT_PK_F32_BF8_OP_SEL", "v_cvt_pk_f32_bf8">;
+defm V_CVT_PK_F32_FP8_fake16 : VOP1_Real_e32_with_name<GFX12Gen, 0x06e, "V_CVT_PK_F32_FP8_fake16", "v_cvt_pk_f32_fp8">;
+defm V_CVT_PK_F32_FP8_t16    : VOP1_Real_e32_with_name<GFX12Gen, 0x06e, "V_CVT_PK_F32_FP8_t16", "v_cvt_pk_f32_fp8">;
+defm V_CVT_PK_F32_FP8_fake16 : VOP3_Real_with_name<GFX12Gen, 0x1ee, "V_CVT_PK_F32_FP8_fake16", "v_cvt_pk_f32_fp8">;
+defm V_CVT_PK_F32_FP8_t16    : VOP3_Real_with_name<GFX12Gen, 0x1ee, "V_CVT_PK_F32_FP8_t16", "v_cvt_pk_f32_fp8">;
+defm V_CVT_PK_F32_BF8_fake16 : VOP1_Real_e32_with_name<GFX12Gen, 0x06f, "V_CVT_PK_F32_BF8_fake16", "v_cvt_pk_f32_bf8">;
+defm V_CVT_PK_F32_BF8_t16    : VOP1_Real_e32_with_name<GFX12Gen, 0x06f, "V_CVT_PK_F32_BF8_t16", "v_cvt_pk_f32_bf8">;
+defm V_CVT_PK_F32_BF8_fake16 : VOP3_Real_with_name<GFX12Gen, 0x1ef, "V_CVT_PK_F32_BF8_fake16", "v_cvt_pk_f32_bf8">;
+defm V_CVT_PK_F32_BF8_t16    : VOP3_Real_with_name<GFX12Gen, 0x1ef, "V_CVT_PK_F32_BF8_t16", "v_cvt_pk_f32_bf8">;
 
 defm V_CVT_NEAREST_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00c,
   "V_CVT_RPI_I32_F32", "v_cvt_nearest_i32_f32">;
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
index b9ee13dcad6e77..59d1030fb8a961 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
@@ -429,11 +429,14 @@ v_cvt_pk_f32_bf8_e32 v[2:3], 3
 v_cvt_pk_f32_bf8_e32 v[3:4], 3
 // GFX12: v_cvt_pk_f32_bf8_e32 v[3:4], 3 ; encoding: [0x83,0xde,0x06,0x7e]
 
-v_cvt_pk_f32_bf8_e32 v[2:3], v3
-// GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], v3 ; encoding: [0x03,0xdf,0x04,0x7e]
+v_cvt_pk_f32_bf8_e32 v[2:3], v3.l
+// GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], v3.l ; encoding: [0x03,0xdf,0x04,0x7e]
 
-v_cvt_pk_f32_bf8_e32 v[3:4], v3
-// GFX12: v_cvt_pk_f32_bf8_e32 v[3:4], v3 ; encoding: [0x03,0xdf,0x06,0x7e]
+v_cvt_pk_f32_bf8_e32 v[3:4], v3.l
+// GFX12: v_cvt_pk_f32_bf8_e32 v[3:4], v3.l ; encoding: [0x03,0xdf,0x06,0x7e]
+
+v_cvt_pk_f32_bf8_e32 v[3:4], v3.h
+// GFX12: v_cvt_pk_f32_bf8_e32 v[3:4], v3.h ; encoding: [0x83,0xdf,0x06,0x7e]
 
 v_cvt_pk_f32_fp8_e32 v[2:3], s3
 // GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], s3 ; encoding: [0x03,0xdc,0x04,0x7e]
@@ -441,8 +444,11 @@ v_cvt_pk_f32_fp8_e32 v[2:3], s3
 v_cvt_pk_f32_fp8_e32 v[2:3], 3
 // GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], 3 ; encoding: [0x83,0xdc,0x04,0x7e]
 
-v_cvt_pk_f32_fp8_e32 v[2:3], v3
-// GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], v3 ; encoding: [0x03,0xdd,0x04,0x7e]
+v_cvt_pk_f32_fp8_e32 v[2:3], v3.l
+// GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], v3.l ; encoding: [0x03,0xdd,0x04,0x7e]
+
+v_cvt_pk_f32_fp8_e32 v[2:3], v3.h
+// GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], v3.h ; encoding: [0x83,0xdd,0x04,0x7e]
 
 v_cvt_f16_f32 v5.l, v1
 // GFX12: v_cvt_f16_f32_e32 v5.l, v1 ; encoding: [0x01,0x15,0x0a,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s
index d7cca435abc249..6e1708dd879ccf 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s
@@ -486,6 +486,12 @@ v_cvt_pk_f32_bf8_e64 v[2:3], v3
 v_cvt_pk_f32_bf8_e64 v[2:3], v3 op_sel:[1,0]
 // GFX12: encoding: [0x02,0x08,0xef,0xd5,0x03,0x01,0x00,0x00]
 
+v_cvt_pk_f32_bf8_e64 v[2:3], v3.h
+// GFX12: encoding: [0x02,0x08,0xef,0xd5,0x03,0x01,0x00,0x00]
+
+v_cvt_pk_f32_bf8_e64 v[2:3], v255.h
+// GFX12: encoding: [0x02,0x08,0xef,0xd5,0xff,0x01,0x00,0x00]
+
 v_cvt_pk_f32_fp8_e64 v[2:3], s3
 // GFX12: encoding: [0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00]
 
@@ -534,6 +540,12 @@ v_cvt_pk_f32_fp8_e64 v[3:4], v3
 v_cvt_pk_f32_fp8_e64 v[3:4], v3 op_sel:[1,0]
 // GFX12: encoding: [0x03,0x08,0xee,0xd5,0x03,0x01,0x00,0x00]
 
+v_cvt_pk_f32_fp8_e64 v[3:4], v3.h
+// GFX12: encoding: [0x03,0x08,0xee,0xd5,0x03,0x01,0x00,0x00]
+
+v_cvt_pk_f32_fp8_e64 v[3:4], v255.h
+// GFX12: encoding: [0x03,0x08,0xee,0xd5,0xff,0x01,0x00,0x00]
+
 v_cvt_f16_f32_e64 v5.l, v1
 // GFX12: encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00]
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt
index 9a181350f5b8ca..67618f45c31caa 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt
@@ -436,18 +436,32 @@
 # GFX12: v_cvt_pk_f32_bf8_e64 v[2:3], 3          ; encoding: [0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00]
 0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00
 
-# GFX12: v_cvt_pk_f32_bf8_e64 v[2:3], v3         ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00]
+# GFX12-REAL16: v_cvt_pk_f32_bf8_e64 v[2:3], v3.l       ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00]
+# GFX12-FAKE16: v_cvt_pk_f32_bf8_e64 v[2:3], v3         ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00]
 0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00
 
+# GFX12-REAL16: v_cvt_pk_f32_bf8_e64 v[2:3], v3.h op_sel:[1,0]      ; encoding: [0x02,0x08,0xef,0xd5,0x03,0x01,0x00,0x00]
+0x02,0x08,0xef,0xd5,0x03,0x01,0x00,0x00
+
+# GFX12-REAL16: v_cvt_pk_f32_bf8_e64 v[2:3], v255.h op_sel:[1,0]       ; encoding: [0x02,0x08,0xef,0xd5,0xff,0x01,0x00,0x00]
+0x02,0x08,0xef,0xd5,0xff,0x01,0x00,0x00
+
 # GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], s3         ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00]
 0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00
 
 # GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], 3          ; encoding: [0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00]
 0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00
 
-# GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], v3         ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00]
+# GFX12-REAL16: v_cvt_pk_f32_fp8_e64 v[2:3], v3.l       ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00]
+# GFX12-FAKE16: v_cvt_pk_f32_fp8_e64 v[2:3], v3         ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00]
 0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00
 
+# GFX12-REAL16: v_cvt_pk_f32_fp8_e64 v[2:3], v3.h op_sel:[1,0]       ; encoding: [0x02,0x08,0xee,0xd5,0x03,0x01,0x00,0x00]
+0x02,0x08,0xee,0xd5,0x03,0x01,0x00,0x00
+
+# GFX12-REAL16: v_cvt_pk_f32_fp8_e64 v[2:3], v255.h op_sel:[1,0]       ; encoding: [0x02,0x08,0xee,0xd5,0xff,0x01,0x00,0x00]
+0x02,0x08,0xee,0xd5,0xff,0x01,0x00,0x00
+
 # GFX12-REAL16: v_cvt_f16_f32_e64 v5.l, v1              ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00]
 # GFX12-FAKE16: v_cvt_f16_f32_e64 v5, v1                ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00]
 0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00