[llvm] [AMDGPU][True16][MC][CodeGen] true16 mode for v_cvt_pk_bf8/fp8_f32 (PR #141881)
via llvm-commits
llvm-commits at lists.llvm.org
Thu May 29 06:53:54 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Brox Chen (broxigarchen)
<details>
<summary>Changes</summary>
Update true16/fake16 profile with v_cvt_pk_bf8/fp8_f32, keeping the vdst_in profile, and update codegen pattern.
update mc test and codegen test.
---
Patch is 66.15 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/141881.diff
10 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp (+8-4)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+1)
- (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+64-4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll (+94-45)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3.s (+24-12)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s (+42-36)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s (+22-16)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt (+36-6)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt (+84-18)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt (+44-8)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index e64001fd655c9..d3a3c90778c12 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -8960,10 +8960,14 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
// Adding vdst_in operand is already covered for these DPP instructions in
// cvtVOP3DPP.
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in) &&
- !(Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp_gfx12 ||
- Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp_gfx12 ||
- Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp8_gfx12 ||
- Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12 ||
+ !(Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp_gfx12 ||
+ Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp_gfx12 ||
+ Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp8_gfx12 ||
+ Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp8_gfx12 ||
+ Opc == AMDGPU::V_CVT_PK_BF8_F32_fake16_e64_dpp_gfx12 ||
+ Opc == AMDGPU::V_CVT_PK_FP8_F32_fake16_e64_dpp_gfx12 ||
+ Opc == AMDGPU::V_CVT_PK_BF8_F32_fake16_e64_dpp8_gfx12 ||
+ Opc == AMDGPU::V_CVT_PK_FP8_F32_fake16_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 ||
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 84a6aeacc226a..083345d4d1e12 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2822,6 +2822,7 @@ def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>;
def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>;
def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
+def VOP_I16_F32_F32 : VOPProfile <[i16, f32, f32, untyped]>;
def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], /*EnableClamp=*/1>;
def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 0252c4f1b0929..2190111fd8419 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -570,6 +570,36 @@ def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> {
let HasExtVOP3DPP = 1;
}
+def VOP3_CVT_PK_F8_F32_Profile_fake16 : VOP3_Profile_Fake16<VOP_I16_F32_F32, VOP3_OPSEL> {
+ defvar Tail = (ins VGPR_32:$vdst_in, op_sel0:$op_sel);
+ let InsVOP3OpSel = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
+ 0, HasModifiers, HasSrc2Mods,
+ HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
+ Tail);
+ let InsVOP3Base = !con(getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP,
+ Src2VOP3DPP, NumSrcArgs, 0, HasModifiers,
+ HasSrc2Mods, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP,
+ Src2ModVOP3DPP, false>.ret,
+ Tail);
+ let HasClamp = 0;
+ let HasExtVOP3DPP = 1;
+}
+
+def VOP3_CVT_PK_F8_F32_Profile_t16 : VOP3_Profile_True16<VOP_I16_F32_F32, VOP3_OPSEL> {
+ defvar Tail = (ins VGPR_16:$vdst_in, op_sel0:$op_sel);
+ let InsVOP3OpSel = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
+ 0, HasModifiers, HasSrc2Mods,
+ HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
+ Tail);
+ let InsVOP3Base = !con(getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP,
+ Src2VOP3DPP, NumSrcArgs, 0, HasModifiers,
+ HasSrc2Mods, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP,
+ Src2ModVOP3DPP, false>.ret,
+ Tail);
+ let HasClamp = 0;
+ let HasExtVOP3DPP = 1;
+}
+
def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
VOP3_OPSEL> {
let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
@@ -675,8 +705,12 @@ defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", VOP3_Profile<VOP_I64_I64_I32_I
let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0,
SchedRW = [WriteFloatCvt] in {
let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
- defm V_CVT_PK_FP8_F32 : VOP3Inst<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile>;
- defm V_CVT_PK_BF8_F32 : VOP3Inst<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile>;
+ defm V_CVT_PK_FP8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile,
+ VOP3_CVT_PK_F8_F32_Profile_t16,
+ VOP3_CVT_PK_F8_F32_Profile_fake16>;
+ defm V_CVT_PK_BF8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile,
+ VOP3_CVT_PK_F8_F32_Profile_t16,
+ VOP3_CVT_PK_F8_F32_Profile_fake16>;
let SubtargetPredicate = isGFX12Plus in {
defm V_CVT_SR_FP8_F32_gfx12 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
@@ -698,6 +732,21 @@ class Cvt_PK_F8_F32_Pat<SDPatternOperator node, int index, VOP3_Pseudo inst> : G
(inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, 0)
>;
+multiclass Cvt_PK_F8_F32_t16_Pat<SDPatternOperator node, VOP3_Pseudo inst> {
+def : GCNPat<
+ (i32 (node f32:$src0, f32:$src1, i32:$old, -1)),
+ (REG_SEQUENCE VGPR_32,
+ (i16 (EXTRACT_SUBREG $old, lo16)), lo16,
+ (i16 (inst SRCMODS.DST_OP_SEL, $src0, 0, $src1, (i16 (EXTRACT_SUBREG $old, hi16)), 0)), hi16)
+>;
+def : GCNPat<
+ (i32 (node f32:$src0, f32:$src1, i32:$old, 0)),
+ (REG_SEQUENCE VGPR_32,
+ (i16 (inst 0, $src0, 0, $src1, (i16 (EXTRACT_SUBREG $old, lo16)), 0)), lo16,
+ (i16 (EXTRACT_SUBREG $old, hi16)), hi16)
+>;
+}
+
class Cvt_SR_F8_F32_Pat<SDPatternOperator node, bits<2> index, VOP3_Pseudo inst> : GCNPat<
(i32 (node f32:$src0, i32:$src1, i32:$old, index)),
(inst !if(index{1}, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1,
@@ -712,9 +761,20 @@ class Cvt_SR_F8_ByteSel_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType
let OtherPredicates = [HasFP8ConversionInsts] in {
foreach Index = [0, -1] in {
+let True16Predicate = NotHasTrue16BitInsts in {
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_e64>;
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_e64>;
}
+let True16Predicate = UseFakeTrue16Insts in {
+ def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_fake16_e64>;
+ def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_fake16_e64>;
+}
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+defm : Cvt_PK_F8_F32_t16_Pat<int_amdgcn_cvt_pk_fp8_f32, V_CVT_PK_FP8_F32_t16_e64>;
+defm : Cvt_PK_F8_F32_t16_Pat<int_amdgcn_cvt_pk_bf8_f32, V_CVT_PK_BF8_F32_t16_e64>;
+}
let SubtargetPredicate = isGFX940Plus in {
foreach Index = [0, 1, 2, 3] in {
@@ -1642,8 +1702,8 @@ defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x368, "v_m
defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>;
defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>;
-defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_gfx12<0x369>;
-defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36a>;
+defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x369, "v_cvt_pk_fp8_f32">;
+defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x36a, "v_cvt_pk_bf8_f32">;
defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32" >;
defm V_CVT_SR_BF8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36c, "V_CVT_SR_BF8_F32_gfx12", "v_cvt_sr_bf8_f32">;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
index 1a46e6f6afcd7..16d32b73b9b0d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
@@ -3,7 +3,8 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX942 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX950 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX950 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32)
@@ -275,17 +276,29 @@ define i32 @test_cvt_pk_bf8_f32_word0(float %x, float %y, i32 %old) {
; GFX9X-NEXT: v_mov_b32_e32 v0, v2
; GFX9X-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: test_cvt_pk_bf8_f32_word0:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: test_cvt_pk_bf8_f32_word0:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_cvt_pk_bf8_f32 v2.l, v0, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: test_cvt_pk_bf8_f32_word0:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 false)
ret i32 %ret
}
@@ -299,17 +312,29 @@ define i32 @test_cvt_pk_bf8_f32_word1(float %x, float %y, i32 %old) {
; GFX9X-NEXT: v_mov_b32_e32 v0, v2
; GFX9X-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: test_cvt_pk_bf8_f32_word1:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: test_cvt_pk_bf8_f32_word1:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_cvt_pk_bf8_f32 v2.h, v0, v1 op_sel:[0,0,1]
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: test_cvt_pk_bf8_f32_word1:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1]
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 true)
ret i32 %ret
}
@@ -322,17 +347,29 @@ define i32 @test_cvt_pk_fp8_f32_word0(float %x, float %y, i32 %old) {
; GFX9X-NEXT: v_mov_b32_e32 v0, v2
; GFX9X-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: test_cvt_pk_fp8_f32_word0:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: test_cvt_pk_fp8_f32_word0:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_cvt_pk_fp8_f32 v2.l, v0, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: test_cvt_pk_fp8_f32_word0:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 false)
ret i32 %ret
}
@@ -346,17 +383,29 @@ define i32 @test_cvt_pk_fp8_f32_word1(float %x, float %y, i32 %old) {
; GFX9X-NEXT: v_mov_b32_e32 v0, v2
; GFX9X-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: test_cvt_pk_fp8_f32_word1:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: test_cvt_pk_fp8_f32_word1:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_cvt_pk_fp8_f32 v2.h, v0, v1 op_sel:[0,0,1]
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: test_cvt_pk_fp8_f32_word1:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1]
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 true)
ret i32 %ret
}
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
index afd4af92b4f27..08b6b26d1f260 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
@@ -1169,23 +1169,35 @@ v_cubetc_f32 v5, -src_scc, |vcc_lo|, -1 mul:4
v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2
// GFX12: v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf]
-v_cvt_pk_fp8_f32 v1, v2, v3
-// GFX12: v_cvt_pk_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00]
+v_cvt_pk_fp8_f32 v1.l, v2, v3
+// GFX12: v_cvt_pk_fp8_f32 v1.l, v2, v3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00]
-v_cvt_pk_fp8_f32 v1, -v2, |v3|
-// GFX12: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20]
+v_cvt_pk_fp8_f32 v1.l, -v2, |v3|
+// GFX12: v_cvt_pk_fp8_f32 v1.l, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20]
-v_cvt_pk_fp8_f32 v1, s2, 3
-// GFX12: v_cvt_pk_fp8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00]
+v_cvt_pk_fp8_f32 v1.l, s2, 3
+// GFX12: v_cvt_pk_fp8_f32 v1.l, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00]
-v_cvt_pk_bf8_f32 v1, v2, v3
-// GFX12: v_cvt_pk_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00]
+v_cvt_pk_fp8_f32 v1.h v2, v3
+// GFX12: v_cvt_pk_fp8_f32 v1.h, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0x69,0xd7,0x02,0x07,0x02,0x00]
-v_cvt_pk_bf8_f32 v1, -v2, |v3|
-// GFX12: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20]
+v_cvt_pk_fp8_f32 v255.h v2, v3
+// GFX12: v_cvt_pk_fp8_f32 v255.h, v2, v3 op_sel:[0,0,1] ; encoding: [0xff,0x40,0x69,0xd7,0x02,0x07,0x02,0x00]
-v_cvt_pk_bf8_f32 v1, s2, 3
-// GFX12: v_cvt_pk_bf8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00]
+v_cvt_pk_bf8_f32 v1.l, v2, v3
+// GFX12: v_cvt_pk_bf8_f32 v1.l, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00]
+
+v_cvt_pk_bf8_f32 v1.l, -v2, |v3|
+// GFX12: v_cvt_pk_bf8_f32 v1.l, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20]
+
+v_cvt_pk_bf8_f32 v1.l, s2, 3
+// GFX12: v_cvt_pk_bf8_f32 v1.l, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00]
+
+v_cvt_pk_bf8_f32 v1.h, v2, v3
+// GFX12: v_cvt_pk_bf8_f32 v1.h, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0x6a,0xd7,0x02,0x07,0x02,0x00]
+
+v_cvt_pk_bf8_f32 v255.h, -v2, |v3|
+// GFX12: v_cvt_pk_bf8_f32 v255.h, -v2, |v3| op_sel:[0,0,1] ; encoding: [0xff,0x42,0x6a,0xd7,0x02,0x07,0x02,0x20]
v_cvt_sr_fp8_f32 v1, v2, v3
// GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
index cfd01ee02aa7e..6284ca4726928 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
@@ -1209,59 +1209,65 @@ v_cubetc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_m
v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX12: v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x0e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
-v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd
-// GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
+v_cvt_pk_bf8_f32_e64_dpp v1.l, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd
+// G...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/141881
More information about the llvm-commits
mailing list