[clang] [llvm] [AMDGPU] Add support for `v_cvt_f16_fp8` on gfx1250 (PR #146302)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Jun 29 19:59:16 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-ir
Author: Shilei Tian (shiltian)
<details>
<summary>Changes</summary>
Co-authored-by: Shilei Tian <i@<!-- -->tianshilei.me>
---
Patch is 61.74 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/146302.diff
26 Files Affected:
- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+1)
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+38)
- (modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl (+4)
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+10)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+1)
- (modified) llvm/lib/Target/AMDGPU/VOP1Instructions.td (+19-2)
- (modified) llvm/lib/Target/AMDGPU/VOPInstructions.td (+11-10)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll (+171)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s (+12)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s (+15)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s (+8)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s (+12)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s (+8)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s (+12)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s (+27)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s (+27)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s (+20)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s (+24)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s (+28)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s (+28-4)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt (+21-2)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt (+13-2)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt (+13-2)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt (+36)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt (+24)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt (+32)
``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 4e28f3bb7ef81..41fe1ebc4c2ce 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -655,6 +655,7 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8bf16, "V8yV8y*3", "nc", "gfx
TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_bf8, "V2hs", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 864e301859682..150d4a243f9e2 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -15,6 +15,44 @@ void test_setprio_inc_wg() {
__builtin_amdgcn_s_setprio_inc_wg(10);
}
+// CHECK-LABEL: @test_cvt_f16_fp8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call half @llvm.amdgcn.cvt.f16.fp8(i32 [[TMP0]], i32 0)
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP2]], i64 0
+// CHECK-NEXT: store half [[TMP1]], ptr addrspace(1) [[ARRAYIDX]], align 2
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = call half @llvm.amdgcn.cvt.f16.fp8(i32 [[TMP3]], i32 1)
+// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP5]], i64 1
+// CHECK-NEXT: store half [[TMP4]], ptr addrspace(1) [[ARRAYIDX1]], align 2
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = call half @llvm.amdgcn.cvt.f16.fp8(i32 [[TMP6]], i32 2)
+// CHECK-NEXT: [[TMP8:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP8]], i64 2
+// CHECK-NEXT: store half [[TMP7]], ptr addrspace(1) [[ARRAYIDX2]], align 2
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = call half @llvm.amdgcn.cvt.f16.fp8(i32 [[TMP9]], i32 3)
+// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP11]], i64 3
+// CHECK-NEXT: store half [[TMP10]], ptr addrspace(1) [[ARRAYIDX3]], align 2
+// CHECK-NEXT: ret void
+//
+void test_cvt_f16_fp8(global half* out, int a)
+{
+ out[0] = __builtin_amdgcn_cvt_f16_fp8(a, 0);
+ out[1] = __builtin_amdgcn_cvt_f16_fp8(a, 1);
+ out[2] = __builtin_amdgcn_cvt_f16_fp8(a, 2);
+ out[3] = __builtin_amdgcn_cvt_f16_fp8(a, 3);
+}
+
// CHECK-LABEL: @test_cvt_pk_f16_fp8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
index b69fcb5f445bc..b8a71a5ba98a6 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
@@ -4,3 +4,7 @@
void test_setprio_inc_wg(short a) {
__builtin_amdgcn_s_setprio_inc_wg(a); // expected-error {{'__builtin_amdgcn_s_setprio_inc_wg' must be a constant integer}}
}
+
+void test__builtin_amdgcn_cvt_f16_fp8(int a, int b) {
+ __builtin_amdgcn_cvt_f16_fp8(a, b); // expected-error {{'__builtin_amdgcn_cvt_f16_fp8' must be a constant integer}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index ce37702b91486..63d649f9d38a1 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3500,6 +3500,16 @@ def int_amdgcn_ashr_pk_u8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_u8_i32">,
DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]>;
+//===----------------------------------------------------------------------===//
+// gfx1250 intrinsics
+// ===----------------------------------------------------------------------===//
+
+// llvm.amdgcn.cvt.f16.fp8 half vdst, int srcA, imm byte_sel [0..3]
+def int_amdgcn_cvt_f16_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_f16_fp8">,
+ DefaultAttrsIntrinsic<[llvm_half_ty],
+ [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+
//===----------------------------------------------------------------------===//
// Special Intrinsics for backend internal use only. No frontend
// should emit calls to these.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 6874657a4ffe7..2cf9c73e3ec81 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4596,6 +4596,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_sr_bf8_f32:
case Intrinsic::amdgcn_cvt_sr_bf16_f32:
case Intrinsic::amdgcn_cvt_sr_f16_f32:
+ case Intrinsic::amdgcn_cvt_f16_fp8:
case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_f16:
case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_f16:
case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_bf16:
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index d504c8134202d..55e7eb15bd5a0 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -687,6 +687,12 @@ class VOPProfile_Base_CVT_F_F8_ByteSel<ValueType DstVT> : VOPProfile<[DstVT, i32
let HasModifiers = 0;
}
+let IsSingle = 0, HasOpSel = 1, HasModifiers = 1 in {
+def V_CVT_F16_F8_Profile : VOPProfile_Base_CVT_F_F8_ByteSel<f16>;
+def V_CVT_F16_F8_True16_Profile : VOP3_Profile_True16<V_CVT_F16_F8_Profile>;
+def V_CVT_F16_F8_Fake16_Profile : VOP3_Profile_Fake16<V_CVT_F16_F8_Profile>;
+}
+
let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts],
mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in {
defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>;
@@ -702,9 +708,10 @@ let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts],
}
}
-class Cvt_F_F8_Pat_ByteSel<SDPatternOperator node, VOP3_Pseudo inst> : GCNPat<
+class Cvt_F_F8_Pat_ByteSel<SDPatternOperator node, VOP3_Pseudo inst, bit HasOpSel = 0> : GCNPat<
(node i32:$src0, timm:$byte_sel),
- (inst $src0, (as_i32timm $byte_sel))
+ !if(HasOpSel, (inst 0, $src0, (as_i32timm $byte_sel)),
+ (inst $src0, (as_i32timm $byte_sel)))
>;
let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts] in {
@@ -738,6 +745,8 @@ def VOPProfile_CVT_PK_F16_F8_fake16 : VOP3_Profile_Fake16<VOPProfile_CVT_PK_F16_
let SubtargetPredicate = isGFX1250Plus in {
let mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in {
+ defm V_CVT_F16_FP8 : VOP1Inst_t16_with_profiles<"v_cvt_f16_fp8",
+ V_CVT_F16_F8_Profile, V_CVT_F16_F8_True16_Profile, V_CVT_F16_F8_Fake16_Profile>;
defm V_CVT_PK_F16_FP8 : VOP1Inst_t16_with_profiles<"v_cvt_pk_f16_fp8",
VOPProfile_CVT_PK_F16_F8, VOPProfile_CVT_PK_F16_F8_true16, VOPProfile_CVT_PK_F16_F8_fake16,
int_amdgcn_cvt_pk_f16_fp8>;
@@ -745,6 +754,13 @@ let SubtargetPredicate = isGFX1250Plus in {
VOPProfile_CVT_PK_F16_F8, VOPProfile_CVT_PK_F16_F8_true16, VOPProfile_CVT_PK_F16_F8_fake16,
int_amdgcn_cvt_pk_f16_bf8>;
}
+
+ let True16Predicate = UseRealTrue16Insts in {
+ def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f16_fp8, V_CVT_F16_FP8_t16_e64, 1>;
+ }
+ let True16Predicate = UseFakeTrue16Insts in {
+ def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f16_fp8, V_CVT_F16_FP8_fake16_e64, 1>;
+ }
} // End SubtargetPredicate = isGFX1250Plus
let SubtargetPredicate = isGFX10Plus in {
@@ -1082,6 +1098,7 @@ defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>;
defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">;
defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>;
defm V_CVT_PK_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x076>;
+defm V_CVT_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x077>;
//===----------------------------------------------------------------------===//
// GFX10.
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 1e47acb5fde4f..f8edfec1100a2 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -348,7 +348,8 @@ class VOP3OpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3OpSel_gfx10<op, p>;
class VOP3FP8OpSel_src_bytesel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
bits<2> byte_sel;
let Inst{11-12} = byte_sel; // NB: bit order is intentionally reversed!
- let Inst{14-13} = 0; // op_sel2/3
+ let Inst{13} = !if(!and(p.HasOpSel, p.HasSrc2), src2_modifiers{2}, 0);
+ let Inst{14} = !if(!and(p.HasOpSel, p.HasDst), src0_modifiers{3}, 0);
}
class VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
@@ -526,7 +527,7 @@ class VOP3PXe <bits<7> op, VOPProfile MFMAPfl, bit acc_cd = 0> : Enc128, VOP3Pe_
bits<9> scale_src1;
//MFMALdScaleModifierOp transforms 2 bit opsel input to 4 bit value
- //where opsel and opselHi are in 3rd and 4th bit.
+ //where opsel and opselHi are in 3rd and 4th bit.
bits<4> src0_modifiers;
bits<4> src1_modifiers;
@@ -869,14 +870,14 @@ class VOP3_DPPe_Common_Base<bits<10> op, VOPProfile P> : Enc96 {
let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0);
let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0);
// 16-bit select fields which can be interpreted as OpSel or hi/lo suffix
- let Inst{11} = !if(P.HasOpSel, !if(P.HasSrc0Mods, src0_modifiers{2}, 0),
- !if(P.HasFP8SrcByteSel, byte_sel{1}, ?));
- let Inst{12} = !if(P.HasOpSel, !if(P.HasSrc1Mods, src1_modifiers{2}, 0),
- !if(P.HasFP8SrcByteSel, byte_sel{0}, ?));
- let Inst{13} = !if(P.HasOpSel, !if(P.HasSrc2Mods, src2_modifiers{2}, 0),
- !if(P.HasFP8DstByteSel, byte_sel{0}, ?));
- let Inst{14} = !if(P.HasOpSel, !if(P.HasSrc0Mods, src0_modifiers{3}, 0),
- !if(P.HasFP8DstByteSel, byte_sel{1}, ?));
+ let Inst{11} = !if(P.HasFP8SrcByteSel, byte_sel{1},
+ !if(P.HasOpSel, !if(P.HasSrc0Mods, src0_modifiers{2}, 0), ?));
+ let Inst{12} = !if(P.HasFP8SrcByteSel, byte_sel{0},
+ !if(P.HasOpSel, !if(P.HasSrc1Mods, src1_modifiers{2}, 0), ?));
+ let Inst{13} = !if(P.HasFP8DstByteSel, byte_sel{0},
+ !if(P.HasOpSel, !if(P.HasSrc2Mods, src2_modifiers{2}, 0), ?));
+ let Inst{14} = !if(P.HasFP8DstByteSel, byte_sel{1},
+ !if(P.HasOpSel, !if(P.HasSrc0Mods, src0_modifiers{3}, 0), ?));
let Inst{15} = !if(P.HasClamp, clamp, 0);
let Inst{25-16} = op;
let Inst{31-26} = 0x35;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll
index 243f6c4d23732..25889ded91681 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll
@@ -4,6 +4,177 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL-REAL16 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL-FAKE16 %s
+declare half @llvm.amdgcn.cvt.f16.bf8(i32, i32)
+declare half @llvm.amdgcn.cvt.f16.fp8(i32, i32)
+declare <2 x half> @llvm.amdgcn.cvt.pk.f16.bf8(i16)
+declare <2 x half> @llvm.amdgcn.cvt.pk.f16.fp8(i16)
+
+define amdgpu_ps float @test_cvt_f16_fp8_byte0(i32 %a) {
+; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte0:
+; GFX1250-SDAG-REAL16: ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_fp8_e32 v0.l, v0
+; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog
+;
+; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_fp8_byte0:
+; GFX1250-SDAG-FAKE16: ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_fp8_e32 v0, v0
+; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_fp8_byte0:
+; GFX1250-GISEL-REAL16: ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_fp8_e32 v0.l, v0
+; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_fp8_byte0:
+; GFX1250-GISEL-FAKE16: ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_fp8_e32 v0, v0
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog
+ %cvt = tail call half @llvm.amdgcn.cvt.f16.fp8(i32 %a, i32 0)
+ %ret = fpext half %cvt to float
+ ret float %ret
+}
+
+define amdgpu_ps float @test_cvt_f16_fp8_byte1(i32 %a) {
+; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte1:
+; GFX1250-SDAG-REAL16: ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:1
+; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog
+;
+; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_fp8_byte1:
+; GFX1250-SDAG-FAKE16: ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_fp8_e64 v0, v0 byte_sel:1
+; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_fp8_byte1:
+; GFX1250-GISEL-REAL16: ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:1
+; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_fp8_byte1:
+; GFX1250-GISEL-FAKE16: ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_fp8_e64 v0, v0 byte_sel:1
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog
+ %cvt = tail call half @llvm.amdgcn.cvt.f16.fp8(i32 %a, i32 1)
+ %ret = fpext half %cvt to float
+ ret float %ret
+}
+
+define amdgpu_ps float @test_cvt_f16_fp8_byte2(i32 %a) {
+; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte2:
+; GFX1250-SDAG-REAL16: ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:2
+; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog
+;
+; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_fp8_byte2:
+; GFX1250-SDAG-FAKE16: ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_fp8_e64 v0, v0 byte_sel:2
+; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_fp8_byte2:
+; GFX1250-GISEL-REAL16: ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:2
+; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_fp8_byte2:
+; GFX1250-GISEL-FAKE16: ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_fp8_e64 v0, v0 byte_sel:2
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog
+ %cvt = tail call half @llvm.amdgcn.cvt.f16.fp8(i32 %a, i32 2)
+ %ret = fpext half %cvt to float
+ ret float %ret
+}
+
+define amdgpu_ps float @test_cvt_f16_fp8_byte3(i32 %a) {
+; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte3:
+; GFX1250-SDAG-REAL16: ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:3
+; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog
+;
+; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_fp8_byte3:
+; GFX1250-SDAG-FAKE16: ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_fp8_e64 v0, v0 byte_sel:3
+; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_fp8_byte3:
+; GFX1250-GISEL-REAL16: ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:3
+; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_fp8_byte3:
+; GFX1250-GISEL-FAKE16: ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_fp8_e64 v0, v0 byte_sel:3
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog
+ %cvt = tail call half @llvm.amdgcn.cvt.f16.fp8(i32 %a, i32 3)
+ %ret = fpext half %cvt to float
+ ret float %ret
+}
+
+define amdgpu_ps float @test_cvt_f16_fp8_byte3_hi(i32 %a) {
+; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte3_hi:
+; GFX1250-SDAG-REAL16: ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.h, v0 byte_sel:3
+; GFX1250-SDAG-REAL16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog
+;
+; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_fp8_byte3_hi:
+; GFX1250-SDAG-FAKE16: ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_fp8_e64 v0, v0 byte_sel:3
+; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, 0, 0x5040100
+; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_fp8_byte3_hi:
+; GFX1250-GISEL-REAL16: ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:3
+; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-REAL16-NEXT: v_lshl_or_b32 v0, v0, 16, 0
+; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_fp8_byte3_hi:
+; GFX1250-GISEL-FAKE16: ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT: ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/146302
More information about the llvm-commits
mailing list