[llvm] [AMDGPU] Implement codegen for GFX11+ V_CVT_PK_[IU]16_F32 (PR #168719)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 19 06:57:02 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Jay Foad (jayfoad)
<details>
<summary>Changes</summary>
---
Patch is 50.25 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168719.diff
7 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+5)
- (modified) llvm/lib/Target/AMDGPU/SOPInstructions.td (+8)
- (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+22)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+59-172)
- (modified) llvm/test/CodeGen/AMDGPU/fp_to_sint.ll (+329)
- (modified) llvm/test/CodeGen/AMDGPU/fp_to_uint.ll (+273)
- (modified) llvm/test/CodeGen/AMDGPU/scalar-float-sop1.ll (+22-4)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e37d739fc25df..0a9ae52d5dabf 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6806,6 +6806,11 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return splitTernaryVectorOp(Op, DAG);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
+ if (AMDGPU::isGFX11Plus(*Subtarget) && Op.getValueType() == MVT::i16 &&
+ Op.getOperand(0).getValueType() == MVT::f32) {
+ // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
+ return Op;
+ }
return LowerFP_TO_INT(Op, DAG);
case ISD::SHL:
case ISD::SRA:
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 1931e0be15152..936eb545563ae 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -469,6 +469,14 @@ let SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE],
} // End SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE]
// SchedRW = [WriteSFPU], isReMaterializable = 1
+let SubtargetPredicate = HasSALUFloatInsts, AddedComplexity = 9 in {
+ // Fallback patterns for f32->i16 conversion.
+ def : GCNPat<(i16 (UniformUnaryFrag<fp_to_sint> f32:$src0)),
+ (S_CVT_I32_F32 $src0)>;
+ def : GCNPat<(i16 (UniformUnaryFrag<fp_to_uint> f32:$src0)),
+ (S_CVT_U32_F32 $src0)>;
+}
+
let hasSideEffects = 1 in {
let has_sdst = 0 in {
let Uses = [M0] in {
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 872bde501cd2d..e1b22c6804544 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1721,6 +1721,28 @@ let SubtargetPredicate = isGFX11Plus in {
defm V_MINMAX_I32 : VOP3Inst<"v_minmax_i32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
defm V_CVT_PK_I16_F32 : VOP3Inst<"v_cvt_pk_i16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>;
defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>;
+
+ def : GCNPat<(v2i16 (build_vector (i16 (fp_to_sint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (i16 (fp_to_sint (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)))))),
+ (V_CVT_PK_I16_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1)>;
+ def : GCNPat<(v2i16 (build_vector (i16 (fp_to_uint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (i16 (fp_to_uint (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)))))),
+ (V_CVT_PK_U16_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1)>;
+
+ // Fallback patterns for f32->i16 conversion. These are only required because
+ // f32->i16 has to be legal so that we can select V_CVT_PK_[IU]16_F32 above.
+ let True16Predicate = UseRealTrue16Insts in {
+ def : GCNPat<(i16 (fp_to_sint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (EXTRACT_SUBREG (V_CVT_I32_F32_e64 $src0_modifiers, $src0), lo16)>;
+ def : GCNPat<(i16 (fp_to_uint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (EXTRACT_SUBREG (V_CVT_U32_F32_e64 $src0_modifiers, $src0), lo16)>;
+ }
+ let True16Predicate = NotUseRealTrue16Insts in {
+ def : GCNPat<(i16 (fp_to_sint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (V_CVT_I32_F32_e64 $src0_modifiers, $src0)>;
+ def : GCNPat<(i16 (fp_to_uint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (V_CVT_U32_F32_e64 $src0_modifiers, $src0)>;
+ }
} // End SubtargetPredicate = isGFX11Plus
class VOP3_CVT_SR_FP16_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_F1632_FP8BF8_TiedInput_Profile<P> {
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 28d7e6916e519..c9b2c8c08b41c 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -35513,55 +35513,24 @@ define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) {
; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11TRUE16-LABEL: v_fptosi_v2bf16_to_v2i16:
-; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: v_fptosi_v2bf16_to_v2i16:
-; GFX11FAKE16: ; %bb.0:
-; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
-; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1250TRUE16-LABEL: v_fptosi_v2bf16_to_v2i16:
-; GFX1250TRUE16: ; %bb.0:
-; GFX1250TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
-; GFX1250TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1250TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX1250TRUE16-NEXT: s_set_pc_i64 s[30:31]
+; GFX11-LABEL: v_fptosi_v2bf16_to_v2i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_pk_i16_f32 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1250FAKE16-LABEL: v_fptosi_v2bf16_to_v2i16:
-; GFX1250FAKE16: ; %bb.0:
-; GFX1250FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX1250FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
-; GFX1250FAKE16-NEXT: s_set_pc_i64 s[30:31]
+; GFX1250-LABEL: v_fptosi_v2bf16_to_v2i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_pk_i16_f32 v0, v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = fptosi <2 x bfloat> %x to <2 x i16>
ret <2 x i16> %op
}
@@ -35655,61 +35624,27 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11TRUE16-LABEL: v_fptosi_v3bf16_to_v3i16:
-; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: v_fptosi_v3bf16_to_v3i16:
-; GFX11FAKE16: ; %bb.0:
-; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
-; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1250TRUE16-LABEL: v_fptosi_v3bf16_to_v3i16:
-; GFX1250TRUE16: ; %bb.0:
-; GFX1250TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX1250TRUE16-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX1250TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX1250TRUE16-NEXT: s_set_pc_i64 s[30:31]
+; GFX11-LABEL: v_fptosi_v3bf16_to_v3i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_pk_i16_f32 v0, v0, v2
+; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1250FAKE16-LABEL: v_fptosi_v3bf16_to_v3i16:
-; GFX1250FAKE16: ; %bb.0:
-; GFX1250FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250FAKE16-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX1250FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX1250FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
-; GFX1250FAKE16-NEXT: s_set_pc_i64 s[30:31]
+; GFX1250-LABEL: v_fptosi_v3bf16_to_v3i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cvt_pk_i16_f32 v0, v0, v2
+; GFX1250-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = fptosi <3 x bfloat> %x to <3 x i16>
ret <3 x i16> %op
}
@@ -35827,77 +35762,29 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11TRUE16-LABEL: v_fptosi_v4bf16_to_v4i16:
-; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v3, v3
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
-; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: v_fptosi_v4bf16_to_v4i16:
-; GFX11FAKE16: ; %bb.0:
-; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v3, v3
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
-; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
-; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1250TRUE16-LABEL: v_fptosi_v4bf16_to_v4i16:
-; GFX1250TRUE16: ; %bb.0:
-; GFX1250TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX1250TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX1250TRUE16-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v3, v3
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX1250TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
-; GFX1250TRUE16-NEXT: s_set_pc_i64 s[30:31]
+; GFX11-LABEL: v_fptosi_v4bf16_to_v4i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_pk_i16_f32 v0, v0, v3
+; GFX11-NEXT: v_cvt_pk_i16_f32 v1, v1, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1250FAKE16-LABEL: v_fptosi_v4bf16_to_v4i16:
-; GFX1250FAKE16: ; %bb.0:
-; GFX1250FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250FAKE16-NEXT: v_dual_lshlrev_b32 v2, 16, v1 :: v_dual_lshlrev_b32 v3, 16, v0
-; GFX1250FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX1250FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v3, v3
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
-; GFX1250FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
-; GFX1250FAKE16-NEXT: s_set_pc_i64 s[30:31]
+; GFX1250-LABEL: v_fptosi_v4bf16_to_v4i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cvt_pk_i16_f32 v0, v0, v3
+; GFX1250-NEXT: v_cvt_pk_i16_f32 v1, v1, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = fptosi <4 x bfloat> %x to <4 x i16>
ret <4 x i16> %op
}
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
index 0c5ed00b58d90..dea2dbe18a05d 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn < %s | FileCheck %s --check-prefixes=SI
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck %s --check-prefixes=VI
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GFX11
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck %s --check-prefixes=EG
declare float @llvm.fabs.f32(float) #1
@@ -28,6 +29,17 @@ define amdgpu_kernel void @fp_to_sint_i32(ptr addrspace(1) %out, float %in) {
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
+; GFX11-LABEL: fp_to_sint_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_cvt_i32_f32_e32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
+;
; EG-LABEL: fp_to_sint_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
@@ -67,6 +79,17 @@ define amdgpu_kernel void @fp_to_sint_i32_fabs(ptr addrspace(1) %out, float %in)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
+; GFX11-LABEL: fp_to_sint_i32_fabs:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_cvt_i32_f32_e64 v1, |s2|
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
+;
; EG-LABEL: fp_to_sint_i32_fabs:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
@@ -108,6 +131,16 @@ define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> %
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
+; GFX11-LABEL: fp_to_sint_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_cvt_i32_f32_e32 v1, s3
+; GFX11-NEXT: v_cvt_i32_f32_e32 v0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
+;
; EG-LABEL: fp_to_sint_v2i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
@@ -157,6 +190,20 @@ define amdgpu_kernel void @fp_to_sint_v4i32(ptr addrspace(1) %out, ptr addrspace
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
+; GFX11-LABEL: fp_to_sint_v4i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_cvt_i32_f3...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/168719
More information about the llvm-commits mailing list