[llvm] [AMDGPU] add back the true16 pattern for cvt_pk_rtz (PR #184857)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 5 11:54:15 PST 2026
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/184857
From 248b92165df36e9e67f756f1de2dde98868fe976 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Thu, 5 Mar 2026 14:06:48 -0500
Subject: [PATCH] add back the missing pattern
---
llvm/lib/Target/AMDGPU/SIInstructions.td | 10 +-
.../test/CodeGen/AMDGPU/llvm.fptrunc.round.ll | 475 ++++++++++++++----
2 files changed, 373 insertions(+), 112 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 75a0b6b4c34fb..6f8c16955952b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -304,9 +304,17 @@ let True16Predicate = UseFakeTrue16Insts in
def : GCNPat <(f16 (fptrunc_round f32:$src0, (i32 SupportedRoundMode:$round))),
(FPTRUNC_ROUND_F16_F32_PSEUDO_fake16_e32 $src0, (as_hw_round_mode $round))>;
+let True16Predicate = UseRealTrue16Insts in
+def : GCNPat <(f16 (fptrunc_round (f32 (VOP3OpSelMods f32:$src0, i32:$src0_modifiers)), (i32 SupportedRoundMode:$round))),
+ (FPTRUNC_ROUND_F16_F32_PSEUDO_t16_e64 $src0_modifiers, $src0, (as_hw_round_mode $round))>;
+
+let True16Predicate = NotUseRealTrue16Insts in
+def : GCNPat <(f16 (fptrunc_round f32:$src0, (i32 0))),
+ (V_CVT_PKRTZ_F16_F32_e32 $src0, (IMPLICIT_DEF))>;
+
let True16Predicate = UseRealTrue16Insts in
def : GCNPat <(f16 (fptrunc_round (f32 (VOP3OpSelMods f32:$src0, i32:$src0_modifiers)), (i32 0))),
- (V_CVT_PKRTZ_F16_F32_e32 $src0_modifiers, $src0)>;
+ (EXTRACT_SUBREG (V_CVT_PKRTZ_F16_F32_e64 $src0_modifiers, $src0, 0, (IMPLICIT_DEF)), lo16)>;
def : GCNPat <(v2f16 (build_vector (f16 (fptrunc_round f32:$src0, (i32 0))),
(f16 (fptrunc_round f32:$src1, (i32 0))))),
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll
index 2e33fcac9536c..2a5a8f5e068d4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll
@@ -1,44 +1,68 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=CHECK,SDAG %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=CHECK,SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=CHECK,GFX11-SDAG %s
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=CHECK,GFX11-GISEL %s
; XUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=CHECK,GISEL %s
; FIXME. gisel for fptrunc_round rtz
define amdgpu_gs half @v_fptrunc_round_f32_to_f16_tonearest(float %a) {
-; CHECK-LABEL: v_fptrunc_round_f32_to_f16_tonearest:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT: ; return to shader part epilog
+; SDAG-LABEL: v_fptrunc_round_f32_to_f16_tonearest:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GFX11-SDAG-LABEL: v_fptrunc_round_f32_to_f16_tonearest:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0.l, v0
+; GFX11-SDAG-NEXT: ; return to shader part epilog
%res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.tonearest")
ret half %res
}
define amdgpu_gs half @v_fptrunc_round_f32_to_f16_upward(float %a) {
-; CHECK-LABEL: v_fptrunc_round_f32_to_f16_upward:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
-; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT: ; return to shader part epilog
+; SDAG-LABEL: v_fptrunc_round_f32_to_f16_upward:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GFX11-SDAG-LABEL: v_fptrunc_round_f32_to_f16_upward:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0.l, v0
+; GFX11-SDAG-NEXT: ; return to shader part epilog
%res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.upward")
ret half %res
}
define amdgpu_gs half @v_fptrunc_round_f32_to_f16_downward(float %a) {
-; CHECK-LABEL: v_fptrunc_round_f32_to_f16_downward:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
-; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT: ; return to shader part epilog
+; SDAG-LABEL: v_fptrunc_round_f32_to_f16_downward:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GFX11-SDAG-LABEL: v_fptrunc_round_f32_to_f16_downward:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0.l, v0
+; GFX11-SDAG-NEXT: ; return to shader part epilog
%res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.downward")
ret half %res
}
define amdgpu_gs half @v_fptrunc_round_f32_to_f16_towardzero(float %a) {
-; CHECK-LABEL: v_fptrunc_round_f32_to_f16_towardzero:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, v0, v0
-; CHECK-NEXT: ; return to shader part epilog
+; SDAG-LABEL: v_fptrunc_round_f32_to_f16_towardzero:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, v0, v0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GFX11-SDAG-LABEL: v_fptrunc_round_f32_to_f16_towardzero:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_cvt_pk_rtz_f16_f32_e64 v0, v0, s0
+; GFX11-SDAG-NEXT: ; return to shader part epilog
%res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.towardzero")
ret half %res
}
@@ -49,6 +73,10 @@ define amdgpu_gs <2 x half> @v_fptrunc_round_f32_to_v2f16_towardzero(float %a, f
; SDAG-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, v0, v1
; SDAG-NEXT: ; return to shader part epilog
;
+; GFX11-SDAG-LABEL: v_fptrunc_round_f32_to_v2f16_towardzero:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_cvt_pk_rtz_f16_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT: ; return to shader part epilog
; GISEL-LABEL: v_fptrunc_round_f32_to_v2f16_towardzero:
; GISEL: ; %bb.0:
; GISEL-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, v0, v0
@@ -63,10 +91,15 @@ define amdgpu_gs <2 x half> @v_fptrunc_round_f32_to_v2f16_towardzero(float %a, f
}
define amdgpu_gs <2 x half> @v_fptrunc_round_poison_to_v2f16_towardzero(float %a) {
-; CHECK-LABEL: v_fptrunc_round_poison_to_v2f16_towardzero:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, v0, v0
-; CHECK-NEXT: ; return to shader part epilog
+; SDAG-LABEL: v_fptrunc_round_poison_to_v2f16_towardzero:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, v0, v0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GFX11-SDAG-LABEL: v_fptrunc_round_poison_to_v2f16_towardzero:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_cvt_pk_rtz_f16_f32_e64 v0, v0, s0
+; GFX11-SDAG-NEXT: ; return to shader part epilog
%lo = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.towardzero")
%tmp = insertelement <2 x half> poison, half %lo, i32 0
ret <2 x half> %tmp
@@ -80,6 +113,11 @@ define amdgpu_gs <2 x half> @v_fptrunc_round_constant_to_v2f16_towardzero(float
; SDAG-NEXT: v_perm_b32 v0, s0, v0, 0x5040100
; SDAG-NEXT: ; return to shader part epilog
;
+; GFX11-SDAG-LABEL: v_fptrunc_round_constant_to_v2f16_towardzero:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_cvt_pk_rtz_f16_f32_e64 v0, v0, s0
+; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, 0x3c00
+; GFX11-SDAG-NEXT: ; return to shader part epilog
; GISEL-LABEL: v_fptrunc_round_constant_to_v2f16_towardzero:
; GISEL: ; %bb.0:
; GISEL-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, v0, v0
@@ -92,18 +130,32 @@ define amdgpu_gs <2 x half> @v_fptrunc_round_constant_to_v2f16_towardzero(float
}
define amdgpu_gs void @v_fptrunc_round_f32_to_f16_upward_multiple_calls(float %a, float %b, ptr addrspace(1) %out) {
-; CHECK-LABEL: v_fptrunc_round_f32_to_f16_upward_multiple_calls:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
-; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT: v_cvt_f16_f32_e32 v4, v1
-; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
-; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
-; CHECK-NEXT: v_add_f16_e32 v0, v0, v4
-; CHECK-NEXT: v_add_f16_e32 v0, v1, v0
-; CHECK-NEXT: global_store_short v[2:3], v0, off
-; CHECK-NEXT: s_endpgm
+; SDAG-LABEL: v_fptrunc_round_f32_to_f16_upward_multiple_calls:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v1
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; SDAG-NEXT: v_add_f16_e32 v0, v0, v4
+; SDAG-NEXT: v_add_f16_e32 v0, v1, v0
+; SDAG-NEXT: global_store_short v[2:3], v0, off
+; SDAG-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_fptrunc_round_f32_to_f16_upward_multiple_calls:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0.l, v0
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0.h, v1
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v1.l, v1
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v1.l, v0.l
+; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off
+; GFX11-SDAG-NEXT: s_endpgm
%res1 = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.upward")
%res2 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.upward")
%res3 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.downward")
@@ -114,18 +166,32 @@ define amdgpu_gs void @v_fptrunc_round_f32_to_f16_upward_multiple_calls(float %a
}
define amdgpu_gs void @v_fptrunc_round_f32_to_f16_downward_multiple_calls(float %a, float %b, ptr addrspace(1) %out) {
-; CHECK-LABEL: v_fptrunc_round_f32_to_f16_downward_multiple_calls:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
-; CHECK-NEXT: v_cvt_f16_f32_e32 v4, v0
-; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
-; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
-; CHECK-NEXT: v_add_f16_e32 v0, v4, v0
-; CHECK-NEXT: v_add_f16_e32 v0, v1, v0
-; CHECK-NEXT: global_store_short v[2:3], v0, off
-; CHECK-NEXT: s_endpgm
+; SDAG-LABEL: v_fptrunc_round_f32_to_f16_downward_multiple_calls:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v0
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; SDAG-NEXT: v_add_f16_e32 v0, v4, v0
+; SDAG-NEXT: v_add_f16_e32 v0, v1, v0
+; SDAG-NEXT: global_store_short v[2:3], v0, off
+; SDAG-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_fptrunc_round_f32_to_f16_downward_multiple_calls:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v4.l, v0
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0.l, v0
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0.h, v1
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v4.l, v0.l
+; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l
+; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off
+; GFX11-SDAG-NEXT: s_endpgm
%res1 = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.upward")
%res2 = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.downward")
%res3 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.downward")
@@ -136,17 +202,30 @@ define amdgpu_gs void @v_fptrunc_round_f32_to_f16_downward_multiple_calls(float
}
define amdgpu_gs void @v_fptrunc_round_f32_to_f16_towardzero_multiple_calls(float %a, float %b, ptr addrspace(1) %out) {
-; CHECK-LABEL: v_fptrunc_round_f32_to_f16_towardzero_multiple_calls:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, v0, v0
-; CHECK-NEXT: v_cvt_pkrtz_f16_f32_e32 v4, v1, v0
-; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
-; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; CHECK-NEXT: v_add_f16_e32 v0, v0, v4
-; CHECK-NEXT: v_add_f16_e32 v0, v1, v0
-; CHECK-NEXT: global_store_short v[2:3], v0, off
-; CHECK-NEXT: s_endpgm
+; SDAG-LABEL: v_fptrunc_round_f32_to_f16_towardzero_multiple_calls:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, v0, v0
+; SDAG-NEXT: v_cvt_pkrtz_f16_f32_e32 v4, v1, v0
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; SDAG-NEXT: v_add_f16_e32 v0, v0, v4
+; SDAG-NEXT: v_add_f16_e32 v0, v1, v0
+; SDAG-NEXT: global_store_short v[2:3], v0, off
+; SDAG-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: v_fptrunc_round_f32_to_f16_towardzero_multiple_calls:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: v_cvt_pk_rtz_f16_f32_e64 v4, v0, s0
+; GFX11-SDAG-NEXT: v_cvt_pk_rtz_f16_f32_e64 v5, v1, s0
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0.l, v1
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_f16_e32 v0.h, v4.l, v5.l
+; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off
+; GFX11-SDAG-NEXT: s_endpgm
%res1 = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.towardzero")
%res2 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.towardzero")
%res3 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.upward")
@@ -157,14 +236,23 @@ define amdgpu_gs void @v_fptrunc_round_f32_to_f16_towardzero_multiple_calls(floa
}
define amdgpu_gs i32 @s_fptrunc_round_f32_to_f16_upward(float inreg %a, ptr addrspace(1) %out) {
-; CHECK-LABEL: s_fptrunc_round_f32_to_f16_upward:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
-; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: ; return to shader part epilog
+; SDAG-LABEL: s_fptrunc_round_f32_to_f16_upward:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GFX11-SDAG-LABEL: s_fptrunc_round_f32_to_f16_upward:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0.l, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-SDAG-NEXT: ; return to shader part epilog
%res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.upward")
%bitcast = bitcast half %res to i16
%ret = zext i16 %bitcast to i32
@@ -172,14 +260,23 @@ define amdgpu_gs i32 @s_fptrunc_round_f32_to_f16_upward(float inreg %a, ptr addr
}
define amdgpu_gs i32 @s_fptrunc_round_f32_to_f16_downward(float inreg %a, ptr addrspace(1) %out) {
-; CHECK-LABEL: s_fptrunc_round_f32_to_f16_downward:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
-; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: ; return to shader part epilog
+; SDAG-LABEL: s_fptrunc_round_f32_to_f16_downward:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GFX11-SDAG-LABEL: s_fptrunc_round_f32_to_f16_downward:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0.l, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-SDAG-NEXT: ; return to shader part epilog
%res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.downward")
%bitcast = bitcast half %res to i16
%ret = zext i16 %bitcast to i32
@@ -187,20 +284,34 @@ define amdgpu_gs i32 @s_fptrunc_round_f32_to_f16_downward(float inreg %a, ptr ad
}
define amdgpu_gs void @s_fptrunc_round_f32_to_f16_upward_multiple_calls(float inreg %a, float inreg %b, ptr addrspace(1) %out) {
-; CHECK-LABEL: s_fptrunc_round_f32_to_f16_upward_multiple_calls:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
-; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CHECK-NEXT: v_cvt_f16_f32_e32 v4, v3
-; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
-; CHECK-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
-; CHECK-NEXT: v_add_f16_e32 v2, v2, v4
-; CHECK-NEXT: v_add_f16_e32 v2, v3, v2
-; CHECK-NEXT: global_store_short v[0:1], v2, off
-; CHECK-NEXT: s_endpgm
+; SDAG-LABEL: s_fptrunc_round_f32_to_f16_upward_multiple_calls:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_mov_b32_e32 v2, s0
+; SDAG-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v3
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; SDAG-NEXT: v_add_f16_e32 v2, v2, v4
+; SDAG-NEXT: v_add_f16_e32 v2, v3, v2
+; SDAG-NEXT: global_store_short v[0:1], v2, off
+; SDAG-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: s_fptrunc_round_f32_to_f16_upward_multiple_calls:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v2.l, s0
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v2.h, s1
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v3.l, s1
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_f16_e32 v2.l, v2.l, v2.h
+; GFX11-SDAG-NEXT: v_add_f16_e32 v2.l, v3.l, v2.l
+; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-SDAG-NEXT: s_endpgm
%res1 = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.upward")
%res2 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.upward")
%res3 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.downward")
@@ -219,6 +330,14 @@ define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_upward(<2 x float> %
; SDAG-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; SDAG-NEXT: ; return to shader part epilog
;
+; GFX11-SDAG-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v1.h, v1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v1.l, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-SDAG-NEXT: ; return to shader part epilog
; GISEL-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
@@ -239,6 +358,14 @@ define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_downward(<2 x float>
; SDAG-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; SDAG-NEXT: ; return to shader part epilog
;
+; GFX11-SDAG-LABEL: v_fptrunc_round_v2f32_to_v2f16_downward:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v1.h, v1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v1.l, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-SDAG-NEXT: ; return to shader part epilog
; GISEL-LABEL: v_fptrunc_round_v2f32_to_v2f16_downward:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
@@ -270,6 +397,22 @@ define amdgpu_gs void @v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x
; SDAG-NEXT: global_store_dword v[4:5], v0, off
; SDAG-NEXT: s_endpgm
;
+; GFX11-SDAG-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v1.h, v1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v1.l, v0
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0.h, v3
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0.l, v2
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v3.h, v3
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v3.l, v2
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; GFX11-SDAG-NEXT: v_pk_add_f16 v0, v1, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_pk_add_f16 v0, v3, v0
+; GFX11-SDAG-NEXT: global_store_b32 v[4:5], v0, off
+; GFX11-SDAG-NEXT: s_endpgm
; GISEL-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
@@ -298,18 +441,31 @@ define amdgpu_gs void @v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x
}
define amdgpu_gs <2 x i32> @s_fptrunc_round_v2f32_to_v2f16_upward(<2 x float> inreg %a, ptr addrspace(1) %out) {
-; CHECK-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
-; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_readfirstlane_b32 s1, v1
-; CHECK-NEXT: ; return to shader part epilog
+; SDAG-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GFX11-SDAG-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0.l, s0
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v1.l, s1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-SDAG-NEXT: ; return to shader part epilog
%res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
%bitcast = bitcast <2 x half> %res to <2 x i16>
%ret = zext <2 x i16> %bitcast to <2 x i32>
@@ -317,18 +473,31 @@ define amdgpu_gs <2 x i32> @s_fptrunc_round_v2f32_to_v2f16_upward(<2 x float> in
}
define amdgpu_gs <2 x i32> @s_fptrunc_round_v2f32_to_v2f16_downward(<2 x float> inreg %a, ptr addrspace(1) %out) {
-; CHECK-LABEL: s_fptrunc_round_v2f32_to_v2f16_downward:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
-; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_readfirstlane_b32 s1, v1
-; CHECK-NEXT: ; return to shader part epilog
+; SDAG-LABEL: s_fptrunc_round_v2f32_to_v2f16_downward:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GFX11-SDAG-LABEL: s_fptrunc_round_v2f32_to_v2f16_downward:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0.l, s0
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v1.l, s1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-SDAG-NEXT: ; return to shader part epilog
%res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.downward")
%bitcast = bitcast <2 x half> %res to <2 x i16>
%ret = zext <2 x i16> %bitcast to <2 x i32>
@@ -362,6 +531,22 @@ define amdgpu_gs void @s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x
; SDAG-NEXT: global_store_dword v[0:1], v2, off
; SDAG-NEXT: s_endpgm
;
+; GFX11-SDAG-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v2.h, s1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v2.l, s0
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v3.h, s3
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v3.l, s2
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v4.h, s3
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v4.l, s2
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; GFX11-SDAG-NEXT: v_pk_add_f16 v2, v2, v3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_pk_add_f16 v2, v4, v2
+; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-SDAG-NEXT: s_endpgm
; GISEL-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
; GISEL: ; %bb.0:
; GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -403,6 +588,15 @@ define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_upward(<3 x float> %
; SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2
; SDAG-NEXT: ; return to shader part epilog
;
+; GFX11-SDAG-LABEL: v_fptrunc_round_v3f32_to_v3f16_upward:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v3.h, v1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v3.l, v0
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v1.l, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-SDAG-NEXT: ; return to shader part epilog
; GISEL-LABEL: v_fptrunc_round_v3f32_to_v3f16_upward:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
@@ -425,6 +619,15 @@ define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_downward(<3 x float>
; SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2
; SDAG-NEXT: ; return to shader part epilog
;
+; GFX11-SDAG-LABEL: v_fptrunc_round_v3f32_to_v3f16_downward:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v3.h, v1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v3.l, v0
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v1.l, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-SDAG-NEXT: ; return to shader part epilog
; GISEL-LABEL: v_fptrunc_round_v3f32_to_v3f16_downward:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
@@ -449,6 +652,16 @@ define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_upward(<4 x float> %
; SDAG-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
; SDAG-NEXT: ; return to shader part epilog
;
+; GFX11-SDAG-LABEL: v_fptrunc_round_v4f32_to_v4f16_upward:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v3.h, v3
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v1.h, v1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v1.l, v0
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v3.l, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v3
+; GFX11-SDAG-NEXT: ; return to shader part epilog
; GISEL-LABEL: v_fptrunc_round_v4f32_to_v4f16_upward:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
@@ -475,6 +688,16 @@ define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_downward(<4 x float>
; SDAG-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
; SDAG-NEXT: ; return to shader part epilog
;
+; GFX11-SDAG-LABEL: v_fptrunc_round_v4f32_to_v4f16_downward:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v3.h, v3
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v1.h, v1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v1.l, v0
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v3.l, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v3
+; GFX11-SDAG-NEXT: ; return to shader part epilog
; GISEL-LABEL: v_fptrunc_round_v4f32_to_v4f16_downward:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
@@ -507,6 +730,21 @@ define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_upward(<8 x float> %
; SDAG-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
; SDAG-NEXT: ; return to shader part epilog
;
+; GFX11-SDAG-LABEL: v_fptrunc_round_v8f32_to_v8f16_upward:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v7.h, v7
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v5.h, v5
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v3.h, v3
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v1.h, v1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v1.l, v0
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v3.l, v2
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v5.l, v4
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v7.l, v6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v3
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v3, v7
+; GFX11-SDAG-NEXT: ; return to shader part epilog
; GISEL-LABEL: v_fptrunc_round_v8f32_to_v8f16_upward:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
@@ -545,6 +783,21 @@ define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_downward(<8 x float>
; SDAG-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
; SDAG-NEXT: ; return to shader part epilog
;
+; GFX11-SDAG-LABEL: v_fptrunc_round_v8f32_to_v8f16_downward:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v7.h, v7
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v5.h, v5
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v3.h, v3
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v1.h, v1
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v1.l, v0
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v3.l, v2
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v5.l, v4
+; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v7.l, v6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v3
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v3, v7
+; GFX11-SDAG-NEXT: ; return to shader part epilog
; GISEL-LABEL: v_fptrunc_round_v8f32_to_v8f16_downward:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
More information about the llvm-commits mailing list