[llvm] [AMDGPU] add back the true16 pattern for cvt_pk_rtz (PR #184857)

Brox Chen via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 5 11:54:15 PST 2026


https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/184857

From 248b92165df36e9e67f756f1de2dde98868fe976 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Thu, 5 Mar 2026 14:06:48 -0500
Subject: [PATCH] [AMDGPU] add back the missing true16 pattern for cvt_pk_rtz

---
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  10 +-
 .../test/CodeGen/AMDGPU/llvm.fptrunc.round.ll | 475 ++++++++++++++----
 2 files changed, 373 insertions(+), 112 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 75a0b6b4c34fb..6f8c16955952b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -304,9 +304,17 @@ let True16Predicate = UseFakeTrue16Insts in
 def : GCNPat <(f16 (fptrunc_round f32:$src0, (i32 SupportedRoundMode:$round))),
      (FPTRUNC_ROUND_F16_F32_PSEUDO_fake16_e32 $src0, (as_hw_round_mode $round))>;
 
+let True16Predicate = UseRealTrue16Insts in
+def : GCNPat <(f16 (fptrunc_round (f32 (VOP3OpSelMods f32:$src0, i32:$src0_modifiers)), (i32 SupportedRoundMode:$round))),
+     (FPTRUNC_ROUND_F16_F32_PSEUDO_t16_e64 $src0_modifiers, $src0, (as_hw_round_mode $round))>;
+
+let True16Predicate = NotUseRealTrue16Insts in
+def : GCNPat <(f16 (fptrunc_round f32:$src0, (i32 0))),
+     (V_CVT_PKRTZ_F16_F32_e32 $src0, (IMPLICIT_DEF))>;
+
 let True16Predicate = UseRealTrue16Insts in
 def : GCNPat <(f16 (fptrunc_round (f32 (VOP3OpSelMods f32:$src0, i32:$src0_modifiers)), (i32 0))),
-     (V_CVT_PKRTZ_F16_F32_e32 $src0_modifiers, $src0)>;
+     (EXTRACT_SUBREG (V_CVT_PKRTZ_F16_F32_e64 $src0_modifiers, $src0, 0, (IMPLICIT_DEF)), lo16)>;
 
 def : GCNPat <(v2f16 (build_vector (f16 (fptrunc_round f32:$src0, (i32 0))),
                                  (f16 (fptrunc_round f32:$src1, (i32 0))))),
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll
index 2e33fcac9536c..2a5a8f5e068d4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll
@@ -1,44 +1,68 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=CHECK,SDAG %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=CHECK,SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=CHECK,GFX11-SDAG %s
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=CHECK,GFX11-GISEL %s
 ; XUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=CHECK,GISEL %s
 
 ; FIXME. gisel for fptrunc_round rtz
 
 define amdgpu_gs half @v_fptrunc_round_f32_to_f16_tonearest(float %a) {
-; CHECK-LABEL: v_fptrunc_round_f32_to_f16_tonearest:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT:    ; return to shader part epilog
+; SDAG-LABEL: v_fptrunc_round_f32_to_f16_tonearest:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX11-SDAG-LABEL: v_fptrunc_round_f32_to_f16_tonearest:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v0.l, v0
+; GFX11-SDAG-NEXT:    ; return to shader part epilog
   %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.tonearest")
   ret half %res
 }
 
 define amdgpu_gs half @v_fptrunc_round_f32_to_f16_upward(float %a) {
-; CHECK-LABEL: v_fptrunc_round_f32_to_f16_upward:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT:    ; return to shader part epilog
+; SDAG-LABEL: v_fptrunc_round_f32_to_f16_upward:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX11-SDAG-LABEL: v_fptrunc_round_f32_to_f16_upward:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v0.l, v0
+; GFX11-SDAG-NEXT:    ; return to shader part epilog
   %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.upward")
   ret half %res
 }
 
 define amdgpu_gs half @v_fptrunc_round_f32_to_f16_downward(float %a) {
-; CHECK-LABEL: v_fptrunc_round_f32_to_f16_downward:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT:    ; return to shader part epilog
+; SDAG-LABEL: v_fptrunc_round_f32_to_f16_downward:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX11-SDAG-LABEL: v_fptrunc_round_f32_to_f16_downward:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v0.l, v0
+; GFX11-SDAG-NEXT:    ; return to shader part epilog
   %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.downward")
   ret half %res
 }
 
 define amdgpu_gs half @v_fptrunc_round_f32_to_f16_towardzero(float %a) {
-; CHECK-LABEL: v_fptrunc_round_f32_to_f16_towardzero:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_cvt_pkrtz_f16_f32_e32 v0, v0, v0
-; CHECK-NEXT:    ; return to shader part epilog
+; SDAG-LABEL: v_fptrunc_round_f32_to_f16_towardzero:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    v_cvt_pkrtz_f16_f32_e32 v0, v0, v0
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX11-SDAG-LABEL: v_fptrunc_round_f32_to_f16_towardzero:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v0, v0, s0
+; GFX11-SDAG-NEXT:    ; return to shader part epilog
   %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.towardzero")
   ret half %res
 }
@@ -49,6 +73,10 @@ define amdgpu_gs <2 x half> @v_fptrunc_round_f32_to_v2f16_towardzero(float %a, f
 ; SDAG-NEXT:    v_cvt_pkrtz_f16_f32_e32 v0, v0, v1
 ; SDAG-NEXT:    ; return to shader part epilog
 ;
+; GFX11-SDAG-LABEL: v_fptrunc_round_f32_to_v2f16_towardzero:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    v_cvt_pk_rtz_f16_f32_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    ; return to shader part epilog
 ; GISEL-LABEL: v_fptrunc_round_f32_to_v2f16_towardzero:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    v_cvt_pkrtz_f16_f32_e32 v0, v0, v0
@@ -63,10 +91,15 @@ define amdgpu_gs <2 x half> @v_fptrunc_round_f32_to_v2f16_towardzero(float %a, f
 }
 
 define amdgpu_gs <2 x half> @v_fptrunc_round_poison_to_v2f16_towardzero(float %a) {
-; CHECK-LABEL: v_fptrunc_round_poison_to_v2f16_towardzero:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_cvt_pkrtz_f16_f32_e32 v0, v0, v0
-; CHECK-NEXT:    ; return to shader part epilog
+; SDAG-LABEL: v_fptrunc_round_poison_to_v2f16_towardzero:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    v_cvt_pkrtz_f16_f32_e32 v0, v0, v0
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX11-SDAG-LABEL: v_fptrunc_round_poison_to_v2f16_towardzero:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v0, v0, s0
+; GFX11-SDAG-NEXT:    ; return to shader part epilog
   %lo = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.towardzero")
   %tmp = insertelement <2 x half> poison, half %lo, i32 0
   ret <2 x half> %tmp
@@ -80,6 +113,11 @@ define amdgpu_gs <2 x half> @v_fptrunc_round_constant_to_v2f16_towardzero(float
 ; SDAG-NEXT:    v_perm_b32 v0, s0, v0, 0x5040100
 ; SDAG-NEXT:    ; return to shader part epilog
 ;
+; GFX11-SDAG-LABEL: v_fptrunc_round_constant_to_v2f16_towardzero:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v0, v0, s0
+; GFX11-SDAG-NEXT:    v_mov_b16_e32 v0.h, 0x3c00
+; GFX11-SDAG-NEXT:    ; return to shader part epilog
 ; GISEL-LABEL: v_fptrunc_round_constant_to_v2f16_towardzero:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    v_cvt_pkrtz_f16_f32_e32 v0, v0, v0
@@ -92,18 +130,32 @@ define amdgpu_gs <2 x half> @v_fptrunc_round_constant_to_v2f16_towardzero(float
 }
 
 define amdgpu_gs void @v_fptrunc_round_f32_to_f16_upward_multiple_calls(float %a, float %b, ptr addrspace(1) %out) {
-; CHECK-LABEL: v_fptrunc_round_f32_to_f16_upward_multiple_calls:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v4, v1
-; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
-; CHECK-NEXT:    v_add_f16_e32 v0, v0, v4
-; CHECK-NEXT:    v_add_f16_e32 v0, v1, v0
-; CHECK-NEXT:    global_store_short v[2:3], v0, off
-; CHECK-NEXT:    s_endpgm
+; SDAG-LABEL: v_fptrunc_round_f32_to_f16_upward_multiple_calls:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v4, v1
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; SDAG-NEXT:    v_add_f16_e32 v0, v0, v4
+; SDAG-NEXT:    v_add_f16_e32 v0, v1, v0
+; SDAG-NEXT:    global_store_short v[2:3], v0, off
+; SDAG-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_fptrunc_round_f32_to_f16_upward_multiple_calls:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v0.l, v0
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v0.h, v1
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v1.l, v1
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-NEXT:    v_add_f16_e32 v0.l, v1.l, v0.l
+; GFX11-SDAG-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-SDAG-NEXT:    s_endpgm
   %res1 = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.upward")
   %res2 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.upward")
   %res3 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.downward")
@@ -114,18 +166,32 @@ define amdgpu_gs void @v_fptrunc_round_f32_to_f16_upward_multiple_calls(float %a
 }
 
 define amdgpu_gs void @v_fptrunc_round_f32_to_f16_downward_multiple_calls(float %a, float %b, ptr addrspace(1) %out) {
-; CHECK-LABEL: v_fptrunc_round_f32_to_f16_downward_multiple_calls:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v4, v0
-; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
-; CHECK-NEXT:    v_add_f16_e32 v0, v4, v0
-; CHECK-NEXT:    v_add_f16_e32 v0, v1, v0
-; CHECK-NEXT:    global_store_short v[2:3], v0, off
-; CHECK-NEXT:    s_endpgm
+; SDAG-LABEL: v_fptrunc_round_f32_to_f16_downward_multiple_calls:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v4, v0
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; SDAG-NEXT:    v_add_f16_e32 v0, v4, v0
+; SDAG-NEXT:    v_add_f16_e32 v0, v1, v0
+; SDAG-NEXT:    global_store_short v[2:3], v0, off
+; SDAG-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_fptrunc_round_f32_to_f16_downward_multiple_calls:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v4.l, v0
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v0.l, v0
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v0.h, v1
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f16_e32 v0.l, v4.l, v0.l
+; GFX11-SDAG-NEXT:    v_add_f16_e32 v0.l, v0.h, v0.l
+; GFX11-SDAG-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-SDAG-NEXT:    s_endpgm
   %res1 = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.upward")
   %res2 = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.downward")
   %res3 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.downward")
@@ -136,17 +202,30 @@ define amdgpu_gs void @v_fptrunc_round_f32_to_f16_downward_multiple_calls(float
 }
 
 define amdgpu_gs void @v_fptrunc_round_f32_to_f16_towardzero_multiple_calls(float %a, float %b, ptr addrspace(1) %out) {
-; CHECK-LABEL: v_fptrunc_round_f32_to_f16_towardzero_multiple_calls:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_cvt_pkrtz_f16_f32_e32 v0, v0, v0
-; CHECK-NEXT:    v_cvt_pkrtz_f16_f32_e32 v4, v1, v0
-; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; CHECK-NEXT:    v_add_f16_e32 v0, v0, v4
-; CHECK-NEXT:    v_add_f16_e32 v0, v1, v0
-; CHECK-NEXT:    global_store_short v[2:3], v0, off
-; CHECK-NEXT:    s_endpgm
+; SDAG-LABEL: v_fptrunc_round_f32_to_f16_towardzero_multiple_calls:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    v_cvt_pkrtz_f16_f32_e32 v0, v0, v0
+; SDAG-NEXT:    v_cvt_pkrtz_f16_f32_e32 v4, v1, v0
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; SDAG-NEXT:    v_add_f16_e32 v0, v0, v4
+; SDAG-NEXT:    v_add_f16_e32 v0, v1, v0
+; SDAG-NEXT:    global_store_short v[2:3], v0, off
+; SDAG-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_fptrunc_round_f32_to_f16_towardzero_multiple_calls:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v4, v0, s0
+; GFX11-SDAG-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v5, v1, s0
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v0.l, v1
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f16_e32 v0.h, v4.l, v5.l
+; GFX11-SDAG-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-SDAG-NEXT:    s_endpgm
   %res1 = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.towardzero")
   %res2 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.towardzero")
   %res3 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.upward")
@@ -157,14 +236,23 @@ define amdgpu_gs void @v_fptrunc_round_f32_to_f16_towardzero_multiple_calls(floa
 }
 
 define amdgpu_gs i32 @s_fptrunc_round_f32_to_f16_upward(float inreg %a, ptr addrspace(1) %out) {
-; CHECK-LABEL: s_fptrunc_round_f32_to_f16_upward:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
-; CHECK-NEXT:    ; return to shader part epilog
+; SDAG-LABEL: s_fptrunc_round_f32_to_f16_upward:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX11-SDAG-LABEL: s_fptrunc_round_f32_to_f16_upward:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v0.l, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-SDAG-NEXT:    ; return to shader part epilog
   %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.upward")
   %bitcast = bitcast half %res to i16
   %ret = zext i16 %bitcast to i32
@@ -172,14 +260,23 @@ define amdgpu_gs i32 @s_fptrunc_round_f32_to_f16_upward(float inreg %a, ptr addr
 }
 
 define amdgpu_gs i32 @s_fptrunc_round_f32_to_f16_downward(float inreg %a, ptr addrspace(1) %out) {
-; CHECK-LABEL: s_fptrunc_round_f32_to_f16_downward:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
-; CHECK-NEXT:    ; return to shader part epilog
+; SDAG-LABEL: s_fptrunc_round_f32_to_f16_downward:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX11-SDAG-LABEL: s_fptrunc_round_f32_to_f16_downward:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v0.l, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-SDAG-NEXT:    ; return to shader part epilog
   %res = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.downward")
   %bitcast = bitcast half %res to i16
   %ret = zext i16 %bitcast to i32
@@ -187,20 +284,34 @@ define amdgpu_gs i32 @s_fptrunc_round_f32_to_f16_downward(float inreg %a, ptr ad
 }
 
 define amdgpu_gs void @s_fptrunc_round_f32_to_f16_upward_multiple_calls(float inreg %a, float inreg %b, ptr addrspace(1) %out) {
-; CHECK-LABEL: s_fptrunc_round_f32_to_f16_upward_multiple_calls:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_mov_b32_e32 v2, s0
-; CHECK-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v4, v3
-; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
-; CHECK-NEXT:    v_add_f16_e32 v2, v2, v4
-; CHECK-NEXT:    v_add_f16_e32 v2, v3, v2
-; CHECK-NEXT:    global_store_short v[0:1], v2, off
-; CHECK-NEXT:    s_endpgm
+; SDAG-LABEL: s_fptrunc_round_f32_to_f16_upward_multiple_calls:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v4, v3
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; SDAG-NEXT:    v_add_f16_e32 v2, v2, v4
+; SDAG-NEXT:    v_add_f16_e32 v2, v3, v2
+; SDAG-NEXT:    global_store_short v[0:1], v2, off
+; SDAG-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: s_fptrunc_round_f32_to_f16_upward_multiple_calls:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v2.l, s0
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v2.h, s1
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v3.l, s1
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f16_e32 v2.l, v2.l, v2.h
+; GFX11-SDAG-NEXT:    v_add_f16_e32 v2.l, v3.l, v2.l
+; GFX11-SDAG-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX11-SDAG-NEXT:    s_endpgm
   %res1 = call half @llvm.fptrunc.round.f16.f32(float %a, metadata !"round.upward")
   %res2 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.upward")
   %res3 = call half @llvm.fptrunc.round.f16.f32(float %b, metadata !"round.downward")
@@ -219,6 +330,14 @@ define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_upward(<2 x float> %
 ; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; SDAG-NEXT:    ; return to shader part epilog
 ;
+; GFX11-SDAG-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v1.h, v1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v1.l, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-SDAG-NEXT:    ; return to shader part epilog
 ; GISEL-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
@@ -239,6 +358,14 @@ define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_downward(<2 x float>
 ; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; SDAG-NEXT:    ; return to shader part epilog
 ;
+; GFX11-SDAG-LABEL: v_fptrunc_round_v2f32_to_v2f16_downward:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v1.h, v1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v1.l, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-SDAG-NEXT:    ; return to shader part epilog
 ; GISEL-LABEL: v_fptrunc_round_v2f32_to_v2f16_downward:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
@@ -270,6 +397,22 @@ define amdgpu_gs void @v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x
 ; SDAG-NEXT:    global_store_dword v[4:5], v0, off
 ; SDAG-NEXT:    s_endpgm
 ;
+; GFX11-SDAG-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v1.h, v1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v1.l, v0
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v0.h, v3
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v0.l, v2
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v3.h, v3
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v3.l, v2
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; GFX11-SDAG-NEXT:    v_pk_add_f16 v0, v1, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_pk_add_f16 v0, v3, v0
+; GFX11-SDAG-NEXT:    global_store_b32 v[4:5], v0, off
+; GFX11-SDAG-NEXT:    s_endpgm
 ; GISEL-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
@@ -298,18 +441,31 @@ define amdgpu_gs void @v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x
 }
 
 define amdgpu_gs <2 x i32> @s_fptrunc_round_v2f32_to_v2f16_upward(<2 x float> inreg %a, ptr addrspace(1) %out) {
-; CHECK-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    v_mov_b32_e32 v1, s1
-; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; CHECK-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
-; CHECK-NEXT:    v_readfirstlane_b32 s1, v1
-; CHECK-NEXT:    ; return to shader part epilog
+; SDAG-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX11-SDAG-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v0.l, s0
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v1.l, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX11-SDAG-NEXT:    ; return to shader part epilog
   %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
   %bitcast = bitcast <2 x half> %res to <2 x i16>
   %ret = zext <2 x i16> %bitcast to <2 x i32>
@@ -317,18 +473,31 @@ define amdgpu_gs <2 x i32> @s_fptrunc_round_v2f32_to_v2f16_upward(<2 x float> in
 }
 
 define amdgpu_gs <2 x i32> @s_fptrunc_round_v2f32_to_v2f16_downward(<2 x float> inreg %a, ptr addrspace(1) %out) {
-; CHECK-LABEL: s_fptrunc_round_v2f32_to_v2f16_downward:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    v_mov_b32_e32 v1, s1
-; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; CHECK-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
-; CHECK-NEXT:    v_readfirstlane_b32 s1, v1
-; CHECK-NEXT:    ; return to shader part epilog
+; SDAG-LABEL: s_fptrunc_round_v2f32_to_v2f16_downward:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX11-SDAG-LABEL: s_fptrunc_round_v2f32_to_v2f16_downward:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v0.l, s0
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v1.l, s1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX11-SDAG-NEXT:    ; return to shader part epilog
   %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.downward")
   %bitcast = bitcast <2 x half> %res to <2 x i16>
   %ret = zext <2 x i16> %bitcast to <2 x i32>
@@ -362,6 +531,22 @@ define amdgpu_gs void @s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x
 ; SDAG-NEXT:    global_store_dword v[0:1], v2, off
 ; SDAG-NEXT:    s_endpgm
 ;
+; GFX11-SDAG-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v2.h, s1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v2.l, s0
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v3.h, s3
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v3.l, s2
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v4.h, s3
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v4.l, s2
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; GFX11-SDAG-NEXT:    v_pk_add_f16 v2, v2, v3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_pk_add_f16 v2, v4, v2
+; GFX11-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-SDAG-NEXT:    s_endpgm
 ; GISEL-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -403,6 +588,15 @@ define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_upward(<3 x float> %
 ; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; SDAG-NEXT:    ; return to shader part epilog
 ;
+; GFX11-SDAG-LABEL: v_fptrunc_round_v3f32_to_v3f16_upward:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v3.h, v1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v3.l, v0
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v1.l, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-SDAG-NEXT:    ; return to shader part epilog
 ; GISEL-LABEL: v_fptrunc_round_v3f32_to_v3f16_upward:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
@@ -425,6 +619,15 @@ define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_downward(<3 x float>
 ; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; SDAG-NEXT:    ; return to shader part epilog
 ;
+; GFX11-SDAG-LABEL: v_fptrunc_round_v3f32_to_v3f16_downward:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v3.h, v1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v3.l, v0
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v1.l, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-SDAG-NEXT:    ; return to shader part epilog
 ; GISEL-LABEL: v_fptrunc_round_v3f32_to_v3f16_downward:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
@@ -449,6 +652,16 @@ define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_upward(<4 x float> %
 ; SDAG-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
 ; SDAG-NEXT:    ; return to shader part epilog
 ;
+; GFX11-SDAG-LABEL: v_fptrunc_round_v4f32_to_v4f16_upward:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v3.h, v3
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v1.h, v1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v1.l, v0
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v3.l, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v3
+; GFX11-SDAG-NEXT:    ; return to shader part epilog
 ; GISEL-LABEL: v_fptrunc_round_v4f32_to_v4f16_upward:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
@@ -475,6 +688,16 @@ define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_downward(<4 x float>
 ; SDAG-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
 ; SDAG-NEXT:    ; return to shader part epilog
 ;
+; GFX11-SDAG-LABEL: v_fptrunc_round_v4f32_to_v4f16_downward:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v3.h, v3
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v1.h, v1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v1.l, v0
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v3.l, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v3
+; GFX11-SDAG-NEXT:    ; return to shader part epilog
 ; GISEL-LABEL: v_fptrunc_round_v4f32_to_v4f16_downward:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
@@ -507,6 +730,21 @@ define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_upward(<8 x float> %
 ; SDAG-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
 ; SDAG-NEXT:    ; return to shader part epilog
 ;
+; GFX11-SDAG-LABEL: v_fptrunc_round_v8f32_to_v8f16_upward:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v7.h, v7
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v5.h, v5
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v3.h, v3
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v1.h, v1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v1.l, v0
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v3.l, v2
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v5.l, v4
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v7.l, v6
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v3
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v3, v7
+; GFX11-SDAG-NEXT:    ; return to shader part epilog
 ; GISEL-LABEL: v_fptrunc_round_v8f32_to_v8f16_upward:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
@@ -545,6 +783,21 @@ define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_downward(<8 x float>
 ; SDAG-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
 ; SDAG-NEXT:    ; return to shader part epilog
 ;
+; GFX11-SDAG-LABEL: v_fptrunc_round_v8f32_to_v8f16_downward:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v7.h, v7
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v5.h, v5
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v3.h, v3
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v1.h, v1
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v1.l, v0
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v3.l, v2
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v5.l, v4
+; GFX11-SDAG-NEXT:    v_cvt_f16_f32_e64 v7.l, v6
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v3
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v3, v7
+; GFX11-SDAG-NEXT:    ; return to shader part epilog
 ; GISEL-LABEL: v_fptrunc_round_v8f32_to_v8f16_downward:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1



More information about the llvm-commits mailing list