[llvm] 401658c - AMDGPU: Fix vector handling of fptrunc_round

Wed Apr 24 03:55:09 PDT 2024

Author: Matt Arsenault
Date: 2024-04-24T12:42:55+02:00
New Revision: 401658cb4bad619254316ff936c527fec0861472

URL: https://github.com/llvm/llvm-project/commit/401658cb4bad619254316ff936c527fec0861472
DIFF: https://github.com/llvm/llvm-project/commit/401658cb4bad619254316ff936c527fec0861472.diff

LOG: AMDGPU: Fix vector handling of fptrunc_round

Added: 
    

Modified: 
    llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
    llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
    llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index d55091e2e71739..6a76ad7f5db749 100644

--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4764,6 +4764,8 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
     return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*pow*/});
   case G_BITCAST:
     return fewerElementsBitcast(MI, TypeIdx, NarrowTy);
+  case G_INTRINSIC_FPTRUNC_ROUND:
+    return fewerElementsVectorMultiEltType(GMI, NumElts, {2});
   default:
     return UnableToLegalize;
   }

diff  --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 7a9cfdf5c3fda9..1de43a4f60e3a2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -420,6 +420,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::FFLOOR:
   case ISD::FP_ROUND:
   case ISD::FP_EXTEND:
+  case ISD::FPTRUNC_ROUND:
   case ISD::FMA:
   case ISD::SIGN_EXTEND_INREG:
   case ISD::ANY_EXTEND_VECTOR_INREG:

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll
index 8496ed1cfae821..4526efc0d8fa4f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=CHECK,SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=CHECK,SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=CHECK,GISEL %s
 
 define amdgpu_gs half @v_fptrunc_round_f32_to_f16_upward(float %a) {
 ; CHECK-LABEL: v_fptrunc_round_f32_to_f16_upward:
@@ -98,3 +98,321 @@ define amdgpu_gs void @s_fptrunc_round_f32_to_f16_upward_multiple_calls(float in
   store half %res5, ptr addrspace(1) %out, align 4
   ret void
 }
+
+define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_upward(<2 x float> %a) {
+; SDAG-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT:    ; return to shader part epilog
+  %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
+  ret <2 x half> %res
+}
+
+define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_downward(<2 x float> %a) {
+; SDAG-LABEL: v_fptrunc_round_v2f32_to_v2f16_downward:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: v_fptrunc_round_v2f32_to_v2f16_downward:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT:    ; return to shader part epilog
+  %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.downward")
+  ret <2 x half> %res
+}
+
+define amdgpu_gs void @v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x float> %a, <2 x float> %b, ptr addrspace(1) %out) {
+; SDAG-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v6, v2
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v7, v3
+; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v3
+; SDAG-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
+; SDAG-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; SDAG-NEXT:    v_pk_add_f16 v0, v0, v3
+; SDAG-NEXT:    v_pk_add_f16 v0, v1, v0
+; SDAG-NEXT:    global_store_dword v[4:5], v0, off
+; SDAG-NEXT:    s_endpgm
+;
+; GISEL-LABEL: v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v6, v2
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 1
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v7, v3
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT:    v_lshl_or_b32 v1, v7, 16, v6
+; GISEL-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; GISEL-NEXT:    v_pk_add_f16 v0, v0, v1
+; GISEL-NEXT:    v_pk_add_f16 v0, v2, v0
+; GISEL-NEXT:    global_store_dword v[4:5], v0, off
+; GISEL-NEXT:    s_endpgm
+  %res1 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
+  %res2 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.upward")
+  %res3 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.downward")
+  %res4 = fadd <2 x half> %res1, %res2
+  %res5 = fadd <2 x half> %res3, %res4
+  store <2 x half> %res5, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_gs <2 x i32> @s_fptrunc_round_v2f32_to_v2f16_upward(<2 x float> inreg %a, ptr addrspace(1) %out) {
+; CHECK-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; CHECK-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-NEXT:    v_readfirstlane_b32 s1, v1
+; CHECK-NEXT:    ; return to shader part epilog
+  %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
+  %bitcast = bitcast <2 x half> %res to <2 x i16>
+  %ret = zext <2 x i16> %bitcast to <2 x i32>
+  ret <2 x i32> %ret
+}
+
+define amdgpu_gs <2 x i32> @s_fptrunc_round_v2f32_to_v2f16_downward(<2 x float> inreg %a, ptr addrspace(1) %out) {
+; CHECK-LABEL: s_fptrunc_round_v2f32_to_v2f16_downward:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CHECK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; CHECK-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-NEXT:    v_readfirstlane_b32 s1, v1
+; CHECK-NEXT:    ; return to shader part epilog
+  %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.downward")
+  %bitcast = bitcast <2 x half> %res to <2 x i16>
+  %ret = zext <2 x i16> %bitcast to <2 x i32>
+  ret <2 x i32> %ret
+}
+
+define amdgpu_gs void @s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x float> inreg %a, <2 x float> inreg %b, ptr addrspace(1) %out) {
+; CHECK-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    v_mov_b32_e32 v3, s2
+; CHECK-NEXT:    v_mov_b32_e32 v4, s1
+; CHECK-NEXT:    v_mov_b32_e32 v5, s3
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v6, v3
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v7, v5
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; CHECK-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; CHECK-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; CHECK-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; CHECK-NEXT:    v_lshl_or_b32 v2, v4, 16, v2
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v4, v5
+; CHECK-NEXT:    v_lshl_or_b32 v5, v7, 16, v6
+; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; CHECK-NEXT:    v_pk_add_f16 v2, v2, v5
+; CHECK-NEXT:    v_pk_add_f16 v2, v3, v2
+; CHECK-NEXT:    global_store_dword v[0:1], v2, off
+; CHECK-NEXT:    s_endpgm
+  %res1 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
+  %res2 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.upward")
+  %res3 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.downward")
+  %res4 = fadd <2 x half> %res1, %res2
+  %res5 = fadd <2 x half> %res3, %res4
+  store <2 x half> %res5, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; FIXME: Enable the commented-out <3 x float> to <3 x half> tests below.
+; define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_upward(<3 x float> %a) {
+;   %res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.upward")
+;   ret <3 x half> %res
+; }
+
+; define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_downward(<3 x float> %a) {
+;   %res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.downward")
+;   ret <3 x half> %res
+; }
+
+define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_upward(<4 x float> %a) {
+; SDAG-LABEL: v_fptrunc_round_v4f32_to_v4f16_upward:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; SDAG-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: v_fptrunc_round_v4f32_to_v4f16_upward:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT:    v_lshl_or_b32 v1, v3, 16, v2
+; GISEL-NEXT:    ; return to shader part epilog
+  %res = call <4 x half> @llvm.fptrunc.round.v4f16.v4f32(<4 x float> %a, metadata !"round.upward")
+  ret <4 x half> %res
+}
+
+define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_downward(<4 x float> %a) {
+; SDAG-LABEL: v_fptrunc_round_v4f32_to_v4f16_downward:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; SDAG-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: v_fptrunc_round_v4f32_to_v4f16_downward:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT:    v_lshl_or_b32 v1, v3, 16, v2
+; GISEL-NEXT:    ; return to shader part epilog
+  %res = call <4 x half> @llvm.fptrunc.round.v4f16.v4f32(<4 x float> %a, metadata !"round.downward")
+  ret <4 x half> %res
+}
+
+define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_upward(<8 x float> %a) {
+; SDAG-LABEL: v_fptrunc_round_v8f32_to_v8f16_upward:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; SDAG-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; SDAG-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
+; SDAG-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: v_fptrunc_round_v8f32_to_v8f16_upward:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT:    v_lshl_or_b32 v1, v3, 16, v2
+; GISEL-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
+; GISEL-NEXT:    v_lshl_or_b32 v3, v7, 16, v6
+; GISEL-NEXT:    ; return to shader part epilog
+  %res = call <8 x half> @llvm.fptrunc.round.v8f16.v8f32(<8 x float> %a, metadata !"round.upward")
+  ret <8 x half> %res
+}
+
+define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_downward(<8 x float> %a) {
+; SDAG-LABEL: v_fptrunc_round_v8f32_to_v8f16_downward:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; SDAG-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; SDAG-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
+; SDAG-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: v_fptrunc_round_v8f32_to_v8f16_downward:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT:    v_lshl_or_b32 v1, v3, 16, v2
+; GISEL-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
+; GISEL-NEXT:    v_lshl_or_b32 v3, v7, 16, v6
+; GISEL-NEXT:    ; return to shader part epilog
+  %res = call <8 x half> @llvm.fptrunc.round.v8f16.v8f32(<8 x float> %a, metadata !"round.downward")
+  ret <8 x half> %res
+}