[llvm] DAG: Fix widening of fptrunc_round vectors (PR #89918)

Wed Apr 24 05:26:02 PDT 2024

https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/89918

None

>From ad9a353d15bc4e79b9d6ff70d5ff7bb22c92222f Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 24 Apr 2024 12:41:20 +0200
Subject: [PATCH] DAG: Fix widening of fptrunc_round vectors

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |  2 +-
 .../SelectionDAG/LegalizeVectorTypes.cpp      |  8 ++-
 .../test/CodeGen/AMDGPU/llvm.fptrunc.round.ll | 53 +++++++++++++++----
 3 files changed, 51 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 0483f7c74f91a2..9c855e55855312 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -984,7 +984,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue WidenVecRes_FP_TO_XINT_SAT(SDNode *N);
   SDValue WidenVecRes_XRINT(SDNode *N);
   SDValue WidenVecRes_FCOPYSIGN(SDNode *N);
-  SDValue WidenVecRes_IS_FPCLASS(SDNode *N);
+  SDValue WidenVecRes_UnarySameEltsWithScalarArg(SDNode *N);
   SDValue WidenVecRes_ExpOp(SDNode *N);
   SDValue WidenVecRes_Unary(SDNode *N);
   SDValue WidenVecRes_InregOp(SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 8776d89f4c5bd9..99ba4e3c7aa47f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -4242,7 +4242,8 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
     break;
 
   case ISD::IS_FPCLASS:
-    Res = WidenVecRes_IS_FPCLASS(N);
+  case ISD::FPTRUNC_ROUND:
+    Res = WidenVecRes_UnarySameEltsWithScalarArg(N);
     break;
 
   case ISD::FLDEXP:
@@ -5004,7 +5005,10 @@ SDValue DAGTypeLegalizer::WidenVecRes_FCOPYSIGN(SDNode *N) {
   return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements());
 }
 
-SDValue DAGTypeLegalizer::WidenVecRes_IS_FPCLASS(SDNode *N) {
+/// Result and first source operand are different scalar types, but must have
+/// the same number of elements. There is an additional control argument which
+/// should be passed through unchanged.
+SDValue DAGTypeLegalizer::WidenVecRes_UnarySameEltsWithScalarArg(SDNode *N) {
   SDValue FpValue = N->getOperand(0);
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   if (getTypeAction(FpValue.getValueType()) != TargetLowering::TypeWidenVector)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll
index 4526efc0d8fa4f..b8c16d2ed3b2f1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll
@@ -266,16 +266,51 @@ define amdgpu_gs void @s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x
   ret void
 }
 
-; FIXME
-; define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_upward(<3 x float> %a) {
-;   %res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.upward")
-;   ret <3 x half> %res
-; }
+define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_upward(<3 x float> %a) {
+; SDAG-LABEL: v_fptrunc_round_v3f32_to_v3f16_upward:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: v_fptrunc_round_v3f32_to_v3f16_upward:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; GISEL-NEXT:    ; return to shader part epilog
+  %res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.upward")
+  ret <3 x half> %res
+}
 
-; define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_downward(<3 x float> %a) {
-;   %res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.downward")
-;   ret <3 x half> %res
-; }
+define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_downward(<3 x float> %a) {
+; SDAG-LABEL: v_fptrunc_round_v3f32_to_v3f16_downward:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: v_fptrunc_round_v3f32_to_v3f16_downward:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; GISEL-NEXT:    ; return to shader part epilog
+  %res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.downward")
+  ret <3 x half> %res
+}
 
 define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_upward(<4 x float> %a) {
 ; SDAG-LABEL: v_fptrunc_round_v4f32_to_v4f16_upward: