[llvm] DAG: Fix chain mismanagement in SoftenFloatRes_FP_EXTEND (PR #74406)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 5 20:21:50 PST 2023


https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/74406

>From 405b8705782e50fd72bfd868f51df3111fe9f095 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 18:07:50 +0900
Subject: [PATCH 01/18] DAG: Implement promotion for strict_fpextend

The test is a placeholder; it will be merged into the existing
test after additional bugs for illegal f16 targets are fixed.
---
 .../SelectionDAG/LegalizeFloatTypes.cpp       | 23 ++++++++++
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |  1 +
 llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll   | 43 +++++++++++++++++++
 3 files changed, 67 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 630aa4a07d7b9..f77b3afccfb8f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2214,6 +2214,9 @@ bool DAGTypeLegalizer::PromoteFloatOperand(SDNode *N, unsigned OpNo) {
     case ISD::FP_TO_UINT_SAT:
                           R = PromoteFloatOp_FP_TO_XINT_SAT(N, OpNo); break;
     case ISD::FP_EXTEND:  R = PromoteFloatOp_FP_EXTEND(N, OpNo); break;
+    case ISD::STRICT_FP_EXTEND:
+      R = PromoteFloatOp_STRICT_FP_EXTEND(N, OpNo);
+      break;
     case ISD::SELECT_CC:  R = PromoteFloatOp_SELECT_CC(N, OpNo); break;
     case ISD::SETCC:      R = PromoteFloatOp_SETCC(N, OpNo); break;
     case ISD::STORE:      R = PromoteFloatOp_STORE(N, OpNo); break;
@@ -2276,6 +2279,26 @@ SDValue DAGTypeLegalizer::PromoteFloatOp_FP_EXTEND(SDNode *N, unsigned OpNo) {
   return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, Op);
 }
 
+SDValue DAGTypeLegalizer::PromoteFloatOp_STRICT_FP_EXTEND(SDNode *N,
+                                                          unsigned OpNo) {
+  assert(OpNo == 1);
+
+  SDValue Op = GetPromotedFloat(N->getOperand(1));
+  EVT VT = N->getValueType(0);
+
+  // Desired VT is same as promoted type.  Use promoted float directly.
+  if (VT == Op->getValueType(0)) {
+    ReplaceValueWith(SDValue(N, 1), N->getOperand(0));
+    return Op;
+  }
+
+  // Else, extend the promoted float value to the desired VT.
+  SDValue Res = DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(N), N->getVTList(),
+                            N->getOperand(0), Op);
+  ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+  return Res;
+}
+
 // Promote the float operands used for comparison.  The true- and false-
 // operands have the same type as the result and are promoted, if needed, by
 // PromoteFloatRes_SELECT_CC
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index e9bd54089d062..4c7ddd4aea9e6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -712,6 +712,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue PromoteFloatOp_BITCAST(SDNode *N, unsigned OpNo);
   SDValue PromoteFloatOp_FCOPYSIGN(SDNode *N, unsigned OpNo);
   SDValue PromoteFloatOp_FP_EXTEND(SDNode *N, unsigned OpNo);
+  SDValue PromoteFloatOp_STRICT_FP_EXTEND(SDNode *N, unsigned OpNo);
   SDValue PromoteFloatOp_UnaryOp(SDNode *N, unsigned OpNo);
   SDValue PromoteFloatOp_FP_TO_XINT_SAT(SDNode *N, unsigned OpNo);
   SDValue PromoteFloatOp_STORE(SDNode *N, unsigned OpNo);
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll b/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll
new file mode 100644
index 0000000000000..0339fca4d56cf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GFX7 %s
+
+declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) #0
+declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) #0
+
+define float @v_constrained_fpext_f16_to_f32(ptr addrspace(1) %ptr) #0 {
+; GFX7-LABEL: v_constrained_fpext_f16_to_f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %val = load half, ptr addrspace(1) %ptr
+  %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %val, metadata !"fpexcept.strict")
+  ret float %result
+}
+
+define <2 x float> @v_constrained_fpext_v2f16_to_v2f32(ptr addrspace(1) %ptr) #0 {
+; GFX7-LABEL: v_constrained_fpext_v2f16_to_v2f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %val = load <2 x half>, ptr addrspace(1) %ptr
+  %result = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %val, metadata !"fpexcept.strict")
+  ret <2 x float> %result
+}
+
+attributes #0 = { strictfp }

>From f381fe1f2b6586e4e2bb23eeac5b16bd5622e385 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 4 Dec 2023 17:20:26 +0700
Subject: [PATCH 02/18] DAG: Implement promotion for strict_fp_round

This needs an AMDGPU hack to get the selection to work. The ordinary
variant is custom lowered through an almost equivalent target node,
which would need a strict variant for additional known-bits optimizations.
---
 .../include/llvm/Target/TargetSelectionDAG.td | 13 ++++
 .../SelectionDAG/LegalizeFloatTypes.cpp       | 40 +++++++++++
 .../SelectionDAG/LegalizeIntegerTypes.cpp     | 15 ++++-
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |  2 +
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  9 ++-
 llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll   | 67 +++++++++++++++++++
 6 files changed, 144 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 798e6a1d9525e..de7bf26868b34 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -614,6 +614,12 @@ def strict_sint_to_fp : SDNode<"ISD::STRICT_SINT_TO_FP",
                                SDTIntToFPOp, [SDNPHasChain]>;
 def strict_uint_to_fp : SDNode<"ISD::STRICT_UINT_TO_FP",
                                SDTIntToFPOp, [SDNPHasChain]>;
+
+def strict_f16_to_fp  : SDNode<"ISD::STRICT_FP16_TO_FP",
+                               SDTIntToFPOp, [SDNPHasChain]>;
+def strict_fp_to_f16  : SDNode<"ISD::STRICT_FP_TO_FP16",
+                               SDTFPToIntOp, [SDNPHasChain]>;
+
 def strict_fsetcc  : SDNode<"ISD::STRICT_FSETCC",  SDTSetCC, [SDNPHasChain]>;
 def strict_fsetccs : SDNode<"ISD::STRICT_FSETCCS", SDTSetCC, [SDNPHasChain]>;
 
@@ -1558,6 +1564,13 @@ def any_fsetccs : PatFrags<(ops node:$lhs, node:$rhs, node:$pred),
                           [(strict_fsetccs node:$lhs, node:$rhs, node:$pred),
                            (setcc node:$lhs, node:$rhs, node:$pred)]>;
 
+def any_f16_to_fp : PatFrags<(ops node:$src),
+                              [(f16_to_fp node:$src),
+                               (strict_f16_to_fp node:$src)]>;
+def any_fp_to_f16 : PatFrags<(ops node:$src),
+                              [(fp_to_f16 node:$src),
+                               (strict_fp_to_f16 node:$src)]>;
+
 multiclass binary_atomic_op_ord {
   def NAME#_monotonic : PatFrag<(ops node:$ptr, node:$val),
       (!cast<SDPatternOperator>(NAME) node:$ptr, node:$val)> {
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index f77b3afccfb8f..d7a688511b726 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2181,6 +2181,20 @@ static ISD::NodeType GetPromotionOpcode(EVT OpVT, EVT RetVT) {
   report_fatal_error("Attempt at an invalid promotion-related conversion");
 }
 
+static ISD::NodeType GetPromotionOpcodeStrict(EVT OpVT, EVT RetVT) {
+  if (OpVT == MVT::f16) {
+    return ISD::STRICT_FP16_TO_FP;
+  } else if (RetVT == MVT::f16) {
+    return ISD::STRICT_FP_TO_FP16;
+  } else if (OpVT == MVT::bf16) {
+    // return ISD::STRICT_BF16_TO_FP;
+  } else if (RetVT == MVT::bf16) {
+    // return ISD::STRICT_FP_TO_BF16;
+  }
+
+  report_fatal_error("Attempt at an invalid promotion-related conversion");
+}
+
 bool DAGTypeLegalizer::PromoteFloatOperand(SDNode *N, unsigned OpNo) {
   LLVM_DEBUG(dbgs() << "Promote float operand " << OpNo << ": "; N->dump(&DAG));
   SDValue R = SDValue();
@@ -2416,6 +2430,9 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
     case ISD::FFREXP:     R = PromoteFloatRes_FFREXP(N); break;
 
     case ISD::FP_ROUND:   R = PromoteFloatRes_FP_ROUND(N); break;
+    case ISD::STRICT_FP_ROUND:
+      R = PromoteFloatRes_STRICT_FP_ROUND(N);
+      break;
     case ISD::LOAD:       R = PromoteFloatRes_LOAD(N); break;
     case ISD::SELECT:     R = PromoteFloatRes_SELECT(N); break;
     case ISD::SELECT_CC:  R = PromoteFloatRes_SELECT_CC(N); break;
@@ -2621,6 +2638,29 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_FP_ROUND(SDNode *N) {
   return DAG.getNode(GetPromotionOpcode(VT, NVT), DL, NVT, Round);
 }
 
+// Explicit operation to reduce precision.  Reduce the value to half precision
+// and promote it back to the legal type.
+SDValue DAGTypeLegalizer::PromoteFloatRes_STRICT_FP_ROUND(SDNode *N) {
+  SDLoc DL(N);
+
+  SDValue Chain = N->getOperand(0);
+  SDValue Op = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  EVT OpVT = Op->getValueType(0);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+
+  // Round promoted float to desired precision
+  SDValue Round = DAG.getNode(GetPromotionOpcodeStrict(OpVT, VT), DL,
+                              DAG.getVTList(IVT, MVT::Other), Chain, Op);
+  // Promote it back to the legal output type
+  SDValue Res =
+      DAG.getNode(GetPromotionOpcodeStrict(VT, NVT), DL,
+                  DAG.getVTList(NVT, MVT::Other), Round.getValue(1), Round);
+  ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+  return Res;
+}
+
 SDValue DAGTypeLegalizer::PromoteFloatRes_LOAD(SDNode *N) {
   LoadSDNode *L = cast<LoadSDNode>(N);
   EVT VT = N->getValueType(0);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 54698edce7d6f..887670fb6baff 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -165,7 +165,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::FP_TO_FP16:
     Res = PromoteIntRes_FP_TO_FP16_BF16(N);
     break;
-
+  case ISD::STRICT_FP_TO_FP16:
+    Res = PromoteIntRes_STRICT_FP_TO_FP16_BF16(N);
+    break;
   case ISD::GET_ROUNDING: Res = PromoteIntRes_GET_ROUNDING(N); break;
 
   case ISD::AND:
@@ -787,6 +789,16 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_FP16_BF16(SDNode *N) {
   return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_STRICT_FP_TO_FP16_BF16(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDLoc dl(N);
+
+  SDValue Res = DAG.getNode(N->getOpcode(), dl, DAG.getVTList(NVT, MVT::Other),
+                            N->getOperand(0), N->getOperand(1));
+  ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+  return Res;
+}
+
 SDValue DAGTypeLegalizer::PromoteIntRes_XRINT(SDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   SDLoc dl(N);
@@ -1804,6 +1816,7 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
   case ISD::FP16_TO_FP:
   case ISD::VP_UINT_TO_FP:
   case ISD::UINT_TO_FP:   Res = PromoteIntOp_UINT_TO_FP(N); break;
+  case ISD::STRICT_FP16_TO_FP:
   case ISD::STRICT_UINT_TO_FP:  Res = PromoteIntOp_STRICT_UINT_TO_FP(N); break;
   case ISD::ZERO_EXTEND:  Res = PromoteIntOp_ZERO_EXTEND(N); break;
   case ISD::VP_ZERO_EXTEND: Res = PromoteIntOp_VP_ZERO_EXTEND(N); break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 4c7ddd4aea9e6..9361e7fff2190 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -326,6 +326,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue PromoteIntRes_FP_TO_XINT(SDNode *N);
   SDValue PromoteIntRes_FP_TO_XINT_SAT(SDNode *N);
   SDValue PromoteIntRes_FP_TO_FP16_BF16(SDNode *N);
+  SDValue PromoteIntRes_STRICT_FP_TO_FP16_BF16(SDNode *N);
   SDValue PromoteIntRes_XRINT(SDNode *N);
   SDValue PromoteIntRes_FREEZE(SDNode *N);
   SDValue PromoteIntRes_INT_EXTEND(SDNode *N);
@@ -698,6 +699,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue PromoteFloatRes_ExpOp(SDNode *N);
   SDValue PromoteFloatRes_FFREXP(SDNode *N);
   SDValue PromoteFloatRes_FP_ROUND(SDNode *N);
+  SDValue PromoteFloatRes_STRICT_FP_ROUND(SDNode *N);
   SDValue PromoteFloatRes_LOAD(SDNode *N);
   SDValue PromoteFloatRes_SELECT(SDNode *N);
   SDValue PromoteFloatRes_SELECT_CC(SDNode *N);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9362fe5d9678b..d23d6b4f93da8 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1097,7 +1097,7 @@ def : Pat <
 multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16_inst_e64> {
   // f16_to_fp patterns
   def : GCNPat <
-    (f32 (f16_to_fp i32:$src0)),
+    (f32 (any_f16_to_fp i32:$src0)),
     (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src0)
   >;
 
@@ -1151,6 +1151,13 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16
     (f16 (uint_to_fp i32:$src)),
     (cvt_f16_f32_inst_e64 SRCMODS.NONE, (V_CVT_F32_U32_e32 VSrc_b32:$src))
   >;
+
+  // This is only used on targets without half support
+  // TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering
+  def : GCNPat <
+    (i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+    (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
+  >;
 }
 
 let SubtargetPredicate = NotHasTrue16BitInsts in
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll b/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll
index 0339fca4d56cf..8a3647a9b6e93 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll
@@ -3,6 +3,9 @@
 
 declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) #0
 declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) #0
+declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) #0
+declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata) #0
+declare float @llvm.fabs.f32(float)
 
 define float @v_constrained_fpext_f16_to_f32(ptr addrspace(1) %ptr) #0 {
 ; GFX7-LABEL: v_constrained_fpext_f16_to_f32:
@@ -40,4 +43,68 @@ define <2 x float> @v_constrained_fpext_v2f16_to_v2f32(ptr addrspace(1) %ptr) #0
   ret <2 x float> %result
 }
 
+define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict(float %arg, ptr addrspace(1) %ptr) #0 {
+; GFX7-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret void
+}
+
+define void @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict(<2 x float> %arg, ptr addrspace(1) %ptr) #0 {
+; GFX7-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store <2 x half> %result, ptr addrspace(1) %ptr
+  ret void
+}
+
+define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_fneg(float %arg, ptr addrspace(1) %ptr) #0 {
+; GFX7-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_fneg:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e64 v0, -v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %neg.arg = fneg float %arg
+  %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %neg.arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret void
+}
+
+define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_fabs(float %arg, ptr addrspace(1) %ptr) #0 {
+; GFX7-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_fabs:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e64 v0, |v0|
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %abs.arg = call float @llvm.fabs.f32(float %arg)
+  %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %abs.arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret void
+}
+
 attributes #0 = { strictfp }

>From c65c6e98d79f227ccdb659ca0ea2330e968b982f Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 20:49:51 +0700
Subject: [PATCH 03/18] DAG: Fix ABI lowering with FP promote in strictfp
 functions

This was emitting non-strict casts in ABI contexts for illegal
types.
---
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  59 +-
 .../CodeGen/AMDGPU/llvm.is.fpclass.f16.ll     |   3 +
 llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll   | 110 ----
 llvm/test/CodeGen/AMDGPU/strict_fpext.ll      | 280 ++++++++-
 llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll    | 248 +++++++-
 .../AMDGPU/strictfp_f16_abi_promote.ll        | 558 ++++++++++++++++++
 6 files changed, 1079 insertions(+), 179 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index da7d9ace4114a..29684d3372bdb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -153,6 +153,7 @@ static const unsigned MaxParallelChains = 64;
 static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
                                       const SDValue *Parts, unsigned NumParts,
                                       MVT PartVT, EVT ValueVT, const Value *V,
+                                      SDValue InChain,
                                       std::optional<CallingConv::ID> CC);
 
 /// getCopyFromParts - Create a value that contains the specified legal parts
@@ -163,6 +164,7 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
 static SDValue
 getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts,
                  unsigned NumParts, MVT PartVT, EVT ValueVT, const Value *V,
+                 SDValue InChain,
                  std::optional<CallingConv::ID> CC = std::nullopt,
                  std::optional<ISD::NodeType> AssertOp = std::nullopt) {
   // Let the target assemble the parts if it wants to
@@ -173,7 +175,7 @@ getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts,
 
   if (ValueVT.isVector())
     return getCopyFromPartsVector(DAG, DL, Parts, NumParts, PartVT, ValueVT, V,
-                                  CC);
+                                  InChain, CC);
 
   assert(NumParts > 0 && "No parts to assemble!");
   SDValue Val = Parts[0];
@@ -194,10 +196,10 @@ getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts,
       EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), RoundBits/2);
 
       if (RoundParts > 2) {
-        Lo = getCopyFromParts(DAG, DL, Parts, RoundParts / 2,
-                              PartVT, HalfVT, V);
-        Hi = getCopyFromParts(DAG, DL, Parts + RoundParts / 2,
-                              RoundParts / 2, PartVT, HalfVT, V);
+        Lo = getCopyFromParts(DAG, DL, Parts, RoundParts / 2, PartVT, HalfVT, V,
+                              InChain);
+        Hi = getCopyFromParts(DAG, DL, Parts + RoundParts / 2, RoundParts / 2,
+                              PartVT, HalfVT, V, InChain);
       } else {
         Lo = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[0]);
         Hi = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[1]);
@@ -213,7 +215,7 @@ getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts,
         unsigned OddParts = NumParts - RoundParts;
         EVT OddVT = EVT::getIntegerVT(*DAG.getContext(), OddParts * PartBits);
         Hi = getCopyFromParts(DAG, DL, Parts + RoundParts, OddParts, PartVT,
-                              OddVT, V, CC);
+                              OddVT, V, InChain, CC);
 
         // Combine the round and odd parts.
         Lo = Val;
@@ -243,7 +245,8 @@ getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts,
       assert(ValueVT.isFloatingPoint() && PartVT.isInteger() &&
              !PartVT.isVector() && "Unexpected split");
       EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
-      Val = getCopyFromParts(DAG, DL, Parts, NumParts, PartVT, IntVT, V, CC);
+      Val = getCopyFromParts(DAG, DL, Parts, NumParts, PartVT, IntVT, V,
+                             InChain, CC);
     }
   }
 
@@ -283,10 +286,20 @@ getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts,
 
   if (PartEVT.isFloatingPoint() && ValueVT.isFloatingPoint()) {
     // FP_ROUND's are always exact here.
-    if (ValueVT.bitsLT(Val.getValueType()))
-      return DAG.getNode(
-          ISD::FP_ROUND, DL, ValueVT, Val,
-          DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout())));
+    if (ValueVT.bitsLT(Val.getValueType())) {
+
+      SDValue NoChange =
+          DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout()));
+
+      if (DAG.getMachineFunction().getFunction().getAttributes().hasFnAttr(
+              llvm::Attribute::StrictFP)) {
+        return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
+                           DAG.getVTList(ValueVT, MVT::Other), InChain, Val,
+                           NoChange);
+      }
+
+      return DAG.getNode(ISD::FP_ROUND, DL, ValueVT, Val, NoChange);
+    }
 
     return DAG.getNode(ISD::FP_EXTEND, DL, ValueVT, Val);
   }
@@ -324,6 +337,7 @@ static void diagnosePossiblyInvalidConstraint(LLVMContext &Ctx, const Value *V,
 static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
                                       const SDValue *Parts, unsigned NumParts,
                                       MVT PartVT, EVT ValueVT, const Value *V,
+                                      SDValue InChain,
                                       std::optional<CallingConv::ID> CallConv) {
   assert(ValueVT.isVector() && "Not a vector value");
   assert(NumParts > 0 && "No parts to assemble!");
@@ -362,8 +376,8 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
       // If the register was not expanded, truncate or copy the value,
       // as appropriate.
       for (unsigned i = 0; i != NumParts; ++i)
-        Ops[i] = getCopyFromParts(DAG, DL, &Parts[i], 1,
-                                  PartVT, IntermediateVT, V, CallConv);
+        Ops[i] = getCopyFromParts(DAG, DL, &Parts[i], 1, PartVT, IntermediateVT,
+                                  V, InChain, CallConv);
     } else if (NumParts > 0) {
       // If the intermediate type was expanded, build the intermediate
       // operands from the parts.
@@ -371,8 +385,8 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
              "Must expand into a divisible number of parts!");
       unsigned Factor = NumParts / NumIntermediates;
       for (unsigned i = 0; i != NumIntermediates; ++i)
-        Ops[i] = getCopyFromParts(DAG, DL, &Parts[i * Factor], Factor,
-                                  PartVT, IntermediateVT, V, CallConv);
+        Ops[i] = getCopyFromParts(DAG, DL, &Parts[i * Factor], Factor, PartVT,
+                                  IntermediateVT, V, InChain, CallConv);
     }
 
     // Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the
@@ -926,7 +940,7 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
     }
 
     Values[Value] = getCopyFromParts(DAG, dl, Parts.begin(), NumRegs,
-                                     RegisterVT, ValueVT, V, CallConv);
+                                     RegisterVT, ValueVT, V, Chain, CallConv);
     Part += NumRegs;
     Parts.clear();
   }
@@ -10641,9 +10655,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
       unsigned NumRegs = getNumRegistersForCallingConv(CLI.RetTy->getContext(),
                                                        CLI.CallConv, VT);
 
-      ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg],
-                                              NumRegs, RegisterVT, VT, nullptr,
-                                              CLI.CallConv, AssertOp));
+      ReturnValues.push_back(getCopyFromParts(
+          CLI.DAG, CLI.DL, &InVals[CurReg], NumRegs, RegisterVT, VT, nullptr,
+          CLI.Chain, CLI.CallConv, AssertOp));
       CurReg += NumRegs;
     }
 
@@ -11122,8 +11136,9 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
     MVT VT = ValueVTs[0].getSimpleVT();
     MVT RegVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
     std::optional<ISD::NodeType> AssertOp;
-    SDValue ArgValue = getCopyFromParts(DAG, dl, &InVals[0], 1, RegVT, VT,
-                                        nullptr, F.getCallingConv(), AssertOp);
+    SDValue ArgValue =
+        getCopyFromParts(DAG, dl, &InVals[0], 1, RegVT, VT, nullptr, NewRoot,
+                         F.getCallingConv(), AssertOp);
 
     MachineFunction& MF = SDB->DAG.getMachineFunction();
     MachineRegisterInfo& RegInfo = MF.getRegInfo();
@@ -11195,7 +11210,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
           AssertOp = ISD::AssertZext;
 
         ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], NumParts,
-                                             PartVT, VT, nullptr,
+                                             PartVT, VT, nullptr, NewRoot,
                                              F.getCallingConv(), AssertOp));
       }
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index d74948a460c98..20d175ecfd909 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -1316,6 +1316,9 @@ define i1 @isnan_f16_strictfp(half %x) strictfp nounwind {
 ; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
+; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7SELDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll b/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll
deleted file mode 100644
index 8a3647a9b6e93..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll
+++ /dev/null
@@ -1,110 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GFX7 %s
-
-declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) #0
-declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) #0
-declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) #0
-declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata) #0
-declare float @llvm.fabs.f32(float)
-
-define float @v_constrained_fpext_f16_to_f32(ptr addrspace(1) %ptr) #0 {
-; GFX7-LABEL: v_constrained_fpext_f16_to_f32:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %val = load half, ptr addrspace(1) %ptr
-  %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %val, metadata !"fpexcept.strict")
-  ret float %result
-}
-
-define <2 x float> @v_constrained_fpext_v2f16_to_v2f32(ptr addrspace(1) %ptr) #0 {
-; GFX7-LABEL: v_constrained_fpext_v2f16_to_v2f32:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <2 x half>, ptr addrspace(1) %ptr
-  %result = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %val, metadata !"fpexcept.strict")
-  ret <2 x float> %result
-}
-
-define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict(float %arg, ptr addrspace(1) %ptr) #0 {
-; GFX7-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret void
-}
-
-define void @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict(<2 x float> %arg, ptr addrspace(1) %ptr) #0 {
-; GFX7-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  store <2 x half> %result, ptr addrspace(1) %ptr
-  ret void
-}
-
-define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_fneg(float %arg, ptr addrspace(1) %ptr) #0 {
-; GFX7-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_fneg:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e64 v0, -v0
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %neg.arg = fneg float %arg
-  %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %neg.arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret void
-}
-
-define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_fabs(float %arg, ptr addrspace(1) %ptr) #0 {
-; GFX7-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_fabs:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f16_f32_e64 v0, |v0|
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %abs.arg = call float @llvm.fabs.f32(float %arg)
-  %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %abs.arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret void
-}
-
-attributes #0 = { strictfp }
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll
index 22bebb7ad26f5..fe59a8491c91a 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll
@@ -1,22 +1,47 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; FIXME: Missing operand promote for f16
-; XUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX8 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX9 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11 %s
 
 define float @v_constrained_fpext_f16_to_f32_fpexcept_strict(half %arg) #0 {
-; GCN-LABEL: v_constrained_fpext_f16_to_f32_fpexcept_strict:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SI-LABEL: v_constrained_fpext_f16_to_f32_fpexcept_strict:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_constrained_fpext_f16_to_f32_fpexcept_strict:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: v_constrained_fpext_f16_to_f32_fpexcept_strict:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %arg, metadata !"fpexcept.strict")
   ret float %result
 }
 
 define <2 x float> @v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict(<2 x half> %arg) #0 {
+; SI-LABEL: v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX89-LABEL: v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict:
 ; GFX89:       ; %bb.0:
 ; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -45,6 +70,20 @@ define <2 x float> @v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict(<2 x half
 }
 
 define <3 x float> @v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict(<3 x half> %arg) #0 {
+; SI-LABEL: v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX89-LABEL: v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict:
 ; GFX89:       ; %bb.0:
 ; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -101,6 +140,16 @@ define <2 x double> @v_constrained_fpext_v2f32_to_v2f64_fpexcept_strict(<2 x flo
 }
 
 define <3 x double> @v_constrained_fpext_v3f32_to_v3f64_fpexcept_strict(<3 x float> %arg) #0 {
+; SI-LABEL: v_constrained_fpext_v3f32_to_v3f64_fpexcept_strict:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v4, v2
+; SI-NEXT:    v_mov_b32_e32 v2, v1
+; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX89-LABEL: v_constrained_fpext_v3f32_to_v3f64_fpexcept_strict:
 ; GFX89:       ; %bb.0:
 ; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -125,17 +174,46 @@ define <3 x double> @v_constrained_fpext_v3f32_to_v3f64_fpexcept_strict(<3 x flo
 }
 
 define double @v_constrained_fpext_f16_to_f64_fpexcept_strict(half %arg) #0 {
-; GCN-LABEL: v_constrained_fpext_f16_to_f64_fpexcept_strict:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SI-LABEL: v_constrained_fpext_f16_to_f64_fpexcept_strict:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_constrained_fpext_f16_to_f64_fpexcept_strict:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX89-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: v_constrained_fpext_f16_to_f64_fpexcept_strict:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX1011-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.experimental.constrained.fpext.f64.f16(half %arg, metadata !"fpexcept.strict")
   ret double %result
 }
 
 define <2 x double> @v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict(<2 x half> %arg) #0 {
+; SI-LABEL: v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX89-LABEL: v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict:
 ; GFX89:       ; %bb.0:
 ; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -168,6 +246,23 @@ define <2 x double> @v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict(<2 x hal
 }
 
 define <3 x double> @v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict(<3 x half> %arg) #0 {
+; SI-LABEL: v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v2
+; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
+; SI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX89-LABEL: v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict:
 ; GFX89:       ; %bb.0:
 ; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -206,23 +301,54 @@ define <3 x double> @v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict(<3 x hal
 }
 
 define float @v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict(half %arg) #0 {
-; GCN-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SI-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX89-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX1011-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %arg, metadata !"fpexcept.strict")
   %neg.result = fneg float %result
   ret float %neg.result
 }
 
 define float @v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict(half %arg) #0 {
-; GCN-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_f16_e64 v0, -v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SI-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    v_cvt_f32_f16_e64 v0, -v0
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cvt_f32_f16_e64 v0, -v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %neg.arg = fneg half %arg
   %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %neg.arg, metadata !"fpexcept.strict")
   ret float %result
@@ -251,6 +377,111 @@ define double @v_constrained_fneg_fpext_f32_to_f64_fpexcept_strict(float %arg) #
   ret double %neg.result
 }
 
+define float @v_constrained_fpext_f16_to_f32_noabi(ptr addrspace(1) %ptr) #0 {
+; SI-LABEL: v_constrained_fpext_f16_to_f32_noabi:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_mov_b32 s5, s6
+; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_constrained_fpext_f16_to_f32_noabi:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_constrained_fpext_f16_to_f32_noabi:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_constrained_fpext_f16_to_f32_noabi:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constrained_fpext_f16_to_f32_noabi:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = load half, ptr addrspace(1) %ptr
+  %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %val, metadata !"fpexcept.strict")
+  ret float %result
+}
+
+define <2 x float> @v_constrained_fpext_v2f16_to_v2f32_noabi(ptr addrspace(1) %ptr) #0 {
+; SI-LABEL: v_constrained_fpext_v2f16_to_v2f32_noabi:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_mov_b32 s5, s6
+; SI-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_constrained_fpext_v2f16_to_v2f32_noabi:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dword v1, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v1
+; GFX8-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_constrained_fpext_v2f16_to_v2f32_noabi:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v1
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_constrained_fpext_v2f16_to_v2f32_noabi:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v1, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v1
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constrained_fpext_v2f16_to_v2f32_noabi:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = load <2 x half>, ptr addrspace(1) %ptr
+  %result = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %val, metadata !"fpexcept.strict")
+  ret <2 x float> %result
+}
+
 declare double @llvm.experimental.constrained.fpext.f64.f32(float, metadata) #1
 declare <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f32(<2 x float>, metadata) #1
 declare <3 x double> @llvm.experimental.constrained.fpext.v3f64.v3f32(<3 x float>, metadata) #1
@@ -265,6 +496,3 @@ declare <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half>,
 
 attributes #0 = { strictfp }
 attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX8: {{.*}}
-; GFX9: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll b/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll
index ec2bc43ca3678..965040d0d879c 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll
@@ -1,21 +1,46 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; XUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX8 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX9 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11 %s
 
 define half @v_constrained_fptrunc_f32_to_f16_fpexcept_strict(float %arg) #0 {
-; GCN-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SI-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %val = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret half %val
 }
 
 define <2 x half> @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict(<2 x float> %arg) #0 {
+; SI-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX8-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -45,6 +70,20 @@ define <2 x half> @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict(<2 x flo
 }
 
 define <3 x half> @v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict(<3 x float> %arg) #0 {
+; SI-LABEL: v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX8-LABEL: v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -126,23 +165,53 @@ define <3 x float> @v_constrained_fptrunc_v3f64_to_v3f32_fpexcept_strict(<3 x do
 ; }
 
 define half @v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict(float %arg) #0 {
-; GCN-LABEL: v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SI-LABEL: v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX89-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX1011-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %val = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
   %neg.val = fneg half %val
   ret half %neg.val
 }
 
 define half @v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict(float %arg) #0 {
-; GCN-LABEL: v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f16_f32_e64 v0, -v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SI-LABEL: v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    v_cvt_f16_f32_e64 v0, -v0
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cvt_f16_f32_e64 v0, -v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %neg.arg = fneg float %arg
   %val = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %neg.arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret half %val
@@ -171,6 +240,145 @@ define float @v_constrained_fptrunc_fneg_f64_to_f32_fpexcept_strict(double %arg)
   ret float %val
 }
 
+define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi(float %arg, ptr addrspace(1) %ptr) #0 {
+; SI-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret void
+}
+
+define void @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi(<2 x float> %arg, ptr addrspace(1) %ptr) #0 {
+; SI-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    s_mov_b32 s5, s6
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    flat_store_dword v[2:3], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT:    global_store_dword v[2:3], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    global_store_b32 v[2:3], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store <2 x half> %result, ptr addrspace(1) %ptr
+  ret void
+}
+
+define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fneg(float %arg, ptr addrspace(1) %ptr) #0 {
+; SI-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fneg:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fneg:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    v_cvt_f16_f32_e64 v0, -v0
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fneg:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cvt_f16_f32_e64 v0, -v0
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  %neg.arg = fneg float %arg
+  %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %neg.arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret void
+}
+
+define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fabs(float %arg, ptr addrspace(1) %ptr) #0 {
+; SI-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fabs:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e64 v0, |v0|
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fabs:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    v_cvt_f16_f32_e64 v0, |v0|
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1011-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fabs:
+; GFX1011:       ; %bb.0:
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1011-NEXT:    v_cvt_f16_f32_e64 v0, |v0|
+; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+  %abs.arg = call float @llvm.fabs.f32(float %arg)
+  %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %abs.arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret void
+}
+
 declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) #1
 declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata) #1
 declare <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float>, metadata, metadata) #1
@@ -183,9 +391,7 @@ declare half @llvm.experimental.constrained.fptrunc.f16.f64(double, metadata, me
 declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f64(<2 x double>, metadata, metadata) #1
 declare <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f64(<3 x double>, metadata, metadata) #1
 
+declare float @llvm.fabs.f32(float) #1
+
 attributes #0 = { strictfp }
 attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX10: {{.*}}
-; GFX11: {{.*}}
-; GFX89: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
new file mode 100644
index 0000000000000..57e4cec4eccb1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
@@ -0,0 +1,558 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
+
+declare void @f16_user(half)
+declare half @f16_result()
+
+declare void @v2f16_user(<2 x half>)
+declare <2 x half> @v2f16_result()
+
+declare void @v4f16_user(<4 x half>)
+declare <4 x half> @v4f16_result()
+
+declare void @v8f16_user(<8 x half>)
+declare <8 x half> @v8f16_result()
+
+define void @f16_arg(half %arg, ptr %ptr) #0 {
+; GFX7-LABEL: f16_arg:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    flat_store_dword v[1:2], v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %fpext = call float @llvm.experimental.constrained.fpext.f32.f16(half %arg, metadata !"fpexcept.strict")
+  store float %fpext, ptr %ptr
+  ret void
+}
+
+define void @v2f16_arg(<2 x half> %arg, ptr %ptr) #0 {
+; GFX7-LABEL: v2f16_arg:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v0
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v2
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX7-NEXT:    flat_store_dword v[0:1], v5
+; GFX7-NEXT:    flat_store_dword v[2:3], v4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %fpext = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %arg, metadata !"fpexcept.strict")
+  store <2 x float> %fpext, ptr %ptr
+  ret void
+}
+
+define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 {
+; GFX7-LABEL: v3f16_arg:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 8, v3
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v3
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
+; GFX7-NEXT:    flat_store_dword v[0:1], v6
+; GFX7-NEXT:    flat_store_dword v[3:4], v5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %fpext = call <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half> %arg, metadata !"fpexcept.strict")
+  store <3 x float> %fpext, ptr %ptr
+  ret void
+}
+
+define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 {
+; GFX7-LABEL: v4f16_arg:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v3
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 12, v4
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 8, v4
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GFX7-NEXT:    flat_store_dword v[0:1], v7
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v4
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GFX7-NEXT:    flat_store_dword v[0:1], v3
+; GFX7-NEXT:    flat_store_dword v[4:5], v6
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %fpext = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %arg, metadata !"fpexcept.strict")
+  store <4 x float> %fpext, ptr %ptr
+  ret void
+}
+
+define half @f16_return(float %arg) #0 {
+; GFX7-LABEL: f16_return:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret half %fptrunc
+}
+
+define <2 x half> @v2f16_return(<2 x float> %arg) #0 {
+; GFX7-LABEL: v2f16_return:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret <2 x half> %fptrunc
+}
+
+define <3 x half> @v3f16_return(<3 x float> %arg) #0 {
+; GFX7-LABEL: v3f16_return:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret <3 x half> %fptrunc
+}
+
+define <4 x half> @v4f16_return(<4 x float> %arg) #0 {
+; GFX7-LABEL: v4f16_return:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret <4 x half> %fptrunc
+}
+
+define void @outgoing_f16_arg(ptr %ptr) #0 {
+; GFX7-LABEL: outgoing_f16_arg:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s16, s33
+; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX7-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX7-NEXT:    v_writelane_b32 v40, s16, 2
+; GFX7-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX7-NEXT:    s_mov_b32 s17, f16_user at abs32@hi
+; GFX7-NEXT:    s_mov_b32 s16, f16_user at abs32@lo
+; GFX7-NEXT:    s_addk_i32 s32, 0x400
+; GFX7-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX7-NEXT:    s_mov_b32 s33, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %val = load half, ptr %ptr
+  call void @f16_user(half %val)
+  ret void
+}
+
+define void @outgoing_v2f16_arg(ptr %ptr) #0 {
+; GFX7-LABEL: outgoing_v2f16_arg:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s16, s33
+; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX7-NEXT:    flat_load_dword v1, v[0:1]
+; GFX7-NEXT:    v_writelane_b32 v40, s16, 2
+; GFX7-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX7-NEXT:    s_mov_b32 s17, v2f16_user at abs32@hi
+; GFX7-NEXT:    s_mov_b32 s16, v2f16_user at abs32@lo
+; GFX7-NEXT:    s_addk_i32 s32, 0x400
+; GFX7-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX7-NEXT:    s_mov_b32 s33, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %val = load <2 x half>, ptr %ptr
+  call void @v2f16_user(<2 x half> %val)
+  ret void
+}
+
+define void @outgoing_f16_return(ptr %ptr) #0 {
+; GFX7-LABEL: outgoing_f16_return:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s16, s33
+; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX7-NEXT:    v_writelane_b32 v40, s16, 2
+; GFX7-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX7-NEXT:    s_mov_b32 s17, f16_result at abs32@hi
+; GFX7-NEXT:    s_mov_b32 s16, f16_result at abs32@lo
+; GFX7-NEXT:    s_addk_i32 s32, 0x400
+; GFX7-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX7-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX7-NEXT:    v_mov_b32_e32 v42, v1
+; GFX7-NEXT:    v_mov_b32_e32 v41, v0
+; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    flat_store_short v[41:42], v0
+; GFX7-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX7-NEXT:    s_mov_b32 s33, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %val = call half @f16_result()
+  store half %val, ptr %ptr
+  ret void
+}
+
+define void @outgoing_v2f16_return(ptr %ptr) #0 {
+; GFX7-LABEL: outgoing_v2f16_return:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s16, s33
+; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX7-NEXT:    v_writelane_b32 v40, s16, 2
+; GFX7-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX7-NEXT:    s_mov_b32 s17, v2f16_result at abs32@hi
+; GFX7-NEXT:    s_mov_b32 s16, v2f16_result at abs32@lo
+; GFX7-NEXT:    s_addk_i32 s32, 0x400
+; GFX7-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX7-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX7-NEXT:    v_mov_b32_e32 v42, v1
+; GFX7-NEXT:    v_mov_b32_e32 v41, v0
+; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    flat_store_dword v[41:42], v0
+; GFX7-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX7-NEXT:    s_mov_b32 s33, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %val = call <2 x half> @v2f16_result()
+  store <2 x half> %val, ptr %ptr
+  ret void
+}
+
+define void @outgoing_v4f16_return(ptr %ptr) #0 {
+; GFX7-LABEL: outgoing_v4f16_return:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s16, s33
+; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX7-NEXT:    v_writelane_b32 v40, s16, 2
+; GFX7-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX7-NEXT:    s_mov_b32 s17, v4f16_result at abs32@hi
+; GFX7-NEXT:    s_mov_b32 s16, v4f16_result at abs32@lo
+; GFX7-NEXT:    s_addk_i32 s32, 0x400
+; GFX7-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX7-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX7-NEXT:    v_mov_b32_e32 v42, v1
+; GFX7-NEXT:    v_mov_b32_e32 v41, v0
+; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v4, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v0
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v41
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v42, vcc
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    flat_store_dword v[41:42], v4
+; GFX7-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX7-NEXT:    s_mov_b32 s33, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %val = call <4 x half> @v4f16_result()
+  store <4 x half> %val, ptr %ptr
+  ret void
+}
+
+define void @outgoing_v8f16_return(ptr %ptr) #0 {
+; GFX7-LABEL: outgoing_v8f16_return:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s16, s33
+; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX7-NEXT:    v_writelane_b32 v40, s16, 2
+; GFX7-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX7-NEXT:    s_mov_b32 s17, v8f16_result at abs32@hi
+; GFX7-NEXT:    s_mov_b32 s16, v8f16_result at abs32@lo
+; GFX7-NEXT:    s_addk_i32 s32, 0x400
+; GFX7-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX7-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX7-NEXT:    v_mov_b32_e32 v42, v1
+; GFX7-NEXT:    v_mov_b32_e32 v41, v0
+; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_or_b32_e32 v8, v0, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v6
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_or_b32_e32 v5, v1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v3, v4, v0
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 12, v41
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v42, vcc
+; GFX7-NEXT:    flat_store_dword v[0:1], v3
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 8, v41
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v42, vcc
+; GFX7-NEXT:    flat_store_dword v[0:1], v5
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v41
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v42, vcc
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    flat_store_dword v[41:42], v8
+; GFX7-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX7-NEXT:    s_mov_b32 s33, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+  %val = call <8 x half> @v8f16_result()
+  store <8 x half> %val, ptr %ptr
+  ret void
+}
+
+define half @call_split_type_used_outside_block_v8f16() #0 {
+; GFX7-LABEL: call_split_type_used_outside_block_v8f16:
+; GFX7:       ; %bb.0: ; %bb0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s16, s33
+; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX7-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX7-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX7-NEXT:    v_writelane_b32 v40, s16, 2
+; GFX7-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX7-NEXT:    s_mov_b32 s17, v8f16_result at abs32@hi
+; GFX7-NEXT:    s_mov_b32 s16, v8f16_result at abs32@lo
+; GFX7-NEXT:    s_addk_i32 s32, 0x400
+; GFX7-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX7-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX7-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX7-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX7-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX7-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX7-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX7-NEXT:    s_mov_b32 s33, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+bb0:
+  %split.ret.type = call <8 x half> @v8f16_result()
+  br label %bb1
+
+bb1:
+  %extract = extractelement <8 x half> %split.ret.type, i32 0
+  ret half %extract
+}
+
+declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) #0
+declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) #0
+declare <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half>, metadata) #0
+declare <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half>, metadata) #0
+
+declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) #0
+declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata) #0
+declare <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float>, metadata, metadata) #0
+declare <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float>, metadata, metadata) #0
+
+attributes #0 = { strictfp }

>From ba7286dd6c9aa65edbd7e9be24f5489a6c7b4440 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 5 Dec 2023 09:57:59 +0700
Subject: [PATCH 04/18] DAG: Fix chain mismanagement in
 SoftenFloatRes_FP_EXTEND

This would result in nodes not getting appropriately re-legalized
in the strictfp case.
---
 .../SelectionDAG/LegalizeFloatTypes.cpp       |   9 +-
 .../AArch64/strictfp_f16_abi_promote.ll       | 344 ++++++++++++++++++
 2 files changed, 351 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index d7a688511b726..12956cf874751 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -522,8 +522,11 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
     Op = GetPromotedFloat(Op);
     // If the promotion did the FP_EXTEND to the destination type for us,
     // there's nothing left to do here.
-    if (Op.getValueType() == N->getValueType(0))
+    if (Op.getValueType() == N->getValueType(0)) {
+      if (IsStrict)
+        ReplaceValueWith(SDValue(N, 1), Chain);
       return BitConvertToInteger(Op);
+    }
   }
 
   // There's only a libcall for f16 -> f32 and shifting is only valid for bf16
@@ -541,8 +544,10 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
     }
   }
 
-  if (Op.getValueType() == MVT::bf16)
+  if (Op.getValueType() == MVT::bf16) {
+    // FIXME: Need ReplaceValueWith on chain in strict case
     return SoftenFloatRes_BF16_TO_FP(N);
+  }
 
   RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0));
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!");
diff --git a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll
new file mode 100644
index 0000000000000..37186cf22ccc7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll
@@ -0,0 +1,344 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 < %s | FileCheck -check-prefix=NOFP16 %s
+
+declare void @f16_user(half)
+declare half @f16_result()
+
+declare void @v2f16_user(<2 x half>)
+declare <2 x half> @v2f16_result()
+
+declare void @v4f16_user(<4 x half>)
+declare <4 x half> @v4f16_result()
+
+declare void @v8f16_user(<8 x half>)
+declare <8 x half> @v8f16_result()
+
+define void @f16_arg(half %arg, ptr %ptr) #0 {
+; NOFP16-LABEL: f16_arg:
+; NOFP16:       // %bb.0:
+; NOFP16-NEXT:    stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
+; NOFP16-NEXT:    .cfi_def_cfa_offset 16
+; NOFP16-NEXT:    .cfi_offset w19, -8
+; NOFP16-NEXT:    .cfi_offset w30, -16
+; NOFP16-NEXT:    and w0, w0, #0xffff
+; NOFP16-NEXT:    mov x19, x1
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    str w0, [x19]
+; NOFP16-NEXT:    ldp x30, x19, [sp], #16 // 16-byte Folded Reload
+; NOFP16-NEXT:    ret
+  %fpext = call float @llvm.experimental.constrained.fpext.f32.f16(half %arg, metadata !"fpexcept.strict")
+  store float %fpext, ptr %ptr
+  ret void
+}
+
+define void @v2f16_arg(<2 x half> %arg, ptr %ptr) #0 {
+; NOFP16-LABEL: v2f16_arg:
+; NOFP16:       // %bb.0:
+; NOFP16-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; NOFP16-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; NOFP16-NEXT:    .cfi_def_cfa_offset 32
+; NOFP16-NEXT:    .cfi_offset w19, -8
+; NOFP16-NEXT:    .cfi_offset w20, -16
+; NOFP16-NEXT:    .cfi_offset w21, -24
+; NOFP16-NEXT:    .cfi_offset w30, -32
+; NOFP16-NEXT:    and w0, w0, #0xffff
+; NOFP16-NEXT:    mov x19, x2
+; NOFP16-NEXT:    mov w20, w1
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    mov w21, w0
+; NOFP16-NEXT:    and w0, w20, #0xffff
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    stp w21, w0, [x19]
+; NOFP16-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; NOFP16-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; NOFP16-NEXT:    ret
+  %fpext = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %arg, metadata !"fpexcept.strict")
+  store <2 x float> %fpext, ptr %ptr
+  ret void
+}
+
+define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 {
+; NOFP16-LABEL: v3f16_arg:
+; NOFP16:       // %bb.0:
+; NOFP16-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; NOFP16-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; NOFP16-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; NOFP16-NEXT:    .cfi_def_cfa_offset 48
+; NOFP16-NEXT:    .cfi_offset w19, -8
+; NOFP16-NEXT:    .cfi_offset w20, -16
+; NOFP16-NEXT:    .cfi_offset w21, -24
+; NOFP16-NEXT:    .cfi_offset w22, -32
+; NOFP16-NEXT:    .cfi_offset w30, -48
+; NOFP16-NEXT:    mov w21, w0
+; NOFP16-NEXT:    and w0, w2, #0xffff
+; NOFP16-NEXT:    mov x19, x3
+; NOFP16-NEXT:    mov w20, w1
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    mov w22, w0
+; NOFP16-NEXT:    and w0, w21, #0xffff
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    mov w21, w0
+; NOFP16-NEXT:    and w0, w20, #0xffff
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    mov w8, w21
+; NOFP16-NEXT:    // kill: def $w0 killed $w0 def $x0
+; NOFP16-NEXT:    str w22, [x19, #8]
+; NOFP16-NEXT:    orr x8, x8, x0, lsl #32
+; NOFP16-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; NOFP16-NEXT:    str x8, [x19]
+; NOFP16-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; NOFP16-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; NOFP16-NEXT:    ret
+  %fpext = call <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half> %arg, metadata !"fpexcept.strict")
+  store <3 x float> %fpext, ptr %ptr
+  ret void
+}
+
+define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 {
+; NOFP16-LABEL: v4f16_arg:
+; NOFP16:       // %bb.0:
+; NOFP16-NEXT:    stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
+; NOFP16-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; NOFP16-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; NOFP16-NEXT:    .cfi_def_cfa_offset 48
+; NOFP16-NEXT:    .cfi_offset w19, -8
+; NOFP16-NEXT:    .cfi_offset w20, -16
+; NOFP16-NEXT:    .cfi_offset w21, -24
+; NOFP16-NEXT:    .cfi_offset w22, -32
+; NOFP16-NEXT:    .cfi_offset w23, -40
+; NOFP16-NEXT:    .cfi_offset w30, -48
+; NOFP16-NEXT:    and w0, w0, #0xffff
+; NOFP16-NEXT:    mov x19, x4
+; NOFP16-NEXT:    mov w20, w3
+; NOFP16-NEXT:    mov w21, w2
+; NOFP16-NEXT:    mov w22, w1
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    mov w23, w0
+; NOFP16-NEXT:    and w0, w22, #0xffff
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    mov w22, w0
+; NOFP16-NEXT:    and w0, w21, #0xffff
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    mov w21, w0
+; NOFP16-NEXT:    and w0, w20, #0xffff
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    stp w21, w0, [x19, #8]
+; NOFP16-NEXT:    stp w23, w22, [x19]
+; NOFP16-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; NOFP16-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; NOFP16-NEXT:    ldp x30, x23, [sp], #48 // 16-byte Folded Reload
+; NOFP16-NEXT:    ret
+  %fpext = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %arg, metadata !"fpexcept.strict")
+  store <4 x float> %fpext, ptr %ptr
+  ret void
+}
+
+; FIXME:
+; define half @f16_return(float %arg) #0 {
+;   %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+;   ret half %fptrunc
+; }
+
+; define <2 x half> @v2f16_return(<2 x float> %arg) #0 {
+;   %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+;   ret <2 x half> %fptrunc
+; }
+
+; define <3 x half> @v3f16_return(<3 x float> %arg) #0 {
+;   %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+;   ret <3 x half> %fptrunc
+; }
+
+; define <4 x half> @v4f16_return(<4 x float> %arg) #0 {
+;   %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+;   ret <4 x half> %fptrunc
+; }
+
+; FIXME:
+; define void @outgoing_f16_arg(ptr %ptr) #0 {
+;   %val = load half, ptr %ptr
+;   call void @f16_user(half %val)
+;   ret void
+; }
+
+; define void @outgoing_v2f16_arg(ptr %ptr) #0 {
+;   %val = load <2 x half>, ptr %ptr
+;   call void @v2f16_user(<2 x half> %val)
+;   ret void
+; }
+
+; define void @outgoing_f16_return(ptr %ptr) #0 {
+;   %val = call half @f16_result()
+;   store half %val, ptr %ptr
+;   ret void
+; }
+
+; define void @outgoing_v2f16_return(ptr %ptr) #0 {
+;   %val = call <2 x half> @v2f16_result()
+;   store <2 x half> %val, ptr %ptr
+;   ret void
+; }
+
+define void @outgoing_v4f16_return(ptr %ptr) #0 {
+; NOFP16-LABEL: outgoing_v4f16_return:
+; NOFP16:       // %bb.0:
+; NOFP16-NEXT:    stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
+; NOFP16-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; NOFP16-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; NOFP16-NEXT:    .cfi_def_cfa_offset 48
+; NOFP16-NEXT:    .cfi_offset w19, -8
+; NOFP16-NEXT:    .cfi_offset w20, -16
+; NOFP16-NEXT:    .cfi_offset w21, -24
+; NOFP16-NEXT:    .cfi_offset w22, -32
+; NOFP16-NEXT:    .cfi_offset w23, -40
+; NOFP16-NEXT:    .cfi_offset w30, -48
+; NOFP16-NEXT:    mov x19, x0
+; NOFP16-NEXT:    bl v4f16_result
+; NOFP16-NEXT:    and w0, w0, #0xffff
+; NOFP16-NEXT:    mov w20, w1
+; NOFP16-NEXT:    mov w21, w2
+; NOFP16-NEXT:    mov w22, w3
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    mov w23, w0
+; NOFP16-NEXT:    and w0, w20, #0xffff
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    mov w20, w0
+; NOFP16-NEXT:    and w0, w21, #0xffff
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    mov w21, w0
+; NOFP16-NEXT:    and w0, w22, #0xffff
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    strh w0, [x19, #6]
+; NOFP16-NEXT:    mov w0, w21
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    strh w0, [x19, #4]
+; NOFP16-NEXT:    mov w0, w20
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    strh w0, [x19, #2]
+; NOFP16-NEXT:    mov w0, w23
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    strh w0, [x19]
+; NOFP16-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; NOFP16-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; NOFP16-NEXT:    ldp x30, x23, [sp], #48 // 16-byte Folded Reload
+; NOFP16-NEXT:    ret
+  %val = call <4 x half> @v4f16_result()
+  store <4 x half> %val, ptr %ptr
+  ret void
+}
+
+define void @outgoing_v8f16_return(ptr %ptr) #0 {
+; NOFP16-LABEL: outgoing_v8f16_return:
+; NOFP16:       // %bb.0:
+; NOFP16-NEXT:    stp x30, x27, [sp, #-80]! // 16-byte Folded Spill
+; NOFP16-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
+; NOFP16-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
+; NOFP16-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; NOFP16-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; NOFP16-NEXT:    .cfi_def_cfa_offset 80
+; NOFP16-NEXT:    .cfi_offset w19, -8
+; NOFP16-NEXT:    .cfi_offset w20, -16
+; NOFP16-NEXT:    .cfi_offset w21, -24
+; NOFP16-NEXT:    .cfi_offset w22, -32
+; NOFP16-NEXT:    .cfi_offset w23, -40
+; NOFP16-NEXT:    .cfi_offset w24, -48
+; NOFP16-NEXT:    .cfi_offset w25, -56
+; NOFP16-NEXT:    .cfi_offset w26, -64
+; NOFP16-NEXT:    .cfi_offset w27, -72
+; NOFP16-NEXT:    .cfi_offset w30, -80
+; NOFP16-NEXT:    mov x19, x0
+; NOFP16-NEXT:    bl v8f16_result
+; NOFP16-NEXT:    and w0, w0, #0xffff
+; NOFP16-NEXT:    mov w21, w1
+; NOFP16-NEXT:    mov w22, w2
+; NOFP16-NEXT:    mov w23, w3
+; NOFP16-NEXT:    mov w24, w4
+; NOFP16-NEXT:    mov w25, w5
+; NOFP16-NEXT:    mov w26, w6
+; NOFP16-NEXT:    mov w27, w7
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    mov w20, w0
+; NOFP16-NEXT:    and w0, w21, #0xffff
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    mov w21, w0
+; NOFP16-NEXT:    and w0, w22, #0xffff
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    mov w22, w0
+; NOFP16-NEXT:    and w0, w23, #0xffff
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    mov w23, w0
+; NOFP16-NEXT:    and w0, w24, #0xffff
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    mov w24, w0
+; NOFP16-NEXT:    and w0, w25, #0xffff
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    mov w25, w0
+; NOFP16-NEXT:    and w0, w26, #0xffff
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    mov w26, w0
+; NOFP16-NEXT:    and w0, w27, #0xffff
+; NOFP16-NEXT:    bl __gnu_h2f_ieee
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    strh w0, [x19, #14]
+; NOFP16-NEXT:    mov w0, w26
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    strh w0, [x19, #12]
+; NOFP16-NEXT:    mov w0, w25
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    strh w0, [x19, #10]
+; NOFP16-NEXT:    mov w0, w24
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    strh w0, [x19, #8]
+; NOFP16-NEXT:    mov w0, w23
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    strh w0, [x19, #6]
+; NOFP16-NEXT:    mov w0, w22
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    strh w0, [x19, #4]
+; NOFP16-NEXT:    mov w0, w21
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    strh w0, [x19, #2]
+; NOFP16-NEXT:    mov w0, w20
+; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    strh w0, [x19]
+; NOFP16-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; NOFP16-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; NOFP16-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
+; NOFP16-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
+; NOFP16-NEXT:    ldp x30, x27, [sp], #80 // 16-byte Folded Reload
+; NOFP16-NEXT:    ret
+  %val = call <8 x half> @v8f16_result()
+  store <8 x half> %val, ptr %ptr
+  ret void
+}
+
+define half @call_split_type_used_outside_block_v8f16() #0 {
+; NOFP16-LABEL: call_split_type_used_outside_block_v8f16:
+; NOFP16:       // %bb.0: // %bb0
+; NOFP16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; NOFP16-NEXT:    .cfi_def_cfa_offset 16
+; NOFP16-NEXT:    .cfi_offset w30, -16
+; NOFP16-NEXT:    bl v8f16_result
+; NOFP16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; NOFP16-NEXT:    ret
+bb0:
+  %split.ret.type = call <8 x half> @v8f16_result()
+  br label %bb1
+
+bb1:
+  %extract = extractelement <8 x half> %split.ret.type, i32 0
+  ret half %extract
+}
+
+declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) #0
+declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) #0
+declare <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half>, metadata) #0
+declare <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half>, metadata) #0
+
+declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) #0
+declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata) #0
+declare <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float>, metadata, metadata) #0
+declare <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float>, metadata, metadata) #0
+
+attributes #0 = { strictfp }

>From 91cdd376f0aa3fe4c1dc697ec0c4310235c3747f Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 5 Dec 2023 09:54:52 +0700
Subject: [PATCH 05/18] XXX - add ARM test

---
 .../CodeGen/ARM/strictfp_f16_abi_promote.ll   | 115 ++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 llvm/test/CodeGen/ARM/strictfp_f16_abi_promote.ll

diff --git a/llvm/test/CodeGen/ARM/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/ARM/strictfp_f16_abi_promote.ll
new file mode 100644
index 0000000000000..2da8ea66a0b95
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/strictfp_f16_abi_promote.ll
@@ -0,0 +1,115 @@
+; RUN: llc -mtriple=arm-eabi -mattr=+v8.2a,+neon,-fullfp16 -float-abi=hard < %s | FileCheck -check-prefix=NOFP16 %s
+
+declare void @f16_user(half)
+declare half @f16_result()
+
+declare void @v2f16_user(<2 x half>)
+declare <2 x half> @v2f16_result()
+
+declare void @v4f16_user(<4 x half>)
+declare <4 x half> @v4f16_result()
+
+declare void @v8f16_user(<8 x half>)
+declare <8 x half> @v8f16_result()
+
+define void @f16_arg(half %arg, ptr %ptr) #0 {
+  %fpext = call float @llvm.experimental.constrained.fpext.f32.f16(half %arg, metadata !"fpexcept.strict")
+  store float %fpext, ptr %ptr
+  ret void
+}
+
+define void @v2f16_arg(<2 x half> %arg, ptr %ptr) #0 {
+  %fpext = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %arg, metadata !"fpexcept.strict")
+  store <2 x float> %fpext, ptr %ptr
+  ret void
+}
+
+define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 {
+  %fpext = call <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half> %arg, metadata !"fpexcept.strict")
+  store <3 x float> %fpext, ptr %ptr
+  ret void
+}
+
+define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 {
+  %fpext = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %arg, metadata !"fpexcept.strict")
+  store <4 x float> %fpext, ptr %ptr
+  ret void
+}
+
+define half @f16_return(float %arg) #0 {
+  %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret half %fptrunc
+}
+
+define <2 x half> @v2f16_return(<2 x float> %arg) #0 {
+  %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret <2 x half> %fptrunc
+}
+
+define <3 x half> @v3f16_return(<3 x float> %arg) #0 {
+  %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret <3 x half> %fptrunc
+}
+
+define <4 x half> @v4f16_return(<4 x float> %arg) #0 {
+  %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret <4 x half> %fptrunc
+}
+
+define void @outgoing_f16_arg(ptr %ptr) #0 {
+  %val = load half, ptr %ptr
+  call void @f16_user(half %val)
+  ret void
+}
+
+define void @outgoing_v2f16_arg(ptr %ptr) #0 {
+  %val = load <2 x half>, ptr %ptr
+  call void @v2f16_user(<2 x half> %val)
+  ret void
+}
+
+define void @outgoing_f16_return(ptr %ptr) #0 {
+  %val = call half @f16_result()
+  store half %val, ptr %ptr
+  ret void
+}
+
+define void @outgoing_v2f16_return(ptr %ptr) #0 {
+  %val = call <2 x half> @v2f16_result()
+  store <2 x half> %val, ptr %ptr
+  ret void
+}
+
+define void @outgoing_v4f16_return(ptr %ptr) #0 {
+  %val = call <4 x half> @v4f16_result()
+  store <4 x half> %val, ptr %ptr
+  ret void
+}
+
+define void @outgoing_v8f16_return(ptr %ptr) #0 {
+  %val = call <8 x half> @v8f16_result()
+  store <8 x half> %val, ptr %ptr
+  ret void
+}
+
+define half @call_split_type_used_outside_block_v8f16() #0 {
+bb0:
+  %split.ret.type = call <8 x half> @v8f16_result()
+  br label %bb1
+
+bb1:
+  %extract = extractelement <8 x half> %split.ret.type, i32 0
+  ret half %extract
+}
+
+declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) #0
+declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) #0
+declare <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half>, metadata) #0
+declare <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half>, metadata) #0
+
+declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) #0
+declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata) #0
+declare <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float>, metadata, metadata) #0
+declare <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float>, metadata, metadata) #0
+
+attributes #0 = { strictfp }

>From 72e2cf7ba1b78d8d70b528252d33d7466fd163a1 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 4 Dec 2023 18:16:20 +0700
Subject: [PATCH 06/18] AMDGPU: Improve handling of strictfp fp_to_fp16

---
 llvm/lib/Target/AMDGPU/SIInstructions.td   | 8 ++++----
 llvm/lib/Target/AMDGPU/VOP1Instructions.td | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index d23d6b4f93da8..4e01f17349c2c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1102,22 +1102,22 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16
   >;
 
   def : GCNPat <
-    (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
+    (f32 (any_f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
     (cvt_f32_f16_inst_e64 SRCMODS.ABS, $src0)
   >;
 
   def : GCNPat <
-    (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))),
+    (f32 (any_f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))),
     (cvt_f32_f16_inst_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)))
   >;
 
   def : GCNPat <
-    (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
+    (f32 (any_f16_to_fp (or_oneuse i32:$src0, 0x8000))),
     (cvt_f32_f16_inst_e64 SRCMODS.NEG_ABS, $src0)
   >;
 
   def : GCNPat <
-    (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
+    (f32 (any_f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
     (cvt_f32_f16_inst_e64 SRCMODS.NEG, $src0)
   >;
 
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 53b0513c85d88..f398d91b59091 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -509,7 +509,7 @@ defm V_FRACT_F16 : VOP1Inst_t16 <"v_fract_f16", VOP_F16_F16, AMDGPUfract>;
 
 let OtherPredicates = [Has16BitInsts, NotHasTrue16BitInsts] in {
 def : GCNPat<
-    (f32 (f16_to_fp i16:$src)),
+    (f32 (any_f16_to_fp i16:$src)),
     (V_CVT_F32_F16_e32 $src)
 >;
 def : GCNPat<
@@ -519,7 +519,7 @@ def : GCNPat<
 }
 let OtherPredicates = [HasTrue16BitInsts] in {
 def : GCNPat<
-    (f32 (f16_to_fp i16:$src)),
+    (f32 (any_f16_to_fp i16:$src)),
     (V_CVT_F32_F16_t16_e32 $src)
 >;
 def : GCNPat<

>From dd7d90d432e7a1de50107e4e5e0d6ab0a88a57b9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sun, 25 Jun 2023 19:28:18 -0400
Subject: [PATCH 07/18] XXX - AMDGPU: Handle strict sitofp

---
 llvm/lib/Target/AMDGPU/VOP1Instructions.td |   8 +-
 llvm/test/CodeGen/AMDGPU/strict_sitofp.ll  | 258 +++++++++++++++++++++
 2 files changed, 262 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/strict_sitofp.ll

diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index f398d91b59091..72d8f97991729 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -262,7 +262,7 @@ let SchedRW = [WriteDoubleCvt] in {
 defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64_SPECIAL_OMOD,  fp_to_sint>;
 
 let mayRaiseFPException = 0 in {
-defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>;
+defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, any_sint_to_fp>;
 }
 
 defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64,  fpround>;
@@ -281,7 +281,7 @@ let SchedRW = [WriteFloatCvt] in {
 // XXX: Does this really not raise exceptions? The manual claims the
 // 16-bit ones can.
 let mayRaiseFPException = 0 in {
-defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>;
+defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, any_sint_to_fp>;
 defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>;
 }
 
@@ -467,11 +467,11 @@ let SubtargetPredicate = isGFX7Plus in {
 let FPDPRounding = 1 in {
 let OtherPredicates = [Has16BitInsts, NotHasTrue16BitInsts] in {
 defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>;
-defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
+defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, any_sint_to_fp>;
 }
 let OtherPredicates = [HasTrue16BitInsts] in {
 defm V_CVT_F16_U16_t16 : VOP1Inst <"v_cvt_f16_u16_t16", VOP1_F16_I16_t16, uint_to_fp>;
-defm V_CVT_F16_I16_t16 : VOP1Inst <"v_cvt_f16_i16_t16", VOP1_F16_I16_t16, sint_to_fp>;
+defm V_CVT_F16_I16_t16 : VOP1Inst <"v_cvt_f16_i16_t16", VOP1_F16_I16_t16, any_sint_to_fp>;
 }
 } // End FPDPRounding = 1
 // OMod clears exceptions when set in these two instructions
diff --git a/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll b/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
new file mode 100644
index 0000000000000..bb79071c48021
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
@@ -0,0 +1,258 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; FIXME: Missing operand promote for f16
+; XUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX8 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX9 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11 %s
+
+define half @v_constrained_sitofp_i16_to_f16_fpexcept_strict(i16 %arg) #0 {
+  %result = call half @llvm.experimental.constrained.sitofp.f16.i16(i16 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret half %result
+}
+
+define <2 x half> @v_constrained_sitofp_v2i16_to_v2f16_fpexcept_strict(<2 x i16> %arg) #0 {
+  %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i16(<2 x i16> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <2 x half> %result
+}
+
+define <3 x half> @v_constrained_sitofp_v3i16_to_v3f16_fpexcept_strict(<3 x i16> %arg) #0 {
+  %result = call <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i16(<3 x i16> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <3 x half> %result
+}
+
+define float @v_constrained_sitofp_i16_to_f32_fpexcept_strict(i16 %arg) #0 {
+  %result = call float @llvm.experimental.constrained.sitofp.f32.i16(i16 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret float %result
+}
+
+define <2 x float> @v_constrained_sitofp_v2i16_to_v2f32_fpexcept_strict(<2 x i16> %arg) #0 {
+  %result = call <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i16(<2 x i16> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <2 x float> %result
+}
+
+define <3 x float> @v_constrained_sitofp_v3i16_to_v3f32_fpexcept_strict(<3 x i16> %arg) #0 {
+  %result = call <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i16(<3 x i16> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <3 x float> %result
+}
+
+define double @v_constrained_sitofp_i16_to_f64_fpexcept_strict(i16 %arg) #0 {
+  %result = call double @llvm.experimental.constrained.sitofp.f64.i16(i16 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret double %result
+}
+
+define <2 x double> @v_constrained_sitofp_v2i16_to_v2f64_fpexcept_strict(<2 x i16> %arg) #0 {
+  %result = call <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i16(<2 x i16> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <2 x double> %result
+}
+
+define <3 x double> @v_constrained_sitofp_v3i16_to_v3f64_fpexcept_strict(<3 x i16> %arg) #0 {
+  %result = call <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i16(<3 x i16> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <3 x double> %result
+}
+
+declare half @llvm.experimental.constrained.sitofp.f16.i16(i16, metadata, metadata) #1
+declare <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i16(<2 x i16>, metadata, metadata) #1
+declare <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i16(<3 x i16>, metadata, metadata) #1
+
+declare float @llvm.experimental.constrained.sitofp.f32.i16(i16, metadata, metadata) #1
+declare <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i16(<2 x i16>, metadata, metadata) #1
+declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i16(<3 x i16>, metadata, metadata) #1
+
+declare double @llvm.experimental.constrained.sitofp.f64.i16(i16, metadata, metadata) #1
+declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i16(<2 x i16>, metadata, metadata) #1
+declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i16(<3 x i16>, metadata, metadata) #1
+
+
+
+
+
+
+define half @v_constrained_sitofp_i32_to_f16_fpexcept_strict(i32 %arg) #0 {
+  %result = call half @llvm.experimental.constrained.sitofp.f16.i32(i32 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret half %result
+}
+
+define <2 x half> @v_constrained_sitofp_v2i32_to_v2f16_fpexcept_strict(<2 x i32> %arg) #0 {
+  %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i32(<2 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <2 x half> %result
+}
+
+define <3 x half> @v_constrained_sitofp_v3i32_to_v3f16_fpexcept_strict(<3 x i32> %arg) #0 {
+  %result = call <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i32(<3 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <3 x half> %result
+}
+
+define float @v_constrained_sitofp_i32_to_f32_fpexcept_strict(i32 %arg) #0 {
+  %result = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret float %result
+}
+
+define <2 x float> @v_constrained_sitofp_v2i32_to_v2f32_fpexcept_strict(<2 x i32> %arg) #0 {
+  %result = call <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i32(<2 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <2 x float> %result
+}
+
+define <3 x float> @v_constrained_sitofp_v3i32_to_v3f32_fpexcept_strict(<3 x i32> %arg) #0 {
+  %result = call <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i32(<3 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <3 x float> %result
+}
+
+define double @v_constrained_sitofp_i32_to_f64_fpexcept_strict(i32 %arg) #0 {
+  %result = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret double %result
+}
+
+define <2 x double> @v_constrained_sitofp_v2i32_to_v2f64_fpexcept_strict(<2 x i32> %arg) #0 {
+  %result = call <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i32(<2 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <2 x double> %result
+}
+
+define <3 x double> @v_constrained_sitofp_v3i32_to_v3f64_fpexcept_strict(<3 x i32> %arg) #0 {
+  %result = call <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i32(<3 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <3 x double> %result
+}
+
+declare half @llvm.experimental.constrained.sitofp.f16.i32(i32, metadata, metadata) #1
+declare <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i32(<2 x i32>, metadata, metadata) #1
+declare <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i32(<3 x i32>, metadata, metadata) #1
+
+declare float @llvm.experimental.constrained.sitofp.f32.i32(i32, metadata, metadata) #1
+declare <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i32(<2 x i32>, metadata, metadata) #1
+declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i32(<3 x i32>, metadata, metadata) #1
+
+declare double @llvm.experimental.constrained.sitofp.f64.i32(i32, metadata, metadata) #1
+declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i32(<2 x i32>, metadata, metadata) #1
+declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i32(<3 x i32>, metadata, metadata) #1
+
+
+
+
+
+
+attributes #0 = { strictfp }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
+; GFX8: {{.*}}
+; GFX9: {{.*}}
+
+
+
+
+
+
+
+define half @v_constrained_sitofp_i64_to_f16_fpexcept_strict(i64 %arg) #0 {
+  %result = call half @llvm.experimental.constrained.sitofp.f16.i64(i64 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret half %result
+}
+
+define <2 x half> @v_constrained_sitofp_v2i64_to_v2f16_fpexcept_strict(<2 x i64> %arg) #0 {
+  %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i64(<2 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <2 x half> %result
+}
+
+define <3 x half> @v_constrained_sitofp_v3i64_to_v3f16_fpexcept_strict(<3 x i64> %arg) #0 {
+  %result = call <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i64(<3 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <3 x half> %result
+}
+
+define float @v_constrained_sitofp_i64_to_f32_fpexcept_strict(i64 %arg) #0 {
+  %result = call float @llvm.experimental.constrained.sitofp.f32.i64(i64 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret float %result
+}
+
+define <2 x float> @v_constrained_sitofp_v2i64_to_v2f32_fpexcept_strict(<2 x i64> %arg) #0 {
+  %result = call <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i64(<2 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <2 x float> %result
+}
+
+define <3 x float> @v_constrained_sitofp_v3i64_to_v3f32_fpexcept_strict(<3 x i64> %arg) #0 {
+  %result = call <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i64(<3 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <3 x float> %result
+}
+
+define double @v_constrained_sitofp_i64_to_f64_fpexcept_strict(i64 %arg) #0 {
+  %result = call double @llvm.experimental.constrained.sitofp.f64.i64(i64 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret double %result
+}
+
+define <2 x double> @v_constrained_sitofp_v2i64_to_v2f64_fpexcept_strict(<2 x i64> %arg) #0 {
+  %result = call <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i64(<2 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <2 x double> %result
+}
+
+define <3 x double> @v_constrained_sitofp_v3i64_to_v3f64_fpexcept_strict(<3 x i64> %arg) #0 {
+  %result = call <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i64(<3 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <3 x double> %result
+}
+
+declare half @llvm.experimental.constrained.sitofp.f16.i64(i64, metadata, metadata) #1
+declare <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i64(<2 x i64>, metadata, metadata) #1
+declare <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i64(<3 x i64>, metadata, metadata) #1
+
+declare float @llvm.experimental.constrained.sitofp.f32.i64(i64, metadata, metadata) #1
+declare <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i64(<2 x i64>, metadata, metadata) #1
+declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i64(<3 x i64>, metadata, metadata) #1
+
+declare double @llvm.experimental.constrained.sitofp.f64.i64(i64, metadata, metadata) #1
+declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i64(<2 x i64>, metadata, metadata) #1
+declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i64(<3 x i64>, metadata, metadata) #1
+
+define half @v_constrained_sitofp_i8_to_f16_fpexcept_strict(i8 %arg) #0 {
+  %result = call half @llvm.experimental.constrained.sitofp.f16.i8(i8 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret half %result
+}
+
+define <2 x half> @v_constrained_sitofp_v2i8_to_v2f16_fpexcept_strict(<2 x i8> %arg) #0 {
+  %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i8(<2 x i8> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <2 x half> %result
+}
+
+define <3 x half> @v_constrained_sitofp_v3i8_to_v3f16_fpexcept_strict(<3 x i8> %arg) #0 {
+  %result = call <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i8(<3 x i8> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <3 x half> %result
+}
+
+define float @v_constrained_sitofp_i8_to_f32_fpexcept_strict(i8 %arg) #0 {
+  %result = call float @llvm.experimental.constrained.sitofp.f32.i8(i8 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret float %result
+}
+
+define <2 x float> @v_constrained_sitofp_v2i8_to_v2f32_fpexcept_strict(<2 x i8> %arg) #0 {
+  %result = call <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i8(<2 x i8> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <2 x float> %result
+}
+
+define <3 x float> @v_constrained_sitofp_v3i8_to_v3f32_fpexcept_strict(<3 x i8> %arg) #0 {
+  %result = call <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i8(<3 x i8> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <3 x float> %result
+}
+
+define double @v_constrained_sitofp_i8_to_f64_fpexcept_strict(i8 %arg) #0 {
+  %result = call double @llvm.experimental.constrained.sitofp.f64.i8(i8 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret double %result
+}
+
+define <2 x double> @v_constrained_sitofp_v2i8_to_v2f64_fpexcept_strict(<2 x i8> %arg) #0 {
+  %result = call <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i8(<2 x i8> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <2 x double> %result
+}
+
+define <3 x double> @v_constrained_sitofp_v3i8_to_v3f64_fpexcept_strict(<3 x i8> %arg) #0 {
+  %result = call <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i8(<3 x i8> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <3 x double> %result
+}
+
+declare half @llvm.experimental.constrained.sitofp.f16.i8(i8, metadata, metadata) #1
+declare <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i8(<2 x i8>, metadata, metadata) #1
+declare <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i8(<3 x i8>, metadata, metadata) #1
+
+declare float @llvm.experimental.constrained.sitofp.f32.i8(i8, metadata, metadata) #1
+declare <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i8(<2 x i8>, metadata, metadata) #1
+declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i8(<3 x i8>, metadata, metadata) #1
+
+declare double @llvm.experimental.constrained.sitofp.f64.i8(i8, metadata, metadata) #1
+declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i8(<2 x i8>, metadata, metadata) #1
+declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i8(<3 x i8>, metadata, metadata) #1

>From de821014c6a8c8a12dd5a405c1317587fb237194 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 14:59:47 +0900
Subject: [PATCH 08/18] XXX - strict sint to fp

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 10 +++++++---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  2 +-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index fcbdf51b03c1f..3b59ac1156054 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1331,7 +1331,9 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
     return lowerFEXP(Op, DAG);
   case ISD::FEXP2:
     return lowerFEXP2(Op, DAG);
-  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
+  case ISD::SINT_TO_FP:
+  case ISD::STRICT_SINT_TO_FP:
+    return LowerSINT_TO_FP(Op, DAG);
   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
   case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
   case ISD::FP_TO_SINT:
@@ -3281,8 +3283,8 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
                                               SelectionDAG &DAG) const {
   EVT DestVT = Op.getValueType();
-
-  SDValue Src = Op.getOperand(0);
+  bool IsStrict = Op.getOpcode() == ISD::STRICT_SINT_TO_FP;
+  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
   EVT SrcVT = Src.getValueType();
 
   if (SrcVT == MVT::i16) {
@@ -3292,6 +3294,8 @@ SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
     SDLoc DL(Op);
     // Promote src to i32
     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
+    if (IsStrict)
+      return DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, Op->getVTList(), Op.getOperand(0), Ext);
     return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
   }
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 53ab5da013539..bdf4b1c6213fa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -539,7 +539,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
                         ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND},
                        MVT::f16, Custom);
 
-    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
+    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP}, MVT::i16, Custom);
 
     setOperationAction(
         {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP},

>From 2667dba20a2329cde39e0735223662f4be99e548 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 15:30:34 +0900
Subject: [PATCH 09/18] Hacky

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4 ++--
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 5 ++++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 3b59ac1156054..8ea6eb71db218 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -421,7 +421,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 
   setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
   setOperationAction(
-      {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
+    {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
       MVT::i64, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
 
@@ -442,7 +442,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                         ISD::FP_TO_UINT, ISD::MUL,     ISD::MULHU,
                         ISD::MULHS,      ISD::OR,      ISD::SHL,
                         ISD::SRA,        ISD::SRL,     ISD::ROTL,
-                        ISD::ROTR,       ISD::SUB,     ISD::SINT_TO_FP,
+                        ISD::ROTR,       ISD::SUB,     ISD::SINT_TO_FP, ISD::STRICT_SINT_TO_FP,
                         ISD::UINT_TO_FP, ISD::SDIV,    ISD::UDIV,
                         ISD::SREM,       ISD::UREM,    ISD::SMUL_LOHI,
                         ISD::UMUL_LOHI,  ISD::SDIVREM, ISD::UDIVREM,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index bdf4b1c6213fa..db329e3cbe3a3 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -542,9 +542,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP}, MVT::i16, Custom);
 
     setOperationAction(
-        {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP},
+        {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::STRICT_SINT_TO_FP,
+         ISD::UINT_TO_FP},
         MVT::f16, Promote);
 
+    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Legal);
+
     // F16 - VOP2 Actions.
     setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, MVT::f16, Expand);
     setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);

>From 873defcf65bfe8acdc474c0a73be89c32b9374fc Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 15:30:48 +0900
Subject: [PATCH 10/18] worky

---
 llvm/lib/Target/AMDGPU/SIInstructions.td  |  2 +-
 llvm/test/CodeGen/AMDGPU/strict_sitofp.ll | 24 ++++++++++++++++++++---
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 4e01f17349c2c..581c23cede7d4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1143,7 +1143,7 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16
   >;
 
   def : GCNPat <
-    (f16 (sint_to_fp i32:$src)),
+    (f16 (any_sint_to_fp i32:$src)),
     (cvt_f16_f32_inst_e64 SRCMODS.NONE, (V_CVT_F32_I32_e32 VSrc_b32:$src))
   >;
 
diff --git a/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll b/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
index bb79071c48021..b61b38b6183c8 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
@@ -4,7 +4,10 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX8 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX9 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11,GFX1100 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1150 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11,GFX1150 %s
+
+
 
 define half @v_constrained_sitofp_i16_to_f16_fpexcept_strict(i16 %arg) #0 {
   %result = call half @llvm.experimental.constrained.sitofp.f16.i16(i16 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
@@ -88,7 +91,7 @@ define float @v_constrained_sitofp_i32_to_f32_fpexcept_strict(i32 %arg) #0 {
   ret float %result
 }
 
-define <2 x float> @v_constrained_sitofp_v2i32_to_v2f32_fpexcept_strict(<2 x i32> %arg) #0 {
+define <2 x float> @v_constrained_sitofp_v2i32_to_v2f32_fpexcept_strict(a<2 x i32> %arg) #0 {
   %result = call <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i32(<2 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <2 x float> %result
 }
@@ -125,10 +128,25 @@ declare double @llvm.experimental.constrained.sitofp.f64.i32(i32, metadata, meta
 declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i32(<2 x i32>, metadata, metadata) #1
 declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i32(<3 x i32>, metadata, metadata) #1
 
+define half @s_constrained_sitofp_i32_to_f16_fpexcept_strict(i32 inreg %arg) #0 {
+  %result = call half @llvm.experimental.constrained.sitofp.f16.i32(i32 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret half %result
+}
 
+define float @s_constrained_sitofp_i32_to_f32_fpexcept_strict(i32 inreg %arg) #0 {
+  %result = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret half %result
+}
 
+define <2 x half> @s_constrained_sitofp_v2i32_to_v2f16_fpexcept_strict(<2 x i32> inreg %arg) #0 {
+  %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i32(<2 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <2 x half> %result
+}
 
-
+define <2 x float> @s_constrained_sitofp_v2i32_to_v2f32_fpexcept_strict(<2 x i32> inreg %arg) #0 {
+  %result = call <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i32(<2 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <2 x float> %result
+}
 
 attributes #0 = { strictfp }
 attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }

>From 0ef481559fc174bc248153271cb46995b5355ddd Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 15:30:58 +0900
Subject: [PATCH 11/18] todo

---
 llvm/lib/Target/AMDGPU/SOPInstructions.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 9e10efd1b07e1..763978178e6d6 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -413,7 +413,7 @@ class SOP1_F32_Inst<string opName, SDPatternOperator Op, ValueType vt0=f32,
 
 let SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE],
     SchedRW = [WriteSFPU], isReMaterializable = 1 in {
-  def S_CVT_F32_I32 : SOP1_F32_Inst<"s_cvt_f32_i32", sint_to_fp, f32, i32>;
+  def S_CVT_F32_I32 : SOP1_F32_Inst<"s_cvt_f32_i32", sint_to_fp, f32, i32>; // xxx - any
   def S_CVT_F32_U32 : SOP1_F32_Inst<"s_cvt_f32_u32", uint_to_fp, f32, i32>;
 
   let mayRaiseFPException = 1 in {

>From 5378720b763cf0935c9fe95f019fd26f374eff37 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 15:31:39 +0900
Subject: [PATCH 12/18] test

---
 llvm/test/CodeGen/AMDGPU/strict_sitofp.ll | 31 +++++++----------------
 1 file changed, 9 insertions(+), 22 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll b/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
index b61b38b6183c8..a4138f8715851 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
@@ -7,8 +7,6 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11,GFX1100 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1150 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11,GFX1150 %s
 
-
-
 define half @v_constrained_sitofp_i16_to_f16_fpexcept_strict(i16 %arg) #0 {
   %result = call half @llvm.experimental.constrained.sitofp.f16.i16(i16 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret half %result
@@ -66,11 +64,6 @@ declare double @llvm.experimental.constrained.sitofp.f64.i16(i16, metadata, meta
 declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i16(<2 x i16>, metadata, metadata) #1
 declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i16(<3 x i16>, metadata, metadata) #1
 
-
-
-
-
-
 define half @v_constrained_sitofp_i32_to_f16_fpexcept_strict(i32 %arg) #0 {
   %result = call half @llvm.experimental.constrained.sitofp.f16.i32(i32 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret half %result
@@ -91,7 +84,7 @@ define float @v_constrained_sitofp_i32_to_f32_fpexcept_strict(i32 %arg) #0 {
   ret float %result
 }
 
-define <2 x float> @v_constrained_sitofp_v2i32_to_v2f32_fpexcept_strict(a<2 x i32> %arg) #0 {
+define <2 x float> @v_constrained_sitofp_v2i32_to_v2f32_fpexcept_strict(<2 x i32> %arg) #0 {
   %result = call <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i32(<2 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <2 x float> %result
 }
@@ -135,7 +128,7 @@ define half @s_constrained_sitofp_i32_to_f16_fpexcept_strict(i32 inreg %arg) #0
 
 define float @s_constrained_sitofp_i32_to_f32_fpexcept_strict(i32 inreg %arg) #0 {
   %result = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
-  ret half %result
+  ret float %result
 }
 
 define <2 x half> @s_constrained_sitofp_v2i32_to_v2f16_fpexcept_strict(<2 x i32> inreg %arg) #0 {
@@ -148,19 +141,6 @@ define <2 x float> @s_constrained_sitofp_v2i32_to_v2f32_fpexcept_strict(<2 x i32
   ret <2 x float> %result
 }
 
-attributes #0 = { strictfp }
-attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GCN: {{.*}}
-; GFX8: {{.*}}
-; GFX9: {{.*}}
-
-
-
-
-
-
-
 define half @v_constrained_sitofp_i64_to_f16_fpexcept_strict(i64 %arg) #0 {
   %result = call half @llvm.experimental.constrained.sitofp.f16.i64(i64 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret half %result
@@ -274,3 +254,10 @@ declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i8(<3 x i8>, m
 declare double @llvm.experimental.constrained.sitofp.f64.i8(i8, metadata, metadata) #1
 declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i8(<2 x i8>, metadata, metadata) #1
 declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i8(<3 x i8>, metadata, metadata) #1
+
+attributes #0 = { strictfp }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
+; GFX8: {{.*}}
+; GFX9: {{.*}}

>From d8f7854928006f708fe73a353aaecfb97e021696 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 15:38:14 +0900
Subject: [PATCH 13/18] XXX - Strict support for LowerINT_TO_FP32

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 8ea6eb71db218..c04733e453bac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3099,6 +3099,8 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons
 
 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
                                                bool Signed) const {
+  bool IsStrict = Op->getNumValues() == 2;
+
   // The regular method converting a 64-bit integer to float roughly consists of
   // 2 steps: normalization and rounding. In fact, after normalization, the
   // conversion from a 64-bit integer to a float is essentially the same as the
@@ -3126,7 +3128,7 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
   // converted instead followed by negation based its sign bit.
 
   SDLoc SL(Op);
-  SDValue Src = Op.getOperand(0);
+  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
 
   SDValue Lo, Hi;
   std::tie(Lo, Hi) = split64BitValue(Src, DAG);
@@ -3202,8 +3204,12 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
   ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
                       ShAmt);
   // On GCN, use LDEXP directly.
-  if (Subtarget->isGCN())
+  if (Subtarget->isGCN()) {
+    if (IsStrict)
+      return DAG.getNode(ISD::STRICT_FLDEXP, SL, MVT::f32, Op.getOperand(0), FVal, ShAmt);
+
     return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
+  }
 
   // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
   // part directly to emulate the multiplication of 2^ShAmt. That 8-bit

>From 63a2a831afcfe484a4d212aec473083c641727b9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 17:46:44 +0900
Subject: [PATCH 14/18] strict sitofp work

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 29 ++++++++++++++-----
 llvm/test/CodeGen/AMDGPU/strict_sitofp.ll     | 26 ++++++++---------
 2 files changed, 34 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index c04733e453bac..096d418f40b4b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -421,7 +421,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 
   setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
   setOperationAction(
-    {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
+    {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::STRICT_SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
       MVT::i64, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
 
@@ -3205,8 +3205,10 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
                       ShAmt);
   // On GCN, use LDEXP directly.
   if (Subtarget->isGCN()) {
-    if (IsStrict)
-      return DAG.getNode(ISD::STRICT_FLDEXP, SL, MVT::f32, Op.getOperand(0), FVal, ShAmt);
+    if (IsStrict) {
+      return DAG.getNode(ISD::STRICT_FLDEXP, SL, Op->getVTList(),
+                         Op.getOperand(0), FVal, ShAmt);
+    }
 
     return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
   }
@@ -3270,7 +3272,8 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
     SDLoc DL(Op);
 
-    SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
+    SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, {MVT::f32, MVT::Other},
+                                    Src);
     SDValue FPRoundFlag =
         DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
     SDValue FPRound =
@@ -3311,13 +3314,23 @@ SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
 
   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
     SDLoc DL(Op);
-    SDValue Src = Op.getOperand(0);
+    SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
 
-    SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
     SDValue FPRoundFlag =
-        DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
+      DAG.getIntPtrConstant(0, DL, /*isTarget=*/true);
+
+    if (IsStrict) {
+      SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, DAG.getVTList(MVT::f32, MVT::Other),
+                                      Op.getOperand(0), Src);
+      SDValue FPRound =
+        DAG.getNode(ISD::STRICT_FP_ROUND, DL, Op->getVTList(), IntToFp32.getValue(1),
+                    IntToFp32, FPRoundFlag);
+      return FPRound;
+    }
+
+    SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
     SDValue FPRound =
-        DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
+      DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
 
     return FPRound;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll b/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
index a4138f8715851..fb4080db87b03 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
@@ -141,19 +141,19 @@ define <2 x float> @s_constrained_sitofp_v2i32_to_v2f32_fpexcept_strict(<2 x i32
   ret <2 x float> %result
 }
 
-define half @v_constrained_sitofp_i64_to_f16_fpexcept_strict(i64 %arg) #0 {
-  %result = call half @llvm.experimental.constrained.sitofp.f16.i64(i64 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
-  ret half %result
-}
-
-define <2 x half> @v_constrained_sitofp_v2i64_to_v2f16_fpexcept_strict(<2 x i64> %arg) #0 {
-  %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i64(<2 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
-  ret <2 x half> %result
-}
-
-define <3 x half> @v_constrained_sitofp_v3i64_to_v3f16_fpexcept_strict(<3 x i64> %arg) #0 {
-  %result = call <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i64(<3 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
-  ret <3 x half> %result
+; define half @v_constrained_sitofp_i64_to_f16_fpexcept_strict(i64 %arg) #0 {
+;   %result = call half @llvm.experimental.constrained.sitofp.f16.i64(i64 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+;   ret half %result
+; }
+
+; define <2 x half> @v_constrained_sitofp_v2i64_to_v2f16_fpexcept_strict(<2 x i64> %arg) #0 {
+;   %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i64(<2 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+;   ret <2 x half> %result
+; }
+
+; define <3 x half> @v_constrained_sitofp_v3i64_to_v3f16_fpexcept_strict(<3 x i64> %arg) #0 {
+;   %result = call <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i64(<3 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+;   ret <3 x half> %result
 }
 
 define float @v_constrained_sitofp_i64_to_f32_fpexcept_strict(i64 %arg) #0 {

>From 877493289d1c213497f74848155564614084ed1d Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 17:55:07 +0900
Subject: [PATCH 15/18] Bring back test

---
 llvm/test/CodeGen/AMDGPU/strict_sitofp.ll | 26 +++++++++++------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll b/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
index fb4080db87b03..a4138f8715851 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
@@ -141,19 +141,19 @@ define <2 x float> @s_constrained_sitofp_v2i32_to_v2f32_fpexcept_strict(<2 x i32
   ret <2 x float> %result
 }
 
-; define half @v_constrained_sitofp_i64_to_f16_fpexcept_strict(i64 %arg) #0 {
-;   %result = call half @llvm.experimental.constrained.sitofp.f16.i64(i64 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
-;   ret half %result
-; }
-
-; define <2 x half> @v_constrained_sitofp_v2i64_to_v2f16_fpexcept_strict(<2 x i64> %arg) #0 {
-;   %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i64(<2 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
-;   ret <2 x half> %result
-; }
-
-; define <3 x half> @v_constrained_sitofp_v3i64_to_v3f16_fpexcept_strict(<3 x i64> %arg) #0 {
-;   %result = call <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i64(<3 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
-;   ret <3 x half> %result
+define half @v_constrained_sitofp_i64_to_f16_fpexcept_strict(i64 %arg) #0 {
+  %result = call half @llvm.experimental.constrained.sitofp.f16.i64(i64 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret half %result
+}
+
+define <2 x half> @v_constrained_sitofp_v2i64_to_v2f16_fpexcept_strict(<2 x i64> %arg) #0 {
+  %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i64(<2 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <2 x half> %result
+}
+
+define <3 x half> @v_constrained_sitofp_v3i64_to_v3f16_fpexcept_strict(<3 x i64> %arg) #0 {
+  %result = call <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i64(<3 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <3 x half> %result
 }
 
 define float @v_constrained_sitofp_i64_to_f32_fpexcept_strict(i64 %arg) #0 {

>From af5634944b121b63ce21ee9be982b0facf130a0a Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 17:58:50 +0900
Subject: [PATCH 16/18] So much duplication

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 096d418f40b4b..7d33421e11fbb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3234,11 +3234,27 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
                                                bool Signed) const {
   SDLoc SL(Op);
-  SDValue Src = Op.getOperand(0);
+  bool IsStrict = Op->getNumValues() == 2;
+  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
 
   SDValue Lo, Hi;
   std::tie(Lo, Hi) = split64BitValue(Src, DAG);
 
+  if (IsStrict) {
+    SDVTList VTs = Op->getVTList();
+    SDValue CvtHi = DAG.getNode(Signed ? ISD::STRICT_SINT_TO_FP : ISD::STRICT_UINT_TO_FP,
+                                SL, VTs, Op.getOperand(0), Hi);
+
+    SDValue CvtLo = DAG.getNode(ISD::STRICT_UINT_TO_FP, SL, VTs, CvtHi.getValue(1), Lo);
+
+    SDValue LdExp = DAG.getNode(ISD::STRICT_FLDEXP, SL, VTs,
+                                CvtLo.getValue(1),
+                                CvtHi,
+                                DAG.getConstant(32, SL, MVT::i32));
+    // TODO: Should this propagate fast-math-flags?
+    return DAG.getNode(ISD::STRICT_FADD, SL, VTs, LdExp.getValue(1), LdExp, CvtLo);
+  }
+
   SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
                               SL, MVT::f64, Hi);
 

>From 12533529b376bd26b9024f211d991a2b2dd94cca Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 18:08:11 +0900
Subject: [PATCH 17/18] StrictXINT_TO_FP

---
 .../SelectionDAG/LegalizeFloatTypes.cpp        | 18 ++++++++++++++++++
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h  |  1 +
 2 files changed, 19 insertions(+)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 12956cf874751..c7d00cd74c35e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2444,6 +2444,7 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
 
     case ISD::SINT_TO_FP:
     case ISD::UINT_TO_FP: R = PromoteFloatRes_XINT_TO_FP(N); break;
+    case ISD::STRICT_SINT_TO_FP: R = PromoteFloatRes_STRICT_XINT_TO_FP(N); break;
     case ISD::UNDEF:      R = PromoteFloatRes_UNDEF(N); break;
     case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break;
     case ISD::VECREDUCE_FADD:
@@ -2719,6 +2720,23 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_XINT_TO_FP(SDNode *N) {
                   DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)));
 }
 
+// Construct a SDNode that transforms the SINT or UINT operand to the promoted
+// float type.
+SDValue DAGTypeLegalizer::PromoteFloatRes_STRICT_XINT_TO_FP(SDNode *N) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  SDVTList NVTs = DAG.getVTList(NVT, MVT::Other);
+
+  SDValue NV = DAG.getNode(N->getOpcode(), DL, NVTs, N->getOperand(0), N->getOperand(1));
+
+  // Round the value to the desired precision (that of the source type).
+  SDValue Rounded = DAG.getNode(ISD::STRICT_FP_ROUND, DL, N->getVTList(), NV.getValue(1), NV,
+                                DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
+  return DAG.getNode(
+    ISD::STRICT_FP_EXTEND, DL, NVTs, Rounded.getValue(1), Rounded.getValue(0));
+}
+
 SDValue DAGTypeLegalizer::PromoteFloatRes_UNDEF(SDNode *N) {
   return DAG.getUNDEF(TLI.getTypeToTransformTo(*DAG.getContext(),
                                                N->getValueType(0)));
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 9361e7fff2190..26c92c9e927bb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -707,6 +707,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue PromoteFloatRes_UNDEF(SDNode *N);
   SDValue BitcastToInt_ATOMIC_SWAP(SDNode *N);
   SDValue PromoteFloatRes_XINT_TO_FP(SDNode *N);
+  SDValue PromoteFloatRes_STRICT_XINT_TO_FP(SDNode *N);
   SDValue PromoteFloatRes_VECREDUCE(SDNode *N);
   SDValue PromoteFloatRes_VECREDUCE_SEQ(SDNode *N);
 

>From a04b09e3aa2bffa5b20012b64953ef884d68437e Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 6 Dec 2023 11:21:27 +0700
Subject: [PATCH 18/18] Address comments

---
 .../SelectionDAG/LegalizeFloatTypes.cpp        | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index c7d00cd74c35e..3b8fadf41be66 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2187,14 +2187,18 @@ static ISD::NodeType GetPromotionOpcode(EVT OpVT, EVT RetVT) {
 }
 
 static ISD::NodeType GetPromotionOpcodeStrict(EVT OpVT, EVT RetVT) {
-  if (OpVT == MVT::f16) {
+  if (OpVT == MVT::f16)
     return ISD::STRICT_FP16_TO_FP;
-  } else if (RetVT == MVT::f16) {
+
+  if (RetVT == MVT::f16)
     return ISD::STRICT_FP_TO_FP16;
-  } else if (OpVT == MVT::bf16) {
-    // return ISD::STRICT_BF16_TO_FP;
-  } else if (RetVT == MVT::bf16) {
-    // return ISD::STRICT_FP_TO_BF16;
+
+  if (OpVT == MVT::bf16) {
+    // TODO: return ISD::STRICT_BF16_TO_FP;
+  }
+
+  if (RetVT == MVT::bf16) {
+    // TODO: return ISD::STRICT_FP_TO_BF16;
   }
 
   report_fatal_error("Attempt at an invalid promotion-related conversion");
@@ -2300,7 +2304,7 @@ SDValue DAGTypeLegalizer::PromoteFloatOp_FP_EXTEND(SDNode *N, unsigned OpNo) {
 
 SDValue DAGTypeLegalizer::PromoteFloatOp_STRICT_FP_EXTEND(SDNode *N,
                                                           unsigned OpNo) {
-  assert(OpNo == 1);
+  assert(OpNo == 1 && "Promoting unpromotable operand");
 
   SDValue Op = GetPromotedFloat(N->getOperand(1));
   EVT VT = N->getValueType(0);



More information about the llvm-commits mailing list