[llvm] DAG: Implement promotion for strict_fp_round (PR #74332)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 5 20:36:50 PST 2023
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/74332
>From 405b8705782e50fd72bfd868f51df3111fe9f095 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 18:07:50 +0900
Subject: [PATCH 01/15] DAG: Implement promotion for strict_fpextend
The test is a placeholder; it will be merged into the existing
test once additional bugs for illegal f16 targets are fixed.
---
.../SelectionDAG/LegalizeFloatTypes.cpp | 23 ++++++++++
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 +
llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll | 43 +++++++++++++++++++
3 files changed, 67 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 630aa4a07d7b9..f77b3afccfb8f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2214,6 +2214,9 @@ bool DAGTypeLegalizer::PromoteFloatOperand(SDNode *N, unsigned OpNo) {
case ISD::FP_TO_UINT_SAT:
R = PromoteFloatOp_FP_TO_XINT_SAT(N, OpNo); break;
case ISD::FP_EXTEND: R = PromoteFloatOp_FP_EXTEND(N, OpNo); break;
+ case ISD::STRICT_FP_EXTEND:
+ R = PromoteFloatOp_STRICT_FP_EXTEND(N, OpNo);
+ break;
case ISD::SELECT_CC: R = PromoteFloatOp_SELECT_CC(N, OpNo); break;
case ISD::SETCC: R = PromoteFloatOp_SETCC(N, OpNo); break;
case ISD::STORE: R = PromoteFloatOp_STORE(N, OpNo); break;
@@ -2276,6 +2279,26 @@ SDValue DAGTypeLegalizer::PromoteFloatOp_FP_EXTEND(SDNode *N, unsigned OpNo) {
return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, Op);
}
+SDValue DAGTypeLegalizer::PromoteFloatOp_STRICT_FP_EXTEND(SDNode *N,
+ unsigned OpNo) {
+ assert(OpNo == 1);
+
+ SDValue Op = GetPromotedFloat(N->getOperand(1));
+ EVT VT = N->getValueType(0);
+
+ // Desired VT is same as promoted type. Use promoted float directly.
+ if (VT == Op->getValueType(0)) {
+ ReplaceValueWith(SDValue(N, 1), N->getOperand(0));
+ return Op;
+ }
+
+ // Else, extend the promoted float value to the desired VT.
+ SDValue Res = DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(N), N->getVTList(),
+ N->getOperand(0), Op);
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
// Promote the float operands used for comparison. The true- and false-
// operands have the same type as the result and are promoted, if needed, by
// PromoteFloatRes_SELECT_CC
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index e9bd54089d062..4c7ddd4aea9e6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -712,6 +712,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue PromoteFloatOp_BITCAST(SDNode *N, unsigned OpNo);
SDValue PromoteFloatOp_FCOPYSIGN(SDNode *N, unsigned OpNo);
SDValue PromoteFloatOp_FP_EXTEND(SDNode *N, unsigned OpNo);
+ SDValue PromoteFloatOp_STRICT_FP_EXTEND(SDNode *N, unsigned OpNo);
SDValue PromoteFloatOp_UnaryOp(SDNode *N, unsigned OpNo);
SDValue PromoteFloatOp_FP_TO_XINT_SAT(SDNode *N, unsigned OpNo);
SDValue PromoteFloatOp_STORE(SDNode *N, unsigned OpNo);
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll b/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll
new file mode 100644
index 0000000000000..0339fca4d56cf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GFX7 %s
+
+declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) #0
+declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) #0
+
+define float @v_constrained_fpext_f16_to_f32(ptr addrspace(1) %ptr) #0 {
+; GFX7-LABEL: v_constrained_fpext_f16_to_f32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %val = load half, ptr addrspace(1) %ptr
+ %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %val, metadata !"fpexcept.strict")
+ ret float %result
+}
+
+define <2 x float> @v_constrained_fpext_v2f16_to_v2f32(ptr addrspace(1) %ptr) #0 {
+; GFX7-LABEL: v_constrained_fpext_v2f16_to_v2f32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %val = load <2 x half>, ptr addrspace(1) %ptr
+ %result = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %val, metadata !"fpexcept.strict")
+ ret <2 x float> %result
+}
+
+attributes #0 = { strictfp }
>From f381fe1f2b6586e4e2bb23eeac5b16bd5622e385 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 4 Dec 2023 17:20:26 +0700
Subject: [PATCH 02/15] DAG: Implement promotion for strict_fp_round
This needs an AMDGPU hack to get instruction selection to work. The ordinary
(non-strict) variant is custom lowered through an almost equivalent target
node, which would need a strict counterpart to keep the additional
known-bits optimizations.
---
.../include/llvm/Target/TargetSelectionDAG.td | 13 ++++
.../SelectionDAG/LegalizeFloatTypes.cpp | 40 +++++++++++
.../SelectionDAG/LegalizeIntegerTypes.cpp | 15 ++++-
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 2 +
llvm/lib/Target/AMDGPU/SIInstructions.td | 9 ++-
llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll | 67 +++++++++++++++++++
6 files changed, 144 insertions(+), 2 deletions(-)
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 798e6a1d9525e..de7bf26868b34 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -614,6 +614,12 @@ def strict_sint_to_fp : SDNode<"ISD::STRICT_SINT_TO_FP",
SDTIntToFPOp, [SDNPHasChain]>;
def strict_uint_to_fp : SDNode<"ISD::STRICT_UINT_TO_FP",
SDTIntToFPOp, [SDNPHasChain]>;
+
+def strict_f16_to_fp : SDNode<"ISD::STRICT_FP16_TO_FP",
+ SDTIntToFPOp, [SDNPHasChain]>;
+def strict_fp_to_f16 : SDNode<"ISD::STRICT_FP_TO_FP16",
+ SDTFPToIntOp, [SDNPHasChain]>;
+
def strict_fsetcc : SDNode<"ISD::STRICT_FSETCC", SDTSetCC, [SDNPHasChain]>;
def strict_fsetccs : SDNode<"ISD::STRICT_FSETCCS", SDTSetCC, [SDNPHasChain]>;
@@ -1558,6 +1564,13 @@ def any_fsetccs : PatFrags<(ops node:$lhs, node:$rhs, node:$pred),
[(strict_fsetccs node:$lhs, node:$rhs, node:$pred),
(setcc node:$lhs, node:$rhs, node:$pred)]>;
+def any_f16_to_fp : PatFrags<(ops node:$src),
+ [(f16_to_fp node:$src),
+ (strict_f16_to_fp node:$src)]>;
+def any_fp_to_f16 : PatFrags<(ops node:$src),
+ [(fp_to_f16 node:$src),
+ (strict_fp_to_f16 node:$src)]>;
+
multiclass binary_atomic_op_ord {
def NAME#_monotonic : PatFrag<(ops node:$ptr, node:$val),
(!cast<SDPatternOperator>(NAME) node:$ptr, node:$val)> {
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index f77b3afccfb8f..d7a688511b726 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2181,6 +2181,20 @@ static ISD::NodeType GetPromotionOpcode(EVT OpVT, EVT RetVT) {
report_fatal_error("Attempt at an invalid promotion-related conversion");
}
+static ISD::NodeType GetPromotionOpcodeStrict(EVT OpVT, EVT RetVT) {
+ if (OpVT == MVT::f16) {
+ return ISD::STRICT_FP16_TO_FP;
+ } else if (RetVT == MVT::f16) {
+ return ISD::STRICT_FP_TO_FP16;
+ } else if (OpVT == MVT::bf16) {
+ // return ISD::STRICT_BF16_TO_FP;
+ } else if (RetVT == MVT::bf16) {
+ // return ISD::STRICT_FP_TO_BF16;
+ }
+
+ report_fatal_error("Attempt at an invalid promotion-related conversion");
+}
+
bool DAGTypeLegalizer::PromoteFloatOperand(SDNode *N, unsigned OpNo) {
LLVM_DEBUG(dbgs() << "Promote float operand " << OpNo << ": "; N->dump(&DAG));
SDValue R = SDValue();
@@ -2416,6 +2430,9 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
case ISD::FFREXP: R = PromoteFloatRes_FFREXP(N); break;
case ISD::FP_ROUND: R = PromoteFloatRes_FP_ROUND(N); break;
+ case ISD::STRICT_FP_ROUND:
+ R = PromoteFloatRes_STRICT_FP_ROUND(N);
+ break;
case ISD::LOAD: R = PromoteFloatRes_LOAD(N); break;
case ISD::SELECT: R = PromoteFloatRes_SELECT(N); break;
case ISD::SELECT_CC: R = PromoteFloatRes_SELECT_CC(N); break;
@@ -2621,6 +2638,29 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_FP_ROUND(SDNode *N) {
return DAG.getNode(GetPromotionOpcode(VT, NVT), DL, NVT, Round);
}
+// Explicit operation to reduce precision. Reduce the value to half precision
+// and promote it back to the legal type.
+SDValue DAGTypeLegalizer::PromoteFloatRes_STRICT_FP_ROUND(SDNode *N) {
+ SDLoc DL(N);
+
+ SDValue Chain = N->getOperand(0);
+ SDValue Op = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ EVT OpVT = Op->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+
+ // Round promoted float to desired precision
+ SDValue Round = DAG.getNode(GetPromotionOpcodeStrict(OpVT, VT), DL,
+ DAG.getVTList(IVT, MVT::Other), Chain, Op);
+ // Promote it back to the legal output type
+ SDValue Res =
+ DAG.getNode(GetPromotionOpcodeStrict(VT, NVT), DL,
+ DAG.getVTList(NVT, MVT::Other), Round.getValue(1), Round);
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
SDValue DAGTypeLegalizer::PromoteFloatRes_LOAD(SDNode *N) {
LoadSDNode *L = cast<LoadSDNode>(N);
EVT VT = N->getValueType(0);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 54698edce7d6f..887670fb6baff 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -165,7 +165,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::FP_TO_FP16:
Res = PromoteIntRes_FP_TO_FP16_BF16(N);
break;
-
+ case ISD::STRICT_FP_TO_FP16:
+ Res = PromoteIntRes_STRICT_FP_TO_FP16_BF16(N);
+ break;
case ISD::GET_ROUNDING: Res = PromoteIntRes_GET_ROUNDING(N); break;
case ISD::AND:
@@ -787,6 +789,16 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_FP16_BF16(SDNode *N) {
return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
}
+SDValue DAGTypeLegalizer::PromoteIntRes_STRICT_FP_TO_FP16_BF16(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDLoc dl(N);
+
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, DAG.getVTList(NVT, MVT::Other),
+ N->getOperand(0), N->getOperand(1));
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
SDValue DAGTypeLegalizer::PromoteIntRes_XRINT(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDLoc dl(N);
@@ -1804,6 +1816,7 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::FP16_TO_FP:
case ISD::VP_UINT_TO_FP:
case ISD::UINT_TO_FP: Res = PromoteIntOp_UINT_TO_FP(N); break;
+ case ISD::STRICT_FP16_TO_FP:
case ISD::STRICT_UINT_TO_FP: Res = PromoteIntOp_STRICT_UINT_TO_FP(N); break;
case ISD::ZERO_EXTEND: Res = PromoteIntOp_ZERO_EXTEND(N); break;
case ISD::VP_ZERO_EXTEND: Res = PromoteIntOp_VP_ZERO_EXTEND(N); break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 4c7ddd4aea9e6..9361e7fff2190 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -326,6 +326,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue PromoteIntRes_FP_TO_XINT(SDNode *N);
SDValue PromoteIntRes_FP_TO_XINT_SAT(SDNode *N);
SDValue PromoteIntRes_FP_TO_FP16_BF16(SDNode *N);
+ SDValue PromoteIntRes_STRICT_FP_TO_FP16_BF16(SDNode *N);
SDValue PromoteIntRes_XRINT(SDNode *N);
SDValue PromoteIntRes_FREEZE(SDNode *N);
SDValue PromoteIntRes_INT_EXTEND(SDNode *N);
@@ -698,6 +699,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue PromoteFloatRes_ExpOp(SDNode *N);
SDValue PromoteFloatRes_FFREXP(SDNode *N);
SDValue PromoteFloatRes_FP_ROUND(SDNode *N);
+ SDValue PromoteFloatRes_STRICT_FP_ROUND(SDNode *N);
SDValue PromoteFloatRes_LOAD(SDNode *N);
SDValue PromoteFloatRes_SELECT(SDNode *N);
SDValue PromoteFloatRes_SELECT_CC(SDNode *N);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9362fe5d9678b..d23d6b4f93da8 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1097,7 +1097,7 @@ def : Pat <
multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16_inst_e64> {
// f16_to_fp patterns
def : GCNPat <
- (f32 (f16_to_fp i32:$src0)),
+ (f32 (any_f16_to_fp i32:$src0)),
(cvt_f32_f16_inst_e64 SRCMODS.NONE, $src0)
>;
@@ -1151,6 +1151,13 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16
(f16 (uint_to_fp i32:$src)),
(cvt_f16_f32_inst_e64 SRCMODS.NONE, (V_CVT_F32_U32_e32 VSrc_b32:$src))
>;
+
+ // This is only used on targets without half support
+ // TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering
+ def : GCNPat <
+ (i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
+ >;
}
let SubtargetPredicate = NotHasTrue16BitInsts in
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll b/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll
index 0339fca4d56cf..8a3647a9b6e93 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll
@@ -3,6 +3,9 @@
declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) #0
declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) #0
+declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) #0
+declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata) #0
+declare float @llvm.fabs.f32(float)
define float @v_constrained_fpext_f16_to_f32(ptr addrspace(1) %ptr) #0 {
; GFX7-LABEL: v_constrained_fpext_f16_to_f32:
@@ -40,4 +43,68 @@ define <2 x float> @v_constrained_fpext_v2f16_to_v2f32(ptr addrspace(1) %ptr) #0
ret <2 x float> %result
}
+define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict(float %arg, ptr addrspace(1) %ptr) #0 {
+; GFX7-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+ ret void
+}
+
+define void @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict(<2 x float> %arg, ptr addrspace(1) %ptr) #0 {
+; GFX7-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %result = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+ store <2 x half> %result, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_fneg(float %arg, ptr addrspace(1) %ptr) #0 {
+; GFX7-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_fneg:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e64 v0, -v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %neg.arg = fneg float %arg
+ %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %neg.arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+ ret void
+}
+
+define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_fabs(float %arg, ptr addrspace(1) %ptr) #0 {
+; GFX7-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_fabs:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e64 v0, |v0|
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %abs.arg = call float @llvm.fabs.f32(float %arg)
+ %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %abs.arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+ ret void
+}
+
attributes #0 = { strictfp }
>From 029aba53be9fbd91d5bc79c5e59098efb40a325c Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 20:49:51 +0700
Subject: [PATCH 03/15] DAG: Fix ABI lowering with FP promote in strictfp
functions
---
.../SelectionDAG/SelectionDAGBuilder.cpp | 59 ++++++++++++-------
llvm/test/CodeGen/AMDGPU/strict_fpext.ll | 2 +-
2 files changed, 38 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index da7d9ace4114a..29684d3372bdb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -153,6 +153,7 @@ static const unsigned MaxParallelChains = 64;
static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
const SDValue *Parts, unsigned NumParts,
MVT PartVT, EVT ValueVT, const Value *V,
+ SDValue InChain,
std::optional<CallingConv::ID> CC);
/// getCopyFromParts - Create a value that contains the specified legal parts
@@ -163,6 +164,7 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
static SDValue
getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts,
unsigned NumParts, MVT PartVT, EVT ValueVT, const Value *V,
+ SDValue InChain,
std::optional<CallingConv::ID> CC = std::nullopt,
std::optional<ISD::NodeType> AssertOp = std::nullopt) {
// Let the target assemble the parts if it wants to
@@ -173,7 +175,7 @@ getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts,
if (ValueVT.isVector())
return getCopyFromPartsVector(DAG, DL, Parts, NumParts, PartVT, ValueVT, V,
- CC);
+ InChain, CC);
assert(NumParts > 0 && "No parts to assemble!");
SDValue Val = Parts[0];
@@ -194,10 +196,10 @@ getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts,
EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), RoundBits/2);
if (RoundParts > 2) {
- Lo = getCopyFromParts(DAG, DL, Parts, RoundParts / 2,
- PartVT, HalfVT, V);
- Hi = getCopyFromParts(DAG, DL, Parts + RoundParts / 2,
- RoundParts / 2, PartVT, HalfVT, V);
+ Lo = getCopyFromParts(DAG, DL, Parts, RoundParts / 2, PartVT, HalfVT, V,
+ InChain);
+ Hi = getCopyFromParts(DAG, DL, Parts + RoundParts / 2, RoundParts / 2,
+ PartVT, HalfVT, V, InChain);
} else {
Lo = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[0]);
Hi = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[1]);
@@ -213,7 +215,7 @@ getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts,
unsigned OddParts = NumParts - RoundParts;
EVT OddVT = EVT::getIntegerVT(*DAG.getContext(), OddParts * PartBits);
Hi = getCopyFromParts(DAG, DL, Parts + RoundParts, OddParts, PartVT,
- OddVT, V, CC);
+ OddVT, V, InChain, CC);
// Combine the round and odd parts.
Lo = Val;
@@ -243,7 +245,8 @@ getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts,
assert(ValueVT.isFloatingPoint() && PartVT.isInteger() &&
!PartVT.isVector() && "Unexpected split");
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
- Val = getCopyFromParts(DAG, DL, Parts, NumParts, PartVT, IntVT, V, CC);
+ Val = getCopyFromParts(DAG, DL, Parts, NumParts, PartVT, IntVT, V,
+ InChain, CC);
}
}
@@ -283,10 +286,20 @@ getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts,
if (PartEVT.isFloatingPoint() && ValueVT.isFloatingPoint()) {
// FP_ROUND's are always exact here.
- if (ValueVT.bitsLT(Val.getValueType()))
- return DAG.getNode(
- ISD::FP_ROUND, DL, ValueVT, Val,
- DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout())));
+ if (ValueVT.bitsLT(Val.getValueType())) {
+
+ SDValue NoChange =
+ DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout()));
+
+ if (DAG.getMachineFunction().getFunction().getAttributes().hasFnAttr(
+ llvm::Attribute::StrictFP)) {
+ return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
+ DAG.getVTList(ValueVT, MVT::Other), InChain, Val,
+ NoChange);
+ }
+
+ return DAG.getNode(ISD::FP_ROUND, DL, ValueVT, Val, NoChange);
+ }
return DAG.getNode(ISD::FP_EXTEND, DL, ValueVT, Val);
}
@@ -324,6 +337,7 @@ static void diagnosePossiblyInvalidConstraint(LLVMContext &Ctx, const Value *V,
static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
const SDValue *Parts, unsigned NumParts,
MVT PartVT, EVT ValueVT, const Value *V,
+ SDValue InChain,
std::optional<CallingConv::ID> CallConv) {
assert(ValueVT.isVector() && "Not a vector value");
assert(NumParts > 0 && "No parts to assemble!");
@@ -362,8 +376,8 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
// If the register was not expanded, truncate or copy the value,
// as appropriate.
for (unsigned i = 0; i != NumParts; ++i)
- Ops[i] = getCopyFromParts(DAG, DL, &Parts[i], 1,
- PartVT, IntermediateVT, V, CallConv);
+ Ops[i] = getCopyFromParts(DAG, DL, &Parts[i], 1, PartVT, IntermediateVT,
+ V, InChain, CallConv);
} else if (NumParts > 0) {
// If the intermediate type was expanded, build the intermediate
// operands from the parts.
@@ -371,8 +385,8 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
"Must expand into a divisible number of parts!");
unsigned Factor = NumParts / NumIntermediates;
for (unsigned i = 0; i != NumIntermediates; ++i)
- Ops[i] = getCopyFromParts(DAG, DL, &Parts[i * Factor], Factor,
- PartVT, IntermediateVT, V, CallConv);
+ Ops[i] = getCopyFromParts(DAG, DL, &Parts[i * Factor], Factor, PartVT,
+ IntermediateVT, V, InChain, CallConv);
}
// Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the
@@ -926,7 +940,7 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
}
Values[Value] = getCopyFromParts(DAG, dl, Parts.begin(), NumRegs,
- RegisterVT, ValueVT, V, CallConv);
+ RegisterVT, ValueVT, V, Chain, CallConv);
Part += NumRegs;
Parts.clear();
}
@@ -10641,9 +10655,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
unsigned NumRegs = getNumRegistersForCallingConv(CLI.RetTy->getContext(),
CLI.CallConv, VT);
- ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg],
- NumRegs, RegisterVT, VT, nullptr,
- CLI.CallConv, AssertOp));
+ ReturnValues.push_back(getCopyFromParts(
+ CLI.DAG, CLI.DL, &InVals[CurReg], NumRegs, RegisterVT, VT, nullptr,
+ CLI.Chain, CLI.CallConv, AssertOp));
CurReg += NumRegs;
}
@@ -11122,8 +11136,9 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
MVT VT = ValueVTs[0].getSimpleVT();
MVT RegVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
std::optional<ISD::NodeType> AssertOp;
- SDValue ArgValue = getCopyFromParts(DAG, dl, &InVals[0], 1, RegVT, VT,
- nullptr, F.getCallingConv(), AssertOp);
+ SDValue ArgValue =
+ getCopyFromParts(DAG, dl, &InVals[0], 1, RegVT, VT, nullptr, NewRoot,
+ F.getCallingConv(), AssertOp);
MachineFunction& MF = SDB->DAG.getMachineFunction();
MachineRegisterInfo& RegInfo = MF.getRegInfo();
@@ -11195,7 +11210,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
AssertOp = ISD::AssertZext;
ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], NumParts,
- PartVT, VT, nullptr,
+ PartVT, VT, nullptr, NewRoot,
F.getCallingConv(), AssertOp));
}
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll
index 22bebb7ad26f5..950c2effcbeb7 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; FIXME: Missing operand promote for f16
-; XUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX8 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX9 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s
>From ce9dded857d3433575044b26fd458a95a7a5e275 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 4 Dec 2023 18:16:20 +0700
Subject: [PATCH 04/15] AMDGPU: Improve handling of strictfp fp_to_fp16
---
llvm/lib/Target/AMDGPU/SIInstructions.td | 8 ++++----
llvm/lib/Target/AMDGPU/VOP1Instructions.td | 4 ++--
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index d23d6b4f93da8..4e01f17349c2c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1102,22 +1102,22 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16
>;
def : GCNPat <
- (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
+ (f32 (any_f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
(cvt_f32_f16_inst_e64 SRCMODS.ABS, $src0)
>;
def : GCNPat <
- (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))),
+ (f32 (any_f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))),
(cvt_f32_f16_inst_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)))
>;
def : GCNPat <
- (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
+ (f32 (any_f16_to_fp (or_oneuse i32:$src0, 0x8000))),
(cvt_f32_f16_inst_e64 SRCMODS.NEG_ABS, $src0)
>;
def : GCNPat <
- (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
+ (f32 (any_f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
(cvt_f32_f16_inst_e64 SRCMODS.NEG, $src0)
>;
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 53b0513c85d88..f398d91b59091 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -509,7 +509,7 @@ defm V_FRACT_F16 : VOP1Inst_t16 <"v_fract_f16", VOP_F16_F16, AMDGPUfract>;
let OtherPredicates = [Has16BitInsts, NotHasTrue16BitInsts] in {
def : GCNPat<
- (f32 (f16_to_fp i16:$src)),
+ (f32 (any_f16_to_fp i16:$src)),
(V_CVT_F32_F16_e32 $src)
>;
def : GCNPat<
@@ -519,7 +519,7 @@ def : GCNPat<
}
let OtherPredicates = [HasTrue16BitInsts] in {
def : GCNPat<
- (f32 (f16_to_fp i16:$src)),
+ (f32 (any_f16_to_fp i16:$src)),
(V_CVT_F32_F16_t16_e32 $src)
>;
def : GCNPat<
>From 74acdfcf4355aff3f39cebca06f01da1a68e4802 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sun, 25 Jun 2023 19:28:18 -0400
Subject: [PATCH 05/15] XXX - AMDGPU: Handle strict sitofp
---
llvm/lib/Target/AMDGPU/VOP1Instructions.td | 8 +-
llvm/test/CodeGen/AMDGPU/strict_sitofp.ll | 258 +++++++++++++++++++++
2 files changed, 262 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index f398d91b59091..72d8f97991729 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -262,7 +262,7 @@ let SchedRW = [WriteDoubleCvt] in {
defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64_SPECIAL_OMOD, fp_to_sint>;
let mayRaiseFPException = 0 in {
-defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>;
+defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, any_sint_to_fp>;
}
defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>;
@@ -281,7 +281,7 @@ let SchedRW = [WriteFloatCvt] in {
// XXX: Does this really not raise exceptions? The manual claims the
// 16-bit ones can.
let mayRaiseFPException = 0 in {
-defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>;
+defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, any_sint_to_fp>;
defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>;
}
@@ -467,11 +467,11 @@ let SubtargetPredicate = isGFX7Plus in {
let FPDPRounding = 1 in {
let OtherPredicates = [Has16BitInsts, NotHasTrue16BitInsts] in {
defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>;
-defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
+defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, any_sint_to_fp>;
}
let OtherPredicates = [HasTrue16BitInsts] in {
defm V_CVT_F16_U16_t16 : VOP1Inst <"v_cvt_f16_u16_t16", VOP1_F16_I16_t16, uint_to_fp>;
-defm V_CVT_F16_I16_t16 : VOP1Inst <"v_cvt_f16_i16_t16", VOP1_F16_I16_t16, sint_to_fp>;
+defm V_CVT_F16_I16_t16 : VOP1Inst <"v_cvt_f16_i16_t16", VOP1_F16_I16_t16, any_sint_to_fp>;
}
} // End FPDPRounding = 1
// OMod clears exceptions when set in these two instructions
diff --git a/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll b/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
new file mode 100644
index 0000000000000..bb79071c48021
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
@@ -0,0 +1,258 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; FIXME: Missing operand promote for f16
+; XUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX8 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX9 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11 %s
+
+define half @v_constrained_sitofp_i16_to_f16_fpexcept_strict(i16 %arg) #0 {
+ %result = call half @llvm.experimental.constrained.sitofp.f16.i16(i16 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret half %result
+}
+
+define <2 x half> @v_constrained_sitofp_v2i16_to_v2f16_fpexcept_strict(<2 x i16> %arg) #0 {
+ %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i16(<2 x i16> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <2 x half> %result
+}
+
+define <3 x half> @v_constrained_sitofp_v3i16_to_v3f16_fpexcept_strict(<3 x i16> %arg) #0 {
+ %result = call <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i16(<3 x i16> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <3 x half> %result
+}
+
+define float @v_constrained_sitofp_i16_to_f32_fpexcept_strict(i16 %arg) #0 {
+ %result = call float @llvm.experimental.constrained.sitofp.f32.i16(i16 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret float %result
+}
+
+define <2 x float> @v_constrained_sitofp_v2i16_to_v2f32_fpexcept_strict(<2 x i16> %arg) #0 {
+ %result = call <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i16(<2 x i16> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <2 x float> %result
+}
+
+define <3 x float> @v_constrained_sitofp_v3i16_to_v3f32_fpexcept_strict(<3 x i16> %arg) #0 {
+ %result = call <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i16(<3 x i16> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <3 x float> %result
+}
+
+define double @v_constrained_sitofp_i16_to_f64_fpexcept_strict(i16 %arg) #0 {
+ %result = call double @llvm.experimental.constrained.sitofp.f64.i16(i16 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret double %result
+}
+
+define <2 x double> @v_constrained_sitofp_v2i16_to_v2f64_fpexcept_strict(<2 x i16> %arg) #0 {
+ %result = call <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i16(<2 x i16> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <2 x double> %result
+}
+
+define <3 x double> @v_constrained_sitofp_v3i16_to_v3f64_fpexcept_strict(<3 x i16> %arg) #0 {
+ %result = call <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i16(<3 x i16> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <3 x double> %result
+}
+
+declare half @llvm.experimental.constrained.sitofp.f16.i16(i16, metadata, metadata) #1
+declare <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i16(<2 x i16>, metadata, metadata) #1
+declare <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i16(<3 x i16>, metadata, metadata) #1
+
+declare float @llvm.experimental.constrained.sitofp.f32.i16(i16, metadata, metadata) #1
+declare <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i16(<2 x i16>, metadata, metadata) #1
+declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i16(<3 x i16>, metadata, metadata) #1
+
+declare double @llvm.experimental.constrained.sitofp.f64.i16(i16, metadata, metadata) #1
+declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i16(<2 x i16>, metadata, metadata) #1
+declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i16(<3 x i16>, metadata, metadata) #1
+
+
+
+
+
+
+define half @v_constrained_sitofp_i32_to_f16_fpexcept_strict(i32 %arg) #0 {
+ %result = call half @llvm.experimental.constrained.sitofp.f16.i32(i32 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret half %result
+}
+
+define <2 x half> @v_constrained_sitofp_v2i32_to_v2f16_fpexcept_strict(<2 x i32> %arg) #0 {
+ %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i32(<2 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <2 x half> %result
+}
+
+define <3 x half> @v_constrained_sitofp_v3i32_to_v3f16_fpexcept_strict(<3 x i32> %arg) #0 {
+ %result = call <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i32(<3 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <3 x half> %result
+}
+
+define float @v_constrained_sitofp_i32_to_f32_fpexcept_strict(i32 %arg) #0 {
+ %result = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret float %result
+}
+
+define <2 x float> @v_constrained_sitofp_v2i32_to_v2f32_fpexcept_strict(<2 x i32> %arg) #0 {
+ %result = call <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i32(<2 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <2 x float> %result
+}
+
+define <3 x float> @v_constrained_sitofp_v3i32_to_v3f32_fpexcept_strict(<3 x i32> %arg) #0 {
+ %result = call <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i32(<3 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <3 x float> %result
+}
+
+define double @v_constrained_sitofp_i32_to_f64_fpexcept_strict(i32 %arg) #0 {
+ %result = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret double %result
+}
+
+define <2 x double> @v_constrained_sitofp_v2i32_to_v2f64_fpexcept_strict(<2 x i32> %arg) #0 {
+ %result = call <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i32(<2 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <2 x double> %result
+}
+
+define <3 x double> @v_constrained_sitofp_v3i32_to_v3f64_fpexcept_strict(<3 x i32> %arg) #0 {
+ %result = call <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i32(<3 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <3 x double> %result
+}
+
+declare half @llvm.experimental.constrained.sitofp.f16.i32(i32, metadata, metadata) #1
+declare <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i32(<2 x i32>, metadata, metadata) #1
+declare <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i32(<3 x i32>, metadata, metadata) #1
+
+declare float @llvm.experimental.constrained.sitofp.f32.i32(i32, metadata, metadata) #1
+declare <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i32(<2 x i32>, metadata, metadata) #1
+declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i32(<3 x i32>, metadata, metadata) #1
+
+declare double @llvm.experimental.constrained.sitofp.f64.i32(i32, metadata, metadata) #1
+declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i32(<2 x i32>, metadata, metadata) #1
+declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i32(<3 x i32>, metadata, metadata) #1
+
+
+
+
+
+
+attributes #0 = { strictfp }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
+; GFX8: {{.*}}
+; GFX9: {{.*}}
+
+
+
+
+
+
+
+define half @v_constrained_sitofp_i64_to_f16_fpexcept_strict(i64 %arg) #0 {
+ %result = call half @llvm.experimental.constrained.sitofp.f16.i64(i64 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret half %result
+}
+
+define <2 x half> @v_constrained_sitofp_v2i64_to_v2f16_fpexcept_strict(<2 x i64> %arg) #0 {
+ %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i64(<2 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <2 x half> %result
+}
+
+define <3 x half> @v_constrained_sitofp_v3i64_to_v3f16_fpexcept_strict(<3 x i64> %arg) #0 {
+ %result = call <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i64(<3 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <3 x half> %result
+}
+
+define float @v_constrained_sitofp_i64_to_f32_fpexcept_strict(i64 %arg) #0 {
+ %result = call float @llvm.experimental.constrained.sitofp.f32.i64(i64 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret float %result
+}
+
+define <2 x float> @v_constrained_sitofp_v2i64_to_v2f32_fpexcept_strict(<2 x i64> %arg) #0 {
+ %result = call <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i64(<2 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <2 x float> %result
+}
+
+define <3 x float> @v_constrained_sitofp_v3i64_to_v3f32_fpexcept_strict(<3 x i64> %arg) #0 {
+ %result = call <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i64(<3 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <3 x float> %result
+}
+
+define double @v_constrained_sitofp_i64_to_f64_fpexcept_strict(i64 %arg) #0 {
+ %result = call double @llvm.experimental.constrained.sitofp.f64.i64(i64 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret double %result
+}
+
+define <2 x double> @v_constrained_sitofp_v2i64_to_v2f64_fpexcept_strict(<2 x i64> %arg) #0 {
+ %result = call <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i64(<2 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <2 x double> %result
+}
+
+define <3 x double> @v_constrained_sitofp_v3i64_to_v3f64_fpexcept_strict(<3 x i64> %arg) #0 {
+ %result = call <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i64(<3 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <3 x double> %result
+}
+
+declare half @llvm.experimental.constrained.sitofp.f16.i64(i64, metadata, metadata) #1
+declare <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i64(<2 x i64>, metadata, metadata) #1
+declare <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i64(<3 x i64>, metadata, metadata) #1
+
+declare float @llvm.experimental.constrained.sitofp.f32.i64(i64, metadata, metadata) #1
+declare <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i64(<2 x i64>, metadata, metadata) #1
+declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i64(<3 x i64>, metadata, metadata) #1
+
+declare double @llvm.experimental.constrained.sitofp.f64.i64(i64, metadata, metadata) #1
+declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i64(<2 x i64>, metadata, metadata) #1
+declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i64(<3 x i64>, metadata, metadata) #1
+
+define half @v_constrained_sitofp_i8_to_f16_fpexcept_strict(i8 %arg) #0 {
+ %result = call half @llvm.experimental.constrained.sitofp.f16.i8(i8 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret half %result
+}
+
+define <2 x half> @v_constrained_sitofp_v2i8_to_v2f16_fpexcept_strict(<2 x i8> %arg) #0 {
+ %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i8(<2 x i8> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <2 x half> %result
+}
+
+define <3 x half> @v_constrained_sitofp_v3i8_to_v3f16_fpexcept_strict(<3 x i8> %arg) #0 {
+ %result = call <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i8(<3 x i8> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <3 x half> %result
+}
+
+define float @v_constrained_sitofp_i8_to_f32_fpexcept_strict(i8 %arg) #0 {
+ %result = call float @llvm.experimental.constrained.sitofp.f32.i8(i8 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret float %result
+}
+
+define <2 x float> @v_constrained_sitofp_v2i8_to_v2f32_fpexcept_strict(<2 x i8> %arg) #0 {
+ %result = call <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i8(<2 x i8> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <2 x float> %result
+}
+
+define <3 x float> @v_constrained_sitofp_v3i8_to_v3f32_fpexcept_strict(<3 x i8> %arg) #0 {
+ %result = call <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i8(<3 x i8> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <3 x float> %result
+}
+
+define double @v_constrained_sitofp_i8_to_f64_fpexcept_strict(i8 %arg) #0 {
+ %result = call double @llvm.experimental.constrained.sitofp.f64.i8(i8 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret double %result
+}
+
+define <2 x double> @v_constrained_sitofp_v2i8_to_v2f64_fpexcept_strict(<2 x i8> %arg) #0 {
+ %result = call <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i8(<2 x i8> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <2 x double> %result
+}
+
+define <3 x double> @v_constrained_sitofp_v3i8_to_v3f64_fpexcept_strict(<3 x i8> %arg) #0 {
+ %result = call <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i8(<3 x i8> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <3 x double> %result
+}
+
+declare half @llvm.experimental.constrained.sitofp.f16.i8(i8, metadata, metadata) #1
+declare <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i8(<2 x i8>, metadata, metadata) #1
+declare <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i8(<3 x i8>, metadata, metadata) #1
+
+declare float @llvm.experimental.constrained.sitofp.f32.i8(i8, metadata, metadata) #1
+declare <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i8(<2 x i8>, metadata, metadata) #1
+declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i8(<3 x i8>, metadata, metadata) #1
+
+declare double @llvm.experimental.constrained.sitofp.f64.i8(i8, metadata, metadata) #1
+declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i8(<2 x i8>, metadata, metadata) #1
+declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i8(<3 x i8>, metadata, metadata) #1
>From 4169c4ba53bb2a8cd1ccc43a7768e9b590498b6d Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 14:59:47 +0900
Subject: [PATCH 06/15] XXX - strict sint to fp
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 10 +++++++---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index fcbdf51b03c1f..3b59ac1156054 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1331,7 +1331,9 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
return lowerFEXP(Op, DAG);
case ISD::FEXP2:
return lowerFEXP2(Op, DAG);
- case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
+ case ISD::SINT_TO_FP:
+ case ISD::STRICT_SINT_TO_FP:
+ return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
case ISD::FP_TO_SINT:
@@ -3281,8 +3283,8 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
EVT DestVT = Op.getValueType();
-
- SDValue Src = Op.getOperand(0);
+ bool IsStrict = Op.getOpcode() == ISD::STRICT_SINT_TO_FP;
+ SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
EVT SrcVT = Src.getValueType();
if (SrcVT == MVT::i16) {
@@ -3292,6 +3294,8 @@ SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
SDLoc DL(Op);
// Promote src to i32
SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, Op->getVTList(), Op.getOperand(0), Ext);
return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 53ab5da013539..bdf4b1c6213fa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -539,7 +539,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND},
MVT::f16, Custom);
- setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
+ setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP}, MVT::i16, Custom);
setOperationAction(
{ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP},
>From 58c774e4341cceac21b2a141397f4c6ffeba0678 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 15:30:34 +0900
Subject: [PATCH 07/15] Hacky
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4 ++--
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 5 ++++-
2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 3b59ac1156054..8ea6eb71db218 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -421,7 +421,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
setOperationAction(
- {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
+ {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
MVT::i64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
@@ -442,7 +442,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
ISD::FP_TO_UINT, ISD::MUL, ISD::MULHU,
ISD::MULHS, ISD::OR, ISD::SHL,
ISD::SRA, ISD::SRL, ISD::ROTL,
- ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP,
+ ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP, ISD::STRICT_SINT_TO_FP,
ISD::UINT_TO_FP, ISD::SDIV, ISD::UDIV,
ISD::SREM, ISD::UREM, ISD::SMUL_LOHI,
ISD::UMUL_LOHI, ISD::SDIVREM, ISD::UDIVREM,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index bdf4b1c6213fa..db329e3cbe3a3 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -542,9 +542,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP}, MVT::i16, Custom);
setOperationAction(
- {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP},
+ {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::STRICT_SINT_TO_FP,
+ ISD::UINT_TO_FP},
MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Legal);
+
// F16 - VOP2 Actions.
setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, MVT::f16, Expand);
setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
>From a6e349d915cfb0f285c976c0491700cf84fe4dae Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 15:30:48 +0900
Subject: [PATCH 08/15] worky
---
llvm/lib/Target/AMDGPU/SIInstructions.td | 2 +-
llvm/test/CodeGen/AMDGPU/strict_sitofp.ll | 24 ++++++++++++++++++++---
2 files changed, 22 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 4e01f17349c2c..581c23cede7d4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1143,7 +1143,7 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16
>;
def : GCNPat <
- (f16 (sint_to_fp i32:$src)),
+ (f16 (any_sint_to_fp i32:$src)),
(cvt_f16_f32_inst_e64 SRCMODS.NONE, (V_CVT_F32_I32_e32 VSrc_b32:$src))
>;
diff --git a/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll b/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
index bb79071c48021..b61b38b6183c8 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
@@ -4,7 +4,10 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX8 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX9 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11,GFX1100 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1150 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11,GFX1150 %s
+
+
define half @v_constrained_sitofp_i16_to_f16_fpexcept_strict(i16 %arg) #0 {
%result = call half @llvm.experimental.constrained.sitofp.f16.i16(i16 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
@@ -88,7 +91,7 @@ define float @v_constrained_sitofp_i32_to_f32_fpexcept_strict(i32 %arg) #0 {
ret float %result
}
-define <2 x float> @v_constrained_sitofp_v2i32_to_v2f32_fpexcept_strict(<2 x i32> %arg) #0 {
+define <2 x float> @v_constrained_sitofp_v2i32_to_v2f32_fpexcept_strict(a<2 x i32> %arg) #0 {
%result = call <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i32(<2 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
ret <2 x float> %result
}
@@ -125,10 +128,25 @@ declare double @llvm.experimental.constrained.sitofp.f64.i32(i32, metadata, meta
declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i32(<2 x i32>, metadata, metadata) #1
declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i32(<3 x i32>, metadata, metadata) #1
+define half @s_constrained_sitofp_i32_to_f16_fpexcept_strict(i32 inreg %arg) #0 {
+ %result = call half @llvm.experimental.constrained.sitofp.f16.i32(i32 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret half %result
+}
+define float @s_constrained_sitofp_i32_to_f32_fpexcept_strict(i32 inreg %arg) #0 {
+ %result = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret half %result
+}
+define <2 x half> @s_constrained_sitofp_v2i32_to_v2f16_fpexcept_strict(<2 x i32> inreg %arg) #0 {
+ %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i32(<2 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <2 x half> %result
+}
-
+define <2 x float> @s_constrained_sitofp_v2i32_to_v2f32_fpexcept_strict(<2 x i32> inreg %arg) #0 {
+ %result = call <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i32(<2 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <2 x float> %result
+}
attributes #0 = { strictfp }
attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
>From a023eedd1cffe54b72648ac3fa4b646c6e74c061 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 15:30:58 +0900
Subject: [PATCH 09/15] todo
---
llvm/lib/Target/AMDGPU/SOPInstructions.td | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 9e10efd1b07e1..763978178e6d6 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -413,7 +413,7 @@ class SOP1_F32_Inst<string opName, SDPatternOperator Op, ValueType vt0=f32,
let SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE],
SchedRW = [WriteSFPU], isReMaterializable = 1 in {
- def S_CVT_F32_I32 : SOP1_F32_Inst<"s_cvt_f32_i32", sint_to_fp, f32, i32>;
+ def S_CVT_F32_I32 : SOP1_F32_Inst<"s_cvt_f32_i32", sint_to_fp, f32, i32>; // xxx - any
def S_CVT_F32_U32 : SOP1_F32_Inst<"s_cvt_f32_u32", uint_to_fp, f32, i32>;
let mayRaiseFPException = 1 in {
>From 324c110b502263112cd94b6670ccdd4cf8efd411 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 15:31:39 +0900
Subject: [PATCH 10/15] test
---
llvm/test/CodeGen/AMDGPU/strict_sitofp.ll | 31 +++++++----------------
1 file changed, 9 insertions(+), 22 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll b/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
index b61b38b6183c8..a4138f8715851 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
@@ -7,8 +7,6 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11,GFX1100 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1150 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11,GFX1150 %s
-
-
define half @v_constrained_sitofp_i16_to_f16_fpexcept_strict(i16 %arg) #0 {
%result = call half @llvm.experimental.constrained.sitofp.f16.i16(i16 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
ret half %result
@@ -66,11 +64,6 @@ declare double @llvm.experimental.constrained.sitofp.f64.i16(i16, metadata, meta
declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i16(<2 x i16>, metadata, metadata) #1
declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i16(<3 x i16>, metadata, metadata) #1
-
-
-
-
-
define half @v_constrained_sitofp_i32_to_f16_fpexcept_strict(i32 %arg) #0 {
%result = call half @llvm.experimental.constrained.sitofp.f16.i32(i32 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
ret half %result
@@ -91,7 +84,7 @@ define float @v_constrained_sitofp_i32_to_f32_fpexcept_strict(i32 %arg) #0 {
ret float %result
}
-define <2 x float> @v_constrained_sitofp_v2i32_to_v2f32_fpexcept_strict(a<2 x i32> %arg) #0 {
+define <2 x float> @v_constrained_sitofp_v2i32_to_v2f32_fpexcept_strict(<2 x i32> %arg) #0 {
%result = call <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i32(<2 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
ret <2 x float> %result
}
@@ -135,7 +128,7 @@ define half @s_constrained_sitofp_i32_to_f16_fpexcept_strict(i32 inreg %arg) #0
define float @s_constrained_sitofp_i32_to_f32_fpexcept_strict(i32 inreg %arg) #0 {
%result = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
- ret half %result
+ ret float %result
}
define <2 x half> @s_constrained_sitofp_v2i32_to_v2f16_fpexcept_strict(<2 x i32> inreg %arg) #0 {
@@ -148,19 +141,6 @@ define <2 x float> @s_constrained_sitofp_v2i32_to_v2f32_fpexcept_strict(<2 x i32
ret <2 x float> %result
}
-attributes #0 = { strictfp }
-attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GCN: {{.*}}
-; GFX8: {{.*}}
-; GFX9: {{.*}}
-
-
-
-
-
-
-
define half @v_constrained_sitofp_i64_to_f16_fpexcept_strict(i64 %arg) #0 {
%result = call half @llvm.experimental.constrained.sitofp.f16.i64(i64 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
ret half %result
@@ -274,3 +254,10 @@ declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i8(<3 x i8>, m
declare double @llvm.experimental.constrained.sitofp.f64.i8(i8, metadata, metadata) #1
declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i8(<2 x i8>, metadata, metadata) #1
declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i8(<3 x i8>, metadata, metadata) #1
+
+attributes #0 = { strictfp }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
+; GFX8: {{.*}}
+; GFX9: {{.*}}
>From 2eecafa2318828b5bb8731a4c347e0d7d7494209 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 15:38:14 +0900
Subject: [PATCH 11/15] XXX - Strict support for LowerINT_TO_FP32
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 8ea6eb71db218..c04733e453bac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3099,6 +3099,8 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons
SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
bool Signed) const {
+ bool IsStrict = Op->getNumValues() == 2;
+
// The regular method converting a 64-bit integer to float roughly consists of
// 2 steps: normalization and rounding. In fact, after normalization, the
// conversion from a 64-bit integer to a float is essentially the same as the
@@ -3126,7 +3128,7 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
// converted instead followed by negation based its sign bit.
SDLoc SL(Op);
- SDValue Src = Op.getOperand(0);
+ SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
SDValue Lo, Hi;
std::tie(Lo, Hi) = split64BitValue(Src, DAG);
@@ -3202,8 +3204,12 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
ShAmt);
// On GCN, use LDEXP directly.
- if (Subtarget->isGCN())
+ if (Subtarget->isGCN()) {
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_FLDEXP, SL, MVT::f32, Op.getOperand(0), FVal, ShAmt);
+
return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
+ }
// Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
// part directly to emulate the multiplication of 2^ShAmt. That 8-bit
>From 4df02ae14f12e62efc22316eb03fba84a1ed3634 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 17:46:44 +0900
Subject: [PATCH 12/15] strict sitofp work
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 29 ++++++++++++++-----
llvm/test/CodeGen/AMDGPU/strict_sitofp.ll | 26 ++++++++---------
2 files changed, 34 insertions(+), 21 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index c04733e453bac..096d418f40b4b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -421,7 +421,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
setOperationAction(
- {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
+ {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::STRICT_SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
MVT::i64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
@@ -3205,8 +3205,10 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
ShAmt);
// On GCN, use LDEXP directly.
if (Subtarget->isGCN()) {
- if (IsStrict)
- return DAG.getNode(ISD::STRICT_FLDEXP, SL, MVT::f32, Op.getOperand(0), FVal, ShAmt);
+ if (IsStrict) {
+ return DAG.getNode(ISD::STRICT_FLDEXP, SL, Op->getVTList(),
+ Op.getOperand(0), FVal, ShAmt);
+ }
return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
}
@@ -3270,7 +3272,8 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
SDLoc DL(Op);
- SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
+ SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, {MVT::f32, MVT::Other},
+ Src);
SDValue FPRoundFlag =
DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
SDValue FPRound =
@@ -3311,13 +3314,23 @@ SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
SDLoc DL(Op);
- SDValue Src = Op.getOperand(0);
+ SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
- SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
SDValue FPRoundFlag =
- DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
+ DAG.getIntPtrConstant(0, DL, /*isTarget=*/true);
+
+ if (IsStrict) {
+ SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, DAG.getVTList(MVT::f32, MVT::Other),
+ Op.getOperand(0), Src);
+ SDValue FPRound =
+ DAG.getNode(ISD::STRICT_FP_ROUND, DL, Op->getVTList(), IntToFp32.getValue(1),
+ IntToFp32, FPRoundFlag);
+ return FPRound;
+ }
+
+ SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
SDValue FPRound =
- DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
+ DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
return FPRound;
}
diff --git a/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll b/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
index a4138f8715851..fb4080db87b03 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
@@ -141,19 +141,19 @@ define <2 x float> @s_constrained_sitofp_v2i32_to_v2f32_fpexcept_strict(<2 x i32
ret <2 x float> %result
}
-define half @v_constrained_sitofp_i64_to_f16_fpexcept_strict(i64 %arg) #0 {
- %result = call half @llvm.experimental.constrained.sitofp.f16.i64(i64 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
- ret half %result
-}
-
-define <2 x half> @v_constrained_sitofp_v2i64_to_v2f16_fpexcept_strict(<2 x i64> %arg) #0 {
- %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i64(<2 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
- ret <2 x half> %result
-}
-
-define <3 x half> @v_constrained_sitofp_v3i64_to_v3f16_fpexcept_strict(<3 x i64> %arg) #0 {
- %result = call <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i64(<3 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
- ret <3 x half> %result
+; define half @v_constrained_sitofp_i64_to_f16_fpexcept_strict(i64 %arg) #0 {
+; %result = call half @llvm.experimental.constrained.sitofp.f16.i64(i64 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+; ret half %result
+; }
+
+; define <2 x half> @v_constrained_sitofp_v2i64_to_v2f16_fpexcept_strict(<2 x i64> %arg) #0 {
+; %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i64(<2 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+; ret <2 x half> %result
+; }
+
+; define <3 x half> @v_constrained_sitofp_v3i64_to_v3f16_fpexcept_strict(<3 x i64> %arg) #0 {
+; %result = call <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i64(<3 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+; ret <3 x half> %result
}
define float @v_constrained_sitofp_i64_to_f32_fpexcept_strict(i64 %arg) #0 {
From 6d9dc6813c1856a7dd0f01cf7dcb7b6bbf3bacd9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 17:55:07 +0900
Subject: [PATCH 13/15] Bring back test
---
llvm/test/CodeGen/AMDGPU/strict_sitofp.ll | 26 +++++++++++------------
1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll b/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
index fb4080db87b03..a4138f8715851 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll
@@ -141,19 +141,19 @@ define <2 x float> @s_constrained_sitofp_v2i32_to_v2f32_fpexcept_strict(<2 x i32
ret <2 x float> %result
}
-; define half @v_constrained_sitofp_i64_to_f16_fpexcept_strict(i64 %arg) #0 {
-; %result = call half @llvm.experimental.constrained.sitofp.f16.i64(i64 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
-; ret half %result
-; }
-
-; define <2 x half> @v_constrained_sitofp_v2i64_to_v2f16_fpexcept_strict(<2 x i64> %arg) #0 {
-; %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i64(<2 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
-; ret <2 x half> %result
-; }
-
-; define <3 x half> @v_constrained_sitofp_v3i64_to_v3f16_fpexcept_strict(<3 x i64> %arg) #0 {
-; %result = call <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i64(<3 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
-; ret <3 x half> %result
+define half @v_constrained_sitofp_i64_to_f16_fpexcept_strict(i64 %arg) #0 {
+ %result = call half @llvm.experimental.constrained.sitofp.f16.i64(i64 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret half %result
+}
+
+define <2 x half> @v_constrained_sitofp_v2i64_to_v2f16_fpexcept_strict(<2 x i64> %arg) #0 {
+ %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i64(<2 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <2 x half> %result
+}
+
+define <3 x half> @v_constrained_sitofp_v3i64_to_v3f16_fpexcept_strict(<3 x i64> %arg) #0 {
+ %result = call <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i64(<3 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict")
+ ret <3 x half> %result
}
define float @v_constrained_sitofp_i64_to_f32_fpexcept_strict(i64 %arg) #0 {
From 5145c847ce4a0e6893572f76b22d6f8762e72b5b Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 17:58:50 +0900
Subject: [PATCH 14/15] So much duplication
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 18 +++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 096d418f40b4b..7d33421e11fbb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3234,11 +3234,27 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
bool Signed) const {
SDLoc SL(Op);
- SDValue Src = Op.getOperand(0);
+ bool IsStrict = Op->getNumValues() == 2;
+ SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
SDValue Lo, Hi;
std::tie(Lo, Hi) = split64BitValue(Src, DAG);
+ if (IsStrict) {
+ SDVTList VTs = Op->getVTList();
+ SDValue CvtHi = DAG.getNode(Signed ? ISD::STRICT_SINT_TO_FP : ISD::STRICT_UINT_TO_FP,
+ SL, VTs, Op.getOperand(0), Hi);
+
+ SDValue CvtLo = DAG.getNode(ISD::STRICT_UINT_TO_FP, SL, VTs, CvtHi.getValue(1), Lo);
+
+ SDValue LdExp = DAG.getNode(ISD::STRICT_FLDEXP, SL, VTs,
+ CvtLo.getValue(1),
+ CvtHi,
+ DAG.getConstant(32, SL, MVT::i32));
+ // TODO: Should this propagate fast-math-flags?
+ return DAG.getNode(ISD::STRICT_FADD, SL, VTs, LdExp.getValue(1), LdExp, CvtLo);
+ }
+
SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
SL, MVT::f64, Hi);
From 66316f8c8ffbe72b0aa0433bed2dbcbcb6a8a70d Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 2 Dec 2023 18:08:11 +0900
Subject: [PATCH 15/15] StrictXINT_TO_FP
---
.../SelectionDAG/LegalizeFloatTypes.cpp | 18 ++++++++++++++++++
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 +
2 files changed, 19 insertions(+)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index d7a688511b726..f7f49c917c1e7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2439,6 +2439,7 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP: R = PromoteFloatRes_XINT_TO_FP(N); break;
+ case ISD::STRICT_SINT_TO_FP: R = PromoteFloatRes_STRICT_XINT_TO_FP(N); break;
case ISD::UNDEF: R = PromoteFloatRes_UNDEF(N); break;
case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break;
case ISD::VECREDUCE_FADD:
@@ -2714,6 +2715,23 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_XINT_TO_FP(SDNode *N) {
DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)));
}
+// Construct a SDNode that transforms the SINT or UINT operand to the promoted
+// float type.
+SDValue DAGTypeLegalizer::PromoteFloatRes_STRICT_XINT_TO_FP(SDNode *N) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDVTList NVTs = DAG.getVTList(NVT, MVT::Other);
+
+ SDValue NV = DAG.getNode(N->getOpcode(), DL, NVTs, N->getOperand(0), N->getOperand(1));
+
+ // Round the value to the desired precision (that of the source type).
+ SDValue Rounded = DAG.getNode(ISD::STRICT_FP_ROUND, DL, N->getVTList(), NV.getValue(1), NV,
+ DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
+ return DAG.getNode(
+ ISD::STRICT_FP_EXTEND, DL, NVTs, Rounded.getValue(1), Rounded.getValue(0));
+}
+
SDValue DAGTypeLegalizer::PromoteFloatRes_UNDEF(SDNode *N) {
return DAG.getUNDEF(TLI.getTypeToTransformTo(*DAG.getContext(),
N->getValueType(0)));
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 9361e7fff2190..26c92c9e927bb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -707,6 +707,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue PromoteFloatRes_UNDEF(SDNode *N);
SDValue BitcastToInt_ATOMIC_SWAP(SDNode *N);
SDValue PromoteFloatRes_XINT_TO_FP(SDNode *N);
+ SDValue PromoteFloatRes_STRICT_XINT_TO_FP(SDNode *N);
SDValue PromoteFloatRes_VECREDUCE(SDNode *N);
SDValue PromoteFloatRes_VECREDUCE_SEQ(SDNode *N);
More information about the llvm-commits
mailing list