[llvm] [AMDGPU] Fix canonicalization of truncated values. (PR #83054)
Harald van Dijk via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 27 15:21:39 PST 2024
https://github.com/hvdijk updated https://github.com/llvm/llvm-project/pull/83054
From c49617615b2c157a1006a142923fff4de45ede34 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald at gigawatt.nl>
Date: Tue, 27 Feb 2024 23:20:42 +0000
Subject: [PATCH] [AMDGPU] Fix canonicalization of truncated values.
We were relying on rounding operations to implicitly canonicalize their
results, which is generally safe, except when those roundings may later be
optimized away.
Fixes #82937.
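As an illustration, a minimal reproducer in the spirit of #82937 (a sketch;
the function name is illustrative and mirrors the s_test_canonicalize_arg
test added below) is a canonicalize of a plain half argument:

  declare half @llvm.canonicalize.f16(half)

  define half @canonicalize_arg(half %x) {
    ; On subtargets without native f16 support, %x is promoted to f32 and the
    ; implicit truncating fp_round was treated as canonicalizing; if that
    ; round/extend pair is later folded away, the result is never canonicalized.
    %canon = call half @llvm.canonicalize.f16(half %x)
    ret half %canon
  }

With this change, isCanonicalized reports when it relied on a truncating
FP_ROUND, and performFCanonicalizeCombine inserts a FREEZE in that case so the
rounding cannot simply be folded away without going through the new
DAGCombiner folds.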
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 36 +++++++++++++++++
.../SelectionDAG/LegalizeFloatTypes.cpp | 1 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 39 ++++++++++++-------
llvm/lib/Target/AMDGPU/SIISelLowering.h | 5 +++
llvm/test/CodeGen/AMDGPU/bf16.ll | 4 ++
llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 29 ++++++++++++++
llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 3 +-
7 files changed, 103 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c4d49adc21c4b7b..13de098720edf46 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -26258,6 +26258,24 @@ SDValue DAGCombiner::visitFP_TO_FP16(SDNode *N) {
if (N0->getOpcode() == ISD::FP16_TO_FP)
return N0->getOperand(0);
+ // fold (fp_to_fp16 (freeze (fp16_to_fp (fp_to_fp16 op))))
+ // -> (fp_to_fp16 (freeze op))
+ if (N0->getOpcode() == ISD::FREEZE) {
+ if (auto fp16_to_fp = N0->getOperand(0);
+ fp16_to_fp->getOpcode() == ISD::FP16_TO_FP) {
+ if (auto new_fp16_to_fp = visitFP16_TO_FP(fp16_to_fp.getNode()))
+ if (new_fp16_to_fp->getOpcode() == ISD::FP16_TO_FP)
+ fp16_to_fp = new_fp16_to_fp;
+ if (auto fp_to_fp16 = fp16_to_fp->getOperand(0);
+ fp_to_fp16->getOpcode() == ISD::FP_TO_FP16) {
+ return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
+ DAG.getNode(N0->getOpcode(), SDLoc(N0),
+ N0.getValueType(),
+ fp_to_fp16->getOperand(0)));
+ }
+ }
+ }
+
return SDValue();
}
@@ -26286,6 +26304,24 @@ SDValue DAGCombiner::visitFP_TO_BF16(SDNode *N) {
if (N0->getOpcode() == ISD::BF16_TO_FP)
return N0->getOperand(0);
+ // fold (fp_to_bf16 (freeze (bf16_to_fp (fp_to_bf16 op))))
+ // -> (fp_to_bf16 (freeze op))
+ if (N0->getOpcode() == ISD::FREEZE) {
+ if (auto bf16_to_fp = N0->getOperand(0);
+ bf16_to_fp->getOpcode() == ISD::BF16_TO_FP) {
+ if (auto new_bf16_to_fp = visitBF16_TO_FP(bf16_to_fp.getNode()))
+ if (new_bf16_to_fp->getOpcode() == ISD::BF16_TO_FP)
+ bf16_to_fp = new_bf16_to_fp;
+ if (auto fp_to_bf16 = bf16_to_fp->getOperand(0);
+ fp_to_bf16->getOpcode() == ISD::FP_TO_BF16) {
+ return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
+ DAG.getNode(N0->getOpcode(), SDLoc(N0),
+ N0.getValueType(),
+ fp_to_bf16->getOperand(0)));
+ }
+ }
+ }
+
return SDValue();
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index f0a04589fbfdc27..315053ce1ac0f00 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2398,6 +2398,7 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
case ISD::FCOPYSIGN: R = PromoteFloatRes_FCOPYSIGN(N); break;
// Unary FP Operations
+ case ISD::FREEZE:
case ISD::FABS:
case ISD::FCBRT:
case ISD::FCEIL:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 84ef9679ab95635..1bc1b4e1c0a4ab2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -12416,7 +12416,7 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N,
}
bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
- unsigned MaxDepth) const {
+ bool &Trunc, unsigned MaxDepth) const {
unsigned Opcode = Op.getOpcode();
if (Opcode == ISD::FCANONICALIZE)
return true;
@@ -12450,7 +12450,6 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case ISD::FSQRT:
case ISD::FDIV:
case ISD::FREM:
- case ISD::FP_ROUND:
case ISD::FP_EXTEND:
case ISD::FLDEXP:
case AMDGPUISD::FMUL_LEGACY:
@@ -12473,12 +12472,22 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case AMDGPUISD::CVT_F32_UBYTE3:
return true;
+ case ISD::FP_ROUND:
+ if (Op.getConstantOperandVal(1))
+ Trunc = true;
+ return true;
+
+ case ISD::FREEZE:
+ // FREEZE is used as an optimization barrier; we can ignore any TRUNC in its
+ // input.
+ return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
+
// It can/will be lowered or combined as a bit operation.
// Need to check their input recursively to handle.
case ISD::FNEG:
case ISD::FABS:
case ISD::FCOPYSIGN:
- return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
+ return isCanonicalized(DAG, Op.getOperand(0), Trunc, MaxDepth - 1);
case ISD::FSIN:
case ISD::FCOS:
@@ -12513,20 +12522,20 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
// FIXME: Does this apply with clamp? It's implemented with max.
for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
- if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
+ if (!isCanonicalized(DAG, Op.getOperand(I), Trunc, MaxDepth - 1))
return false;
}
return true;
}
case ISD::SELECT: {
- return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
- isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
+ return isCanonicalized(DAG, Op.getOperand(1), Trunc, MaxDepth - 1) &&
+ isCanonicalized(DAG, Op.getOperand(2), Trunc, MaxDepth - 1);
}
case ISD::BUILD_VECTOR: {
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
SDValue SrcOp = Op.getOperand(i);
- if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
+ if (!isCanonicalized(DAG, SrcOp, Trunc, MaxDepth - 1))
return false;
}
@@ -12534,18 +12543,18 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
}
case ISD::EXTRACT_VECTOR_ELT:
case ISD::EXTRACT_SUBVECTOR: {
- return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
+ return isCanonicalized(DAG, Op.getOperand(0), Trunc, MaxDepth - 1);
}
case ISD::INSERT_VECTOR_ELT: {
- return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
- isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
+ return isCanonicalized(DAG, Op.getOperand(0), Trunc, MaxDepth - 1) &&
+ isCanonicalized(DAG, Op.getOperand(1), Trunc, MaxDepth - 1);
}
case ISD::UNDEF:
// Could be anything.
return false;
case ISD::BITCAST:
- return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
+ return isCanonicalized(DAG, Op.getOperand(0), Trunc, MaxDepth - 1);
case ISD::TRUNCATE: {
// Hack round the mess we make when legalizing extract_vector_elt
if (Op.getValueType() == MVT::i16) {
@@ -12553,7 +12562,8 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
if (TruncSrc.getValueType() == MVT::i32 &&
TruncSrc.getOpcode() == ISD::BITCAST &&
TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
- return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
+ return isCanonicalized(DAG, TruncSrc.getOperand(0), Trunc,
+ MaxDepth - 1);
}
}
return false;
@@ -12831,7 +12841,10 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
}
}
- return isCanonicalized(DAG, N0) ? N0 : SDValue();
+ bool Trunc = false;
+ return isCanonicalized(DAG, N0, Trunc)
+ ? Trunc ? DAG.getNode(ISD::FREEZE, SDLoc(N), VT, N0) : N0
+ : SDValue();
}
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index f6e1d198f40aec6..f89a4d805d59b46 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -516,6 +516,11 @@ class SITargetLowering final : public AMDGPUTargetLowering {
Register N1) const override;
bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
+ unsigned MaxDepth = 5) const {
+ bool Trunc;
+ return isCanonicalized(DAG, Op, Trunc, MaxDepth);
+ }
+ bool isCanonicalized(SelectionDAG &DAG, SDValue Op, bool &Trunc,
unsigned MaxDepth = 5) const;
bool isCanonicalized(Register Reg, MachineFunction &MF,
unsigned MaxDepth = 5) const;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index ebb77c13c4af7b9..161c32325f8c336 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -26818,11 +26818,15 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
; GCN-LABEL: v_canonicalize_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_canonicalize_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_canonicalize_bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 274621307f540d4..f53adf88454bc68 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -170,6 +170,35 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1
ret void
}
+define half @s_test_canonicalize_arg(half %x) #1 {
+; VI-LABEL: s_test_canonicalize_arg:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_test_canonicalize_arg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; CI-LABEL: s_test_canonicalize_arg:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_test_canonicalize_arg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %canonicalized = call half @llvm.canonicalize.f16(half %x)
+ ret half %canonicalized
+}
+
define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 {
; VI-LABEL: v_test_canonicalize_build_vector_v2f16:
; VI: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index 78fb89c71e2e6a4..88110df5c4469f7 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -3980,7 +3980,8 @@ define half @v_fneg_canonicalize_f16(half %a) #0 {
; SI-LABEL: v_fneg_canonicalize_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_canonicalize_f16: