[llvm] [DAGCombiner] Remove `UnsafeFPMath` usage in `visitFSUBForFMACombine` (PR #145637)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 25 22:25:09 PDT 2025
https://github.com/paperchalice updated https://github.com/llvm/llvm-project/pull/145637
From 01389e1d88e6b43b9478b879456948ad5dc2fca1 Mon Sep 17 00:00:00 2001
From: PaperChalice <liujunchang97 at outlook.com>
Date: Thu, 26 Jun 2025 11:00:24 +0800
Subject: [PATCH 1/3] [DAGCombiner] Remove UnsafeFPMath usage in
visitFSUBForFMACombine
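The FMA fusion paths in visitFSUBForFMACombine no longer consult the global
UnsafeFPMath option; fusion is now gated by -fp-contract=fast, target FMAD
availability, and the per-instruction fast-math flags. The affected AMDGPU and
PowerPC tests therefore carry contract (plus reassoc/nsz where the aggressive
folds require them) on the relevant instructions instead of relying on
-enable-unsafe-fp-math RUN lines. A minimal IR sketch of the intended
difference, with illustrative function names (actual fusion still depends on
the target's FMA legality and profitability hooks):

  ; Candidate for fusion into fma(%a, %b, -%c): both instructions carry contract.
  define double @fsub_fuses(double %a, double %b, double %c) {
    %mul = fmul contract double %a, %b
    %sub = fsub contract double %mul, %c
    ret double %sub
  }

  ; Kept as a separate mul/sub: no per-instruction flags, and the global
  ; UnsafeFPMath setting no longer overrides that.
  define double @fsub_no_fuse(double %a, double %b, double %c) {
    %mul = fmul double %a, %b
    %sub = fsub double %mul, %c
    ret double %sub
  }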
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 8 +-
llvm/test/CodeGen/AMDGPU/fma-combine.ll | 342 ++++++++++--------
.../AMDGPU/fmul-2-combine-multi-use.ll | 10 +-
llvm/test/CodeGen/AMDGPU/mad-combine.ll | 190 +++++++---
llvm/test/CodeGen/PowerPC/fma-combine.ll | 41 +--
5 files changed, 362 insertions(+), 229 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 91f696e8fe88e..9e6e81e2c0dee 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17010,8 +17010,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
return SDValue();
const SDNodeFlags Flags = N->getFlags();
- bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
- Options.UnsafeFPMath || HasFMAD);
+ bool AllowFusionGlobally =
+ (Options.AllowFPOpFusion == FPOpFusion::Fast || HasFMAD);
// If the subtraction is not contractable, do not combine.
if (!AllowFusionGlobally && !N->getFlags().hasAllowContract())
@@ -17167,7 +17167,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
}
auto isReassociable = [&Options](SDNode *N) {
- return Options.UnsafeFPMath || N->getFlags().hasAllowReassociation();
+ return N->getFlags().hasAllowReassociation();
};
auto isContractableAndReassociableFMUL = [&isContractableFMUL,
@@ -17181,7 +17181,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// More folding opportunities when target permits.
if (Aggressive && isReassociable(N)) {
- bool CanFuse = Options.UnsafeFPMath || N->getFlags().hasAllowContract();
+ bool CanFuse = N->getFlags().hasAllowContract();
// fold (fsub (fma x, y, (fmul u, v)), z)
// -> (fma x, y (fma u, v, (fneg z)))
if (CanFuse && isFusedOp(N0) &&
diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
index a96d022b66f12..c79cf87712dc0 100644
--- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
@@ -1,11 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-FMA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-NOFMA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FMA %s
-
-; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs -enable-no-infs-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-FMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-NOFMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=ieee -verify-machineinstrs -enable-no-infs-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FMA %s
; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
; beneficial even without fp32 denormals, but they do require no-infs-fp-math
@@ -65,8 +63,8 @@ define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, p
%b = load volatile double, ptr addrspace(1) %gep.1
%c = load volatile double, ptr addrspace(1) %gep.2
- %mul = fmul double %a, %b
- %fma = fadd double %mul, %c
+ %mul = fmul contract double %a, %b
+ %fma = fadd contract double %mul, %c
store double %fma, ptr addrspace(1) %gep.out
ret void
}
@@ -134,9 +132,9 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %o
%c = load volatile double, ptr addrspace(1) %gep.2
%d = load volatile double, ptr addrspace(1) %gep.3
- %mul = fmul double %a, %b
- %fma0 = fadd double %mul, %c
- %fma1 = fadd double %mul, %d
+ %mul = fmul contract double %a, %b
+ %fma0 = fadd contract double %mul, %c
+ %fma1 = fadd contract double %mul, %d
store volatile double %fma0, ptr addrspace(1) %gep.out.0
store volatile double %fma1, ptr addrspace(1) %gep.out.1
ret void
@@ -190,8 +188,8 @@ define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, p
%b = load volatile double, ptr addrspace(1) %gep.1
%c = load volatile double, ptr addrspace(1) %gep.2
- %mul = fmul double %a, %b
- %fma = fadd double %c, %mul
+ %mul = fmul contract double %a, %b
+ %fma = fadd contract double %c, %mul
store double %fma, ptr addrspace(1) %gep.out
ret void
}
@@ -244,8 +242,8 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %o
%b = load volatile double, ptr addrspace(1) %gep.1
%c = load volatile double, ptr addrspace(1) %gep.2
- %mul = fmul double %a, %b
- %fma = fsub double %mul, %c
+ %mul = fmul contract double %a, %b
+ %fma = fsub contract double %mul, %c
store double %fma, ptr addrspace(1) %gep.out
ret void
}
@@ -313,9 +311,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noali
%c = load volatile double, ptr addrspace(1) %gep.2
%d = load volatile double, ptr addrspace(1) %gep.3
- %mul = fmul double %a, %b
- %fma0 = fsub double %mul, %c
- %fma1 = fsub double %mul, %d
+ %mul = fmul contract double %a, %b
+ %fma0 = fsub contract double %mul, %c
+ %fma1 = fsub contract double %mul, %d
store volatile double %fma0, ptr addrspace(1) %gep.out.0
store volatile double %fma1, ptr addrspace(1) %gep.out.1
ret void
@@ -369,8 +367,8 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %o
%b = load volatile double, ptr addrspace(1) %gep.1
%c = load volatile double, ptr addrspace(1) %gep.2
- %mul = fmul double %a, %b
- %fma = fsub double %c, %mul
+ %mul = fmul contract double %a, %b
+ %fma = fsub contract double %c, %mul
store double %fma, ptr addrspace(1) %gep.out
ret void
}
@@ -438,9 +436,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noali
%c = load volatile double, ptr addrspace(1) %gep.2
%d = load volatile double, ptr addrspace(1) %gep.3
- %mul = fmul double %a, %b
- %fma0 = fsub double %c, %mul
- %fma1 = fsub double %d, %mul
+ %mul = fmul contract double %a, %b
+ %fma0 = fsub contract double %c, %mul
+ %fma1 = fsub contract double %d, %mul
store volatile double %fma0, ptr addrspace(1) %gep.out.0
store volatile double %fma1, ptr addrspace(1) %gep.out.1
ret void
@@ -494,9 +492,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %o
%b = load volatile double, ptr addrspace(1) %gep.1
%c = load volatile double, ptr addrspace(1) %gep.2
- %mul = fmul double %a, %b
- %mul.neg = fsub double -0.0, %mul
- %fma = fsub double %mul.neg, %c
+ %mul = fmul contract double %a, %b
+ %mul.neg = fsub contract double -0.0, %mul
+ %fma = fsub contract double %mul.neg, %c
store double %fma, ptr addrspace(1) %gep.out
ret void
@@ -565,10 +563,10 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1)
%c = load volatile double, ptr addrspace(1) %gep.2
%d = load volatile double, ptr addrspace(1) %gep.3
- %mul = fmul double %a, %b
- %mul.neg = fsub double -0.0, %mul
- %fma0 = fsub double %mul.neg, %c
- %fma1 = fsub double %mul.neg, %d
+ %mul = fmul contract double %a, %b
+ %mul.neg = fsub contract double -0.0, %mul
+ %fma0 = fsub contract double %mul.neg, %c
+ %fma1 = fsub contract double %mul.neg, %d
store volatile double %fma0, ptr addrspace(1) %gep.out.0
store volatile double %fma1, ptr addrspace(1) %gep.out.1
@@ -638,10 +636,10 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1)
%c = load volatile double, ptr addrspace(1) %gep.2
%d = load volatile double, ptr addrspace(1) %gep.3
- %mul = fmul double %a, %b
- %mul.neg = fsub double -0.0, %mul
- %fma0 = fsub double %mul.neg, %c
- %fma1 = fsub double %mul, %d
+ %mul = fmul contract double %a, %b
+ %mul.neg = fsub contract double -0.0, %mul
+ %fma0 = fsub contract double %mul.neg, %c
+ %fma1 = fsub contract double %mul, %d
store volatile double %fma0, ptr addrspace(1) %gep.out.0
store volatile double %fma1, ptr addrspace(1) %gep.out.1
@@ -650,32 +648,6 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1)
; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
-; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
-; SI-NOFMA: ; %bb.0:
-; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000
-; SI-NOFMA-NEXT: s_mov_b32 s6, 0
-; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; SI-NOFMA-NEXT: v_mov_b32_e32 v1, 0
-; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NOFMA-NEXT: s_mov_b64 s[4:5], s[2:3]
-; SI-NOFMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: s_mov_b64 s[2:3], s[6:7]
-; SI-NOFMA-NEXT: v_mul_f64 v[8:9], v[8:9], v[10:11]
-; SI-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; SI-NOFMA-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7]
-; SI-NOFMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
-; SI-NOFMA-NEXT: s_endpgm
-;
; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
; SI-FMA: ; %bb.0:
; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -701,30 +673,6 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1)
; SI-FMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-FMA-NEXT: s_endpgm
;
-; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
-; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0
-; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9]
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
-; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
-; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1]
-; GFX11-NOFMA-NEXT: s_endpgm
-;
; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -761,18 +709,16 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1)
%u = load volatile double, ptr addrspace(1) %gep.3
%v = load volatile double, ptr addrspace(1) %gep.4
- %tmp0 = fmul double %u, %v
- %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
- %tmp2 = fsub double %tmp1, %z
+ %tmp0 = fmul contract fast double %u, %v
+ %tmp1 = call contract fast double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
+ %tmp2 = fsub contract fast double %tmp1, %z
store double %tmp2, ptr addrspace(1) %gep.out
ret void
}
-; fold (fsub x, (fma y, z, (fmul u, v)))
-; -> (fma (fneg y), z, (fma (fneg u), v, x))
-define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
-; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
+define amdgpu_kernel void @no_aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+; SI-NOFMA-LABEL: no_aggressive_combine_to_fma_fsub_0_f64:
; SI-NOFMA: ; %bb.0:
; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000
@@ -793,11 +739,59 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1)
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NOFMA-NEXT: v_mul_f64 v[8:9], v[8:9], v[10:11]
-; SI-NOFMA-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
-; SI-NOFMA-NEXT: v_add_f64 v[2:3], v[2:3], -v[4:5]
+; SI-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; SI-NOFMA-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7]
; SI-NOFMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NOFMA-NEXT: s_endpgm
;
+; GFX11-NOFMA-LABEL: no_aggressive_combine_to_fma_fsub_0_f64:
+; GFX11-NOFMA: ; %bb.0:
+; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0
+; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9]
+; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
+; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
+; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1]
+; GFX11-NOFMA-NEXT: s_endpgm
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+ %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
+ %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
+ %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
+ %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
+ %gep.4 = getelementptr double, ptr addrspace(1) %gep.0, i32 4
+ %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid
+
+ %x = load volatile double, ptr addrspace(1) %gep.0
+ %y = load volatile double, ptr addrspace(1) %gep.1
+ %z = load volatile double, ptr addrspace(1) %gep.2
+ %u = load volatile double, ptr addrspace(1) %gep.3
+ %v = load volatile double, ptr addrspace(1) %gep.4
+
+ %tmp0 = fmul double %u, %v
+ %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
+ %tmp2 = fsub double %tmp1, %z
+
+ store double %tmp2, ptr addrspace(1) %gep.out
+ ret void
+}
+
+; fold (fsub x, (fma y, z, (fmul u, v)))
+; -> (fma (fneg y), z, (fma (fneg u), v, x))
+define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
; SI-FMA: ; %bb.0:
; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -823,30 +817,6 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1)
; SI-FMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-FMA-NEXT: s_endpgm
;
-; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
-; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0
-; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9]
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
-; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1]
-; GFX11-NOFMA-NEXT: s_endpgm
-;
; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -883,6 +853,78 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1)
%u = load volatile double, ptr addrspace(1) %gep.3
%v = load volatile double, ptr addrspace(1) %gep.4
+ ; nsz flag is needed since this combine may change sign of zero
+ %tmp0 = fmul contract reassoc nsz double %u, %v
+ %tmp1 = call contract reassoc nsz double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
+ %tmp2 = fsub contract reassoc nsz double %x, %tmp1
+
+ store double %tmp2, ptr addrspace(1) %gep.out
+ ret void
+}
+define amdgpu_kernel void @no_aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+; SI-NOFMA-LABEL: no_aggressive_combine_to_fma_fsub_1_f64:
+; SI-NOFMA: ; %bb.0:
+; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000
+; SI-NOFMA-NEXT: s_mov_b32 s6, 0
+; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NOFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NOFMA-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NOFMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NOFMA-NEXT: v_mul_f64 v[8:9], v[8:9], v[10:11]
+; SI-NOFMA-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
+; SI-NOFMA-NEXT: v_add_f64 v[2:3], v[2:3], -v[4:5]
+; SI-NOFMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NOFMA-NEXT: s_endpgm
+;
+; GFX11-NOFMA-LABEL: no_aggressive_combine_to_fma_fsub_1_f64:
+; GFX11-NOFMA: ; %bb.0:
+; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0
+; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9]
+; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
+; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1]
+; GFX11-NOFMA-NEXT: s_endpgm
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+ %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
+ %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
+ %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
+ %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
+ %gep.4 = getelementptr double, ptr addrspace(1) %gep.0, i32 4
+ %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid
+
+ %x = load volatile double, ptr addrspace(1) %gep.0
+ %y = load volatile double, ptr addrspace(1) %gep.1
+ %z = load volatile double, ptr addrspace(1) %gep.2
+ %u = load volatile double, ptr addrspace(1) %gep.3
+ %v = load volatile double, ptr addrspace(1) %gep.4
+
; nsz flag is needed since this combine may change sign of zero
%tmp0 = fmul nsz double %u, %v
%tmp1 = call nsz double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
@@ -979,8 +1021,8 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out,
ptr addrspace(1) %in2) {
%x = load volatile float, ptr addrspace(1) %in1
%y = load volatile float, ptr addrspace(1) %in2
- %a = fadd float %x, 1.0
- %m = fmul float %a, %y
+ %a = fadd contract float %x, 1.0
+ %m = fmul contract float %a, %y
store float %m, ptr addrspace(1) %out
ret void
}
@@ -1068,8 +1110,8 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out,
ptr addrspace(1) %in2) {
%x = load volatile float, ptr addrspace(1) %in1
%y = load volatile float, ptr addrspace(1) %in2
- %a = fadd float %x, 1.0
- %m = fmul float %y, %a
+ %a = fadd contract float %x, 1.0
+ %m = fmul contract float %y, %a
store float %m, ptr addrspace(1) %out
ret void
}
@@ -1157,8 +1199,8 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
%y = load float, ptr addrspace(1) %in2
- %a = fadd float %x, -1.0
- %m = fmul float %a, %y
+ %a = fadd contract float %x, -1.0
+ %m = fmul contract float %a, %y
store float %m, ptr addrspace(1) %out
ret void
}
@@ -1246,8 +1288,8 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
%y = load float, ptr addrspace(1) %in2
- %a = fadd float %x, -1.0
- %m = fmul float %y, %a
+ %a = fadd contract float %x, -1.0
+ %m = fmul contract float %y, %a
store float %m, ptr addrspace(1) %out
ret void
}
@@ -1335,8 +1377,8 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
%y = load float, ptr addrspace(1) %in2
- %s = fsub float 1.0, %x
- %m = fmul float %s, %y
+ %s = fsub contract float 1.0, %x
+ %m = fmul contract float %s, %y
store float %m, ptr addrspace(1) %out
ret void
}
@@ -1424,8 +1466,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
%y = load float, ptr addrspace(1) %in2
- %s = fsub float 1.0, %x
- %m = fmul float %y, %s
+ %s = fsub contract float 1.0, %x
+ %m = fmul contract float %y, %s
store float %m, ptr addrspace(1) %out
ret void
}
@@ -1513,8 +1555,8 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
%y = load float, ptr addrspace(1) %in2
- %s = fsub float -1.0, %x
- %m = fmul float %s, %y
+ %s = fsub contract float -1.0, %x
+ %m = fmul contract float %s, %y
store float %m, ptr addrspace(1) %out
ret void
}
@@ -1602,8 +1644,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
%y = load float, ptr addrspace(1) %in2
- %s = fsub float -1.0, %x
- %m = fmul float %y, %s
+ %s = fsub contract float -1.0, %x
+ %m = fmul contract float %y, %s
store float %m, ptr addrspace(1) %out
ret void
}
@@ -1691,8 +1733,8 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
%y = load float, ptr addrspace(1) %in2
- %s = fsub float %x, 1.0
- %m = fmul float %s, %y
+ %s = fsub contract float %x, 1.0
+ %m = fmul contract float %s, %y
store float %m, ptr addrspace(1) %out
ret void
}
@@ -1780,8 +1822,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
%y = load float, ptr addrspace(1) %in2
- %s = fsub float %x, 1.0
- %m = fmul float %y, %s
+ %s = fsub contract float %x, 1.0
+ %m = fmul contract float %y, %s
store float %m, ptr addrspace(1) %out
ret void
}
@@ -1869,8 +1911,8 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
%y = load float, ptr addrspace(1) %in2
- %s = fsub float %x, -1.0
- %m = fmul float %s, %y
+ %s = fsub contract float %x, -1.0
+ %m = fmul contract float %s, %y
store float %m, ptr addrspace(1) %out
ret void
}
@@ -1958,8 +2000,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out,
ptr addrspace(1) %in2) {
%x = load float, ptr addrspace(1) %in1
%y = load float, ptr addrspace(1) %in2
- %s = fsub float %x, -1.0
- %m = fmul float %y, %s
+ %s = fsub contract float %x, -1.0
+ %m = fmul contract float %y, %s
store float %m, ptr addrspace(1) %out
ret void
}
@@ -2072,10 +2114,10 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out,
%x = load float, ptr addrspace(1) %in1
%y = load float, ptr addrspace(1) %in2
%t = load float, ptr addrspace(1) %in3
- %t1 = fsub float 1.0, %t
- %tx = fmul float %x, %t
- %ty = fmul float %y, %t1
- %r = fadd float %tx, %ty
+ %t1 = fsub contract float 1.0, %t
+ %tx = fmul contract float %x, %t
+ %ty = fmul contract float %y, %t1
+ %r = fadd contract float %tx, %ty
store float %r, ptr addrspace(1) %out
ret void
}
@@ -2152,10 +2194,10 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out,
%x = load double, ptr addrspace(1) %in1
%y = load double, ptr addrspace(1) %in2
%t = load double, ptr addrspace(1) %in3
- %t1 = fsub double 1.0, %t
- %tx = fmul double %x, %t
- %ty = fmul double %y, %t1
- %r = fadd double %tx, %ty
+ %t1 = fsub contract double 1.0, %t
+ %tx = fmul contract double %x, %t
+ %ty = fmul contract double %y, %t1
+ %r = fadd contract double %tx, %ty
store double %r, ptr addrspace(1) %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index 7c0d3692242a4..d4471c85c467c 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -73,8 +73,8 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo
%a16 = select i1 %a15, float %a12, float %a14
%a17 = fmul float %a16, 2.0
%a18 = fmul float %a17, %a17
- %a19 = fmul float %a18, %a17
- %a20 = fsub float 1.0, %a19
+ %a19 = fmul contract float %a18, %a17
+ %a20 = fsub contract float 1.0, %a19
store float %a20, ptr addrspace(1) %out
ret void
}
@@ -540,8 +540,8 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16
%a16 = select i1 %a15, half %a12, half %a14
%a17 = fmul half %a16, 2.0
%a18 = fmul half %a17, %a17
- %a19 = fmul half %a18, %a17
- %a20 = fsub half 1.0, %a19
+ %a19 = fmul contract half %a18, %a17
+ %a20 = fsub contract half 1.0, %a19
store half %a20, ptr addrspace(1) %out
ret void
}
@@ -1166,7 +1166,7 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x
ret void
}
-attributes #0 = { nounwind "unsafe-fp-math"="true" }
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX11-DENORM: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/mad-combine.ll b/llvm/test/CodeGen/AMDGPU/mad-combine.ll
index e94aa4b8ce3d1..2ac181b06a350 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-combine.ll
@@ -1,14 +1,12 @@
; Make sure we still form mad even when unsafe math or fp-contract is allowed instead of fma.
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-UNSAFE -check-prefix=FUNC %s
-
-; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-UNSAFE -check-prefix=FUNC %s
; Make sure we don't form mad with denormals
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-FASTFMAF -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-FASTFMAF -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.fabs.f32(float) #0
@@ -25,15 +23,41 @@ declare float @llvm.fmuladd.f32(float, float, float) #0
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+; SI-DENORM-FASTFMAF: buffer_store_dword [[RESULT]]
+; SI-STD: buffer_store_dword [[C]]
+define amdgpu_kernel void @combine_to_mad_f32_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+ %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+ %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+ %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
+ %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid
+
+ %a = load volatile float, ptr addrspace(1) %gep.0
+ %b = load volatile float, ptr addrspace(1) %gep.1
+ %c = load volatile float, ptr addrspace(1) %gep.2
+
+
+ %mul = fmul contract float %a, %b
+ %fma = fadd contract float %mul, %c
+ store float %fma, ptr addrspace(1) %gep.out
+ ret void
+}
+; FUNC-LABEL: {{^}}no_combine_to_mad_f32_0:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
+
+; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]
+
; SI-DENORM-SLOWFMAF-NOT: v_fma
; SI-DENORM-SLOWFMAF-NOT: v_mad
; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
-; SI-DENORM: buffer_store_dword [[RESULT]]
+; SI-DENORM-SLOWFMAF: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
-define amdgpu_kernel void @combine_to_mad_f32_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+define amdgpu_kernel void @no_combine_to_mad_f32_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
@@ -72,7 +96,46 @@ define amdgpu_kernel void @combine_to_mad_f32_0(ptr addrspace(1) noalias %out, p
; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
-define amdgpu_kernel void @combine_to_mad_f32_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_f32_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %is_fast) #1 {
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+ %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+ %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+ %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
+ %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3
+ %gep.out.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
+ %gep.out.1 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 1
+
+ %a = load volatile float, ptr addrspace(1) %gep.0
+ %b = load volatile float, ptr addrspace(1) %gep.1
+ %c = load volatile float, ptr addrspace(1) %gep.2
+ %d = load volatile float, ptr addrspace(1) %gep.3
+
+ %mul = fmul contract fast float %a, %b
+ %fma0 = fadd contract fast float %mul, %c
+ %fma1 = fadd contract fast float %mul, %d
+ store volatile float %fma0, ptr addrspace(1) %gep.out.0
+ store volatile float %fma1, ptr addrspace(1) %gep.out.1
+ ret void
+}
+; FUNC-LABEL: {{^}}no_combine_to_mad_f32_0_2use:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
+
+; SI-STD-DAG: v_mac_f32_e32 [[C]], [[A]], [[B]]
+; SI-STD-DAG: v_mac_f32_e32 [[D]], [[A]], [[B]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
+
+; SI-DENORM-SLOWFMAF-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DENORM-SLOWFMAF-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define amdgpu_kernel void @no_combine_to_mad_f32_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %is_fast) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
@@ -89,7 +152,6 @@ define amdgpu_kernel void @combine_to_mad_f32_0_2use(ptr addrspace(1) noalias %o
%mul = fmul float %a, %b
%fma0 = fadd float %mul, %c
%fma1 = fadd float %mul, %d
-
store volatile float %fma0, ptr addrspace(1) %gep.out.0
store volatile float %fma1, ptr addrspace(1) %gep.out.1
ret void
@@ -120,8 +182,8 @@ define amdgpu_kernel void @combine_to_mad_f32_1(ptr addrspace(1) noalias %out, p
%b = load volatile float, ptr addrspace(1) %gep.1
%c = load volatile float, ptr addrspace(1) %gep.2
- %mul = fmul float %a, %b
- %fma = fadd float %c, %mul
+ %mul = fmul contract float %a, %b
+ %fma = fadd contract float %c, %mul
store float %fma, ptr addrspace(1) %gep.out
ret void
}
@@ -150,8 +212,8 @@ define amdgpu_kernel void @combine_to_mad_fsub_0_f32(ptr addrspace(1) noalias %o
%b = load volatile float, ptr addrspace(1) %gep.1
%c = load volatile float, ptr addrspace(1) %gep.2
- %mul = fmul float %a, %b
- %fma = fsub float %mul, %c
+ %mul = fmul contract float %a, %b
+ %fma = fsub contract float %mul, %c
store float %fma, ptr addrspace(1) %gep.out
ret void
}
@@ -190,9 +252,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(ptr addrspace(1) noali
%c = load volatile float, ptr addrspace(1) %gep.2
%d = load volatile float, ptr addrspace(1) %gep.3
- %mul = fmul float %a, %b
- %fma0 = fsub float %mul, %c
- %fma1 = fsub float %mul, %d
+ %mul = fmul contract float %a, %b
+ %fma0 = fsub contract float %mul, %c
+ %fma1 = fsub contract float %mul, %d
store volatile float %fma0, ptr addrspace(1) %gep.out.0
store volatile float %fma1, ptr addrspace(1) %gep.out.1
ret void
@@ -222,8 +284,8 @@ define amdgpu_kernel void @combine_to_mad_fsub_1_f32(ptr addrspace(1) noalias %o
%b = load volatile float, ptr addrspace(1) %gep.1
%c = load volatile float, ptr addrspace(1) %gep.2
- %mul = fmul float %a, %b
- %fma = fsub float %c, %mul
+ %mul = fmul contract float %a, %b
+ %fma = fsub contract float %c, %mul
store float %fma, ptr addrspace(1) %gep.out
ret void
}
@@ -262,9 +324,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(ptr addrspace(1) noali
%c = load volatile float, ptr addrspace(1) %gep.2
%d = load volatile float, ptr addrspace(1) %gep.3
- %mul = fmul float %a, %b
- %fma0 = fsub float %c, %mul
- %fma1 = fsub float %d, %mul
+ %mul = fmul contract float %a, %b
+ %fma0 = fsub contract float %c, %mul
+ %fma1 = fsub contract float %d, %mul
store volatile float %fma0, ptr addrspace(1) %gep.out.0
store volatile float %fma1, ptr addrspace(1) %gep.out.1
ret void
@@ -295,9 +357,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32(ptr addrspace(1) noalias %o
%b = load volatile float, ptr addrspace(1) %gep.1
%c = load volatile float, ptr addrspace(1) %gep.2
- %mul = fmul float %a, %b
- %mul.neg = fneg float %mul
- %fma = fsub float %mul.neg, %c
+ %mul = fmul contract float %a, %b
+ %mul.neg = fneg contract float %mul
+ %fma = fsub contract float %mul.neg, %c
store float %fma, ptr addrspace(1) %gep.out
ret void
@@ -337,10 +399,10 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(ptr addrspace(1)
%c = load volatile float, ptr addrspace(1) %gep.2
%d = load volatile float, ptr addrspace(1) %gep.3
- %mul = fmul float %a, %b
- %mul.neg = fneg float %mul
- %fma0 = fsub float %mul.neg, %c
- %fma1 = fsub float %mul.neg, %d
+ %mul = fmul contract float %a, %b
+ %mul.neg = fneg contract float %mul
+ %fma0 = fsub contract float %mul.neg, %c
+ %fma1 = fsub contract float %mul.neg, %d
store volatile float %fma0, ptr addrspace(1) %gep.out.0
store volatile float %fma1, ptr addrspace(1) %gep.out.1
@@ -381,10 +443,10 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(ptr addrspace(1)
%c = load volatile float, ptr addrspace(1) %gep.2
%d = load volatile float, ptr addrspace(1) %gep.3
- %mul = fmul float %a, %b
- %mul.neg = fneg float %mul
- %fma0 = fsub float %mul.neg, %c
- %fma1 = fsub float %mul, %d
+ %mul = fmul contract float %a, %b
+ %mul.neg = fneg contract float %mul
+ %fma0 = fsub contract float %mul.neg, %c
+ %fma1 = fsub contract float %mul, %d
store volatile float %fma0, ptr addrspace(1) %gep.out.0
store volatile float %fma1, ptr addrspace(1) %gep.out.1
@@ -412,7 +474,7 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(ptr addrspace(1)
; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %is_aggressive) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
@@ -427,10 +489,22 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(ptr addrspace(1)
%u = load volatile float, ptr addrspace(1) %gep.3
%v = load volatile float, ptr addrspace(1) %gep.4
- %tmp0 = fmul float %u, %v
- %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0
- %tmp2 = fsub float %tmp1, %z
+ br i1 %is_aggressive, label %aggressive, label %normal
+
+normal:
+ %tmp0_normal = fmul float %u, %v
+ %tmp1_normal = call float @llvm.fma.f32(float %x, float %y, float %tmp0_normal) #0
+ %tmp2_normal = fsub float %tmp1_normal, %z
+ br label %exit
+aggressive:
+ %tmp0_aggressive = fmul contract reassoc float %u, %v
+ %tmp1_aggressive = call contract reassoc float @llvm.fma.f32(float %x, float %y, float %tmp0_aggressive) #0
+ %tmp2_aggressive = fsub contract reassoc float %tmp1_aggressive, %z
+ br label %exit
+
+exit:
+ %tmp2 = phi float [%tmp2_normal, %normal], [%tmp2_aggressive, %aggressive]
store float %tmp2, ptr addrspace(1) %gep.out
ret void
}
@@ -505,7 +579,7 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(ptr addrspace(1)
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
-define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %is_aggressive) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
@@ -520,10 +594,22 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(ptr addrspace(1)
%u = load volatile float, ptr addrspace(1) %gep.3
%v = load volatile float, ptr addrspace(1) %gep.4
- %tmp0 = fmul float %u, %v
- %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0
- %tmp2 = fsub float %tmp1, %z
+ br i1 %is_aggressive, label %aggressive, label %normal
+normal:
+ %tmp0_normal = fmul float %u, %v
+ %tmp1_normal = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0_normal) #0
+ %tmp2_normal = fsub float %tmp1_normal, %z
+ br label %exit
+
+aggressive:
+ %tmp0_aggressive = fmul contract reassoc float %u, %v
+ %tmp1_aggressive = call contract reassoc float @llvm.fmuladd.f32(float %x, float %y, float %tmp0_aggressive) #0
+ %tmp2_aggressive = fsub contract reassoc float %tmp1_aggressive, %z
+ br label %exit
+
+exit:
+ %tmp2 = phi float [%tmp2_normal, %normal], [%tmp2_aggressive, %aggressive]
store float %tmp2, ptr addrspace(1) %gep.out
ret void
}
@@ -556,7 +642,7 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(ptr addrspace(1)
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
-define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %is_aggressive) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
@@ -571,11 +657,23 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(ptr addrspace(1)
%u = load volatile float, ptr addrspace(1) %gep.3
%v = load volatile float, ptr addrspace(1) %gep.4
- ; nsz flag is needed since this combine may change sign of zero
- %tmp0 = fmul nsz float %u, %v
- %tmp1 = call nsz float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
- %tmp2 = fsub nsz float %x, %tmp1
+ br i1 %is_aggressive, label %aggressive, label %normal
+normal:
+ ; nsz flag is needed since this combine may change sign of zero
+ %tmp0_normal = fmul nsz float %u, %v
+ %tmp1_normal = call nsz float @llvm.fmuladd.f32(float %y, float %z, float %tmp0_normal) #0
+ %tmp2_normal = fsub nsz float %x, %tmp1_normal
+ br label %exit
+
+aggressive:
+ %tmp0_aggressive = fmul contract reassoc nsz float %u, %v
+ %tmp1_aggressive = call contract reassoc nsz float @llvm.fmuladd.f32(float %y, float %z, float %tmp0_aggressive) #0
+ %tmp2_aggressive = fsub contract reassoc nsz float %x, %tmp1_aggressive
+ br label %exit
+
+exit:
+ %tmp2 = phi float [%tmp2_normal, %normal], [%tmp2_aggressive, %aggressive]
store float %tmp2, ptr addrspace(1) %gep.out
ret void
}
diff --git a/llvm/test/CodeGen/PowerPC/fma-combine.ll b/llvm/test/CodeGen/PowerPC/fma-combine.ll
index 3d45e9a3a509c..456f85ad3eefd 100644
--- a/llvm/test/CodeGen/PowerPC/fma-combine.ll
+++ b/llvm/test/CodeGen/PowerPC/fma-combine.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -enable-no-signed-zeros-fp-math \
-; RUN: -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CHECK-FAST %s
+; RUN: < %s | FileCheck -check-prefix=CHECK-FAST %s
; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -enable-no-signed-zeros-fp-math \
-; RUN: -enable-unsafe-fp-math -mattr=-vsx < %s | FileCheck -check-prefix=CHECK-FAST-NOVSX %s
+; RUN: -mattr=-vsx < %s | FileCheck -check-prefix=CHECK-FAST-NOVSX %s
; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
define dso_local double @fma_combine1(double %a, double %b, double %c) {
@@ -19,13 +19,12 @@ define dso_local double @fma_combine1(double %a, double %b, double %c) {
; CHECK-LABEL: fma_combine1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xsnegdp 0, 3
-; CHECK-NEXT: xsmuldp 0, 0, 2
-; CHECK-NEXT: xssubdp 1, 0, 1
+; CHECK-NEXT: xsmsubadp 1, 0, 2
; CHECK-NEXT: blr
entry:
%fneg1 = fneg double %c
- %mul = fmul double %fneg1, %b
- %add = fsub double %mul, %a
+ %mul = fmul contract double %fneg1, %b
+ %add = fsub contract double %mul, %a
ret double %add
}
@@ -43,13 +42,12 @@ define dso_local double @fma_combine2(double %a, double %b, double %c) {
; CHECK-LABEL: fma_combine2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xsnegdp 0, 3
-; CHECK-NEXT: xsmuldp 0, 2, 0
-; CHECK-NEXT: xssubdp 1, 0, 1
+; CHECK-NEXT: xsmsubadp 1, 2, 0
; CHECK-NEXT: blr
entry:
%fneg1 = fneg double %c
- %mul = fmul double %b, %fneg1
- %add = fsub double %mul, %a
+ %mul = fmul contract double %b, %fneg1
+ %add = fsub contract double %mul, %a
ret double %add
}
@@ -85,17 +83,16 @@ define dso_local double @fma_combine_two_uses(double %a, double %b, double %c) {
; CHECK-NEXT: stfd 0, v at toc@l(3)
; CHECK-NEXT: xsnegdp 0, 3
; CHECK-NEXT: addis 3, 2, z at toc@ha
+; CHECK-NEXT: xsmsubadp 1, 0, 2
; CHECK-NEXT: stfd 0, z at toc@l(3)
-; CHECK-NEXT: xsmuldp 0, 0, 2
-; CHECK-NEXT: xssubdp 1, 0, 1
; CHECK-NEXT: blr
entry:
%fneg = fneg double %a
store double %fneg, ptr @v, align 8
%fneg1 = fneg double %c
store double %fneg1, ptr @z, align 8
- %mul = fmul double %fneg1, %b
- %add = fsub double %mul, %a
+ %mul = fmul contract double %fneg1, %b
+ %add = fsub contract double %mul, %a
ret double %add
}
@@ -122,15 +119,14 @@ define dso_local double @fma_combine_one_use(double %a, double %b, double %c) {
; CHECK-NEXT: addis 3, 2, v at toc@ha
; CHECK-NEXT: stfd 0, v at toc@l(3)
; CHECK-NEXT: xsnegdp 0, 3
-; CHECK-NEXT: xsmuldp 0, 0, 2
-; CHECK-NEXT: xssubdp 1, 0, 1
+; CHECK-NEXT: xsmsubadp 1, 0, 2
; CHECK-NEXT: blr
entry:
%fneg = fneg double %a
store double %fneg, ptr @v, align 8
%fneg1 = fneg double %c
- %mul = fmul double %fneg1, %b
- %add = fsub double %mul, %a
+ %mul = fmul contract double %fneg1, %b
+ %add = fsub contract double %mul, %a
ret double %add
}
@@ -327,15 +323,12 @@ define dso_local double @fma_combine_const(double %a, double %b) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addis 3, 2, .LCPI9_0 at toc@ha
; CHECK-NEXT: lfd 0, .LCPI9_0 at toc@l(3)
-; CHECK-NEXT: addis 3, 2, .LCPI9_1 at toc@ha
-; CHECK-NEXT: xsmuldp 0, 1, 0
-; CHECK-NEXT: lfd 1, .LCPI9_1 at toc@l(3)
-; CHECK-NEXT: xsmaddadp 2, 0, 1
+; CHECK-NEXT: xsmaddadp 2, 1, 0
; CHECK-NEXT: fmr 1, 2
; CHECK-NEXT: blr
entry:
- %0 = fmul double %a, 1.1
- %1 = call contract double @llvm.fma.f64(double %0, double 2.1, double %b)
+ %0 = fmul reassoc double %a, 1.1
+ %1 = call contract reassoc double @llvm.fma.f64(double %0, double 2.1, double %b)
ret double %1
}
From 8aa53e4470f3b7c79f7a2acb04526e4bb9d22f98 Mon Sep 17 00:00:00 2001
From: PaperChalice <liujunchang97 at outlook.com>
Date: Thu, 26 Jun 2025 12:24:00 +0800
Subject: [PATCH 2/3] remove `UnsafeFPMath` usage in
`visitFMULForFMADistributiveCombine`
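Same treatment for the distributive combine: isContractableFMUL no longer
treats UnsafeFPMath as a blanket opt-in, and FMAD availability is decided by
legality alone. A small IR sketch of the pattern this combine targets, assuming
contract flags on both instructions (the function name is illustrative):

  ; (%x + 1.0) * %y is a candidate for fma %x, %y, %y when both operations
  ; allow contraction, with no global unsafe-math setting involved.
  define float @mul_add_x_one_y(float %x, float %y) {
    %a = fadd contract float %x, 1.0
    %m = fmul contract float %a, %y
    ret float %m
  }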
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9e6e81e2c0dee..7cf3a8f7a07b7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16737,7 +16737,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
static bool isContractableFMUL(const TargetOptions &Options, SDValue N) {
assert(N.getOpcode() == ISD::FMUL);
- return Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
+ return Options.AllowFPOpFusion == FPOpFusion::Fast ||
N->getFlags().hasAllowContract();
}
@@ -17338,8 +17338,7 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
// Floating-point multiply-add with intermediate rounding. This can result
// in a less precise result due to the changed rounding order.
- bool HasFMAD = Options.UnsafeFPMath &&
- (LegalOperations && TLI.isFMADLegal(DAG, N));
+ bool HasFMAD = LegalOperations && TLI.isFMADLegal(DAG, N);
// No valid opcode, do not combine.
if (!HasFMAD && !HasFMA)
From ca8c2e90f171944ab26d0175e0cbc5a697390187 Mon Sep 17 00:00:00 2001
From: PaperChalice <liujunchang97 at outlook.com>
Date: Thu, 26 Jun 2025 13:24:47 +0800
Subject: [PATCH 3/3] remove `UnsafeFPMath` usage in `visitFDIV`
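The reciprocal folds in visitFDIV now key off the instruction's own flags:
turning a divide by a constant whose reciprocal is inexact into a multiply
requires arcp, and the X/sqrt(X) -> sqrt(X) fold requires reassoc (together
with nsz or the matching global no-signed-zeros option). A small IR sketch
under those assumptions (function names and constants are illustrative):

  ; Rewritten to a multiply by the (inexact) reciprocal only because of arcp.
  define float @recip_by_three(float %x) {
    %d = fdiv arcp float %x, 3.0
    ret float %d
  }

  ; X / sqrt(X) -> sqrt(X) now needs reassoc (plus nsz) on the fdiv.
  define float @div_by_own_sqrt(float %x) {
    %s = call float @llvm.sqrt.f32(float %x)
    %d = fdiv reassoc nsz float %x, %s
    ret float %d
  }

  declare float @llvm.sqrt.f32(float)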
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 7cf3a8f7a07b7..6c7b1499664b7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18237,8 +18237,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
// Only do the transform if the reciprocal is a legal fp immediate that
// isn't too nasty (eg NaN, denormal, ...).
if (((st == APFloat::opOK && !Recip.isDenormal()) ||
- (st == APFloat::opInexact &&
- (Options.UnsafeFPMath || Flags.hasAllowReciprocal()))) &&
+ (st == APFloat::opInexact && Flags.hasAllowReciprocal())) &&
(!LegalOperations ||
// FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
// backend)... we should handle this gracefully after Legalize.
@@ -18249,7 +18248,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
DAG.getConstantFP(Recip, DL, VT));
}
- if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
+ if (Flags.hasAllowReciprocal()) {
// If this FDIV is part of a reciprocal square root, it may be folded
// into a target-specific square root estimate instruction.
if (N1.getOpcode() == ISD::FSQRT) {
@@ -18324,7 +18323,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
// Fold X/Sqrt(X) -> Sqrt(X)
if ((Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) &&
- (Options.UnsafeFPMath || Flags.hasAllowReassociation()))
+ Flags.hasAllowReassociation())
if (N1.getOpcode() == ISD::FSQRT && N0 == N1.getOperand(0))
return N1;