[llvm] [DAGCombiner] Remove `UnsafeFPMath` usage in `visitFSUBForFMACombine` etc. (PR #145637)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 26 02:09:20 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: None (paperchalice)
<details>
<summary>Changes</summary>
Remove `UnsafeFPMath` in `visitFMULForFMADistributiveCombine`, `visitFSUBForFMACombine` and `visitFDIV`.
All affected tests are fixed by adding fast-math flags manually.
Propagate fast-math flags to the fdiv emitted when lowering frem in the NVPTX backend, so it can produce an optimized DAG when `unsafe-fp-math` is absent.
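For context, a minimal sketch of the pattern the updated tests now rely on: fusion is requested with per-instruction `contract` flags rather than the module-wide `unsafe-fp-math` setting (the function and value names below are illustrative, not taken from the patch):

```llvm
; Multiply-add written with per-instruction contract flags; with UnsafeFPMath
; no longer consulted, these flags (or -fp-contract=fast) are what allow the
; DAGCombiner to form an FMA here.
define double @fmuladd_contract(double %a, double %b, double %c) {
  %mul = fmul contract double %a, %b
  %add = fadd contract double %mul, %c
  ret double %add
}
```

Without such flags, the fusion paths touched here fall back to requiring `FPOpFusion::Fast` (`-fp-contract=fast`) or a legal FMAD, per the updated checks in the diff.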
---
Patch is 276.58 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/145637.diff
12 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+9-11)
- (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (+8-7)
- (modified) llvm/test/CodeGen/AMDGPU/fma-combine.ll (+192-150)
- (modified) llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/mad-combine.ll (+144-46)
- (added) llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll (+1956)
- (modified) llvm/test/CodeGen/AMDGPU/rsq.f32.ll (+365-1829)
- (modified) llvm/test/CodeGen/NVPTX/frem.ll (+32-5)
- (modified) llvm/test/CodeGen/NVPTX/sqrt-approx.ll (+8-8)
- (modified) llvm/test/CodeGen/PowerPC/fma-combine.ll (+17-24)
- (modified) llvm/test/CodeGen/X86/change-unsafe-fp-math.ll (+24-10)
- (modified) llvm/test/CodeGen/X86/fdiv.ll (+2-2)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 91f696e8fe88e..6c7b1499664b7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16737,7 +16737,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
static bool isContractableFMUL(const TargetOptions &Options, SDValue N) {
assert(N.getOpcode() == ISD::FMUL);
- return Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
+ return Options.AllowFPOpFusion == FPOpFusion::Fast ||
N->getFlags().hasAllowContract();
}
@@ -17010,8 +17010,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
return SDValue();
const SDNodeFlags Flags = N->getFlags();
- bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
- Options.UnsafeFPMath || HasFMAD);
+ bool AllowFusionGlobally =
+ (Options.AllowFPOpFusion == FPOpFusion::Fast || HasFMAD);
// If the subtraction is not contractable, do not combine.
if (!AllowFusionGlobally && !N->getFlags().hasAllowContract())
@@ -17167,7 +17167,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
}
auto isReassociable = [&Options](SDNode *N) {
- return Options.UnsafeFPMath || N->getFlags().hasAllowReassociation();
+ return N->getFlags().hasAllowReassociation();
};
auto isContractableAndReassociableFMUL = [&isContractableFMUL,
@@ -17181,7 +17181,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// More folding opportunities when target permits.
if (Aggressive && isReassociable(N)) {
- bool CanFuse = Options.UnsafeFPMath || N->getFlags().hasAllowContract();
+ bool CanFuse = N->getFlags().hasAllowContract();
// fold (fsub (fma x, y, (fmul u, v)), z)
// -> (fma x, y (fma u, v, (fneg z)))
if (CanFuse && isFusedOp(N0) &&
@@ -17338,8 +17338,7 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
// Floating-point multiply-add with intermediate rounding. This can result
// in a less precise result due to the changed rounding order.
- bool HasFMAD = Options.UnsafeFPMath &&
- (LegalOperations && TLI.isFMADLegal(DAG, N));
+ bool HasFMAD = LegalOperations && TLI.isFMADLegal(DAG, N);
// No valid opcode, do not combine.
if (!HasFMAD && !HasFMA)
@@ -18238,8 +18237,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
// Only do the transform if the reciprocal is a legal fp immediate that
// isn't too nasty (eg NaN, denormal, ...).
if (((st == APFloat::opOK && !Recip.isDenormal()) ||
- (st == APFloat::opInexact &&
- (Options.UnsafeFPMath || Flags.hasAllowReciprocal()))) &&
+ (st == APFloat::opInexact && Flags.hasAllowReciprocal())) &&
(!LegalOperations ||
// FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
// backend)... we should handle this gracefully after Legalize.
@@ -18250,7 +18248,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
DAG.getConstantFP(Recip, DL, VT));
}
- if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
+ if (Flags.hasAllowReciprocal()) {
// If this FDIV is part of a reciprocal square root, it may be folded
// into a target-specific square root estimate instruction.
if (N1.getOpcode() == ISD::FSQRT) {
@@ -18325,7 +18323,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
// Fold X/Sqrt(X) -> Sqrt(X)
if ((Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) &&
- (Options.UnsafeFPMath || Flags.hasAllowReassociation()))
+ Flags.hasAllowReassociation())
if (N1.getOpcode() == ISD::FSQRT && N0 == N1.getOperand(0))
return N1;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index d2fafe854e9e4..9b43c6e326bf2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2857,15 +2857,16 @@ static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG,
SDValue X = Op->getOperand(0);
SDValue Y = Op->getOperand(1);
EVT Ty = Op.getValueType();
+ SDNodeFlags Flags = Op->getFlags();
- SDValue Div = DAG.getNode(ISD::FDIV, DL, Ty, X, Y);
- SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, Ty, Div);
- SDValue Mul =
- DAG.getNode(ISD::FMUL, DL, Ty, Trunc, Y, SDNodeFlags::AllowContract);
- SDValue Sub =
- DAG.getNode(ISD::FSUB, DL, Ty, X, Mul, SDNodeFlags::AllowContract);
+ SDValue Div = DAG.getNode(ISD::FDIV, DL, Ty, X, Y, Flags);
+ SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, Ty, Div, Flags);
+ SDValue Mul = DAG.getNode(ISD::FMUL, DL, Ty, Trunc, Y,
+ Flags | SDNodeFlags::AllowContract);
+ SDValue Sub = DAG.getNode(ISD::FSUB, DL, Ty, X, Mul,
+ Flags | SDNodeFlags::AllowContract);
- if (AllowUnsafeFPMath || Op->getFlags().hasNoInfs())
+ if (AllowUnsafeFPMath || Flags.hasNoInfs())
return Sub;
// If Y is infinite, return X
diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
index a96d022b66f12..c79cf87712dc0 100644
--- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
@@ -1,11 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-FMA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-NOFMA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FMA %s
-
-; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs -enable-no-infs-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-FMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-NOFMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=ieee -verify-machineinstrs -enable-no-infs-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FMA %s
; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
; beneficial even without fp32 denormals, but they do require no-infs-fp-math
@@ -65,8 +63,8 @@ define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, p
%b = load volatile double, ptr addrspace(1) %gep.1
%c = load volatile double, ptr addrspace(1) %gep.2
- %mul = fmul double %a, %b
- %fma = fadd double %mul, %c
+ %mul = fmul contract double %a, %b
+ %fma = fadd contract double %mul, %c
store double %fma, ptr addrspace(1) %gep.out
ret void
}
@@ -134,9 +132,9 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %o
%c = load volatile double, ptr addrspace(1) %gep.2
%d = load volatile double, ptr addrspace(1) %gep.3
- %mul = fmul double %a, %b
- %fma0 = fadd double %mul, %c
- %fma1 = fadd double %mul, %d
+ %mul = fmul contract double %a, %b
+ %fma0 = fadd contract double %mul, %c
+ %fma1 = fadd contract double %mul, %d
store volatile double %fma0, ptr addrspace(1) %gep.out.0
store volatile double %fma1, ptr addrspace(1) %gep.out.1
ret void
@@ -190,8 +188,8 @@ define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, p
%b = load volatile double, ptr addrspace(1) %gep.1
%c = load volatile double, ptr addrspace(1) %gep.2
- %mul = fmul double %a, %b
- %fma = fadd double %c, %mul
+ %mul = fmul contract double %a, %b
+ %fma = fadd contract double %c, %mul
store double %fma, ptr addrspace(1) %gep.out
ret void
}
@@ -244,8 +242,8 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %o
%b = load volatile double, ptr addrspace(1) %gep.1
%c = load volatile double, ptr addrspace(1) %gep.2
- %mul = fmul double %a, %b
- %fma = fsub double %mul, %c
+ %mul = fmul contract double %a, %b
+ %fma = fsub contract double %mul, %c
store double %fma, ptr addrspace(1) %gep.out
ret void
}
@@ -313,9 +311,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noali
%c = load volatile double, ptr addrspace(1) %gep.2
%d = load volatile double, ptr addrspace(1) %gep.3
- %mul = fmul double %a, %b
- %fma0 = fsub double %mul, %c
- %fma1 = fsub double %mul, %d
+ %mul = fmul contract double %a, %b
+ %fma0 = fsub contract double %mul, %c
+ %fma1 = fsub contract double %mul, %d
store volatile double %fma0, ptr addrspace(1) %gep.out.0
store volatile double %fma1, ptr addrspace(1) %gep.out.1
ret void
@@ -369,8 +367,8 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %o
%b = load volatile double, ptr addrspace(1) %gep.1
%c = load volatile double, ptr addrspace(1) %gep.2
- %mul = fmul double %a, %b
- %fma = fsub double %c, %mul
+ %mul = fmul contract double %a, %b
+ %fma = fsub contract double %c, %mul
store double %fma, ptr addrspace(1) %gep.out
ret void
}
@@ -438,9 +436,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noali
%c = load volatile double, ptr addrspace(1) %gep.2
%d = load volatile double, ptr addrspace(1) %gep.3
- %mul = fmul double %a, %b
- %fma0 = fsub double %c, %mul
- %fma1 = fsub double %d, %mul
+ %mul = fmul contract double %a, %b
+ %fma0 = fsub contract double %c, %mul
+ %fma1 = fsub contract double %d, %mul
store volatile double %fma0, ptr addrspace(1) %gep.out.0
store volatile double %fma1, ptr addrspace(1) %gep.out.1
ret void
@@ -494,9 +492,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %o
%b = load volatile double, ptr addrspace(1) %gep.1
%c = load volatile double, ptr addrspace(1) %gep.2
- %mul = fmul double %a, %b
- %mul.neg = fsub double -0.0, %mul
- %fma = fsub double %mul.neg, %c
+ %mul = fmul contract double %a, %b
+ %mul.neg = fsub contract double -0.0, %mul
+ %fma = fsub contract double %mul.neg, %c
store double %fma, ptr addrspace(1) %gep.out
ret void
@@ -565,10 +563,10 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1)
%c = load volatile double, ptr addrspace(1) %gep.2
%d = load volatile double, ptr addrspace(1) %gep.3
- %mul = fmul double %a, %b
- %mul.neg = fsub double -0.0, %mul
- %fma0 = fsub double %mul.neg, %c
- %fma1 = fsub double %mul.neg, %d
+ %mul = fmul contract double %a, %b
+ %mul.neg = fsub contract double -0.0, %mul
+ %fma0 = fsub contract double %mul.neg, %c
+ %fma1 = fsub contract double %mul.neg, %d
store volatile double %fma0, ptr addrspace(1) %gep.out.0
store volatile double %fma1, ptr addrspace(1) %gep.out.1
@@ -638,10 +636,10 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1)
%c = load volatile double, ptr addrspace(1) %gep.2
%d = load volatile double, ptr addrspace(1) %gep.3
- %mul = fmul double %a, %b
- %mul.neg = fsub double -0.0, %mul
- %fma0 = fsub double %mul.neg, %c
- %fma1 = fsub double %mul, %d
+ %mul = fmul contract double %a, %b
+ %mul.neg = fsub contract double -0.0, %mul
+ %fma0 = fsub contract double %mul.neg, %c
+ %fma1 = fsub contract double %mul, %d
store volatile double %fma0, ptr addrspace(1) %gep.out.0
store volatile double %fma1, ptr addrspace(1) %gep.out.1
@@ -650,32 +648,6 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1)
; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
-; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
-; SI-NOFMA: ; %bb.0:
-; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000
-; SI-NOFMA-NEXT: s_mov_b32 s6, 0
-; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; SI-NOFMA-NEXT: v_mov_b32_e32 v1, 0
-; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NOFMA-NEXT: s_mov_b64 s[4:5], s[2:3]
-; SI-NOFMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
-; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT: s_mov_b64 s[2:3], s[6:7]
-; SI-NOFMA-NEXT: v_mul_f64 v[8:9], v[8:9], v[10:11]
-; SI-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; SI-NOFMA-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7]
-; SI-NOFMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
-; SI-NOFMA-NEXT: s_endpgm
-;
; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
; SI-FMA: ; %bb.0:
; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -701,30 +673,6 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1)
; SI-FMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-FMA-NEXT: s_endpgm
;
-; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
-; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0
-; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
-; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9]
-; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
-; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
-; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1]
-; GFX11-NOFMA-NEXT: s_endpgm
-;
; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -761,18 +709,16 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1)
%u = load volatile double, ptr addrspace(1) %gep.3
%v = load volatile double, ptr addrspace(1) %gep.4
- %tmp0 = fmul double %u, %v
- %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
- %tmp2 = fsub double %tmp1, %z
+ %tmp0 = fmul contract fast double %u, %v
+ %tmp1 = call contract fast double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
+ %tmp2 = fsub contract fast double %tmp1, %z
store double %tmp2, ptr addrspace(1) %gep.out
ret void
}
-; fold (fsub x, (fma y, z, (fmul u, v)))
-; -> (fma (fneg y), z, (fma (fneg u), v, x))
-define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
-; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
+define amdgpu_kernel void @no_aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+; SI-NOFMA-LABEL: no_aggressive_combine_to_fma_fsub_0_f64:
; SI-NOFMA: ; %bb.0:
; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000
@@ -793,11 +739,59 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1)
; SI-NOFMA-NEXT: s_waitcnt vmcnt(0)
; SI-NOFMA-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NOFMA-NEXT: v_mul_f64 v[8:9], v[8:9], v[10:11]
-; SI-NOFMA-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
-; SI-NOFMA-NEXT: v_add_f64 v[2:3], v[2:3], -v[4:5]
+; SI-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; SI-NOFMA-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7]
; SI-NOFMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NOFMA-NEXT: s_endpgm
;
+; GFX11-NOFMA-LABEL: no_aggressive_combine_to_fma_fsub_0_f64:
+; GFX11-NOFMA: ; %bb.0:
+; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0
+; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
+; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9]
+; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
+; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
+; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1]
+; GFX11-NOFMA-NEXT: s_endpgm
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+ %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
+ %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
+ %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
+ %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
+ %gep.4 = getelementptr double, ptr addrspace(1) %gep.0, i32 4
+ %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid
+
+ %x = load volatile double, ptr addrspace(1) %gep.0
+ %y = load volatile double, ptr addrspace(1) %gep.1
+ %z = load volatile double, ptr addrspace(1) %gep.2
+ %u = load volatile double, ptr addrspace(1) %gep.3
+ %v = load volatile double, ptr addrspace(1) %gep.4
+
+ %tmp0 = fmul double %u, %v
+ %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
+ %tmp2 = fsub double %tmp1, %z
+
+ store double %tmp2, ptr addrs...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/145637
More information about the llvm-commits mailing list