[llvm] [SelectionDAG] Remove UnsafeFPMath check in `visitFADDForFMACombine` (PR #127770)

Wed Feb 19 05:15:32 PST 2025

https://github.com/paperchalice updated https://github.com/llvm/llvm-project/pull/127770

>From 1044329cd31af4762a5c0fb42afa3c86589bf633 Mon Sep 17 00:00:00 2001
From: PaperChalice <liujunchang97 at outlook.com>
Date: Wed, 19 Feb 2025 16:57:20 +0800
Subject: [PATCH 1/5] [SelectionDAG] Remove UnsafeFPMath check in
 `visitFADDForFMACombine`

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp        |  4 ++--
 llvm/test/CodeGen/AMDGPU/fdot2.ll                    |  8 ++++----
 llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll        | 12 ++++++------
 llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll        |  8 ++++----
 llvm/test/CodeGen/NVPTX/fma-relu-contract.ll         | 10 +++++-----
 llvm/test/CodeGen/PowerPC/fmf-propagation.ll         |  4 ++--
 .../CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll   |  2 +-
 llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll           |  2 +-
 llvm/test/CodeGen/X86/sqrt-fastmath.ll               |  6 +++---
 9 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f4caaf426de6a..00f83f4a100d4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16320,8 +16320,8 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
   if (!HasFMAD && !HasFMA)
     return SDValue();
 
-  bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
-                              Options.UnsafeFPMath || HasFMAD);
+  bool AllowFusionGlobally =
+      Options.AllowFPOpFusion == FPOpFusion::Fast || HasFMAD;
   // If the addition is not contractable, do not combine.
   if (!AllowFusionGlobally && !N->getFlags().hasAllowContract())
     return SDValue();
diff --git a/llvm/test/CodeGen/AMDGPU/fdot2.ll b/llvm/test/CodeGen/AMDGPU/fdot2.ll
index 776816d6aa0e3..15e161d718e80 100644
--- a/llvm/test/CodeGen/AMDGPU/fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdot2.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX900
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX900
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-CONTRACT
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-DENORM-CONTRACT
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index b32630a97b3ad..610cda4933adf 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -enable-var-scope --check-prefixes=SI,SI-SAFE %s
-; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -enable-var-scope --check-prefixes=SI,SI-NSZ %s
+; RUN: llc -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global -fp-contract=fast < %s | FileCheck -enable-var-scope --check-prefixes=SI,SI-SAFE %s
+; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global -fp-contract=fast < %s | FileCheck -enable-var-scope --check-prefixes=SI,SI-NSZ %s
 
-; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=VI,VI-SAFE %s
-; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=VI,VI-NSZ %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -fp-contract=fast < %s | FileCheck -enable-var-scope --check-prefixes=VI,VI-SAFE %s
+; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji -fp-contract=fast < %s | FileCheck -enable-var-scope --check-prefixes=VI,VI-NSZ %s
 
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-SAFE %s
-; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-NSZ %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -fp-contract=fast < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-SAFE %s
+; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=gfx1100 -fp-contract=fast < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-NSZ %s
 
 ; --------------------------------------------------------------------------------
 ; fadd tests
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 9a72fe96b5c3a..2ae2047d08a68 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,SI,SI-SAFE %s
-; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,SI,SI-NSZ %s
+; RUN: llc -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global -fp-contract=fast < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,SI,SI-SAFE %s
+; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global -fp-contract=fast < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,SI,SI-NSZ %s
 
-; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,VI,VI-SAFE %s
-; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,VI,VI-NSZ %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -fp-contract=fast < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,VI,VI-SAFE %s
+; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji -fp-contract=fast < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,VI,VI-NSZ %s
 
 ; --------------------------------------------------------------------------------
 ; fadd tests
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
index 7dce894620e6b..eb8538c791a37 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
@@ -1,13 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -fp-contract=fast | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -fp-contract=fast | %ptxas-verify -arch=sm_80 %}
 
 ; Using FTZ should emit fma.ftz.relu for f16, not for bf16
-; RUN: llc < %s -denormal-fp-math-f32=preserve-sign -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK-FTZ
-; RUN: %if ptxas %{ llc < %s -denormal-fp-math-f32=preserve-sign -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
+; RUN: llc < %s -denormal-fp-math-f32=preserve-sign -mtriple=nvptx64 -mcpu=sm_80 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK-FTZ
+; RUN: %if ptxas %{ llc < %s -denormal-fp-math-f32=preserve-sign -fp-contract=fast -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
 
 ; SM < 80 or (which needs PTX version >= 70) should not emit fma{.ftz}.relu
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 | FileCheck %s --check-prefixes=CHECK-SM70
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK-SM70
 
 define half @fma_f16_expanded_no_nans(half %a, half %b, half %c) #0 {
 ; CHECK-LABEL: fma_f16_expanded_no_nans(
diff --git a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll
index 4e72a5ac5ede3..e71f59c79ce4d 100644
--- a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll
+++ b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll
@@ -2,8 +2,8 @@
 ; REQUIRES: asserts
 ; RUN: llc < %s -mtriple=powerpc64le -debug-only=isel -o /dev/null 2>&1                        | FileCheck %s --check-prefix=FMFDEBUG
 ; RUN: llc < %s -mtriple=powerpc64le                                                           | FileCheck %s --check-prefix=FMF
-; RUN: llc < %s -mtriple=powerpc64le -debug-only=isel -o /dev/null 2>&1 -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck %s --check-prefix=GLOBALDEBUG
-; RUN: llc < %s -mtriple=powerpc64le -enable-unsafe-fp-math -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math | FileCheck %s --check-prefix=GLOBAL
+; RUN: llc < %s -mtriple=powerpc64le -debug-only=isel -o /dev/null 2>&1 -enable-unsafe-fp-math -fp-contract=fast -enable-no-nans-fp-math | FileCheck %s --check-prefix=GLOBALDEBUG
+; RUN: llc < %s -mtriple=powerpc64le -enable-unsafe-fp-math -fp-contract=fast -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math | FileCheck %s --check-prefix=GLOBAL
 
 ; Test FP transforms using instruction/node-level fast-math-flags.
 ; We're also checking debug output to verify that FMF is propagated to the newly created nodes.
diff --git a/llvm/test/CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll b/llvm/test/CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll
index 96aa58000f9b7..8999be58a3581 100644
--- a/llvm/test/CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -fp-contract=fast < %s | FileCheck %s
 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
index 2c7da100344b7..5b7f70180caa6 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2,fma -stop-after=finalize-isel 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2,fma -stop-after=finalize-isel -fp-contract=fast 2>&1 | FileCheck %s
 
 declare float @llvm.sqrt.f32(float) #2
 
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index 384f8b832afb9..fed87d55b7e3f 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+avx  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+avx512f  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+sse2 -fp-contract=fast | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+avx -fp-contract=fast  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+avx512f -fp-contract=fast | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
 
 declare double @__sqrt_finite(double)
 declare float @__sqrtf_finite(float)

>From 4ec74b18e66bd96eabf0c330acccef68017930e0 Mon Sep 17 00:00:00 2001
From: PaperChalice <liujunchang97 at outlook.com>
Date: Wed, 19 Feb 2025 18:10:10 +0800
Subject: [PATCH 2/5] Drop "unsafe-fp-math" in some CodeGen tests. Regression
 failures are expected...

---
 llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll        |  2 +-
 llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll        |  2 +-
 llvm/test/CodeGen/NVPTX/fma-relu-contract.ll         |  4 ++--
 .../CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll   |  2 +-
 llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll           |  4 ++--
 llvm/test/CodeGen/X86/sqrt-fastmath.ll               | 12 ++++++------
 6 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index 610cda4933adf..f303d649a91e6 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -4758,6 +4758,6 @@ declare <4 x half> @llvm.fmuladd.v4f16(<4 x half>, <4 x half>, <4 x half>) #1
 
 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
 attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind "unsafe-fp-math"="true" }
+attributes #2 = { nounwind }
 attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }
 attributes #4 = { nounwind "amdgpu-ieee"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 2ae2047d08a68..56b79e05823d6 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -3440,6 +3440,6 @@ declare half @llvm.amdgcn.rcp.f16(half) #1
 
 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
 attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind "unsafe-fp-math"="true" }
+attributes #2 = { nounwind }
 attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }
 attributes #4 = { nounwind "amdgpu-ieee"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
index eb8538c791a37..e433c58e86024 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
@@ -1221,5 +1221,5 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
   ret <2 x bfloat> %3
 }
 
-attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "unsafe-fp-math"="true" }
-attributes #1 = { "unsafe-fp-math"="true" }
+attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
+attributes #1 = { }
diff --git a/llvm/test/CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll b/llvm/test/CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll
index 8999be58a3581..539b563691723 100644
--- a/llvm/test/CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll
@@ -31,7 +31,7 @@ declare double @llvm.sqrt.f64(double) #1
 
 declare signext i32 @p_col_helper(...) #2
 
-attributes #0 = { nounwind "no-infs-fp-math"="true" "no-nans-fp-math"="true" "target-cpu"="pwr7" "unsafe-fp-math"="true" }
+attributes #0 = { nounwind "no-infs-fp-math"="true" "no-nans-fp-math"="true" "target-cpu"="pwr7" }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind }
 
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
index 5b7f70180caa6..df0b2e1185e07 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
@@ -144,6 +144,6 @@ define float @rsqrt_daz(float %f) #1 {
   ret float %div
 }
 
-attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt:2" "denormal-fp-math"="ieee,ieee" }
-attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt:2" "denormal-fp-math"="ieee,preserve-sign" }
+attributes #0 = { "reciprocal-estimates"="sqrt:2" "denormal-fp-math"="ieee,ieee" }
+attributes #1 = { "reciprocal-estimates"="sqrt:2" "denormal-fp-math"="ieee,preserve-sign" }
 attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index fed87d55b7e3f..00ee932e73eb8 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -1005,10 +1005,10 @@ define double @sqrt_simplify_before_recip_order(double %x, ptr %p) nounwind {
   ret double %sqrt_fast
 }
 
-attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!sqrtf,!vec-sqrtf,!divf,!vec-divf" }
-attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" }
+attributes #0 = { "reciprocal-estimates"="!sqrtf,!vec-sqrtf,!divf,!vec-divf" }
+attributes #1 = { "reciprocal-estimates"="sqrt,vec-sqrt" }
 attributes #2 = { nounwind readnone }
-attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,ieee" }
-attributes #4 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee,preserve-sign" }
-attributes #5 = { "unsafe-fp-math"="true" "reciprocal-estimates"="all:0" }
-attributes #6 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,dynamic" }
+attributes #3 = { "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,ieee" }
+attributes #4 = { "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee,preserve-sign" }
+attributes #5 = { "reciprocal-estimates"="all:0" }
+attributes #6 = { "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,dynamic" }

>From dc45d588f3e68699492c69f34ed89308997751be Mon Sep 17 00:00:00 2001
From: PaperChalice <liujunchang97 at outlook.com>
Date: Wed, 19 Feb 2025 20:54:07 +0800
Subject: [PATCH 3/5] drop empty attribute group

---
 llvm/test/CodeGen/NVPTX/fma-relu-contract.ll | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
index e433c58e86024..84508364b14d7 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
@@ -119,7 +119,7 @@ define half @fma_f16_expanded_no_nans_multiple_uses_of_fma(half %a, half %b, hal
   ret half %6
 }
 
-define half @fma_f16_expanded_unsafe_with_nans(half %a, half %b, half %c) #1 {
+define half @fma_f16_expanded_unsafe_with_nans(half %a, half %b, half %c) {
 ; CHECK-LABEL: fma_f16_expanded_unsafe_with_nans(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<7>;
@@ -216,7 +216,7 @@ define half @fma_f16_expanded_maxnum_no_nans(half %a, half %b, half %c) #0 {
   ret half %3
 }
 
-define bfloat @fma_bf16_expanded_unsafe_with_nans(bfloat %a, bfloat %b, bfloat %c) #1 {
+define bfloat @fma_bf16_expanded_unsafe_with_nans(bfloat %a, bfloat %b, bfloat %c) {
 ; CHECK-LABEL: fma_bf16_expanded_unsafe_with_nans(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<7>;
@@ -647,7 +647,7 @@ define <2 x half> @fma_f16x2_expanded_no_nans_multiple_uses_of_fma(<2 x half> %a
   ret <2 x half> %6
 }
 
-define <2 x half> @fma_f16x2_expanded_unsafe_with_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) #1 {
+define <2 x half> @fma_f16x2_expanded_unsafe_with_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
 ; CHECK-LABEL: fma_f16x2_expanded_unsafe_with_nans(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<7>;
@@ -754,7 +754,7 @@ define <2 x half> @fma_f16x2_expanded_maxnum_no_nans(<2 x half> %a, <2 x half> %
   ret <2 x half> %3
 }
 
-define <2 x bfloat> @fma_bf16x2_expanded_unsafe_with_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #1 {
+define <2 x bfloat> @fma_bf16x2_expanded_unsafe_with_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
 ; CHECK-LABEL: fma_bf16x2_expanded_unsafe_with_nans(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<7>;
@@ -1222,4 +1222,3 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
 }
 
 attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
-attributes #1 = { }

>From feeee6eaee37ba627069929781d253b683ad6e13 Mon Sep 17 00:00:00 2001
From: PaperChalice <liujunchang97 at outlook.com>
Date: Wed, 19 Feb 2025 20:55:52 +0800
Subject: [PATCH 4/5] Regenerate tests. DO NOT MERGE: Some fma instructions are
 not optimized!

---
 llvm/test/CodeGen/AMDGPU/fdot2.ll             | 10 +--
 llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 41 +++++----
 llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 89 ++++++++++++++-----
 llvm/test/CodeGen/X86/sqrt-fastmath.ll        | 11 ++-
 4 files changed, 103 insertions(+), 48 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/fdot2.ll b/llvm/test/CodeGen/AMDGPU/fdot2.ll
index 15e161d718e80..56c9517a858b5 100644
--- a/llvm/test/CodeGen/AMDGPU/fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdot2.ll
@@ -1,11 +1,11 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX900
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX900
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-CONTRACT
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-DENORM-CONTRACT
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -mattr="+dot7-insts,-dot10-insts" -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-DOT10-DISABLED
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -mattr="+dot7-insts,-dot10-insts" -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-DOT10-DISABLED
 ; (fadd (fmul S1.x, S2.x), (fadd (fmul (S1.y, S2.y), z))) -> (fdot2 S1, S2, z)
 
 ; Tests to make sure fdot2 is not generated when vector elements of dot-product expressions
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index f303d649a91e6..56890d2e89097 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -530,7 +530,7 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x
 ; VI-SAFE-NEXT:    v_rcp_f16_e32 v0, s1
 ; VI-SAFE-NEXT:    v_mov_b32_e32 v1, s0
 ; VI-SAFE-NEXT:    v_mul_f16_e32 v0, 0, v0
-; VI-SAFE-NEXT:    v_add_f16_e32 v0, 0, v0
+; VI-SAFE-NEXT:    v_fma_f16 v0, v0, 0, 0
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v2, 0x8000, v0
 ; VI-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, s0, v0
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
@@ -557,7 +557,7 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x
 ; GFX11-SAFE-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-SAFE-NEXT:    v_mul_f16_e32 v0, 0, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT:    v_add_f16_e32 v0, 0, v0
+; GFX11-SAFE-NEXT:    v_fma_f16 v0, v0, 0, 0
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v1, 0x8000, v0
 ; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, s0, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -597,13 +597,17 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x
 define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, <4 x i32> %arg) #2 {
 ; SI-SAFE-LABEL: fneg_fadd_0_nsz_f16:
 ; SI-SAFE:       ; %bb.0: ; %.entry
-; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, s0
-; SI-SAFE-NEXT:    s_brev_b32 s0, 1
-; SI-SAFE-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, s1
+; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, s0
+; SI-SAFE-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-SAFE-NEXT:    v_min_legacy_f32_e32 v0, 0, v0
-; SI-SAFE-NEXT:    v_cmp_ngt_f32_e32 vcc, s0, v0
-; SI-SAFE-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-SAFE-NEXT:    v_rcp_f32_e32 v0, v0
+; SI-SAFE-NEXT:    v_fma_f32 v0, v0, 0, 0
+; SI-SAFE-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
+; SI-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v1, vcc
+; SI-SAFE-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
+; SI-SAFE-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; SI-SAFE-NEXT:    ; return to shader part epilog
 ;
 ; SI-NSZ-LABEL: fneg_fadd_0_nsz_f16:
@@ -623,10 +627,13 @@ define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, <
 ;
 ; VI-SAFE-LABEL: fneg_fadd_0_nsz_f16:
 ; VI-SAFE:       ; %bb.0: ; %.entry
-; VI-SAFE-NEXT:    v_mov_b32_e32 v0, 0x8000
+; VI-SAFE-NEXT:    v_rcp_f16_e32 v0, s1
 ; VI-SAFE-NEXT:    v_mov_b32_e32 v1, s0
-; VI-SAFE-NEXT:    v_cmp_ngt_f16_e64 vcc, s0, 0
-; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-SAFE-NEXT:    v_mul_f16_e32 v0, 0, v0
+; VI-SAFE-NEXT:    v_fma_f16 v0, v0, 0, 0
+; VI-SAFE-NEXT:    v_xor_b32_e32 v2, 0x8000, v0
+; VI-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, s0, v0
+; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-SAFE-NEXT:    v_mov_b32_e32 v1, 0x7e00
 ; VI-SAFE-NEXT:    v_cmp_nlt_f16_e32 vcc, 0, v0
 ; VI-SAFE-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
@@ -646,12 +653,16 @@ define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, <
 ;
 ; GFX11-SAFE-LABEL: fneg_fadd_0_nsz_f16:
 ; GFX11-SAFE:       ; %bb.0: ; %.entry
-; GFX11-SAFE-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e64 vcc_lo, s0, 0
+; GFX11-SAFE-NEXT:    v_rcp_f16_e32 v0, s1
+; GFX11-SAFE-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-SAFE-NEXT:    v_mul_f16_e32 v0, 0, v0
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-NEXT:    v_fma_f16 v0, v0, 0, 0
+; GFX11-SAFE-NEXT:    v_xor_b32_e32 v1, 0x8000, v0
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, s0, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, 0x8000, v0, vcc_lo
+; GFX11-SAFE-NEXT:    v_cndmask_b32_e64 v0, v1, s0, vcc_lo
 ; GFX11-SAFE-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, 0, v0
-; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-SAFE-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
 ; GFX11-SAFE-NEXT:    ; return to shader part epilog
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 56b79e05823d6..32d33d9049c2a 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -291,10 +291,14 @@ define amdgpu_ps float @fneg_fadd_0_f32(float inreg %tmp2, float inreg %tmp6, <4
 define amdgpu_ps float @fneg_fadd_0_nsz_f32(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) #2 {
 ; SI-SAFE-LABEL: fneg_fadd_0_nsz_f32:
 ; SI-SAFE:       ; %bb.0: ; %.entry
-; SI-SAFE-NEXT:    v_min_legacy_f32_e64 v0, 0, s0
-; SI-SAFE-NEXT:    s_brev_b32 s0, 1
-; SI-SAFE-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; SI-SAFE-NEXT:    v_rcp_f32_e32 v0, s1
+; SI-SAFE-NEXT:    v_mov_b32_e32 v1, s0
+; SI-SAFE-NEXT:    v_mul_f32_e32 v0, 0, v0
+; SI-SAFE-NEXT:    v_fma_f32 v0, v0, 0, 0
 ; SI-SAFE-NEXT:    v_cmp_ngt_f32_e32 vcc, s0, v0
+; SI-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v1, vcc
+; SI-SAFE-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; SI-SAFE-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
 ; SI-SAFE-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
 ; SI-SAFE-NEXT:    ; return to shader part epilog
 ;
@@ -564,7 +568,7 @@ define amdgpu_ps double @fneg_fadd_0_f64(double inreg %tmp2, double inreg %tmp6,
 ; SI-SAFE-NEXT:    v_mov_b32_e32 v3, s0
 ; SI-SAFE-NEXT:    v_div_fixup_f64 v[0:1], v[0:1], s[2:3], 1.0
 ; SI-SAFE-NEXT:    v_mul_f64 v[0:1], v[0:1], 0
-; SI-SAFE-NEXT:    v_add_f64 v[0:1], v[0:1], 0
+; SI-SAFE-NEXT:    v_fma_f64 v[0:1], v[0:1], 0, 0
 ; SI-SAFE-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[0:1]
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v4, 0x80000000, v1
 ; SI-SAFE-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
@@ -618,7 +622,7 @@ define amdgpu_ps double @fneg_fadd_0_f64(double inreg %tmp2, double inreg %tmp6,
 ; VI-SAFE-NEXT:    v_mov_b32_e32 v2, s1
 ; VI-SAFE-NEXT:    v_div_fixup_f64 v[0:1], v[0:1], s[2:3], 1.0
 ; VI-SAFE-NEXT:    v_mul_f64 v[0:1], v[0:1], 0
-; VI-SAFE-NEXT:    v_add_f64 v[0:1], v[0:1], 0
+; VI-SAFE-NEXT:    v_fma_f64 v[0:1], v[0:1], 0, 0
 ; VI-SAFE-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[0:1]
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v3, 0x80000000, v1
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
@@ -672,17 +676,28 @@ define amdgpu_ps double @fneg_fadd_0_f64(double inreg %tmp2, double inreg %tmp6,
 ; function attribute unsafe-fp-math automatically. Combine with the previous test
 ; when that is done.
 define amdgpu_ps double @fneg_fadd_0_nsz_f64(double inreg %tmp2, double inreg %tmp6, <4 x i32> %arg) #2 {
-; GCN-SAFE-LABEL: fneg_fadd_0_nsz_f64:
-; GCN-SAFE:       ; %bb.0: ; %.entry
-; GCN-SAFE-NEXT:    v_cmp_ngt_f64_e64 s[2:3], s[0:1], 0
-; GCN-SAFE-NEXT:    s_and_b64 s[2:3], s[2:3], exec
-; GCN-SAFE-NEXT:    s_cselect_b32 s1, s1, 0x80000000
-; GCN-SAFE-NEXT:    s_cselect_b32 s0, s0, 0
-; GCN-SAFE-NEXT:    v_cmp_ngt_f64_e64 s[0:1], s[0:1], 0
-; GCN-SAFE-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GCN-SAFE-NEXT:    s_cselect_b32 s1, 0, 0x7ff80000
-; GCN-SAFE-NEXT:    s_mov_b32 s0, 0
-; GCN-SAFE-NEXT:    ; return to shader part epilog
+; SI-SAFE-LABEL: fneg_fadd_0_nsz_f64:
+; SI-SAFE:       ; %bb.0: ; %.entry
+; SI-SAFE-NEXT:    v_rcp_f64_e32 v[0:1], s[2:3]
+; SI-SAFE-NEXT:    v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; SI-SAFE-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; SI-SAFE-NEXT:    v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; SI-SAFE-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; SI-SAFE-NEXT:    v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; SI-SAFE-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; SI-SAFE-NEXT:    v_mov_b32_e32 v2, s1
+; SI-SAFE-NEXT:    v_mul_f64 v[0:1], v[0:1], 0
+; SI-SAFE-NEXT:    v_mov_b32_e32 v3, s0
+; SI-SAFE-NEXT:    v_fma_f64 v[0:1], v[0:1], 0, 0
+; SI-SAFE-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-SAFE-NEXT:    v_xor_b32_e32 v4, 0x80000000, v1
+; SI-SAFE-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; SI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; SI-SAFE-NEXT:    v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
+; SI-SAFE-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-SAFE-NEXT:    s_cselect_b32 s1, 0, 0x7ff80000
+; SI-SAFE-NEXT:    s_mov_b32 s0, 0
+; SI-SAFE-NEXT:    ; return to shader part epilog
 ;
 ; SI-NSZ-LABEL: fneg_fadd_0_nsz_f64:
 ; SI-NSZ:       ; %bb.0: ; %.entry
@@ -707,6 +722,29 @@ define amdgpu_ps double @fneg_fadd_0_nsz_f64(double inreg %tmp2, double inreg %t
 ; SI-NSZ-NEXT:    s_mov_b32 s0, 0
 ; SI-NSZ-NEXT:    ; return to shader part epilog
 ;
+; VI-SAFE-LABEL: fneg_fadd_0_nsz_f64:
+; VI-SAFE:       ; %bb.0: ; %.entry
+; VI-SAFE-NEXT:    v_rcp_f64_e32 v[0:1], s[2:3]
+; VI-SAFE-NEXT:    v_mov_b32_e32 v4, s0
+; VI-SAFE-NEXT:    v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; VI-SAFE-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; VI-SAFE-NEXT:    v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; VI-SAFE-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; VI-SAFE-NEXT:    v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; VI-SAFE-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; VI-SAFE-NEXT:    v_mov_b32_e32 v2, s1
+; VI-SAFE-NEXT:    v_mul_f64 v[0:1], v[0:1], 0
+; VI-SAFE-NEXT:    v_fma_f64 v[0:1], v[0:1], 0, 0
+; VI-SAFE-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-SAFE-NEXT:    v_xor_b32_e32 v3, 0x80000000, v1
+; VI-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; VI-SAFE-NEXT:    v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
+; VI-SAFE-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-SAFE-NEXT:    s_cselect_b32 s1, 0, 0x7ff80000
+; VI-SAFE-NEXT:    s_mov_b32 s0, 0
+; VI-SAFE-NEXT:    ; return to shader part epilog
+;
 ; VI-NSZ-LABEL: fneg_fadd_0_nsz_f64:
 ; VI-NSZ:       ; %bb.0: ; %.entry
 ; VI-NSZ-NEXT:    v_rcp_f64_e32 v[0:1], s[2:3]
@@ -3379,18 +3417,25 @@ define float @v_fneg_fabs_select_infloop_regression(float %arg, i1 %arg1) {
 }
 
 define float @v_fmul_0_fsub_0_infloop_regression(float %arg) {
-; GCN-SAFE-LABEL: v_fmul_0_fsub_0_infloop_regression:
-; GCN-SAFE:       ; %bb.0: ; %bb
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-SAFE-NEXT:    v_mul_f32_e32 v0, 0, v0
-; GCN-SAFE-NEXT:    v_sub_f32_e32 v0, 0, v0
-; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; SI-SAFE-LABEL: v_fmul_0_fsub_0_infloop_regression:
+; SI-SAFE:       ; %bb.0: ; %bb
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SAFE-NEXT:    s_brev_b32 s4, 1
+; SI-SAFE-NEXT:    v_fma_f32 v0, v0, s4, 0
+; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fmul_0_fsub_0_infloop_regression:
 ; GCN-NSZ:       ; %bb.0: ; %bb
 ; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_mul_f32_e32 v0, 0x80000000, v0
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-SAFE-LABEL: v_fmul_0_fsub_0_infloop_regression:
+; VI-SAFE:       ; %bb.0: ; %bb
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SAFE-NEXT:    v_mul_f32_e32 v0, 0, v0
+; VI-SAFE-NEXT:    v_sub_f32_e32 v0, 0, v0
+; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = fmul float %arg, 0.0
   %i1 = fsub float 0.0, %i
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index 00ee932e73eb8..de9c5f4ef08c6 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -740,13 +740,12 @@ define <4 x float> @div_sqrt_fabs_v4f32_fmf(<4 x float> %x, <4 x float> %y, <4 x
 ; AVX512-LABEL: div_sqrt_fabs_v4f32_fmf:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vrsqrtps %xmm2, %xmm3
-; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; AVX512-NEXT:    vmulps %xmm4, %xmm3, %xmm4
 ; AVX512-NEXT:    vmulps %xmm3, %xmm2, %xmm2
-; AVX512-NEXT:    vmulps %xmm3, %xmm2, %xmm2
-; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
-; AVX512-NEXT:    vaddps %xmm3, %xmm2, %xmm2
-; AVX512-NEXT:    vmulps %xmm2, %xmm4, %xmm2
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm4 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
+; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm4 = (xmm3 * xmm2) + xmm4
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; AVX512-NEXT:    vmulps %xmm2, %xmm3, %xmm2
+; AVX512-NEXT:    vmulps %xmm4, %xmm2, %xmm2
 ; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN]
 ; AVX512-NEXT:    vandps %xmm3, %xmm1, %xmm1
 ; AVX512-NEXT:    vdivps %xmm1, %xmm2, %xmm1

>From 97ba3bf8a930deff5f824e8e64284fa670317370 Mon Sep 17 00:00:00 2001
From: PaperChalice <liujunchang97 at outlook.com>
Date: Wed, 19 Feb 2025 21:15:07 +0800
Subject: [PATCH 5/5] Follow the comment before fneg_fadd_0_nsz_f16

---
 llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 124 ++++++------------
 1 file changed, 40 insertions(+), 84 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index 56890d2e89097..efe780c77a47b 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -595,89 +595,45 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x
 ; function attribute unsafe-fp-math automatically. Combine with the previous test
 ; when that is done.
 define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, <4 x i32> %arg) #2 {
-; SI-SAFE-LABEL: fneg_fadd_0_nsz_f16:
-; SI-SAFE:       ; %bb.0: ; %.entry
-; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, s1
-; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, s0
-; SI-SAFE-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-SAFE-NEXT:    v_rcp_f32_e32 v0, v0
-; SI-SAFE-NEXT:    v_fma_f32 v0, v0, 0, 0
-; SI-SAFE-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
-; SI-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v1, vcc
-; SI-SAFE-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
-; SI-SAFE-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
-; SI-SAFE-NEXT:    ; return to shader part epilog
-;
-; SI-NSZ-LABEL: fneg_fadd_0_nsz_f16:
-; SI-NSZ:       ; %bb.0: ; %.entry
-; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, s1
-; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, s0
-; SI-NSZ-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NSZ-NEXT:    v_rcp_f32_e32 v0, v0
-; SI-NSZ-NEXT:    v_mul_f32_e32 v0, 0x80000000, v0
-; SI-NSZ-NEXT:    v_cmp_nlt_f32_e64 vcc, -v0, v1
-; SI-NSZ-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NSZ-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
-; SI-NSZ-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
-; SI-NSZ-NEXT:    ; return to shader part epilog
-;
-; VI-SAFE-LABEL: fneg_fadd_0_nsz_f16:
-; VI-SAFE:       ; %bb.0: ; %.entry
-; VI-SAFE-NEXT:    v_rcp_f16_e32 v0, s1
-; VI-SAFE-NEXT:    v_mov_b32_e32 v1, s0
-; VI-SAFE-NEXT:    v_mul_f16_e32 v0, 0, v0
-; VI-SAFE-NEXT:    v_fma_f16 v0, v0, 0, 0
-; VI-SAFE-NEXT:    v_xor_b32_e32 v2, 0x8000, v0
-; VI-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, s0, v0
-; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; VI-SAFE-NEXT:    v_mov_b32_e32 v1, 0x7e00
-; VI-SAFE-NEXT:    v_cmp_nlt_f16_e32 vcc, 0, v0
-; VI-SAFE-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
-; VI-SAFE-NEXT:    ; return to shader part epilog
-;
-; VI-NSZ-LABEL: fneg_fadd_0_nsz_f16:
-; VI-NSZ:       ; %bb.0: ; %.entry
-; VI-NSZ-NEXT:    v_rcp_f16_e32 v0, s1
-; VI-NSZ-NEXT:    v_mov_b32_e32 v1, s0
-; VI-NSZ-NEXT:    v_mul_f16_e32 v0, 0x8000, v0
-; VI-NSZ-NEXT:    v_cmp_nlt_f16_e64 vcc, -v0, s0
-; VI-NSZ-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; VI-NSZ-NEXT:    v_mov_b32_e32 v1, 0x7e00
-; VI-NSZ-NEXT:    v_cmp_nlt_f16_e32 vcc, 0, v0
-; VI-NSZ-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
-; VI-NSZ-NEXT:    ; return to shader part epilog
-;
-; GFX11-SAFE-LABEL: fneg_fadd_0_nsz_f16:
-; GFX11-SAFE:       ; %bb.0: ; %.entry
-; GFX11-SAFE-NEXT:    v_rcp_f16_e32 v0, s1
-; GFX11-SAFE-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-SAFE-NEXT:    v_mul_f16_e32 v0, 0, v0
-; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT:    v_fma_f16 v0, v0, 0, 0
-; GFX11-SAFE-NEXT:    v_xor_b32_e32 v1, 0x8000, v0
-; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, s0, v0
-; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e64 v0, v1, s0, vcc_lo
-; GFX11-SAFE-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, 0, v0
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
-; GFX11-SAFE-NEXT:    ; return to shader part epilog
-;
-; GFX11-NSZ-LABEL: fneg_fadd_0_nsz_f16:
-; GFX11-NSZ:       ; %bb.0: ; %.entry
-; GFX11-NSZ-NEXT:    v_rcp_f16_e32 v0, s1
-; GFX11-NSZ-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NSZ-NEXT:    v_mul_f16_e32 v0, 0x8000, v0
-; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NSZ-NEXT:    v_cmp_nlt_f16_e64 s1, -v0, s0
-; GFX11-NSZ-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s1
-; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NSZ-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, 0, v0
-; GFX11-NSZ-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
-; GFX11-NSZ-NEXT:    ; return to shader part epilog
+; SI-LABEL: fneg_fadd_0_nsz_f16:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, s1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, s0
+; SI-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_rcp_f32_e32 v0, v0
+; SI-NEXT:    v_mul_f32_e32 v0, 0x80000000, v0
+; SI-NEXT:    v_cmp_nlt_f32_e64 vcc, -v0, v1
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
+; SI-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; SI-NEXT:    ; return to shader part epilog
+;
+; VI-LABEL: fneg_fadd_0_nsz_f16:
+; VI:       ; %bb.0: ; %.entry
+; VI-NEXT:    v_rcp_f16_e32 v0, s1
+; VI-NEXT:    v_mov_b32_e32 v1, s0
+; VI-NEXT:    v_mul_f16_e32 v0, 0x8000, v0
+; VI-NEXT:    v_cmp_nlt_f16_e64 vcc, -v0, s0
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v1, 0x7e00
+; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, 0, v0
+; VI-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: fneg_fadd_0_nsz_f16:
+; GFX11:       ; %bb.0: ; %.entry
+; GFX11-NEXT:    v_rcp_f16_e32 v0, s1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f16_e32 v0, 0x8000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_nlt_f16_e64 s1, -v0, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
+; GFX11-NEXT:    ; return to shader part epilog
 .entry:
   %tmp7 = fdiv afn half 1.000000e+00, %tmp6
   %tmp8 = fmul half 0.000000e+00, %tmp7
@@ -4769,6 +4725,6 @@ declare <4 x half> @llvm.fmuladd.v4f16(<4 x half>, <4 x half>, <4 x half>) #1
 
 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
 attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind }
+attributes #2 = { nounwind "no-signed-zeros-fp-math"="true" }
 attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }
 attributes #4 = { nounwind "amdgpu-ieee"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }