[llvm] [SelectionDAG] Remove UnsafeFPMath check in `visitFADDForFMACombine` (PR #127770)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 19 05:15:32 PST 2025
https://github.com/paperchalice updated https://github.com/llvm/llvm-project/pull/127770
>From 1044329cd31af4762a5c0fb42afa3c86589bf633 Mon Sep 17 00:00:00 2001
From: PaperChalice <liujunchang97 at outlook.com>
Date: Wed, 19 Feb 2025 16:57:20 +0800
Subject: [PATCH 1/5] [SelectionDAG] Remove UnsafeFPMath check in
`visitFADDForFMACombine`
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++--
llvm/test/CodeGen/AMDGPU/fdot2.ll | 8 ++++----
llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 12 ++++++------
llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 8 ++++----
llvm/test/CodeGen/NVPTX/fma-relu-contract.ll | 10 +++++-----
llvm/test/CodeGen/PowerPC/fmf-propagation.ll | 4 ++--
.../CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll | 2 +-
llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll | 2 +-
llvm/test/CodeGen/X86/sqrt-fastmath.ll | 6 +++---
9 files changed, 28 insertions(+), 28 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f4caaf426de6a..00f83f4a100d4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16320,8 +16320,8 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
if (!HasFMAD && !HasFMA)
return SDValue();
- bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
- Options.UnsafeFPMath || HasFMAD);
+ bool AllowFusionGlobally =
+ Options.AllowFPOpFusion == FPOpFusion::Fast || HasFMAD;
// If the addition is not contractable, do not combine.
if (!AllowFusionGlobally && !N->getFlags().hasAllowContract())
return SDValue();
diff --git a/llvm/test/CodeGen/AMDGPU/fdot2.ll b/llvm/test/CodeGen/AMDGPU/fdot2.ll
index 776816d6aa0e3..15e161d718e80 100644
--- a/llvm/test/CodeGen/AMDGPU/fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdot2.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-CONTRACT
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DENORM-CONTRACT
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index b32630a97b3ad..610cda4933adf 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -enable-var-scope --check-prefixes=SI,SI-SAFE %s
-; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -enable-var-scope --check-prefixes=SI,SI-NSZ %s
+; RUN: llc -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global -fp-contract=fast < %s | FileCheck -enable-var-scope --check-prefixes=SI,SI-SAFE %s
+; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global -fp-contract=fast < %s | FileCheck -enable-var-scope --check-prefixes=SI,SI-NSZ %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=VI,VI-SAFE %s
-; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=VI,VI-NSZ %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -fp-contract=fast < %s | FileCheck -enable-var-scope --check-prefixes=VI,VI-SAFE %s
+; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji -fp-contract=fast < %s | FileCheck -enable-var-scope --check-prefixes=VI,VI-NSZ %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-SAFE %s
-; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-NSZ %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -fp-contract=fast < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-SAFE %s
+; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=gfx1100 -fp-contract=fast < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-NSZ %s
; --------------------------------------------------------------------------------
; fadd tests
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 9a72fe96b5c3a..2ae2047d08a68 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,SI,SI-SAFE %s
-; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,SI,SI-NSZ %s
+; RUN: llc -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global -fp-contract=fast < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,SI,SI-SAFE %s
+; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global -fp-contract=fast < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,SI,SI-NSZ %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,VI,VI-SAFE %s
-; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,VI,VI-NSZ %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -fp-contract=fast < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,VI,VI-SAFE %s
+; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji -fp-contract=fast < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,VI,VI-NSZ %s
; --------------------------------------------------------------------------------
; fadd tests
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
index 7dce894620e6b..eb8538c791a37 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
@@ -1,13 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -fp-contract=fast | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -fp-contract=fast | %ptxas-verify -arch=sm_80 %}
; Using FTZ should emit fma.ftz.relu for f16, not for bf16
-; RUN: llc < %s -denormal-fp-math-f32=preserve-sign -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK-FTZ
-; RUN: %if ptxas %{ llc < %s -denormal-fp-math-f32=preserve-sign -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
+; RUN: llc < %s -denormal-fp-math-f32=preserve-sign -mtriple=nvptx64 -mcpu=sm_80 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK-FTZ
+; RUN: %if ptxas %{ llc < %s -denormal-fp-math-f32=preserve-sign -fp-contract=fast -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
; SM < 80 or (which needs PTX version >= 70) should not emit fma{.ftz}.relu
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 | FileCheck %s --check-prefixes=CHECK-SM70
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK-SM70
define half @fma_f16_expanded_no_nans(half %a, half %b, half %c) #0 {
; CHECK-LABEL: fma_f16_expanded_no_nans(
diff --git a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll
index 4e72a5ac5ede3..e71f59c79ce4d 100644
--- a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll
+++ b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll
@@ -2,8 +2,8 @@
; REQUIRES: asserts
; RUN: llc < %s -mtriple=powerpc64le -debug-only=isel -o /dev/null 2>&1 | FileCheck %s --check-prefix=FMFDEBUG
; RUN: llc < %s -mtriple=powerpc64le | FileCheck %s --check-prefix=FMF
-; RUN: llc < %s -mtriple=powerpc64le -debug-only=isel -o /dev/null 2>&1 -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck %s --check-prefix=GLOBALDEBUG
-; RUN: llc < %s -mtriple=powerpc64le -enable-unsafe-fp-math -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math | FileCheck %s --check-prefix=GLOBAL
+; RUN: llc < %s -mtriple=powerpc64le -debug-only=isel -o /dev/null 2>&1 -enable-unsafe-fp-math -fp-contract=fast -enable-no-nans-fp-math | FileCheck %s --check-prefix=GLOBALDEBUG
+; RUN: llc < %s -mtriple=powerpc64le -enable-unsafe-fp-math -fp-contract=fast -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math | FileCheck %s --check-prefix=GLOBAL
; Test FP transforms using instruction/node-level fast-math-flags.
; We're also checking debug output to verify that FMF is propagated to the newly created nodes.
diff --git a/llvm/test/CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll b/llvm/test/CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll
index 96aa58000f9b7..8999be58a3581 100644
--- a/llvm/test/CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -fp-contract=fast < %s | FileCheck %s
target datalayout = "E-m:e-i64:64-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
index 2c7da100344b7..5b7f70180caa6 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2,fma -stop-after=finalize-isel 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2,fma -stop-after=finalize-isel -fp-contract=fast 2>&1 | FileCheck %s
declare float @llvm.sqrt.f32(float) #2
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index 384f8b832afb9..fed87d55b7e3f 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+sse2 -fp-contract=fast | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+avx -fp-contract=fast | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+avx512f -fp-contract=fast | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
declare double @__sqrt_finite(double)
declare float @__sqrtf_finite(float)
>From 4ec74b18e66bd96eabf0c330acccef68017930e0 Mon Sep 17 00:00:00 2001
From: PaperChalice <liujunchang97 at outlook.com>
Date: Wed, 19 Feb 2025 18:10:10 +0800
Subject: [PATCH 2/5] Drop "unsafe-fp-math" in some CodeGen tests. Regression
failures are expected...
---
llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 2 +-
llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 2 +-
llvm/test/CodeGen/NVPTX/fma-relu-contract.ll | 4 ++--
.../CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll | 2 +-
llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll | 4 ++--
llvm/test/CodeGen/X86/sqrt-fastmath.ll | 12 ++++++------
6 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index 610cda4933adf..f303d649a91e6 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -4758,6 +4758,6 @@ declare <4 x half> @llvm.fmuladd.v4f16(<4 x half>, <4 x half>, <4 x half>) #1
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind "unsafe-fp-math"="true" }
+attributes #2 = { nounwind }
attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }
attributes #4 = { nounwind "amdgpu-ieee"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 2ae2047d08a68..56b79e05823d6 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -3440,6 +3440,6 @@ declare half @llvm.amdgcn.rcp.f16(half) #1
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind "unsafe-fp-math"="true" }
+attributes #2 = { nounwind }
attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }
attributes #4 = { nounwind "amdgpu-ieee"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
index eb8538c791a37..e433c58e86024 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
@@ -1221,5 +1221,5 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
ret <2 x bfloat> %3
}
-attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "unsafe-fp-math"="true" }
-attributes #1 = { "unsafe-fp-math"="true" }
+attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
+attributes #1 = { }
diff --git a/llvm/test/CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll b/llvm/test/CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll
index 8999be58a3581..539b563691723 100644
--- a/llvm/test/CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll
@@ -31,7 +31,7 @@ declare double @llvm.sqrt.f64(double) #1
declare signext i32 @p_col_helper(...) #2
-attributes #0 = { nounwind "no-infs-fp-math"="true" "no-nans-fp-math"="true" "target-cpu"="pwr7" "unsafe-fp-math"="true" }
+attributes #0 = { nounwind "no-infs-fp-math"="true" "no-nans-fp-math"="true" "target-cpu"="pwr7" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
index 5b7f70180caa6..df0b2e1185e07 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
@@ -144,6 +144,6 @@ define float @rsqrt_daz(float %f) #1 {
ret float %div
}
-attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt:2" "denormal-fp-math"="ieee,ieee" }
-attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt:2" "denormal-fp-math"="ieee,preserve-sign" }
+attributes #0 = { "reciprocal-estimates"="sqrt:2" "denormal-fp-math"="ieee,ieee" }
+attributes #1 = { "reciprocal-estimates"="sqrt:2" "denormal-fp-math"="ieee,preserve-sign" }
attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index fed87d55b7e3f..00ee932e73eb8 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -1005,10 +1005,10 @@ define double @sqrt_simplify_before_recip_order(double %x, ptr %p) nounwind {
ret double %sqrt_fast
}
-attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!sqrtf,!vec-sqrtf,!divf,!vec-divf" }
-attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" }
+attributes #0 = { "reciprocal-estimates"="!sqrtf,!vec-sqrtf,!divf,!vec-divf" }
+attributes #1 = { "reciprocal-estimates"="sqrt,vec-sqrt" }
attributes #2 = { nounwind readnone }
-attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,ieee" }
-attributes #4 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee,preserve-sign" }
-attributes #5 = { "unsafe-fp-math"="true" "reciprocal-estimates"="all:0" }
-attributes #6 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,dynamic" }
+attributes #3 = { "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,ieee" }
+attributes #4 = { "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee,preserve-sign" }
+attributes #5 = { "reciprocal-estimates"="all:0" }
+attributes #6 = { "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,dynamic" }
>From dc45d588f3e68699492c69f34ed89308997751be Mon Sep 17 00:00:00 2001
From: PaperChalice <liujunchang97 at outlook.com>
Date: Wed, 19 Feb 2025 20:54:07 +0800
Subject: [PATCH 3/5] drop empty attribute group
---
llvm/test/CodeGen/NVPTX/fma-relu-contract.ll | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
index e433c58e86024..84508364b14d7 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
@@ -119,7 +119,7 @@ define half @fma_f16_expanded_no_nans_multiple_uses_of_fma(half %a, half %b, hal
ret half %6
}
-define half @fma_f16_expanded_unsafe_with_nans(half %a, half %b, half %c) #1 {
+define half @fma_f16_expanded_unsafe_with_nans(half %a, half %b, half %c) {
; CHECK-LABEL: fma_f16_expanded_unsafe_with_nans(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<7>;
@@ -216,7 +216,7 @@ define half @fma_f16_expanded_maxnum_no_nans(half %a, half %b, half %c) #0 {
ret half %3
}
-define bfloat @fma_bf16_expanded_unsafe_with_nans(bfloat %a, bfloat %b, bfloat %c) #1 {
+define bfloat @fma_bf16_expanded_unsafe_with_nans(bfloat %a, bfloat %b, bfloat %c) {
; CHECK-LABEL: fma_bf16_expanded_unsafe_with_nans(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<7>;
@@ -647,7 +647,7 @@ define <2 x half> @fma_f16x2_expanded_no_nans_multiple_uses_of_fma(<2 x half> %a
ret <2 x half> %6
}
-define <2 x half> @fma_f16x2_expanded_unsafe_with_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) #1 {
+define <2 x half> @fma_f16x2_expanded_unsafe_with_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
; CHECK-LABEL: fma_f16x2_expanded_unsafe_with_nans(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<7>;
@@ -754,7 +754,7 @@ define <2 x half> @fma_f16x2_expanded_maxnum_no_nans(<2 x half> %a, <2 x half> %
ret <2 x half> %3
}
-define <2 x bfloat> @fma_bf16x2_expanded_unsafe_with_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #1 {
+define <2 x bfloat> @fma_bf16x2_expanded_unsafe_with_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
; CHECK-LABEL: fma_bf16x2_expanded_unsafe_with_nans(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<7>;
@@ -1222,4 +1222,3 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
}
attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
-attributes #1 = { }
>From feeee6eaee37ba627069929781d253b683ad6e13 Mon Sep 17 00:00:00 2001
From: PaperChalice <liujunchang97 at outlook.com>
Date: Wed, 19 Feb 2025 20:55:52 +0800
Subject: [PATCH 4/5] Regenerate tests. DO NOT MERGE: Some fma instructions are
not optimized!
---
llvm/test/CodeGen/AMDGPU/fdot2.ll | 10 +--
llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 41 +++++----
llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 89 ++++++++++++++-----
llvm/test/CodeGen/X86/sqrt-fastmath.ll | 11 ++-
4 files changed, 103 insertions(+), 48 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/fdot2.ll b/llvm/test/CodeGen/AMDGPU/fdot2.ll
index 15e161d718e80..56c9517a858b5 100644
--- a/llvm/test/CodeGen/AMDGPU/fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdot2.ll
@@ -1,11 +1,11 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-CONTRACT
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DENORM-CONTRACT
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -mattr="+dot7-insts,-dot10-insts" -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DOT10-DISABLED
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -fp-contract=fast -mattr="+dot7-insts,-dot10-insts" -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DOT10-DISABLED
; (fadd (fmul S1.x, S2.x), (fadd (fmul (S1.y, S2.y), z))) -> (fdot2 S1, S2, z)
; Tests to make sure fdot2 is not generated when vector elements of dot-product expressions
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index f303d649a91e6..56890d2e89097 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -530,7 +530,7 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x
; VI-SAFE-NEXT: v_rcp_f16_e32 v0, s1
; VI-SAFE-NEXT: v_mov_b32_e32 v1, s0
; VI-SAFE-NEXT: v_mul_f16_e32 v0, 0, v0
-; VI-SAFE-NEXT: v_add_f16_e32 v0, 0, v0
+; VI-SAFE-NEXT: v_fma_f16 v0, v0, 0, 0
; VI-SAFE-NEXT: v_xor_b32_e32 v2, 0x8000, v0
; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, s0, v0
; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
@@ -557,7 +557,7 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x
; GFX11-SAFE-NEXT: s_waitcnt_depctr 0xfff
; GFX11-SAFE-NEXT: v_mul_f16_e32 v0, 0, v0
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_add_f16_e32 v0, 0, v0
+; GFX11-SAFE-NEXT: v_fma_f16 v0, v0, 0, 0
; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0
; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -597,13 +597,17 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x
define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, <4 x i32> %arg) #2 {
; SI-SAFE-LABEL: fneg_fadd_0_nsz_f16:
; SI-SAFE: ; %bb.0: ; %.entry
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, s0
-; SI-SAFE-NEXT: s_brev_b32 s0, 1
-; SI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, s1
+; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, s0
+; SI-SAFE-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, 0, v0
-; SI-SAFE-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
-; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
+; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-SAFE-NEXT: v_rcp_f32_e32 v0, v0
+; SI-SAFE-NEXT: v_fma_f32 v0, v0, 0, 0
+; SI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
+; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
+; SI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
+; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
; SI-SAFE-NEXT: ; return to shader part epilog
;
; SI-NSZ-LABEL: fneg_fadd_0_nsz_f16:
@@ -623,10 +627,13 @@ define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, <
;
; VI-SAFE-LABEL: fneg_fadd_0_nsz_f16:
; VI-SAFE: ; %bb.0: ; %.entry
-; VI-SAFE-NEXT: v_mov_b32_e32 v0, 0x8000
+; VI-SAFE-NEXT: v_rcp_f16_e32 v0, s1
; VI-SAFE-NEXT: v_mov_b32_e32 v1, s0
-; VI-SAFE-NEXT: v_cmp_ngt_f16_e64 vcc, s0, 0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-SAFE-NEXT: v_mul_f16_e32 v0, 0, v0
+; VI-SAFE-NEXT: v_fma_f16 v0, v0, 0, 0
+; VI-SAFE-NEXT: v_xor_b32_e32 v2, 0x8000, v0
+; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, s0, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7e00
; VI-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0
; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
@@ -646,12 +653,16 @@ define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, <
;
; GFX11-SAFE-LABEL: fneg_fadd_0_nsz_f16:
; GFX11-SAFE: ; %bb.0: ; %.entry
-; GFX11-SAFE-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 vcc_lo, s0, 0
+; GFX11-SAFE-NEXT: v_rcp_f16_e32 v0, s1
+; GFX11-SAFE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-SAFE-NEXT: v_mul_f16_e32 v0, 0, v0
+; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-NEXT: v_fma_f16 v0, v0, 0, 0
+; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0
+; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0
; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x8000, v0, vcc_lo
+; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
; GFX11-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
; GFX11-SAFE-NEXT: ; return to shader part epilog
;
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 56b79e05823d6..32d33d9049c2a 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -291,10 +291,14 @@ define amdgpu_ps float @fneg_fadd_0_f32(float inreg %tmp2, float inreg %tmp6, <4
define amdgpu_ps float @fneg_fadd_0_nsz_f32(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) #2 {
; SI-SAFE-LABEL: fneg_fadd_0_nsz_f32:
; SI-SAFE: ; %bb.0: ; %.entry
-; SI-SAFE-NEXT: v_min_legacy_f32_e64 v0, 0, s0
-; SI-SAFE-NEXT: s_brev_b32 s0, 1
-; SI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; SI-SAFE-NEXT: v_rcp_f32_e32 v0, s1
+; SI-SAFE-NEXT: v_mov_b32_e32 v1, s0
+; SI-SAFE-NEXT: v_mul_f32_e32 v0, 0, v0
+; SI-SAFE-NEXT: v_fma_f32 v0, v0, 0, 0
; SI-SAFE-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
+; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
+; SI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; SI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
; SI-SAFE-NEXT: ; return to shader part epilog
;
@@ -564,7 +568,7 @@ define amdgpu_ps double @fneg_fadd_0_f64(double inreg %tmp2, double inreg %tmp6,
; SI-SAFE-NEXT: v_mov_b32_e32 v3, s0
; SI-SAFE-NEXT: v_div_fixup_f64 v[0:1], v[0:1], s[2:3], 1.0
; SI-SAFE-NEXT: v_mul_f64 v[0:1], v[0:1], 0
-; SI-SAFE-NEXT: v_add_f64 v[0:1], v[0:1], 0
+; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[0:1], 0, 0
; SI-SAFE-NEXT: v_cmp_ngt_f64_e32 vcc, s[0:1], v[0:1]
; SI-SAFE-NEXT: v_xor_b32_e32 v4, 0x80000000, v1
; SI-SAFE-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
@@ -618,7 +622,7 @@ define amdgpu_ps double @fneg_fadd_0_f64(double inreg %tmp2, double inreg %tmp6,
; VI-SAFE-NEXT: v_mov_b32_e32 v2, s1
; VI-SAFE-NEXT: v_div_fixup_f64 v[0:1], v[0:1], s[2:3], 1.0
; VI-SAFE-NEXT: v_mul_f64 v[0:1], v[0:1], 0
-; VI-SAFE-NEXT: v_add_f64 v[0:1], v[0:1], 0
+; VI-SAFE-NEXT: v_fma_f64 v[0:1], v[0:1], 0, 0
; VI-SAFE-NEXT: v_cmp_ngt_f64_e32 vcc, s[0:1], v[0:1]
; VI-SAFE-NEXT: v_xor_b32_e32 v3, 0x80000000, v1
; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
@@ -672,17 +676,28 @@ define amdgpu_ps double @fneg_fadd_0_f64(double inreg %tmp2, double inreg %tmp6,
; function attribute unsafe-fp-math automatically. Combine with the previous test
; when that is done.
define amdgpu_ps double @fneg_fadd_0_nsz_f64(double inreg %tmp2, double inreg %tmp6, <4 x i32> %arg) #2 {
-; GCN-SAFE-LABEL: fneg_fadd_0_nsz_f64:
-; GCN-SAFE: ; %bb.0: ; %.entry
-; GCN-SAFE-NEXT: v_cmp_ngt_f64_e64 s[2:3], s[0:1], 0
-; GCN-SAFE-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; GCN-SAFE-NEXT: s_cselect_b32 s1, s1, 0x80000000
-; GCN-SAFE-NEXT: s_cselect_b32 s0, s0, 0
-; GCN-SAFE-NEXT: v_cmp_ngt_f64_e64 s[0:1], s[0:1], 0
-; GCN-SAFE-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; GCN-SAFE-NEXT: s_cselect_b32 s1, 0, 0x7ff80000
-; GCN-SAFE-NEXT: s_mov_b32 s0, 0
-; GCN-SAFE-NEXT: ; return to shader part epilog
+; SI-SAFE-LABEL: fneg_fadd_0_nsz_f64:
+; SI-SAFE: ; %bb.0: ; %.entry
+; SI-SAFE-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
+; SI-SAFE-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; SI-SAFE-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; SI-SAFE-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; SI-SAFE-NEXT: v_mov_b32_e32 v2, s1
+; SI-SAFE-NEXT: v_mul_f64 v[0:1], v[0:1], 0
+; SI-SAFE-NEXT: v_mov_b32_e32 v3, s0
+; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[0:1], 0, 0
+; SI-SAFE-NEXT: v_cmp_ngt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-SAFE-NEXT: v_xor_b32_e32 v4, 0x80000000, v1
+; SI-SAFE-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
+; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; SI-SAFE-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
+; SI-SAFE-NEXT: s_and_b64 s[0:1], vcc, exec
+; SI-SAFE-NEXT: s_cselect_b32 s1, 0, 0x7ff80000
+; SI-SAFE-NEXT: s_mov_b32 s0, 0
+; SI-SAFE-NEXT: ; return to shader part epilog
;
; SI-NSZ-LABEL: fneg_fadd_0_nsz_f64:
; SI-NSZ: ; %bb.0: ; %.entry
@@ -707,6 +722,29 @@ define amdgpu_ps double @fneg_fadd_0_nsz_f64(double inreg %tmp2, double inreg %t
; SI-NSZ-NEXT: s_mov_b32 s0, 0
; SI-NSZ-NEXT: ; return to shader part epilog
;
+; VI-SAFE-LABEL: fneg_fadd_0_nsz_f64:
+; VI-SAFE: ; %bb.0: ; %.entry
+; VI-SAFE-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
+; VI-SAFE-NEXT: v_mov_b32_e32 v4, s0
+; VI-SAFE-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; VI-SAFE-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; VI-SAFE-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; VI-SAFE-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; VI-SAFE-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; VI-SAFE-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; VI-SAFE-NEXT: v_mov_b32_e32 v2, s1
+; VI-SAFE-NEXT: v_mul_f64 v[0:1], v[0:1], 0
+; VI-SAFE-NEXT: v_fma_f64 v[0:1], v[0:1], 0, 0
+; VI-SAFE-NEXT: v_cmp_ngt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-SAFE-NEXT: v_xor_b32_e32 v3, 0x80000000, v1
+; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
+; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; VI-SAFE-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
+; VI-SAFE-NEXT: s_and_b64 s[0:1], vcc, exec
+; VI-SAFE-NEXT: s_cselect_b32 s1, 0, 0x7ff80000
+; VI-SAFE-NEXT: s_mov_b32 s0, 0
+; VI-SAFE-NEXT: ; return to shader part epilog
+;
; VI-NSZ-LABEL: fneg_fadd_0_nsz_f64:
; VI-NSZ: ; %bb.0: ; %.entry
; VI-NSZ-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
@@ -3379,18 +3417,25 @@ define float @v_fneg_fabs_select_infloop_regression(float %arg, i1 %arg1) {
}
define float @v_fmul_0_fsub_0_infloop_regression(float %arg) {
-; GCN-SAFE-LABEL: v_fmul_0_fsub_0_infloop_regression:
-; GCN-SAFE: ; %bb.0: ; %bb
-; GCN-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-SAFE-NEXT: v_mul_f32_e32 v0, 0, v0
-; GCN-SAFE-NEXT: v_sub_f32_e32 v0, 0, v0
-; GCN-SAFE-NEXT: s_setpc_b64 s[30:31]
+; SI-SAFE-LABEL: v_fmul_0_fsub_0_infloop_regression:
+; SI-SAFE: ; %bb.0: ; %bb
+; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SAFE-NEXT: s_brev_b32 s4, 1
+; SI-SAFE-NEXT: v_fma_f32 v0, v0, s4, 0
+; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; GCN-NSZ-LABEL: v_fmul_0_fsub_0_infloop_regression:
; GCN-NSZ: ; %bb.0: ; %bb
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0
; GCN-NSZ-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-SAFE-LABEL: v_fmul_0_fsub_0_infloop_regression:
+; VI-SAFE: ; %bb.0: ; %bb
+; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SAFE-NEXT: v_mul_f32_e32 v0, 0, v0
+; VI-SAFE-NEXT: v_sub_f32_e32 v0, 0, v0
+; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
bb:
%i = fmul float %arg, 0.0
%i1 = fsub float 0.0, %i
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index 00ee932e73eb8..de9c5f4ef08c6 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -740,13 +740,12 @@ define <4 x float> @div_sqrt_fabs_v4f32_fmf(<4 x float> %x, <4 x float> %y, <4 x
; AVX512-LABEL: div_sqrt_fabs_v4f32_fmf:
; AVX512: # %bb.0:
; AVX512-NEXT: vrsqrtps %xmm2, %xmm3
-; AVX512-NEXT: vbroadcastss {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; AVX512-NEXT: vmulps %xmm4, %xmm3, %xmm4
; AVX512-NEXT: vmulps %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vmulps %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
-; AVX512-NEXT: vaddps %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vmulps %xmm2, %xmm4, %xmm2
+; AVX512-NEXT: vbroadcastss {{.*#+}} xmm4 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
+; AVX512-NEXT: vfmadd231ps {{.*#+}} xmm4 = (xmm3 * xmm2) + xmm4
+; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; AVX512-NEXT: vmulps %xmm2, %xmm3, %xmm2
+; AVX512-NEXT: vmulps %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT: vandps %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vdivps %xmm1, %xmm2, %xmm1
>From 97ba3bf8a930deff5f824e8e64284fa670317370 Mon Sep 17 00:00:00 2001
From: PaperChalice <liujunchang97 at outlook.com>
Date: Wed, 19 Feb 2025 21:15:07 +0800
Subject: [PATCH 5/5] Follow the comment before fneg_fadd_0_nsz_f16
---
llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 124 ++++++------------
1 file changed, 40 insertions(+), 84 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index 56890d2e89097..efe780c77a47b 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -595,89 +595,45 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x
; function attribute unsafe-fp-math automatically. Combine with the previous test
; when that is done.
define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, <4 x i32> %arg) #2 {
-; SI-SAFE-LABEL: fneg_fadd_0_nsz_f16:
-; SI-SAFE: ; %bb.0: ; %.entry
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, s1
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, s0
-; SI-SAFE-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-SAFE-NEXT: v_rcp_f32_e32 v0, v0
-; SI-SAFE-NEXT: v_fma_f32 v0, v0, 0, 0
-; SI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
-; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
-; SI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
-; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
-; SI-SAFE-NEXT: ; return to shader part epilog
-;
-; SI-NSZ-LABEL: fneg_fadd_0_nsz_f16:
-; SI-NSZ: ; %bb.0: ; %.entry
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, s1
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, s0
-; SI-NSZ-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NSZ-NEXT: v_rcp_f32_e32 v0, v0
-; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0
-; SI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1
-; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
-; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
-; SI-NSZ-NEXT: ; return to shader part epilog
-;
-; VI-SAFE-LABEL: fneg_fadd_0_nsz_f16:
-; VI-SAFE: ; %bb.0: ; %.entry
-; VI-SAFE-NEXT: v_rcp_f16_e32 v0, s1
-; VI-SAFE-NEXT: v_mov_b32_e32 v1, s0
-; VI-SAFE-NEXT: v_mul_f16_e32 v0, 0, v0
-; VI-SAFE-NEXT: v_fma_f16 v0, v0, 0, 0
-; VI-SAFE-NEXT: v_xor_b32_e32 v2, 0x8000, v0
-; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, s0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7e00
-; VI-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
-; VI-SAFE-NEXT: ; return to shader part epilog
-;
-; VI-NSZ-LABEL: fneg_fadd_0_nsz_f16:
-; VI-NSZ: ; %bb.0: ; %.entry
-; VI-NSZ-NEXT: v_rcp_f16_e32 v0, s1
-; VI-NSZ-NEXT: v_mov_b32_e32 v1, s0
-; VI-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0
-; VI-NSZ-NEXT: v_cmp_nlt_f16_e64 vcc, -v0, s0
-; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7e00
-; VI-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0
-; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
-; VI-NSZ-NEXT: ; return to shader part epilog
-;
-; GFX11-SAFE-LABEL: fneg_fadd_0_nsz_f16:
-; GFX11-SAFE: ; %bb.0: ; %.entry
-; GFX11-SAFE-NEXT: v_rcp_f16_e32 v0, s1
-; GFX11-SAFE-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-SAFE-NEXT: v_mul_f16_e32 v0, 0, v0
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_fma_f16 v0, v0, 0, 0
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0
-; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
-; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
-; GFX11-SAFE-NEXT: ; return to shader part epilog
-;
-; GFX11-NSZ-LABEL: fneg_fadd_0_nsz_f16:
-; GFX11-NSZ: ; %bb.0: ; %.entry
-; GFX11-NSZ-NEXT: v_rcp_f16_e32 v0, s1
-; GFX11-NSZ-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0
-; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0
-; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
-; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
-; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
-; GFX11-NSZ-NEXT: ; return to shader part epilog
+; SI-LABEL: fneg_fadd_0_nsz_f16:
+; SI: ; %bb.0: ; %.entry
+; SI-NEXT: v_cvt_f16_f32_e32 v0, s1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, s0
+; SI-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_rcp_f32_e32 v0, v0
+; SI-NEXT: v_mul_f32_e32 v0, 0x80000000, v0
+; SI-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
+; SI-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: fneg_fadd_0_nsz_f16:
+; VI: ; %bb.0: ; %.entry
+; VI-NEXT: v_rcp_f16_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mul_f16_e32 v0, 0x8000, v0
+; VI-NEXT: v_cmp_nlt_f16_e64 vcc, -v0, s0
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v1, 0x7e00
+; VI-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: fneg_fadd_0_nsz_f16:
+; GFX11: ; %bb.0: ; %.entry
+; GFX11-NEXT: v_rcp_f16_e32 v0, s1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v0, 0x8000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
+; GFX11-NEXT: ; return to shader part epilog
.entry:
%tmp7 = fdiv afn half 1.000000e+00, %tmp6
%tmp8 = fmul half 0.000000e+00, %tmp7
@@ -4769,6 +4725,6 @@ declare <4 x half> @llvm.fmuladd.v4f16(<4 x half>, <4 x half>, <4 x half>) #1
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind }
+attributes #2 = { nounwind "no-signed-zeros-fp-math"="true" }
attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }
attributes #4 = { nounwind "amdgpu-ieee"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
More information about the llvm-commits mailing list