[llvm] [AMDGPU] Make fast-fmaf an optional flag, defaulting to True for GFX9 (PR #161450)

Wed Oct 1 19:01:42 PDT 2025

https://github.com/yiqian1 updated https://github.com/llvm/llvm-project/pull/161450

>From aab5e41bd8374daed3563503aed35acfb95fb5af Mon Sep 17 00:00:00 2001
From: Yi Qian <yi.qian at amd.com>
Date: Tue, 30 Sep 2025 20:34:15 +0000
Subject: [PATCH] [AMDGPU] Add a target option to disable aggressive FMA fusion

---
 llvm/lib/Target/AMDGPU/AMDGPU.td                     | 7 +++++++
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h             | 5 +++++
 llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 1 +
 llvm/lib/Target/AMDGPU/GCNSubtarget.cpp              | 4 ++++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp            | 8 ++++++--
 5 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index eaa1870f4be28..5a08e7d6db347 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1238,6 +1238,13 @@ def FeatureSetPrioIncWgInst : SubtargetFeature<"setprio-inc-wg-inst",
 // Subtarget Features (options and debugging)
 //===------------------------------------------------------------===//
 
+def FeatureDisableAggressiveFMAFusion : SubtargetFeature<
+  "disable-aggressive-fma-fusion",
+  "DisableAggressiveFMAFusion",
+  "true",
+  "Do not fold fmul and fadd/fsub into fma."
+>;
+
 // Ugly hack to accomodate assembling modules with mixed
 // wavesizes. Ideally we would have a mapping symbol in assembly which
 // would keep track of which sections of code should be treated as
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index ed03ef21b6dda..0c380a7e4dc84 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -76,6 +76,7 @@ class AMDGPUSubtarget {
   bool EnablePromoteAlloca = false;
   bool HasTrigReducedRange = false;
   bool FastFMAF32 = false;
+  bool DisableAggressiveFMAFusion = false;
   unsigned EUsPerCU = 4;
   unsigned MaxWavesPerEU = 10;
   unsigned LocalMemorySize = 0;
@@ -303,6 +304,10 @@ class AMDGPUSubtarget {
     return FastFMAF32;
   }
 
+  bool hasDisableAggressiveFMAFusion() const {
+    return DisableAggressiveFMAFusion;
+  }
+
   bool isPromoteAllocaEnabled() const {
     return EnablePromoteAlloca;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 03d16fdd54c42..554549063dbcc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -288,6 +288,7 @@ const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
     AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
     AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
     AMDGPU::FeatureUnalignedAccessMode,
+    AMDGPU::FeatureDisableAggressiveFMAFusion,
 
     AMDGPU::FeatureAutoWaitcntBeforeBarrier,
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 7b94ea3ffbf1f..b7473e5ea4759 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -85,6 +85,10 @@ GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
       FullFS += "-wavefrontsize64,";
   }
 
+  // GFX9 defaults to fast-fmaf; the trailing ',' separates it from FS below.
+  if (GPU.contains_insensitive("gfx9") && !FS.contains_insensitive("fast-fmaf"))
+    FullFS += "+fast-fmaf,";
+
   FullFS += FS;
 
   ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 16530087444d2..59fcf9fb6da39 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6502,10 +6502,14 @@ bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
   // When fma is quarter rate, for f64 where add / sub are at best half rate,
   // most of these combines appear to be cycle neutral but save on instruction
   // count / code size.
-  return true;
+  return Subtarget->hasFastFMAF32() &&
+         !Subtarget->hasDisableAggressiveFMAFusion();
 }
 
-bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
+bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const {
+  return Subtarget->hasFastFMAF32() &&
+         !Subtarget->hasDisableAggressiveFMAFusion();
+}
 
 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
                                          EVT VT) const {