[llvm] [SLP] Guard FMulAdd conversion to require single-use/non-reordered FMul operands (PR #189692)

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 31 08:24:33 PDT 2026


https://github.com/alexey-bataev created https://github.com/llvm/llvm-project/pull/189692

The FMulAdd (CombinedVectorize) transformation in transformNodes() marks
an FMul child entry with zero cost, assuming it is fully absorbed into
the fmuladd intrinsic. However, when any FMul scalar has multiple uses
(e.g., also stored separately), the FMul must survive as a separate
node, so treating it as free under-costs the tree. This patch guards the
conversion: it is applied only when the FMul operand entry is a plain
vectorized node with no reordering/reuse shuffles and all of its
non-copyable scalars have a single use.


From 1f7a67f32fed55ca7d2aa3dd3124cc040c2ccb9b Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Tue, 31 Mar 2026 08:23:54 -0700
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?=
 =?UTF-8?q?l=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.7
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 16 +++++++++++++++
 .../AArch64/fma-conversion-multi-use-guard.ll | 20 ++++++-------------
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e48b68fafd806..b3fbfc5b3ee93 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -14351,6 +14351,22 @@ void BoUpSLP::transformNodes() {
       if (E.State != TreeEntry::Vectorize ||
           !E.getOperations().isAddSubLikeOp())
         break;
+      const TreeEntry *LHS = getOperandEntry(&E, 0);
+      const TreeEntry *RHS = getOperandEntry(&E, 1);
+      auto IsOneUseVectorFMulOperand = [](const TreeEntry *TE) {
+        return TE->State == TreeEntry::Vectorize &&
+               TE->ReorderIndices.empty() && TE->ReuseShuffleIndices.empty() &&
+               TE->getOpcode() == Instruction::FMul && !TE->isAltShuffle() &&
+               all_of(TE->Scalars, [&](Value *V) {
+                 return (TE->hasCopyableElements() &&
+                         TE->isCopyableElement(V)) ||
+                        V->hasOneUse();
+               });
+      };
+      if (!IsOneUseVectorFMulOperand(LHS) &&
+          (E.getOpcode() == Instruction::Sub ||
+           !IsOneUseVectorFMulOperand(RHS)))
+        break;
       if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
                .isValid())
         break;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/fma-conversion-multi-use-guard.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/fma-conversion-multi-use-guard.ll
index 624c24a99859f..2acbebf259ee9 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/fma-conversion-multi-use-guard.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/fma-conversion-multi-use-guard.ll
@@ -47,20 +47,12 @@ entry:
 define void @mixed_use_v4(ptr %src0, ptr %src1, ptr %dst, ptr %extra) {
 ; CHECK-LABEL: @mixed_use_v4(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[GEP_S0_2:%.*]] = getelementptr inbounds float, ptr [[SRC0:%.*]], i64 2
-; CHECK-NEXT:    [[GEP_S1_2:%.*]] = getelementptr inbounds float, ptr [[SRC1:%.*]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC0]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[SRC1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], splat (float 1.000000e+00)
-; CHECK-NEXT:    [[GEP_D_2:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x float>, ptr [[GEP_S0_2]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x float>, ptr [[GEP_S1_2]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast <2 x float> [[TMP8]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <2 x float> [[TMP6]], splat (float 1.000000e+00)
-; CHECK-NEXT:    store <2 x float> [[TMP3]], ptr [[DST]], align 4
-; CHECK-NEXT:    store <2 x float> [[TMP7]], ptr [[GEP_D_2]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC0:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[SRC1:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <4 x float> [[TMP2]], splat (float 1.000000e+00)
+; CHECK-NEXT:    store <4 x float> [[TMP3]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
 ; CHECK-NEXT:    store float [[TMP4]], ptr [[EXTRA:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;



More information about the llvm-commits mailing list