[llvm] [VectorCombine] foldPermuteOfIntrinsic - support multiple uses of shuffled ops (PR #175299)

Sat Jan 10 00:55:29 PST 2026

https://github.com/raventid created https://github.com/llvm/llvm-project/pull/175299

Fixes https://github.com/llvm/llvm-project/issues/173039



>From 4c4a9b6cfc5b883a99d111cda3897538cda125e1 Mon Sep 17 00:00:00 2001
From: raventid <juliankul at gmail.com>
Date: Sat, 10 Jan 2026 16:44:58 +0800
Subject: [PATCH] [VectorCombine] foldPermuteOfIntrinsic - support multiple
 uses of shuffled ops

Fixes https://github.com/llvm/llvm-project/issues/173039
---
 .../Transforms/Vectorize/VectorCombine.cpp    | 29 ++++++++++-------
 .../VectorCombine/X86/shuffle-of-fma-const.ll | 32 +++++++++++++++++++
 2 files changed, 50 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index c9e45a8d05d78..fdab1813e9f23 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -55,9 +55,9 @@ STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
 STATISTIC(NumScalarCmp, "Number of scalar compares formed");
 STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
 
-static cl::opt<bool> DisableVectorCombine(
-    "disable-vector-combine", cl::init(false), cl::Hidden,
-    cl::desc("Disable all vector combine transforms"));
+static cl::opt<bool>
+    DisableVectorCombine("disable-vector-combine", cl::init(false), cl::Hidden,
+                         cl::desc("Disable all vector combine transforms"));
 
 static cl::opt<bool> DisableBinopExtractShuffle(
     "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
@@ -1211,8 +1211,7 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
   InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
 
   // Determine scalar opcode
-  std::optional<unsigned> FunctionalOpcode =
-      VPI.getFunctionalOpcode();
+  std::optional<unsigned> FunctionalOpcode = VPI.getFunctionalOpcode();
   std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
   if (!FunctionalOpcode) {
     ScalarIntrID = VPI.getFunctionalIntrinsicID();
@@ -1235,8 +1234,7 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
       (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
   InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
 
-  LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
-                    << "\n");
+  LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI << "\n");
   LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
                     << ", Cost of scalarizing:" << NewCost << "\n");
 
@@ -2330,10 +2328,12 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
   }
 
   unsigned NumOpElts = Op0Ty->getNumElements();
-  bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
+  bool IsIdentity0 =
+      ShuffleDstTy == Op0Ty &&
       all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
       ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
-  bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
+  bool IsIdentity1 =
+      ShuffleDstTy == Op1Ty &&
       all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
       ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
 
@@ -3204,7 +3204,7 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
 bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
   Value *V0;
   ArrayRef<int> Mask;
-  if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_Undef(), m_Mask(Mask))))
+  if (!match(&I, m_Shuffle(m_Value(V0), m_Undef(), m_Mask(Mask))))
     return false;
 
   auto *II0 = dyn_cast<IntrinsicInst>(V0);
@@ -3226,8 +3226,10 @@ bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
     return false;
 
   // Cost analysis
+  InstructionCost IntrinsicCost =
+      TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
   InstructionCost OldCost =
-      TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
+      IntrinsicCost +
       TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
                          IntrinsicSrcTy, Mask, CostKind, 0, nullptr, {V0}, &I);
 
@@ -3249,6 +3251,11 @@ bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
   IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
   NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
 
+  // If the intrinsic has multiple uses, we need to account for the cost of
+  // keeping the original intrinsic around.
+  if (!II0->hasOneUse())
+    NewCost += IntrinsicCost;
+
   LLVM_DEBUG(dbgs() << "Found a permute of intrinsic: " << I << "\n  OldCost: "
                     << OldCost << " vs NewCost: " << NewCost << "\n");
 
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-fma-const.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-fma-const.ll
index ff810b615bac9..78879d4903a97 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-fma-const.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-fma-const.ll
@@ -52,3 +52,35 @@ define <8 x float> @interleave_fma_const_chain(<4 x float> %a0, <4 x float> %a1)
   %res = shufflevector <4 x float> %l, <4 x float> %h, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
   ret <8 x float> %res
 }
+
+; Negative test - multiple uses make the transformation unprofitable
+define <4 x float> @shuffle_fma_const_chain_multiuse(<4 x float> %a0, ptr %p) {
+; CHECK-LABEL: define <4 x float> @shuffle_fma_const_chain_multiuse(
+; CHECK-SAME: <4 x float> [[A0:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[F:%.*]] = tail call noundef <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> splat (float 0x3F8DE8D040000000), <4 x float> splat (float 0xBFB3715EE0000000))
+; CHECK-NEXT:    [[RES:%.*]] = shufflevector <4 x float> [[F]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    store <4 x float> [[F]], ptr [[P]], align 16
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %f = tail call noundef <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> splat (float 0x3F8DE8D040000000), <4 x float> splat (float 0xBFB3715EE0000000))
+  %res = shufflevector <4 x float> %f, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  store <4 x float> %f, ptr %p, align 16
+  ret <4 x float> %res
+}
+
+; Negative test - intrinsic used by shuffle and arithmetic
+define <4 x float> @shuffle_fma_multiuse_with_arith(<4 x float> %a0, <4 x float> %b) {
+; CHECK-LABEL: define <4 x float> @shuffle_fma_multiuse_with_arith(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[F:%.*]] = tail call noundef <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> splat (float 0x3F8DE8D040000000), <4 x float> splat (float 0xBFB3715EE0000000))
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[F]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[ADD:%.*]] = fadd <4 x float> [[F]], [[B]]
+; CHECK-NEXT:    [[RES:%.*]] = fadd <4 x float> [[SHUF]], [[ADD]]
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %f = tail call noundef <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> splat (float 0x3F8DE8D040000000), <4 x float> splat (float 0xBFB3715EE0000000))
+  %shuf = shufflevector <4 x float> %f, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %add = fadd <4 x float> %f, %b
+  %res = fadd <4 x float> %shuf, %add
+  ret <4 x float> %res
+}