[llvm] [VectorCombine] foldPermuteOfIntrinsic - support multiple uses of shuffled ops (PR #175299)
Julian Pokrovsky via llvm-commits
llvm-commits at lists.llvm.org
Sat Jan 10 00:55:29 PST 2026
https://github.com/raventid created https://github.com/llvm/llvm-project/pull/175299
Fixes https://github.com/llvm/llvm-project/issues/173039
From 4c4a9b6cfc5b883a99d111cda3897538cda125e1 Mon Sep 17 00:00:00 2001
From: raventid <juliankul at gmail.com>
Date: Sat, 10 Jan 2026 16:44:58 +0800
Subject: [PATCH] [VectorCombine] foldPermuteOfIntrinsic - support multiple
uses of shuffled ops
Fixes https://github.com/llvm/llvm-project/issues/173039
---
.../Transforms/Vectorize/VectorCombine.cpp | 29 ++++++++++-------
.../VectorCombine/X86/shuffle-of-fma-const.ll | 32 +++++++++++++++++++
2 files changed, 50 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index c9e45a8d05d78..fdab1813e9f23 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -55,9 +55,9 @@ STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
STATISTIC(NumScalarCmp, "Number of scalar compares formed");
STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
-static cl::opt<bool> DisableVectorCombine(
- "disable-vector-combine", cl::init(false), cl::Hidden,
- cl::desc("Disable all vector combine transforms"));
+static cl::opt<bool>
+ DisableVectorCombine("disable-vector-combine", cl::init(false), cl::Hidden,
+ cl::desc("Disable all vector combine transforms"));
static cl::opt<bool> DisableBinopExtractShuffle(
"disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
@@ -1211,8 +1211,7 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
// Determine scalar opcode
- std::optional<unsigned> FunctionalOpcode =
- VPI.getFunctionalOpcode();
+ std::optional<unsigned> FunctionalOpcode = VPI.getFunctionalOpcode();
std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
if (!FunctionalOpcode) {
ScalarIntrID = VPI.getFunctionalIntrinsicID();
@@ -1235,8 +1234,7 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
(SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
- LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI << "\n");
LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
<< ", Cost of scalarizing:" << NewCost << "\n");
@@ -2330,10 +2328,12 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
}
unsigned NumOpElts = Op0Ty->getNumElements();
- bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
+ bool IsIdentity0 =
+ ShuffleDstTy == Op0Ty &&
all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
- bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
+ bool IsIdentity1 =
+ ShuffleDstTy == Op1Ty &&
all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
@@ -3204,7 +3204,7 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
Value *V0;
ArrayRef<int> Mask;
- if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_Undef(), m_Mask(Mask))))
+ if (!match(&I, m_Shuffle(m_Value(V0), m_Undef(), m_Mask(Mask))))
return false;
auto *II0 = dyn_cast<IntrinsicInst>(V0);
@@ -3226,8 +3226,10 @@ bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
return false;
// Cost analysis
+ InstructionCost IntrinsicCost =
+ TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
InstructionCost OldCost =
- TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
+ IntrinsicCost +
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
IntrinsicSrcTy, Mask, CostKind, 0, nullptr, {V0}, &I);
@@ -3249,6 +3251,11 @@ bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
+ // If the intrinsic has multiple uses, we need to account for the cost of
+ // keeping the original intrinsic around.
+ if (!II0->hasOneUse())
+ NewCost += IntrinsicCost;
+
LLVM_DEBUG(dbgs() << "Found a permute of intrinsic: " << I << "\n OldCost: "
<< OldCost << " vs NewCost: " << NewCost << "\n");
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-fma-const.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-fma-const.ll
index ff810b615bac9..78879d4903a97 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-fma-const.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-fma-const.ll
@@ -52,3 +52,35 @@ define <8 x float> @interleave_fma_const_chain(<4 x float> %a0, <4 x float> %a1)
%res = shufflevector <4 x float> %l, <4 x float> %h, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
ret <8 x float> %res
}
+
+; Negative test - multiple uses make the transformation unprofitable
+define <4 x float> @shuffle_fma_const_chain_multiuse(<4 x float> %a0, ptr %p) {
+; CHECK-LABEL: define <4 x float> @shuffle_fma_const_chain_multiuse(
+; CHECK-SAME: <4 x float> [[A0:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[F:%.*]] = tail call noundef <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> splat (float 0x3F8DE8D040000000), <4 x float> splat (float 0xBFB3715EE0000000))
+; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x float> [[F]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: store <4 x float> [[F]], ptr [[P]], align 16
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %f = tail call noundef <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> splat (float 0x3F8DE8D040000000), <4 x float> splat (float 0xBFB3715EE0000000))
+ %res = shufflevector <4 x float> %f, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ store <4 x float> %f, ptr %p, align 16
+ ret <4 x float> %res
+}
+
+; Negative test - intrinsic used by shuffle and arithmetic
+define <4 x float> @shuffle_fma_multiuse_with_arith(<4 x float> %a0, <4 x float> %b) {
+; CHECK-LABEL: define <4 x float> @shuffle_fma_multiuse_with_arith(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[F:%.*]] = tail call noundef <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> splat (float 0x3F8DE8D040000000), <4 x float> splat (float 0xBFB3715EE0000000))
+; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[F]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[F]], [[B]]
+; CHECK-NEXT: [[RES:%.*]] = fadd <4 x float> [[SHUF]], [[ADD]]
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %f = tail call noundef <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> splat (float 0x3F8DE8D040000000), <4 x float> splat (float 0xBFB3715EE0000000))
+ %shuf = shufflevector <4 x float> %f, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ %add = fadd <4 x float> %f, %b
+ %res = fadd <4 x float> %shuf, %add
+ ret <4 x float> %res
+}
More information about the llvm-commits
mailing list