[llvm] [VectorCombine] Support multiple uses of shuffled ops (PR #173183)
Dhruva Narayan K via llvm-commits
llvm-commits at lists.llvm.org
Sun Dec 21 01:44:20 PST 2025
https://github.com/Xylecrack updated https://github.com/llvm/llvm-project/pull/173183
From dfb77ee59b007312cff3e088c26416867f924936 Mon Sep 17 00:00:00 2001
From: Dhruva Narayan <dhruvakodiadka at gmail.com>
Date: Sun, 21 Dec 2025 14:42:17 +0530
Subject: [PATCH] [VectorCombine] Support multiple uses of shuffled ops
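
Previously, foldShuffleOfIntrinsics gave up unless both shuffled operands
had a single use. Relax that restriction: attempt the fold regardless, and
when an original intrinsic has additional users, charge its cost back into
the new cost, since that call has to stay alive. The existing cost
comparison then decides whether the transform is still profitable.

As a sketch (taken from the new test), the shuffle below can now be folded
into a single <8 x i32> abs on targets where the wider intrinsic is cheap
enough, with the original <4 x i32> abs kept alive for its extra use:

  %a = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 false)
  %extra_use = extractelement <4 x i32> %a, i32 0
  %b = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %1, i1 false)
  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>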
---
.../Transforms/Vectorize/VectorCombine.cpp | 36 +++++---
.../X86/shuffle-of-intrinsics-multiuse.ll | 91 +++++++++++++++++++
2 files changed, 113 insertions(+), 14 deletions(-)
create mode 100644 llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics-multiuse.ll
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 9239cb1b989b2..2e23540b44d55 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -55,9 +55,9 @@ STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
STATISTIC(NumScalarCmp, "Number of scalar compares formed");
STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
-static cl::opt<bool> DisableVectorCombine(
- "disable-vector-combine", cl::init(false), cl::Hidden,
- cl::desc("Disable all vector combine transforms"));
+static cl::opt<bool>
+ DisableVectorCombine("disable-vector-combine", cl::init(false), cl::Hidden,
+ cl::desc("Disable all vector combine transforms"));
static cl::opt<bool> DisableBinopExtractShuffle(
"disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
@@ -1211,8 +1211,7 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
// Determine scalar opcode
- std::optional<unsigned> FunctionalOpcode =
- VPI.getFunctionalOpcode();
+ std::optional<unsigned> FunctionalOpcode = VPI.getFunctionalOpcode();
std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
if (!FunctionalOpcode) {
ScalarIntrID = VPI.getFunctionalIntrinsicID();
@@ -1235,8 +1234,7 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
(SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
- LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI << "\n");
LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
<< ", Cost of scalarizing:" << NewCost << "\n");
@@ -2333,10 +2331,12 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
}
unsigned NumOpElts = Op0Ty->getNumElements();
- bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
+ bool IsIdentity0 =
+ ShuffleDstTy == Op0Ty &&
all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
- bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
+ bool IsIdentity1 =
+ ShuffleDstTy == Op1Ty &&
all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
@@ -3076,8 +3076,7 @@ bool VectorCombine::foldShufflesOfLengthChangingShuffles(Instruction &I) {
bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
Value *V0, *V1;
ArrayRef<int> OldMask;
- if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
- m_Mask(OldMask))))
+ if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
return false;
auto *II0 = dyn_cast<IntrinsicInst>(V0);
@@ -3089,6 +3088,11 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
if (IID != II1->getIntrinsicID())
return false;
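+  // Compute the costs of the original intrinsics up front: they feed the
+  // old-cost estimate and are charged again below when a call has uses
+  // besides the shuffle.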
+ InstructionCost CostII0 =
+ TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
+ InstructionCost CostII1 =
+ TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind);
+
auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
if (!ShuffleDstTy || !II0Ty)
@@ -3103,8 +3107,7 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
return false;
InstructionCost OldCost =
- TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
- TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind) +
+ CostII0 + CostII1 +
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
@@ -3133,11 +3136,16 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
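+  // The original calls only go away if the shuffle was their sole user;
+  // a call with extra uses survives the fold, so its cost is paid in the
+  // new sequence too (once only when both shuffle operands are the same).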
+ if (!II0->hasOneUse())
+ NewCost += CostII0;
+ if (II1 != II0 && !II1->hasOneUse())
+ NewCost += CostII1;
+
LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
<< "\n");
- if (NewCost > OldCost)
+ if (NewCost > OldCost)
return false;
SmallVector<Value *> NewArgs;
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics-multiuse.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics-multiuse.ll
new file mode 100644
index 0000000000000..4e370a974b953
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics-multiuse.ll
@@ -0,0 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=vector-combine -mtriple=x86_64-- -mcpu=x86-64 -S < %s | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt -passes=vector-combine -mtriple=x86_64-- -mcpu=x86-64-v3 -S < %s | FileCheck %s --check-prefixes=CHECK,AVX
+
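+; %a has a second user, so the original abs must stay alive after the fold.
+; Folding is only profitable where the wide abs is cheap (AVX), not on SSE.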
+define <8 x i32> @test_multiuse_one_side(<4 x i32> %0, <4 x i32> %1) {
+; SSE-LABEL: define <8 x i32> @test_multiuse_one_side(
+; SSE-SAME: <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; SSE-NEXT: [[ENTRY:.*:]]
+; SSE-NEXT: [[A:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0]], i1 false)
+; SSE-NEXT: [[EXTRA_USE:%.*]] = extractelement <4 x i32> [[A]], i32 0
+; SSE-NEXT: [[B:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1]], i1 false)
+; SSE-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[RES:%.*]] = add i32 [[EXTRA_USE]], 1
+; SSE-NEXT: ret <8 x i32> [[R]]
+;
+; AVX-LABEL: define <8 x i32> @test_multiuse_one_side(
+; AVX-SAME: <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; AVX-NEXT: [[ENTRY:.*:]]
+; AVX-NEXT: [[A:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0]], i1 false)
+; AVX-NEXT: [[EXTRA_USE:%.*]] = extractelement <4 x i32> [[A]], i32 0
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT: [[R:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP2]], i1 false)
+; AVX-NEXT: [[RES:%.*]] = add i32 [[EXTRA_USE]], 1
+; AVX-NEXT: ret <8 x i32> [[R]]
+;
+entry:
+ %a = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 false)
+ %extra_use = extractelement <4 x i32> %a, i32 0
+ %b = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %1, i1 false)
+ %r = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %res = add i32 %extra_use, 1
+ ret <8 x i32> %r
+}
+
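+; Both intrinsics have extra users and must be kept, so the fold is never
+; profitable here.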
+define <8 x i32> @test_multiuse_both_sides(<4 x i32> %0, <4 x i32> %1) {
+; CHECK-LABEL: define <8 x i32> @test_multiuse_both_sides(
+; CHECK-SAME: <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[A:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0]], i1 false)
+; CHECK-NEXT: [[B:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1]], i1 false)
+; CHECK-NEXT: [[UA:%.*]] = extractelement <4 x i32> [[A]], i32 0
+; CHECK-NEXT: [[UB:%.*]] = extractelement <4 x i32> [[B]], i32 0
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: ret <8 x i32> [[R]]
+;
+entry:
+ %a = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 false)
+ %b = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %1, i1 false)
+ %ua = extractelement <4 x i32> %a, i32 0
+ %ub = extractelement <4 x i32> %b, i32 0
+ %r = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %r
+}
+
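+; Both shuffle operands are the same call, so its keep-alive cost is only
+; counted once; the fold still fires on AVX.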
+define <8 x i32> @test_same_instruction_multi_use(<4 x i32> %0) {
+; SSE-LABEL: define <8 x i32> @test_same_instruction_multi_use(
+; SSE-SAME: <4 x i32> [[TMP0:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[ENTRY:.*:]]
+; SSE-NEXT: [[A:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0]], i1 false)
+; SSE-NEXT: [[EXTRA:%.*]] = add <4 x i32> [[A]], [[A]]
+; SSE-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: ret <8 x i32> [[R]]
+;
+; AVX-LABEL: define <8 x i32> @test_same_instruction_multi_use(
+; AVX-SAME: <4 x i32> [[TMP0:%.*]]) #[[ATTR0]] {
+; AVX-NEXT: [[ENTRY:.*:]]
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP0]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT: [[R:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP1]], i1 false)
+; AVX-NEXT: ret <8 x i32> [[R]]
+;
+entry:
+ %a = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 false)
+ %extra = add <4 x i32> %a, %a
+ %r = shufflevector <4 x i32> %a, <4 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %r
+}
+
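+; Single-use intrinsics fold as before on all targets.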
+define <8 x i32> @test_shared_operands(<4 x i32> %0, <4 x i32> %1) {
+; CHECK-LABEL: define <8 x i32> @test_shared_operands(
+; CHECK-SAME: <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[R:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP2]])
+; CHECK-NEXT: ret <8 x i32> [[R]]
+;
+entry:
+ %a = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %0, <4 x i32> %0)
+ %b = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %1, <4 x i32> %1)
+ %r = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %r
+}