[llvm] 1235409 - [VectorCombine] foldShuffleOfIntrinsics - support multiple uses of shuffled ops (#173183)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 22 11:00:57 PST 2025
Author: Dhruva Narayan K
Date: 2025-12-22T19:00:53Z
New Revision: 1235409ed7022aa02bfeb86c08423225dfd460b9
URL: https://github.com/llvm/llvm-project/commit/1235409ed7022aa02bfeb86c08423225dfd460b9
DIFF: https://github.com/llvm/llvm-project/commit/1235409ed7022aa02bfeb86c08423225dfd460b9.diff
LOG: [VectorCombine] foldShuffleOfIntrinsics - support multiple uses of shuffled ops (#173183)
Fixes #173037
Remove the `m_OneUse` restriction in `foldShuffleOfIntrinsics` and
update the cost model to account for additional uses of the original intrinsics.
Added:
Modified:
llvm/lib/Transforms/Vectorize/VectorCombine.cpp
llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index eda7f0bac2f2e..265f1fd22a13a 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3086,8 +3086,7 @@ bool VectorCombine::foldShufflesOfLengthChangingShuffles(Instruction &I) {
bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
Value *V0, *V1;
ArrayRef<int> OldMask;
- if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
- m_Mask(OldMask))))
+ if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
return false;
auto *II0 = dyn_cast<IntrinsicInst>(V0);
@@ -3098,6 +3097,10 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
Intrinsic::ID IID = II0->getIntrinsicID();
if (IID != II1->getIntrinsicID())
return false;
+ InstructionCost CostII0 =
+ TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
+ InstructionCost CostII1 =
+ TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind);
auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
@@ -3113,8 +3116,7 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
return false;
InstructionCost OldCost =
- TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
- TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind) +
+ CostII0 + CostII1 +
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
@@ -3141,7 +3143,12 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
}
}
IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
+
NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
+ if (!II0->hasOneUse())
+ NewCost += CostII0;
+ if (II1 != II0 && !II1->hasOneUse())
+ NewCost += CostII1;
LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
index a949d36b8b8ce..1596614ef9584 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
@@ -151,3 +151,85 @@ declare <4 x float> @llvm.powi.v4f32.i32(<4 x float>, i32)
declare <4 x float> @llvm.powi.v4f32.v4i32(<4 x float>, <4 x i32>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>)
+
+define <8 x i32> @test_multiuse_one_side(<4 x i32> %0, <4 x i32> %1) {
+; SSE-LABEL: @test_multiuse_one_side(
+; SSE-NEXT: entry:
+; SSE-NEXT: [[A:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0:%.*]], i1 false)
+; SSE-NEXT: [[EXTRA_USE:%.*]] = extractelement <4 x i32> [[A]], i32 0
+; SSE-NEXT: [[B:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1:%.*]], i1 false)
+; SSE-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[RES:%.*]] = add i32 [[EXTRA_USE]], 1
+; SSE-NEXT: ret <8 x i32> [[R]]
+;
+; AVX-LABEL: @test_multiuse_one_side(
+; AVX-NEXT: entry:
+; AVX-NEXT: [[A:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0:%.*]], i1 false)
+; AVX-NEXT: [[EXTRA_USE:%.*]] = extractelement <4 x i32> [[A]], i32 0
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT: [[R:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP2]], i1 false)
+; AVX-NEXT: [[RES:%.*]] = add i32 [[EXTRA_USE]], 1
+; AVX-NEXT: ret <8 x i32> [[R]]
+;
+entry:
+ %a = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 false)
+ %extra_use = extractelement <4 x i32> %a, i32 0
+ %b = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %1, i1 false)
+ %r = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %res = add i32 %extra_use, 1
+ ret <8 x i32> %r
+}
+
+define <8 x i32> @test_multiuse_both_sides(<4 x i32> %0, <4 x i32> %1) {
+; CHECK-LABEL: @test_multiuse_both_sides(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0:%.*]], i1 false)
+; CHECK-NEXT: [[B:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1:%.*]], i1 false)
+; CHECK-NEXT: [[UA:%.*]] = extractelement <4 x i32> [[A]], i32 0
+; CHECK-NEXT: [[UB:%.*]] = extractelement <4 x i32> [[B]], i32 0
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: ret <8 x i32> [[R]]
+;
+entry:
+ %a = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 false)
+ %b = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %1, i1 false)
+ %ua = extractelement <4 x i32> %a, i32 0
+ %ub = extractelement <4 x i32> %b, i32 0
+ %r = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %r
+}
+
+define <8 x i32> @test_same_instruction_multi_use(<4 x i32> %0) {
+; SSE-LABEL: @test_same_instruction_multi_use(
+; SSE-NEXT: entry:
+; SSE-NEXT: [[A:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0:%.*]], i1 false)
+; SSE-NEXT: [[EXTRA:%.*]] = add <4 x i32> [[A]], [[A]]
+; SSE-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: ret <8 x i32> [[R]]
+;
+; AVX-LABEL: @test_same_instruction_multi_use(
+; AVX-NEXT: entry:
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP0]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT: [[R:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP1]], i1 false)
+; AVX-NEXT: ret <8 x i32> [[R]]
+;
+entry:
+ %a = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 false)
+ %extra = add <4 x i32> %a, %a
+ %r = shufflevector <4 x i32> %a, <4 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %r
+}
+
+define <8 x i32> @test_shared_operands(<4 x i32> %0, <4 x i32> %1) {
+; CHECK-LABEL: @test_shared_operands(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[R:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP2]])
+; CHECK-NEXT: ret <8 x i32> [[R]]
+;
+entry:
+ %a = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %0, <4 x i32> %0)
+ %b = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %1, <4 x i32> %1)
+ %r = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %r
+}
More information about the llvm-commits mailing list