[llvm] 1235409 - [VectorCombine] foldShuffleOfIntrinsics - support multiple uses of shuffled ops (#173183)

Mon Dec 22 11:00:57 PST 2025

Author: Dhruva Narayan K
Date: 2025-12-22T19:00:53Z
New Revision: 1235409ed7022aa02bfeb86c08423225dfd460b9

URL: https://github.com/llvm/llvm-project/commit/1235409ed7022aa02bfeb86c08423225dfd460b9
DIFF: https://github.com/llvm/llvm-project/commit/1235409ed7022aa02bfeb86c08423225dfd460b9.diff

LOG: [VectorCombine] foldShuffleOfIntrinsics - support multiple uses of shuffled ops (#173183)

Fixes #173037

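Remove the `m_OneUse` restriction in `foldShuffleOfIntrinsics` and
update the cost model to account for additional uses of the original intrinsics:
an original intrinsic that has other users is not removed by the fold, so its
cost is added to the new cost (counted once when both shuffle operands are the
same instruction).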

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/VectorCombine.cpp
    llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index eda7f0bac2f2e..265f1fd22a13a 100644

--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3086,8 +3086,7 @@ bool VectorCombine::foldShufflesOfLengthChangingShuffles(Instruction &I) {
 bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
   Value *V0, *V1;
   ArrayRef<int> OldMask;
-  if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
-                           m_Mask(OldMask))))
+  if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
     return false;
 
   auto *II0 = dyn_cast<IntrinsicInst>(V0);
@@ -3098,6 +3097,10 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
   Intrinsic::ID IID = II0->getIntrinsicID();
   if (IID != II1->getIntrinsicID())
     return false;
+  InstructionCost CostII0 =
+      TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
+  InstructionCost CostII1 =
+      TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind);
 
   auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
   auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
@@ -3113,8 +3116,7 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
       return false;
 
   InstructionCost OldCost =
-      TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
-      TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind) +
+      CostII0 + CostII1 +
       TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
                          II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
 
@@ -3141,7 +3143,12 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
     }
   }
   IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
+
   NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
+  if (!II0->hasOneUse())
+    NewCost += CostII0;
+  if (II1 != II0 && !II1->hasOneUse())
+    NewCost += CostII1;
 
   LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
                     << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost

diff  --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
index a949d36b8b8ce..1596614ef9584 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
@@ -151,3 +151,85 @@ declare <4 x float> @llvm.powi.v4f32.i32(<4 x float>, i32)
 declare <4 x float> @llvm.powi.v4f32.v4i32(<4 x float>, <4 x i32>)
 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
 declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>)
+
+define <8 x i32> @test_multiuse_one_side(<4 x i32> %0, <4 x i32> %1) {
+; SSE-LABEL: @test_multiuse_one_side(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[A:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0:%.*]], i1 false)
+; SSE-NEXT:    [[EXTRA_USE:%.*]] = extractelement <4 x i32> [[A]], i32 0
+; SSE-NEXT:    [[B:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1:%.*]], i1 false)
+; SSE-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[RES:%.*]] = add i32 [[EXTRA_USE]], 1
+; SSE-NEXT:    ret <8 x i32> [[R]]
+;
+; AVX-LABEL: @test_multiuse_one_side(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[A:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0:%.*]], i1 false)
+; AVX-NEXT:    [[EXTRA_USE:%.*]] = extractelement <4 x i32> [[A]], i32 0
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    [[R:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP2]], i1 false)
+; AVX-NEXT:    [[RES:%.*]] = add i32 [[EXTRA_USE]], 1
+; AVX-NEXT:    ret <8 x i32> [[R]]
+;
+entry:
+  %a = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 false)
+  %extra_use = extractelement <4 x i32> %a, i32 0
+  %b = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %1, i1 false)
+  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %res = add i32 %extra_use, 1
+  ret <8 x i32> %r
+}
+
+define <8 x i32> @test_multiuse_both_sides(<4 x i32> %0, <4 x i32> %1) {
+; CHECK-LABEL: @test_multiuse_both_sides(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0:%.*]], i1 false)
+; CHECK-NEXT:    [[B:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1:%.*]], i1 false)
+; CHECK-NEXT:    [[UA:%.*]] = extractelement <4 x i32> [[A]], i32 0
+; CHECK-NEXT:    [[UB:%.*]] = extractelement <4 x i32> [[B]], i32 0
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[R]]
+;
+entry:
+  %a = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 false)
+  %b = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %1, i1 false)
+  %ua = extractelement <4 x i32> %a, i32 0
+  %ub = extractelement <4 x i32> %b, i32 0
+  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %r
+}
+
+define <8 x i32> @test_same_instruction_multi_use(<4 x i32> %0) {
+; SSE-LABEL: @test_same_instruction_multi_use(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[A:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0:%.*]], i1 false)
+; SSE-NEXT:    [[EXTRA:%.*]] = add <4 x i32> [[A]], [[A]]
+; SSE-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    ret <8 x i32> [[R]]
+;
+; AVX-LABEL: @test_same_instruction_multi_use(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP0]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    [[R:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP1]], i1 false)
+; AVX-NEXT:    ret <8 x i32> [[R]]
+;
+entry:
+  %a = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 false)
+  %extra = add <4 x i32> %a, %a
+  %r = shufflevector <4 x i32> %a, <4 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %r
+}
+
+define <8 x i32> @test_shared_operands(<4 x i32> %0, <4 x i32> %1) {
+; CHECK-LABEL: @test_shared_operands(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP2]])
+; CHECK-NEXT:    ret <8 x i32> [[R]]
+;
+entry:
+  %a = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %0, <4 x i32> %0)
+  %b = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %1, <4 x i32> %1)
+  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %r
+}