[llvm] [VectorCombine] Support multiple uses of shuffled ops (PR #173183)
Dhruva Narayan K via llvm-commits
llvm-commits at lists.llvm.org
Sun Dec 21 01:44:20 PST 2025
https://github.com/Xylecrack updated https://github.com/llvm/llvm-project/pull/173183
From dfb77ee59b007312cff3e088c26416867f924936 Mon Sep 17 00:00:00 2001
From: Dhruva Narayan <dhruvakodiadka at gmail.com>
Date: Sun, 21 Dec 2025 14:42:17 +0530
Subject: [PATCH] [VectorCombine] Support multiple uses of shuffled ops
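
Previously, foldShuffleOfIntrinsics gave up unless both shuffled operands
had a single use. Relax that restriction: attempt the fold regardless, and
when an original intrinsic has additional users, charge its cost back into
the new cost, since that call has to stay alive. The existing cost
comparison then decides whether the transform is still profitable.

As a sketch (taken from the new test), the shuffle below can now be folded
into a single <8 x i32> abs on targets where the wider intrinsic is cheap
enough, with the original <4 x i32> abs kept alive for its extra use:

  %a = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 false)
  %extra_use = extractelement <4 x i32> %a, i32 0
  %b = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %1, i1 false)
  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>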
---
.../Transforms/Vectorize/VectorCombine.cpp | 36 +++++---
.../X86/shuffle-of-intrinsics-multiuse.ll | 91 +++++++++++++++++++
2 files changed, 113 insertions(+), 14 deletions(-)
create mode 100644 llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics-multiuse.ll
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 9239cb1b989b2..2e23540b44d55 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -55,9 +55,9 @@ STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
STATISTIC(NumScalarCmp, "Number of scalar compares formed");
STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
-static cl::opt<bool> DisableVectorCombine(
- "disable-vector-combine", cl::init(false), cl::Hidden,
- cl::desc("Disable all vector combine transforms"));
+static cl::opt<bool>
+ DisableVectorCombine("disable-vector-combine", cl::init(false), cl::Hidden,
+ cl::desc("Disable all vector combine transforms"));
static cl::opt<bool> DisableBinopExtractShuffle(
"disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
@@ -1211,8 +1211,7 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
// Determine scalar opcode
- std::optional<unsigned> FunctionalOpcode =
- VPI.getFunctionalOpcode();
+ std::optional<unsigned> FunctionalOpcode = VPI.getFunctionalOpcode();
std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
if (!FunctionalOpcode) {
ScalarIntrID = VPI.getFunctionalIntrinsicID();
@@ -1235,8 +1234,7 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
(SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
- LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
- << "\n");
+ LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI << "\n");
LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
<< ", Cost of scalarizing:" << NewCost << "\n");
@@ -2333,10 +2331,12 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
}
unsigned NumOpElts = Op0Ty->getNumElements();
- bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
+ bool IsIdentity0 =
+ ShuffleDstTy == Op0Ty &&
all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
- bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
+ bool IsIdentity1 =
+ ShuffleDstTy == Op1Ty &&
all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
@@ -3076,8 +3076,7 @@ bool VectorCombine::foldShufflesOfLengthChangingShuffles(Instruction &I) {
bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
Value *V0, *V1;
ArrayRef<int> OldMask;
- if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
- m_Mask(OldMask))))
+ if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
return false;
auto *II0 = dyn_cast<IntrinsicInst>(V0);
@@ -3089,6 +3088,11 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
if (IID != II1->getIntrinsicID())
return false;
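+  // Compute the costs of the original intrinsics up front: they feed the
+  // old-cost estimate and are charged again below when a call has uses
+  // besides the shuffle.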
+ InstructionCost CostII0 =
+ TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
+ InstructionCost CostII1 =
+ TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind);
+
auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
if (!ShuffleDstTy || !II0Ty)
@@ -3103,8 +3107,7 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
return false;
InstructionCost OldCost =
- TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
- TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind) +
+ CostII0 + CostII1 +
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
@@ -3133,11 +3136,16 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
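+  // The original calls only go away if the shuffle was their sole user;
+  // a call with extra uses survives the fold, so its cost is paid in the
+  // new sequence too (once only when both shuffle operands are the same).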
+ if (!II0->hasOneUse())
+ NewCost += CostII0;
+ if (II1 != II0 && !II1->hasOneUse())
+ NewCost += CostII1;
+
LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
<< "\n");
- if (NewCost > OldCost)
+ if (NewCost > OldCost)
return false;
SmallVector<Value *> NewArgs;
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics-multiuse.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics-multiuse.ll
new file mode 100644
index 0000000000000..4e370a974b953
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics-multiuse.ll
@@ -0,0 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=vector-combine -mtriple=x86_64-- -mcpu=x86-64 -S < %s | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt -passes=vector-combine -mtriple=x86_64-- -mcpu=x86-64-v3 -S < %s | FileCheck %s --check-prefixes=CHECK,AVX
+
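+; %a has a second user, so the original abs must stay alive after the fold.
+; Folding is only profitable where the wide abs is cheap (AVX), not on SSE.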
+define <8 x i32> @test_multiuse_one_side(<4 x i32> %0, <4 x i32> %1) {
+; SSE-LABEL: define <8 x i32> @test_multiuse_one_side(
+; SSE-SAME: <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; SSE-NEXT: [[ENTRY:.*:]]
+; SSE-NEXT: [[A:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0]], i1 false)
+; SSE-NEXT: [[EXTRA_USE:%.*]] = extractelement <4 x i32> [[A]], i32 0
+; SSE-NEXT: [[B:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1]], i1 false)
+; SSE-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[RES:%.*]] = add i32 [[EXTRA_USE]], 1
+; SSE-NEXT: ret <8 x i32> [[R]]
+;
+; AVX-LABEL: define <8 x i32> @test_multiuse_one_side(
+; AVX-SAME: <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; AVX-NEXT: [[ENTRY:.*:]]
+; AVX-NEXT: [[A:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0]], i1 false)
+; AVX-NEXT: [[EXTRA_USE:%.*]] = extractelement <4 x i32> [[A]], i32 0
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT: [[R:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP2]], i1 false)
+; AVX-NEXT: [[RES:%.*]] = add i32 [[EXTRA_USE]], 1
+; AVX-NEXT: ret <8 x i32> [[R]]
+;
+entry:
+ %a = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 false)
+ %extra_use = extractelement <4 x i32> %a, i32 0
+ %b = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %1, i1 false)
+ %r = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %res = add i32 %extra_use, 1
+ ret <8 x i32> %r
+}
+
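+; Both intrinsics have extra users and must be kept, so the fold is never
+; profitable here.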
+define <8 x i32> @test_multiuse_both_sides(<4 x i32> %0, <4 x i32> %1) {
+; CHECK-LABEL: define <8 x i32> @test_multiuse_both_sides(
+; CHECK-SAME: <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[A:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0]], i1 false)
+; CHECK-NEXT: [[B:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1]], i1 false)
+; CHECK-NEXT: [[UA:%.*]] = extractelement <4 x i32> [[A]], i32 0
+; CHECK-NEXT: [[UB:%.*]] = extractelement <4 x i32> [[B]], i32 0
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: ret <8 x i32> [[R]]
+;
+entry:
+ %a = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 false)
+ %b = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %1, i1 false)
+ %ua = extractelement <4 x i32> %a, i32 0
+ %ub = extractelement <4 x i32> %b, i32 0
+ %r = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %r
+}
+
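+; Both shuffle operands are the same call, so its keep-alive cost is only
+; counted once; the fold still fires on AVX.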
+define <8 x i32> @test_same_instruction_multi_use(<4 x i32> %0) {
+; SSE-LABEL: define <8 x i32> @test_same_instruction_multi_use(
+; SSE-SAME: <4 x i32> [[TMP0:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[ENTRY:.*:]]
+; SSE-NEXT: [[A:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0]], i1 false)
+; SSE-NEXT: [[EXTRA:%.*]] = add <4 x i32> [[A]], [[A]]
+; SSE-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: ret <8 x i32> [[R]]
+;
+; AVX-LABEL: define <8 x i32> @test_same_instruction_multi_use(
+; AVX-SAME: <4 x i32> [[TMP0:%.*]]) #[[ATTR0]] {
+; AVX-NEXT: [[ENTRY:.*:]]
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP0]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT: [[R:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP1]], i1 false)
+; AVX-NEXT: ret <8 x i32> [[R]]
+;
+entry:
+ %a = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 false)
+ %extra = add <4 x i32> %a, %a
+ %r = shufflevector <4 x i32> %a, <4 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %r
+}
+
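+; Single-use intrinsics fold as before on all targets.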
+define <8 x i32> @test_shared_operands(<4 x i32> %0, <4 x i32> %1) {
+; CHECK-LABEL: define <8 x i32> @test_shared_operands(
+; CHECK-SAME: <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[R:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP2]])
+; CHECK-NEXT: ret <8 x i32> [[R]]
+;
+entry:
+ %a = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %0, <4 x i32> %0)
+ %b = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %1, <4 x i32> %1)
+ %r = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %r
+}