[llvm] [VectorCombine] Prevent redundant cost computation for repeated operand pairs in foldShuffleOfIntrinsics (PR #171965)

via llvm-commits <llvm-commits at lists.llvm.org>
Fri Dec 12 05:51:13 PST 2025


https://github.com/Bhuvan1527 updated https://github.com/llvm/llvm-project/pull/171965

From 66bf918b52b63cee090003eddce0ba8dd89cb893 Mon Sep 17 00:00:00 2001
From: bhuvan1527 <balabhuvanvarma at gmail.com>
Date: Fri, 12 Dec 2025 09:55:23 +0530
Subject: [PATCH 1/2] [VectorCombine] Prevent redundant cost computation for
 repeated operand pairs in foldShuffleOfIntrinsics

This PR resolves [#170867](https://github.com/llvm/llvm-project/issues/170867).
The existing code recomputes the shuffle-creation cost even when an intrinsic
operand pair repeats across arguments. The duplicate charges inflate NewCost,
so the cost model rejects the fold as unprofitable.

This patch addresses both sides of the problem. When calculating NewCost, the
shuffle cost for an operand pair is skipped if that pair has already been
accounted for. When emitting the transformed code, the shufflevector created
for the first occurrence of a pair is reused for every repeat instead of
creating a duplicate.
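
The core of the change is a standard seen-set idiom: charge the shuffle cost
(or create the shufflevector) only the first time a given operand pair
appears. Below is a minimal, self-contained sketch of that idiom; plain std::
containers stand in for LLVM's SmallDenseSet/SmallDenseMap, and the names
costOfShuffles/getOrCreateShuffle plus the unit cost are illustrative, not
code from this patch.

    #include <map>
    #include <set>
    #include <utility>
    #include <vector>

    struct Value {}; // stand-in for llvm::Value
    using OperandPair = std::pair<Value *, Value *>;

    // Cost side: each distinct operand pair is charged exactly once;
    // insert().second is false when the pair was already seen.
    int costOfShuffles(const std::vector<OperandPair> &Pairs) {
      std::set<OperandPair> Seen;
      int Cost = 0;
      for (const OperandPair &P : Pairs)
        if (Seen.insert(P).second)
          Cost += 1; // stands in for TTI.getShuffleCost(...)
      return Cost;
    }

    // Codegen side: create one shuffle per distinct pair, then reuse it.
    template <typename CreateFn>
    Value *getOrCreateShuffle(std::map<OperandPair, Value *> &Cache,
                              OperandPair P, CreateFn Create) {
      auto [It, Inserted] = Cache.try_emplace(P, nullptr);
      if (Inserted)
        It->second = Create(); // stands in for Builder.CreateShuffleVector
      return It->second;
    }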
---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 243f685cf25e2..df0efac24ed57 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2917,6 +2917,7 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
 
   SmallVector<Type *> NewArgsTy;
   InstructionCost NewCost = 0;
+  SmallDenseSet<std::pair<Value *, Value *>> SeenOperandPairs;
   for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
     if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
       NewArgsTy.push_back(II0->getArgOperand(I)->getType());
@@ -2925,6 +2926,12 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
       auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
                                          ShuffleDstTy->getNumElements());
       NewArgsTy.push_back(ArgTy);
+      std::pair<Value *, Value *> OperandPair =
+          std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
+      if (!SeenOperandPairs.insert(OperandPair).second) {
+        // We've already computed the cost for this operand pair.
+        continue;
+      }
       NewCost += TTI.getShuffleCost(
           TargetTransformInfo::SK_PermuteTwoSrc, ArgTy, VecTy, OldMask,
           CostKind, 0, nullptr, {II0->getArgOperand(I), II1->getArgOperand(I)});
@@ -2941,12 +2948,22 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
     return false;
 
   SmallVector<Value *> NewArgs;
+  SmallDenseMap<std::pair<Value *, Value *>, Value *> ShuffleCache;
   for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
     if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
       NewArgs.push_back(II0->getArgOperand(I));
     } else {
+      std::pair<Value *, Value *> OperandPair =
+          std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
+      auto It = ShuffleCache.find(OperandPair);
+      if (It != ShuffleCache.end()) {
+        // Reuse previously created shuffle for this operand pair.
+        NewArgs.push_back(It->second);
+        continue;
+      }
       Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
                                                 II1->getArgOperand(I), OldMask);
+      ShuffleCache[OperandPair] = Shuf;
       NewArgs.push_back(Shuf);
       Worklist.pushValue(Shuf);
     }
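
One detail worth noting about the hunks above (my reading of the patch, not
text from the PR): the seen-pair check sits after the NewArgsTy.push_back, so
only the shuffle cost is skipped for a repeated pair, and the new intrinsic's
argument type list stays complete. Continuing the sketch from the commit
message, a hypothetical driver for the fma case below, where the first two
arguments share one operand pair:

    int main() {
      Value X, Y, L, H, Shuffles[2];
      int NumCreated = 0;
      std::vector<OperandPair> Pairs = {{&X, &Y}, {&X, &Y}, {&L, &H}};

      int Cost = costOfShuffles(Pairs); // 2, not 3: the repeat is free

      std::map<OperandPair, Value *> Cache;
      auto Create = [&] { return &Shuffles[NumCreated++]; };
      Value *S0 = getOrCreateShuffle(Cache, Pairs[0], Create);
      Value *S1 = getOrCreateShuffle(Cache, Pairs[1], Create); // S1 == S0
      return (Cost == 2 && S0 == S1 && NumCreated == 1) ? 0 : 1;
    }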

From 9bf6b4d51a83288baff134bcdbe7c140bab21ff1 Mon Sep 17 00:00:00 2001
From: bhuvan1527 <balabhuvanvarma at gmail.com>
Date: Fri, 12 Dec 2025 19:18:58 +0530
Subject: [PATCH 2/2] [VectorCombine] Prevent redundant cost computation for
 repeated operand pairs in foldShuffleOfIntrinsics

Added a test exercising the repeated-operand-pair case. In test7 the two
multiplicand operands of each fma call are identical, so the shuffled operand
pair repeats across the intrinsic's first two arguments; the AVX check lines
show the fold firing with a single shufflevector feeding both operands.
---
 .../X86/shuffle-of-intrinsics.ll              | 28 +++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
index e64e7807b7d4a..a949d36b8b8ce 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
@@ -118,8 +118,36 @@ define <2 x float> @test6(<4 x float> %a1, <4 x float> %b1, <4 x float> %c1, <4
   ret <2 x float> %s
 }
 
+define <8 x float> @test7(<4 x float> %x0, <4 x float> %x1, <4 x float> %y0, <4 x float> %y1) {
+; SSE-LABEL: @test7(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X1:%.*]], <4 x float> [[Y1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[X1]], <4 x float> [[Y1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[X0:%.*]], <4 x float> [[Y0:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[X0]], <4 x float> [[Y0]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP3]], <8 x float> [[TMP4]], <8 x float> zeroinitializer)
+; SSE-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x float> [[TMP5]])
+; SSE-NEXT:    ret <8 x float> [[RES]]
+;
+; AVX-LABEL: @test7(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X1:%.*]], <4 x float> [[Y1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[X0:%.*]], <4 x float> [[Y0:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP2]], <8 x float> zeroinitializer)
+; AVX-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP1]], <8 x float> [[TMP3]])
+; AVX-NEXT:    ret <8 x float> [[RES]]
+;
+  %l0 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x0, <4 x float> zeroinitializer)
+  %l1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x1, <4 x float> %x1, <4 x float> %l0)
+  %h0 = call <4 x float> @llvm.fma.v4f32(<4 x float> %y0, <4 x float> %y0, <4 x float> zeroinitializer)
+  %h1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %y1, <4 x float> %y1, <4 x float> %h0)
+  %res = shufflevector <4 x float> %l1, <4 x float> %h1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %res
+}
+
+
 declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
 declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
 declare <4 x i1> @llvm.is.fpclass.v4f32(<4 x float>, i32)
 declare <4 x float> @llvm.powi.v4f32.i32(<4 x float>, i32)
 declare <4 x float> @llvm.powi.v4f32.v4i32(<4 x float>, <4 x i32>)
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>)


