[llvm-branch-commits] [llvm] backport: [SLP]Improve reductions for copyables/split nodes (PR #187659)

Fri Mar 20 01:19:17 PDT 2026

llvmbot wrote:




@llvm/pr-subscribers-vectorizers

Author: Pengcheng Wang (wangpc-pp)

<details>
<summary>Changes</summary>

The original support for copyables leads to a regression in x264 in RISCV, this patch improves detection of the copyable candidates by more precise checking of the profitability and adds and extra check for splitnode reduction, if it is profitable.

Fixes #184313

Reviewers: hiraditya, RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/185697

---

Patch is 23.30 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/187659.diff


7 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+58-28) 
- (modified) llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll (+8-9) 
- (modified) llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll (+8-9) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll (+16-20) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/deleted-instructions-clear.ll (+28-23) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll (+2-2) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll (+4-5) 


``````````diff

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a717043508132..0a6b2545bc461 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3693,6 +3693,18 @@ class slpvectorizer::BoUpSLP {
     return ScalarToTreeEntries.contains(V);
   }
 
+  /// Checks if it is legal and profitable to build SplitVectorize node for the
+  /// given \p VL.
+  /// \param Op1 first homogeneous scalars.
+  /// \param Op2 second homogeneous scalars.
+  /// \param ReorderIndices indices to reorder the scalars.
+  /// \returns true if the node was successfully built.
+  bool canBuildSplitNode(ArrayRef<Value *> VL,
+                         const InstructionsState &LocalState,
+                         SmallVectorImpl<Value *> &Op1,
+                         SmallVectorImpl<Value *> &Op2,
+                         OrdersType &ReorderIndices) const;
+
   ~BoUpSLP();
 
 private:
@@ -3753,18 +3765,6 @@ class slpvectorizer::BoUpSLP {
                                ArrayRef<Value *> VectorizedVals,
                                SmallPtrSetImpl<Value *> &CheckedExtracts);
 
-  /// Checks if it is legal and profitable to build SplitVectorize node for the
-  /// given \p VL.
-  /// \param Op1 first homogeneous scalars.
-  /// \param Op2 second homogeneous scalars.
-  /// \param ReorderIndices indices to reorder the scalars.
-  /// \returns true if the node was successfully built.
-  bool canBuildSplitNode(ArrayRef<Value *> VL,
-                         const InstructionsState &LocalState,
-                         SmallVectorImpl<Value *> &Op1,
-                         SmallVectorImpl<Value *> &Op2,
-                         OrdersType &ReorderIndices) const;
-
   /// This is the recursive part of buildTree.
   void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
                     unsigned InterleaveFactor = 0);
@@ -25087,6 +25087,11 @@ class HorizontalReduction {
     SmallVector<SmallVector<Value *>> LocalReducedVals;
     // Try merge consecutive reduced values into a single vectorizable group and
     // check, if they can be vectorized as copyables.
+    const bool TwoGroupsOnly = ReducedVals.size() == 2;
+    const bool TwoGroupsOfSameSmallSize =
+        TwoGroupsOnly &&
+        ReducedVals.front().size() == ReducedVals.back().size() &&
+        ReducedVals.front().size() < ReductionLimit;
     for (ArrayRef<Value *> RV : ReducedVals) {
       // Loads are not very compatible with undefs.
       if (isa<UndefValue>(RV.front()) &&
@@ -25103,22 +25108,47 @@ class HorizontalReduction {
         States.push_back(getSameOpcode(RV, TLI));
         continue;
       }
-      SmallVector<Value *> Ops;
-      if (!LocalReducedVals.empty())
-        Ops = LocalReducedVals.back();
-      Ops.append(RV.begin(), RV.end());
-      InstructionsCompatibilityAnalysis Analysis(DT, DL, *TTI, TLI);
-      InstructionsState OpS =
-          Analysis.buildInstructionsState(Ops, V, VectorizeCopyableElements);
-      if (LocalReducedVals.empty()) {
-        LocalReducedVals.push_back(Ops);
-        States.push_back(OpS);
-        continue;
-      }
-      if (OpS) {
-        LocalReducedVals.back().swap(Ops);
-        States.back() = OpS;
-        continue;
+      // Do some copyables analysis only if more than 2 groups exist or they
+      // are large enough.
+      if (!TwoGroupsOfSameSmallSize) {
+        SmallVector<Value *> Ops;
+        if (!LocalReducedVals.empty())
+          Ops = LocalReducedVals.back();
+        Ops.append(RV.begin(), RV.end());
+        InstructionsCompatibilityAnalysis Analysis(DT, DL, *TTI, TLI);
+        InstructionsState OpS = Analysis.buildInstructionsState(
+            Ops, V, /*TryCopyableElementsVectorization=*/true,
+            /*WithProfitabilityCheck=*/true, /*SkipSameCodeCheck=*/true);
+        if (OpS && OpS.areInstructionsWithCopyableElements()) {
+          if (LocalReducedVals.empty()) {
+            LocalReducedVals.push_back(Ops);
+            States.push_back(OpS);
+            continue;
+          }
+          LocalReducedVals.back().swap(Ops);
+          States.back() = OpS;
+          continue;
+        }
+        // For safety, allow split vectorization only if 2 groups are available
+        // overall.
+        if (TwoGroupsOnly) {
+          auto [MainOp, AltOp] = getMainAltOpsNoStateVL(Ops);
+          OpS = InstructionsState(MainOp, AltOp);
+          // Last chance to try to vectorize alternate node.
+          SmallVector<Value *> Op1, Op2;
+          BoUpSLP::OrdersType ReorderIndices;
+          if (MainOp && AltOp &&
+              V.canBuildSplitNode(Ops, OpS, Op1, Op2, ReorderIndices)) {
+            if (LocalReducedVals.empty()) {
+              LocalReducedVals.push_back(Ops);
+              States.push_back(OpS);
+              continue;
+            }
+            LocalReducedVals.back().swap(Ops);
+            States.back() = OpS;
+            continue;
+          }
+        }
       }
       LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
       States.push_back(getSameOpcode(RV, TLI));
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll b/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll
index a2ccbb96b6003..d36da8d028c60 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll
@@ -18,15 +18,14 @@ define <4 x i32> @square(<4 x i32> %num, i32 %y, i32 %x, i32 %h, i32 %k, i32 %w,
 ; CHECK-NEXT:    [[MUL13:%.*]] = mul nsw i32 [[W:%.*]], 53
 ; CHECK-NEXT:    [[DIV17:%.*]] = sdiv i32 [[X:%.*]], 820
 ; CHECK-NEXT:    [[MUL21:%.*]] = shl nsw i32 [[U:%.*]], 2
-; CHECK-NEXT:    [[DOTSCALAR:%.*]] = add i32 [[Y:%.*]], 1
-; CHECK-NEXT:    [[DOTSCALAR1:%.*]] = add i32 [[DOTSCALAR]], [[DIV17]]
-; CHECK-NEXT:    [[DOTSCALAR2:%.*]] = add i32 [[DOTSCALAR1]], [[MUL5]]
-; CHECK-NEXT:    [[DOTSCALAR3:%.*]] = add i32 [[DOTSCALAR2]], [[DIV]]
-; CHECK-NEXT:    [[DOTSCALAR4:%.*]] = add i32 [[DOTSCALAR3]], [[MUL13]]
-; CHECK-NEXT:    [[DOTSCALAR5:%.*]] = add i32 [[DOTSCALAR4]], [[MUL]]
-; CHECK-NEXT:    [[DOTSCALAR6:%.*]] = add i32 [[DOTSCALAR5]], [[DIV9]]
-; CHECK-NEXT:    [[DOTSCALAR7:%.*]] = add i32 [[DOTSCALAR6]], [[MUL21]]
-; CHECK-NEXT:    [[OP_RDX15:%.*]] = add i32 [[DOTSCALAR7]], 317425
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add nsw i32 [[DIV17]], 317426
+; CHECK-NEXT:    [[OP_RDX9:%.*]] = add nsw i32 [[DIV]], [[DIV9]]
+; CHECK-NEXT:    [[OP_RDX10:%.*]] = add i32 [[MUL5]], [[MUL13]]
+; CHECK-NEXT:    [[OP_RDX11:%.*]] = add i32 [[MUL]], [[MUL21]]
+; CHECK-NEXT:    [[OP_RDX12:%.*]] = add i32 [[OP_RDX]], [[OP_RDX9]]
+; CHECK-NEXT:    [[OP_RDX13:%.*]] = add i32 [[OP_RDX10]], [[OP_RDX11]]
+; CHECK-NEXT:    [[OP_RDX14:%.*]] = add i32 [[OP_RDX12]], [[OP_RDX13]]
+; CHECK-NEXT:    [[OP_RDX15:%.*]] = add i32 [[OP_RDX14]], [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[OP_RDX15]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[ADD29:%.*]] = add <4 x i32> [[TMP2]], [[NUM:%.*]]
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll b/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll
index eb9b249b9a898..c3131a41c2b2e 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll
@@ -18,15 +18,14 @@ define <4 x i32> @square(<4 x i32> %num, i32 %y, i32 %x, i32 %h, i32 %k, i32 %w,
 ; CHECK-NEXT:    [[MUL13:%.*]] = mul nsw i32 [[W:%.*]], 53
 ; CHECK-NEXT:    [[DIV17:%.*]] = sdiv i32 [[X:%.*]], 820
 ; CHECK-NEXT:    [[MUL21:%.*]] = shl nsw i32 [[U:%.*]], 2
-; CHECK-NEXT:    [[DOTSCALAR:%.*]] = add i32 [[Y:%.*]], 1
-; CHECK-NEXT:    [[DOTSCALAR1:%.*]] = add i32 [[DOTSCALAR]], [[DIV17]]
-; CHECK-NEXT:    [[DOTSCALAR2:%.*]] = add i32 [[DOTSCALAR1]], [[MUL5]]
-; CHECK-NEXT:    [[DOTSCALAR3:%.*]] = add i32 [[DOTSCALAR2]], [[DIV]]
-; CHECK-NEXT:    [[DOTSCALAR4:%.*]] = add i32 [[DOTSCALAR3]], [[MUL13]]
-; CHECK-NEXT:    [[DOTSCALAR5:%.*]] = add i32 [[DOTSCALAR4]], [[MUL]]
-; CHECK-NEXT:    [[DOTSCALAR6:%.*]] = add i32 [[DOTSCALAR5]], [[DIV9]]
-; CHECK-NEXT:    [[DOTSCALAR7:%.*]] = add i32 [[DOTSCALAR6]], [[MUL21]]
-; CHECK-NEXT:    [[OP_RDX15:%.*]] = add i32 [[DOTSCALAR7]], 317425
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add nsw i32 [[DIV17]], 317426
+; CHECK-NEXT:    [[OP_RDX9:%.*]] = add nsw i32 [[DIV]], [[DIV9]]
+; CHECK-NEXT:    [[OP_RDX10:%.*]] = add i32 [[MUL5]], [[MUL13]]
+; CHECK-NEXT:    [[OP_RDX11:%.*]] = add i32 [[MUL]], [[MUL21]]
+; CHECK-NEXT:    [[OP_RDX12:%.*]] = add i32 [[OP_RDX]], [[OP_RDX9]]
+; CHECK-NEXT:    [[OP_RDX13:%.*]] = add i32 [[OP_RDX10]], [[OP_RDX11]]
+; CHECK-NEXT:    [[OP_RDX14:%.*]] = add i32 [[OP_RDX12]], [[OP_RDX13]]
+; CHECK-NEXT:    [[OP_RDX15:%.*]] = add i32 [[OP_RDX14]], [[Y:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[OP_RDX15]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[ADD29:%.*]] = add <4 x i32> [[TMP2]], [[NUM:%.*]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
index e3279290c3074..dbceefdb2e986 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
@@ -5,38 +5,34 @@
 define void @Test(i32) {
 ; CHECK-LABEL: @Test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[LOCAL_8_43_US:%.*]] = phi i32 [ [[VAL_43:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[LOCAL_8_43_US1:%.*]] = phi i32 [ [[VAL_44:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[LOCAL_8_43_US]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[LOCAL_8_43_US1]], i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP6]], <i32 0, i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529>
-; CHECK-NEXT:    [[VAL_41:%.*]] = add i32 [[LOCAL_8_43_US1]], 13685
+; CHECK-NEXT:    [[LOCAL_8_43_US1:%.*]] = phi i32 [ [[VAL_44:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP10:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP4]], [[VAL_41]]
-; CHECK-NEXT:    [[VAL_43]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US1]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US1]]
 ; CHECK-NEXT:    [[VAL_44]] = add i32 [[LOCAL_8_43_US1]], 14910
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0
+; CHECK-NEXT:    [[TMP10]] = insertelement <2 x i32> [[TMP5]], i32 [[VAL_44]], i32 1
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
 ; FORCE_REDUCTION-LABEL: @Test(
 ; FORCE_REDUCTION-NEXT:  entry:
-; FORCE_REDUCTION-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0
 ; FORCE_REDUCTION-NEXT:    br label [[LOOP:%.*]]
 ; FORCE_REDUCTION:       loop:
-; FORCE_REDUCTION-NEXT:    [[LOCAL_8_43_US:%.*]] = phi i32 [ [[VAL_43:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
-; FORCE_REDUCTION-NEXT:    [[LOCAL_8_43_US1:%.*]] = phi i32 [ [[VAL_44:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
-; FORCE_REDUCTION-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[LOCAL_8_43_US]], i32 1
-; FORCE_REDUCTION-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[LOCAL_8_43_US1]], i32 2
-; FORCE_REDUCTION-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; FORCE_REDUCTION-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP6]], <i32 0, i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529>
-; FORCE_REDUCTION-NEXT:    [[VAL_41:%.*]] = add i32 [[LOCAL_8_43_US1]], 13685
+; FORCE_REDUCTION-NEXT:    [[LOCAL_8_43_US1:%.*]] = phi i32 [ [[VAL_44:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
+; FORCE_REDUCTION-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ]
+; FORCE_REDUCTION-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; FORCE_REDUCTION-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
 ; FORCE_REDUCTION-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
-; FORCE_REDUCTION-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP4]], [[VAL_41]]
-; FORCE_REDUCTION-NEXT:    [[VAL_43]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US1]]
+; FORCE_REDUCTION-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]]
+; FORCE_REDUCTION-NEXT:    [[VAL_43:%.*]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US1]]
 ; FORCE_REDUCTION-NEXT:    [[VAL_44]] = add i32 [[LOCAL_8_43_US1]], 14910
+; FORCE_REDUCTION-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[VAL_43]], i32 0
+; FORCE_REDUCTION-NEXT:    [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[VAL_44]], i32 1
 ; FORCE_REDUCTION-NEXT:    br label [[LOOP]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/deleted-instructions-clear.ll b/llvm/test/Transforms/SLPVectorizer/X86/deleted-instructions-clear.ll
index 410d67bb4c403..f83782aef123c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/deleted-instructions-clear.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/deleted-instructions-clear.ll
@@ -5,45 +5,50 @@ define void @test(i32 %arg, i32 %arg1, i64 %arg2) {
 ; CHECK-LABEL: define void @test(
 ; CHECK-SAME: i32 [[ARG:%.*]], i32 [[ARG1:%.*]], i64 [[ARG2:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[BB:.*]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, i32 [[ARG]], i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[ARG1]], i32 2
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> <i64 1, i64 poison>, i64 [[ARG2]], i32 1
 ; CHECK-NEXT:    br label %[[BB3:.*]]
 ; CHECK:       [[BB3]]:
 ; CHECK-NEXT:    [[PHI:%.*]] = phi i64 [ 0, %[[BB3]] ], [ 0, %[[BB]] ]
-; CHECK-NEXT:    [[PHI4:%.*]] = phi i64 [ [[OP_RDX8:%.*]], %[[BB3]] ], [ 0, %[[BB]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[PHI4:%.*]] = phi i64 [ [[OP_RDX3:%.*]], %[[BB3]] ], [ 0, %[[BB]] ]
+; CHECK-NEXT:    [[ADD18:%.*]] = add i64 1, 0
+; CHECK-NEXT:    [[TRUNC19:%.*]] = trunc i64 [[ADD18]] to i32
+; CHECK-NEXT:    [[TRUNC27:%.*]] = trunc i64 [[ARG2]] to i32
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i64> <i64 poison, i64 0, i64 poison, i64 0>, i64 [[PHI]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP7:%.*]] = or <4 x i64> zeroinitializer, [[TMP6]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i64 [[TMP3]] to i32
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
+; CHECK-NEXT:    [[TRUNC10:%.*]] = trunc i64 [[TMP10]] to i32
+; CHECK-NEXT:    [[OR11:%.*]] = or i32 [[TRUNC10]], 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = or <4 x i64> zeroinitializer, [[TMP6]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul <4 x i64> [[TMP7]], [[TMP11]]
+; CHECK-NEXT:    [[SHL34:%.*]] = xor i32 [[ARG]], [[TRUNC27]]
+; CHECK-NEXT:    [[SHL35:%.*]] = shl i32 [[ARG1]], 0
+; CHECK-NEXT:    [[XOR31:%.*]] = xor i32 [[ARG1]], [[TRUNC19]]
 ; CHECK-NEXT:    [[SHL:%.*]] = shl i32 0, 1
+; CHECK-NEXT:    [[TMP31:%.*]] = sext i32 [[SHL]] to i64
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 0, [[TRUNC]]
+; CHECK-NEXT:    [[SEXT15:%.*]] = sext i32 [[XOR]] to i64
 ; CHECK-NEXT:    [[SHL17:%.*]] = shl i32 [[SHL]], 0
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i64> [[TMP15]], <4 x i64> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP18:%.*]] = trunc <4 x i64> [[TMP17]] to <4 x i32>
-; CHECK-NEXT:    [[TRUNC10:%.*]] = trunc i64 [[TMP10]] to i32
-; CHECK-NEXT:    [[OR11:%.*]] = or i32 [[TRUNC10]], 0
-; CHECK-NEXT:    [[SHL34:%.*]] = shl i32 [[ARG1]], 0
-; CHECK-NEXT:    [[TMP20:%.*]] = xor <4 x i32> [[TMP2]], [[TMP18]]
+; CHECK-NEXT:    [[SEXT22:%.*]] = sext i32 [[SHL17]] to i64
+; CHECK-NEXT:    [[XOR24:%.*]] = xor i32 [[ARG]], [[TRUNC10]]
+; CHECK-NEXT:    [[SEXT25:%.*]] = sext i32 [[XOR24]] to i64
 ; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i32> poison, i32 [[SHL]], i32 0
-; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[SHL17]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[XOR31]], i32 1
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[SHL35]], i32 2
 ; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x i32> [[TMP23]], i32 [[SHL34]], i32 3
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP24]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i32> [[TMP25]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <4 x i32> [[TMP20]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <4 x i32> [[TMP25]], <4 x i32> [[TMP20]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP29:%.*]] = sext <8 x i32> [[TMP28]] to <8 x i64>
-; CHECK-NEXT:    [[TMP31:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP13]])
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <4 x i32> [[TMP24]] to <4 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP14]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i64> [[TMP13]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <8 x i64> [[TMP12]], <8 x i64> [[TMP15]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:    [[TMP32:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP29]])
-; CHECK-NEXT:    [[OP_RDX8]] = add i64 [[TMP32]], [[TMP31]]
+; CHECK-NEXT:    [[OP_RDX8:%.*]] = add i64 [[TMP32]], [[TMP31]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = add i64 [[SEXT15]], [[SEXT22]]
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = add i64 [[OP_RDX8]], [[OP_RDX1]]
+; CHECK-NEXT:    [[OP_RDX3]] = add i64 [[OP_RDX2]], [[SEXT25]]
 ; CHECK-NEXT:    br i1 false, label %[[BB40:.*]], label %[[BB3]]
 ; CHECK:       [[BB40]]:
-; CHECK-NEXT:    [[PHI41:%.*]] = phi i64 [ [[OP_RDX8]], %[[BB3]] ]
+; CHECK-NEXT:    [[PHI41:%.*]] = phi i64 [ [[OP_RDX3]], %[[BB3]] ]
 ; CHECK-NEXT:    ret void
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll
index 56919ae0ffc90..1cf837df719ec 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduct...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/187659