[llvm] [SLP][REVEC] VectorValuesAndScales should be supported by REVEC. (PR #135762)

Tue Apr 15 07:25:21 PDT 2025

https://github.com/HanKuanChen updated https://github.com/llvm/llvm-project/pull/135762

>From 5a987ef2cd7261c2d10af41571bcb93ef09d3528 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Thu, 10 Apr 2025 21:01:03 -0700
Subject: [PATCH 1/3] [SLP][REVEC] Pre-commit test.

---
 .../revec-reduced-value-vectorized-later.ll   | 88 +++++++++++++++++++
 1 file changed, 88 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll
new file mode 100644
index 0000000000000..656f216b0093e
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 < %s | FileCheck %s
+
+define <4 x i16> @test() {
+; CHECK-LABEL: define <4 x i16> @test() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> poison, <4 x i16> zeroinitializer, i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> [[TMP0]], <4 x i16> zeroinitializer, i64 4)
+; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i16> [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> poison, <4 x i16> zeroinitializer, i64 0)
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP4]], <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP4]], <4 x i16> zeroinitializer, i64 4)
+; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP6]], <4 x i16> zeroinitializer, i64 8)
+; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP7]], <4 x i16> zeroinitializer, i64 12)
+; CHECK-NEXT:    [[TMP9:%.*]] = add <16 x i16> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add <16 x i16> [[TMP8]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add <16 x i16> [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[TMP13:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP12]])
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i16> poison, i16 [[TMP13]], i64 0
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:    [[TMP16:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP15]])
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i16> [[TMP14]], i16 [[TMP16]], i64 1
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:    [[TMP19:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP18]])
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[TMP19]], i64 2
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    [[TMP22:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP21]])
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP22]], i64 3
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i16> [[TMP11]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[TMP25:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP24]])
+; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x i16> poison, i16 [[TMP25]], i64 0
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <16 x i16> [[TMP11]], <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:    [[TMP28:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP27]])
+; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP28]], i64 1
+; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <16 x i16> [[TMP11]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:    [[TMP31:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP30]])
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x i16> [[TMP29]], i16 [[TMP31]], i64 2
+; CHECK-NEXT:    [[TMP33:%.*]] = shufflevector <16 x i16> [[TMP11]], <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    [[TMP34:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP33]])
+; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <4 x i16> [[TMP32]], i16 [[TMP34]], i64 3
+; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[TMP37:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP36]])
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i16> poison, i16 [[TMP37]], i64 0
+; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:    [[TMP40:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP39]])
+; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x i16> [[TMP38]], i16 [[TMP40]], i64 1
+; CHECK-NEXT:    [[TMP42:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:    [[TMP43:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP42]])
+; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i16> [[TMP41]], i16 [[TMP43]], i64 2
+; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    [[TMP46:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP45]])
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP46]], i64 3
+; CHECK-NEXT:    [[OP_RDX9:%.*]] = or <4 x i16> [[TMP35]], [[TMP47]]
+; CHECK-NEXT:    [[OP_RDX10:%.*]] = or <4 x i16> [[OP_RDX9]], zeroinitializer
+; CHECK-NEXT:    [[OP_RDX11:%.*]] = or <4 x i16> [[OP_RDX10]], [[TMP23]]
+; CHECK-NEXT:    ret <4 x i16> [[OP_RDX11]]
+;
+entry:
+  %subi = add <4 x i16> zeroinitializer, zeroinitializer
+  %sub40.i = add <4 x i16> %subi, zeroinitializer
+  %sub41.i = add <4 x i16> %subi, zeroinitializer
+  %sub42.i = add <4 x i16> %subi, zeroinitializer
+  %sub43.i = add <4 x i16> %subi, zeroinitializer
+  %sub44.i = add <4 x i16> %subi, zeroinitializer
+  %sub45.i = add <4 x i16> %subi, zeroinitializer
+  %sub46.i = add <4 x i16> zeroinitializer, zeroinitializer
+  %sub47.i = add <4 x i16> zeroinitializer, zeroinitializer
+  %sub48.i = add <4 x i16> zeroinitializer, zeroinitializer
+  %sub49.i = add <4 x i16> zeroinitializer, zeroinitializer
+  %or40.i = or <4 x i16> %sub40.i, %sub41.i
+  %or41.i = or <4 x i16> %or40.i, %sub42.i
+  %or42.i = or <4 x i16> %or41.i, %sub43.i
+  %or43.i = or <4 x i16> %or42.i, %sub44.i
+  %or44.i = or <4 x i16> %or43.i, %sub45.i
+  %or45.i = or <4 x i16> %or44.i, %sub46.i
+  %or46.i = or <4 x i16> %or45.i, %sub47.i
+  %or47.i = or <4 x i16> %or46.i, %sub48.i
+  %or48.i = or <4 x i16> %or47.i, %sub49.i
+  %or50.i = or <4 x i16> %or48.i, %subi
+  %subii = add <4 x i16> zeroinitializer, zeroinitializer
+  %subi16.i = add <4 x i16> %subii, zeroinitializer
+  %subi17.i = add <4 x i16> %subii, zeroinitializer
+  %0 = or <4 x i16> %subi16.i, %subi17.i
+  %1 = or <4 x i16> %0, %or50.i
+  ret <4 x i16> %1
+}

>From 6897f7d36de0f4b15c21db04b35dc1836845a13d Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Tue, 15 Apr 2025 01:07:59 -0700
Subject: [PATCH 2/3] [SLP][REVEC] VectorValuesAndScales should be supported by
 REVEC.

We should align REVEC with the SLP algorithm as closely as possible. For
example, by applying REVEC-specific handling when calling IRBuilder's
Create methods, performing cost analysis via TTI, and expanding shuffle
masks using transformScalarShuffleIndicesToVector.

reference commit: 3b18d47ecbaba4e519ebf0d1bc134a404a56a9da
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 137 ++++++++----------
 .../SLPVectorizer/SystemZ/revec-fix-128169.ll |   8 +-
 .../revec-reduced-value-vectorized-later.ll   |  34 ++---
 3 files changed, 79 insertions(+), 100 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a0837ab214219..b8007b7d8fb4d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -22499,53 +22499,16 @@ class HorizontalReduction {
         }
 
         Type *ScalarTy = VL.front()->getType();
-        if (isa<FixedVectorType>(ScalarTy)) {
-          assert(SLPReVec && "FixedVectorType is not expected.");
-          unsigned ScalarTyNumElements = getNumElements(ScalarTy);
-          Value *ReducedSubTree = PoisonValue::get(
-              getWidenedType(ScalarTy->getScalarType(), ScalarTyNumElements));
-          for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
-            // Do reduction for each lane.
-            // e.g., do reduce add for
-            // VL[0] = <4 x Ty> <a, b, c, d>
-            // VL[1] = <4 x Ty> <e, f, g, h>
-            // Lane[0] = <2 x Ty> <a, e>
-            // Lane[1] = <2 x Ty> <b, f>
-            // Lane[2] = <2 x Ty> <c, g>
-            // Lane[3] = <2 x Ty> <d, h>
-            // result[0] = reduce add Lane[0]
-            // result[1] = reduce add Lane[1]
-            // result[2] = reduce add Lane[2]
-            // result[3] = reduce add Lane[3]
-            SmallVector<int, 16> Mask =
-                createStrideMask(I, ScalarTyNumElements, VL.size());
-            Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
-            Value *Val =
-                createSingleOp(Builder, *TTI, Lane,
-                               OptReusedScalars && SameScaleFactor
-                                   ? SameValuesCounter.front().second
-                                   : 1,
-                               Lane->getType()->getScalarType() !=
-                                       VL.front()->getType()->getScalarType()
-                                   ? V.isSignedMinBitwidthRootNode()
-                                   : true,
-                               RdxRootInst->getType());
-            ReducedSubTree =
-                Builder.CreateInsertElement(ReducedSubTree, Val, I);
-          }
-          VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
-        } else {
-          Type *VecTy = VectorizedRoot->getType();
-          Type *RedScalarTy = VecTy->getScalarType();
-          VectorValuesAndScales.emplace_back(
-              VectorizedRoot,
-              OptReusedScalars && SameScaleFactor
-                  ? SameValuesCounter.front().second
-                  : 1,
-              RedScalarTy != ScalarTy->getScalarType()
-                  ? V.isSignedMinBitwidthRootNode()
-                  : true);
-        }
+        Type *VecTy = VectorizedRoot->getType();
+        Type *RedScalarTy = VecTy->getScalarType();
+        VectorValuesAndScales.emplace_back(
+            VectorizedRoot,
+            OptReusedScalars && SameScaleFactor
+                ? SameValuesCounter.front().second
+                : 1,
+            RedScalarTy != ScalarTy->getScalarType()
+                ? V.isSignedMinBitwidthRootNode()
+                : true);
 
         // Count vectorized reduced values to exclude them from final reduction.
         for (Value *RdxVal : VL) {
@@ -22718,9 +22681,35 @@ class HorizontalReduction {
   Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                         Value *Vec, unsigned Scale, bool IsSigned,
                         Type *DestTy) {
-    Value *Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
-    if (Rdx->getType() != DestTy->getScalarType())
-      Rdx = Builder.CreateIntCast(Rdx, DestTy->getScalarType(), IsSigned);
+    Value *Rdx;
+    if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
+      unsigned DestTyNumElements = VecTy->getNumElements();
+      unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements;
+      Rdx = PoisonValue::get(
+          getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));
+      for (unsigned I : seq<unsigned>(DestTyNumElements)) {
+        // Do reduction for each lane.
+        // e.g., do reduce add for
+        // VL[0] = <4 x Ty> <a, b, c, d>
+        // VL[1] = <4 x Ty> <e, f, g, h>
+        // Lane[0] = <2 x Ty> <a, e>
+        // Lane[1] = <2 x Ty> <b, f>
+        // Lane[2] = <2 x Ty> <c, g>
+        // Lane[3] = <2 x Ty> <d, h>
+        // result[0] = reduce add Lane[0]
+        // result[1] = reduce add Lane[1]
+        // result[2] = reduce add Lane[2]
+        // result[3] = reduce add Lane[3]
+        SmallVector<int, 16> Mask = createStrideMask(I, DestTyNumElements, VF);
+        Value *Lane = Builder.CreateShuffleVector(Vec, Mask);
+        Rdx = Builder.CreateInsertElement(
+            Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
+      }
+    } else {
+      Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
+    }
+    if (Rdx->getType() != DestTy)
+      Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
     // Improved analysis for add/fadd/xor reductions with same scale
     // factor for all operands of reductions. We can emit scalar ops for
     // them instead.
@@ -22787,30 +22776,32 @@ class HorizontalReduction {
     case RecurKind::FMul: {
       unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
       if (!AllConsts) {
-        if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
-          assert(SLPReVec && "FixedVectorType is not expected.");
-          unsigned ScalarTyNumElements = VecTy->getNumElements();
-          for (unsigned I : seq<unsigned>(ReducedVals.size())) {
-            VectorCost += TTI->getShuffleCost(
-                TTI::SK_PermuteSingleSrc, VectorTy,
-                createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
-            VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy, FMF,
-                                                          CostKind);
-          }
-          VectorCost += TTI->getScalarizationOverhead(
-              VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
-              /*Extract*/ false, TTI::TCK_RecipThroughput);
-        } else if (DoesRequireReductionOp) {
-          Type *RedTy = VectorTy->getElementType();
-          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
-              std::make_pair(RedTy, true));
-          if (RType == RedTy) {
-            VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
-                                                         FMF, CostKind);
+        if (DoesRequireReductionOp) {
+          if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
+            assert(SLPReVec && "FixedVectorType is not expected.");
+            unsigned ScalarTyNumElements = VecTy->getNumElements();
+            for (unsigned I : seq<unsigned>(ReducedVals.size())) {
+              VectorCost += TTI->getShuffleCost(
+                  TTI::SK_PermuteSingleSrc, VectorTy,
+                  createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
+              VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
+                                                            FMF, CostKind);
+            }
+            VectorCost += TTI->getScalarizationOverhead(
+                VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
+                /*Extract*/ false, TTI::TCK_RecipThroughput);
           } else {
-            VectorCost = TTI->getExtendedReductionCost(
-                RdxOpcode, !IsSigned, RedTy, getWidenedType(RType, ReduxWidth),
-                FMF, CostKind);
+            Type *RedTy = VectorTy->getElementType();
+            auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
+                std::make_pair(RedTy, true));
+            if (RType == RedTy) {
+              VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
+                                                           FMF, CostKind);
+            } else {
+              VectorCost = TTI->getExtendedReductionCost(
+                  RdxOpcode, !IsSigned, RedTy,
+                  getWidenedType(RType, ReduxWidth), FMF, CostKind);
+            }
           }
         } else {
           Type *RedTy = VectorTy->getElementType();
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-128169.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-128169.ll
index b9f35451b02ae..1dd6c7b81fb73 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-128169.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-128169.ll
@@ -44,16 +44,16 @@ define void @e(<4 x i16> %0) {
 ; THRESH-NEXT:    [[TMP13:%.*]] = icmp sgt <16 x i16> [[TMP12]], [[TMP7]]
 ; THRESH-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i1> [[TMP13]], <16 x i1> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
 ; THRESH-NEXT:    [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]])
+; THRESH-NEXT:    [[TMP23:%.*]] = insertelement <4 x i1> poison, i1 [[TMP15]], i64 0
 ; THRESH-NEXT:    [[TMP16:%.*]] = shufflevector <16 x i1> [[TMP13]], <16 x i1> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
 ; THRESH-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
+; THRESH-NEXT:    [[TMP24:%.*]] = insertelement <4 x i1> [[TMP23]], i1 [[TMP17]], i64 1
 ; THRESH-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i1> [[TMP13]], <16 x i1> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
 ; THRESH-NEXT:    [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP18]])
+; THRESH-NEXT:    [[TMP22:%.*]] = insertelement <4 x i1> [[TMP24]], i1 [[TMP19]], i64 2
 ; THRESH-NEXT:    [[TMP20:%.*]] = shufflevector <16 x i1> [[TMP13]], <16 x i1> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
 ; THRESH-NEXT:    [[TMP21:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP20]])
-; THRESH-NEXT:    [[TMP22:%.*]] = insertelement <4 x i1> poison, i1 [[TMP15]], i32 0
-; THRESH-NEXT:    [[TMP23:%.*]] = insertelement <4 x i1> [[TMP22]], i1 [[TMP17]], i32 1
-; THRESH-NEXT:    [[TMP24:%.*]] = insertelement <4 x i1> [[TMP23]], i1 [[TMP19]], i32 2
-; THRESH-NEXT:    [[TMP25:%.*]] = insertelement <4 x i1> [[TMP24]], i1 [[TMP21]], i32 3
+; THRESH-NEXT:    [[TMP25:%.*]] = insertelement <4 x i1> [[TMP22]], i1 [[TMP21]], i64 3
 ; THRESH-NEXT:    [[TMP26]] = zext <4 x i1> [[TMP25]] to <4 x i32>
 ; THRESH-NEXT:    br label [[VECTOR_BODY]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll
index 656f216b0093e..3d0e6be661fd1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll
@@ -16,45 +16,33 @@ define <4 x i16> @test() {
 ; CHECK-NEXT:    [[TMP9:%.*]] = add <16 x i16> [[TMP5]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = add <16 x i16> [[TMP8]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = add <16 x i16> [[TMP3]], [[TMP8]]
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT:    [[TMP13:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP12]])
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i16> poison, i16 [[TMP13]], i64 0
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-NEXT:    [[TMP16:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP15]])
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i16> [[TMP14]], i16 [[TMP16]], i64 1
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-NEXT:    [[TMP19:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP18]])
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[TMP19]], i64 2
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT:    [[TMP22:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP21]])
-; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP22]], i64 3
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i16> [[TMP11]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
 ; CHECK-NEXT:    [[TMP25:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP24]])
 ; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x i16> poison, i16 [[TMP25]], i64 0
-; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <16 x i16> [[TMP11]], <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
 ; CHECK-NEXT:    [[TMP28:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP27]])
 ; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP28]], i64 1
-; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <16 x i16> [[TMP11]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
 ; CHECK-NEXT:    [[TMP31:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP30]])
 ; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x i16> [[TMP29]], i16 [[TMP31]], i64 2
-; CHECK-NEXT:    [[TMP33:%.*]] = shufflevector <16 x i16> [[TMP11]], <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    [[TMP33:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
 ; CHECK-NEXT:    [[TMP34:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP33]])
 ; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <4 x i16> [[TMP32]], i16 [[TMP34]], i64 3
-; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[RDX_OP:%.*]] = or <16 x i16> [[TMP11]], [[TMP9]]
+; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
 ; CHECK-NEXT:    [[TMP37:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP36]])
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i16> poison, i16 [[TMP37]], i64 0
-; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
 ; CHECK-NEXT:    [[TMP40:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP39]])
 ; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x i16> [[TMP38]], i16 [[TMP40]], i64 1
-; CHECK-NEXT:    [[TMP42:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:    [[TMP42:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
 ; CHECK-NEXT:    [[TMP43:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP42]])
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i16> [[TMP41]], i16 [[TMP43]], i64 2
-; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
 ; CHECK-NEXT:    [[TMP46:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP45]])
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP46]], i64 3
-; CHECK-NEXT:    [[OP_RDX9:%.*]] = or <4 x i16> [[TMP35]], [[TMP47]]
-; CHECK-NEXT:    [[OP_RDX10:%.*]] = or <4 x i16> [[OP_RDX9]], zeroinitializer
-; CHECK-NEXT:    [[OP_RDX11:%.*]] = or <4 x i16> [[OP_RDX10]], [[TMP23]]
+; CHECK-NEXT:    [[OP_RDX9:%.*]] = or <4 x i16> [[TMP47]], zeroinitializer
+; CHECK-NEXT:    [[OP_RDX11:%.*]] = or <4 x i16> [[OP_RDX9]], [[TMP35]]
 ; CHECK-NEXT:    ret <4 x i16> [[OP_RDX11]]
 ;
 entry:

>From 760efb5473dc81374371ed78e02edcc19c2dcc25 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Tue, 15 Apr 2025 07:25:10 -0700
Subject: [PATCH 3/3] apply comment

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b8007b7d8fb4d..43466b5bb707b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -22683,7 +22683,7 @@ class HorizontalReduction {
                         Type *DestTy) {
     Value *Rdx;
     if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
-      unsigned DestTyNumElements = VecTy->getNumElements();
+      unsigned DestTyNumElements = getNumElements(VecTy);
       unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements;
       Rdx = PoisonValue::get(
           getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));