[llvm] 993e1f6 - Revert "[SLP]Support for zext i1 %x modeling as select %x, 1, 0"
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 10 06:50:04 PST 2026
Author: Alexey Bataev
Date: 2026-02-10T06:49:53-08:00
New Revision: 993e1f66afcfe9da03bd813e669eada341b11d2f
URL: https://github.com/llvm/llvm-project/commit/993e1f66afcfe9da03bd813e669eada341b11d2f
DIFF: https://github.com/llvm/llvm-project/commit/993e1f66afcfe9da03bd813e669eada341b11d2f.diff
LOG: Revert "[SLP]Support for zext i1 %x modeling as select %x, 1, 0"
This reverts commit 70aebae2a13114f4e3d5e2460c052d8f3de295be to fix
buildbots https://lab.llvm.org/buildbot/#/builders/85/builds/18614
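For reference, the reverted change taught the SLP vectorizer to model a zext of an i1 value as an equivalent select, so that mixed select/zext chains could share one vectorized condition. A minimal LLVM IR sketch of the equivalence (value names are illustrative, not taken from the patch):

  %y = zext i1 %x to i32
  ; was modeled during vectorization as
  %y = select i1 %x, i32 1, i32 0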
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll
llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f18b6fc4dd957..f89c22fafcf04 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5761,15 +5761,12 @@ class slpvectorizer::BoUpSLP {
// moment are extracts where their second (immediate) operand is
// not added. Since immediates do not affect scheduler behavior
// this is considered okay.
- assert(
- In &&
- (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) ||
- In->getNumOperands() ==
- Bundle->getTreeEntry()->getNumOperands() ||
- (isa<ZExtInst>(In) && Bundle->getTreeEntry()->getOpcode() ==
- Instruction::Select) ||
- Bundle->getTreeEntry()->isCopyableElement(In)) &&
- "Missed TreeEntry operands?");
+ assert(In &&
+ (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) ||
+ In->getNumOperands() ==
+ Bundle->getTreeEntry()->getNumOperands() ||
+ Bundle->getTreeEntry()->isCopyableElement(In)) &&
+ "Missed TreeEntry operands?");
// Count the number of unique phi nodes, which are the parent for
// parent entry, and exit, if all the unique phis are processed.
@@ -11348,6 +11345,7 @@ class InstructionsCompatibilityAnalysis {
case Instruction::BitCast:
case Instruction::ICmp:
case Instruction::FCmp:
+ case Instruction::Select:
case Instruction::FNeg:
case Instruction::Add:
case Instruction::FAdd:
@@ -11383,30 +11381,6 @@ class InstructionsCompatibilityAnalysis {
Ops[Idx] = ConvertedOps[OpIdx];
}
return;
- case Instruction::Select:
- Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr});
- for (auto [Idx, V] : enumerate(VL)) {
- auto *I = dyn_cast<Instruction>(V);
- if (!I) {
- for (auto [OpIdx, Ops] : enumerate(Operands))
- Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType());
- continue;
- }
- if (isa<ZExtInst>(I)) {
- // Special case for select + zext i1 to avoid explosion of different
- // types. We want to keep the condition as i1 to be able to match
- // different selects together and reuse the vectorized condition
- // rather than trying to gather it.
- Operands[0][Idx] = I->getOperand(0);
- Operands[1][Idx] = ConstantInt::get(I->getType(), 1);
- Operands[2][Idx] = ConstantInt::getNullValue(I->getType());
- continue;
- }
- auto [Op, ConvertedOps] = convertTo(I, S);
- for (auto [OpIdx, Ops] : enumerate(Operands))
- Ops[Idx] = ConvertedOps[OpIdx];
- }
- return;
case Instruction::GetElementPtr: {
Operands.assign(2, {VL.size(), nullptr});
// Need to cast all indices to the same type before vectorization to
@@ -11479,22 +11453,6 @@ class InstructionsCompatibilityAnalysis {
: getSameOpcode(VL, TLI);
if (S)
return S;
- // Check if series of selects + zext i1 %x to in can be combined into
- // selects + select %x, i32 1, i32 0.
- Instruction *SelectOp = nullptr;
- if (allSameBlock(VL) && all_of(VL, [&](Value *V) {
- if (match(V, m_Select(m_Value(), m_Value(), m_Value()))) {
- if (!SelectOp)
- SelectOp = cast<Instruction>(V);
- return true;
- }
- auto *ZExt = dyn_cast<ZExtInst>(V);
- return (ZExt && ZExt->getSrcTy()->isIntegerTy(1)) ||
- isa<PoisonValue>(V);
- })) {
- if (SelectOp)
- return InstructionsState(SelectOp, SelectOp);
- }
if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
return S;
findAndSetMainInstruction(VL, R);
@@ -15523,10 +15481,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
if (isa<PoisonValue>(UniqueValues[Idx]))
return InstructionCost(TTI::TCC_Free);
- if (!isa<SelectInst>(UniqueValues[Idx]))
- return TTI->getInstructionCost(cast<Instruction>(UniqueValues[Idx]),
- CostKind);
-
auto *VI = cast<Instruction>(UniqueValues[Idx]);
CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
? CmpInst::BAD_FCMP_PREDICATE
@@ -25289,32 +25243,6 @@ class HorizontalReduction {
(I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
}
- /// Optimizes original placement of the reduced values for the reduction tree.
- /// For example, if there is a zext i1 + selects, we can merge select
- /// into zext and improve emission of the reductions.
- void optimizeReducedVals() {
- SmallDenseMap<unsigned, unsigned> UsedReductionOpIds;
- for (const auto [Idx, Vals] : enumerate(ReducedVals)) {
- if (auto *I = dyn_cast<Instruction>(Vals.front()))
- UsedReductionOpIds.try_emplace(I->getOpcode(), Idx);
- }
- // Check if zext i1 can be merged with select.
- auto ZExtIt = UsedReductionOpIds.find(Instruction::ZExt);
- auto SelectIt = UsedReductionOpIds.find(Instruction::Select);
- if (ZExtIt != UsedReductionOpIds.end() &&
- SelectIt != UsedReductionOpIds.end()) {
- unsigned ZExtIdx = ZExtIt->second;
- unsigned SelectIdx = SelectIt->second;
- auto *ZExt = cast<ZExtInst>(ReducedVals[ZExtIdx].front());
- // ZExt is compatible with Select? Merge select to zext, if so.
- if (ZExt->getSrcTy()->isIntegerTy(1) &&
- ZExt->getType() == ReducedVals[SelectIdx].front()->getType()) {
- ReducedVals[ZExtIdx].append(ReducedVals[SelectIdx]);
- ReducedVals.erase(std::next(ReducedVals.begin(), SelectIdx));
- }
- }
- }
-
public:
HorizontalReduction() = default;
HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
@@ -25490,9 +25418,6 @@ class HorizontalReduction {
ReducedVals.back().append(Data.rbegin(), Data.rend());
}
}
- // Post optimize reduced values to get better reduction sequences and sort
- // them by size.
- optimizeReducedVals();
// Sort the reduced values by number of same/alternate opcode and/or pointer
// operand.
stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll
index c1928fbf6ba67..e1ee35217d187 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll
@@ -17,32 +17,110 @@
define i64 @bitmask_16xi8(ptr nocapture noundef readonly %src) {
; SSE-LABEL: @bitmask_16xi8(
; SSE-NEXT: entry:
-; SSE-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1
-; SSE-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
-; SSE-NEXT: [[TMP2:%.*]] = icmp eq <16 x i8> [[TMP0]], zeroinitializer
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i1> [[TMP1]], <16 x i1> [[TMP2]], <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; SSE-NEXT: [[TMP4:%.*]] = select <16 x i1> [[TMP3]], <16 x i64> <i64 1, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, <16 x i64> <i64 0, i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256, i64 512, i64 1024, i64 2048, i64 4096, i64 8192, i64 16384, i64 32768>
-; SSE-NEXT: [[OP_RDX7:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP4]])
+; SSE-NEXT: [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1
+; SSE-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0
+; SSE-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
+; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
+; SSE-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1
+; SSE-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer
+; SSE-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
+; SSE-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1
+; SSE-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer
+; SSE-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> <i64 512, i64 1024, i64 2048, i64 4096>
+; SSE-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13
+; SSE-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1
+; SSE-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0
+; SSE-NEXT: [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192
+; SSE-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14
+; SSE-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1
+; SSE-NEXT: [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0
+; SSE-NEXT: [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384
+; SSE-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15
+; SSE-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1
+; SSE-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0
+; SSE-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768
+; SSE-NEXT: [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT: [[RDX_OP:%.*]] = or <4 x i64> [[TMP10]], [[TMP6]]
+; SSE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[RDX_OP]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT: [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> [[TMP12]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP11]])
+; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP16]], [[OR_13]]
+; SSE-NEXT: [[OP_RDX5:%.*]] = or i64 [[OR_14]], [[OR_15]]
+; SSE-NEXT: [[OP_RDX6:%.*]] = or i64 [[OP_RDX]], [[OP_RDX5]]
+; SSE-NEXT: [[OP_RDX7:%.*]] = or i64 [[OP_RDX6]], [[OR]]
; SSE-NEXT: ret i64 [[OP_RDX7]]
;
; AVX-LABEL: @bitmask_16xi8(
; AVX-NEXT: entry:
-; AVX-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1
-; AVX-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
-; AVX-NEXT: [[TMP2:%.*]] = icmp eq <16 x i8> [[TMP0]], zeroinitializer
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <16 x i1> [[TMP1]], <16 x i1> [[TMP2]], <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX-NEXT: [[TMP4:%.*]] = select <16 x i1> [[TMP3]], <16 x i64> <i64 1, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, <16 x i64> <i64 0, i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256, i64 512, i64 1024, i64 2048, i64 4096, i64 8192, i64 16384, i64 32768>
-; AVX-NEXT: [[OP_RDX4:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP4]])
+; AVX-NEXT: [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1
+; AVX-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0
+; AVX-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
+; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1
+; AVX-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer
+; AVX-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
+; AVX-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9
+; AVX-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1
+; AVX-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer
+; AVX-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> <i64 512, i64 1024, i64 2048, i64 4096>
+; AVX-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13
+; AVX-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1
+; AVX-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0
+; AVX-NEXT: [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192
+; AVX-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14
+; AVX-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1
+; AVX-NEXT: [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0
+; AVX-NEXT: [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384
+; AVX-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15
+; AVX-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1
+; AVX-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0
+; AVX-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768
+; AVX-NEXT: [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX-NEXT: [[RDX_OP:%.*]] = or <4 x i64> [[TMP10]], [[TMP6]]
+; AVX-NEXT: [[TMP13:%.*]] = shufflevector <4 x i64> [[RDX_OP]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> [[TMP13]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP11]])
+; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP12]], [[OR_13]]
+; AVX-NEXT: [[OP_RDX2:%.*]] = or i64 [[OR_14]], [[OR_15]]
+; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX]], [[OP_RDX2]]
+; AVX-NEXT: [[OP_RDX4:%.*]] = or i64 [[OP_RDX3]], [[OR]]
; AVX-NEXT: ret i64 [[OP_RDX4]]
;
; AVX512-LABEL: @bitmask_16xi8(
; AVX512-NEXT: entry:
-; AVX512-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1
-; AVX512-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
-; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <16 x i8> [[TMP0]], zeroinitializer
-; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <16 x i1> [[TMP1]], <16 x i1> [[TMP2]], <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: [[TMP4:%.*]] = select <16 x i1> [[TMP3]], <16 x i64> <i64 1, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, <16 x i64> <i64 0, i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256, i64 512, i64 1024, i64 2048, i64 4096, i64 8192, i64 16384, i64 32768>
-; AVX512-NEXT: [[OP_RDX4:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP4]])
+; AVX512-NEXT: [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1
+; AVX512-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0
+; AVX512-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
+; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1
+; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer
+; AVX512-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
+; AVX512-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9
+; AVX512-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1
+; AVX512-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer
+; AVX512-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> <i64 512, i64 1024, i64 2048, i64 4096>
+; AVX512-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13
+; AVX512-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1
+; AVX512-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0
+; AVX512-NEXT: [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192
+; AVX512-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14
+; AVX512-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1
+; AVX512-NEXT: [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0
+; AVX512-NEXT: [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384
+; AVX512-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15
+; AVX512-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1
+; AVX512-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0
+; AVX512-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768
+; AVX512-NEXT: [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT: [[RDX_OP:%.*]] = or <4 x i64> [[TMP10]], [[TMP6]]
+; AVX512-NEXT: [[TMP13:%.*]] = shufflevector <4 x i64> [[RDX_OP]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT: [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> [[TMP13]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP11]])
+; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP12]], [[OR_13]]
+; AVX512-NEXT: [[OP_RDX2:%.*]] = or i64 [[OR_14]], [[OR_15]]
+; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX]], [[OP_RDX2]]
+; AVX512-NEXT: [[OP_RDX4:%.*]] = or i64 [[OP_RDX3]], [[OR]]
; AVX512-NEXT: ret i64 [[OP_RDX4]]
;
entry:
@@ -130,32 +208,84 @@ entry:
define i64 @bitmask_4xi16(ptr nocapture noundef readonly %src) {
; SSE-LABEL: @bitmask_4xi16(
; SSE-NEXT: entry:
-; SSE-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[SRC:%.*]], align 2
-; SSE-NEXT: [[TMP1:%.*]] = icmp ne <8 x i16> [[TMP0]], zeroinitializer
-; SSE-NEXT: [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP0]], zeroinitializer
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> <i64 1, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, <8 x i64> <i64 0, i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128>
-; SSE-NEXT: [[OP_RDX3:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP4]])
+; SSE-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC:%.*]], align 2
+; SSE-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i16 [[TMP0]], 0
+; SSE-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
+; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 1
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_1]], align 2
+; SSE-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer
+; SSE-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
+; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5
+; SSE-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2
+; SSE-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i16 [[TMP4]], 0
+; SSE-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32
+; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 6
+; SSE-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2
+; SSE-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i16 [[TMP5]], 0
+; SSE-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64
+; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 7
+; SSE-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2
+; SSE-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP6]], 0
+; SSE-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
+; SSE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
+; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]]
+; SSE-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_6]], [[OR_7]]
+; SSE-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
+; SSE-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]]
; SSE-NEXT: ret i64 [[OP_RDX3]]
;
; AVX-LABEL: @bitmask_4xi16(
; AVX-NEXT: entry:
-; AVX-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[SRC:%.*]], align 2
-; AVX-NEXT: [[TMP1:%.*]] = icmp ne <8 x i16> [[TMP0]], zeroinitializer
-; AVX-NEXT: [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP0]], zeroinitializer
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> <i64 1, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, <8 x i64> <i64 0, i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128>
-; AVX-NEXT: [[OP_RDX3:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP4]])
+; AVX-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC:%.*]], align 2
+; AVX-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i16 [[TMP0]], 0
+; AVX-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
+; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 1
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_1]], align 2
+; AVX-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer
+; AVX-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
+; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5
+; AVX-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2
+; AVX-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i16 [[TMP4]], 0
+; AVX-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32
+; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 6
+; AVX-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2
+; AVX-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i16 [[TMP5]], 0
+; AVX-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64
+; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 7
+; AVX-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2
+; AVX-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP6]], 0
+; AVX-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
+; AVX-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
+; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]]
+; AVX-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_6]], [[OR_7]]
+; AVX-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
+; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]]
; AVX-NEXT: ret i64 [[OP_RDX3]]
;
; AVX512-LABEL: @bitmask_4xi16(
; AVX512-NEXT: entry:
-; AVX512-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[SRC:%.*]], align 2
-; AVX512-NEXT: [[TMP1:%.*]] = icmp ne <8 x i16> [[TMP0]], zeroinitializer
-; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP0]], zeroinitializer
-; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> <i64 1, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, <8 x i64> <i64 0, i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128>
-; AVX512-NEXT: [[OP_RDX3:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP4]])
+; AVX512-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC:%.*]], align 2
+; AVX512-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i16 [[TMP0]], 0
+; AVX512-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
+; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 1
+; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_1]], align 2
+; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer
+; AVX512-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
+; AVX512-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5
+; AVX512-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2
+; AVX512-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i16 [[TMP4]], 0
+; AVX512-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32
+; AVX512-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 6
+; AVX512-NEXT: [[TMP5:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_6]], align 2
+; AVX512-NEXT: [[TMP6:%.*]] = icmp eq <2 x i16> [[TMP5]], zeroinitializer
+; AVX512-NEXT: [[TMP7:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> zeroinitializer, <2 x i64> <i64 64, i64 128>
+; AVX512-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
+; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP8]], [[OR_5]]
+; AVX512-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0
+; AVX512-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1
+; AVX512-NEXT: [[OP_RDX1:%.*]] = or i64 [[TMP9]], [[TMP10]]
+; AVX512-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
+; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]]
; AVX512-NEXT: ret i64 [[OP_RDX3]]
;
entry:
@@ -203,32 +333,84 @@ entry:
define i64 @bitmask_8xi32(ptr nocapture noundef readonly %src) {
; SSE-LABEL: @bitmask_8xi32(
; SSE-NEXT: entry:
-; SSE-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[SRC:%.*]], align 4
-; SSE-NEXT: [[TMP1:%.*]] = icmp ne <8 x i32> [[TMP0]], zeroinitializer
-; SSE-NEXT: [[TMP2:%.*]] = icmp eq <8 x i32> [[TMP0]], zeroinitializer
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> <i64 1, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, <8 x i64> <i64 0, i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128>
-; SSE-NEXT: [[OP_RDX3:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP4]])
+; SSE-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
+; SSE-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i32 [[TMP0]], 0
+; SSE-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
+; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 1
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_1]], align 4
+; SSE-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
+; SSE-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
+; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5
+; SSE-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_5]], align 4
+; SSE-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i32 [[TMP4]], 0
+; SSE-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32
+; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 6
+; SSE-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_6]], align 4
+; SSE-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i32 [[TMP5]], 0
+; SSE-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64
+; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 7
+; SSE-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4
+; SSE-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP6]], 0
+; SSE-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
+; SSE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
+; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]]
+; SSE-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_6]], [[OR_7]]
+; SSE-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
+; SSE-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]]
; SSE-NEXT: ret i64 [[OP_RDX3]]
;
; AVX-LABEL: @bitmask_8xi32(
; AVX-NEXT: entry:
-; AVX-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[SRC:%.*]], align 4
-; AVX-NEXT: [[TMP1:%.*]] = icmp ne <8 x i32> [[TMP0]], zeroinitializer
-; AVX-NEXT: [[TMP2:%.*]] = icmp eq <8 x i32> [[TMP0]], zeroinitializer
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> <i64 1, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, <8 x i64> <i64 0, i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128>
-; AVX-NEXT: [[OP_RDX3:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP4]])
+; AVX-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
+; AVX-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i32 [[TMP0]], 0
+; AVX-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
+; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 1
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_1]], align 4
+; AVX-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
+; AVX-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
+; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5
+; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_5]], align 4
+; AVX-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i32 [[TMP4]], 0
+; AVX-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32
+; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 6
+; AVX-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_6]], align 4
+; AVX-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i32 [[TMP5]], 0
+; AVX-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64
+; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 7
+; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4
+; AVX-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP6]], 0
+; AVX-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
+; AVX-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
+; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]]
+; AVX-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_6]], [[OR_7]]
+; AVX-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
+; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]]
; AVX-NEXT: ret i64 [[OP_RDX3]]
;
; AVX512-LABEL: @bitmask_8xi32(
; AVX512-NEXT: entry:
-; AVX512-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[SRC:%.*]], align 4
-; AVX512-NEXT: [[TMP1:%.*]] = icmp ne <8 x i32> [[TMP0]], zeroinitializer
-; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <8 x i32> [[TMP0]], zeroinitializer
-; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> <i64 1, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, <8 x i64> <i64 0, i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128>
-; AVX512-NEXT: [[OP_RDX3:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP4]])
+; AVX512-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
+; AVX512-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i32 [[TMP0]], 0
+; AVX512-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
+; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 1
+; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_1]], align 4
+; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
+; AVX512-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
+; AVX512-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5
+; AVX512-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_5]], align 4
+; AVX512-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i32 [[TMP4]], 0
+; AVX512-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32
+; AVX512-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 6
+; AVX512-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX_6]], align 4
+; AVX512-NEXT: [[TMP6:%.*]] = icmp eq <2 x i32> [[TMP5]], zeroinitializer
+; AVX512-NEXT: [[TMP7:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> zeroinitializer, <2 x i64> <i64 64, i64 128>
+; AVX512-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
+; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP8]], [[OR_5]]
+; AVX512-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0
+; AVX512-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1
+; AVX512-NEXT: [[OP_RDX1:%.*]] = or i64 [[TMP9]], [[TMP10]]
+; AVX512-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
+; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]]
; AVX512-NEXT: ret i64 [[OP_RDX3]]
;
entry:
@@ -318,32 +500,84 @@ define i64 @bitmask_8xi64(ptr nocapture noundef readonly %src) {
;
; SSE4-LABEL: @bitmask_8xi64(
; SSE4-NEXT: entry:
-; SSE4-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr [[SRC:%.*]], align 8
-; SSE4-NEXT: [[TMP1:%.*]] = icmp ne <8 x i64> [[TMP0]], zeroinitializer
-; SSE4-NEXT: [[TMP2:%.*]] = icmp eq <8 x i64> [[TMP0]], zeroinitializer
-; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE4-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> <i64 1, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, <8 x i64> <i64 0, i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128>
-; SSE4-NEXT: [[OP_RDX3:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP4]])
+; SSE4-NEXT: [[TMP0:%.*]] = load i64, ptr [[SRC:%.*]], align 8
+; SSE4-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i64 [[TMP0]], 0
+; SSE4-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
+; SSE4-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 1
+; SSE4-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr [[ARRAYIDX_1]], align 8
+; SSE4-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer
+; SSE4-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
+; SSE4-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 5
+; SSE4-NEXT: [[TMP4:%.*]] = load i64, ptr [[ARRAYIDX_5]], align 8
+; SSE4-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i64 [[TMP4]], 0
+; SSE4-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32
+; SSE4-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 6
+; SSE4-NEXT: [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX_6]], align 8
+; SSE4-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i64 [[TMP5]], 0
+; SSE4-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64
+; SSE4-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 7
+; SSE4-NEXT: [[TMP6:%.*]] = load i64, ptr [[ARRAYIDX_7]], align 8
+; SSE4-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i64 [[TMP6]], 0
+; SSE4-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
+; SSE4-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
+; SSE4-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]]
+; SSE4-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_6]], [[OR_7]]
+; SSE4-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
+; SSE4-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]]
; SSE4-NEXT: ret i64 [[OP_RDX3]]
;
; AVX-LABEL: @bitmask_8xi64(
; AVX-NEXT: entry:
-; AVX-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr [[SRC:%.*]], align 8
-; AVX-NEXT: [[TMP1:%.*]] = icmp ne <8 x i64> [[TMP0]], zeroinitializer
-; AVX-NEXT: [[TMP2:%.*]] = icmp eq <8 x i64> [[TMP0]], zeroinitializer
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> <i64 1, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, <8 x i64> <i64 0, i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128>
-; AVX-NEXT: [[OP_RDX3:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP4]])
+; AVX-NEXT: [[TMP0:%.*]] = load i64, ptr [[SRC:%.*]], align 8
+; AVX-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i64 [[TMP0]], 0
+; AVX-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
+; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 1
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr [[ARRAYIDX_1]], align 8
+; AVX-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer
+; AVX-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
+; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 5
+; AVX-NEXT: [[TMP4:%.*]] = load i64, ptr [[ARRAYIDX_5]], align 8
+; AVX-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i64 [[TMP4]], 0
+; AVX-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32
+; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 6
+; AVX-NEXT: [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX_6]], align 8
+; AVX-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i64 [[TMP5]], 0
+; AVX-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64
+; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 7
+; AVX-NEXT: [[TMP6:%.*]] = load i64, ptr [[ARRAYIDX_7]], align 8
+; AVX-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i64 [[TMP6]], 0
+; AVX-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
+; AVX-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
+; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]]
+; AVX-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_6]], [[OR_7]]
+; AVX-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
+; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]]
; AVX-NEXT: ret i64 [[OP_RDX3]]
;
; AVX512-LABEL: @bitmask_8xi64(
; AVX512-NEXT: entry:
-; AVX512-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr [[SRC:%.*]], align 8
-; AVX512-NEXT: [[TMP1:%.*]] = icmp ne <8 x i64> [[TMP0]], zeroinitializer
-; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <8 x i64> [[TMP0]], zeroinitializer
-; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> <i64 1, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, <8 x i64> <i64 0, i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128>
-; AVX512-NEXT: [[OP_RDX3:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP4]])
+; AVX512-NEXT: [[TMP0:%.*]] = load i64, ptr [[SRC:%.*]], align 8
+; AVX512-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i64 [[TMP0]], 0
+; AVX512-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
+; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 1
+; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr [[ARRAYIDX_1]], align 8
+; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer
+; AVX512-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
+; AVX512-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 5
+; AVX512-NEXT: [[TMP4:%.*]] = load i64, ptr [[ARRAYIDX_5]], align 8
+; AVX512-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i64 [[TMP4]], 0
+; AVX512-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32
+; AVX512-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 6
+; AVX512-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_6]], align 8
+; AVX512-NEXT: [[TMP6:%.*]] = icmp eq <2 x i64> [[TMP5]], zeroinitializer
+; AVX512-NEXT: [[TMP7:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> zeroinitializer, <2 x i64> <i64 64, i64 128>
+; AVX512-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
+; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP8]], [[OR_5]]
+; AVX512-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0
+; AVX512-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1
+; AVX512-NEXT: [[OP_RDX1:%.*]] = or i64 [[TMP9]], [[TMP10]]
+; AVX512-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
+; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]]
; AVX512-NEXT: ret i64 [[OP_RDX3]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll b/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll
index 428f54f153887..252746b465bc6 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll
@@ -9,10 +9,16 @@ define i1 @test(i64 %v1, ptr %v2, i32 %v3, i1 %v4) {
; CHECK-NEXT: [[TT3:%.*]] = lshr i64 [[V1]], 32
; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TT3]] to i32
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i32> [[TMP4]], <i32 1, i32 255>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <2 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[TT2:%.*]] = and i32 [[TMP1]], 255
+; CHECK-NEXT: [[TT1:%.*]] = and i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TT1]] to i8
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i8> poison, i8 [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP33:%.*]] = trunc i32 [[TT2]] to i8
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i8> [[TMP7]], i8 [[TMP33]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <2 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x i32> poison, i32 [[TT1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP34]], i32 [[TT2]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[V3]], i32 0
; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>
; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>