[llvm] [SLP]Remove LoadCombine workaround after handling of the copyables (PR #174205)

via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 2 04:45:50 PST 2026


llvmbot wrote:



@llvm/pr-subscribers-vectorizers

Author: Alexey Bataev (alexey-bataev)

Changes:

LoadCombine pattern handling was added as a workaround for cases where the SLP vectorizer could not vectorize the code effectively. With copyables support, the vectorizer can now handle these patterns directly.
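
For reference, a minimal sketch of the pattern in question, assuming a little-endian i32 assembled from four i8 loads; the function name @combine_bytes is hypothetical, but the shape mirrors the loadcombine.ll tests updated below:

```llvm
; Scalar load-combine pattern: 4 byte loads zexted, shifted, and or'ed together.
define i32 @combine_bytes(ptr %p) {
  %p1 = getelementptr i8, ptr %p, i64 1
  %p2 = getelementptr i8, ptr %p, i64 2
  %p3 = getelementptr i8, ptr %p, i64 3
  %l0 = load i8, ptr %p, align 1
  %l1 = load i8, ptr %p1, align 1
  %l2 = load i8, ptr %p2, align 1
  %l3 = load i8, ptr %p3, align 1
  %e0 = zext i8 %l0 to i32
  %e1 = zext i8 %l1 to i32
  %e2 = zext i8 %l2 to i32
  %e3 = zext i8 %l3 to i32
  %s1 = shl i32 %e1, 8
  %s2 = shl i32 %e2, 16
  %s3 = shl i32 %e3, 24
  %o1 = or i32 %e0, %s1
  %o2 = or i32 %o1, %s2
  %o3 = or i32 %o2, %s3
  ret i32 %o3
}

; With copyables support, SLP now produces a form along the lines of the
; updated CHECK lines in the diff below:
;   %v  = load <4 x i8>, ptr %p, align 1
;   %z  = zext <4 x i8> %v to <4 x i32>
;   %sh = shl nuw <4 x i32> %z, <i32 0, i32 8, i32 16, i32 24>
;   %r  = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %sh)
```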


---

Patch is 45.82 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/174205.diff


5 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (-87) 
- (modified) llvm/test/Transforms/PhaseOrdering/X86/loadcombine.ll (+48-204) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll (+32-228) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll (+4-17) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll (+4-17) 


``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 98b64f52457d5..3900acbc8f223 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2254,23 +2254,6 @@ class slpvectorizer::BoUpSLP {
   /// effectively than the base graph.
   bool isTreeNotExtendable() const;
 
-  /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
-  /// can be load combined in the backend. Load combining may not be allowed in
-  /// the IR optimizer, so we do not want to alter the pattern. For example,
-  /// partially transforming a scalar bswap() pattern into vector code is
-  /// effectively impossible for the backend to undo.
-  /// TODO: If load combining is allowed in the IR optimizer, this analysis
-  ///       may not be necessary.
-  bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
-
-  /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
-  /// can be load combined in the backend. Load combining may not be allowed in
-  /// the IR optimizer, so we do not want to alter the pattern. For example,
-  /// partially transforming a scalar bswap() pattern into vector code is
-  /// effectively impossible for the backend to undo.
-  /// TODO: If load combining is allowed in the IR optimizer, this analysis
-  ///       may not be necessary.
-  bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
   bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
                      Align Alignment, const int64_t Diff,
                      const size_t Sz) const;
@@ -15608,69 +15591,6 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
   return true;
 }
 
-static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
-                                       TargetTransformInfo *TTI,
-                                       bool MustMatchOrInst) {
-  // Look past the root to find a source value. Arbitrarily follow the
-  // path through operand 0 of any 'or'. Also, peek through optional
-  // shift-left-by-multiple-of-8-bits.
-  Value *ZextLoad = Root;
-  const APInt *ShAmtC;
-  bool FoundOr = false;
-  while (!isa<ConstantExpr>(ZextLoad) &&
-         (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
-          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
-           ShAmtC->urem(8) == 0))) {
-    auto *BinOp = cast<BinaryOperator>(ZextLoad);
-    ZextLoad = BinOp->getOperand(0);
-    if (BinOp->getOpcode() == Instruction::Or)
-      FoundOr = true;
-  }
-  // Check if the input is an extended load of the required or/shift expression.
-  Value *Load;
-  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
-      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
-    return false;
-
-  // Require that the total load bit width is a legal integer type.
-  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
-  // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
-  Type *SrcTy = Load->getType();
-  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
-  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
-    return false;
-
-  // Everything matched - assume that we can fold the whole sequence using
-  // load combining.
-  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
-             << *(cast<Instruction>(Root)) << "\n");
-
-  return true;
-}
-
-bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
-  if (RdxKind != RecurKind::Or)
-    return false;
-
-  unsigned NumElts = VectorizableTree[0]->Scalars.size();
-  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
-  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
-                                    /* MatchOr */ false);
-}
-
-bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
-  // Peek through a final sequence of stores and check if all operations are
-  // likely to be load-combined.
-  unsigned NumElts = Stores.size();
-  for (Value *Scalar : Stores) {
-    Value *X;
-    if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
-        !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
-      return false;
-  }
-  return true;
-}
-
 bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
   if (!DebugCounter::shouldExecute(VectorizedGraphs))
     return true;
@@ -23497,8 +23417,6 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
       return false;
     }
   }
-  if (R.isLoadCombineCandidate(Chain))
-    return true;
   R.buildTree(Chain);
   // Check if tree tiny and store itself or its value is not vectorized.
   if (R.isTreeTinyAndNotFullyVectorizable()) {
@@ -25112,11 +25030,6 @@ class HorizontalReduction {
             V.analyzedReductionVals(VL);
           continue;
         }
-        if (V.isLoadCombineReductionCandidate(RdxKind)) {
-          if (!AdjustReducedVals())
-            V.analyzedReductionVals(VL);
-          continue;
-        }
         V.reorderTopToBottom();
         // No need to reorder the root node at all for reassociative reduction.
         V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/loadcombine.ll b/llvm/test/Transforms/PhaseOrdering/X86/loadcombine.ll
index fe49ba9d61d98..d44ae86484316 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/loadcombine.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/loadcombine.ll
@@ -70,23 +70,10 @@ define i32 @loadCombine_4consecutive_1243(ptr %p) {
 
 define i32 @loadCombine_4consecutive_1324(ptr %p) {
 ; CHECK-LABEL: @loadCombine_4consecutive_1324(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i64 3
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
-; CHECK-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
-; CHECK-NEXT:    [[E1:%.*]] = zext i8 [[L1]] to i32
-; CHECK-NEXT:    [[E2:%.*]] = zext i8 [[L2]] to i32
-; CHECK-NEXT:    [[E3:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT:    [[E4:%.*]] = zext i8 [[L4]] to i32
-; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i32 [[E2]], 8
-; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i32 [[E3]], 16
-; CHECK-NEXT:    [[S4:%.*]] = shl nuw i32 [[E4]], 24
-; CHECK-NEXT:    [[O1:%.*]] = or disjoint i32 [[S2]], [[E1]]
-; CHECK-NEXT:    [[O2:%.*]] = or disjoint i32 [[O1]], [[S3]]
-; CHECK-NEXT:    [[O3:%.*]] = or disjoint i32 [[O2]], [[S4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[O3:%.*]] = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    ret i32 [[O3]]
 ;
   %p1 = getelementptr i8, ptr %p, i32 1
@@ -114,23 +101,10 @@ define i32 @loadCombine_4consecutive_1324(ptr %p) {
 
 define i32 @loadCombine_4consecutive_1342(ptr %p) {
 ; CHECK-LABEL: @loadCombine_4consecutive_1342(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i64 3
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
-; CHECK-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
-; CHECK-NEXT:    [[E1:%.*]] = zext i8 [[L1]] to i32
-; CHECK-NEXT:    [[E2:%.*]] = zext i8 [[L2]] to i32
-; CHECK-NEXT:    [[E3:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT:    [[E4:%.*]] = zext i8 [[L4]] to i32
-; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i32 [[E2]], 8
-; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i32 [[E3]], 16
-; CHECK-NEXT:    [[S4:%.*]] = shl nuw i32 [[E4]], 24
-; CHECK-NEXT:    [[O1:%.*]] = or disjoint i32 [[S2]], [[E1]]
-; CHECK-NEXT:    [[O2:%.*]] = or disjoint i32 [[O1]], [[S3]]
-; CHECK-NEXT:    [[O3:%.*]] = or disjoint i32 [[O2]], [[S4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[O3:%.*]] = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    ret i32 [[O3]]
 ;
   %p1 = getelementptr i8, ptr %p, i32 1
@@ -158,23 +132,10 @@ define i32 @loadCombine_4consecutive_1342(ptr %p) {
 
 define i32 @loadCombine_4consecutive_1423(ptr %p) {
 ; CHECK-LABEL: @loadCombine_4consecutive_1423(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i64 3
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
-; CHECK-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
-; CHECK-NEXT:    [[E1:%.*]] = zext i8 [[L1]] to i32
-; CHECK-NEXT:    [[E2:%.*]] = zext i8 [[L2]] to i32
-; CHECK-NEXT:    [[E3:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT:    [[E4:%.*]] = zext i8 [[L4]] to i32
-; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i32 [[E2]], 8
-; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i32 [[E3]], 16
-; CHECK-NEXT:    [[S4:%.*]] = shl nuw i32 [[E4]], 24
-; CHECK-NEXT:    [[O1:%.*]] = or disjoint i32 [[S2]], [[E1]]
-; CHECK-NEXT:    [[O2:%.*]] = or disjoint i32 [[O1]], [[S3]]
-; CHECK-NEXT:    [[O3:%.*]] = or disjoint i32 [[O2]], [[S4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[O3:%.*]] = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    ret i32 [[O3]]
 ;
   %p1 = getelementptr i8, ptr %p, i32 1
@@ -202,23 +163,10 @@ define i32 @loadCombine_4consecutive_1423(ptr %p) {
 
 define i32 @loadCombine_4consecutive_1432(ptr %p) {
 ; CHECK-LABEL: @loadCombine_4consecutive_1432(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i64 3
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
-; CHECK-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
-; CHECK-NEXT:    [[E1:%.*]] = zext i8 [[L1]] to i32
-; CHECK-NEXT:    [[E2:%.*]] = zext i8 [[L2]] to i32
-; CHECK-NEXT:    [[E3:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT:    [[E4:%.*]] = zext i8 [[L4]] to i32
-; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i32 [[E2]], 8
-; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i32 [[E3]], 16
-; CHECK-NEXT:    [[S4:%.*]] = shl nuw i32 [[E4]], 24
-; CHECK-NEXT:    [[O1:%.*]] = or disjoint i32 [[S2]], [[E1]]
-; CHECK-NEXT:    [[O2:%.*]] = or disjoint i32 [[O1]], [[S3]]
-; CHECK-NEXT:    [[O3:%.*]] = or disjoint i32 [[O2]], [[S4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[O3:%.*]] = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    ret i32 [[O3]]
 ;
   %p1 = getelementptr i8, ptr %p, i32 1
@@ -369,23 +317,10 @@ define i32 @loadCombine_4consecutive_2341(ptr %p) {
 
 define i32 @loadCombine_4consecutive_2413(ptr %p) {
 ; CHECK-LABEL: @loadCombine_4consecutive_2413(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i64 3
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
-; CHECK-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
-; CHECK-NEXT:    [[E1:%.*]] = zext i8 [[L1]] to i32
-; CHECK-NEXT:    [[E2:%.*]] = zext i8 [[L2]] to i32
-; CHECK-NEXT:    [[E3:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT:    [[E4:%.*]] = zext i8 [[L4]] to i32
-; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i32 [[E2]], 8
-; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i32 [[E3]], 16
-; CHECK-NEXT:    [[S4:%.*]] = shl nuw i32 [[E4]], 24
-; CHECK-NEXT:    [[O1:%.*]] = or disjoint i32 [[S2]], [[E1]]
-; CHECK-NEXT:    [[O2:%.*]] = or disjoint i32 [[O1]], [[S3]]
-; CHECK-NEXT:    [[O3:%.*]] = or disjoint i32 [[O2]], [[S4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[O3:%.*]] = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    ret i32 [[O3]]
 ;
   %p1 = getelementptr i8, ptr %p, i32 1
@@ -413,23 +348,10 @@ define i32 @loadCombine_4consecutive_2413(ptr %p) {
 
 define i32 @loadCombine_4consecutive_2431(ptr %p) {
 ; CHECK-LABEL: @loadCombine_4consecutive_2431(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i64 3
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
-; CHECK-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
-; CHECK-NEXT:    [[E1:%.*]] = zext i8 [[L1]] to i32
-; CHECK-NEXT:    [[E2:%.*]] = zext i8 [[L2]] to i32
-; CHECK-NEXT:    [[E3:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT:    [[E4:%.*]] = zext i8 [[L4]] to i32
-; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i32 [[E2]], 8
-; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i32 [[E3]], 16
-; CHECK-NEXT:    [[S4:%.*]] = shl nuw i32 [[E4]], 24
-; CHECK-NEXT:    [[O1:%.*]] = or disjoint i32 [[S2]], [[E1]]
-; CHECK-NEXT:    [[O2:%.*]] = or disjoint i32 [[O1]], [[S3]]
-; CHECK-NEXT:    [[O3:%.*]] = or disjoint i32 [[O2]], [[S4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[O3:%.*]] = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    ret i32 [[O3]]
 ;
   %p1 = getelementptr i8, ptr %p, i32 1
@@ -457,23 +379,10 @@ define i32 @loadCombine_4consecutive_2431(ptr %p) {
 
 define i32 @loadCombine_4consecutive_3124(ptr %p) {
 ; CHECK-LABEL: @loadCombine_4consecutive_3124(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i64 3
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
-; CHECK-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
-; CHECK-NEXT:    [[E1:%.*]] = zext i8 [[L1]] to i32
-; CHECK-NEXT:    [[E2:%.*]] = zext i8 [[L2]] to i32
-; CHECK-NEXT:    [[E3:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT:    [[E4:%.*]] = zext i8 [[L4]] to i32
-; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i32 [[E2]], 8
-; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i32 [[E3]], 16
-; CHECK-NEXT:    [[S4:%.*]] = shl nuw i32 [[E4]], 24
-; CHECK-NEXT:    [[O1:%.*]] = or disjoint i32 [[S2]], [[E1]]
-; CHECK-NEXT:    [[O2:%.*]] = or disjoint i32 [[O1]], [[S3]]
-; CHECK-NEXT:    [[O3:%.*]] = or disjoint i32 [[O2]], [[S4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[O3:%.*]] = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    ret i32 [[O3]]
 ;
   %p1 = getelementptr i8, ptr %p, i32 1
@@ -501,23 +410,10 @@ define i32 @loadCombine_4consecutive_3124(ptr %p) {
 
 define i32 @loadCombine_4consecutive_3142(ptr %p) {
 ; CHECK-LABEL: @loadCombine_4consecutive_3142(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i64 3
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
-; CHECK-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
-; CHECK-NEXT:    [[E1:%.*]] = zext i8 [[L1]] to i32
-; CHECK-NEXT:    [[E2:%.*]] = zext i8 [[L2]] to i32
-; CHECK-NEXT:    [[E3:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT:    [[E4:%.*]] = zext i8 [[L4]] to i32
-; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i32 [[E2]], 8
-; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i32 [[E3]], 16
-; CHECK-NEXT:    [[S4:%.*]] = shl nuw i32 [[E4]], 24
-; CHECK-NEXT:    [[O1:%.*]] = or disjoint i32 [[S2]], [[E1]]
-; CHECK-NEXT:    [[O2:%.*]] = or disjoint i32 [[O1]], [[S3]]
-; CHECK-NEXT:    [[O3:%.*]] = or disjoint i32 [[O2]], [[S4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[O3:%.*]] = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    ret i32 [[O3]]
 ;
   %p1 = getelementptr i8, ptr %p, i32 1
@@ -668,23 +564,10 @@ define i32 @loadCombine_4consecutive_3421(ptr %p) {
 
 define i32 @loadCombine_4consecutive_4123(ptr %p) {
 ; CHECK-LABEL: @loadCombine_4consecutive_4123(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i64 3
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
-; CHECK-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
-; CHECK-NEXT:    [[E1:%.*]] = zext i8 [[L1]] to i32
-; CHECK-NEXT:    [[E2:%.*]] = zext i8 [[L2]] to i32
-; CHECK-NEXT:    [[E3:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT:    [[E4:%.*]] = zext i8 [[L4]] to i32
-; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i32 [[E2]], 8
-; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i32 [[E3]], 16
-; CHECK-NEXT:    [[S4:%.*]] = shl nuw i32 [[E4]], 24
-; CHECK-NEXT:    [[O1:%.*]] = or disjoint i32 [[S2]], [[E1]]
-; CHECK-NEXT:    [[O2:%.*]] = or disjoint i32 [[O1]], [[S3]]
-; CHECK-NEXT:    [[O3:%.*]] = or disjoint i32 [[O2]], [[S4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], <i32 0, i32 8, i32 16, i32 24>
+; CHECK-NEXT:    [[O3:%.*]] = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    ret i32 [[O3]]
 ;
   %p1 = getelementptr i8, ptr %p, i32 1
@@ -712,23 +595,10 @@ define i32 @loadCombine_4consecutive_4123(ptr %p) {
 
 define i32 @loadCombine_4consecutive_4132(ptr %p) {
 ; CHECK-LABEL: @loadCombine_4consecutive_4132(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i64 3
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
-; CHECK-NEXT:    [[L4:%.*...
[truncated]

``````````



https://github.com/llvm/llvm-project/pull/174205

