[llvm] [LoopReduceMotion] Improve loops by extracting the reduction instruction (PR #179215)

via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 4 23:35:10 PST 2026


https://github.com/Anjian-Wen updated https://github.com/llvm/llvm-project/pull/179215

>From ef12d82a225c2475e3a9af50a01e7dcee07e6114 Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Mon, 2 Feb 2026 19:36:42 +0800
Subject: [PATCH 1/7] [RISCV] Improve loops by hoisting the reduction out of the
 loop when vector_reduce_add feeds an add recurrence

---
 llvm/include/llvm/CodeGen/Passes.h            |   4 +
 .../Transforms/Vectorize/LoopReduceMotion.h   |  24 ++
 llvm/lib/Passes/PassBuilder.cpp               |   1 +
 llvm/lib/Passes/PassRegistry.def              |   1 +
 llvm/lib/Target/RISCV/RISCVTargetMachine.cpp  |   1 +
 llvm/lib/Transforms/Vectorize/CMakeLists.txt  |   1 +
 .../Transforms/Vectorize/LoopReduceMotion.cpp | 212 ++++++++++++++++++
 llvm/test/CodeGen/RISCV/O3-pipeline.ll        |   1 +
 .../loop-reduce-motion-test.ll                |  72 ++++++
 9 files changed, 317 insertions(+)
 create mode 100644 llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
 create mode 100644 llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
 create mode 100644 llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll

diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 2717110e1b3e7..2bd8e843f8c13 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -629,6 +629,10 @@ LLVM_ABI ModulePass *createWindowsSecureHotPatchingPass();
 
 /// Lowers KCFI operand bundles for indirect calls.
 LLVM_ABI FunctionPass *createKCFIPass();
+
+/// This pass is designed to hoist ReduceCall operations out of loops to
+/// reduce the number of instructions within the loop body.
+LLVM_ABI FunctionPass *createLoopReduceMotionPass();
 } // namespace llvm
 
 #endif
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h b/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
new file mode 100644
index 0000000000000..17bd74472700a
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
@@ -0,0 +1,24 @@
+//===- LoopReduceMotion.h ---------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is designed to hoist `ReduceCall` operations out of loops to reduce
+// the number of instructions within the loop body.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
+#define LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/PassManager.h"
+namespace llvm {
+class LoopReduceMotionPass : public PassInfoMixin<LoopReduceMotionPass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+  bool matchAndTransform(Loop &L, DominatorTree &DT, LoopInfo &LI);
+};
+} // namespace llvm
+#endif
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 45955426d66a0..e25868697a030 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -379,6 +379,7 @@
 #include "llvm/Transforms/Utils/UnifyLoopExits.h"
 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
 #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
+#include "llvm/Transforms/Vectorize/LoopReduceMotion.h"
 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
 #include "llvm/Transforms/Vectorize/SLPVectorizer.h"
 #include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 2cfb5b2592601..edd0962052cbf 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -553,6 +553,7 @@ FUNCTION_PASS("typepromotion", TypePromotionPass(*TM))
 FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass())
 FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass())
 FUNCTION_PASS("vector-combine", VectorCombinePass())
+FUNCTION_PASS("loop-reduce-motion", LoopReduceMotionPass())
 FUNCTION_PASS("verify", VerifierPass())
 FUNCTION_PASS("verify<cycles>", CycleInfoVerifierPass())
 FUNCTION_PASS("verify<domtree>", DominatorTreeVerifierPass())
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 7eb56f52c2e66..303a502be8cf9 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -479,6 +479,7 @@ void RISCVPassConfig::addIRPasses() {
     addPass(createRISCVGatherScatterLoweringPass());
     addPass(createInterleavedAccessPass());
     addPass(createRISCVCodeGenPrepareLegacyPass());
+    addPass(createLoopReduceMotionPass());
   }
 
   TargetPassConfig::addIRPasses();
diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 9f4a242214471..406031876a7d0 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -3,6 +3,7 @@ add_llvm_component_library(LLVMVectorize
   LoopIdiomVectorize.cpp
   LoopVectorizationLegality.cpp
   LoopVectorize.cpp
+  LoopReduceMotion.cpp
   SandboxVectorizer/DependencyGraph.cpp
   SandboxVectorizer/InstrMaps.cpp
   SandboxVectorizer/Interval.cpp
diff --git a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
new file mode 100644
index 0000000000000..33334a2acfa78
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
@@ -0,0 +1,212 @@
+//===-------- LoopReduceMotion.cpp - Loop Reduce Motion Optimization ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This pass is designed to hoist `ReduceCall` operations out of loops to reduce
+// the number of instructions within the loop body.
+//
+// Below are the target pattern to be matched and the resulting pattern
+// after the transformation.
+//
+// before                    | after
+// ------                    | ------
+// loop:                     | loop:
+//   ...                     |   ...
+//   vc = vecbin va, vb      |   vc = vecbin va, vb
+//   d = reduce_add vc       |   vsum = vadd vsum, vc
+//   sum = add sum, d        |   ...
+//   ...                     |   ...
+// exit:                     | exit:
+//   value = sum             |   d = reduce_add sum
+//   ...                     |   value = d
+//   ...                     |   ...
+//   ret                     |   ret
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize/LoopReduceMotion.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Plugins/PassPlugin.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+#define DEBUG_TYPE "loop-reduce-motion"
+
+using namespace llvm;
+
+class LoopReduceMotion : public FunctionPass {
+  LoopReduceMotionPass Impl;
+
+public:
+  static char ID;
+
+  LoopReduceMotion() : FunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "Loop Reduce Motion Pass"; }
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.setPreservesCFG();
+  }
+};
+
+char LoopReduceMotion::ID = 0;
+
+PreservedAnalyses LoopReduceMotionPass::run(Function &F,
+                                            FunctionAnalysisManager &FAM) {
+  LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
+  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+  bool Changed = false;
+  for (Loop *L : LI) {
+    Changed |= matchAndTransform(*L, DT, LI);
+  }
+  if (!Changed)
+    return PreservedAnalyses::all();
+  return PreservedAnalyses::none();
+}
+
+bool LoopReduceMotion::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+  if (!TPC)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
+
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  bool Changed = false;
+  for (Loop *L : LI) {
+    Changed |= Impl.matchAndTransform(*L, DT, LI);
+  }
+  if (!Changed)
+    return false;
+
+  return true;
+}
+
+bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
+                                             LoopInfo &LI) {
+  BasicBlock *Header = L.getHeader();
+  BasicBlock *Latch = L.getLoopLatch();
+  BasicBlock *ExitBlock = L.getExitBlock();
+  if (!Header || !Latch || !ExitBlock) {
+    LLVM_DEBUG(dbgs() << "LRM: Skipping loop " << Header->getName()
+                      << " because it is not a valid loop.\n");
+    return false;
+  }
+  BasicBlock *Preheader = L.getLoopPreheader();
+  if (!Preheader) {
+    Preheader = InsertPreheaderForLoop(&L, &DT, &LI, nullptr, false);
+    if (!Preheader) {
+      LLVM_DEBUG(dbgs() << "LRM: Failed to create a preheader for loop "
+                        << Header->getName() << ".\n");
+      return false;
+    }
+  }
+  for (PHINode &PN : Header->phis()) {
+    if (!PN.getType()->isIntegerTy())
+      continue;
+
+    RecurrenceDescriptor RecDesc;
+    if (!RecurrenceDescriptor::isReductionPHI(&PN, &L, RecDesc))
+      continue;
+
+    if (RecDesc.getRecurrenceKind() != RecurKind::Add)
+      continue;
+
+    Value *RecurrenceValueFromPHI = PN.getIncomingValueForBlock(Latch);
+    Instruction *RecurrenceInst = dyn_cast<Instruction>(RecurrenceValueFromPHI);
+    if (!RecurrenceInst || RecurrenceInst->getNumOperands() != 2)
+      continue;
+
+    Value *RecurrenceValue = RecurrenceInst->getOperand(0) == &PN
+                                 ? RecurrenceInst->getOperand(1)
+                                 : RecurrenceInst->getOperand(0);
+
+    CallInst *ReduceCall = dyn_cast<CallInst>(RecurrenceValue);
+    if (!ReduceCall)
+      continue;
+    Function *CalledFunc = ReduceCall->getCalledFunction();
+
+    if (!CalledFunc || !CalledFunc->isIntrinsic() ||
+        !(CalledFunc->getIntrinsicID() == Intrinsic::vector_reduce_add))
+      continue;
+
+    Value *ReduceOperand = ReduceCall->getArgOperand(0);
+    Instruction *VecBin = dyn_cast<Instruction>(ReduceOperand);
+    if (!VecBin || (VecBin->getOpcode() != Instruction::Sub &&
+                    VecBin->getOpcode() != Instruction::Add))
+      continue;
+    // pattern match success
+    LLVM_DEBUG(dbgs() << "FRM: Found pattern to optimize in loop "
+                      << Header->getName() << "!\n");
+
+    VectorType *VecTy = cast<VectorType>(VecBin->getType());
+    IRBuilder<> PreheaderBuilder(Preheader->getTerminator());
+
+    Value *VecZero = PreheaderBuilder.CreateVectorSplat(
+        VecTy->getElementCount(), ConstantInt::get(VecTy->getElementType(), 0),
+        "vec.zero");
+
+    // build new Vector Add to replace Scalar Add
+    IRBuilder<> HeaderBuilder(Header, Header->getFirstNonPHIIt());
+    PHINode *VecSumPhi = HeaderBuilder.CreatePHI(VecTy, 2, "vec.sum.phi");
+    VecSumPhi->addIncoming(VecZero, Preheader);
+    IRBuilder<> BodyBuilder(RecurrenceInst);
+    Value *NewVecAdd = BodyBuilder.CreateAdd(VecSumPhi, VecBin, "vec.sum.next");
+    VecSumPhi->addIncoming(NewVecAdd, Latch);
+
+    // build landingPad for reduce add out of loop
+    BasicBlock *ExitingBlock =
+        Latch->getTerminator()->getSuccessor(0) == Header ? Latch : Header;
+    if (!L.isLoopExiting(ExitingBlock)) {
+      ExitingBlock = Header;
+    }
+    BasicBlock *LandingPad = SplitEdge(ExitingBlock, ExitBlock, &DT, &LI);
+    LandingPad->setName("loop.exit.landing");
+    IRBuilder<> LandingPadBuilder(LandingPad->getTerminator());
+    Value *ScalarTotalSum = LandingPadBuilder.CreateCall(
+        ReduceCall->getCalledFunction(), NewVecAdd, "scalar.total.sum");
+    Value *PreheaderValue = PN.getIncomingValueForBlock(Preheader);
+    Value *LastAdd =
+        PreheaderValue
+            ? LandingPadBuilder.CreateAdd(PreheaderValue, ScalarTotalSum)
+            : ScalarTotalSum;
+
+    // replace use of phi and erase use empty value
+    if (!PN.use_empty())
+      PN.replaceAllUsesWith(UndefValue::get(PN.getType()));
+    if (PN.use_empty())
+      PN.eraseFromParent();
+    RecurrenceInst->replaceAllUsesWith(dyn_cast<Instruction>(LastAdd));
+    if (RecurrenceInst->use_empty())
+      RecurrenceInst->eraseFromParent();
+    if (ReduceCall->use_empty())
+      ReduceCall->eraseFromParent();
+
+    return true;
+  }
+  return false;
+}
+
+FunctionPass *llvm::createLoopReduceMotionPass() {
+  return new LoopReduceMotion();
+}
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 3b63c1d86d3b1..945b9bf7fd3e0 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -39,6 +39,7 @@
 ; CHECK-NEXT:       RISC-V gather/scatter lowering
 ; CHECK-NEXT:       Interleaved Access Pass
 ; CHECK-NEXT:       RISC-V CodeGenPrepare
+; CHECK-NEXT:       Loop Reduce Motion Pass
 ; CHECK-NEXT:       Module Verifier
 ; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT:       Canonicalize natural loops
diff --git a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
new file mode 100644
index 0000000000000..dfe6b1ddb1f36
--- /dev/null
+++ b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name CHECK,CHECK-O,CHECK-O1,CHECK-O2,CHECK-O3,CHECK-O-NEXT --version 5
+; loop-reduce-motion-test.ll
+; RUN: opt -passes=loop-reduce-motion -S < %s | FileCheck %s
+
+define  i32 @pixel_asd8(ptr noundef readonly captures(none) %pix1, i64 noundef %stride1, ptr noundef readonly captures(none) %pix2, i64 noundef %stride2, i32 noundef signext %height) local_unnamed_addr #0 {
+; CHECK-LABEL: define i32 @pixel_asd8(
+; CHECK-SAME: ptr noundef readonly captures(none) [[PIX1:%.*]], i64 noundef [[STRIDE1:%.*]], ptr noundef readonly captures(none) [[PIX2:%.*]], i64 noundef [[STRIDE2:%.*]], i32 noundef signext [[HEIGHT:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[HEIGHT]], 0
+; CHECK-NEXT:    br i1 [[CMP21]], label %[[FOR_COND1_PREHEADER_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK:       [[FOR_COND1_PREHEADER_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_COND1_PREHEADER:.*]]
+; CHECK:       [[FOR_COND1_PREHEADER]]:
+; CHECK-NEXT:    [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], %[[FOR_COND1_PREHEADER]] ], [ 0, %[[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[PIX1]], %[[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[PIX2]], %[[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[VEC_SUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, %[[FOR_COND1_PREHEADER_PREHEADER]] ], [ [[VEC_SUM_NEXT:%.*]], %[[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[VEC_SUM_NEXT]] = add <8 x i32> [[VEC_SUM_PHI]], [[TMP4]]
+; CHECK-NEXT:    [[INC9]] = add nuw nsw i32 [[Y_025]], 1
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1]]
+; CHECK-NEXT:    [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2]]
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[HEIGHT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[LOOP_EXIT_LANDING:.*]], label %[[FOR_COND1_PREHEADER]]
+; CHECK:       [[LOOP_EXIT_LANDING]]:
+; CHECK-NEXT:    [[SCALAR_TOTAL_SUM:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[VEC_SUM_NEXT]])
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 0, [[SCALAR_TOTAL_SUM]]
+; CHECK-NEXT:    br label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]]
+; CHECK:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP5]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP6]]
+;
+entry:
+  %cmp21 = icmp sgt i32 %height, 0
+  br i1 %cmp21, label %for.cond1.preheader.preheader, label %for.cond.cleanup
+
+for.cond1.preheader.preheader:                    ; preds = %entry
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.preheader, %for.cond1.preheader
+  %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
+  %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
+  %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %for.cond1.preheader.preheader ]
+  %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %for.cond1.preheader.preheader ]
+  %0 = load <8 x i8>, ptr %pix1.addr.023
+  %1 = load <8 x i8>, ptr %pix2.addr.022
+  %2 = zext <8 x i8> %0 to <8 x i32>
+  %3 = zext <8 x i8> %1 to <8 x i32>
+  %4 = sub nsw <8 x i32> %2, %3
+  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+  %add.7 = add i32 %sum.024, %5
+  %inc9 = add nuw nsw i32 %y.025, 1
+  %add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
+  %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
+  %exitcond.not = icmp eq i32 %inc9, %height
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.cond1.preheader
+
+for.cond.cleanup.loopexit:                        ; preds = %for.cond1.preheader
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.7, %for.cond.cleanup.loopexit ]
+  %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
+  ret i32 %6
+}

>From 805a89422c428c87b578ff59b2b92e475480d0d7 Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Tue, 3 Feb 2026 20:03:38 +0800
Subject: [PATCH 2/7] Fix format

---
 .../Transforms/Vectorize/LoopReduceMotion.cpp |  2 +-
 .../loop-reduce-motion-test.ll                | 58 +++++++++----------
 2 files changed, 28 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
index 33334a2acfa78..dc9a1223bae02 100644
--- a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
@@ -193,7 +193,7 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
 
     // replace use of phi and erase use empty value
     if (!PN.use_empty())
-      PN.replaceAllUsesWith(UndefValue::get(PN.getType()));
+      PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
     if (PN.use_empty())
       PN.eraseFromParent();
     RecurrenceInst->replaceAllUsesWith(dyn_cast<Instruction>(LastAdd));
diff --git a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
index dfe6b1ddb1f36..73bea9d6623e8 100644
--- a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
+++ b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
@@ -1,20 +1,19 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name CHECK,CHECK-O,CHECK-O1,CHECK-O2,CHECK-O3,CHECK-O-NEXT --version 5
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; loop-reduce-motion-test.ll
 ; RUN: opt -passes=loop-reduce-motion -S < %s | FileCheck %s
 
-define  i32 @pixel_asd8(ptr noundef readonly captures(none) %pix1, i64 noundef %stride1, ptr noundef readonly captures(none) %pix2, i64 noundef %stride2, i32 noundef signext %height) local_unnamed_addr #0 {
-; CHECK-LABEL: define i32 @pixel_asd8(
-; CHECK-SAME: ptr noundef readonly captures(none) [[PIX1:%.*]], i64 noundef [[STRIDE1:%.*]], ptr noundef readonly captures(none) [[PIX2:%.*]], i64 noundef [[STRIDE2:%.*]], i32 noundef signext [[HEIGHT:%.*]]) local_unnamed_addr {
-; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[HEIGHT]], 0
-; CHECK-NEXT:    br i1 [[CMP21]], label %[[FOR_COND1_PREHEADER_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
-; CHECK:       [[FOR_COND1_PREHEADER_PREHEADER]]:
-; CHECK-NEXT:    br label %[[FOR_COND1_PREHEADER:.*]]
-; CHECK:       [[FOR_COND1_PREHEADER]]:
-; CHECK-NEXT:    [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], %[[FOR_COND1_PREHEADER]] ], [ 0, %[[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT:    [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[PIX1]], %[[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT:    [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[PIX2]], %[[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT:    [[VEC_SUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, %[[FOR_COND1_PREHEADER_PREHEADER]] ], [ [[VEC_SUM_NEXT:%.*]], %[[FOR_COND1_PREHEADER]] ]
+define i32 @pixel_asd8(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %height) {
+; CHECK-LABEL: @pixel_asd8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[HEIGHT:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.cond1.preheader.preheader:
+; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK:       for.cond1.preheader:
+; CHECK-NEXT:    [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[VEC_SUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[FOR_COND1_PREHEADER_PREHEADER]] ], [ [[VEC_SUM_NEXT:%.*]], [[FOR_COND1_PREHEADER]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
@@ -22,33 +21,30 @@ define  i32 @pixel_asd8(ptr noundef readonly captures(none) %pix1, i64 noundef %
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    [[VEC_SUM_NEXT]] = add <8 x i32> [[VEC_SUM_PHI]], [[TMP4]]
 ; CHECK-NEXT:    [[INC9]] = add nuw nsw i32 [[Y_025]], 1
-; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1]]
-; CHECK-NEXT:    [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2]]
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
+; CHECK-NEXT:    [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[HEIGHT]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[LOOP_EXIT_LANDING:.*]], label %[[FOR_COND1_PREHEADER]]
-; CHECK:       [[LOOP_EXIT_LANDING]]:
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[LOOP_EXIT_LANDING:%.*]], label [[FOR_COND1_PREHEADER]]
+; CHECK:       loop.exit.landing:
 ; CHECK-NEXT:    [[SCALAR_TOTAL_SUM:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[VEC_SUM_NEXT]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i32 0, [[SCALAR_TOTAL_SUM]]
-; CHECK-NEXT:    br label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]]
-; CHECK:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
-; CHECK-NEXT:    br label %[[FOR_COND_CLEANUP]]
-; CHECK:       [[FOR_COND_CLEANUP]]:
-; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP5]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
 ; CHECK-NEXT:    ret i32 [[TMP6]]
 ;
 entry:
   %cmp21 = icmp sgt i32 %height, 0
-  br i1 %cmp21, label %for.cond1.preheader.preheader, label %for.cond.cleanup
-
-for.cond1.preheader.preheader:                    ; preds = %entry
-  br label %for.cond1.preheader
+  br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
 
 for.cond1.preheader:                              ; preds = %for.cond1.preheader.preheader, %for.cond1.preheader
-  %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
-  %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
-  %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %for.cond1.preheader.preheader ]
-  %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %for.cond1.preheader.preheader ]
+  %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %entry ]
+  %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %entry ]
+  %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %entry ]
+  %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %entry ]
   %0 = load <8 x i8>, ptr %pix1.addr.023
   %1 = load <8 x i8>, ptr %pix2.addr.022
   %2 = zext <8 x i8> %0 to <8 x i32>

>From 2134c689883bb02affee9bc453a19876d58794ff Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Wed, 4 Feb 2026 19:26:03 +0800
Subject: [PATCH 3/7] Fix formatting and guard against a null dyn_cast result

---
 .../Transforms/Vectorize/LoopReduceMotion.h     |  6 ++++--
 llvm/lib/Passes/PassRegistry.def                |  2 +-
 llvm/lib/Transforms/Vectorize/CMakeLists.txt    |  2 +-
 .../Transforms/Vectorize/LoopReduceMotion.cpp   | 17 +++++++++--------
 4 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h b/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
index 17bd74472700a..df5af76819923 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
@@ -6,14 +6,16 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This pass is designed to hoist `ReduceCall` operations out of loops to reduce
+// This pass is designed to sink `ReduceCall` operations out of loops to reduce
 // the number of instructions within the loop body.
 //
 //===----------------------------------------------------------------------===//
 #ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
 #define LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
+
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/PassManager.h"
+
 namespace llvm {
 class LoopReduceMotionPass : public PassInfoMixin<LoopReduceMotionPass> {
 public:
@@ -21,4 +23,4 @@ class LoopReduceMotionPass : public PassInfoMixin<LoopReduceMotionPass> {
   bool matchAndTransform(Loop &L, DominatorTree &DT, LoopInfo &LI);
 };
 } // namespace llvm
-#endif
+#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index edd0962052cbf..c896d7c99c107 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -469,6 +469,7 @@ FUNCTION_PASS("loop-data-prefetch", LoopDataPrefetchPass())
 FUNCTION_PASS("loop-distribute", LoopDistributePass())
 FUNCTION_PASS("loop-fusion", LoopFusePass())
 FUNCTION_PASS("loop-load-elim", LoopLoadEliminationPass())
+FUNCTION_PASS("loop-reduce-motion", LoopReduceMotionPass())
 FUNCTION_PASS("loop-simplify", LoopSimplifyPass())
 FUNCTION_PASS("loop-sink", LoopSinkPass())
 FUNCTION_PASS("loop-versioning", LoopVersioningPass())
@@ -553,7 +554,6 @@ FUNCTION_PASS("typepromotion", TypePromotionPass(*TM))
 FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass())
 FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass())
 FUNCTION_PASS("vector-combine", VectorCombinePass())
-FUNCTION_PASS("loop-reduce-motion", LoopReduceMotionPass())
 FUNCTION_PASS("verify", VerifierPass())
 FUNCTION_PASS("verify<cycles>", CycleInfoVerifierPass())
 FUNCTION_PASS("verify<domtree>", DominatorTreeVerifierPass())
diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 406031876a7d0..0fa532010632b 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -1,9 +1,9 @@
 add_llvm_component_library(LLVMVectorize
   LoadStoreVectorizer.cpp
   LoopIdiomVectorize.cpp
+  LoopReduceMotion.cpp
   LoopVectorizationLegality.cpp
   LoopVectorize.cpp
-  LoopReduceMotion.cpp
   SandboxVectorizer/DependencyGraph.cpp
   SandboxVectorizer/InstrMaps.cpp
   SandboxVectorizer/Interval.cpp
diff --git a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
index dc9a1223bae02..fdefcb7e00074 100644
--- a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
@@ -5,10 +5,10 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// This pass is designed to hoist `ReduceCall` operations out of loops to reduce
+// This pass is designed to sink `ReduceCall` operations out of loops to reduce
 // the number of instructions within the loop body.
 //
-// Below are the target pattern to be matched and the resulting pattern
+// Below is the target pattern to be matched and the resulting pattern
 // after the transformation.
 //
 // before                    | after
@@ -160,11 +160,7 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
                       << Header->getName() << "!\n");
 
     VectorType *VecTy = cast<VectorType>(VecBin->getType());
-    IRBuilder<> PreheaderBuilder(Preheader->getTerminator());
-
-    Value *VecZero = PreheaderBuilder.CreateVectorSplat(
-        VecTy->getElementCount(), ConstantInt::get(VecTy->getElementType(), 0),
-        "vec.zero");
+    Value *VecZero = ConstantInt::get(VecTy, 0);
 
     // build new Vector Add to replace Scalar Add
     IRBuilder<> HeaderBuilder(Header, Header->getFirstNonPHIIt());
@@ -196,7 +192,12 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
       PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
     if (PN.use_empty())
       PN.eraseFromParent();
-    RecurrenceInst->replaceAllUsesWith(dyn_cast<Instruction>(LastAdd));
+
+    Instruction *FinalNode = dyn_cast<Instruction>(LastAdd);
+    if (!FinalNode)
+      return false;
+    RecurrenceInst->replaceAllUsesWith(FinalNode);
+
     if (RecurrenceInst->use_empty())
       RecurrenceInst->eraseFromParent();
     if (ReduceCall->use_empty())

>From de183c53008b7f4a28aa1cd93c43fb48bd9d96d6 Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Thu, 5 Feb 2026 19:25:25 +0800
Subject: [PATCH 4/7] Remove an incorrect prefix from a debug log message

---
 llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
index fdefcb7e00074..f9b45ade53676 100644
--- a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
@@ -156,7 +156,7 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
                     VecBin->getOpcode() != Instruction::Add))
       continue;
     // pattern match success
-    LLVM_DEBUG(dbgs() << "FRM: Found pattern to optimize in loop "
+    LLVM_DEBUG(dbgs() << "Found pattern to optimize in loop "
                       << Header->getName() << "!\n");
 
     VectorType *VecTy = cast<VectorType>(VecBin->getType());

>From ac4e3131d3d015a24e8e61c178ceb40d5a693a3b Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Mon, 9 Feb 2026 15:02:58 +0800
Subject: [PATCH 5/7] Move the pass to run after VectorCombine; add some
 negative tests; rename the legacy pass class to avoid confusion; replace
 manual dead-node deletion with the RecursivelyDelete* helper functions

---
 llvm/lib/Passes/PassBuilderPipelines.cpp      |   3 +
 llvm/lib/Target/RISCV/RISCVTargetMachine.cpp  |   1 -
 .../Transforms/Vectorize/LoopReduceMotion.cpp |  23 +-
 llvm/test/CodeGen/RISCV/O3-pipeline.ll        |   1 -
 llvm/test/Other/new-pm-defaults.ll            |   1 +
 llvm/test/Other/new-pm-lto-defaults.ll        |   1 +
 .../Other/new-pm-thinlto-postlink-defaults.ll |   1 +
 .../new-pm-thinlto-postlink-pgo-defaults.ll   |   1 +
 ...-pm-thinlto-postlink-samplepgo-defaults.ll |   1 +
 .../loop-reduce-motion-test.ll                | 272 +++++++++++++++++-
 10 files changed, 287 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 1584d30875570..8579da78be8c8 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -147,6 +147,7 @@
 #include "llvm/Transforms/Utils/NameAnonGlobals.h"
 #include "llvm/Transforms/Utils/RelLookupTableConverter.h"
 #include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
+#include "llvm/Transforms/Vectorize/LoopReduceMotion.h"
 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
 #include "llvm/Transforms/Vectorize/SLPVectorizer.h"
 #include "llvm/Transforms/Vectorize/VectorCombine.h"
@@ -1418,6 +1419,8 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
   }
   // Enhance/cleanup vector code.
   FPM.addPass(VectorCombinePass());
+  // Try to sink ReduceCall out of loop
+  FPM.addPass(LoopReduceMotionPass());
 
   if (!IsFullLTO) {
     FPM.addPass(InstCombinePass());
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 303a502be8cf9..7eb56f52c2e66 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -479,7 +479,6 @@ void RISCVPassConfig::addIRPasses() {
     addPass(createRISCVGatherScatterLoweringPass());
     addPass(createInterleavedAccessPass());
     addPass(createRISCVCodeGenPrepareLegacyPass());
-    addPass(createLoopReduceMotionPass());
   }
 
   TargetPassConfig::addIRPasses();
diff --git a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
index f9b45ade53676..f56a546c5ceb9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
@@ -46,13 +46,13 @@
 
 using namespace llvm;
 
-class LoopReduceMotion : public FunctionPass {
+class LoopReduceMotionLegacy : public FunctionPass {
   LoopReduceMotionPass Impl;
 
 public:
   static char ID;
 
-  LoopReduceMotion() : FunctionPass(ID) {}
+  LoopReduceMotionLegacy() : FunctionPass(ID) {}
 
   StringRef getPassName() const override { return "Loop Reduce Motion Pass"; }
 
@@ -65,7 +65,7 @@ class LoopReduceMotion : public FunctionPass {
   }
 };
 
-char LoopReduceMotion::ID = 0;
+char LoopReduceMotionLegacy::ID = 0;
 
 PreservedAnalyses LoopReduceMotionPass::run(Function &F,
                                             FunctionAnalysisManager &FAM) {
@@ -80,7 +80,7 @@ PreservedAnalyses LoopReduceMotionPass::run(Function &F,
   return PreservedAnalyses::none();
 }
 
-bool LoopReduceMotion::runOnFunction(Function &F) {
+bool LoopReduceMotionLegacy::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;
 
@@ -187,21 +187,16 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
             ? LandingPadBuilder.CreateAdd(PreheaderValue, ScalarTotalSum)
             : ScalarTotalSum;
 
-    // replace use of phi and erase use empty value
+    // delete the dead PHI Node
     if (!PN.use_empty())
       PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
-    if (PN.use_empty())
-      PN.eraseFromParent();
-
+    llvm::RecursivelyDeleteDeadPHINode(&PN);
+    // replace the use of Recurrence Node and delete the dead Node
     Instruction *FinalNode = dyn_cast<Instruction>(LastAdd);
     if (!FinalNode)
       return false;
     RecurrenceInst->replaceAllUsesWith(FinalNode);
-
-    if (RecurrenceInst->use_empty())
-      RecurrenceInst->eraseFromParent();
-    if (ReduceCall->use_empty())
-      ReduceCall->eraseFromParent();
+    llvm::RecursivelyDeleteTriviallyDeadInstructions(RecurrenceInst);
 
     return true;
   }
@@ -209,5 +204,5 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
 }
 
 FunctionPass *llvm::createLoopReduceMotionPass() {
-  return new LoopReduceMotion();
+  return new LoopReduceMotionLegacy();
 }
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 945b9bf7fd3e0..3b63c1d86d3b1 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -39,7 +39,6 @@
 ; CHECK-NEXT:       RISC-V gather/scatter lowering
 ; CHECK-NEXT:       Interleaved Access Pass
 ; CHECK-NEXT:       RISC-V CodeGenPrepare
-; CHECK-NEXT:       Loop Reduce Motion Pass
 ; CHECK-NEXT:       Module Verifier
 ; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT:       Canonicalize natural loops
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index f074b2fdd3ab8..9af05eb49e65e 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -269,6 +269,7 @@
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: LoopReduceMotionPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll
index de0feca55e5b2..cebbe671e17b3 100644
--- a/llvm/test/Other/new-pm-lto-defaults.ll
+++ b/llvm/test/Other/new-pm-lto-defaults.ll
@@ -143,6 +143,7 @@
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass on foo
 ; CHECK-OS-NEXT: Running pass: SLPVectorizerPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass on foo
+; CHECK-O23SZ-NEXT: Running pass: LoopReduceMotionPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: InferAlignmentPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: InstCombinePass on foo
 ; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
index b0d08316de4f0..4065cead7c264 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
@@ -188,6 +188,7 @@
 ; CHECK-POSTLINK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-POSTLINK-Os-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-POSTLINK-O-NEXT: Running pass: LoopReduceMotionPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
index 6b3e82a752899..126caf7eed3ab 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -173,6 +173,7 @@
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: LoopReduceMotionPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
index 88dc18f605ce2..e9879a512a9b9 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -182,6 +182,7 @@
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: LoopReduceMotionPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
diff --git a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
index 73bea9d6623e8..859a19e701fa8 100644
--- a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
+++ b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
@@ -2,8 +2,8 @@
 ; loop-reduce-motion-test.ll
 ; RUN: opt -passes=loop-reduce-motion -S < %s | FileCheck %s
 
-define i32 @pixel_asd8(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %height) {
-; CHECK-LABEL: @pixel_asd8(
+define i32 @func_with_VecBin_Sub(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %height) {
+; CHECK-LABEL: @func_with_VecBin_Sub(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[HEIGHT:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
@@ -66,3 +66,271 @@ for.cond.cleanup:                                 ; preds = %for.cond.cleanup.lo
   %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
   ret i32 %6
 }
+
+define i32 @func_with_VecBin_add(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %height) {
+; CHECK-LABEL: @func_with_VecBin_add(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[HEIGHT:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.cond1.preheader.preheader:
+; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK:       for.cond1.preheader:
+; CHECK-NEXT:    [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[VEC_SUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[FOR_COND1_PREHEADER_PREHEADER]] ], [ [[VEC_SUM_NEXT:%.*]], [[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <8 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[VEC_SUM_NEXT]] = add <8 x i32> [[VEC_SUM_PHI]], [[TMP4]]
+; CHECK-NEXT:    [[INC9]] = add nuw nsw i32 [[Y_025]], 1
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
+; CHECK-NEXT:    [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[HEIGHT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[LOOP_EXIT_LANDING:%.*]], label [[FOR_COND1_PREHEADER]]
+; CHECK:       loop.exit.landing:
+; CHECK-NEXT:    [[SCALAR_TOTAL_SUM:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[VEC_SUM_NEXT]])
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 0, [[SCALAR_TOTAL_SUM]]
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP6]]
+;
+entry:
+  %cmp21 = icmp sgt i32 %height, 0
+  br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond1.preheader:                              ; preds = %entry, %for.cond1.preheader
+  %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %entry ]
+  %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %entry ]
+  %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %entry ]
+  %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %entry ]
+  %0 = load <8 x i8>, ptr %pix1.addr.023
+  %1 = load <8 x i8>, ptr %pix2.addr.022
+  %2 = zext <8 x i8> %0 to <8 x i32>
+  %3 = zext <8 x i8> %1 to <8 x i32>
+  %4 = add nsw <8 x i32> %2, %3
+  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+  %add.7 = add i32 %sum.024, %5
+  %inc9 = add nuw nsw i32 %y.025, 1
+  %add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
+  %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
+  %exitcond.not = icmp eq i32 %inc9, %height
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.cond1.preheader
+
+for.cond.cleanup.loopexit:                        ; preds = %for.cond1.preheader
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.7, %for.cond.cleanup.loopexit ]
+  %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
+  ret i32 %6
+}
+
+define i32 @multi_exit(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %val1, i32 %val2) {
+; CHECK-LABEL: @multi_exit(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[VAL1:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP21]], label [[FOR_COND1_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.cond1.preheader:
+; CHECK-NEXT:    [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[NEXT_COND1:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SUM_024:%.*]] = phi i32 [ [[ADD_7:%.*]], [[NEXT_COND1]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[NEXT_COND1]] ], [ [[PIX1:%.*]], [[ENTRY]] ]
+; CHECK-NEXT:    [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[NEXT_COND1]] ], [ [[PIX2:%.*]], [[ENTRY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
+; CHECK-NEXT:    [[ADD_7]] = add i32 [[SUM_024]], [[TMP5]]
+; CHECK-NEXT:    [[INC9]] = add nuw nsw i32 [[Y_025]], 1
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
+; CHECK-NEXT:    [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
+; CHECK-NEXT:    br label [[NEXT_COND0:%.*]]
+; CHECK:       next.cond0:
+; CHECK-NEXT:    [[ADD_8:%.*]] = add i32 [[ADD_7]], 1
+; CHECK-NEXT:    [[EXIT1:%.*]] = icmp eq i32 [[INC9]], [[VAL1]]
+; CHECK-NEXT:    br i1 [[EXIT1]], label [[FOR_COND_CLEANUP]], label [[NEXT_COND1]]
+; CHECK:       next.cond1:
+; CHECK-NEXT:    [[ADD_9:%.*]] = add i32 [[ADD_7]], 2
+; CHECK-NEXT:    [[EXIT2:%.*]] = icmp eq i32 [[INC9]], [[VAL2:%.*]]
+; CHECK-NEXT:    br i1 [[EXIT2]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD_8]], [[NEXT_COND0]] ], [ [[ADD_9]], [[NEXT_COND1]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP6]]
+;
+entry:
+  %cmp21 = icmp sgt i32 %val1, 0
+  br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond1.preheader:                              ; preds = %entry, %next.cond1
+  %y.025 = phi i32 [ %inc9, %next.cond1 ], [ 0, %entry ]
+  %sum.024 = phi i32 [ %add.7, %next.cond1 ], [ 0, %entry ]
+  %pix1.addr.023 = phi ptr [ %add.ptr, %next.cond1 ], [ %pix1, %entry ]
+  %pix2.addr.022 = phi ptr [ %add.ptr10, %next.cond1 ], [ %pix2, %entry ]
+  %0 = load <8 x i8>, ptr %pix1.addr.023
+  %1 = load <8 x i8>, ptr %pix2.addr.022
+  %2 = zext <8 x i8> %0 to <8 x i32>
+  %3 = zext <8 x i8> %1 to <8 x i32>
+  %4 = sub nsw <8 x i32> %2, %3
+  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+  %add.7 = add i32 %sum.024, %5
+  %inc9 = add nuw nsw i32 %y.025, 1
+  %add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
+  %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
+  br label %next.cond0
+
+next.cond0:
+  %add.8 = add i32 %add.7, 1
+  %exit1 = icmp eq i32 %inc9, %val1
+  br i1 %exit1, label %for.cond.cleanup, label %next.cond1
+
+next.cond1:
+  %add.9 = add i32 %add.7, 2
+  %exit2 = icmp eq i32 %inc9, %val2
+  br i1 %exit2, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond.cleanup:                                 ; preds = %entry, %next.cond0, %next.cond1
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.8, %next.cond0 ], [%add.9, %next.cond1 ]
+  %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
+  ret i32 %6
+}
+
+define i32 @phi_not_reduction_call(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %val1) {
+; CHECK-LABEL: @phi_not_reduction_call(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[VAL1:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.cond1.preheader.preheader:
+; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK:       for.cond1.preheader:
+; CHECK-NEXT:    [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[SUM_024:%.*]] = phi i32 [ [[ADD_7:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
+; CHECK-NEXT:    [[ADD_7]] = add i32 [[SUM_024]], [[TMP5]]
+; CHECK-NEXT:    [[INC9]] = add nuw nsw i32 [[Y_025]], 1
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
+; CHECK-NEXT:    [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
+; CHECK-NEXT:    [[EXIT:%.*]] = icmp eq i32 [[INC9]], [[VAL1]]
+; CHECK-NEXT:    br i1 [[EXIT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_COND1_PREHEADER]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_7]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_024]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], [[SUM]]
+; CHECK-NEXT:    ret i32 [[TMP7]]
+;
+entry:
+  %cmp21 = icmp sgt i32 %val1, 0
+  br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond1.preheader:                              ; preds = %entry, %for.cond1.preheader
+  %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %entry ]
+  %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %entry ]
+  %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %entry ]
+  %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %entry ]
+  %0 = load <8 x i8>, ptr %pix1.addr.023
+  %1 = load <8 x i8>, ptr %pix2.addr.022
+  %2 = zext <8 x i8> %0 to <8 x i32>
+  %3 = zext <8 x i8> %1 to <8 x i32>
+  %4 = sub nsw <8 x i32> %2, %3
+  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+  %add.7 = add i32 %sum.024, %5
+  %inc9 = add nuw nsw i32 %y.025, 1
+  %add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
+  %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
+  %exit = icmp eq i32 %inc9, %val1
+  br i1 %exit, label %for.cond.cleanup.loopexit, label %for.cond1.preheader
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [%add.7, %for.cond.cleanup.loopexit ]
+  %sum = phi i32 [0, %entry], [ %sum.024, %for.cond.cleanup.loopexit]
+  %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
+  %7 = add i32 %6, %sum
+  ret i32 %7
+}
+
+define i32 @reduction_call_not_add(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %val1) {
+; CHECK-LABEL: @reduction_call_not_add(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[VAL1:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.cond1.preheader.preheader:
+; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK:       for.cond1.preheader:
+; CHECK-NEXT:    [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[SUM_024:%.*]] = phi i32 [ [[ADD_7:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
+; CHECK-NEXT:    [[ADD_7]] = sub i32 [[SUM_024]], [[TMP5]]
+; CHECK-NEXT:    [[INC9]] = add nuw nsw i32 [[Y_025]], 1
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
+; CHECK-NEXT:    [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
+; CHECK-NEXT:    [[EXIT:%.*]] = icmp eq i32 [[INC9]], [[VAL1]]
+; CHECK-NEXT:    br i1 [[EXIT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_COND1_PREHEADER]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_7]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_024]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], [[SUM]]
+; CHECK-NEXT:    ret i32 [[TMP7]]
+;
+entry:
+  %cmp21 = icmp sgt i32 %val1, 0
+  br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond1.preheader:                              ; preds = %entry, %for.cond1.preheader
+  %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %entry ]
+  %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %entry ]
+  %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %entry ]
+  %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %entry ]
+  %0 = load <8 x i8>, ptr %pix1.addr.023
+  %1 = load <8 x i8>, ptr %pix2.addr.022
+  %2 = zext <8 x i8> %0 to <8 x i32>
+  %3 = zext <8 x i8> %1 to <8 x i32>
+  %4 = sub nsw <8 x i32> %2, %3
+  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+  %add.7 = sub i32 %sum.024, %5
+  %inc9 = add nuw nsw i32 %y.025, 1
+  %add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
+  %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
+  %exit = icmp eq i32 %inc9, %val1
+  br i1 %exit, label %for.cond.cleanup.loopexit, label %for.cond1.preheader
+
+for.cond.cleanup.loopexit:                        ; preds = %for.cond1.preheader
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [%add.7, %for.cond.cleanup.loopexit ]
+  %sum = phi i32 [0, %entry], [ %sum.024, %for.cond.cleanup.loopexit]
+  %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
+  %7 = add i32 %6, %sum
+  ret i32 %7
+}

>From 237b245dfb33ccc61e9e78d0de44defa3a8fa2c9 Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Mon, 9 Feb 2026 16:33:38 +0800
Subject: [PATCH 6/7] update comment

---
 llvm/include/llvm/CodeGen/Passes.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 2bd8e843f8c13..0745f0a408d67 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -630,7 +630,7 @@ LLVM_ABI ModulePass *createWindowsSecureHotPatchingPass();
 /// Lowers KCFI operand bundles for indirect calls.
 LLVM_ABI FunctionPass *createKCFIPass();
 
-/// This pass is designed to hoist ReduceCall operations out of loops to
+/// This pass is designed to sink ReduceCall operations out of loops to
 /// reduce the number of instructions within the loop body.
 LLVM_ABI FunctionPass *createLoopReduceMotionPass();
 } // namespace llvm

>From be1f11eafc39139991621c4153ce559e12508a68 Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Thu, 5 Mar 2026 15:31:01 +0800
Subject: [PATCH 7/7] Convert the pass to a loop pass and fix some errors

---
 llvm/include/llvm/CodeGen/Passes.h            |   4 -
 .../Transforms/Vectorize/LoopReduceMotion.h   |   7 +-
 llvm/lib/Passes/PassBuilderPipelines.cpp      |   5 +-
 llvm/lib/Passes/PassRegistry.def              |   2 +-
 .../Transforms/Vectorize/LoopReduceMotion.cpp | 171 ++++++++---------
 llvm/test/Other/new-pm-defaults.ll            |   2 +
 llvm/test/Other/new-pm-lto-defaults.ll        |   8 +-
 .../Other/new-pm-thinlto-postlink-defaults.ll |   2 +
 .../new-pm-thinlto-postlink-pgo-defaults.ll   |   2 +
 ...-pm-thinlto-postlink-samplepgo-defaults.ll |   2 +
 .../loop-reduce-motion-test.ll                | 143 +++++++-------
 .../PhaseOrdering/AArch64/udotabd.ll          | 174 +++++++-----------
 12 files changed, 245 insertions(+), 277 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 0745f0a408d67..2717110e1b3e7 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -629,10 +629,6 @@ LLVM_ABI ModulePass *createWindowsSecureHotPatchingPass();
 
 /// Lowers KCFI operand bundles for indirect calls.
 LLVM_ABI FunctionPass *createKCFIPass();
-
-/// This pass is designed to sink ReduceCall operations out of loops to
-/// reduce the number of instructions within the loop body.
-LLVM_ABI FunctionPass *createLoopReduceMotionPass();
 } // namespace llvm
 
 #endif
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h b/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
index df5af76819923..26d178c63b0c9 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
@@ -13,14 +13,17 @@
 #ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
 #define LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
 
+#include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
 
 namespace llvm {
 class LoopReduceMotionPass : public PassInfoMixin<LoopReduceMotionPass> {
 public:
-  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
-  bool matchAndTransform(Loop &L, DominatorTree &DT, LoopInfo &LI);
+  bool matchAndTransform(Loop &L, DominatorTree *DT, LoopInfo *LI);
+  PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
+                        LoopStandardAnalysisResults &AR, LPMUpdater &U);
 };
 } // namespace llvm
 #endif // LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 8579da78be8c8..4897cc950322f 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1419,8 +1419,11 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
   }
   // Enhance/cleanup vector code.
   FPM.addPass(VectorCombinePass());
+
+  LoopPassManager LPM;
   // Try to sink ReduceCall out of loop
-  FPM.addPass(LoopReduceMotionPass());
+  LPM.addPass(LoopReduceMotionPass());
+  FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM)));
 
   if (!IsFullLTO) {
     FPM.addPass(InstCombinePass());
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index c896d7c99c107..5d8e314de4ce1 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -469,7 +469,6 @@ FUNCTION_PASS("loop-data-prefetch", LoopDataPrefetchPass())
 FUNCTION_PASS("loop-distribute", LoopDistributePass())
 FUNCTION_PASS("loop-fusion", LoopFusePass())
 FUNCTION_PASS("loop-load-elim", LoopLoadEliminationPass())
-FUNCTION_PASS("loop-reduce-motion", LoopReduceMotionPass())
 FUNCTION_PASS("loop-simplify", LoopSimplifyPass())
 FUNCTION_PASS("loop-sink", LoopSinkPass())
 FUNCTION_PASS("loop-versioning", LoopVersioningPass())
@@ -777,6 +776,7 @@ LOOP_PASS("loop-idiom-vectorize", LoopIdiomVectorizePass())
 LOOP_PASS("loop-instsimplify", LoopInstSimplifyPass())
 LOOP_PASS("loop-predication", LoopPredicationPass())
 LOOP_PASS("loop-reduce", LoopStrengthReducePass())
+LOOP_PASS("loop-reduce-motion", LoopReduceMotionPass())
 LOOP_PASS("loop-term-fold", LoopTermFoldPass())
 LOOP_PASS("loop-simplifycfg", LoopSimplifyCFGPass())
 LOOP_PASS("loop-unroll-full", LoopFullUnrollPass())
diff --git a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
index f56a546c5ceb9..e54ba56ec3e32 100644
--- a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
@@ -15,8 +15,7 @@
 // ------                    | ------
 // loop:                     | loop:
 //   ...                     |   ...
-//   vc = vecbin va, vb      |   vc = vecbin va, vb
-//   d = reduce_add vc       |   vsum = vadd vsum, vc
+//   d = reduce_add v        |   vsum = vadd vsum, v
 //   sum = add sum, d        |   ...
 //   ...                     |   ...
 // exit:                     | exit:
@@ -35,6 +34,7 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Pass.h"
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/Plugins/PassPlugin.h"
@@ -45,68 +45,24 @@
 #define DEBUG_TYPE "loop-reduce-motion"
 
 using namespace llvm;
+PreservedAnalyses LoopReduceMotionPass::run(Loop &L, LoopAnalysisManager &LAM,
+                                            LoopStandardAnalysisResults &LAR,
+                                            LPMUpdater &Updater) {
 
-class LoopReduceMotionLegacy : public FunctionPass {
-  LoopReduceMotionPass Impl;
+  bool Changed = matchAndTransform(L, &LAR.DT, &LAR.LI);
 
-public:
-  static char ID;
-
-  LoopReduceMotionLegacy() : FunctionPass(ID) {}
-
-  StringRef getPassName() const override { return "Loop Reduce Motion Pass"; }
-
-  bool runOnFunction(Function &F) override;
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<DominatorTreeWrapperPass>();
-    AU.addRequired<LoopInfoWrapperPass>();
-    AU.setPreservesCFG();
-  }
-};
-
-char LoopReduceMotionLegacy::ID = 0;
-
-PreservedAnalyses LoopReduceMotionPass::run(Function &F,
-                                            FunctionAnalysisManager &FAM) {
-  LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
-  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
-  bool Changed = false;
-  for (Loop *L : LI) {
-    Changed |= matchAndTransform(*L, DT, LI);
-  }
   if (!Changed)
     return PreservedAnalyses::all();
   return PreservedAnalyses::none();
 }
 
-bool LoopReduceMotionLegacy::runOnFunction(Function &F) {
-  if (skipFunction(F))
-    return false;
-
-  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
-  if (!TPC)
-    return false;
-
-  LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
-
-  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
-  bool Changed = false;
-  for (Loop *L : LI) {
-    Changed |= Impl.matchAndTransform(*L, DT, LI);
-  }
-  if (!Changed)
-    return false;
-
-  return true;
-}
-
-bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
-                                             LoopInfo &LI) {
+bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree *DT,
+                                             LoopInfo *LI) {
   BasicBlock *Header = L.getHeader();
   BasicBlock *Latch = L.getLoopLatch();
   BasicBlock *ExitBlock = L.getExitBlock();
+  BasicBlock *ExitingBlock = L.getExitingBlock();
+  BasicBlock *LandingPad = nullptr;
   if (!Header || !Latch || !ExitBlock) {
     LLVM_DEBUG(dbgs() << "LRM: Skipping loop " << Header->getName()
                       << " because it is not a valid loop.\n");
@@ -114,52 +70,70 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
   }
   BasicBlock *Preheader = L.getLoopPreheader();
   if (!Preheader) {
-    Preheader = InsertPreheaderForLoop(&L, &DT, &LI, nullptr, false);
+    Preheader = InsertPreheaderForLoop(&L, DT, LI, nullptr, false);
     if (!Preheader) {
       LLVM_DEBUG(dbgs() << "LRM: Failed to create a preheader for loop "
                         << Header->getName() << ".\n");
       return false;
     }
   }
+
+  bool transform_success = false;
+  SmallVector<Instruction *, 8> StackRecur;
+  SmallVector<PHINode *, 8> Stack;
+  int phi_count = 0;
   for (PHINode &PN : Header->phis()) {
-    if (!PN.getType()->isIntegerTy())
+    Stack.push_back(&PN);
+    phi_count++;
+    if (phi_count >= 8)
+      return false;
+  }
+
+  while (!Stack.empty()) {
+    PHINode *PN = Stack.pop_back_val();
+
+    if (!PN->getType()->isIntegerTy())
       continue;
 
     RecurrenceDescriptor RecDesc;
-    if (!RecurrenceDescriptor::isReductionPHI(&PN, &L, RecDesc))
+    if (!RecurrenceDescriptor::isReductionPHI(PN, &L, RecDesc))
       continue;
 
     if (RecDesc.getRecurrenceKind() != RecurKind::Add)
       continue;
 
-    Value *RecurrenceValueFromPHI = PN.getIncomingValueForBlock(Latch);
+    Value *RecurrenceValueFromPHI = PN->getIncomingValueForBlock(Latch);
     Instruction *RecurrenceInst = dyn_cast<Instruction>(RecurrenceValueFromPHI);
     if (!RecurrenceInst || RecurrenceInst->getNumOperands() != 2)
       continue;
 
-    Value *RecurrenceValue = RecurrenceInst->getOperand(0) == &PN
+    // FIXME: intended to reject PHIs whose recurrence value has other in-loop uses, but this loop is a no-op (it never bails out) — the check is not actually enforced.
+    for (User *U : RecurrenceValueFromPHI->users()) {
+      if (Instruction *Inst = dyn_cast<Instruction>(U)) {
+        BasicBlock *BB = Inst->getParent();
+        if (L.contains(BB)) {
+          continue;
+        }
+      }
+    }
+
+    Value *RecurrenceValue = RecurrenceInst->getOperand(0) == PN
                                  ? RecurrenceInst->getOperand(1)
                                  : RecurrenceInst->getOperand(0);
-
-    CallInst *ReduceCall = dyn_cast<CallInst>(RecurrenceValue);
-    if (!ReduceCall)
+    Value *ReduceOperand;
+    if (!llvm::PatternMatch::match(
+            RecurrenceValue,
+            llvm::PatternMatch::m_Intrinsic<Intrinsic::vector_reduce_add>(
+                llvm::PatternMatch::m_Value(ReduceOperand))))
       continue;
-    Function *CalledFunc = ReduceCall->getCalledFunction();
 
-    if (!CalledFunc || !CalledFunc->isIntrinsic() ||
-        !(CalledFunc->getIntrinsicID() == Intrinsic::vector_reduce_add))
-      continue;
-
-    Value *ReduceOperand = ReduceCall->getArgOperand(0);
-    Instruction *VecBin = dyn_cast<Instruction>(ReduceOperand);
-    if (!VecBin || (VecBin->getOpcode() != Instruction::Sub &&
-                    VecBin->getOpcode() != Instruction::Add))
-      continue;
+    CallInst *ReduceCall = dyn_cast<CallInst>(RecurrenceValue);
+    Instruction *VecIn = dyn_cast<Instruction>(ReduceOperand);
     // pattern match success
     LLVM_DEBUG(dbgs() << "Found pattern to optimize in loop "
                       << Header->getName() << "!\n");
 
-    VectorType *VecTy = cast<VectorType>(VecBin->getType());
+    VectorType *VecTy = cast<VectorType>(VecIn->getType());
     Value *VecZero = ConstantInt::get(VecTy, 0);
 
     // build new Vector Add to replace Scalar Add
@@ -167,42 +141,53 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
     PHINode *VecSumPhi = HeaderBuilder.CreatePHI(VecTy, 2, "vec.sum.phi");
     VecSumPhi->addIncoming(VecZero, Preheader);
     IRBuilder<> BodyBuilder(RecurrenceInst);
-    Value *NewVecAdd = BodyBuilder.CreateAdd(VecSumPhi, VecBin, "vec.sum.next");
+    Value *NewVecAdd = BodyBuilder.CreateAdd(VecSumPhi, VecIn, "vec.sum.next");
     VecSumPhi->addIncoming(NewVecAdd, Latch);
 
     // build landingPad for reduce add out of loop
-    BasicBlock *ExitingBlock =
-        Latch->getTerminator()->getSuccessor(0) == Header ? Latch : Header;
-    if (!L.isLoopExiting(ExitingBlock)) {
-      ExitingBlock = Header;
+    if (!LandingPad) {
+      LandingPad = SplitEdge(ExitingBlock, ExitBlock, DT, LI);
+      LandingPad->setName("loop.exit.landing");
     }
-    BasicBlock *LandingPad = SplitEdge(ExitingBlock, ExitBlock, &DT, &LI);
-    LandingPad->setName("loop.exit.landing");
     IRBuilder<> LandingPadBuilder(LandingPad->getTerminator());
     Value *ScalarTotalSum = LandingPadBuilder.CreateCall(
         ReduceCall->getCalledFunction(), NewVecAdd, "scalar.total.sum");
-    Value *PreheaderValue = PN.getIncomingValueForBlock(Preheader);
+
+    Value *PreheaderValue = PN->getIncomingValueForBlock(Preheader);
     Value *LastAdd =
         PreheaderValue
             ? LandingPadBuilder.CreateAdd(PreheaderValue, ScalarTotalSum)
             : ScalarTotalSum;
-
-    // delete the dead PHI Node
-    if (!PN.use_empty())
-      PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
-    llvm::RecursivelyDeleteDeadPHINode(&PN);
     // replace the use of Recurrence Node and delete the dead Node
     Instruction *FinalNode = dyn_cast<Instruction>(LastAdd);
     if (!FinalNode)
-      return false;
-    RecurrenceInst->replaceAllUsesWith(FinalNode);
-    llvm::RecursivelyDeleteTriviallyDeadInstructions(RecurrenceInst);
+      continue;
 
+    // Delete the now-dead reduction PHI node (its uses are replaced with poison first).
+    if (!PN->use_empty())
+      PN->replaceAllUsesWith(PoisonValue::get(PN->getType()));
+    llvm::RecursivelyDeleteDeadPHINode(PN);
+
+    if (!RecurrenceInst->use_empty()) {
+      for (auto *U : RecurrenceInst->users()) {
+        auto *phi = llvm::dyn_cast<llvm::PHINode>(U);
+        if (phi && !phi->use_empty()) {
+          phi->replaceAllUsesWith(FinalNode);
+        }
+      }
+    }
+    transform_success = true;
+    StackRecur.push_back(RecurrenceInst);
+  }
+
+  if (transform_success) {
+    FoldSingleEntryPHINodes(LandingPad);
+    while (!StackRecur.empty()) {
+      Instruction *Rec = StackRecur.pop_back_val();
+      llvm::RecursivelyDeleteTriviallyDeadInstructions(Rec);
+    }
     return true;
   }
-  return false;
-}
 
-FunctionPass *llvm::createLoopReduceMotionPass() {
-  return new LoopReduceMotionLegacy();
+  return false;
 }
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index 9af05eb49e65e..912320b0b2946 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -269,6 +269,8 @@
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: LoopSimplifyPass
+; CHECK-O-NEXT: Running pass: LCSSAPass
 ; CHECK-O-NEXT: Running pass: LoopReduceMotionPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: LoopUnrollPass
diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll
index cebbe671e17b3..ec2be84bb919a 100644
--- a/llvm/test/Other/new-pm-lto-defaults.ll
+++ b/llvm/test/Other/new-pm-lto-defaults.ll
@@ -143,9 +143,11 @@
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass on foo
 ; CHECK-OS-NEXT: Running pass: SLPVectorizerPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass on foo
-; CHECK-O23SZ-NEXT: Running pass: LoopReduceMotionPass on foo
-; CHECK-O23SZ-NEXT: Running pass: InferAlignmentPass on foo
-; CHECK-O23SZ-NEXT: Running pass: InstCombinePass on foo
+; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass
+; CHECK-O23SZ-NEXT: Running pass: LCSSAPass
+; CHECK-O23SZ-NEXT: Running pass: LoopReduceMotionPass
+; CHECK-O23SZ-NEXT: Running pass: InferAlignmentPass
+; CHECK-O23SZ-NEXT: Running pass: InstCombinePass
 ; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass
 ; CHECK-O23SZ-NEXT: Running pass: LCSSAPass
 ; CHECK-O23SZ-NEXT: Running pass: LICMPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
index 4065cead7c264..40a8d1383032d 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
@@ -188,6 +188,8 @@
 ; CHECK-POSTLINK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-POSTLINK-Os-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-POSTLINK-O-NEXT: Running pass: LoopSimplifyPass
+; CHECK-POSTLINK-O-NEXT: Running pass: LCSSAPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopReduceMotionPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
index 126caf7eed3ab..c6343f0e0a774 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -173,6 +173,8 @@
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: LoopSimplifyPass
+; CHECK-O-NEXT: Running pass: LCSSAPass
 ; CHECK-O-NEXT: Running pass: LoopReduceMotionPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: LoopUnrollPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
index e9879a512a9b9..add8fc47c54f1 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -182,6 +182,8 @@
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: LoopSimplifyPass
+; CHECK-O-NEXT: Running pass: LCSSAPass
 ; CHECK-O-NEXT: Running pass: LoopReduceMotionPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: LoopUnrollPass
diff --git a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
index 859a19e701fa8..cd77a0cfec68c 100644
--- a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
+++ b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
@@ -12,124 +12,122 @@ define i32 @func_with_VecBin_Sub(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride
 ; CHECK:       for.cond1.preheader:
 ; CHECK-NEXT:    [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
 ; CHECK-NEXT:    [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT:    [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
 ; CHECK-NEXT:    [[VEC_SUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[FOR_COND1_PREHEADER_PREHEADER]] ], [ [[VEC_SUM_NEXT:%.*]], [[FOR_COND1_PREHEADER]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[VEC_SUM_NEXT]] = add <8 x i32> [[VEC_SUM_PHI]], [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT:    [[VEC_SUM_NEXT]] = add <8 x i32> [[VEC_SUM_PHI]], [[TMP1]]
 ; CHECK-NEXT:    [[INC9]] = add nuw nsw i32 [[Y_025]], 1
 ; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
-; CHECK-NEXT:    [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[HEIGHT]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[LOOP_EXIT_LANDING:%.*]], label [[FOR_COND1_PREHEADER]]
 ; CHECK:       loop.exit.landing:
 ; CHECK-NEXT:    [[SCALAR_TOTAL_SUM:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[VEC_SUM_NEXT]])
-; CHECK-NEXT:    [[TMP5:%.*]] = add i32 0, [[SCALAR_TOTAL_SUM]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 0, [[SCALAR_TOTAL_SUM]]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
-; CHECK-NEXT:    ret i32 [[TMP6]]
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP3]]
 ;
 entry:
   %cmp21 = icmp sgt i32 %height, 0
   br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
 
-for.cond1.preheader:                              ; preds = %for.cond1.preheader.preheader, %for.cond1.preheader
+for.cond1.preheader:                              ; preds = %for.cond1.preheader, %entry
   %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %entry ]
   %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %entry ]
   %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %entry ]
-  %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %entry ]
   %0 = load <8 x i8>, ptr %pix1.addr.023
-  %1 = load <8 x i8>, ptr %pix2.addr.022
   %2 = zext <8 x i8> %0 to <8 x i32>
-  %3 = zext <8 x i8> %1 to <8 x i32>
-  %4 = sub nsw <8 x i32> %2, %3
-  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
   %add.7 = add i32 %sum.024, %5
   %inc9 = add nuw nsw i32 %y.025, 1
   %add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
-  %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
   %exitcond.not = icmp eq i32 %inc9, %height
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.cond1.preheader
-
-for.cond.cleanup.loopexit:                        ; preds = %for.cond1.preheader
-  br label %for.cond.cleanup
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.cond1.preheader
 
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.7, %for.cond.cleanup.loopexit ]
+for.cond.cleanup:                                 ; preds = %for.cond1.preheader, %entry
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.7, %for.cond1.preheader]
   %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
   ret i32 %6
 }
 
-define i32 @func_with_VecBin_add(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %height) {
-; CHECK-LABEL: @func_with_VecBin_add(
+define i32 @func_with_reduce(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %height) {
+; CHECK-LABEL: @func_with_reduce(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[HEIGHT:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.cond1.preheader.preheader:
 ; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
 ; CHECK:       for.cond1.preheader:
-; CHECK-NEXT:    [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT:    [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT:    [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[Y:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX1_ADDR1:%.*]] = phi ptr [ [[ADD_PTR1:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX2_ADDR2:%.*]] = phi ptr [ [[ADD_PTR2:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
 ; CHECK-NEXT:    [[VEC_SUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[FOR_COND1_PREHEADER_PREHEADER]] ], [ [[VEC_SUM_NEXT:%.*]], [[FOR_COND1_PREHEADER]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
+; CHECK-NEXT:    [[VEC_SUM_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[FOR_COND1_PREHEADER_PREHEADER]] ], [ [[VEC_SUM_NEXT2:%.*]], [[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR1]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR2]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[VEC_SUM_NEXT]] = add <8 x i32> [[VEC_SUM_PHI]], [[TMP4]]
-; CHECK-NEXT:    [[INC9]] = add nuw nsw i32 [[Y_025]], 1
-; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
-; CHECK-NEXT:    [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[HEIGHT]]
+; CHECK-NEXT:    [[VEC_SUM_NEXT2]] = add <8 x i32> [[VEC_SUM_PHI1]], [[TMP2]]
+; CHECK-NEXT:    [[VEC_SUM_NEXT]] = add <8 x i32> [[VEC_SUM_PHI]], [[TMP3]]
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[Y]], 1
+; CHECK-NEXT:    [[ADD_PTR1]] = getelementptr inbounds i8, ptr [[PIX1_ADDR1]], i64 [[STRIDE1:%.*]]
+; CHECK-NEXT:    [[ADD_PTR2]] = getelementptr inbounds i8, ptr [[PIX2_ADDR2]], i64 [[STRIDE2:%.*]]
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[HEIGHT]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[LOOP_EXIT_LANDING:%.*]], label [[FOR_COND1_PREHEADER]]
 ; CHECK:       loop.exit.landing:
 ; CHECK-NEXT:    [[SCALAR_TOTAL_SUM:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[VEC_SUM_NEXT]])
-; CHECK-NEXT:    [[TMP5:%.*]] = add i32 0, [[SCALAR_TOTAL_SUM]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 0, [[SCALAR_TOTAL_SUM]]
+; CHECK-NEXT:    [[SCALAR_TOTAL_SUM3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[VEC_SUM_NEXT2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 0, [[SCALAR_TOTAL_SUM3]]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
-; CHECK-NEXT:    ret i32 [[TMP6]]
+; CHECK-NEXT:    [[SUML1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[SUML2:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP4]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUML1]], i1 true)
+; CHECK-NEXT:    [[TMP7:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUML2]], i1 true)
+; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    ret i32 [[TMP8]]
 ;
 entry:
   %cmp21 = icmp sgt i32 %height, 0
   br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
 
 for.cond1.preheader:                              ; preds = %for.cond1.preheader.preheader, %for.cond1.preheader
-  %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %entry ]
-  %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %entry ]
-  %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %entry ]
-  %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %entry ]
-  %0 = load <8 x i8>, ptr %pix1.addr.023
-  %1 = load <8 x i8>, ptr %pix2.addr.022
+  %y = phi i32 [ %inc, %for.cond1.preheader ], [ 0, %entry ]
+  %sum1 = phi i32 [ %add1, %for.cond1.preheader ], [ 0, %entry ]
+  %sum2 = phi i32 [ %add2, %for.cond1.preheader ], [ 0, %entry ]
+  %pix1.addr1 = phi ptr [ %add.ptr1, %for.cond1.preheader ], [ %pix1, %entry ]
+  %pix2.addr2 = phi ptr [ %add.ptr2, %for.cond1.preheader ], [ %pix2, %entry ]
+  %0 = load <8 x i8>, ptr %pix1.addr1
+  %1 = load <8 x i8>, ptr %pix2.addr2
   %2 = zext <8 x i8> %0 to <8 x i32>
   %3 = zext <8 x i8> %1 to <8 x i32>
-  %4 = add nsw <8 x i32> %2, %3
-  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
-  %add.7 = add i32 %sum.024, %5
-  %inc9 = add nuw nsw i32 %y.025, 1
-  %add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
-  %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
-  %exitcond.not = icmp eq i32 %inc9, %height
+  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
+  %add1 = add i32 %sum1, %5
+  %6 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
+  %add2 = add i32 %sum2, %6
+  %inc = add nuw nsw i32 %y, 1
+  %add.ptr1 = getelementptr inbounds i8, ptr %pix1.addr1, i64 %stride1
+  %add.ptr2 = getelementptr inbounds i8, ptr %pix2.addr2, i64 %stride2
+  %exitcond.not = icmp eq i32 %inc, %height
   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.cond1.preheader
 
 for.cond.cleanup.loopexit:                        ; preds = %for.cond1.preheader
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.7, %for.cond.cleanup.loopexit ]
-  %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
-  ret i32 %6
+  %suml1 = phi i32 [ 0, %entry ], [ %add1, %for.cond.cleanup.loopexit ]
+  %suml2 = phi i32 [ 0, %entry ], [ %add2, %for.cond.cleanup.loopexit ]
+  %7 = tail call i32 @llvm.abs.i32(i32 %suml1, i1 true)
+  %8 = tail call i32 @llvm.abs.i32(i32 %suml2, i1 true)
+  %9 = add i32 %7, %8
+  ret i32 %9
 }
 
 define i32 @multi_exit(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %val1, i32 %val2) {
@@ -137,11 +135,13 @@ define i32 @multi_exit(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %va
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[VAL1:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP21]], label [[FOR_COND1_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.cond1.preheader.preheader:
+; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER1:%.*]]
 ; CHECK:       for.cond1.preheader:
-; CHECK-NEXT:    [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[NEXT_COND1:%.*]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[SUM_024:%.*]] = phi i32 [ [[ADD_7:%.*]], [[NEXT_COND1]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[NEXT_COND1]] ], [ [[PIX1:%.*]], [[ENTRY]] ]
-; CHECK-NEXT:    [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[NEXT_COND1]] ], [ [[PIX2:%.*]], [[ENTRY]] ]
+; CHECK-NEXT:    [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[NEXT_COND1:%.*]] ], [ 0, [[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT:    [[SUM_024:%.*]] = phi i32 [ [[ADD_7:%.*]], [[NEXT_COND1]] ], [ 0, [[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[NEXT_COND1]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[NEXT_COND1]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
@@ -156,13 +156,16 @@ define i32 @multi_exit(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %va
 ; CHECK:       next.cond0:
 ; CHECK-NEXT:    [[ADD_8:%.*]] = add i32 [[ADD_7]], 1
 ; CHECK-NEXT:    [[EXIT1:%.*]] = icmp eq i32 [[INC9]], [[VAL1]]
-; CHECK-NEXT:    br i1 [[EXIT1]], label [[FOR_COND_CLEANUP]], label [[NEXT_COND1]]
+; CHECK-NEXT:    br i1 [[EXIT1]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[NEXT_COND1]]
 ; CHECK:       next.cond1:
 ; CHECK-NEXT:    [[ADD_9:%.*]] = add i32 [[ADD_7]], 2
 ; CHECK-NEXT:    [[EXIT2:%.*]] = icmp eq i32 [[INC9]], [[VAL2:%.*]]
-; CHECK-NEXT:    br i1 [[EXIT2]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER]]
+; CHECK-NEXT:    br i1 [[EXIT2]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_COND1_PREHEADER1]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[SUM_0_LCSSA_PH:%.*]] = phi i32 [ [[ADD_9]], [[NEXT_COND1]] ], [ [[ADD_8]], [[NEXT_COND0]] ]
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD_8]], [[NEXT_COND0]] ], [ [[ADD_9]], [[NEXT_COND1]] ]
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SUM_0_LCSSA_PH]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
 ; CHECK-NEXT:    ret i32 [[TMP6]]
 ;
@@ -228,10 +231,12 @@ define i32 @phi_not_reduction_call(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stri
 ; CHECK-NEXT:    [[EXIT:%.*]] = icmp eq i32 [[INC9]], [[VAL1]]
 ; CHECK-NEXT:    br i1 [[EXIT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_COND1_PREHEADER]]
 ; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[SUM_024_LCSSA:%.*]] = phi i32 [ [[SUM_024]], [[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT:    [[ADD_7_LCSSA:%.*]] = phi i32 [ [[ADD_7]], [[FOR_COND1_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_7]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_024]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_7_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_024_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], [[SUM]]
 ; CHECK-NEXT:    ret i32 [[TMP7]]
@@ -294,10 +299,12 @@ define i32 @reduction_call_not_add(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stri
 ; CHECK-NEXT:    [[EXIT:%.*]] = icmp eq i32 [[INC9]], [[VAL1]]
 ; CHECK-NEXT:    br i1 [[EXIT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_COND1_PREHEADER]]
 ; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[SUM_024_LCSSA:%.*]] = phi i32 [ [[SUM_024]], [[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT:    [[ADD_7_LCSSA:%.*]] = phi i32 [ [[ADD_7]], [[FOR_COND1_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_7]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_024]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_7_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_024_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], [[SUM]]
 ; CHECK-NEXT:    ret i32 [[TMP7]]
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/udotabd.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/udotabd.ll
index e2f7f8f7e5cac..16e66acce373f 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/udotabd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/udotabd.ll
@@ -9,8 +9,6 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2,
 ; CHECK-O3-LABEL: define dso_local i32 @test(
 ; CHECK-O3-SAME: ptr noundef readonly captures(none) [[P1:%.*]], i32 noundef [[S_P1:%.*]], ptr noundef readonly captures(none) [[P2:%.*]], i32 noundef [[S_P2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; CHECK-O3-NEXT:  [[ENTRY:.*:]]
-; CHECK-O3-NEXT:    [[IDX_EXT8:%.*]] = sext i32 [[S_P2]] to i64
-; CHECK-O3-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[S_P1]] to i64
 ; CHECK-O3-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[P1]], align 1, !tbaa [[CHAR_TBAA0:![0-9]+]]
 ; CHECK-O3-NEXT:    [[TMP1:%.*]] = zext <16 x i8> [[TMP0]] to <16 x i16>
 ; CHECK-O3-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr [[P2]], align 1, !tbaa [[CHAR_TBAA0]]
@@ -18,352 +16,318 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2,
 ; CHECK-O3-NEXT:    [[TMP4:%.*]] = sub nsw <16 x i16> [[TMP1]], [[TMP3]]
 ; CHECK-O3-NEXT:    [[TMP5:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP4]], i1 false)
 ; CHECK-O3-NEXT:    [[TMP6:%.*]] = zext <16 x i16> [[TMP5]] to <16 x i32>
-; CHECK-O3-NEXT:    [[TMP7:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP6]])
+; CHECK-O3-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[S_P1]] to i64
 ; CHECK-O3-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]]
+; CHECK-O3-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr [[ADD_PTR]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT:    [[TMP9:%.*]] = zext <16 x i8> [[TMP7]] to <16 x i16>
+; CHECK-O3-NEXT:    [[IDX_EXT8:%.*]] = sext i32 [[S_P2]] to i64
 ; CHECK-O3-NEXT:    [[ADD_PTR9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT8]]
-; CHECK-O3-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr [[ADD_PTR]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT:    [[TMP9:%.*]] = zext <16 x i8> [[TMP8]] to <16 x i16>
 ; CHECK-O3-NEXT:    [[TMP10:%.*]] = load <16 x i8>, ptr [[ADD_PTR9]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP11:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i16>
 ; CHECK-O3-NEXT:    [[TMP12:%.*]] = sub nsw <16 x i16> [[TMP9]], [[TMP11]]
 ; CHECK-O3-NEXT:    [[TMP13:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP12]], i1 false)
 ; CHECK-O3-NEXT:    [[TMP14:%.*]] = zext <16 x i16> [[TMP13]] to <16 x i32>
-; CHECK-O3-NEXT:    [[TMP15:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP14]])
-; CHECK-O3-NEXT:    [[OP_RDX_1:%.*]] = add nuw nsw i32 [[TMP15]], [[TMP7]]
+; CHECK-O3-NEXT:    [[VEC_SUM_NEXT_1:%.*]] = add nuw nsw <16 x i32> [[TMP6]], [[TMP14]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT:    [[ADD_PTR9_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP16:%.*]] = load <16 x i8>, ptr [[ADD_PTR_1]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP17:%.*]] = zext <16 x i8> [[TMP16]] to <16 x i16>
+; CHECK-O3-NEXT:    [[ADD_PTR9_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP18:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_1]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i16>
 ; CHECK-O3-NEXT:    [[TMP20:%.*]] = sub nsw <16 x i16> [[TMP17]], [[TMP19]]
 ; CHECK-O3-NEXT:    [[TMP21:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP20]], i1 false)
 ; CHECK-O3-NEXT:    [[TMP22:%.*]] = zext <16 x i16> [[TMP21]] to <16 x i32>
-; CHECK-O3-NEXT:    [[TMP23:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP22]])
-; CHECK-O3-NEXT:    [[OP_RDX_2:%.*]] = add nuw nsw i32 [[TMP23]], [[OP_RDX_1]]
+; CHECK-O3-NEXT:    [[VEC_SUM_NEXT_2:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_1]], [[TMP22]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT:    [[ADD_PTR9_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_1]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP24:%.*]] = load <16 x i8>, ptr [[ADD_PTR_2]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i16>
+; CHECK-O3-NEXT:    [[ADD_PTR9_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_1]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP26:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_2]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i16>
 ; CHECK-O3-NEXT:    [[TMP28:%.*]] = sub nsw <16 x i16> [[TMP25]], [[TMP27]]
 ; CHECK-O3-NEXT:    [[TMP29:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP28]], i1 false)
 ; CHECK-O3-NEXT:    [[TMP30:%.*]] = zext <16 x i16> [[TMP29]] to <16 x i32>
-; CHECK-O3-NEXT:    [[TMP31:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP30]])
-; CHECK-O3-NEXT:    [[OP_RDX_3:%.*]] = add nuw nsw i32 [[TMP31]], [[OP_RDX_2]]
+; CHECK-O3-NEXT:    [[VEC_SUM_NEXT_3:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_2]], [[TMP30]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT:    [[ADD_PTR9_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_2]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP32:%.*]] = load <16 x i8>, ptr [[ADD_PTR_3]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i16>
+; CHECK-O3-NEXT:    [[ADD_PTR9_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_2]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP34:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_3]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP35:%.*]] = zext <16 x i8> [[TMP34]] to <16 x i16>
 ; CHECK-O3-NEXT:    [[TMP36:%.*]] = sub nsw <16 x i16> [[TMP33]], [[TMP35]]
 ; CHECK-O3-NEXT:    [[TMP37:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP36]], i1 false)
 ; CHECK-O3-NEXT:    [[TMP38:%.*]] = zext <16 x i16> [[TMP37]] to <16 x i32>
-; CHECK-O3-NEXT:    [[TMP39:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP38]])
-; CHECK-O3-NEXT:    [[OP_RDX_4:%.*]] = add nuw nsw i32 [[TMP39]], [[OP_RDX_3]]
+; CHECK-O3-NEXT:    [[VEC_SUM_NEXT_4:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_3]], [[TMP38]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT:    [[ADD_PTR9_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_3]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP40:%.*]] = load <16 x i8>, ptr [[ADD_PTR_4]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP41:%.*]] = zext <16 x i8> [[TMP40]] to <16 x i16>
+; CHECK-O3-NEXT:    [[ADD_PTR9_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_3]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP42:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_4]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP43:%.*]] = zext <16 x i8> [[TMP42]] to <16 x i16>
 ; CHECK-O3-NEXT:    [[TMP44:%.*]] = sub nsw <16 x i16> [[TMP41]], [[TMP43]]
 ; CHECK-O3-NEXT:    [[TMP45:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP44]], i1 false)
 ; CHECK-O3-NEXT:    [[TMP46:%.*]] = zext <16 x i16> [[TMP45]] to <16 x i32>
-; CHECK-O3-NEXT:    [[TMP47:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP46]])
-; CHECK-O3-NEXT:    [[OP_RDX_5:%.*]] = add nuw nsw i32 [[TMP47]], [[OP_RDX_4]]
+; CHECK-O3-NEXT:    [[VEC_SUM_NEXT_5:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_4]], [[TMP46]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT:    [[ADD_PTR9_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_4]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP48:%.*]] = load <16 x i8>, ptr [[ADD_PTR_5]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP49:%.*]] = zext <16 x i8> [[TMP48]] to <16 x i16>
+; CHECK-O3-NEXT:    [[ADD_PTR9_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_4]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP50:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_5]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP51:%.*]] = zext <16 x i8> [[TMP50]] to <16 x i16>
 ; CHECK-O3-NEXT:    [[TMP52:%.*]] = sub nsw <16 x i16> [[TMP49]], [[TMP51]]
 ; CHECK-O3-NEXT:    [[TMP53:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP52]], i1 false)
 ; CHECK-O3-NEXT:    [[TMP54:%.*]] = zext <16 x i16> [[TMP53]] to <16 x i32>
-; CHECK-O3-NEXT:    [[TMP55:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP54]])
-; CHECK-O3-NEXT:    [[OP_RDX_6:%.*]] = add i32 [[TMP55]], [[OP_RDX_5]]
+; CHECK-O3-NEXT:    [[VEC_SUM_NEXT_6:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_5]], [[TMP54]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT:    [[ADD_PTR9_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_5]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP56:%.*]] = load <16 x i8>, ptr [[ADD_PTR_6]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP57:%.*]] = zext <16 x i8> [[TMP56]] to <16 x i16>
+; CHECK-O3-NEXT:    [[ADD_PTR9_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_5]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP58:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_6]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP59:%.*]] = zext <16 x i8> [[TMP58]] to <16 x i16>
 ; CHECK-O3-NEXT:    [[TMP60:%.*]] = sub nsw <16 x i16> [[TMP57]], [[TMP59]]
 ; CHECK-O3-NEXT:    [[TMP61:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP60]], i1 false)
 ; CHECK-O3-NEXT:    [[TMP62:%.*]] = zext <16 x i16> [[TMP61]] to <16 x i32>
-; CHECK-O3-NEXT:    [[TMP63:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP62]])
-; CHECK-O3-NEXT:    [[OP_RDX_7:%.*]] = add i32 [[TMP63]], [[OP_RDX_6]]
+; CHECK-O3-NEXT:    [[VEC_SUM_NEXT_7:%.*]] = add <16 x i32> [[VEC_SUM_NEXT_6]], [[TMP62]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_6]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT:    [[ADD_PTR9_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_6]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP64:%.*]] = load <16 x i8>, ptr [[ADD_PTR_7]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP65:%.*]] = zext <16 x i8> [[TMP64]] to <16 x i16>
+; CHECK-O3-NEXT:    [[ADD_PTR9_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_6]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP66:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_7]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP67:%.*]] = zext <16 x i8> [[TMP66]] to <16 x i16>
 ; CHECK-O3-NEXT:    [[TMP68:%.*]] = sub nsw <16 x i16> [[TMP65]], [[TMP67]]
 ; CHECK-O3-NEXT:    [[TMP69:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP68]], i1 false)
 ; CHECK-O3-NEXT:    [[TMP70:%.*]] = zext <16 x i16> [[TMP69]] to <16 x i32>
-; CHECK-O3-NEXT:    [[TMP71:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP70]])
-; CHECK-O3-NEXT:    [[OP_RDX_8:%.*]] = add i32 [[TMP71]], [[OP_RDX_7]]
+; CHECK-O3-NEXT:    [[VEC_SUM_NEXT_8:%.*]] = add <16 x i32> [[VEC_SUM_NEXT_7]], [[TMP70]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_7]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT:    [[ADD_PTR9_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_7]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP72:%.*]] = load <16 x i8>, ptr [[ADD_PTR_8]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP73:%.*]] = zext <16 x i8> [[TMP72]] to <16 x i16>
+; CHECK-O3-NEXT:    [[ADD_PTR9_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_7]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP74:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_8]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP75:%.*]] = zext <16 x i8> [[TMP74]] to <16 x i16>
 ; CHECK-O3-NEXT:    [[TMP76:%.*]] = sub nsw <16 x i16> [[TMP73]], [[TMP75]]
 ; CHECK-O3-NEXT:    [[TMP77:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP76]], i1 false)
 ; CHECK-O3-NEXT:    [[TMP78:%.*]] = zext <16 x i16> [[TMP77]] to <16 x i32>
-; CHECK-O3-NEXT:    [[TMP79:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP78]])
-; CHECK-O3-NEXT:    [[OP_RDX_9:%.*]] = add i32 [[TMP79]], [[OP_RDX_8]]
+; CHECK-O3-NEXT:    [[VEC_SUM_NEXT_9:%.*]] = add <16 x i32> [[VEC_SUM_NEXT_8]], [[TMP78]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_8]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT:    [[ADD_PTR9_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_8]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP80:%.*]] = load <16 x i8>, ptr [[ADD_PTR_9]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP81:%.*]] = zext <16 x i8> [[TMP80]] to <16 x i16>
+; CHECK-O3-NEXT:    [[ADD_PTR9_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_8]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP82:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_9]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP83:%.*]] = zext <16 x i8> [[TMP82]] to <16 x i16>
 ; CHECK-O3-NEXT:    [[TMP84:%.*]] = sub nsw <16 x i16> [[TMP81]], [[TMP83]]
 ; CHECK-O3-NEXT:    [[TMP85:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP84]], i1 false)
 ; CHECK-O3-NEXT:    [[TMP86:%.*]] = zext <16 x i16> [[TMP85]] to <16 x i32>
-; CHECK-O3-NEXT:    [[TMP87:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP86]])
-; CHECK-O3-NEXT:    [[OP_RDX_10:%.*]] = add i32 [[TMP87]], [[OP_RDX_9]]
+; CHECK-O3-NEXT:    [[VEC_SUM_NEXT_10:%.*]] = add <16 x i32> [[VEC_SUM_NEXT_9]], [[TMP86]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_9]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT:    [[ADD_PTR9_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_9]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP88:%.*]] = load <16 x i8>, ptr [[ADD_PTR_10]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP89:%.*]] = zext <16 x i8> [[TMP88]] to <16 x i16>
+; CHECK-O3-NEXT:    [[ADD_PTR9_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_9]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP90:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_10]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP91:%.*]] = zext <16 x i8> [[TMP90]] to <16 x i16>
 ; CHECK-O3-NEXT:    [[TMP92:%.*]] = sub nsw <16 x i16> [[TMP89]], [[TMP91]]
 ; CHECK-O3-NEXT:    [[TMP93:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP92]], i1 false)
 ; CHECK-O3-NEXT:    [[TMP94:%.*]] = zext <16 x i16> [[TMP93]] to <16 x i32>
-; CHECK-O3-NEXT:    [[TMP95:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP94]])
-; CHECK-O3-NEXT:    [[OP_RDX_11:%.*]] = add i32 [[TMP95]], [[OP_RDX_10]]
+; CHECK-O3-NEXT:    [[VEC_SUM_NEXT_11:%.*]] = add <16 x i32> [[VEC_SUM_NEXT_10]], [[TMP94]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_10]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT:    [[ADD_PTR9_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_10]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP96:%.*]] = load <16 x i8>, ptr [[ADD_PTR_11]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP97:%.*]] = zext <16 x i8> [[TMP96]] to <16 x i16>
+; CHECK-O3-NEXT:    [[ADD_PTR9_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_10]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP98:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_11]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP99:%.*]] = zext <16 x i8> [[TMP98]] to <16 x i16>
 ; CHECK-O3-NEXT:    [[TMP100:%.*]] = sub nsw <16 x i16> [[TMP97]], [[TMP99]]
 ; CHECK-O3-NEXT:    [[TMP101:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP100]], i1 false)
 ; CHECK-O3-NEXT:    [[TMP102:%.*]] = zext <16 x i16> [[TMP101]] to <16 x i32>
-; CHECK-O3-NEXT:    [[TMP103:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP102]])
-; CHECK-O3-NEXT:    [[OP_RDX_12:%.*]] = add i32 [[TMP103]], [[OP_RDX_11]]
+; CHECK-O3-NEXT:    [[VEC_SUM_NEXT_12:%.*]] = add <16 x i32> [[VEC_SUM_NEXT_11]], [[TMP102]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_11]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT:    [[ADD_PTR9_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_11]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP104:%.*]] = load <16 x i8>, ptr [[ADD_PTR_12]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP105:%.*]] = zext <16 x i8> [[TMP104]] to <16 x i16>
+; CHECK-O3-NEXT:    [[ADD_PTR9_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_11]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP106:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_12]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP107:%.*]] = zext <16 x i8> [[TMP106]] to <16 x i16>
 ; CHECK-O3-NEXT:    [[TMP108:%.*]] = sub nsw <16 x i16> [[TMP105]], [[TMP107]]
 ; CHECK-O3-NEXT:    [[TMP109:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP108]], i1 false)
 ; CHECK-O3-NEXT:    [[TMP110:%.*]] = zext <16 x i16> [[TMP109]] to <16 x i32>
-; CHECK-O3-NEXT:    [[TMP111:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP110]])
-; CHECK-O3-NEXT:    [[OP_RDX_13:%.*]] = add i32 [[TMP111]], [[OP_RDX_12]]
+; CHECK-O3-NEXT:    [[VEC_SUM_NEXT_13:%.*]] = add <16 x i32> [[VEC_SUM_NEXT_12]], [[TMP110]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_12]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT:    [[ADD_PTR9_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_12]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP112:%.*]] = load <16 x i8>, ptr [[ADD_PTR_13]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP113:%.*]] = zext <16 x i8> [[TMP112]] to <16 x i16>
+; CHECK-O3-NEXT:    [[ADD_PTR9_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_12]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP114:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_13]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP115:%.*]] = zext <16 x i8> [[TMP114]] to <16 x i16>
 ; CHECK-O3-NEXT:    [[TMP116:%.*]] = sub nsw <16 x i16> [[TMP113]], [[TMP115]]
 ; CHECK-O3-NEXT:    [[TMP117:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP116]], i1 false)
 ; CHECK-O3-NEXT:    [[TMP118:%.*]] = zext <16 x i16> [[TMP117]] to <16 x i32>
-; CHECK-O3-NEXT:    [[TMP119:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP118]])
-; CHECK-O3-NEXT:    [[OP_RDX_14:%.*]] = add i32 [[TMP119]], [[OP_RDX_13]]
+; CHECK-O3-NEXT:    [[VEC_SUM_NEXT_14:%.*]] = add <16 x i32> [[VEC_SUM_NEXT_13]], [[TMP118]]
 ; CHECK-O3-NEXT:    [[ADD_PTR_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_13]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT:    [[ADD_PTR9_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_13]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP120:%.*]] = load <16 x i8>, ptr [[ADD_PTR_14]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP121:%.*]] = zext <16 x i8> [[TMP120]] to <16 x i16>
+; CHECK-O3-NEXT:    [[ADD_PTR9_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_13]], i64 [[IDX_EXT8]]
 ; CHECK-O3-NEXT:    [[TMP122:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_14]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-O3-NEXT:    [[TMP123:%.*]] = zext <16 x i8> [[TMP122]] to <16 x i16>
 ; CHECK-O3-NEXT:    [[TMP124:%.*]] = sub nsw <16 x i16> [[TMP121]], [[TMP123]]
 ; CHECK-O3-NEXT:    [[TMP125:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP124]], i1 false)
-; CHECK-O3-NEXT:    [[TMP126:%.*]] = zext <16 x i16> [[TMP125]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP111:%.*]] = zext <16 x i16> [[TMP125]] to <16 x i32>
+; CHECK-O3-NEXT:    [[TMP126:%.*]] = add <16 x i32> [[VEC_SUM_NEXT_14]], [[TMP111]]
 ; CHECK-O3-NEXT:    [[TMP127:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP126]])
-; CHECK-O3-NEXT:    [[OP_RDX_15:%.*]] = add i32 [[TMP127]], [[OP_RDX_14]]
-; CHECK-O3-NEXT:    ret i32 [[OP_RDX_15]]
+; CHECK-O3-NEXT:    ret i32 [[TMP127]]
 ;
 ; CHECK-LTO-LABEL: define dso_local i32 @test(
 ; CHECK-LTO-SAME: ptr noundef readonly captures(none) [[P1:%.*]], i32 noundef [[S_P1:%.*]], ptr noundef readonly captures(none) [[P2:%.*]], i32 noundef [[S_P2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; CHECK-LTO-NEXT:  [[ENTRY:.*:]]
-; CHECK-LTO-NEXT:    [[IDX_EXT8:%.*]] = sext i32 [[S_P2]] to i64
-; CHECK-LTO-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[S_P1]] to i64
 ; CHECK-LTO-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[P1]], align 1, !tbaa [[CHAR_TBAA0:![0-9]+]]
 ; CHECK-LTO-NEXT:    [[TMP1:%.*]] = zext <16 x i8> [[TMP0]] to <16 x i16>
 ; CHECK-LTO-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr [[P2]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
 ; CHECK-LTO-NEXT:    [[TMP4:%.*]] = sub nsw <16 x i16> [[TMP1]], [[TMP3]]
 ; CHECK-LTO-NEXT:    [[TMP5:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP4]], i1 true)
-; CHECK-LTO-NEXT:    [[TMP36:%.*]] = zext nneg <16 x i16> [[TMP5]] to <16 x i32>
-; CHECK-LTO-NEXT:    [[TMP44:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP36]])
+; CHECK-LTO-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[S_P1]] to i64
 ; CHECK-LTO-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT:    [[ADD_PTR9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP6:%.*]] = load <16 x i8>, ptr [[ADD_PTR]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP7:%.*]] = zext <16 x i8> [[TMP6]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[IDX_EXT8:%.*]] = sext i32 [[S_P2]] to i64
+; CHECK-LTO-NEXT:    [[ADD_PTR9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr [[ADD_PTR9]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP9:%.*]] = zext <16 x i8> [[TMP8]] to <16 x i16>
 ; CHECK-LTO-NEXT:    [[TMP10:%.*]] = sub nsw <16 x i16> [[TMP7]], [[TMP9]]
 ; CHECK-LTO-NEXT:    [[TMP11:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP10]], i1 true)
-; CHECK-LTO-NEXT:    [[TMP52:%.*]] = zext nneg <16 x i16> [[TMP11]] to <16 x i32>
-; CHECK-LTO-NEXT:    [[TMP60:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP52]])
-; CHECK-LTO-NEXT:    [[OP_RDX_1:%.*]] = add nuw nsw i32 [[TMP60]], [[TMP44]]
+; CHECK-LTO-NEXT:    [[NARROW:%.*]] = add nuw nsw <16 x i16> [[TMP11]], [[TMP5]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT:    [[ADD_PTR9_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP12:%.*]] = load <16 x i8>, ptr [[ADD_PTR_1]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP13:%.*]] = zext <16 x i8> [[TMP12]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[ADD_PTR9_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP14:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_1]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP15:%.*]] = zext <16 x i8> [[TMP14]] to <16 x i16>
 ; CHECK-LTO-NEXT:    [[TMP16:%.*]] = sub nsw <16 x i16> [[TMP13]], [[TMP15]]
 ; CHECK-LTO-NEXT:    [[TMP17:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP16]], i1 true)
-; CHECK-LTO-NEXT:    [[TMP68:%.*]] = zext nneg <16 x i16> [[TMP17]] to <16 x i32>
-; CHECK-LTO-NEXT:    [[TMP76:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP68]])
-; CHECK-LTO-NEXT:    [[OP_RDX_2:%.*]] = add nuw nsw i32 [[OP_RDX_1]], [[TMP76]]
+; CHECK-LTO-NEXT:    [[NARROW15:%.*]] = add nuw nsw <16 x i16> [[NARROW]], [[TMP17]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT:    [[ADD_PTR9_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_1]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP18:%.*]] = load <16 x i8>, ptr [[ADD_PTR_2]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[ADD_PTR9_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_1]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP20:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_2]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i16>
 ; CHECK-LTO-NEXT:    [[TMP22:%.*]] = sub nsw <16 x i16> [[TMP19]], [[TMP21]]
 ; CHECK-LTO-NEXT:    [[TMP23:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP22]], i1 true)
-; CHECK-LTO-NEXT:    [[TMP84:%.*]] = zext nneg <16 x i16> [[TMP23]] to <16 x i32>
-; CHECK-LTO-NEXT:    [[TMP92:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP84]])
-; CHECK-LTO-NEXT:    [[OP_RDX_3:%.*]] = add nuw nsw i32 [[OP_RDX_2]], [[TMP92]]
+; CHECK-LTO-NEXT:    [[NARROW16:%.*]] = add nuw nsw <16 x i16> [[NARROW15]], [[TMP23]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT:    [[ADD_PTR9_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_2]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP24:%.*]] = load <16 x i8>, ptr [[ADD_PTR_3]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[ADD_PTR9_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_2]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP26:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_3]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i16>
 ; CHECK-LTO-NEXT:    [[TMP28:%.*]] = sub nsw <16 x i16> [[TMP25]], [[TMP27]]
 ; CHECK-LTO-NEXT:    [[TMP29:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP28]], i1 true)
-; CHECK-LTO-NEXT:    [[TMP100:%.*]] = zext nneg <16 x i16> [[TMP29]] to <16 x i32>
-; CHECK-LTO-NEXT:    [[TMP108:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP100]])
-; CHECK-LTO-NEXT:    [[OP_RDX_4:%.*]] = add nuw nsw i32 [[OP_RDX_3]], [[TMP108]]
+; CHECK-LTO-NEXT:    [[NARROW17:%.*]] = add nuw nsw <16 x i16> [[NARROW16]], [[TMP29]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT:    [[ADD_PTR9_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_3]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP30:%.*]] = load <16 x i8>, ptr [[ADD_PTR_4]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP31:%.*]] = zext <16 x i8> [[TMP30]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[ADD_PTR9_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_3]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP32:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_4]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i16>
 ; CHECK-LTO-NEXT:    [[TMP34:%.*]] = sub nsw <16 x i16> [[TMP31]], [[TMP33]]
 ; CHECK-LTO-NEXT:    [[TMP35:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP34]], i1 true)
-; CHECK-LTO-NEXT:    [[TMP116:%.*]] = zext nneg <16 x i16> [[TMP35]] to <16 x i32>
-; CHECK-LTO-NEXT:    [[TMP117:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP116]])
-; CHECK-LTO-NEXT:    [[OP_RDX_5:%.*]] = add nuw nsw i32 [[OP_RDX_4]], [[TMP117]]
+; CHECK-LTO-NEXT:    [[NARROW18:%.*]] = add nuw nsw <16 x i16> [[NARROW17]], [[TMP35]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT:    [[ADD_PTR9_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_4]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP37:%.*]] = load <16 x i8>, ptr [[ADD_PTR_5]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP38:%.*]] = zext <16 x i8> [[TMP37]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[ADD_PTR9_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_4]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP39:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_5]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i16>
 ; CHECK-LTO-NEXT:    [[TMP41:%.*]] = sub nsw <16 x i16> [[TMP38]], [[TMP40]]
-; CHECK-LTO-NEXT:    [[TMP42:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP41]], i1 true)
+; CHECK-LTO-NEXT:    [[TMP44:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP41]], i1 true)
+; CHECK-LTO-NEXT:    [[TMP42:%.*]] = add nuw nsw <16 x i16> [[TMP44]], [[NARROW18]]
 ; CHECK-LTO-NEXT:    [[TMP43:%.*]] = zext nneg <16 x i16> [[TMP42]] to <16 x i32>
-; CHECK-LTO-NEXT:    [[TMP118:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP43]])
-; CHECK-LTO-NEXT:    [[OP_RDX_6:%.*]] = add i32 [[OP_RDX_5]], [[TMP118]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT:    [[ADD_PTR9_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_5]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP45:%.*]] = load <16 x i8>, ptr [[ADD_PTR_6]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP46:%.*]] = zext <16 x i8> [[TMP45]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[ADD_PTR9_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_5]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP47:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_6]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i16>
 ; CHECK-LTO-NEXT:    [[TMP49:%.*]] = sub nsw <16 x i16> [[TMP46]], [[TMP48]]
 ; CHECK-LTO-NEXT:    [[TMP50:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP49]], i1 true)
 ; CHECK-LTO-NEXT:    [[TMP51:%.*]] = zext nneg <16 x i16> [[TMP50]] to <16 x i32>
-; CHECK-LTO-NEXT:    [[TMP120:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP51]])
-; CHECK-LTO-NEXT:    [[OP_RDX_7:%.*]] = add i32 [[OP_RDX_6]], [[TMP120]]
+; CHECK-LTO-NEXT:    [[VEC_SUM_NEXT_7:%.*]] = add nuw nsw <16 x i32> [[TMP43]], [[TMP51]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_6]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT:    [[ADD_PTR9_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_6]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP53:%.*]] = load <16 x i8>, ptr [[ADD_PTR_7]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP54:%.*]] = zext <16 x i8> [[TMP53]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[ADD_PTR9_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_6]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP55:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_7]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP56:%.*]] = zext <16 x i8> [[TMP55]] to <16 x i16>
 ; CHECK-LTO-NEXT:    [[TMP57:%.*]] = sub nsw <16 x i16> [[TMP54]], [[TMP56]]
 ; CHECK-LTO-NEXT:    [[TMP58:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP57]], i1 true)
 ; CHECK-LTO-NEXT:    [[TMP59:%.*]] = zext nneg <16 x i16> [[TMP58]] to <16 x i32>
-; CHECK-LTO-NEXT:    [[TMP121:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP59]])
-; CHECK-LTO-NEXT:    [[OP_RDX_8:%.*]] = add i32 [[OP_RDX_7]], [[TMP121]]
+; CHECK-LTO-NEXT:    [[VEC_SUM_NEXT_8:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_7]], [[TMP59]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_7]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT:    [[ADD_PTR9_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_7]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP61:%.*]] = load <16 x i8>, ptr [[ADD_PTR_8]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP62:%.*]] = zext <16 x i8> [[TMP61]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[ADD_PTR9_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_7]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP63:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_8]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP64:%.*]] = zext <16 x i8> [[TMP63]] to <16 x i16>
 ; CHECK-LTO-NEXT:    [[TMP65:%.*]] = sub nsw <16 x i16> [[TMP62]], [[TMP64]]
 ; CHECK-LTO-NEXT:    [[TMP66:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP65]], i1 true)
 ; CHECK-LTO-NEXT:    [[TMP67:%.*]] = zext nneg <16 x i16> [[TMP66]] to <16 x i32>
-; CHECK-LTO-NEXT:    [[TMP122:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP67]])
-; CHECK-LTO-NEXT:    [[OP_RDX_9:%.*]] = add i32 [[OP_RDX_8]], [[TMP122]]
+; CHECK-LTO-NEXT:    [[VEC_SUM_NEXT_9:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_8]], [[TMP67]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_8]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT:    [[ADD_PTR9_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_8]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP69:%.*]] = load <16 x i8>, ptr [[ADD_PTR_9]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP70:%.*]] = zext <16 x i8> [[TMP69]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[ADD_PTR9_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_8]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP71:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_9]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP72:%.*]] = zext <16 x i8> [[TMP71]] to <16 x i16>
 ; CHECK-LTO-NEXT:    [[TMP73:%.*]] = sub nsw <16 x i16> [[TMP70]], [[TMP72]]
 ; CHECK-LTO-NEXT:    [[TMP74:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP73]], i1 true)
 ; CHECK-LTO-NEXT:    [[TMP75:%.*]] = zext nneg <16 x i16> [[TMP74]] to <16 x i32>
-; CHECK-LTO-NEXT:    [[TMP123:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP75]])
-; CHECK-LTO-NEXT:    [[OP_RDX_10:%.*]] = add i32 [[OP_RDX_9]], [[TMP123]]
+; CHECK-LTO-NEXT:    [[VEC_SUM_NEXT_10:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_9]], [[TMP75]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_9]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT:    [[ADD_PTR9_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_9]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP77:%.*]] = load <16 x i8>, ptr [[ADD_PTR_10]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP78:%.*]] = zext <16 x i8> [[TMP77]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[ADD_PTR9_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_9]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP79:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_10]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP80:%.*]] = zext <16 x i8> [[TMP79]] to <16 x i16>
 ; CHECK-LTO-NEXT:    [[TMP81:%.*]] = sub nsw <16 x i16> [[TMP78]], [[TMP80]]
 ; CHECK-LTO-NEXT:    [[TMP82:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP81]], i1 true)
 ; CHECK-LTO-NEXT:    [[TMP83:%.*]] = zext nneg <16 x i16> [[TMP82]] to <16 x i32>
-; CHECK-LTO-NEXT:    [[TMP124:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP83]])
-; CHECK-LTO-NEXT:    [[OP_RDX_11:%.*]] = add i32 [[OP_RDX_10]], [[TMP124]]
+; CHECK-LTO-NEXT:    [[VEC_SUM_NEXT_11:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_10]], [[TMP83]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_10]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT:    [[ADD_PTR9_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_10]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP85:%.*]] = load <16 x i8>, ptr [[ADD_PTR_11]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP86:%.*]] = zext <16 x i8> [[TMP85]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[ADD_PTR9_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_10]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP87:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_11]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP88:%.*]] = zext <16 x i8> [[TMP87]] to <16 x i16>
 ; CHECK-LTO-NEXT:    [[TMP89:%.*]] = sub nsw <16 x i16> [[TMP86]], [[TMP88]]
 ; CHECK-LTO-NEXT:    [[TMP90:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP89]], i1 true)
 ; CHECK-LTO-NEXT:    [[TMP91:%.*]] = zext nneg <16 x i16> [[TMP90]] to <16 x i32>
-; CHECK-LTO-NEXT:    [[TMP125:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP91]])
-; CHECK-LTO-NEXT:    [[OP_RDX_12:%.*]] = add i32 [[OP_RDX_11]], [[TMP125]]
+; CHECK-LTO-NEXT:    [[VEC_SUM_NEXT_12:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_11]], [[TMP91]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_11]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT:    [[ADD_PTR9_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_11]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP93:%.*]] = load <16 x i8>, ptr [[ADD_PTR_12]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP94:%.*]] = zext <16 x i8> [[TMP93]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[ADD_PTR9_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_11]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP95:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_12]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP96:%.*]] = zext <16 x i8> [[TMP95]] to <16 x i16>
 ; CHECK-LTO-NEXT:    [[TMP97:%.*]] = sub nsw <16 x i16> [[TMP94]], [[TMP96]]
 ; CHECK-LTO-NEXT:    [[TMP98:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP97]], i1 true)
 ; CHECK-LTO-NEXT:    [[TMP99:%.*]] = zext nneg <16 x i16> [[TMP98]] to <16 x i32>
-; CHECK-LTO-NEXT:    [[TMP126:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP99]])
-; CHECK-LTO-NEXT:    [[OP_RDX_13:%.*]] = add i32 [[OP_RDX_12]], [[TMP126]]
+; CHECK-LTO-NEXT:    [[VEC_SUM_NEXT_13:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_12]], [[TMP99]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_12]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT:    [[ADD_PTR9_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_12]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP101:%.*]] = load <16 x i8>, ptr [[ADD_PTR_13]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP102:%.*]] = zext <16 x i8> [[TMP101]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[ADD_PTR9_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_12]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP103:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_13]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP104:%.*]] = zext <16 x i8> [[TMP103]] to <16 x i16>
 ; CHECK-LTO-NEXT:    [[TMP105:%.*]] = sub nsw <16 x i16> [[TMP102]], [[TMP104]]
 ; CHECK-LTO-NEXT:    [[TMP106:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP105]], i1 true)
 ; CHECK-LTO-NEXT:    [[TMP107:%.*]] = zext nneg <16 x i16> [[TMP106]] to <16 x i32>
-; CHECK-LTO-NEXT:    [[TMP119:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP107]])
-; CHECK-LTO-NEXT:    [[OP_RDX_14:%.*]] = add i32 [[OP_RDX_13]], [[TMP119]]
+; CHECK-LTO-NEXT:    [[VEC_SUM_NEXT_14:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_13]], [[TMP107]]
 ; CHECK-LTO-NEXT:    [[ADD_PTR_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_13]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT:    [[ADD_PTR9_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_13]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP109:%.*]] = load <16 x i8>, ptr [[ADD_PTR_14]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP110:%.*]] = zext <16 x i8> [[TMP109]] to <16 x i16>
+; CHECK-LTO-NEXT:    [[ADD_PTR9_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_13]], i64 [[IDX_EXT8]]
 ; CHECK-LTO-NEXT:    [[TMP111:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_14]], align 1, !tbaa [[CHAR_TBAA0]]
 ; CHECK-LTO-NEXT:    [[TMP112:%.*]] = zext <16 x i8> [[TMP111]] to <16 x i16>
 ; CHECK-LTO-NEXT:    [[TMP113:%.*]] = sub nsw <16 x i16> [[TMP110]], [[TMP112]]
 ; CHECK-LTO-NEXT:    [[TMP114:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP113]], i1 true)
 ; CHECK-LTO-NEXT:    [[TMP115:%.*]] = zext nneg <16 x i16> [[TMP114]] to <16 x i32>
-; CHECK-LTO-NEXT:    [[TMP127:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP115]])
-; CHECK-LTO-NEXT:    [[OP_RDX_15:%.*]] = add i32 [[OP_RDX_14]], [[TMP127]]
+; CHECK-LTO-NEXT:    [[VEC_SUM_NEXT_15:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_14]], [[TMP115]]
+; CHECK-LTO-NEXT:    [[OP_RDX_15:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[VEC_SUM_NEXT_15]])
 ; CHECK-LTO-NEXT:    ret i32 [[OP_RDX_15]]
 ;
 entry:



More information about the llvm-commits mailing list