[llvm] [RISCV] Improve loop by extract reduction instruction (PR #179215)

Sun Feb 8 23:06:33 PST 2026

https://github.com/Anjian-Wen updated https://github.com/llvm/llvm-project/pull/179215

>From ef12d82a225c2475e3a9af50a01e7dcee07e6114 Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Mon, 2 Feb 2026 19:36:42 +0800
Subject: [PATCH 1/5] [RISCV] Improve loops by extracting the reduction
 instruction (vector_reduce_add) in some patterns

---
 llvm/include/llvm/CodeGen/Passes.h            |   4 +
 .../Transforms/Vectorize/LoopReduceMotion.h   |  24 ++
 llvm/lib/Passes/PassBuilder.cpp               |   1 +
 llvm/lib/Passes/PassRegistry.def              |   1 +
 llvm/lib/Target/RISCV/RISCVTargetMachine.cpp  |   1 +
 llvm/lib/Transforms/Vectorize/CMakeLists.txt  |   1 +
 .../Transforms/Vectorize/LoopReduceMotion.cpp | 212 ++++++++++++++++++
 llvm/test/CodeGen/RISCV/O3-pipeline.ll        |   1 +
 .../loop-reduce-motion-test.ll                |  72 ++++++
 9 files changed, 317 insertions(+)
 create mode 100644 llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
 create mode 100644 llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
 create mode 100644 llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll

diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 2717110e1b3e7..2bd8e843f8c13 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -629,6 +629,10 @@ LLVM_ABI ModulePass *createWindowsSecureHotPatchingPass();
 
 /// Lowers KCFI operand bundles for indirect calls.
 LLVM_ABI FunctionPass *createKCFIPass();
+
+/// This pass is designed to hoist ReduceCall operations out of loops to
+/// reduce the number of instructions within the loop body.
+LLVM_ABI FunctionPass *createLoopReduceMotionPass();
 } // namespace llvm
 
 #endif
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h b/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
new file mode 100644
index 0000000000000..17bd74472700a
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
@@ -0,0 +1,24 @@
+//===- LoopReduceMotion.h ---------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is designed to hoist `ReduceCall` operations out of loops to reduce
+// the number of instructions within the loop body.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
+#define LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/PassManager.h"
+namespace llvm {
+class LoopReduceMotionPass : public PassInfoMixin<LoopReduceMotionPass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+  bool matchAndTransform(Loop &L, DominatorTree &DT, LoopInfo &LI);
+};
+} // namespace llvm
+#endif
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 45955426d66a0..e25868697a030 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -379,6 +379,7 @@
 #include "llvm/Transforms/Utils/UnifyLoopExits.h"
 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
 #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
+#include "llvm/Transforms/Vectorize/LoopReduceMotion.h"
 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
 #include "llvm/Transforms/Vectorize/SLPVectorizer.h"
 #include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 2cfb5b2592601..edd0962052cbf 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -553,6 +553,7 @@ FUNCTION_PASS("typepromotion", TypePromotionPass(*TM))
 FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass())
 FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass())
 FUNCTION_PASS("vector-combine", VectorCombinePass())
+FUNCTION_PASS("loop-reduce-motion", LoopReduceMotionPass())
 FUNCTION_PASS("verify", VerifierPass())
 FUNCTION_PASS("verify<cycles>", CycleInfoVerifierPass())
 FUNCTION_PASS("verify<domtree>", DominatorTreeVerifierPass())
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 7eb56f52c2e66..303a502be8cf9 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -479,6 +479,7 @@ void RISCVPassConfig::addIRPasses() {
     addPass(createRISCVGatherScatterLoweringPass());
     addPass(createInterleavedAccessPass());
     addPass(createRISCVCodeGenPrepareLegacyPass());
+    addPass(createLoopReduceMotionPass());
   }
 
   TargetPassConfig::addIRPasses();
diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 9f4a242214471..406031876a7d0 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -3,6 +3,7 @@ add_llvm_component_library(LLVMVectorize
   LoopIdiomVectorize.cpp
   LoopVectorizationLegality.cpp
   LoopVectorize.cpp
+  LoopReduceMotion.cpp
   SandboxVectorizer/DependencyGraph.cpp
   SandboxVectorizer/InstrMaps.cpp
   SandboxVectorizer/Interval.cpp
diff --git a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
new file mode 100644
index 0000000000000..33334a2acfa78
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
@@ -0,0 +1,212 @@
+//===-------- LoopReduceMotion.cpp - Loop Reduce Motion Optimization ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This pass is designed to hoist `ReduceCall` operations out of loops to reduce
+// the number of instructions within the loop body.
+//
+// Below are the target pattern to be matched and the resulting pattern
+// after the transformation.
+//
+// before                    | after
+// ------                    | ------
+// loop:                     | loop:
+//   ...                     |   ...
+//   vc = vecbin va, vb      |   vc = vecbin va, vb
+//   d = reduce_add vc       |   vsum = vadd vsum, vc
+//   sum = add sum, d        |   ...
+//   ...                     |   ...
+// exit:                     | exit:
+//   value = sum             |   d = reduce_add sum
+//   ...                     |   value = d
+//   ...                     |   ...
+//   ret                     |   ret
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize/LoopReduceMotion.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Plugins/PassPlugin.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+#define DEBUG_TYPE "loop-reduce-motion"
+
+using namespace llvm;
+
+class LoopReduceMotion : public FunctionPass {
+  LoopReduceMotionPass Impl;
+
+public:
+  static char ID;
+
+  LoopReduceMotion() : FunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "Loop Reduce Motion Pass"; }
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.setPreservesCFG();
+  }
+};
+
+char LoopReduceMotion::ID = 0;
+
+PreservedAnalyses LoopReduceMotionPass::run(Function &F,
+                                            FunctionAnalysisManager &FAM) {
+  LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
+  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+  bool Changed = false;
+  for (Loop *L : LI) {
+    Changed |= matchAndTransform(*L, DT, LI);
+  }
+  if (!Changed)
+    return PreservedAnalyses::all();
+  return PreservedAnalyses::none();
+}
+
+bool LoopReduceMotion::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+  if (!TPC)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
+
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  bool Changed = false;
+  for (Loop *L : LI) {
+    Changed |= Impl.matchAndTransform(*L, DT, LI);
+  }
+  if (!Changed)
+    return false;
+
+  return true;
+}
+
+bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
+                                             LoopInfo &LI) {
+  BasicBlock *Header = L.getHeader();
+  BasicBlock *Latch = L.getLoopLatch();
+  BasicBlock *ExitBlock = L.getExitBlock();
+  if (!Header || !Latch || !ExitBlock) {
+    LLVM_DEBUG(dbgs() << "LRM: Skipping loop " << Header->getName()
+                      << " because it is not a valid loop.\n");
+    return false;
+  }
+  BasicBlock *Preheader = L.getLoopPreheader();
+  if (!Preheader) {
+    Preheader = InsertPreheaderForLoop(&L, &DT, &LI, nullptr, false);
+    if (!Preheader) {
+      LLVM_DEBUG(dbgs() << "LRM: Failed to create a preheader for loop "
+                        << Header->getName() << ".\n");
+      return false;
+    }
+  }
+  for (PHINode &PN : Header->phis()) {
+    if (!PN.getType()->isIntegerTy())
+      continue;
+
+    RecurrenceDescriptor RecDesc;
+    if (!RecurrenceDescriptor::isReductionPHI(&PN, &L, RecDesc))
+      continue;
+
+    if (RecDesc.getRecurrenceKind() != RecurKind::Add)
+      continue;
+
+    Value *RecurrenceValueFromPHI = PN.getIncomingValueForBlock(Latch);
+    Instruction *RecurrenceInst = dyn_cast<Instruction>(RecurrenceValueFromPHI);
+    if (!RecurrenceInst || RecurrenceInst->getNumOperands() != 2)
+      continue;
+
+    Value *RecurrenceValue = RecurrenceInst->getOperand(0) == &PN
+                                 ? RecurrenceInst->getOperand(1)
+                                 : RecurrenceInst->getOperand(0);
+
+    CallInst *ReduceCall = dyn_cast<CallInst>(RecurrenceValue);
+    if (!ReduceCall)
+      continue;
+    Function *CalledFunc = ReduceCall->getCalledFunction();
+
+    if (!CalledFunc || !CalledFunc->isIntrinsic() ||
+        !(CalledFunc->getIntrinsicID() == Intrinsic::vector_reduce_add))
+      continue;
+
+    Value *ReduceOperand = ReduceCall->getArgOperand(0);
+    Instruction *VecBin = dyn_cast<Instruction>(ReduceOperand);
+    if (!VecBin || (VecBin->getOpcode() != Instruction::Sub &&
+                    VecBin->getOpcode() != Instruction::Add))
+      continue;
+    // pattern match success
+    LLVM_DEBUG(dbgs() << "FRM: Found pattern to optimize in loop "
+                      << Header->getName() << "!\n");
+
+    VectorType *VecTy = cast<VectorType>(VecBin->getType());
+    IRBuilder<> PreheaderBuilder(Preheader->getTerminator());
+
+    Value *VecZero = PreheaderBuilder.CreateVectorSplat(
+        VecTy->getElementCount(), ConstantInt::get(VecTy->getElementType(), 0),
+        "vec.zero");
+
+    // build new Vector Add to replace Scalar Add
+    IRBuilder<> HeaderBuilder(Header, Header->getFirstNonPHIIt());
+    PHINode *VecSumPhi = HeaderBuilder.CreatePHI(VecTy, 2, "vec.sum.phi");
+    VecSumPhi->addIncoming(VecZero, Preheader);
+    IRBuilder<> BodyBuilder(RecurrenceInst);
+    Value *NewVecAdd = BodyBuilder.CreateAdd(VecSumPhi, VecBin, "vec.sum.next");
+    VecSumPhi->addIncoming(NewVecAdd, Latch);
+
+    // build landingPad for reduce add out of loop
+    BasicBlock *ExitingBlock =
+        Latch->getTerminator()->getSuccessor(0) == Header ? Latch : Header;
+    if (!L.isLoopExiting(ExitingBlock)) {
+      ExitingBlock = Header;
+    }
+    BasicBlock *LandingPad = SplitEdge(ExitingBlock, ExitBlock, &DT, &LI);
+    LandingPad->setName("loop.exit.landing");
+    IRBuilder<> LandingPadBuilder(LandingPad->getTerminator());
+    Value *ScalarTotalSum = LandingPadBuilder.CreateCall(
+        ReduceCall->getCalledFunction(), NewVecAdd, "scalar.total.sum");
+    Value *PreheaderValue = PN.getIncomingValueForBlock(Preheader);
+    Value *LastAdd =
+        PreheaderValue
+            ? LandingPadBuilder.CreateAdd(PreheaderValue, ScalarTotalSum)
+            : ScalarTotalSum;
+
+    // replace use of phi and erase use empty value
+    if (!PN.use_empty())
+      PN.replaceAllUsesWith(UndefValue::get(PN.getType()));
+    if (PN.use_empty())
+      PN.eraseFromParent();
+    RecurrenceInst->replaceAllUsesWith(dyn_cast<Instruction>(LastAdd));
+    if (RecurrenceInst->use_empty())
+      RecurrenceInst->eraseFromParent();
+    if (ReduceCall->use_empty())
+      ReduceCall->eraseFromParent();
+
+    return true;
+  }
+  return false;
+}
+
+FunctionPass *llvm::createLoopReduceMotionPass() {
+  return new LoopReduceMotion();
+}
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 3b63c1d86d3b1..945b9bf7fd3e0 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -39,6 +39,7 @@
 ; CHECK-NEXT:       RISC-V gather/scatter lowering
 ; CHECK-NEXT:       Interleaved Access Pass
 ; CHECK-NEXT:       RISC-V CodeGenPrepare
+; CHECK-NEXT:       Loop Reduce Motion Pass
 ; CHECK-NEXT:       Module Verifier
 ; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT:       Canonicalize natural loops
diff --git a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
new file mode 100644
index 0000000000000..dfe6b1ddb1f36
--- /dev/null
+++ b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name CHECK,CHECK-O,CHECK-O1,CHECK-O2,CHECK-O3,CHECK-O-NEXT --version 5
+; loop-reduce-motion-test.ll
+; RUN: opt -passes=loop-reduce-motion -S < %s | FileCheck %s
+
+define  i32 @pixel_asd8(ptr noundef readonly captures(none) %pix1, i64 noundef %stride1, ptr noundef readonly captures(none) %pix2, i64 noundef %stride2, i32 noundef signext %height) local_unnamed_addr #0 {
+; CHECK-LABEL: define i32 @pixel_asd8(
+; CHECK-SAME: ptr noundef readonly captures(none) [[PIX1:%.*]], i64 noundef [[STRIDE1:%.*]], ptr noundef readonly captures(none) [[PIX2:%.*]], i64 noundef [[STRIDE2:%.*]], i32 noundef signext [[HEIGHT:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[HEIGHT]], 0
+; CHECK-NEXT:    br i1 [[CMP21]], label %[[FOR_COND1_PREHEADER_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK:       [[FOR_COND1_PREHEADER_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_COND1_PREHEADER:.*]]
+; CHECK:       [[FOR_COND1_PREHEADER]]:
+; CHECK-NEXT:    [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], %[[FOR_COND1_PREHEADER]] ], [ 0, %[[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[PIX1]], %[[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[PIX2]], %[[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[VEC_SUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, %[[FOR_COND1_PREHEADER_PREHEADER]] ], [ [[VEC_SUM_NEXT:%.*]], %[[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[VEC_SUM_NEXT]] = add <8 x i32> [[VEC_SUM_PHI]], [[TMP4]]
+; CHECK-NEXT:    [[INC9]] = add nuw nsw i32 [[Y_025]], 1
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1]]
+; CHECK-NEXT:    [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2]]
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[HEIGHT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[LOOP_EXIT_LANDING:.*]], label %[[FOR_COND1_PREHEADER]]
+; CHECK:       [[LOOP_EXIT_LANDING]]:
+; CHECK-NEXT:    [[SCALAR_TOTAL_SUM:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[VEC_SUM_NEXT]])
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 0, [[SCALAR_TOTAL_SUM]]
+; CHECK-NEXT:    br label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]]
+; CHECK:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP5]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP6]]
+;
+entry:
+  %cmp21 = icmp sgt i32 %height, 0
+  br i1 %cmp21, label %for.cond1.preheader.preheader, label %for.cond.cleanup
+
+for.cond1.preheader.preheader:                    ; preds = %entry
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.preheader, %for.cond1.preheader
+  %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
+  %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
+  %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %for.cond1.preheader.preheader ]
+  %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %for.cond1.preheader.preheader ]
+  %0 = load <8 x i8>, ptr %pix1.addr.023
+  %1 = load <8 x i8>, ptr %pix2.addr.022
+  %2 = zext <8 x i8> %0 to <8 x i32>
+  %3 = zext <8 x i8> %1 to <8 x i32>
+  %4 = sub nsw <8 x i32> %2, %3
+  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+  %add.7 = add i32 %sum.024, %5
+  %inc9 = add nuw nsw i32 %y.025, 1
+  %add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
+  %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
+  %exitcond.not = icmp eq i32 %inc9, %height
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.cond1.preheader
+
+for.cond.cleanup.loopexit:                        ; preds = %for.cond1.preheader
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.7, %for.cond.cleanup.loopexit ]
+  %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
+  ret i32 %6
+}

>From 805a89422c428c87b578ff59b2b92e475480d0d7 Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Tue, 3 Feb 2026 20:03:38 +0800
Subject: [PATCH 2/5] Fix format

---
 .../Transforms/Vectorize/LoopReduceMotion.cpp |  2 +-
 .../loop-reduce-motion-test.ll                | 58 +++++++++----------
 2 files changed, 28 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
index 33334a2acfa78..dc9a1223bae02 100644
--- a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
@@ -193,7 +193,7 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
 
     // replace use of phi and erase use empty value
     if (!PN.use_empty())
-      PN.replaceAllUsesWith(UndefValue::get(PN.getType()));
+      PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
     if (PN.use_empty())
       PN.eraseFromParent();
     RecurrenceInst->replaceAllUsesWith(dyn_cast<Instruction>(LastAdd));
diff --git a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
index dfe6b1ddb1f36..73bea9d6623e8 100644
--- a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
+++ b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
@@ -1,20 +1,19 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name CHECK,CHECK-O,CHECK-O1,CHECK-O2,CHECK-O3,CHECK-O-NEXT --version 5
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; loop-reduce-motion-test.ll
 ; RUN: opt -passes=loop-reduce-motion -S < %s | FileCheck %s
 
-define  i32 @pixel_asd8(ptr noundef readonly captures(none) %pix1, i64 noundef %stride1, ptr noundef readonly captures(none) %pix2, i64 noundef %stride2, i32 noundef signext %height) local_unnamed_addr #0 {
-; CHECK-LABEL: define i32 @pixel_asd8(
-; CHECK-SAME: ptr noundef readonly captures(none) [[PIX1:%.*]], i64 noundef [[STRIDE1:%.*]], ptr noundef readonly captures(none) [[PIX2:%.*]], i64 noundef [[STRIDE2:%.*]], i32 noundef signext [[HEIGHT:%.*]]) local_unnamed_addr {
-; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[HEIGHT]], 0
-; CHECK-NEXT:    br i1 [[CMP21]], label %[[FOR_COND1_PREHEADER_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
-; CHECK:       [[FOR_COND1_PREHEADER_PREHEADER]]:
-; CHECK-NEXT:    br label %[[FOR_COND1_PREHEADER:.*]]
-; CHECK:       [[FOR_COND1_PREHEADER]]:
-; CHECK-NEXT:    [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], %[[FOR_COND1_PREHEADER]] ], [ 0, %[[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT:    [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[PIX1]], %[[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT:    [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[PIX2]], %[[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT:    [[VEC_SUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, %[[FOR_COND1_PREHEADER_PREHEADER]] ], [ [[VEC_SUM_NEXT:%.*]], %[[FOR_COND1_PREHEADER]] ]
+define i32 @pixel_asd8(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %height) {
+; CHECK-LABEL: @pixel_asd8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[HEIGHT:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.cond1.preheader.preheader:
+; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK:       for.cond1.preheader:
+; CHECK-NEXT:    [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[VEC_SUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[FOR_COND1_PREHEADER_PREHEADER]] ], [ [[VEC_SUM_NEXT:%.*]], [[FOR_COND1_PREHEADER]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
@@ -22,33 +21,30 @@ define  i32 @pixel_asd8(ptr noundef readonly captures(none) %pix1, i64 noundef %
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    [[VEC_SUM_NEXT]] = add <8 x i32> [[VEC_SUM_PHI]], [[TMP4]]
 ; CHECK-NEXT:    [[INC9]] = add nuw nsw i32 [[Y_025]], 1
-; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1]]
-; CHECK-NEXT:    [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2]]
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
+; CHECK-NEXT:    [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[HEIGHT]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[LOOP_EXIT_LANDING:.*]], label %[[FOR_COND1_PREHEADER]]
-; CHECK:       [[LOOP_EXIT_LANDING]]:
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[LOOP_EXIT_LANDING:%.*]], label [[FOR_COND1_PREHEADER]]
+; CHECK:       loop.exit.landing:
 ; CHECK-NEXT:    [[SCALAR_TOTAL_SUM:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[VEC_SUM_NEXT]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i32 0, [[SCALAR_TOTAL_SUM]]
-; CHECK-NEXT:    br label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]]
-; CHECK:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
-; CHECK-NEXT:    br label %[[FOR_COND_CLEANUP]]
-; CHECK:       [[FOR_COND_CLEANUP]]:
-; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP5]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
 ; CHECK-NEXT:    ret i32 [[TMP6]]
 ;
 entry:
   %cmp21 = icmp sgt i32 %height, 0
-  br i1 %cmp21, label %for.cond1.preheader.preheader, label %for.cond.cleanup
-
-for.cond1.preheader.preheader:                    ; preds = %entry
-  br label %for.cond1.preheader
+  br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
 
 for.cond1.preheader:                              ; preds = %for.cond1.preheader.preheader, %for.cond1.preheader
-  %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
-  %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
-  %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %for.cond1.preheader.preheader ]
-  %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %for.cond1.preheader.preheader ]
+  %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %entry ]
+  %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %entry ]
+  %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %entry ]
+  %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %entry ]
   %0 = load <8 x i8>, ptr %pix1.addr.023
   %1 = load <8 x i8>, ptr %pix2.addr.022
   %2 = zext <8 x i8> %0 to <8 x i32>

>From 2134c689883bb02affee9bc453a19876d58794ff Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Wed, 4 Feb 2026 19:26:03 +0800
Subject: [PATCH 3/5] Fix some formatting and a possible nullptr from dyn_cast

---
 .../Transforms/Vectorize/LoopReduceMotion.h     |  6 ++++--
 llvm/lib/Passes/PassRegistry.def                |  2 +-
 llvm/lib/Transforms/Vectorize/CMakeLists.txt    |  2 +-
 .../Transforms/Vectorize/LoopReduceMotion.cpp   | 17 +++++++++--------
 4 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h b/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
index 17bd74472700a..df5af76819923 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
@@ -6,14 +6,16 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This pass is designed to hoist `ReduceCall` operations out of loops to reduce
+// This pass is designed to sink `ReduceCall` operations out of loops to reduce
 // the number of instructions within the loop body.
 //
 //===----------------------------------------------------------------------===//
 #ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
 #define LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
+
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/PassManager.h"
+
 namespace llvm {
 class LoopReduceMotionPass : public PassInfoMixin<LoopReduceMotionPass> {
 public:
@@ -21,4 +23,4 @@ class LoopReduceMotionPass : public PassInfoMixin<LoopReduceMotionPass> {
   bool matchAndTransform(Loop &L, DominatorTree &DT, LoopInfo &LI);
 };
 } // namespace llvm
-#endif
+#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index edd0962052cbf..c896d7c99c107 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -469,6 +469,7 @@ FUNCTION_PASS("loop-data-prefetch", LoopDataPrefetchPass())
 FUNCTION_PASS("loop-distribute", LoopDistributePass())
 FUNCTION_PASS("loop-fusion", LoopFusePass())
 FUNCTION_PASS("loop-load-elim", LoopLoadEliminationPass())
+FUNCTION_PASS("loop-reduce-motion", LoopReduceMotionPass())
 FUNCTION_PASS("loop-simplify", LoopSimplifyPass())
 FUNCTION_PASS("loop-sink", LoopSinkPass())
 FUNCTION_PASS("loop-versioning", LoopVersioningPass())
@@ -553,7 +554,6 @@ FUNCTION_PASS("typepromotion", TypePromotionPass(*TM))
 FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass())
 FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass())
 FUNCTION_PASS("vector-combine", VectorCombinePass())
-FUNCTION_PASS("loop-reduce-motion", LoopReduceMotionPass())
 FUNCTION_PASS("verify", VerifierPass())
 FUNCTION_PASS("verify<cycles>", CycleInfoVerifierPass())
 FUNCTION_PASS("verify<domtree>", DominatorTreeVerifierPass())
diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 406031876a7d0..0fa532010632b 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -1,9 +1,9 @@
 add_llvm_component_library(LLVMVectorize
   LoadStoreVectorizer.cpp
   LoopIdiomVectorize.cpp
+  LoopReduceMotion.cpp
   LoopVectorizationLegality.cpp
   LoopVectorize.cpp
-  LoopReduceMotion.cpp
   SandboxVectorizer/DependencyGraph.cpp
   SandboxVectorizer/InstrMaps.cpp
   SandboxVectorizer/Interval.cpp
diff --git a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
index dc9a1223bae02..fdefcb7e00074 100644
--- a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
@@ -5,10 +5,10 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// This pass is designed to hoist `ReduceCall` operations out of loops to reduce
+// This pass is designed to sink `ReduceCall` operations out of loops to reduce
 // the number of instructions within the loop body.
 //
-// Below are the target pattern to be matched and the resulting pattern
+// Below is the target pattern to be matched and the resulting pattern
 // after the transformation.
 //
 // before                    | after
@@ -160,11 +160,7 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
                       << Header->getName() << "!\n");
 
     VectorType *VecTy = cast<VectorType>(VecBin->getType());
-    IRBuilder<> PreheaderBuilder(Preheader->getTerminator());
-
-    Value *VecZero = PreheaderBuilder.CreateVectorSplat(
-        VecTy->getElementCount(), ConstantInt::get(VecTy->getElementType(), 0),
-        "vec.zero");
+    Value *VecZero = ConstantInt::get(VecTy, 0);
 
     // build new Vector Add to replace Scalar Add
     IRBuilder<> HeaderBuilder(Header, Header->getFirstNonPHIIt());
@@ -196,7 +192,12 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
       PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
     if (PN.use_empty())
       PN.eraseFromParent();
-    RecurrenceInst->replaceAllUsesWith(dyn_cast<Instruction>(LastAdd));
+
+    Instruction *FinalNode = dyn_cast<Instruction>(LastAdd);
+    if (!FinalNode)
+      return false;
+    RecurrenceInst->replaceAllUsesWith(FinalNode);
+
     if (RecurrenceInst->use_empty())
       RecurrenceInst->eraseFromParent();
     if (ReduceCall->use_empty())

>From de183c53008b7f4a28aa1cd93c43fb48bd9d96d6 Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Thu, 5 Feb 2026 19:25:25 +0800
Subject: [PATCH 4/5] delete some wrong log

---
 llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
index fdefcb7e00074..f9b45ade53676 100644
--- a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
@@ -156,7 +156,7 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
                     VecBin->getOpcode() != Instruction::Add))
       continue;
     // pattern match success
-    LLVM_DEBUG(dbgs() << "FRM: Found pattern to optimize in loop "
+    LLVM_DEBUG(dbgs() << "Found pattern to optimize in loop "
                       << Header->getName() << "!\n");
 
     VectorType *VecTy = cast<VectorType>(VecBin->getType());

>From ac4e3131d3d015a24e8e61c178ceb40d5a693a3b Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Mon, 9 Feb 2026 15:02:58 +0800
Subject: [PATCH 5/5] Put the pass after VectorCombine; add some negative
 tests; rename the legacy class to avoid confusion; switch node deletion to
 the RecursivelyDelete* functions

---
 llvm/lib/Passes/PassBuilderPipelines.cpp      |   3 +
 llvm/lib/Target/RISCV/RISCVTargetMachine.cpp  |   1 -
 .../Transforms/Vectorize/LoopReduceMotion.cpp |  23 +-
 llvm/test/CodeGen/RISCV/O3-pipeline.ll        |   1 -
 llvm/test/Other/new-pm-defaults.ll            |   1 +
 llvm/test/Other/new-pm-lto-defaults.ll        |   1 +
 .../Other/new-pm-thinlto-postlink-defaults.ll |   1 +
 .../new-pm-thinlto-postlink-pgo-defaults.ll   |   1 +
 ...-pm-thinlto-postlink-samplepgo-defaults.ll |   1 +
 .../loop-reduce-motion-test.ll                | 272 +++++++++++++++++-
 10 files changed, 287 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 1584d30875570..8579da78be8c8 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -147,6 +147,7 @@
 #include "llvm/Transforms/Utils/NameAnonGlobals.h"
 #include "llvm/Transforms/Utils/RelLookupTableConverter.h"
 #include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
+#include "llvm/Transforms/Vectorize/LoopReduceMotion.h"
 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
 #include "llvm/Transforms/Vectorize/SLPVectorizer.h"
 #include "llvm/Transforms/Vectorize/VectorCombine.h"
@@ -1418,6 +1419,8 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
   }
   // Enhance/cleanup vector code.
   FPM.addPass(VectorCombinePass());
+  // Try to sink vector reduce calls out of loops.
+  FPM.addPass(LoopReduceMotionPass());
 
   if (!IsFullLTO) {
     FPM.addPass(InstCombinePass());
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 303a502be8cf9..7eb56f52c2e66 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -479,7 +479,6 @@ void RISCVPassConfig::addIRPasses() {
     addPass(createRISCVGatherScatterLoweringPass());
     addPass(createInterleavedAccessPass());
     addPass(createRISCVCodeGenPrepareLegacyPass());
-    addPass(createLoopReduceMotionPass());
   }
 
   TargetPassConfig::addIRPasses();
diff --git a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
index f9b45ade53676..f56a546c5ceb9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
@@ -46,13 +46,13 @@
 
 using namespace llvm;
 
-class LoopReduceMotion : public FunctionPass {
+class LoopReduceMotionLegacy : public FunctionPass {
   LoopReduceMotionPass Impl;
 
 public:
   static char ID;
 
-  LoopReduceMotion() : FunctionPass(ID) {}
+  LoopReduceMotionLegacy() : FunctionPass(ID) {}
 
   StringRef getPassName() const override { return "Loop Reduce Motion Pass"; }
 
@@ -65,7 +65,7 @@ class LoopReduceMotion : public FunctionPass {
   }
 };
 
-char LoopReduceMotion::ID = 0;
+char LoopReduceMotionLegacy::ID = 0;
 
 PreservedAnalyses LoopReduceMotionPass::run(Function &F,
                                             FunctionAnalysisManager &FAM) {
@@ -80,7 +80,7 @@ PreservedAnalyses LoopReduceMotionPass::run(Function &F,
   return PreservedAnalyses::none();
 }
 
-bool LoopReduceMotion::runOnFunction(Function &F) {
+bool LoopReduceMotionLegacy::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;
 
@@ -187,21 +187,16 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
             ? LandingPadBuilder.CreateAdd(PreheaderValue, ScalarTotalSum)
             : ScalarTotalSum;
 
-    // replace use of phi and erase use empty value
+    // Replace remaining uses of the PHI with poison and delete the dead PHI.
     if (!PN.use_empty())
       PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
-    if (PN.use_empty())
-      PN.eraseFromParent();
-
+    llvm::RecursivelyDeleteDeadPHINode(&PN);
+    // Replace the uses of the recurrence instruction and delete it once dead.
     Instruction *FinalNode = dyn_cast<Instruction>(LastAdd);
     if (!FinalNode)
       return false;
     RecurrenceInst->replaceAllUsesWith(FinalNode);
-
-    if (RecurrenceInst->use_empty())
-      RecurrenceInst->eraseFromParent();
-    if (ReduceCall->use_empty())
-      ReduceCall->eraseFromParent();
+    llvm::RecursivelyDeleteTriviallyDeadInstructions(RecurrenceInst);
 
     return true;
   }
@@ -209,5 +204,5 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
 }
 
 FunctionPass *llvm::createLoopReduceMotionPass() {
-  return new LoopReduceMotion();
+  return new LoopReduceMotionLegacy();
 }
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 945b9bf7fd3e0..3b63c1d86d3b1 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -39,7 +39,6 @@
 ; CHECK-NEXT:       RISC-V gather/scatter lowering
 ; CHECK-NEXT:       Interleaved Access Pass
 ; CHECK-NEXT:       RISC-V CodeGenPrepare
-; CHECK-NEXT:       Loop Reduce Motion Pass
 ; CHECK-NEXT:       Module Verifier
 ; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT:       Canonicalize natural loops
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index f074b2fdd3ab8..9af05eb49e65e 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -269,6 +269,7 @@
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: LoopReduceMotionPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll
index de0feca55e5b2..cebbe671e17b3 100644
--- a/llvm/test/Other/new-pm-lto-defaults.ll
+++ b/llvm/test/Other/new-pm-lto-defaults.ll
@@ -143,6 +143,7 @@
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass on foo
 ; CHECK-OS-NEXT: Running pass: SLPVectorizerPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass on foo
+; CHECK-O23SZ-NEXT: Running pass: LoopReduceMotionPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: InferAlignmentPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: InstCombinePass on foo
 ; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
index b0d08316de4f0..4065cead7c264 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
@@ -188,6 +188,7 @@
 ; CHECK-POSTLINK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-POSTLINK-Os-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-POSTLINK-O-NEXT: Running pass: LoopReduceMotionPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
index 6b3e82a752899..126caf7eed3ab 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -173,6 +173,7 @@
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: LoopReduceMotionPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
index 88dc18f605ce2..e9879a512a9b9 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -182,6 +182,7 @@
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: LoopReduceMotionPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
diff --git a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
index 73bea9d6623e8..859a19e701fa8 100644
--- a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
+++ b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
@@ -2,8 +2,8 @@
 ; loop-reduce-motion-test.ll
 ; RUN: opt -passes=loop-reduce-motion -S < %s | FileCheck %s
 
-define i32 @pixel_asd8(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %height) {
-; CHECK-LABEL: @pixel_asd8(
+define i32 @func_with_VecBin_Sub(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %height) {
+; CHECK-LABEL: @func_with_VecBin_Sub(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[HEIGHT:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
@@ -66,3 +66,271 @@ for.cond.cleanup:                                 ; preds = %for.cond.cleanup.lo
   %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
   ret i32 %6
 }
+
+define i32 @func_with_VecBin_add(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %height) {
+; CHECK-LABEL: @func_with_VecBin_add(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[HEIGHT:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.cond1.preheader.preheader:
+; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK:       for.cond1.preheader:
+; CHECK-NEXT:    [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[VEC_SUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[FOR_COND1_PREHEADER_PREHEADER]] ], [ [[VEC_SUM_NEXT:%.*]], [[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <8 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[VEC_SUM_NEXT]] = add <8 x i32> [[VEC_SUM_PHI]], [[TMP4]]
+; CHECK-NEXT:    [[INC9]] = add nuw nsw i32 [[Y_025]], 1
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
+; CHECK-NEXT:    [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[HEIGHT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[LOOP_EXIT_LANDING:%.*]], label [[FOR_COND1_PREHEADER]]
+; CHECK:       loop.exit.landing:
+; CHECK-NEXT:    [[SCALAR_TOTAL_SUM:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[VEC_SUM_NEXT]])
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 0, [[SCALAR_TOTAL_SUM]]
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP6]]
+;
+entry:
+  %cmp21 = icmp sgt i32 %height, 0
+  br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader, %entry
+  %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %entry ]
+  %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %entry ]
+  %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %entry ]
+  %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %entry ]
+  %0 = load <8 x i8>, ptr %pix1.addr.023
+  %1 = load <8 x i8>, ptr %pix2.addr.022
+  %2 = zext <8 x i8> %0 to <8 x i32>
+  %3 = zext <8 x i8> %1 to <8 x i32>
+  %4 = add nsw <8 x i32> %2, %3
+  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+  %add.7 = add i32 %sum.024, %5
+  %inc9 = add nuw nsw i32 %y.025, 1
+  %add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
+  %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
+  %exitcond.not = icmp eq i32 %inc9, %height
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.cond1.preheader
+
+for.cond.cleanup.loopexit:                        ; preds = %for.cond1.preheader
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.7, %for.cond.cleanup.loopexit ]
+  %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
+  ret i32 %6
+}
+
+define i32 @multi_exit(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %val1, i32 %val2) {
+; CHECK-LABEL: @multi_exit(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[VAL1:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP21]], label [[FOR_COND1_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.cond1.preheader:
+; CHECK-NEXT:    [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[NEXT_COND1:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SUM_024:%.*]] = phi i32 [ [[ADD_7:%.*]], [[NEXT_COND1]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[NEXT_COND1]] ], [ [[PIX1:%.*]], [[ENTRY]] ]
+; CHECK-NEXT:    [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[NEXT_COND1]] ], [ [[PIX2:%.*]], [[ENTRY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
+; CHECK-NEXT:    [[ADD_7]] = add i32 [[SUM_024]], [[TMP5]]
+; CHECK-NEXT:    [[INC9]] = add nuw nsw i32 [[Y_025]], 1
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
+; CHECK-NEXT:    [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
+; CHECK-NEXT:    br label [[NEXT_COND0:%.*]]
+; CHECK:       next.cond0:
+; CHECK-NEXT:    [[ADD_8:%.*]] = add i32 [[ADD_7]], 1
+; CHECK-NEXT:    [[EXIT1:%.*]] = icmp eq i32 [[INC9]], [[VAL1]]
+; CHECK-NEXT:    br i1 [[EXIT1]], label [[FOR_COND_CLEANUP]], label [[NEXT_COND1]]
+; CHECK:       next.cond1:
+; CHECK-NEXT:    [[ADD_9:%.*]] = add i32 [[ADD_7]], 2
+; CHECK-NEXT:    [[EXIT2:%.*]] = icmp eq i32 [[INC9]], [[VAL2:%.*]]
+; CHECK-NEXT:    br i1 [[EXIT2]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD_8]], [[NEXT_COND0]] ], [ [[ADD_9]], [[NEXT_COND1]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
+; CHECK-NEXT:    ret i32 [[TMP6]]
+;
+entry:
+  %cmp21 = icmp sgt i32 %val1, 0
+  br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond1.preheader:                              ; preds = %next.cond1, %entry
+  %y.025 = phi i32 [ %inc9, %next.cond1 ], [ 0, %entry ]
+  %sum.024 = phi i32 [ %add.7, %next.cond1 ], [ 0, %entry ]
+  %pix1.addr.023 = phi ptr [ %add.ptr, %next.cond1 ], [ %pix1, %entry ]
+  %pix2.addr.022 = phi ptr [ %add.ptr10, %next.cond1 ], [ %pix2, %entry ]
+  %0 = load <8 x i8>, ptr %pix1.addr.023
+  %1 = load <8 x i8>, ptr %pix2.addr.022
+  %2 = zext <8 x i8> %0 to <8 x i32>
+  %3 = zext <8 x i8> %1 to <8 x i32>
+  %4 = sub nsw <8 x i32> %2, %3
+  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+  %add.7 = add i32 %sum.024, %5
+  %inc9 = add nuw nsw i32 %y.025, 1
+  %add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
+  %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
+  br label %next.cond0
+
+next.cond0:
+  %add.8 = add i32 %add.7, 1
+  %exit1 = icmp eq i32 %inc9, %val1
+  br i1 %exit1, label %for.cond.cleanup, label %next.cond1
+
+next.cond1:
+  %add.9 = add i32 %add.7, 2
+  %exit2 = icmp eq i32 %inc9, %val2
+  br i1 %exit2, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond.cleanup:                                 ; preds = %next.cond1, %next.cond0, %entry
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.8, %next.cond0 ], [%add.9, %next.cond1 ]
+  %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
+  ret i32 %6
+}
+
+define i32 @phi_not_reduction_call(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %val1) {
+; CHECK-LABEL: @phi_not_reduction_call(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[VAL1:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.cond1.preheader.preheader:
+; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK:       for.cond1.preheader:
+; CHECK-NEXT:    [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[SUM_024:%.*]] = phi i32 [ [[ADD_7:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
+; CHECK-NEXT:    [[ADD_7]] = add i32 [[SUM_024]], [[TMP5]]
+; CHECK-NEXT:    [[INC9]] = add nuw nsw i32 [[Y_025]], 1
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
+; CHECK-NEXT:    [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
+; CHECK-NEXT:    [[EXIT:%.*]] = icmp eq i32 [[INC9]], [[VAL1]]
+; CHECK-NEXT:    br i1 [[EXIT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_COND1_PREHEADER]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_7]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_024]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], [[SUM]]
+; CHECK-NEXT:    ret i32 [[TMP7]]
+;
+entry:
+  %cmp21 = icmp sgt i32 %val1, 0
+  br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader, %entry
+  %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %entry ]
+  %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %entry ]
+  %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %entry ]
+  %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %entry ]
+  %0 = load <8 x i8>, ptr %pix1.addr.023
+  %1 = load <8 x i8>, ptr %pix2.addr.022
+  %2 = zext <8 x i8> %0 to <8 x i32>
+  %3 = zext <8 x i8> %1 to <8 x i32>
+  %4 = sub nsw <8 x i32> %2, %3
+  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+  %add.7 = add i32 %sum.024, %5
+  %inc9 = add nuw nsw i32 %y.025, 1
+  %add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
+  %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
+  %exit = icmp eq i32 %inc9, %val1
+  br i1 %exit, label %for.cond.cleanup.loopexit, label %for.cond1.preheader
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [%add.7, %for.cond.cleanup.loopexit ]
+  %sum = phi i32 [0, %entry], [ %sum.024, %for.cond.cleanup.loopexit]
+  %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
+  %7 = add i32 %6, %sum
+  ret i32 %7
+}
+
+define i32 @reduction_call_not_add(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %val1) {
+; CHECK-LABEL: @reduction_call_not_add(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP21:%.*]] = icmp sgt i32 [[VAL1:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.cond1.preheader.preheader:
+; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK:       for.cond1.preheader:
+; CHECK-NEXT:    [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[SUM_024:%.*]] = phi i32 [ [[ADD_7:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
+; CHECK-NEXT:    [[ADD_7]] = sub i32 [[SUM_024]], [[TMP5]]
+; CHECK-NEXT:    [[INC9]] = add nuw nsw i32 [[Y_025]], 1
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
+; CHECK-NEXT:    [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
+; CHECK-NEXT:    [[EXIT:%.*]] = icmp eq i32 [[INC9]], [[VAL1]]
+; CHECK-NEXT:    br i1 [[EXIT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_COND1_PREHEADER]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_7]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_024]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], [[SUM]]
+; CHECK-NEXT:    ret i32 [[TMP7]]
+;
+entry:
+  %cmp21 = icmp sgt i32 %val1, 0
+  br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader, %entry
+  %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %entry ]
+  %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %entry ]
+  %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %entry ]
+  %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %entry ]
+  %0 = load <8 x i8>, ptr %pix1.addr.023
+  %1 = load <8 x i8>, ptr %pix2.addr.022
+  %2 = zext <8 x i8> %0 to <8 x i32>
+  %3 = zext <8 x i8> %1 to <8 x i32>
+  %4 = sub nsw <8 x i32> %2, %3
+  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+  %add.7 = sub i32 %sum.024, %5
+  %inc9 = add nuw nsw i32 %y.025, 1
+  %add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
+  %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
+  %exit = icmp eq i32 %inc9, %val1
+  br i1 %exit, label %for.cond.cleanup.loopexit, label %for.cond1.preheader
+
+for.cond.cleanup.loopexit:                        ; preds = %for.cond1.preheader
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [%add.7, %for.cond.cleanup.loopexit ]
+  %sum = phi i32 [0, %entry], [ %sum.024, %for.cond.cleanup.loopexit]
+  %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
+  %7 = add i32 %6, %sum
+  ret i32 %7
+}