[llvm] [RISCV] Improve loop by extract reduction instruction (PR #179215)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Feb 8 23:06:33 PST 2026
https://github.com/Anjian-Wen updated https://github.com/llvm/llvm-project/pull/179215
>From ef12d82a225c2475e3a9af50a01e7dcee07e6114 Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Mon, 2 Feb 2026 19:36:42 +0800
Subject: [PATCH 1/5] [RISCV] Improve loop by extract reduction instruction
with vector_reduce_add in some pattern
---
llvm/include/llvm/CodeGen/Passes.h | 4 +
.../Transforms/Vectorize/LoopReduceMotion.h | 24 ++
llvm/lib/Passes/PassBuilder.cpp | 1 +
llvm/lib/Passes/PassRegistry.def | 1 +
llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 1 +
llvm/lib/Transforms/Vectorize/CMakeLists.txt | 1 +
.../Transforms/Vectorize/LoopReduceMotion.cpp | 212 ++++++++++++++++++
llvm/test/CodeGen/RISCV/O3-pipeline.ll | 1 +
.../loop-reduce-motion-test.ll | 72 ++++++
9 files changed, 317 insertions(+)
create mode 100644 llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
create mode 100644 llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
create mode 100644 llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 2717110e1b3e7..2bd8e843f8c13 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -629,6 +629,10 @@ LLVM_ABI ModulePass *createWindowsSecureHotPatchingPass();
/// Lowers KCFI operand bundles for indirect calls.
LLVM_ABI FunctionPass *createKCFIPass();
+
+/// This pass is designed to sink ReduceCall operations out of loops to
+/// reduce the number of instructions within the loop body.
+LLVM_ABI FunctionPass *createLoopReduceMotionPass();
} // namespace llvm
#endif
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h b/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
new file mode 100644
index 0000000000000..17bd74472700a
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
@@ -0,0 +1,24 @@
+//===- LoopReduceMotion.h ---------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is designed to hoist `ReduceCall` operations out of loops to reduce
+// the number of instructions within the loop body.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
+#define LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/PassManager.h"
+namespace llvm {
+class LoopReduceMotionPass : public PassInfoMixin<LoopReduceMotionPass> {
+public:
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+ bool matchAndTransform(Loop &L, DominatorTree &DT, LoopInfo &LI);
+};
+} // namespace llvm
+#endif
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 45955426d66a0..e25868697a030 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -379,6 +379,7 @@
#include "llvm/Transforms/Utils/UnifyLoopExits.h"
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
+#include "llvm/Transforms/Vectorize/LoopReduceMotion.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 2cfb5b2592601..edd0962052cbf 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -553,6 +553,7 @@ FUNCTION_PASS("typepromotion", TypePromotionPass(*TM))
FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass())
FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass())
FUNCTION_PASS("vector-combine", VectorCombinePass())
+FUNCTION_PASS("loop-reduce-motion", LoopReduceMotionPass())
FUNCTION_PASS("verify", VerifierPass())
FUNCTION_PASS("verify<cycles>", CycleInfoVerifierPass())
FUNCTION_PASS("verify<domtree>", DominatorTreeVerifierPass())
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 7eb56f52c2e66..303a502be8cf9 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -479,6 +479,7 @@ void RISCVPassConfig::addIRPasses() {
addPass(createRISCVGatherScatterLoweringPass());
addPass(createInterleavedAccessPass());
addPass(createRISCVCodeGenPrepareLegacyPass());
+ addPass(createLoopReduceMotionPass());
}
TargetPassConfig::addIRPasses();
diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 9f4a242214471..406031876a7d0 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -3,6 +3,7 @@ add_llvm_component_library(LLVMVectorize
LoopIdiomVectorize.cpp
LoopVectorizationLegality.cpp
LoopVectorize.cpp
+ LoopReduceMotion.cpp
SandboxVectorizer/DependencyGraph.cpp
SandboxVectorizer/InstrMaps.cpp
SandboxVectorizer/Interval.cpp
diff --git a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
new file mode 100644
index 0000000000000..33334a2acfa78
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
@@ -0,0 +1,212 @@
+//===-------- LoopReduceMotion.cpp - Loop Reduce Motion Optimization ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This pass is designed to hoist `ReduceCall` operations out of loops to reduce
+// the number of instructions within the loop body.
+//
+// Below are the target pattern to be matched and the resulting pattern
+// after the transformation.
+//
+// before | after
+// ------ | ------
+// loop: | loop:
+// ... | ...
+// vc = vecbin va, vb | vc = vecbin va, vb
+// d = reduce_add vc | vsum = vadd vsum, vc
+// sum = add sum, d | ...
+// ... | ...
+// exit: | exit:
+// value = sum | d = reduce_add vsum
+// ... | value = d
+// ... | ...
+// ret | ret
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize/LoopReduceMotion.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Plugins/PassPlugin.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+#define DEBUG_TYPE "loop-reduce-motion"
+
+using namespace llvm;
+
+class LoopReduceMotion : public FunctionPass {
+ LoopReduceMotionPass Impl;
+
+public:
+ static char ID;
+
+ LoopReduceMotion() : FunctionPass(ID) {}
+
+ StringRef getPassName() const override { return "Loop Reduce Motion Pass"; }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+
+char LoopReduceMotion::ID = 0;
+
+PreservedAnalyses LoopReduceMotionPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ bool Changed = false;
+ for (Loop *L : LI) {
+ Changed |= matchAndTransform(*L, DT, LI);
+ }
+ if (!Changed)
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+bool LoopReduceMotion::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
+
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ bool Changed = false;
+ for (Loop *L : LI) {
+ Changed |= Impl.matchAndTransform(*L, DT, LI);
+ }
+ if (!Changed)
+ return false;
+
+ return true;
+}
+
+bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
+ LoopInfo &LI) {
+ BasicBlock *Header = L.getHeader();
+ BasicBlock *Latch = L.getLoopLatch();
+ BasicBlock *ExitBlock = L.getExitBlock();
+ if (!Header || !Latch || !ExitBlock) {
+ LLVM_DEBUG(dbgs() << "LRM: Skipping loop " << Header->getName()
+ << " because it is not a valid loop.\n");
+ return false;
+ }
+ BasicBlock *Preheader = L.getLoopPreheader();
+ if (!Preheader) {
+ Preheader = InsertPreheaderForLoop(&L, &DT, &LI, nullptr, false);
+ if (!Preheader) {
+ LLVM_DEBUG(dbgs() << "LRM: Failed to create a preheader for loop "
+ << Header->getName() << ".\n");
+ return false;
+ }
+ }
+ for (PHINode &PN : Header->phis()) {
+ if (!PN.getType()->isIntegerTy())
+ continue;
+
+ RecurrenceDescriptor RecDesc;
+ if (!RecurrenceDescriptor::isReductionPHI(&PN, &L, RecDesc))
+ continue;
+
+ if (RecDesc.getRecurrenceKind() != RecurKind::Add)
+ continue;
+
+ Value *RecurrenceValueFromPHI = PN.getIncomingValueForBlock(Latch);
+ Instruction *RecurrenceInst = dyn_cast<Instruction>(RecurrenceValueFromPHI);
+ if (!RecurrenceInst || RecurrenceInst->getNumOperands() != 2)
+ continue;
+
+ Value *RecurrenceValue = RecurrenceInst->getOperand(0) == &PN
+ ? RecurrenceInst->getOperand(1)
+ : RecurrenceInst->getOperand(0);
+
+ CallInst *ReduceCall = dyn_cast<CallInst>(RecurrenceValue);
+ if (!ReduceCall)
+ continue;
+ Function *CalledFunc = ReduceCall->getCalledFunction();
+
+ if (!CalledFunc || !CalledFunc->isIntrinsic() ||
+ !(CalledFunc->getIntrinsicID() == Intrinsic::vector_reduce_add))
+ continue;
+
+ Value *ReduceOperand = ReduceCall->getArgOperand(0);
+ Instruction *VecBin = dyn_cast<Instruction>(ReduceOperand);
+ if (!VecBin || (VecBin->getOpcode() != Instruction::Sub &&
+ VecBin->getOpcode() != Instruction::Add))
+ continue;
+ // pattern match success
+ LLVM_DEBUG(dbgs() << "FRM: Found pattern to optimize in loop "
+ << Header->getName() << "!\n");
+
+ VectorType *VecTy = cast<VectorType>(VecBin->getType());
+ IRBuilder<> PreheaderBuilder(Preheader->getTerminator());
+
+ Value *VecZero = PreheaderBuilder.CreateVectorSplat(
+ VecTy->getElementCount(), ConstantInt::get(VecTy->getElementType(), 0),
+ "vec.zero");
+
+ // build new Vector Add to replace Scalar Add
+ IRBuilder<> HeaderBuilder(Header, Header->getFirstNonPHIIt());
+ PHINode *VecSumPhi = HeaderBuilder.CreatePHI(VecTy, 2, "vec.sum.phi");
+ VecSumPhi->addIncoming(VecZero, Preheader);
+ IRBuilder<> BodyBuilder(RecurrenceInst);
+ Value *NewVecAdd = BodyBuilder.CreateAdd(VecSumPhi, VecBin, "vec.sum.next");
+ VecSumPhi->addIncoming(NewVecAdd, Latch);
+
+ // build landingPad for reduce add out of loop
+ BasicBlock *ExitingBlock =
+ Latch->getTerminator()->getSuccessor(0) == Header ? Latch : Header;
+ if (!L.isLoopExiting(ExitingBlock)) {
+ ExitingBlock = Header;
+ }
+ BasicBlock *LandingPad = SplitEdge(ExitingBlock, ExitBlock, &DT, &LI);
+ LandingPad->setName("loop.exit.landing");
+ IRBuilder<> LandingPadBuilder(LandingPad->getTerminator());
+ Value *ScalarTotalSum = LandingPadBuilder.CreateCall(
+ ReduceCall->getCalledFunction(), NewVecAdd, "scalar.total.sum");
+ Value *PreheaderValue = PN.getIncomingValueForBlock(Preheader);
+ Value *LastAdd =
+ PreheaderValue
+ ? LandingPadBuilder.CreateAdd(PreheaderValue, ScalarTotalSum)
+ : ScalarTotalSum;
+
+ // replace use of phi and erase use empty value
+ if (!PN.use_empty())
+ PN.replaceAllUsesWith(UndefValue::get(PN.getType()));
+ if (PN.use_empty())
+ PN.eraseFromParent();
+ RecurrenceInst->replaceAllUsesWith(dyn_cast<Instruction>(LastAdd));
+ if (RecurrenceInst->use_empty())
+ RecurrenceInst->eraseFromParent();
+ if (ReduceCall->use_empty())
+ ReduceCall->eraseFromParent();
+
+ return true;
+ }
+ return false;
+}
+
+FunctionPass *llvm::createLoopReduceMotionPass() {
+ return new LoopReduceMotion();
+}
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 3b63c1d86d3b1..945b9bf7fd3e0 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -39,6 +39,7 @@
; CHECK-NEXT: RISC-V gather/scatter lowering
; CHECK-NEXT: Interleaved Access Pass
; CHECK-NEXT: RISC-V CodeGenPrepare
+; CHECK-NEXT: Loop Reduce Motion Pass
; CHECK-NEXT: Module Verifier
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
; CHECK-NEXT: Canonicalize natural loops
diff --git a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
new file mode 100644
index 0000000000000..dfe6b1ddb1f36
--- /dev/null
+++ b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name CHECK,CHECK-O,CHECK-O1,CHECK-O2,CHECK-O3,CHECK-O-NEXT --version 5
+; loop-reduce-motion-test.ll
+; RUN: opt -passes=loop-reduce-motion -S < %s | FileCheck %s
+
+define i32 @pixel_asd8(ptr noundef readonly captures(none) %pix1, i64 noundef %stride1, ptr noundef readonly captures(none) %pix2, i64 noundef %stride2, i32 noundef signext %height) local_unnamed_addr #0 {
+; CHECK-LABEL: define i32 @pixel_asd8(
+; CHECK-SAME: ptr noundef readonly captures(none) [[PIX1:%.*]], i64 noundef [[STRIDE1:%.*]], ptr noundef readonly captures(none) [[PIX2:%.*]], i64 noundef [[STRIDE2:%.*]], i32 noundef signext [[HEIGHT:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[HEIGHT]], 0
+; CHECK-NEXT: br i1 [[CMP21]], label %[[FOR_COND1_PREHEADER_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK: [[FOR_COND1_PREHEADER_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_COND1_PREHEADER:.*]]
+; CHECK: [[FOR_COND1_PREHEADER]]:
+; CHECK-NEXT: [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], %[[FOR_COND1_PREHEADER]] ], [ 0, %[[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[PIX1]], %[[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[PIX2]], %[[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[VEC_SUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, %[[FOR_COND1_PREHEADER_PREHEADER]] ], [ [[VEC_SUM_NEXT:%.*]], %[[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[VEC_SUM_NEXT]] = add <8 x i32> [[VEC_SUM_PHI]], [[TMP4]]
+; CHECK-NEXT: [[INC9]] = add nuw nsw i32 [[Y_025]], 1
+; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1]]
+; CHECK-NEXT: [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2]]
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[HEIGHT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[LOOP_EXIT_LANDING:.*]], label %[[FOR_COND1_PREHEADER]]
+; CHECK: [[LOOP_EXIT_LANDING]]:
+; CHECK-NEXT: [[SCALAR_TOTAL_SUM:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[VEC_SUM_NEXT]])
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 0, [[SCALAR_TOTAL_SUM]]
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP5]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
+; CHECK-NEXT: ret i32 [[TMP6]]
+;
+entry:
+ %cmp21 = icmp sgt i32 %height, 0
+ br i1 %cmp21, label %for.cond1.preheader.preheader, label %for.cond.cleanup
+
+for.cond1.preheader.preheader: ; preds = %entry
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.cond1.preheader
+ %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
+ %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
+ %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %for.cond1.preheader.preheader ]
+ %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %for.cond1.preheader.preheader ]
+ %0 = load <8 x i8>, ptr %pix1.addr.023
+ %1 = load <8 x i8>, ptr %pix2.addr.022
+ %2 = zext <8 x i8> %0 to <8 x i32>
+ %3 = zext <8 x i8> %1 to <8 x i32>
+ %4 = sub nsw <8 x i32> %2, %3
+ %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+ %add.7 = add i32 %sum.024, %5
+ %inc9 = add nuw nsw i32 %y.025, 1
+ %add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
+ %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
+ %exitcond.not = icmp eq i32 %inc9, %height
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.cond1.preheader
+
+for.cond.cleanup.loopexit: ; preds = %for.cond1.preheader
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.7, %for.cond.cleanup.loopexit ]
+ %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
+ ret i32 %6
+}
>From 805a89422c428c87b578ff59b2b92e475480d0d7 Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Tue, 3 Feb 2026 20:03:38 +0800
Subject: [PATCH 2/5] Fix format
---
.../Transforms/Vectorize/LoopReduceMotion.cpp | 2 +-
.../loop-reduce-motion-test.ll | 58 +++++++++----------
2 files changed, 28 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
index 33334a2acfa78..dc9a1223bae02 100644
--- a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
@@ -193,7 +193,7 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
// replace use of phi and erase use empty value
if (!PN.use_empty())
- PN.replaceAllUsesWith(UndefValue::get(PN.getType()));
+ PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
if (PN.use_empty())
PN.eraseFromParent();
RecurrenceInst->replaceAllUsesWith(dyn_cast<Instruction>(LastAdd));
diff --git a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
index dfe6b1ddb1f36..73bea9d6623e8 100644
--- a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
+++ b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
@@ -1,20 +1,19 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name CHECK,CHECK-O,CHECK-O1,CHECK-O2,CHECK-O3,CHECK-O-NEXT --version 5
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; loop-reduce-motion-test.ll
; RUN: opt -passes=loop-reduce-motion -S < %s | FileCheck %s
-define i32 @pixel_asd8(ptr noundef readonly captures(none) %pix1, i64 noundef %stride1, ptr noundef readonly captures(none) %pix2, i64 noundef %stride2, i32 noundef signext %height) local_unnamed_addr #0 {
-; CHECK-LABEL: define i32 @pixel_asd8(
-; CHECK-SAME: ptr noundef readonly captures(none) [[PIX1:%.*]], i64 noundef [[STRIDE1:%.*]], ptr noundef readonly captures(none) [[PIX2:%.*]], i64 noundef [[STRIDE2:%.*]], i32 noundef signext [[HEIGHT:%.*]]) local_unnamed_addr {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[HEIGHT]], 0
-; CHECK-NEXT: br i1 [[CMP21]], label %[[FOR_COND1_PREHEADER_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
-; CHECK: [[FOR_COND1_PREHEADER_PREHEADER]]:
-; CHECK-NEXT: br label %[[FOR_COND1_PREHEADER:.*]]
-; CHECK: [[FOR_COND1_PREHEADER]]:
-; CHECK-NEXT: [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], %[[FOR_COND1_PREHEADER]] ], [ 0, %[[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT: [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[PIX1]], %[[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT: [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[PIX2]], %[[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT: [[VEC_SUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, %[[FOR_COND1_PREHEADER_PREHEADER]] ], [ [[VEC_SUM_NEXT:%.*]], %[[FOR_COND1_PREHEADER]] ]
+define i32 @pixel_asd8(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %height) {
+; CHECK-LABEL: @pixel_asd8(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[HEIGHT:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.cond1.preheader.preheader:
+; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK: for.cond1.preheader:
+; CHECK-NEXT: [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[VEC_SUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[FOR_COND1_PREHEADER_PREHEADER]] ], [ [[VEC_SUM_NEXT:%.*]], [[FOR_COND1_PREHEADER]] ]
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
@@ -22,33 +21,30 @@ define i32 @pixel_asd8(ptr noundef readonly captures(none) %pix1, i64 noundef %
; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP3]]
; CHECK-NEXT: [[VEC_SUM_NEXT]] = add <8 x i32> [[VEC_SUM_PHI]], [[TMP4]]
; CHECK-NEXT: [[INC9]] = add nuw nsw i32 [[Y_025]], 1
-; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1]]
-; CHECK-NEXT: [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2]]
+; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
+; CHECK-NEXT: [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[HEIGHT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[LOOP_EXIT_LANDING:.*]], label %[[FOR_COND1_PREHEADER]]
-; CHECK: [[LOOP_EXIT_LANDING]]:
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[LOOP_EXIT_LANDING:%.*]], label [[FOR_COND1_PREHEADER]]
+; CHECK: loop.exit.landing:
; CHECK-NEXT: [[SCALAR_TOTAL_SUM:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[VEC_SUM_NEXT]])
; CHECK-NEXT: [[TMP5:%.*]] = add i32 0, [[SCALAR_TOTAL_SUM]]
-; CHECK-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]]
-; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]:
-; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
-; CHECK: [[FOR_COND_CLEANUP]]:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP5]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
; CHECK-NEXT: ret i32 [[TMP6]]
;
entry:
%cmp21 = icmp sgt i32 %height, 0
- br i1 %cmp21, label %for.cond1.preheader.preheader, label %for.cond.cleanup
-
-for.cond1.preheader.preheader: ; preds = %entry
- br label %for.cond1.preheader
+ br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.cond1.preheader
- %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
- %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
- %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %for.cond1.preheader.preheader ]
- %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %for.cond1.preheader.preheader ]
+ %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %entry ]
+ %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %entry ]
+ %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %entry ]
+ %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %entry ]
%0 = load <8 x i8>, ptr %pix1.addr.023
%1 = load <8 x i8>, ptr %pix2.addr.022
%2 = zext <8 x i8> %0 to <8 x i32>
>From 2134c689883bb02affee9bc453a19876d58794ff Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Wed, 4 Feb 2026 19:26:03 +0800
Subject: [PATCH 3/5] Fix some format and possible nullptr dyn_cast
---
.../Transforms/Vectorize/LoopReduceMotion.h | 6 ++++--
llvm/lib/Passes/PassRegistry.def | 2 +-
llvm/lib/Transforms/Vectorize/CMakeLists.txt | 2 +-
.../Transforms/Vectorize/LoopReduceMotion.cpp | 17 +++++++++--------
4 files changed, 15 insertions(+), 12 deletions(-)
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h b/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
index 17bd74472700a..df5af76819923 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
@@ -6,14 +6,16 @@
//
//===----------------------------------------------------------------------===//
//
-// This pass is designed to hoist `ReduceCall` operations out of loops to reduce
+// This pass is designed to sink `ReduceCall` operations out of loops to reduce
// the number of instructions within the loop body.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
#define LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
+
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/PassManager.h"
+
namespace llvm {
class LoopReduceMotionPass : public PassInfoMixin<LoopReduceMotionPass> {
public:
@@ -21,4 +23,4 @@ class LoopReduceMotionPass : public PassInfoMixin<LoopReduceMotionPass> {
bool matchAndTransform(Loop &L, DominatorTree &DT, LoopInfo &LI);
};
} // namespace llvm
-#endif
+#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index edd0962052cbf..c896d7c99c107 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -469,6 +469,7 @@ FUNCTION_PASS("loop-data-prefetch", LoopDataPrefetchPass())
FUNCTION_PASS("loop-distribute", LoopDistributePass())
FUNCTION_PASS("loop-fusion", LoopFusePass())
FUNCTION_PASS("loop-load-elim", LoopLoadEliminationPass())
+FUNCTION_PASS("loop-reduce-motion", LoopReduceMotionPass())
FUNCTION_PASS("loop-simplify", LoopSimplifyPass())
FUNCTION_PASS("loop-sink", LoopSinkPass())
FUNCTION_PASS("loop-versioning", LoopVersioningPass())
@@ -553,7 +554,6 @@ FUNCTION_PASS("typepromotion", TypePromotionPass(*TM))
FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass())
FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass())
FUNCTION_PASS("vector-combine", VectorCombinePass())
-FUNCTION_PASS("loop-reduce-motion", LoopReduceMotionPass())
FUNCTION_PASS("verify", VerifierPass())
FUNCTION_PASS("verify<cycles>", CycleInfoVerifierPass())
FUNCTION_PASS("verify<domtree>", DominatorTreeVerifierPass())
diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 406031876a7d0..0fa532010632b 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -1,9 +1,9 @@
add_llvm_component_library(LLVMVectorize
LoadStoreVectorizer.cpp
LoopIdiomVectorize.cpp
+ LoopReduceMotion.cpp
LoopVectorizationLegality.cpp
LoopVectorize.cpp
- LoopReduceMotion.cpp
SandboxVectorizer/DependencyGraph.cpp
SandboxVectorizer/InstrMaps.cpp
SandboxVectorizer/Interval.cpp
diff --git a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
index dc9a1223bae02..fdefcb7e00074 100644
--- a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
@@ -5,10 +5,10 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-// This pass is designed to hoist `ReduceCall` operations out of loops to reduce
+// This pass is designed to sink `ReduceCall` operations out of loops to reduce
// the number of instructions within the loop body.
//
-// Below are the target pattern to be matched and the resulting pattern
+// Below is the target pattern to be matched and the resulting pattern
// after the transformation.
//
// before | after
@@ -160,11 +160,7 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
<< Header->getName() << "!\n");
VectorType *VecTy = cast<VectorType>(VecBin->getType());
- IRBuilder<> PreheaderBuilder(Preheader->getTerminator());
-
- Value *VecZero = PreheaderBuilder.CreateVectorSplat(
- VecTy->getElementCount(), ConstantInt::get(VecTy->getElementType(), 0),
- "vec.zero");
+ Value *VecZero = ConstantInt::get(VecTy, 0);
// build new Vector Add to replace Scalar Add
IRBuilder<> HeaderBuilder(Header, Header->getFirstNonPHIIt());
@@ -196,7 +192,12 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
if (PN.use_empty())
PN.eraseFromParent();
- RecurrenceInst->replaceAllUsesWith(dyn_cast<Instruction>(LastAdd));
+
+ Instruction *FinalNode = dyn_cast<Instruction>(LastAdd);
+ if (!FinalNode)
+ return false;
+ RecurrenceInst->replaceAllUsesWith(FinalNode);
+
if (RecurrenceInst->use_empty())
RecurrenceInst->eraseFromParent();
if (ReduceCall->use_empty())
>From de183c53008b7f4a28aa1cd93c43fb48bd9d96d6 Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Thu, 5 Feb 2026 19:25:25 +0800
Subject: [PATCH 4/5] delete some wrong log
---
llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
index fdefcb7e00074..f9b45ade53676 100644
--- a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
@@ -156,7 +156,7 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
VecBin->getOpcode() != Instruction::Add))
continue;
// pattern match success
- LLVM_DEBUG(dbgs() << "FRM: Found pattern to optimize in loop "
+ LLVM_DEBUG(dbgs() << "Found pattern to optimize in loop "
<< Header->getName() << "!\n");
VectorType *VecTy = cast<VectorType>(VecBin->getType());
>From ac4e3131d3d015a24e8e61c178ceb40d5a693a3b Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Mon, 9 Feb 2026 15:02:58 +0800
Subject: [PATCH 5/5] Put the pass after VectorCombine; add negative tests;
rename the legacy pass class to avoid confusion; switch node deletion to the
RecursivelyDelete functions
---
llvm/lib/Passes/PassBuilderPipelines.cpp | 3 +
llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 1 -
.../Transforms/Vectorize/LoopReduceMotion.cpp | 23 +-
llvm/test/CodeGen/RISCV/O3-pipeline.ll | 1 -
llvm/test/Other/new-pm-defaults.ll | 1 +
llvm/test/Other/new-pm-lto-defaults.ll | 1 +
.../Other/new-pm-thinlto-postlink-defaults.ll | 1 +
.../new-pm-thinlto-postlink-pgo-defaults.ll | 1 +
...-pm-thinlto-postlink-samplepgo-defaults.ll | 1 +
.../loop-reduce-motion-test.ll | 272 +++++++++++++++++-
10 files changed, 287 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 1584d30875570..8579da78be8c8 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -147,6 +147,7 @@
#include "llvm/Transforms/Utils/NameAnonGlobals.h"
#include "llvm/Transforms/Utils/RelLookupTableConverter.h"
#include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
+#include "llvm/Transforms/Vectorize/LoopReduceMotion.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/Transforms/Vectorize/VectorCombine.h"
@@ -1418,6 +1419,8 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
}
// Enhance/cleanup vector code.
FPM.addPass(VectorCombinePass());
+ // Try to sink vector reduce calls out of loops.
+ FPM.addPass(LoopReduceMotionPass());
if (!IsFullLTO) {
FPM.addPass(InstCombinePass());
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 303a502be8cf9..7eb56f52c2e66 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -479,7 +479,6 @@ void RISCVPassConfig::addIRPasses() {
addPass(createRISCVGatherScatterLoweringPass());
addPass(createInterleavedAccessPass());
addPass(createRISCVCodeGenPrepareLegacyPass());
- addPass(createLoopReduceMotionPass());
}
TargetPassConfig::addIRPasses();
diff --git a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
index f9b45ade53676..f56a546c5ceb9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
@@ -46,13 +46,13 @@
using namespace llvm;
-class LoopReduceMotion : public FunctionPass {
+class LoopReduceMotionLegacy : public FunctionPass {
LoopReduceMotionPass Impl;
public:
static char ID;
- LoopReduceMotion() : FunctionPass(ID) {}
+ LoopReduceMotionLegacy() : FunctionPass(ID) {}
StringRef getPassName() const override { return "Loop Reduce Motion Pass"; }
@@ -65,7 +65,7 @@ class LoopReduceMotion : public FunctionPass {
}
};
-char LoopReduceMotion::ID = 0;
+char LoopReduceMotionLegacy::ID = 0;
PreservedAnalyses LoopReduceMotionPass::run(Function &F,
FunctionAnalysisManager &FAM) {
@@ -80,7 +80,7 @@ PreservedAnalyses LoopReduceMotionPass::run(Function &F,
return PreservedAnalyses::none();
}
-bool LoopReduceMotion::runOnFunction(Function &F) {
+bool LoopReduceMotionLegacy::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
@@ -187,21 +187,16 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
? LandingPadBuilder.CreateAdd(PreheaderValue, ScalarTotalSum)
: ScalarTotalSum;
- // replace use of phi and erase use empty value
+ // Replace any remaining uses of the PHI with poison, then delete the dead PHI node.
if (!PN.use_empty())
PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
- if (PN.use_empty())
- PN.eraseFromParent();
-
+ llvm::RecursivelyDeleteDeadPHINode(&PN);
+ // Replace all uses of the recurrence instruction with the final add, then delete it if it is trivially dead.
Instruction *FinalNode = dyn_cast<Instruction>(LastAdd);
if (!FinalNode)
return false;
RecurrenceInst->replaceAllUsesWith(FinalNode);
-
- if (RecurrenceInst->use_empty())
- RecurrenceInst->eraseFromParent();
- if (ReduceCall->use_empty())
- ReduceCall->eraseFromParent();
+ llvm::RecursivelyDeleteTriviallyDeadInstructions(RecurrenceInst);
return true;
}
@@ -209,5 +204,5 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
}
FunctionPass *llvm::createLoopReduceMotionPass() {
- return new LoopReduceMotion();
+ return new LoopReduceMotionLegacy();
}
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 945b9bf7fd3e0..3b63c1d86d3b1 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -39,7 +39,6 @@
; CHECK-NEXT: RISC-V gather/scatter lowering
; CHECK-NEXT: Interleaved Access Pass
; CHECK-NEXT: RISC-V CodeGenPrepare
-; CHECK-NEXT: Loop Reduce Motion Pass
; CHECK-NEXT: Module Verifier
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
; CHECK-NEXT: Canonicalize natural loops
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index f074b2fdd3ab8..9af05eb49e65e 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -269,6 +269,7 @@
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: LoopReduceMotionPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopUnrollPass
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll
index de0feca55e5b2..cebbe671e17b3 100644
--- a/llvm/test/Other/new-pm-lto-defaults.ll
+++ b/llvm/test/Other/new-pm-lto-defaults.ll
@@ -143,6 +143,7 @@
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass on foo
; CHECK-OS-NEXT: Running pass: SLPVectorizerPass on foo
; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass on foo
+; CHECK-O23SZ-NEXT: Running pass: LoopReduceMotionPass on foo
; CHECK-O23SZ-NEXT: Running pass: InferAlignmentPass on foo
; CHECK-O23SZ-NEXT: Running pass: InstCombinePass on foo
; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
index b0d08316de4f0..4065cead7c264 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
@@ -188,6 +188,7 @@
; CHECK-POSTLINK-O3-NEXT: Running pass: SLPVectorizerPass
; CHECK-POSTLINK-Os-NEXT: Running pass: SLPVectorizerPass
; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-POSTLINK-O-NEXT: Running pass: LoopReduceMotionPass
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass
; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
index 6b3e82a752899..126caf7eed3ab 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -173,6 +173,7 @@
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: LoopReduceMotionPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopUnrollPass
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
index 88dc18f605ce2..e9879a512a9b9 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -182,6 +182,7 @@
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: LoopReduceMotionPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopUnrollPass
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
diff --git a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
index 73bea9d6623e8..859a19e701fa8 100644
--- a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
+++ b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
@@ -2,8 +2,8 @@
; loop-reduce-motion-test.ll
; RUN: opt -passes=loop-reduce-motion -S < %s | FileCheck %s
-define i32 @pixel_asd8(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %height) {
-; CHECK-LABEL: @pixel_asd8(
+define i32 @func_with_VecBin_Sub(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %height) {
+; CHECK-LABEL: @func_with_VecBin_Sub(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[HEIGHT:%.*]], 0
; CHECK-NEXT: br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
@@ -66,3 +66,271 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo
%6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
ret i32 %6
}
+
+define i32 @func_with_VecBin_add(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %height) {
+; CHECK-LABEL: @func_with_VecBin_add(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[HEIGHT:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.cond1.preheader.preheader:
+; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK: for.cond1.preheader:
+; CHECK-NEXT: [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[VEC_SUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[FOR_COND1_PREHEADER_PREHEADER]] ], [ [[VEC_SUM_NEXT:%.*]], [[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw <8 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[VEC_SUM_NEXT]] = add <8 x i32> [[VEC_SUM_PHI]], [[TMP4]]
+; CHECK-NEXT: [[INC9]] = add nuw nsw i32 [[Y_025]], 1
+; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
+; CHECK-NEXT: [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[HEIGHT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[LOOP_EXIT_LANDING:%.*]], label [[FOR_COND1_PREHEADER]]
+; CHECK: loop.exit.landing:
+; CHECK-NEXT: [[SCALAR_TOTAL_SUM:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[VEC_SUM_NEXT]])
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 0, [[SCALAR_TOTAL_SUM]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
+; CHECK-NEXT: ret i32 [[TMP6]]
+;
+entry:
+ %cmp21 = icmp sgt i32 %height, 0
+ br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond1.preheader: ; preds = %for.cond1.preheader, %entry
+ %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %entry ]
+ %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %entry ]
+ %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %entry ]
+ %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %entry ]
+ %0 = load <8 x i8>, ptr %pix1.addr.023
+ %1 = load <8 x i8>, ptr %pix2.addr.022
+ %2 = zext <8 x i8> %0 to <8 x i32>
+ %3 = zext <8 x i8> %1 to <8 x i32>
+ %4 = add nsw <8 x i32> %2, %3
+ %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+ %add.7 = add i32 %sum.024, %5
+ %inc9 = add nuw nsw i32 %y.025, 1
+ %add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
+ %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
+ %exitcond.not = icmp eq i32 %inc9, %height
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.cond1.preheader
+
+for.cond.cleanup.loopexit: ; preds = %for.cond1.preheader
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.7, %for.cond.cleanup.loopexit ]
+ %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
+ ret i32 %6
+}
+
+define i32 @multi_exit(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %val1, i32 %val2) {
+; CHECK-LABEL: @multi_exit(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[VAL1:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP21]], label [[FOR_COND1_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.cond1.preheader:
+; CHECK-NEXT: [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[NEXT_COND1:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[SUM_024:%.*]] = phi i32 [ [[ADD_7:%.*]], [[NEXT_COND1]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[NEXT_COND1]] ], [ [[PIX1:%.*]], [[ENTRY]] ]
+; CHECK-NEXT: [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[NEXT_COND1]] ], [ [[PIX2:%.*]], [[ENTRY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
+; CHECK-NEXT: [[ADD_7]] = add i32 [[SUM_024]], [[TMP5]]
+; CHECK-NEXT: [[INC9]] = add nuw nsw i32 [[Y_025]], 1
+; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
+; CHECK-NEXT: [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
+; CHECK-NEXT: br label [[NEXT_COND0:%.*]]
+; CHECK: next.cond0:
+; CHECK-NEXT: [[ADD_8:%.*]] = add i32 [[ADD_7]], 1
+; CHECK-NEXT: [[EXIT1:%.*]] = icmp eq i32 [[INC9]], [[VAL1]]
+; CHECK-NEXT: br i1 [[EXIT1]], label [[FOR_COND_CLEANUP]], label [[NEXT_COND1]]
+; CHECK: next.cond1:
+; CHECK-NEXT: [[ADD_9:%.*]] = add i32 [[ADD_7]], 2
+; CHECK-NEXT: [[EXIT2:%.*]] = icmp eq i32 [[INC9]], [[VAL2:%.*]]
+; CHECK-NEXT: br i1 [[EXIT2]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD_8]], [[NEXT_COND0]] ], [ [[ADD_9]], [[NEXT_COND1]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
+; CHECK-NEXT: ret i32 [[TMP6]]
+;
+entry:
+ %cmp21 = icmp sgt i32 %val1, 0
+ br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond1.preheader: ; preds = %next.cond1, %entry
+ %y.025 = phi i32 [ %inc9, %next.cond1 ], [ 0, %entry ]
+ %sum.024 = phi i32 [ %add.7, %next.cond1 ], [ 0, %entry ]
+ %pix1.addr.023 = phi ptr [ %add.ptr, %next.cond1 ], [ %pix1, %entry ]
+ %pix2.addr.022 = phi ptr [ %add.ptr10, %next.cond1 ], [ %pix2, %entry ]
+ %0 = load <8 x i8>, ptr %pix1.addr.023
+ %1 = load <8 x i8>, ptr %pix2.addr.022
+ %2 = zext <8 x i8> %0 to <8 x i32>
+ %3 = zext <8 x i8> %1 to <8 x i32>
+ %4 = sub nsw <8 x i32> %2, %3
+ %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+ %add.7 = add i32 %sum.024, %5
+ %inc9 = add nuw nsw i32 %y.025, 1
+ %add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
+ %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
+ br label %next.cond0
+
+next.cond0:
+ %add.8 = add i32 %add.7, 1
+ %exit1 = icmp eq i32 %inc9, %val1
+ br i1 %exit1, label %for.cond.cleanup, label %next.cond1
+
+next.cond1:
+ %add.9 = add i32 %add.7, 2
+ %exit2 = icmp eq i32 %inc9, %val2
+ br i1 %exit2, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond.cleanup: ; preds = %next.cond1, %next.cond0, %entry
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.8, %next.cond0 ], [%add.9, %next.cond1 ]
+ %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
+ ret i32 %6
+}
+
+define i32 @phi_not_reduction_call(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %val1) {
+; CHECK-LABEL: @phi_not_reduction_call(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[VAL1:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.cond1.preheader.preheader:
+; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK: for.cond1.preheader:
+; CHECK-NEXT: [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[SUM_024:%.*]] = phi i32 [ [[ADD_7:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
+; CHECK-NEXT: [[ADD_7]] = add i32 [[SUM_024]], [[TMP5]]
+; CHECK-NEXT: [[INC9]] = add nuw nsw i32 [[Y_025]], 1
+; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
+; CHECK-NEXT: [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
+; CHECK-NEXT: [[EXIT:%.*]] = icmp eq i32 [[INC9]], [[VAL1]]
+; CHECK-NEXT: br i1 [[EXIT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_COND1_PREHEADER]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_7]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_024]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
+; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[SUM]]
+; CHECK-NEXT: ret i32 [[TMP7]]
+;
+entry:
+ %cmp21 = icmp sgt i32 %val1, 0
+ br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond1.preheader: ; preds = %for.cond1.preheader, %entry
+ %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %entry ]
+ %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %entry ]
+ %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %entry ]
+ %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %entry ]
+ %0 = load <8 x i8>, ptr %pix1.addr.023
+ %1 = load <8 x i8>, ptr %pix2.addr.022
+ %2 = zext <8 x i8> %0 to <8 x i32>
+ %3 = zext <8 x i8> %1 to <8 x i32>
+ %4 = sub nsw <8 x i32> %2, %3
+ %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+ %add.7 = add i32 %sum.024, %5
+ %inc9 = add nuw nsw i32 %y.025, 1
+ %add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
+ %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
+ %exit = icmp eq i32 %inc9, %val1
+ br i1 %exit, label %for.cond.cleanup.loopexit, label %for.cond1.preheader
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [%add.7, %for.cond.cleanup.loopexit ]
+ %sum = phi i32 [0, %entry], [ %sum.024, %for.cond.cleanup.loopexit]
+ %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
+ %7 = add i32 %6, %sum
+ ret i32 %7
+}
+
+define i32 @reduction_call_not_add(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %val1) {
+; CHECK-LABEL: @reduction_call_not_add(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[VAL1:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.cond1.preheader.preheader:
+; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK: for.cond1.preheader:
+; CHECK-NEXT: [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[SUM_024:%.*]] = phi i32 [ [[ADD_7:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
+; CHECK-NEXT: [[ADD_7]] = sub i32 [[SUM_024]], [[TMP5]]
+; CHECK-NEXT: [[INC9]] = add nuw nsw i32 [[Y_025]], 1
+; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
+; CHECK-NEXT: [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
+; CHECK-NEXT: [[EXIT:%.*]] = icmp eq i32 [[INC9]], [[VAL1]]
+; CHECK-NEXT: br i1 [[EXIT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_COND1_PREHEADER]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_7]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_024]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
+; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[SUM]]
+; CHECK-NEXT: ret i32 [[TMP7]]
+;
+entry:
+ %cmp21 = icmp sgt i32 %val1, 0
+ br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond1.preheader: ; preds = %for.cond1.preheader, %entry
+ %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %entry ]
+ %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %entry ]
+ %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %entry ]
+ %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %entry ]
+ %0 = load <8 x i8>, ptr %pix1.addr.023
+ %1 = load <8 x i8>, ptr %pix2.addr.022
+ %2 = zext <8 x i8> %0 to <8 x i32>
+ %3 = zext <8 x i8> %1 to <8 x i32>
+ %4 = sub nsw <8 x i32> %2, %3
+ %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+ %add.7 = sub i32 %sum.024, %5
+ %inc9 = add nuw nsw i32 %y.025, 1
+ %add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
+ %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
+ %exit = icmp eq i32 %inc9, %val1
+ br i1 %exit, label %for.cond.cleanup.loopexit, label %for.cond1.preheader
+
+for.cond.cleanup.loopexit: ; preds = %for.cond1.preheader
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [%add.7, %for.cond.cleanup.loopexit ]
+ %sum = phi i32 [0, %entry], [ %sum.024, %for.cond.cleanup.loopexit]
+ %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
+ %7 = add i32 %6, %sum
+ ret i32 %7
+}
More information about the llvm-commits
mailing list