[llvm] [LoopReduceMotion] Improve loop by extract reduction instruction (PR #179215)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 4 23:35:10 PST 2026
https://github.com/Anjian-Wen updated https://github.com/llvm/llvm-project/pull/179215
>From ef12d82a225c2475e3a9af50a01e7dcee07e6114 Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Mon, 2 Feb 2026 19:36:42 +0800
Subject: [PATCH 1/7] [RISCV] Improve loop by extract reduction instruction
with vector_reduce_add in some pattern
---
llvm/include/llvm/CodeGen/Passes.h | 4 +
.../Transforms/Vectorize/LoopReduceMotion.h | 24 ++
llvm/lib/Passes/PassBuilder.cpp | 1 +
llvm/lib/Passes/PassRegistry.def | 1 +
llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 1 +
llvm/lib/Transforms/Vectorize/CMakeLists.txt | 1 +
.../Transforms/Vectorize/LoopReduceMotion.cpp | 212 ++++++++++++++++++
llvm/test/CodeGen/RISCV/O3-pipeline.ll | 1 +
.../loop-reduce-motion-test.ll | 72 ++++++
9 files changed, 317 insertions(+)
create mode 100644 llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
create mode 100644 llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
create mode 100644 llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 2717110e1b3e7..2bd8e843f8c13 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -629,6 +629,10 @@ LLVM_ABI ModulePass *createWindowsSecureHotPatchingPass();
/// Lowers KCFI operand bundles for indirect calls.
LLVM_ABI FunctionPass *createKCFIPass();
+
+/// This pass is designed to sink ReduceCall operations out of loops to
+/// reduce the number of instructions within the loop body.
+LLVM_ABI FunctionPass *createLoopReduceMotionPass();
} // namespace llvm
#endif
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h b/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
new file mode 100644
index 0000000000000..17bd74472700a
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
@@ -0,0 +1,24 @@
+//===- LoopReduceMotion.h ---------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is designed to hoist `ReduceCall` operations out of loops to reduce
+// the number of instructions within the loop body.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
+#define LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/PassManager.h"
+namespace llvm {
+class LoopReduceMotionPass : public PassInfoMixin<LoopReduceMotionPass> {
+public:
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+ bool matchAndTransform(Loop &L, DominatorTree &DT, LoopInfo &LI);
+};
+} // namespace llvm
+#endif
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 45955426d66a0..e25868697a030 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -379,6 +379,7 @@
#include "llvm/Transforms/Utils/UnifyLoopExits.h"
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
+#include "llvm/Transforms/Vectorize/LoopReduceMotion.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 2cfb5b2592601..edd0962052cbf 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -553,6 +553,7 @@ FUNCTION_PASS("typepromotion", TypePromotionPass(*TM))
FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass())
FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass())
FUNCTION_PASS("vector-combine", VectorCombinePass())
+FUNCTION_PASS("loop-reduce-motion", LoopReduceMotionPass())
FUNCTION_PASS("verify", VerifierPass())
FUNCTION_PASS("verify<cycles>", CycleInfoVerifierPass())
FUNCTION_PASS("verify<domtree>", DominatorTreeVerifierPass())
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 7eb56f52c2e66..303a502be8cf9 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -479,6 +479,7 @@ void RISCVPassConfig::addIRPasses() {
addPass(createRISCVGatherScatterLoweringPass());
addPass(createInterleavedAccessPass());
addPass(createRISCVCodeGenPrepareLegacyPass());
+ addPass(createLoopReduceMotionPass());
}
TargetPassConfig::addIRPasses();
diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 9f4a242214471..406031876a7d0 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -3,6 +3,7 @@ add_llvm_component_library(LLVMVectorize
LoopIdiomVectorize.cpp
LoopVectorizationLegality.cpp
LoopVectorize.cpp
+ LoopReduceMotion.cpp
SandboxVectorizer/DependencyGraph.cpp
SandboxVectorizer/InstrMaps.cpp
SandboxVectorizer/Interval.cpp
diff --git a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
new file mode 100644
index 0000000000000..33334a2acfa78
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
@@ -0,0 +1,212 @@
+//===-------- LoopReduceMotion.cpp - Loop Reduce Motion Optimization ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This pass is designed to hoist `ReduceCall` operations out of loops to reduce
+// the number of instructions within the loop body.
+//
+// Below are the target pattern to be matched and the resulting pattern
+// after the transformation.
+//
+// before | after
+// ------ | ------
+// loop: | loop:
+// ... | ...
+// vc = vecbin va, vb | vc = vecbin va, vb
+// d = reduce_add vc | vsum = vadd vsum, vc
+// sum = add sum, d | ...
+// ... | ...
+// exit: | exit:
+// value = sum | d = reduce_add vsum
+// ... | value = d
+// ... | ...
+// ret | ret
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize/LoopReduceMotion.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Plugins/PassPlugin.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+#define DEBUG_TYPE "loop-reduce-motion"
+
+using namespace llvm;
+
+class LoopReduceMotion : public FunctionPass {
+ LoopReduceMotionPass Impl;
+
+public:
+ static char ID;
+
+ LoopReduceMotion() : FunctionPass(ID) {}
+
+ StringRef getPassName() const override { return "Loop Reduce Motion Pass"; }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+
+char LoopReduceMotion::ID = 0;
+
+PreservedAnalyses LoopReduceMotionPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ bool Changed = false;
+ for (Loop *L : LI) {
+ Changed |= matchAndTransform(*L, DT, LI);
+ }
+ if (!Changed)
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+bool LoopReduceMotion::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
+
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ bool Changed = false;
+ for (Loop *L : LI) {
+ Changed |= Impl.matchAndTransform(*L, DT, LI);
+ }
+ if (!Changed)
+ return false;
+
+ return true;
+}
+
+bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
+ LoopInfo &LI) {
+ BasicBlock *Header = L.getHeader();
+ BasicBlock *Latch = L.getLoopLatch();
+ BasicBlock *ExitBlock = L.getExitBlock();
+ if (!Header || !Latch || !ExitBlock) {
+ LLVM_DEBUG(dbgs() << "LRM: Skipping loop " << Header->getName()
+ << " because it is not a valid loop.\n");
+ return false;
+ }
+ BasicBlock *Preheader = L.getLoopPreheader();
+ if (!Preheader) {
+ Preheader = InsertPreheaderForLoop(&L, &DT, &LI, nullptr, false);
+ if (!Preheader) {
+ LLVM_DEBUG(dbgs() << "LRM: Failed to create a preheader for loop "
+ << Header->getName() << ".\n");
+ return false;
+ }
+ }
+ for (PHINode &PN : Header->phis()) {
+ if (!PN.getType()->isIntegerTy())
+ continue;
+
+ RecurrenceDescriptor RecDesc;
+ if (!RecurrenceDescriptor::isReductionPHI(&PN, &L, RecDesc))
+ continue;
+
+ if (RecDesc.getRecurrenceKind() != RecurKind::Add)
+ continue;
+
+ Value *RecurrenceValueFromPHI = PN.getIncomingValueForBlock(Latch);
+ Instruction *RecurrenceInst = dyn_cast<Instruction>(RecurrenceValueFromPHI);
+ if (!RecurrenceInst || RecurrenceInst->getNumOperands() != 2)
+ continue;
+
+ Value *RecurrenceValue = RecurrenceInst->getOperand(0) == &PN
+ ? RecurrenceInst->getOperand(1)
+ : RecurrenceInst->getOperand(0);
+
+ CallInst *ReduceCall = dyn_cast<CallInst>(RecurrenceValue);
+ if (!ReduceCall)
+ continue;
+ Function *CalledFunc = ReduceCall->getCalledFunction();
+
+ if (!CalledFunc || !CalledFunc->isIntrinsic() ||
+ !(CalledFunc->getIntrinsicID() == Intrinsic::vector_reduce_add))
+ continue;
+
+ Value *ReduceOperand = ReduceCall->getArgOperand(0);
+ Instruction *VecBin = dyn_cast<Instruction>(ReduceOperand);
+ if (!VecBin || (VecBin->getOpcode() != Instruction::Sub &&
+ VecBin->getOpcode() != Instruction::Add))
+ continue;
+ // pattern match success
+ LLVM_DEBUG(dbgs() << "FRM: Found pattern to optimize in loop "
+ << Header->getName() << "!\n");
+
+ VectorType *VecTy = cast<VectorType>(VecBin->getType());
+ IRBuilder<> PreheaderBuilder(Preheader->getTerminator());
+
+ Value *VecZero = PreheaderBuilder.CreateVectorSplat(
+ VecTy->getElementCount(), ConstantInt::get(VecTy->getElementType(), 0),
+ "vec.zero");
+
+ // build new Vector Add to replace Scalar Add
+ IRBuilder<> HeaderBuilder(Header, Header->getFirstNonPHIIt());
+ PHINode *VecSumPhi = HeaderBuilder.CreatePHI(VecTy, 2, "vec.sum.phi");
+ VecSumPhi->addIncoming(VecZero, Preheader);
+ IRBuilder<> BodyBuilder(RecurrenceInst);
+ Value *NewVecAdd = BodyBuilder.CreateAdd(VecSumPhi, VecBin, "vec.sum.next");
+ VecSumPhi->addIncoming(NewVecAdd, Latch);
+
+ // Build a landing block outside the loop to hold the final reduce_add.
+ BasicBlock *ExitingBlock =
+ Latch->getTerminator()->getSuccessor(0) == Header ? Latch : Header;
+ if (!L.isLoopExiting(ExitingBlock)) {
+ ExitingBlock = Header;
+ }
+ BasicBlock *LandingPad = SplitEdge(ExitingBlock, ExitBlock, &DT, &LI);
+ LandingPad->setName("loop.exit.landing");
+ IRBuilder<> LandingPadBuilder(LandingPad->getTerminator());
+ Value *ScalarTotalSum = LandingPadBuilder.CreateCall(
+ ReduceCall->getCalledFunction(), NewVecAdd, "scalar.total.sum");
+ Value *PreheaderValue = PN.getIncomingValueForBlock(Preheader);
+ Value *LastAdd =
+ PreheaderValue
+ ? LandingPadBuilder.CreateAdd(PreheaderValue, ScalarTotalSum)
+ : ScalarTotalSum;
+
+ // replace use of phi and erase use empty value
+ if (!PN.use_empty())
+ PN.replaceAllUsesWith(UndefValue::get(PN.getType()));
+ if (PN.use_empty())
+ PN.eraseFromParent();
+ RecurrenceInst->replaceAllUsesWith(dyn_cast<Instruction>(LastAdd));
+ if (RecurrenceInst->use_empty())
+ RecurrenceInst->eraseFromParent();
+ if (ReduceCall->use_empty())
+ ReduceCall->eraseFromParent();
+
+ return true;
+ }
+ return false;
+}
+
+FunctionPass *llvm::createLoopReduceMotionPass() {
+ return new LoopReduceMotion();
+}
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 3b63c1d86d3b1..945b9bf7fd3e0 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -39,6 +39,7 @@
; CHECK-NEXT: RISC-V gather/scatter lowering
; CHECK-NEXT: Interleaved Access Pass
; CHECK-NEXT: RISC-V CodeGenPrepare
+; CHECK-NEXT: Loop Reduce Motion Pass
; CHECK-NEXT: Module Verifier
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
; CHECK-NEXT: Canonicalize natural loops
diff --git a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
new file mode 100644
index 0000000000000..dfe6b1ddb1f36
--- /dev/null
+++ b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name CHECK,CHECK-O,CHECK-O1,CHECK-O2,CHECK-O3,CHECK-O-NEXT --version 5
+; loop-reduce-motion-test.ll
+; RUN: opt -passes=loop-reduce-motion -S < %s | FileCheck %s
+
+define i32 @pixel_asd8(ptr noundef readonly captures(none) %pix1, i64 noundef %stride1, ptr noundef readonly captures(none) %pix2, i64 noundef %stride2, i32 noundef signext %height) local_unnamed_addr #0 {
+; CHECK-LABEL: define i32 @pixel_asd8(
+; CHECK-SAME: ptr noundef readonly captures(none) [[PIX1:%.*]], i64 noundef [[STRIDE1:%.*]], ptr noundef readonly captures(none) [[PIX2:%.*]], i64 noundef [[STRIDE2:%.*]], i32 noundef signext [[HEIGHT:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[HEIGHT]], 0
+; CHECK-NEXT: br i1 [[CMP21]], label %[[FOR_COND1_PREHEADER_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK: [[FOR_COND1_PREHEADER_PREHEADER]]:
+; CHECK-NEXT: br label %[[FOR_COND1_PREHEADER:.*]]
+; CHECK: [[FOR_COND1_PREHEADER]]:
+; CHECK-NEXT: [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], %[[FOR_COND1_PREHEADER]] ], [ 0, %[[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[PIX1]], %[[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[PIX2]], %[[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[VEC_SUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, %[[FOR_COND1_PREHEADER_PREHEADER]] ], [ [[VEC_SUM_NEXT:%.*]], %[[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[VEC_SUM_NEXT]] = add <8 x i32> [[VEC_SUM_PHI]], [[TMP4]]
+; CHECK-NEXT: [[INC9]] = add nuw nsw i32 [[Y_025]], 1
+; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1]]
+; CHECK-NEXT: [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2]]
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[HEIGHT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[LOOP_EXIT_LANDING:.*]], label %[[FOR_COND1_PREHEADER]]
+; CHECK: [[LOOP_EXIT_LANDING]]:
+; CHECK-NEXT: [[SCALAR_TOTAL_SUM:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[VEC_SUM_NEXT]])
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 0, [[SCALAR_TOTAL_SUM]]
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP5]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
+; CHECK-NEXT: ret i32 [[TMP6]]
+;
+entry:
+ %cmp21 = icmp sgt i32 %height, 0
+ br i1 %cmp21, label %for.cond1.preheader.preheader, label %for.cond.cleanup
+
+for.cond1.preheader.preheader: ; preds = %entry
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.cond1.preheader
+ %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
+ %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
+ %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %for.cond1.preheader.preheader ]
+ %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %for.cond1.preheader.preheader ]
+ %0 = load <8 x i8>, ptr %pix1.addr.023
+ %1 = load <8 x i8>, ptr %pix2.addr.022
+ %2 = zext <8 x i8> %0 to <8 x i32>
+ %3 = zext <8 x i8> %1 to <8 x i32>
+ %4 = sub nsw <8 x i32> %2, %3
+ %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+ %add.7 = add i32 %sum.024, %5
+ %inc9 = add nuw nsw i32 %y.025, 1
+ %add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
+ %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
+ %exitcond.not = icmp eq i32 %inc9, %height
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.cond1.preheader
+
+for.cond.cleanup.loopexit: ; preds = %for.cond1.preheader
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.7, %for.cond.cleanup.loopexit ]
+ %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
+ ret i32 %6
+}
>From 805a89422c428c87b578ff59b2b92e475480d0d7 Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Tue, 3 Feb 2026 20:03:38 +0800
Subject: [PATCH 2/7] Fix format
---
.../Transforms/Vectorize/LoopReduceMotion.cpp | 2 +-
.../loop-reduce-motion-test.ll | 58 +++++++++----------
2 files changed, 28 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
index 33334a2acfa78..dc9a1223bae02 100644
--- a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
@@ -193,7 +193,7 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
// replace use of phi and erase use empty value
if (!PN.use_empty())
- PN.replaceAllUsesWith(UndefValue::get(PN.getType()));
+ PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
if (PN.use_empty())
PN.eraseFromParent();
RecurrenceInst->replaceAllUsesWith(dyn_cast<Instruction>(LastAdd));
diff --git a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
index dfe6b1ddb1f36..73bea9d6623e8 100644
--- a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
+++ b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
@@ -1,20 +1,19 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name CHECK,CHECK-O,CHECK-O1,CHECK-O2,CHECK-O3,CHECK-O-NEXT --version 5
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; loop-reduce-motion-test.ll
; RUN: opt -passes=loop-reduce-motion -S < %s | FileCheck %s
-define i32 @pixel_asd8(ptr noundef readonly captures(none) %pix1, i64 noundef %stride1, ptr noundef readonly captures(none) %pix2, i64 noundef %stride2, i32 noundef signext %height) local_unnamed_addr #0 {
-; CHECK-LABEL: define i32 @pixel_asd8(
-; CHECK-SAME: ptr noundef readonly captures(none) [[PIX1:%.*]], i64 noundef [[STRIDE1:%.*]], ptr noundef readonly captures(none) [[PIX2:%.*]], i64 noundef [[STRIDE2:%.*]], i32 noundef signext [[HEIGHT:%.*]]) local_unnamed_addr {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[HEIGHT]], 0
-; CHECK-NEXT: br i1 [[CMP21]], label %[[FOR_COND1_PREHEADER_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
-; CHECK: [[FOR_COND1_PREHEADER_PREHEADER]]:
-; CHECK-NEXT: br label %[[FOR_COND1_PREHEADER:.*]]
-; CHECK: [[FOR_COND1_PREHEADER]]:
-; CHECK-NEXT: [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], %[[FOR_COND1_PREHEADER]] ], [ 0, %[[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT: [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[PIX1]], %[[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT: [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[PIX2]], %[[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT: [[VEC_SUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, %[[FOR_COND1_PREHEADER_PREHEADER]] ], [ [[VEC_SUM_NEXT:%.*]], %[[FOR_COND1_PREHEADER]] ]
+define i32 @pixel_asd8(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %height) {
+; CHECK-LABEL: @pixel_asd8(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[HEIGHT:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.cond1.preheader.preheader:
+; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK: for.cond1.preheader:
+; CHECK-NEXT: [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[VEC_SUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[FOR_COND1_PREHEADER_PREHEADER]] ], [ [[VEC_SUM_NEXT:%.*]], [[FOR_COND1_PREHEADER]] ]
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
@@ -22,33 +21,30 @@ define i32 @pixel_asd8(ptr noundef readonly captures(none) %pix1, i64 noundef %
; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP3]]
; CHECK-NEXT: [[VEC_SUM_NEXT]] = add <8 x i32> [[VEC_SUM_PHI]], [[TMP4]]
; CHECK-NEXT: [[INC9]] = add nuw nsw i32 [[Y_025]], 1
-; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1]]
-; CHECK-NEXT: [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2]]
+; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
+; CHECK-NEXT: [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[HEIGHT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[LOOP_EXIT_LANDING:.*]], label %[[FOR_COND1_PREHEADER]]
-; CHECK: [[LOOP_EXIT_LANDING]]:
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[LOOP_EXIT_LANDING:%.*]], label [[FOR_COND1_PREHEADER]]
+; CHECK: loop.exit.landing:
; CHECK-NEXT: [[SCALAR_TOTAL_SUM:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[VEC_SUM_NEXT]])
; CHECK-NEXT: [[TMP5:%.*]] = add i32 0, [[SCALAR_TOTAL_SUM]]
-; CHECK-NEXT: br label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]]
-; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]:
-; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
-; CHECK: [[FOR_COND_CLEANUP]]:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP5]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
; CHECK-NEXT: ret i32 [[TMP6]]
;
entry:
%cmp21 = icmp sgt i32 %height, 0
- br i1 %cmp21, label %for.cond1.preheader.preheader, label %for.cond.cleanup
-
-for.cond1.preheader.preheader: ; preds = %entry
- br label %for.cond1.preheader
+ br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.cond1.preheader
- %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
- %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
- %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %for.cond1.preheader.preheader ]
- %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %for.cond1.preheader.preheader ]
+ %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %entry ]
+ %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %entry ]
+ %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %entry ]
+ %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %entry ]
%0 = load <8 x i8>, ptr %pix1.addr.023
%1 = load <8 x i8>, ptr %pix2.addr.022
%2 = zext <8 x i8> %0 to <8 x i32>
>From 2134c689883bb02affee9bc453a19876d58794ff Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Wed, 4 Feb 2026 19:26:03 +0800
Subject: [PATCH 3/7] Fix some format and possible nullptr dyn_cast
---
.../Transforms/Vectorize/LoopReduceMotion.h | 6 ++++--
llvm/lib/Passes/PassRegistry.def | 2 +-
llvm/lib/Transforms/Vectorize/CMakeLists.txt | 2 +-
.../Transforms/Vectorize/LoopReduceMotion.cpp | 17 +++++++++--------
4 files changed, 15 insertions(+), 12 deletions(-)
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h b/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
index 17bd74472700a..df5af76819923 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
@@ -6,14 +6,16 @@
//
//===----------------------------------------------------------------------===//
//
-// This pass is designed to hoist `ReduceCall` operations out of loops to reduce
+// This pass is designed to sink `ReduceCall` operations out of loops to reduce
// the number of instructions within the loop body.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
#define LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
+
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/PassManager.h"
+
namespace llvm {
class LoopReduceMotionPass : public PassInfoMixin<LoopReduceMotionPass> {
public:
@@ -21,4 +23,4 @@ class LoopReduceMotionPass : public PassInfoMixin<LoopReduceMotionPass> {
bool matchAndTransform(Loop &L, DominatorTree &DT, LoopInfo &LI);
};
} // namespace llvm
-#endif
+#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index edd0962052cbf..c896d7c99c107 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -469,6 +469,7 @@ FUNCTION_PASS("loop-data-prefetch", LoopDataPrefetchPass())
FUNCTION_PASS("loop-distribute", LoopDistributePass())
FUNCTION_PASS("loop-fusion", LoopFusePass())
FUNCTION_PASS("loop-load-elim", LoopLoadEliminationPass())
+FUNCTION_PASS("loop-reduce-motion", LoopReduceMotionPass())
FUNCTION_PASS("loop-simplify", LoopSimplifyPass())
FUNCTION_PASS("loop-sink", LoopSinkPass())
FUNCTION_PASS("loop-versioning", LoopVersioningPass())
@@ -553,7 +554,6 @@ FUNCTION_PASS("typepromotion", TypePromotionPass(*TM))
FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass())
FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass())
FUNCTION_PASS("vector-combine", VectorCombinePass())
-FUNCTION_PASS("loop-reduce-motion", LoopReduceMotionPass())
FUNCTION_PASS("verify", VerifierPass())
FUNCTION_PASS("verify<cycles>", CycleInfoVerifierPass())
FUNCTION_PASS("verify<domtree>", DominatorTreeVerifierPass())
diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 406031876a7d0..0fa532010632b 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -1,9 +1,9 @@
add_llvm_component_library(LLVMVectorize
LoadStoreVectorizer.cpp
LoopIdiomVectorize.cpp
+ LoopReduceMotion.cpp
LoopVectorizationLegality.cpp
LoopVectorize.cpp
- LoopReduceMotion.cpp
SandboxVectorizer/DependencyGraph.cpp
SandboxVectorizer/InstrMaps.cpp
SandboxVectorizer/Interval.cpp
diff --git a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
index dc9a1223bae02..fdefcb7e00074 100644
--- a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
@@ -5,10 +5,10 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-// This pass is designed to hoist `ReduceCall` operations out of loops to reduce
+// This pass is designed to sink `ReduceCall` operations out of loops to reduce
// the number of instructions within the loop body.
//
-// Below are the target pattern to be matched and the resulting pattern
+// Below is the target pattern to be matched and the resulting pattern
// after the transformation.
//
// before | after
@@ -160,11 +160,7 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
<< Header->getName() << "!\n");
VectorType *VecTy = cast<VectorType>(VecBin->getType());
- IRBuilder<> PreheaderBuilder(Preheader->getTerminator());
-
- Value *VecZero = PreheaderBuilder.CreateVectorSplat(
- VecTy->getElementCount(), ConstantInt::get(VecTy->getElementType(), 0),
- "vec.zero");
+ Value *VecZero = ConstantInt::get(VecTy, 0);
// build new Vector Add to replace Scalar Add
IRBuilder<> HeaderBuilder(Header, Header->getFirstNonPHIIt());
@@ -196,7 +192,12 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
if (PN.use_empty())
PN.eraseFromParent();
- RecurrenceInst->replaceAllUsesWith(dyn_cast<Instruction>(LastAdd));
+
+ Instruction *FinalNode = dyn_cast<Instruction>(LastAdd);
+ if (!FinalNode)
+ return false;
+ RecurrenceInst->replaceAllUsesWith(FinalNode);
+
if (RecurrenceInst->use_empty())
RecurrenceInst->eraseFromParent();
if (ReduceCall->use_empty())
>From de183c53008b7f4a28aa1cd93c43fb48bd9d96d6 Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Thu, 5 Feb 2026 19:25:25 +0800
Subject: [PATCH 4/7] delete some wrong log
---
llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
index fdefcb7e00074..f9b45ade53676 100644
--- a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
@@ -156,7 +156,7 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
VecBin->getOpcode() != Instruction::Add))
continue;
// pattern match success
- LLVM_DEBUG(dbgs() << "FRM: Found pattern to optimize in loop "
+ LLVM_DEBUG(dbgs() << "Found pattern to optimize in loop "
<< Header->getName() << "!\n");
VectorType *VecTy = cast<VectorType>(VecBin->getType());
>From ac4e3131d3d015a24e8e61c178ceb40d5a693a3b Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Mon, 9 Feb 2026 15:02:58 +0800
Subject: [PATCH 5/7] Move the pass after VectorCombine; add some negative
 tests; rename the pass class to avoid confusion; replace manual dead-node
 deletion with the RecursivelyDelete* helper functions
---
llvm/lib/Passes/PassBuilderPipelines.cpp | 3 +
llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 1 -
.../Transforms/Vectorize/LoopReduceMotion.cpp | 23 +-
llvm/test/CodeGen/RISCV/O3-pipeline.ll | 1 -
llvm/test/Other/new-pm-defaults.ll | 1 +
llvm/test/Other/new-pm-lto-defaults.ll | 1 +
.../Other/new-pm-thinlto-postlink-defaults.ll | 1 +
.../new-pm-thinlto-postlink-pgo-defaults.ll | 1 +
...-pm-thinlto-postlink-samplepgo-defaults.ll | 1 +
.../loop-reduce-motion-test.ll | 272 +++++++++++++++++-
10 files changed, 287 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 1584d30875570..8579da78be8c8 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -147,6 +147,7 @@
#include "llvm/Transforms/Utils/NameAnonGlobals.h"
#include "llvm/Transforms/Utils/RelLookupTableConverter.h"
#include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
+#include "llvm/Transforms/Vectorize/LoopReduceMotion.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/Transforms/Vectorize/VectorCombine.h"
@@ -1418,6 +1419,8 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
}
// Enhance/cleanup vector code.
FPM.addPass(VectorCombinePass());
+ // Try to sink ReduceCall out of loop
+ FPM.addPass(LoopReduceMotionPass());
if (!IsFullLTO) {
FPM.addPass(InstCombinePass());
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 303a502be8cf9..7eb56f52c2e66 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -479,7 +479,6 @@ void RISCVPassConfig::addIRPasses() {
addPass(createRISCVGatherScatterLoweringPass());
addPass(createInterleavedAccessPass());
addPass(createRISCVCodeGenPrepareLegacyPass());
- addPass(createLoopReduceMotionPass());
}
TargetPassConfig::addIRPasses();
diff --git a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
index f9b45ade53676..f56a546c5ceb9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
@@ -46,13 +46,13 @@
using namespace llvm;
-class LoopReduceMotion : public FunctionPass {
+class LoopReduceMotionLegacy : public FunctionPass {
LoopReduceMotionPass Impl;
public:
static char ID;
- LoopReduceMotion() : FunctionPass(ID) {}
+ LoopReduceMotionLegacy() : FunctionPass(ID) {}
StringRef getPassName() const override { return "Loop Reduce Motion Pass"; }
@@ -65,7 +65,7 @@ class LoopReduceMotion : public FunctionPass {
}
};
-char LoopReduceMotion::ID = 0;
+char LoopReduceMotionLegacy::ID = 0;
PreservedAnalyses LoopReduceMotionPass::run(Function &F,
FunctionAnalysisManager &FAM) {
@@ -80,7 +80,7 @@ PreservedAnalyses LoopReduceMotionPass::run(Function &F,
return PreservedAnalyses::none();
}
-bool LoopReduceMotion::runOnFunction(Function &F) {
+bool LoopReduceMotionLegacy::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
@@ -187,21 +187,16 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
? LandingPadBuilder.CreateAdd(PreheaderValue, ScalarTotalSum)
: ScalarTotalSum;
- // replace use of phi and erase use empty value
+ // delete the dead PHI Node
if (!PN.use_empty())
PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
- if (PN.use_empty())
- PN.eraseFromParent();
-
+ llvm::RecursivelyDeleteDeadPHINode(&PN);
+ // replace the use of Recurrence Node and delete the dead Node
Instruction *FinalNode = dyn_cast<Instruction>(LastAdd);
if (!FinalNode)
return false;
RecurrenceInst->replaceAllUsesWith(FinalNode);
-
- if (RecurrenceInst->use_empty())
- RecurrenceInst->eraseFromParent();
- if (ReduceCall->use_empty())
- ReduceCall->eraseFromParent();
+ llvm::RecursivelyDeleteTriviallyDeadInstructions(RecurrenceInst);
return true;
}
@@ -209,5 +204,5 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
}
FunctionPass *llvm::createLoopReduceMotionPass() {
- return new LoopReduceMotion();
+ return new LoopReduceMotionLegacy();
}
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 945b9bf7fd3e0..3b63c1d86d3b1 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -39,7 +39,6 @@
; CHECK-NEXT: RISC-V gather/scatter lowering
; CHECK-NEXT: Interleaved Access Pass
; CHECK-NEXT: RISC-V CodeGenPrepare
-; CHECK-NEXT: Loop Reduce Motion Pass
; CHECK-NEXT: Module Verifier
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
; CHECK-NEXT: Canonicalize natural loops
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index f074b2fdd3ab8..9af05eb49e65e 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -269,6 +269,7 @@
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: LoopReduceMotionPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopUnrollPass
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll
index de0feca55e5b2..cebbe671e17b3 100644
--- a/llvm/test/Other/new-pm-lto-defaults.ll
+++ b/llvm/test/Other/new-pm-lto-defaults.ll
@@ -143,6 +143,7 @@
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass on foo
; CHECK-OS-NEXT: Running pass: SLPVectorizerPass on foo
; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass on foo
+; CHECK-O23SZ-NEXT: Running pass: LoopReduceMotionPass on foo
; CHECK-O23SZ-NEXT: Running pass: InferAlignmentPass on foo
; CHECK-O23SZ-NEXT: Running pass: InstCombinePass on foo
; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
index b0d08316de4f0..4065cead7c264 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
@@ -188,6 +188,7 @@
; CHECK-POSTLINK-O3-NEXT: Running pass: SLPVectorizerPass
; CHECK-POSTLINK-Os-NEXT: Running pass: SLPVectorizerPass
; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-POSTLINK-O-NEXT: Running pass: LoopReduceMotionPass
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass
; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
index 6b3e82a752899..126caf7eed3ab 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -173,6 +173,7 @@
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: LoopReduceMotionPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopUnrollPass
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
index 88dc18f605ce2..e9879a512a9b9 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -182,6 +182,7 @@
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: LoopReduceMotionPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopUnrollPass
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
diff --git a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
index 73bea9d6623e8..859a19e701fa8 100644
--- a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
+++ b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
@@ -2,8 +2,8 @@
; loop-reduce-motion-test.ll
; RUN: opt -passes=loop-reduce-motion -S < %s | FileCheck %s
-define i32 @pixel_asd8(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %height) {
-; CHECK-LABEL: @pixel_asd8(
+define i32 @func_with_VecBin_Sub(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %height) {
+; CHECK-LABEL: @func_with_VecBin_Sub(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[HEIGHT:%.*]], 0
; CHECK-NEXT: br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
@@ -66,3 +66,271 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo
%6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
ret i32 %6
}
+
+define i32 @func_with_VecBin_add(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %height) {
+; CHECK-LABEL: @func_with_VecBin_add(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[HEIGHT:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.cond1.preheader.preheader:
+; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK: for.cond1.preheader:
+; CHECK-NEXT: [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[VEC_SUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[FOR_COND1_PREHEADER_PREHEADER]] ], [ [[VEC_SUM_NEXT:%.*]], [[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw <8 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[VEC_SUM_NEXT]] = add <8 x i32> [[VEC_SUM_PHI]], [[TMP4]]
+; CHECK-NEXT: [[INC9]] = add nuw nsw i32 [[Y_025]], 1
+; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
+; CHECK-NEXT: [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[HEIGHT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[LOOP_EXIT_LANDING:%.*]], label [[FOR_COND1_PREHEADER]]
+; CHECK: loop.exit.landing:
+; CHECK-NEXT: [[SCALAR_TOTAL_SUM:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[VEC_SUM_NEXT]])
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 0, [[SCALAR_TOTAL_SUM]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
+; CHECK-NEXT: ret i32 [[TMP6]]
+;
+entry:
+ %cmp21 = icmp sgt i32 %height, 0
+ br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.cond1.preheader
+ %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %entry ]
+ %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %entry ]
+ %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %entry ]
+ %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %entry ]
+ %0 = load <8 x i8>, ptr %pix1.addr.023
+ %1 = load <8 x i8>, ptr %pix2.addr.022
+ %2 = zext <8 x i8> %0 to <8 x i32>
+ %3 = zext <8 x i8> %1 to <8 x i32>
+ %4 = add nsw <8 x i32> %2, %3
+ %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+ %add.7 = add i32 %sum.024, %5
+ %inc9 = add nuw nsw i32 %y.025, 1
+ %add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
+ %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
+ %exitcond.not = icmp eq i32 %inc9, %height
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.cond1.preheader
+
+for.cond.cleanup.loopexit: ; preds = %for.cond1.preheader
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.7, %for.cond.cleanup.loopexit ]
+ %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
+ ret i32 %6
+}
+
+define i32 @multi_exit(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %val1, i32 %val2) {
+; CHECK-LABEL: @multi_exit(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[VAL1:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP21]], label [[FOR_COND1_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.cond1.preheader:
+; CHECK-NEXT: [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[NEXT_COND1:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[SUM_024:%.*]] = phi i32 [ [[ADD_7:%.*]], [[NEXT_COND1]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[NEXT_COND1]] ], [ [[PIX1:%.*]], [[ENTRY]] ]
+; CHECK-NEXT: [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[NEXT_COND1]] ], [ [[PIX2:%.*]], [[ENTRY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
+; CHECK-NEXT: [[ADD_7]] = add i32 [[SUM_024]], [[TMP5]]
+; CHECK-NEXT: [[INC9]] = add nuw nsw i32 [[Y_025]], 1
+; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
+; CHECK-NEXT: [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
+; CHECK-NEXT: br label [[NEXT_COND0:%.*]]
+; CHECK: next.cond0:
+; CHECK-NEXT: [[ADD_8:%.*]] = add i32 [[ADD_7]], 1
+; CHECK-NEXT: [[EXIT1:%.*]] = icmp eq i32 [[INC9]], [[VAL1]]
+; CHECK-NEXT: br i1 [[EXIT1]], label [[FOR_COND_CLEANUP]], label [[NEXT_COND1]]
+; CHECK: next.cond1:
+; CHECK-NEXT: [[ADD_9:%.*]] = add i32 [[ADD_7]], 2
+; CHECK-NEXT: [[EXIT2:%.*]] = icmp eq i32 [[INC9]], [[VAL2:%.*]]
+; CHECK-NEXT: br i1 [[EXIT2]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD_8]], [[NEXT_COND0]] ], [ [[ADD_9]], [[NEXT_COND1]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
+; CHECK-NEXT: ret i32 [[TMP6]]
+;
+entry:
+ %cmp21 = icmp sgt i32 %val1, 0
+ br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.cond1.preheader
+ %y.025 = phi i32 [ %inc9, %next.cond1 ], [ 0, %entry ]
+ %sum.024 = phi i32 [ %add.7, %next.cond1 ], [ 0, %entry ]
+ %pix1.addr.023 = phi ptr [ %add.ptr, %next.cond1 ], [ %pix1, %entry ]
+ %pix2.addr.022 = phi ptr [ %add.ptr10, %next.cond1 ], [ %pix2, %entry ]
+ %0 = load <8 x i8>, ptr %pix1.addr.023
+ %1 = load <8 x i8>, ptr %pix2.addr.022
+ %2 = zext <8 x i8> %0 to <8 x i32>
+ %3 = zext <8 x i8> %1 to <8 x i32>
+ %4 = sub nsw <8 x i32> %2, %3
+ %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+ %add.7 = add i32 %sum.024, %5
+ %inc9 = add nuw nsw i32 %y.025, 1
+ %add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
+ %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
+ br label %next.cond0
+
+next.cond0:
+ %add.8 = add i32 %add.7, 1
+ %exit1 = icmp eq i32 %inc9, %val1
+ br i1 %exit1, label %for.cond.cleanup, label %next.cond1
+
+next.cond1:
+ %add.9 = add i32 %add.7, 2
+ %exit2 = icmp eq i32 %inc9, %val2
+ br i1 %exit2, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.8, %next.cond0 ], [%add.9, %next.cond1 ]
+ %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
+ ret i32 %6
+}
+
+define i32 @phi_not_reduction_call(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %val1) {
+; CHECK-LABEL: @phi_not_reduction_call(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[VAL1:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.cond1.preheader.preheader:
+; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK: for.cond1.preheader:
+; CHECK-NEXT: [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[SUM_024:%.*]] = phi i32 [ [[ADD_7:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
+; CHECK-NEXT: [[ADD_7]] = add i32 [[SUM_024]], [[TMP5]]
+; CHECK-NEXT: [[INC9]] = add nuw nsw i32 [[Y_025]], 1
+; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
+; CHECK-NEXT: [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
+; CHECK-NEXT: [[EXIT:%.*]] = icmp eq i32 [[INC9]], [[VAL1]]
+; CHECK-NEXT: br i1 [[EXIT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_COND1_PREHEADER]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_7]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_024]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
+; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[SUM]]
+; CHECK-NEXT: ret i32 [[TMP7]]
+;
+entry:
+ %cmp21 = icmp sgt i32 %val1, 0
+ br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond1.preheader: ; preds = %for.cond1.preheader
+ %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %entry ]
+ %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %entry ]
+ %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %entry ]
+ %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %entry ]
+ %0 = load <8 x i8>, ptr %pix1.addr.023
+ %1 = load <8 x i8>, ptr %pix2.addr.022
+ %2 = zext <8 x i8> %0 to <8 x i32>
+ %3 = zext <8 x i8> %1 to <8 x i32>
+ %4 = sub nsw <8 x i32> %2, %3
+ %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+ %add.7 = add i32 %sum.024, %5
+ %inc9 = add nuw nsw i32 %y.025, 1
+ %add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
+ %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
+ %exit = icmp eq i32 %inc9, %val1
+ br i1 %exit, label %for.cond.cleanup.loopexit, label %for.cond1.preheader
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [%add.7, %for.cond.cleanup.loopexit ]
+ %sum = phi i32 [0, %entry], [ %sum.024, %for.cond.cleanup.loopexit]
+ %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
+ %7 = add i32 %6, %sum
+ ret i32 %7
+}
+
+define i32 @reduction_call_not_add(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %val1) {
+; CHECK-LABEL: @reduction_call_not_add(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[VAL1:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.cond1.preheader.preheader:
+; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK: for.cond1.preheader:
+; CHECK-NEXT: [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[SUM_024:%.*]] = phi i32 [ [[ADD_7:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
+; CHECK-NEXT: [[ADD_7]] = sub i32 [[SUM_024]], [[TMP5]]
+; CHECK-NEXT: [[INC9]] = add nuw nsw i32 [[Y_025]], 1
+; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
+; CHECK-NEXT: [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
+; CHECK-NEXT: [[EXIT:%.*]] = icmp eq i32 [[INC9]], [[VAL1]]
+; CHECK-NEXT: br i1 [[EXIT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_COND1_PREHEADER]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_7]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_024]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
+; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[SUM]]
+; CHECK-NEXT: ret i32 [[TMP7]]
+;
+entry:
+ %cmp21 = icmp sgt i32 %val1, 0
+ br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond1.preheader: ; preds = %for.cond1.preheader
+ %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %entry ]
+ %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %entry ]
+ %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %entry ]
+ %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %entry ]
+ %0 = load <8 x i8>, ptr %pix1.addr.023
+ %1 = load <8 x i8>, ptr %pix2.addr.022
+ %2 = zext <8 x i8> %0 to <8 x i32>
+ %3 = zext <8 x i8> %1 to <8 x i32>
+ %4 = sub nsw <8 x i32> %2, %3
+ %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+ %add.7 = sub i32 %sum.024, %5
+ %inc9 = add nuw nsw i32 %y.025, 1
+ %add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
+ %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
+ %exit = icmp eq i32 %inc9, %val1
+ br i1 %exit, label %for.cond.cleanup.loopexit, label %for.cond1.preheader
+
+for.cond.cleanup.loopexit: ; preds = %for.cond1.preheader
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [%add.7, %for.cond.cleanup.loopexit ]
+ %sum = phi i32 [0, %entry], [ %sum.024, %for.cond.cleanup.loopexit]
+ %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
+ %7 = add i32 %6, %sum
+ ret i32 %7
+}
>From 237b245dfb33ccc61e9e78d0de44defa3a8fa2c9 Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Mon, 9 Feb 2026 16:33:38 +0800
Subject: [PATCH 6/7] update comment
---
llvm/include/llvm/CodeGen/Passes.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 2bd8e843f8c13..0745f0a408d67 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -630,7 +630,7 @@ LLVM_ABI ModulePass *createWindowsSecureHotPatchingPass();
/// Lowers KCFI operand bundles for indirect calls.
LLVM_ABI FunctionPass *createKCFIPass();
-/// This pass is designed to hoist ReduceCall operations out of loops to
+/// This pass is designed to sink ReduceCall operations out of loops to
/// reduce the number of instructions within the loop body.
LLVM_ABI FunctionPass *createLoopReduceMotionPass();
} // namespace llvm
>From be1f11eafc39139991621c4153ce559e12508a68 Mon Sep 17 00:00:00 2001
From: Anjian-Wen <wenanjian at bytedance.com>
Date: Thu, 5 Mar 2026 15:31:01 +0800
Subject: [PATCH 7/7] Convert the pass to a loop pass and fix some errors
---
llvm/include/llvm/CodeGen/Passes.h | 4 -
.../Transforms/Vectorize/LoopReduceMotion.h | 7 +-
llvm/lib/Passes/PassBuilderPipelines.cpp | 5 +-
llvm/lib/Passes/PassRegistry.def | 2 +-
.../Transforms/Vectorize/LoopReduceMotion.cpp | 171 ++++++++---------
llvm/test/Other/new-pm-defaults.ll | 2 +
llvm/test/Other/new-pm-lto-defaults.ll | 8 +-
.../Other/new-pm-thinlto-postlink-defaults.ll | 2 +
.../new-pm-thinlto-postlink-pgo-defaults.ll | 2 +
...-pm-thinlto-postlink-samplepgo-defaults.ll | 2 +
.../loop-reduce-motion-test.ll | 143 +++++++-------
.../PhaseOrdering/AArch64/udotabd.ll | 174 +++++++-----------
12 files changed, 245 insertions(+), 277 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 0745f0a408d67..2717110e1b3e7 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -629,10 +629,6 @@ LLVM_ABI ModulePass *createWindowsSecureHotPatchingPass();
/// Lowers KCFI operand bundles for indirect calls.
LLVM_ABI FunctionPass *createKCFIPass();
-
-/// This pass is designed to sink ReduceCall operations out of loops to
-/// reduce the number of instructions within the loop body.
-LLVM_ABI FunctionPass *createLoopReduceMotionPass();
} // namespace llvm
#endif
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h b/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
index df5af76819923..26d178c63b0c9 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopReduceMotion.h
@@ -13,14 +13,17 @@
#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
#define LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
+#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
namespace llvm {
class LoopReduceMotionPass : public PassInfoMixin<LoopReduceMotionPass> {
public:
- PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
- bool matchAndTransform(Loop &L, DominatorTree &DT, LoopInfo &LI);
+ bool matchAndTransform(Loop &L, DominatorTree *DT, LoopInfo *LI);
+ PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &U);
};
} // namespace llvm
#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPREDUCEMOTION_H
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 8579da78be8c8..4897cc950322f 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1419,8 +1419,11 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
}
// Enhance/cleanup vector code.
FPM.addPass(VectorCombinePass());
+
+ LoopPassManager LPM;
// Try to sink ReduceCall out of loop
- FPM.addPass(LoopReduceMotionPass());
+ LPM.addPass(LoopReduceMotionPass());
+ FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM)));
if (!IsFullLTO) {
FPM.addPass(InstCombinePass());
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index c896d7c99c107..5d8e314de4ce1 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -469,7 +469,6 @@ FUNCTION_PASS("loop-data-prefetch", LoopDataPrefetchPass())
FUNCTION_PASS("loop-distribute", LoopDistributePass())
FUNCTION_PASS("loop-fusion", LoopFusePass())
FUNCTION_PASS("loop-load-elim", LoopLoadEliminationPass())
-FUNCTION_PASS("loop-reduce-motion", LoopReduceMotionPass())
FUNCTION_PASS("loop-simplify", LoopSimplifyPass())
FUNCTION_PASS("loop-sink", LoopSinkPass())
FUNCTION_PASS("loop-versioning", LoopVersioningPass())
@@ -777,6 +776,7 @@ LOOP_PASS("loop-idiom-vectorize", LoopIdiomVectorizePass())
LOOP_PASS("loop-instsimplify", LoopInstSimplifyPass())
LOOP_PASS("loop-predication", LoopPredicationPass())
LOOP_PASS("loop-reduce", LoopStrengthReducePass())
+LOOP_PASS("loop-reduce-motion", LoopReduceMotionPass())
LOOP_PASS("loop-term-fold", LoopTermFoldPass())
LOOP_PASS("loop-simplifycfg", LoopSimplifyCFGPass())
LOOP_PASS("loop-unroll-full", LoopFullUnrollPass())
diff --git a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
index f56a546c5ceb9..e54ba56ec3e32 100644
--- a/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopReduceMotion.cpp
@@ -15,8 +15,7 @@
// ------ | ------
// loop: | loop:
// ... | ...
-// vc = vecbin va, vb | vc = vecbin va, vb
-// d = reduce_add vc | vsum = vadd vsum, vc
+// d = reduce_add v | vsum = vadd vsum, v
// sum = add sum, d | ...
// ... | ...
// exit: | exit:
@@ -35,6 +34,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Plugins/PassPlugin.h"
@@ -45,68 +45,24 @@
#define DEBUG_TYPE "loop-reduce-motion"
using namespace llvm;
+PreservedAnalyses LoopReduceMotionPass::run(Loop &L, LoopAnalysisManager &LAM,
+ LoopStandardAnalysisResults &LAR,
+ LPMUpdater &Updater) {
-class LoopReduceMotionLegacy : public FunctionPass {
- LoopReduceMotionPass Impl;
+ bool Changed = matchAndTransform(L, &LAR.DT, &LAR.LI);
-public:
- static char ID;
-
- LoopReduceMotionLegacy() : FunctionPass(ID) {}
-
- StringRef getPassName() const override { return "Loop Reduce Motion Pass"; }
-
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.setPreservesCFG();
- }
-};
-
-char LoopReduceMotionLegacy::ID = 0;
-
-PreservedAnalyses LoopReduceMotionPass::run(Function &F,
- FunctionAnalysisManager &FAM) {
- LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
- DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
- bool Changed = false;
- for (Loop *L : LI) {
- Changed |= matchAndTransform(*L, DT, LI);
- }
if (!Changed)
return PreservedAnalyses::all();
return PreservedAnalyses::none();
}
-bool LoopReduceMotionLegacy::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
- if (!TPC)
- return false;
-
- LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
-
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- bool Changed = false;
- for (Loop *L : LI) {
- Changed |= Impl.matchAndTransform(*L, DT, LI);
- }
- if (!Changed)
- return false;
-
- return true;
-}
-
-bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
- LoopInfo &LI) {
+bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree *DT,
+ LoopInfo *LI) {
BasicBlock *Header = L.getHeader();
BasicBlock *Latch = L.getLoopLatch();
BasicBlock *ExitBlock = L.getExitBlock();
+ BasicBlock *ExitingBlock = L.getExitingBlock();
+ BasicBlock *LandingPad = nullptr;
if (!Header || !Latch || !ExitBlock) {
LLVM_DEBUG(dbgs() << "LRM: Skipping loop " << Header->getName()
<< " because it is not a valid loop.\n");
@@ -114,52 +70,70 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
}
BasicBlock *Preheader = L.getLoopPreheader();
if (!Preheader) {
- Preheader = InsertPreheaderForLoop(&L, &DT, &LI, nullptr, false);
+ Preheader = InsertPreheaderForLoop(&L, DT, LI, nullptr, false);
if (!Preheader) {
LLVM_DEBUG(dbgs() << "LRM: Failed to create a preheader for loop "
<< Header->getName() << ".\n");
return false;
}
}
+
+ bool transform_success = false;
+ SmallVector<Instruction *, 8> StackRecur;
+ SmallVector<PHINode *, 8> Stack;
+ int phi_count = 0;
for (PHINode &PN : Header->phis()) {
- if (!PN.getType()->isIntegerTy())
+ Stack.push_back(&PN);
+ phi_count++;
+ if (phi_count >= 8)
+ return false;
+ }
+
+ while (!Stack.empty()) {
+ PHINode *PN = Stack.pop_back_val();
+
+ if (!PN->getType()->isIntegerTy())
continue;
RecurrenceDescriptor RecDesc;
- if (!RecurrenceDescriptor::isReductionPHI(&PN, &L, RecDesc))
+ if (!RecurrenceDescriptor::isReductionPHI(PN, &L, RecDesc))
continue;
if (RecDesc.getRecurrenceKind() != RecurKind::Add)
continue;
- Value *RecurrenceValueFromPHI = PN.getIncomingValueForBlock(Latch);
+ Value *RecurrenceValueFromPHI = PN->getIncomingValueForBlock(Latch);
Instruction *RecurrenceInst = dyn_cast<Instruction>(RecurrenceValueFromPHI);
if (!RecurrenceInst || RecurrenceInst->getNumOperands() != 2)
continue;
- Value *RecurrenceValue = RecurrenceInst->getOperand(0) == &PN
+ // Don't match if the Recurrence Value has other uses in the loop
+ for (User *U : RecurrenceValueFromPHI->users()) {
+ if (Instruction *Inst = dyn_cast<Instruction>(U)) {
+ BasicBlock *BB = Inst->getParent();
+ if (L.contains(BB)) {
+ continue;
+ }
+ }
+ }
+
+ Value *RecurrenceValue = RecurrenceInst->getOperand(0) == PN
? RecurrenceInst->getOperand(1)
: RecurrenceInst->getOperand(0);
-
- CallInst *ReduceCall = dyn_cast<CallInst>(RecurrenceValue);
- if (!ReduceCall)
+ Value *ReduceOperand;
+ if (!llvm::PatternMatch::match(
+ RecurrenceValue,
+ llvm::PatternMatch::m_Intrinsic<Intrinsic::vector_reduce_add>(
+ llvm::PatternMatch::m_Value(ReduceOperand))))
continue;
- Function *CalledFunc = ReduceCall->getCalledFunction();
- if (!CalledFunc || !CalledFunc->isIntrinsic() ||
- !(CalledFunc->getIntrinsicID() == Intrinsic::vector_reduce_add))
- continue;
-
- Value *ReduceOperand = ReduceCall->getArgOperand(0);
- Instruction *VecBin = dyn_cast<Instruction>(ReduceOperand);
- if (!VecBin || (VecBin->getOpcode() != Instruction::Sub &&
- VecBin->getOpcode() != Instruction::Add))
- continue;
+ CallInst *ReduceCall = dyn_cast<CallInst>(RecurrenceValue);
+ Instruction *VecIn = dyn_cast<Instruction>(ReduceOperand);
// pattern match success
LLVM_DEBUG(dbgs() << "Found pattern to optimize in loop "
<< Header->getName() << "!\n");
- VectorType *VecTy = cast<VectorType>(VecBin->getType());
+ VectorType *VecTy = cast<VectorType>(VecIn->getType());
Value *VecZero = ConstantInt::get(VecTy, 0);
// build new Vector Add to replace Scalar Add
@@ -167,42 +141,53 @@ bool LoopReduceMotionPass::matchAndTransform(Loop &L, DominatorTree &DT,
PHINode *VecSumPhi = HeaderBuilder.CreatePHI(VecTy, 2, "vec.sum.phi");
VecSumPhi->addIncoming(VecZero, Preheader);
IRBuilder<> BodyBuilder(RecurrenceInst);
- Value *NewVecAdd = BodyBuilder.CreateAdd(VecSumPhi, VecBin, "vec.sum.next");
+ Value *NewVecAdd = BodyBuilder.CreateAdd(VecSumPhi, VecIn, "vec.sum.next");
VecSumPhi->addIncoming(NewVecAdd, Latch);
// build landingPad for reduce add out of loop
- BasicBlock *ExitingBlock =
- Latch->getTerminator()->getSuccessor(0) == Header ? Latch : Header;
- if (!L.isLoopExiting(ExitingBlock)) {
- ExitingBlock = Header;
+ if (!LandingPad) {
+ LandingPad = SplitEdge(ExitingBlock, ExitBlock, DT, LI);
+ LandingPad->setName("loop.exit.landing");
}
- BasicBlock *LandingPad = SplitEdge(ExitingBlock, ExitBlock, &DT, &LI);
- LandingPad->setName("loop.exit.landing");
IRBuilder<> LandingPadBuilder(LandingPad->getTerminator());
Value *ScalarTotalSum = LandingPadBuilder.CreateCall(
ReduceCall->getCalledFunction(), NewVecAdd, "scalar.total.sum");
- Value *PreheaderValue = PN.getIncomingValueForBlock(Preheader);
+
+ Value *PreheaderValue = PN->getIncomingValueForBlock(Preheader);
Value *LastAdd =
PreheaderValue
? LandingPadBuilder.CreateAdd(PreheaderValue, ScalarTotalSum)
: ScalarTotalSum;
-
- // delete the dead PHI Node
- if (!PN.use_empty())
- PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
- llvm::RecursivelyDeleteDeadPHINode(&PN);
// replace the use of Recurrence Node and delete the dead Node
Instruction *FinalNode = dyn_cast<Instruction>(LastAdd);
if (!FinalNode)
- return false;
- RecurrenceInst->replaceAllUsesWith(FinalNode);
- llvm::RecursivelyDeleteTriviallyDeadInstructions(RecurrenceInst);
+ continue;
+ // delete the dead PHI Node
+ if (!PN->use_empty())
+ PN->replaceAllUsesWith(PoisonValue::get(PN->getType()));
+ llvm::RecursivelyDeleteDeadPHINode(PN);
+
+ if (!RecurrenceInst->use_empty()) {
+ for (auto *U : RecurrenceInst->users()) {
+ auto *phi = llvm::dyn_cast<llvm::PHINode>(U);
+ if (phi && !phi->use_empty()) {
+ phi->replaceAllUsesWith(FinalNode);
+ }
+ }
+ }
+ transform_success = true;
+ StackRecur.push_back(RecurrenceInst);
+ }
+
+ if (transform_success) {
+ FoldSingleEntryPHINodes(LandingPad);
+ while (!StackRecur.empty()) {
+ Instruction *Rec = StackRecur.pop_back_val();
+ llvm::RecursivelyDeleteTriviallyDeadInstructions(Rec);
+ }
return true;
}
- return false;
-}
-FunctionPass *llvm::createLoopReduceMotionPass() {
- return new LoopReduceMotionLegacy();
+ return false;
}
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index 9af05eb49e65e..912320b0b2946 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -269,6 +269,8 @@
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: LoopSimplifyPass
+; CHECK-O-NEXT: Running pass: LCSSAPass
; CHECK-O-NEXT: Running pass: LoopReduceMotionPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopUnrollPass
diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll
index cebbe671e17b3..ec2be84bb919a 100644
--- a/llvm/test/Other/new-pm-lto-defaults.ll
+++ b/llvm/test/Other/new-pm-lto-defaults.ll
@@ -143,9 +143,11 @@
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass on foo
; CHECK-OS-NEXT: Running pass: SLPVectorizerPass on foo
; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass on foo
-; CHECK-O23SZ-NEXT: Running pass: LoopReduceMotionPass on foo
-; CHECK-O23SZ-NEXT: Running pass: InferAlignmentPass on foo
-; CHECK-O23SZ-NEXT: Running pass: InstCombinePass on foo
+; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass
+; CHECK-O23SZ-NEXT: Running pass: LCSSAPass
+; CHECK-O23SZ-NEXT: Running pass: LoopReduceMotionPass
+; CHECK-O23SZ-NEXT: Running pass: InferAlignmentPass
+; CHECK-O23SZ-NEXT: Running pass: InstCombinePass
; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass
; CHECK-O23SZ-NEXT: Running pass: LCSSAPass
; CHECK-O23SZ-NEXT: Running pass: LICMPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
index 4065cead7c264..40a8d1383032d 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
@@ -188,6 +188,8 @@
; CHECK-POSTLINK-O3-NEXT: Running pass: SLPVectorizerPass
; CHECK-POSTLINK-Os-NEXT: Running pass: SLPVectorizerPass
; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-POSTLINK-O-NEXT: Running pass: LoopSimplifyPass
+; CHECK-POSTLINK-O-NEXT: Running pass: LCSSAPass
; CHECK-POSTLINK-O-NEXT: Running pass: LoopReduceMotionPass
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
index 126caf7eed3ab..c6343f0e0a774 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -173,6 +173,8 @@
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: LoopSimplifyPass
+; CHECK-O-NEXT: Running pass: LCSSAPass
; CHECK-O-NEXT: Running pass: LoopReduceMotionPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopUnrollPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
index e9879a512a9b9..add8fc47c54f1 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -182,6 +182,8 @@
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: LoopSimplifyPass
+; CHECK-O-NEXT: Running pass: LCSSAPass
; CHECK-O-NEXT: Running pass: LoopReduceMotionPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopUnrollPass
diff --git a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
index 859a19e701fa8..cd77a0cfec68c 100644
--- a/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
+++ b/llvm/test/Transforms/LoopReduceMotion/loop-reduce-motion-test.ll
@@ -12,124 +12,122 @@ define i32 @func_with_VecBin_Sub(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride
; CHECK: for.cond1.preheader:
; CHECK-NEXT: [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
; CHECK-NEXT: [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT: [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
; CHECK-NEXT: [[VEC_SUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[FOR_COND1_PREHEADER_PREHEADER]] ], [ [[VEC_SUM_NEXT:%.*]], [[FOR_COND1_PREHEADER]] ]
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
-; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT: [[VEC_SUM_NEXT]] = add <8 x i32> [[VEC_SUM_PHI]], [[TMP4]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT: [[VEC_SUM_NEXT]] = add <8 x i32> [[VEC_SUM_PHI]], [[TMP1]]
; CHECK-NEXT: [[INC9]] = add nuw nsw i32 [[Y_025]], 1
; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
-; CHECK-NEXT: [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[HEIGHT]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[LOOP_EXIT_LANDING:%.*]], label [[FOR_COND1_PREHEADER]]
; CHECK: loop.exit.landing:
; CHECK-NEXT: [[SCALAR_TOTAL_SUM:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[VEC_SUM_NEXT]])
-; CHECK-NEXT: [[TMP5:%.*]] = add i32 0, [[SCALAR_TOTAL_SUM]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 0, [[SCALAR_TOTAL_SUM]]
; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
; CHECK: for.cond.cleanup.loopexit:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
-; CHECK-NEXT: ret i32 [[TMP6]]
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
+; CHECK-NEXT: ret i32 [[TMP3]]
;
entry:
%cmp21 = icmp sgt i32 %height, 0
br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
-for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.cond1.preheader
+for.cond1.preheader: ; preds = %for.cond1.preheader, %entry
%y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %entry ]
%sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %entry ]
%pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %entry ]
- %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %entry ]
%0 = load <8 x i8>, ptr %pix1.addr.023
- %1 = load <8 x i8>, ptr %pix2.addr.022
%2 = zext <8 x i8> %0 to <8 x i32>
- %3 = zext <8 x i8> %1 to <8 x i32>
- %4 = sub nsw <8 x i32> %2, %3
- %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+ %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
%add.7 = add i32 %sum.024, %5
%inc9 = add nuw nsw i32 %y.025, 1
%add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
- %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
%exitcond.not = icmp eq i32 %inc9, %height
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.cond1.preheader
-
-for.cond.cleanup.loopexit: ; preds = %for.cond1.preheader
- br label %for.cond.cleanup
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.cond1.preheader
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.7, %for.cond.cleanup.loopexit ]
+for.cond.cleanup: ; preds = %for.cond1.preheader, %entry
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.7, %for.cond1.preheader]
%6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
ret i32 %6
}
-define i32 @func_with_VecBin_add(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %height) {
-; CHECK-LABEL: @func_with_VecBin_add(
+define i32 @func_with_reduce(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %height) {
+; CHECK-LABEL: @func_with_reduce(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[HEIGHT:%.*]], 0
; CHECK-NEXT: br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.cond1.preheader.preheader:
; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
; CHECK: for.cond1.preheader:
-; CHECK-NEXT: [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT: [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT: [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[Y:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[PIX1_ADDR1:%.*]] = phi ptr [ [[ADD_PTR1:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[PIX2_ADDR2:%.*]] = phi ptr [ [[ADD_PTR2:%.*]], [[FOR_COND1_PREHEADER]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER_PREHEADER]] ]
; CHECK-NEXT: [[VEC_SUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[FOR_COND1_PREHEADER_PREHEADER]] ], [ [[VEC_SUM_NEXT:%.*]], [[FOR_COND1_PREHEADER]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
+; CHECK-NEXT: [[VEC_SUM_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[FOR_COND1_PREHEADER_PREHEADER]] ], [ [[VEC_SUM_NEXT2:%.*]], [[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR1]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR2]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw <8 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT: [[VEC_SUM_NEXT]] = add <8 x i32> [[VEC_SUM_PHI]], [[TMP4]]
-; CHECK-NEXT: [[INC9]] = add nuw nsw i32 [[Y_025]], 1
-; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[PIX1_ADDR_023]], i64 [[STRIDE1:%.*]]
-; CHECK-NEXT: [[ADD_PTR10]] = getelementptr inbounds i8, ptr [[PIX2_ADDR_022]], i64 [[STRIDE2:%.*]]
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[HEIGHT]]
+; CHECK-NEXT: [[VEC_SUM_NEXT2]] = add <8 x i32> [[VEC_SUM_PHI1]], [[TMP2]]
+; CHECK-NEXT: [[VEC_SUM_NEXT]] = add <8 x i32> [[VEC_SUM_PHI]], [[TMP3]]
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[Y]], 1
+; CHECK-NEXT: [[ADD_PTR1]] = getelementptr inbounds i8, ptr [[PIX1_ADDR1]], i64 [[STRIDE1:%.*]]
+; CHECK-NEXT: [[ADD_PTR2]] = getelementptr inbounds i8, ptr [[PIX2_ADDR2]], i64 [[STRIDE2:%.*]]
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[HEIGHT]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[LOOP_EXIT_LANDING:%.*]], label [[FOR_COND1_PREHEADER]]
; CHECK: loop.exit.landing:
; CHECK-NEXT: [[SCALAR_TOTAL_SUM:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[VEC_SUM_NEXT]])
-; CHECK-NEXT: [[TMP5:%.*]] = add i32 0, [[SCALAR_TOTAL_SUM]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i32 0, [[SCALAR_TOTAL_SUM]]
+; CHECK-NEXT: [[SCALAR_TOTAL_SUM3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[VEC_SUM_NEXT2]])
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 0, [[SCALAR_TOTAL_SUM3]]
; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
; CHECK: for.cond.cleanup.loopexit:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
-; CHECK-NEXT: ret i32 [[TMP6]]
+; CHECK-NEXT: [[SUML1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: [[SUML2:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP4]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUML1]], i1 true)
+; CHECK-NEXT: [[TMP7:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUML2]], i1 true)
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: ret i32 [[TMP8]]
;
entry:
%cmp21 = icmp sgt i32 %height, 0
br i1 %cmp21, label %for.cond1.preheader, label %for.cond.cleanup
for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.cond1.preheader
- %y.025 = phi i32 [ %inc9, %for.cond1.preheader ], [ 0, %entry ]
- %sum.024 = phi i32 [ %add.7, %for.cond1.preheader ], [ 0, %entry ]
- %pix1.addr.023 = phi ptr [ %add.ptr, %for.cond1.preheader ], [ %pix1, %entry ]
- %pix2.addr.022 = phi ptr [ %add.ptr10, %for.cond1.preheader ], [ %pix2, %entry ]
- %0 = load <8 x i8>, ptr %pix1.addr.023
- %1 = load <8 x i8>, ptr %pix2.addr.022
+ %y = phi i32 [ %inc, %for.cond1.preheader ], [ 0, %entry ]
+ %sum1 = phi i32 [ %add1, %for.cond1.preheader ], [ 0, %entry ]
+ %sum2 = phi i32 [ %add2, %for.cond1.preheader ], [ 0, %entry ]
+ %pix1.addr1 = phi ptr [ %add.ptr1, %for.cond1.preheader ], [ %pix1, %entry ]
+ %pix2.addr2 = phi ptr [ %add.ptr2, %for.cond1.preheader ], [ %pix2, %entry ]
+ %0 = load <8 x i8>, ptr %pix1.addr1
+ %1 = load <8 x i8>, ptr %pix2.addr2
%2 = zext <8 x i8> %0 to <8 x i32>
%3 = zext <8 x i8> %1 to <8 x i32>
- %4 = add nsw <8 x i32> %2, %3
- %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
- %add.7 = add i32 %sum.024, %5
- %inc9 = add nuw nsw i32 %y.025, 1
- %add.ptr = getelementptr inbounds i8, ptr %pix1.addr.023, i64 %stride1
- %add.ptr10 = getelementptr inbounds i8, ptr %pix2.addr.022, i64 %stride2
- %exitcond.not = icmp eq i32 %inc9, %height
+ %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
+ %add1 = add i32 %sum1, %5
+ %6 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
+ %add2 = add i32 %sum2, %6
+ %inc = add nuw nsw i32 %y, 1
+ %add.ptr1 = getelementptr inbounds i8, ptr %pix1.addr1, i64 %stride1
+ %add.ptr2 = getelementptr inbounds i8, ptr %pix2.addr2, i64 %stride2
+ %exitcond.not = icmp eq i32 %inc, %height
br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.cond1.preheader
for.cond.cleanup.loopexit: ; preds = %for.cond1.preheader
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.7, %for.cond.cleanup.loopexit ]
- %6 = tail call i32 @llvm.abs.i32(i32 %sum.0.lcssa, i1 true)
- ret i32 %6
+ %suml1 = phi i32 [ 0, %entry ], [ %add1, %for.cond.cleanup.loopexit ]
+ %suml2 = phi i32 [ 0, %entry ], [ %add2, %for.cond.cleanup.loopexit ]
+ %7 = tail call i32 @llvm.abs.i32(i32 %suml1, i1 true)
+ %8 = tail call i32 @llvm.abs.i32(i32 %suml2, i1 true)
+ %9 = add i32 %7, %8
+ ret i32 %9
}
define i32 @multi_exit(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %val1, i32 %val2) {
@@ -137,11 +135,13 @@ define i32 @multi_exit(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %va
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[VAL1:%.*]], 0
; CHECK-NEXT: br i1 [[CMP21]], label [[FOR_COND1_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.cond1.preheader.preheader:
+; CHECK-NEXT: br label [[FOR_COND1_PREHEADER1:%.*]]
; CHECK: for.cond1.preheader:
-; CHECK-NEXT: [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[NEXT_COND1:%.*]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[SUM_024:%.*]] = phi i32 [ [[ADD_7:%.*]], [[NEXT_COND1]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[NEXT_COND1]] ], [ [[PIX1:%.*]], [[ENTRY]] ]
-; CHECK-NEXT: [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[NEXT_COND1]] ], [ [[PIX2:%.*]], [[ENTRY]] ]
+; CHECK-NEXT: [[Y_025:%.*]] = phi i32 [ [[INC9:%.*]], [[NEXT_COND1:%.*]] ], [ 0, [[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT: [[SUM_024:%.*]] = phi i32 [ [[ADD_7:%.*]], [[NEXT_COND1]] ], [ 0, [[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT: [[PIX1_ADDR_023:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[NEXT_COND1]] ], [ [[PIX1:%.*]], [[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT: [[PIX2_ADDR_022:%.*]] = phi ptr [ [[ADD_PTR10:%.*]], [[NEXT_COND1]] ], [ [[PIX2:%.*]], [[FOR_COND1_PREHEADER]] ]
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PIX1_ADDR_023]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[PIX2_ADDR_022]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
@@ -156,13 +156,16 @@ define i32 @multi_exit(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stride2, i32 %va
; CHECK: next.cond0:
; CHECK-NEXT: [[ADD_8:%.*]] = add i32 [[ADD_7]], 1
; CHECK-NEXT: [[EXIT1:%.*]] = icmp eq i32 [[INC9]], [[VAL1]]
-; CHECK-NEXT: br i1 [[EXIT1]], label [[FOR_COND_CLEANUP]], label [[NEXT_COND1]]
+; CHECK-NEXT: br i1 [[EXIT1]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[NEXT_COND1]]
; CHECK: next.cond1:
; CHECK-NEXT: [[ADD_9:%.*]] = add i32 [[ADD_7]], 2
; CHECK-NEXT: [[EXIT2:%.*]] = icmp eq i32 [[INC9]], [[VAL2:%.*]]
-; CHECK-NEXT: br i1 [[EXIT2]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER]]
+; CHECK-NEXT: br i1 [[EXIT2]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_COND1_PREHEADER1]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: [[SUM_0_LCSSA_PH:%.*]] = phi i32 [ [[ADD_9]], [[NEXT_COND1]] ], [ [[ADD_8]], [[NEXT_COND0]] ]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD_8]], [[NEXT_COND0]] ], [ [[ADD_9]], [[NEXT_COND1]] ]
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SUM_0_LCSSA_PH]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
; CHECK-NEXT: ret i32 [[TMP6]]
;
@@ -228,10 +231,12 @@ define i32 @phi_not_reduction_call(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stri
; CHECK-NEXT: [[EXIT:%.*]] = icmp eq i32 [[INC9]], [[VAL1]]
; CHECK-NEXT: br i1 [[EXIT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_COND1_PREHEADER]]
; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: [[SUM_024_LCSSA:%.*]] = phi i32 [ [[SUM_024]], [[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT: [[ADD_7_LCSSA:%.*]] = phi i32 [ [[ADD_7]], [[FOR_COND1_PREHEADER]] ]
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_7]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_024]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_7_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_024_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[SUM]]
; CHECK-NEXT: ret i32 [[TMP7]]
@@ -294,10 +299,12 @@ define i32 @reduction_call_not_add(ptr %pix1, i64 %stride1, ptr %pix2, i64 %stri
; CHECK-NEXT: [[EXIT:%.*]] = icmp eq i32 [[INC9]], [[VAL1]]
; CHECK-NEXT: br i1 [[EXIT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_COND1_PREHEADER]]
; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: [[SUM_024_LCSSA:%.*]] = phi i32 [ [[SUM_024]], [[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT: [[ADD_7_LCSSA:%.*]] = phi i32 [ [[ADD_7]], [[FOR_COND1_PREHEADER]] ]
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_7]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_024]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_7_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_024_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.abs.i32(i32 [[SUM_0_LCSSA]], i1 true)
; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[SUM]]
; CHECK-NEXT: ret i32 [[TMP7]]
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/udotabd.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/udotabd.ll
index e2f7f8f7e5cac..16e66acce373f 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/udotabd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/udotabd.ll
@@ -9,8 +9,6 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2,
; CHECK-O3-LABEL: define dso_local i32 @test(
; CHECK-O3-SAME: ptr noundef readonly captures(none) [[P1:%.*]], i32 noundef [[S_P1:%.*]], ptr noundef readonly captures(none) [[P2:%.*]], i32 noundef [[S_P2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-O3-NEXT: [[ENTRY:.*:]]
-; CHECK-O3-NEXT: [[IDX_EXT8:%.*]] = sext i32 [[S_P2]] to i64
-; CHECK-O3-NEXT: [[IDX_EXT:%.*]] = sext i32 [[S_P1]] to i64
; CHECK-O3-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[P1]], align 1, !tbaa [[CHAR_TBAA0:![0-9]+]]
; CHECK-O3-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[TMP0]] to <16 x i16>
; CHECK-O3-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[P2]], align 1, !tbaa [[CHAR_TBAA0]]
@@ -18,352 +16,318 @@ define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2,
; CHECK-O3-NEXT: [[TMP4:%.*]] = sub nsw <16 x i16> [[TMP1]], [[TMP3]]
; CHECK-O3-NEXT: [[TMP5:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP4]], i1 false)
; CHECK-O3-NEXT: [[TMP6:%.*]] = zext <16 x i16> [[TMP5]] to <16 x i32>
-; CHECK-O3-NEXT: [[TMP7:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP6]])
+; CHECK-O3-NEXT: [[IDX_EXT:%.*]] = sext i32 [[S_P1]] to i64
; CHECK-O3-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]]
+; CHECK-O3-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[ADD_PTR]], align 1, !tbaa [[CHAR_TBAA0]]
+; CHECK-O3-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[TMP7]] to <16 x i16>
+; CHECK-O3-NEXT: [[IDX_EXT8:%.*]] = sext i32 [[S_P2]] to i64
; CHECK-O3-NEXT: [[ADD_PTR9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT8]]
-; CHECK-O3-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[ADD_PTR]], align 1, !tbaa [[CHAR_TBAA0]]
-; CHECK-O3-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[TMP8]] to <16 x i16>
; CHECK-O3-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[ADD_PTR9]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i16>
; CHECK-O3-NEXT: [[TMP12:%.*]] = sub nsw <16 x i16> [[TMP9]], [[TMP11]]
; CHECK-O3-NEXT: [[TMP13:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP12]], i1 false)
; CHECK-O3-NEXT: [[TMP14:%.*]] = zext <16 x i16> [[TMP13]] to <16 x i32>
-; CHECK-O3-NEXT: [[TMP15:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP14]])
-; CHECK-O3-NEXT: [[OP_RDX_1:%.*]] = add nuw nsw i32 [[TMP15]], [[TMP7]]
+; CHECK-O3-NEXT: [[VEC_SUM_NEXT_1:%.*]] = add nuw nsw <16 x i32> [[TMP6]], [[TMP14]]
; CHECK-O3-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT: [[ADD_PTR9_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP16:%.*]] = load <16 x i8>, ptr [[ADD_PTR_1]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP17:%.*]] = zext <16 x i8> [[TMP16]] to <16 x i16>
+; CHECK-O3-NEXT: [[ADD_PTR9_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP18:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_1]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i16>
; CHECK-O3-NEXT: [[TMP20:%.*]] = sub nsw <16 x i16> [[TMP17]], [[TMP19]]
; CHECK-O3-NEXT: [[TMP21:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP20]], i1 false)
; CHECK-O3-NEXT: [[TMP22:%.*]] = zext <16 x i16> [[TMP21]] to <16 x i32>
-; CHECK-O3-NEXT: [[TMP23:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP22]])
-; CHECK-O3-NEXT: [[OP_RDX_2:%.*]] = add nuw nsw i32 [[TMP23]], [[OP_RDX_1]]
+; CHECK-O3-NEXT: [[VEC_SUM_NEXT_2:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_1]], [[TMP22]]
; CHECK-O3-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT: [[ADD_PTR9_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_1]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP24:%.*]] = load <16 x i8>, ptr [[ADD_PTR_2]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i16>
+; CHECK-O3-NEXT: [[ADD_PTR9_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_1]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP26:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_2]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i16>
; CHECK-O3-NEXT: [[TMP28:%.*]] = sub nsw <16 x i16> [[TMP25]], [[TMP27]]
; CHECK-O3-NEXT: [[TMP29:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP28]], i1 false)
; CHECK-O3-NEXT: [[TMP30:%.*]] = zext <16 x i16> [[TMP29]] to <16 x i32>
-; CHECK-O3-NEXT: [[TMP31:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP30]])
-; CHECK-O3-NEXT: [[OP_RDX_3:%.*]] = add nuw nsw i32 [[TMP31]], [[OP_RDX_2]]
+; CHECK-O3-NEXT: [[VEC_SUM_NEXT_3:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_2]], [[TMP30]]
; CHECK-O3-NEXT: [[ADD_PTR_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT: [[ADD_PTR9_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_2]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP32:%.*]] = load <16 x i8>, ptr [[ADD_PTR_3]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i16>
+; CHECK-O3-NEXT: [[ADD_PTR9_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_2]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP34:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_3]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[TMP34]] to <16 x i16>
; CHECK-O3-NEXT: [[TMP36:%.*]] = sub nsw <16 x i16> [[TMP33]], [[TMP35]]
; CHECK-O3-NEXT: [[TMP37:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP36]], i1 false)
; CHECK-O3-NEXT: [[TMP38:%.*]] = zext <16 x i16> [[TMP37]] to <16 x i32>
-; CHECK-O3-NEXT: [[TMP39:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP38]])
-; CHECK-O3-NEXT: [[OP_RDX_4:%.*]] = add nuw nsw i32 [[TMP39]], [[OP_RDX_3]]
+; CHECK-O3-NEXT: [[VEC_SUM_NEXT_4:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_3]], [[TMP38]]
; CHECK-O3-NEXT: [[ADD_PTR_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT: [[ADD_PTR9_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_3]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP40:%.*]] = load <16 x i8>, ptr [[ADD_PTR_4]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP41:%.*]] = zext <16 x i8> [[TMP40]] to <16 x i16>
+; CHECK-O3-NEXT: [[ADD_PTR9_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_3]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP42:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_4]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP43:%.*]] = zext <16 x i8> [[TMP42]] to <16 x i16>
; CHECK-O3-NEXT: [[TMP44:%.*]] = sub nsw <16 x i16> [[TMP41]], [[TMP43]]
; CHECK-O3-NEXT: [[TMP45:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP44]], i1 false)
; CHECK-O3-NEXT: [[TMP46:%.*]] = zext <16 x i16> [[TMP45]] to <16 x i32>
-; CHECK-O3-NEXT: [[TMP47:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP46]])
-; CHECK-O3-NEXT: [[OP_RDX_5:%.*]] = add nuw nsw i32 [[TMP47]], [[OP_RDX_4]]
+; CHECK-O3-NEXT: [[VEC_SUM_NEXT_5:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_4]], [[TMP46]]
; CHECK-O3-NEXT: [[ADD_PTR_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT: [[ADD_PTR9_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_4]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP48:%.*]] = load <16 x i8>, ptr [[ADD_PTR_5]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP49:%.*]] = zext <16 x i8> [[TMP48]] to <16 x i16>
+; CHECK-O3-NEXT: [[ADD_PTR9_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_4]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP50:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_5]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP51:%.*]] = zext <16 x i8> [[TMP50]] to <16 x i16>
; CHECK-O3-NEXT: [[TMP52:%.*]] = sub nsw <16 x i16> [[TMP49]], [[TMP51]]
; CHECK-O3-NEXT: [[TMP53:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP52]], i1 false)
; CHECK-O3-NEXT: [[TMP54:%.*]] = zext <16 x i16> [[TMP53]] to <16 x i32>
-; CHECK-O3-NEXT: [[TMP55:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP54]])
-; CHECK-O3-NEXT: [[OP_RDX_6:%.*]] = add i32 [[TMP55]], [[OP_RDX_5]]
+; CHECK-O3-NEXT: [[VEC_SUM_NEXT_6:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_5]], [[TMP54]]
; CHECK-O3-NEXT: [[ADD_PTR_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT: [[ADD_PTR9_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_5]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP56:%.*]] = load <16 x i8>, ptr [[ADD_PTR_6]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP57:%.*]] = zext <16 x i8> [[TMP56]] to <16 x i16>
+; CHECK-O3-NEXT: [[ADD_PTR9_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_5]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP58:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_6]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP59:%.*]] = zext <16 x i8> [[TMP58]] to <16 x i16>
; CHECK-O3-NEXT: [[TMP60:%.*]] = sub nsw <16 x i16> [[TMP57]], [[TMP59]]
; CHECK-O3-NEXT: [[TMP61:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP60]], i1 false)
; CHECK-O3-NEXT: [[TMP62:%.*]] = zext <16 x i16> [[TMP61]] to <16 x i32>
-; CHECK-O3-NEXT: [[TMP63:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP62]])
-; CHECK-O3-NEXT: [[OP_RDX_7:%.*]] = add i32 [[TMP63]], [[OP_RDX_6]]
+; CHECK-O3-NEXT: [[VEC_SUM_NEXT_7:%.*]] = add <16 x i32> [[VEC_SUM_NEXT_6]], [[TMP62]]
; CHECK-O3-NEXT: [[ADD_PTR_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_6]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT: [[ADD_PTR9_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_6]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP64:%.*]] = load <16 x i8>, ptr [[ADD_PTR_7]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP65:%.*]] = zext <16 x i8> [[TMP64]] to <16 x i16>
+; CHECK-O3-NEXT: [[ADD_PTR9_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_6]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP66:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_7]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP67:%.*]] = zext <16 x i8> [[TMP66]] to <16 x i16>
; CHECK-O3-NEXT: [[TMP68:%.*]] = sub nsw <16 x i16> [[TMP65]], [[TMP67]]
; CHECK-O3-NEXT: [[TMP69:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP68]], i1 false)
; CHECK-O3-NEXT: [[TMP70:%.*]] = zext <16 x i16> [[TMP69]] to <16 x i32>
-; CHECK-O3-NEXT: [[TMP71:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP70]])
-; CHECK-O3-NEXT: [[OP_RDX_8:%.*]] = add i32 [[TMP71]], [[OP_RDX_7]]
+; CHECK-O3-NEXT: [[VEC_SUM_NEXT_8:%.*]] = add <16 x i32> [[VEC_SUM_NEXT_7]], [[TMP70]]
; CHECK-O3-NEXT: [[ADD_PTR_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_7]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT: [[ADD_PTR9_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_7]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP72:%.*]] = load <16 x i8>, ptr [[ADD_PTR_8]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP73:%.*]] = zext <16 x i8> [[TMP72]] to <16 x i16>
+; CHECK-O3-NEXT: [[ADD_PTR9_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_7]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP74:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_8]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP75:%.*]] = zext <16 x i8> [[TMP74]] to <16 x i16>
; CHECK-O3-NEXT: [[TMP76:%.*]] = sub nsw <16 x i16> [[TMP73]], [[TMP75]]
; CHECK-O3-NEXT: [[TMP77:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP76]], i1 false)
; CHECK-O3-NEXT: [[TMP78:%.*]] = zext <16 x i16> [[TMP77]] to <16 x i32>
-; CHECK-O3-NEXT: [[TMP79:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP78]])
-; CHECK-O3-NEXT: [[OP_RDX_9:%.*]] = add i32 [[TMP79]], [[OP_RDX_8]]
+; CHECK-O3-NEXT: [[VEC_SUM_NEXT_9:%.*]] = add <16 x i32> [[VEC_SUM_NEXT_8]], [[TMP78]]
; CHECK-O3-NEXT: [[ADD_PTR_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_8]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT: [[ADD_PTR9_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_8]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP80:%.*]] = load <16 x i8>, ptr [[ADD_PTR_9]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP81:%.*]] = zext <16 x i8> [[TMP80]] to <16 x i16>
+; CHECK-O3-NEXT: [[ADD_PTR9_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_8]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP82:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_9]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP83:%.*]] = zext <16 x i8> [[TMP82]] to <16 x i16>
; CHECK-O3-NEXT: [[TMP84:%.*]] = sub nsw <16 x i16> [[TMP81]], [[TMP83]]
; CHECK-O3-NEXT: [[TMP85:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP84]], i1 false)
; CHECK-O3-NEXT: [[TMP86:%.*]] = zext <16 x i16> [[TMP85]] to <16 x i32>
-; CHECK-O3-NEXT: [[TMP87:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP86]])
-; CHECK-O3-NEXT: [[OP_RDX_10:%.*]] = add i32 [[TMP87]], [[OP_RDX_9]]
+; CHECK-O3-NEXT: [[VEC_SUM_NEXT_10:%.*]] = add <16 x i32> [[VEC_SUM_NEXT_9]], [[TMP86]]
; CHECK-O3-NEXT: [[ADD_PTR_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_9]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT: [[ADD_PTR9_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_9]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP88:%.*]] = load <16 x i8>, ptr [[ADD_PTR_10]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP89:%.*]] = zext <16 x i8> [[TMP88]] to <16 x i16>
+; CHECK-O3-NEXT: [[ADD_PTR9_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_9]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP90:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_10]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP91:%.*]] = zext <16 x i8> [[TMP90]] to <16 x i16>
; CHECK-O3-NEXT: [[TMP92:%.*]] = sub nsw <16 x i16> [[TMP89]], [[TMP91]]
; CHECK-O3-NEXT: [[TMP93:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP92]], i1 false)
; CHECK-O3-NEXT: [[TMP94:%.*]] = zext <16 x i16> [[TMP93]] to <16 x i32>
-; CHECK-O3-NEXT: [[TMP95:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP94]])
-; CHECK-O3-NEXT: [[OP_RDX_11:%.*]] = add i32 [[TMP95]], [[OP_RDX_10]]
+; CHECK-O3-NEXT: [[VEC_SUM_NEXT_11:%.*]] = add <16 x i32> [[VEC_SUM_NEXT_10]], [[TMP94]]
; CHECK-O3-NEXT: [[ADD_PTR_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_10]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT: [[ADD_PTR9_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_10]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP96:%.*]] = load <16 x i8>, ptr [[ADD_PTR_11]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP97:%.*]] = zext <16 x i8> [[TMP96]] to <16 x i16>
+; CHECK-O3-NEXT: [[ADD_PTR9_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_10]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP98:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_11]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP99:%.*]] = zext <16 x i8> [[TMP98]] to <16 x i16>
; CHECK-O3-NEXT: [[TMP100:%.*]] = sub nsw <16 x i16> [[TMP97]], [[TMP99]]
; CHECK-O3-NEXT: [[TMP101:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP100]], i1 false)
; CHECK-O3-NEXT: [[TMP102:%.*]] = zext <16 x i16> [[TMP101]] to <16 x i32>
-; CHECK-O3-NEXT: [[TMP103:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP102]])
-; CHECK-O3-NEXT: [[OP_RDX_12:%.*]] = add i32 [[TMP103]], [[OP_RDX_11]]
+; CHECK-O3-NEXT: [[VEC_SUM_NEXT_12:%.*]] = add <16 x i32> [[VEC_SUM_NEXT_11]], [[TMP102]]
; CHECK-O3-NEXT: [[ADD_PTR_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_11]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT: [[ADD_PTR9_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_11]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP104:%.*]] = load <16 x i8>, ptr [[ADD_PTR_12]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP105:%.*]] = zext <16 x i8> [[TMP104]] to <16 x i16>
+; CHECK-O3-NEXT: [[ADD_PTR9_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_11]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP106:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_12]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP107:%.*]] = zext <16 x i8> [[TMP106]] to <16 x i16>
; CHECK-O3-NEXT: [[TMP108:%.*]] = sub nsw <16 x i16> [[TMP105]], [[TMP107]]
; CHECK-O3-NEXT: [[TMP109:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP108]], i1 false)
; CHECK-O3-NEXT: [[TMP110:%.*]] = zext <16 x i16> [[TMP109]] to <16 x i32>
-; CHECK-O3-NEXT: [[TMP111:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP110]])
-; CHECK-O3-NEXT: [[OP_RDX_13:%.*]] = add i32 [[TMP111]], [[OP_RDX_12]]
+; CHECK-O3-NEXT: [[VEC_SUM_NEXT_13:%.*]] = add <16 x i32> [[VEC_SUM_NEXT_12]], [[TMP110]]
; CHECK-O3-NEXT: [[ADD_PTR_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_12]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT: [[ADD_PTR9_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_12]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP112:%.*]] = load <16 x i8>, ptr [[ADD_PTR_13]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP113:%.*]] = zext <16 x i8> [[TMP112]] to <16 x i16>
+; CHECK-O3-NEXT: [[ADD_PTR9_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_12]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP114:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_13]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP115:%.*]] = zext <16 x i8> [[TMP114]] to <16 x i16>
; CHECK-O3-NEXT: [[TMP116:%.*]] = sub nsw <16 x i16> [[TMP113]], [[TMP115]]
; CHECK-O3-NEXT: [[TMP117:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP116]], i1 false)
; CHECK-O3-NEXT: [[TMP118:%.*]] = zext <16 x i16> [[TMP117]] to <16 x i32>
-; CHECK-O3-NEXT: [[TMP119:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP118]])
-; CHECK-O3-NEXT: [[OP_RDX_14:%.*]] = add i32 [[TMP119]], [[OP_RDX_13]]
+; CHECK-O3-NEXT: [[VEC_SUM_NEXT_14:%.*]] = add <16 x i32> [[VEC_SUM_NEXT_13]], [[TMP118]]
; CHECK-O3-NEXT: [[ADD_PTR_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_13]], i64 [[IDX_EXT]]
-; CHECK-O3-NEXT: [[ADD_PTR9_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_13]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP120:%.*]] = load <16 x i8>, ptr [[ADD_PTR_14]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP121:%.*]] = zext <16 x i8> [[TMP120]] to <16 x i16>
+; CHECK-O3-NEXT: [[ADD_PTR9_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_13]], i64 [[IDX_EXT8]]
; CHECK-O3-NEXT: [[TMP122:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_14]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-O3-NEXT: [[TMP123:%.*]] = zext <16 x i8> [[TMP122]] to <16 x i16>
; CHECK-O3-NEXT: [[TMP124:%.*]] = sub nsw <16 x i16> [[TMP121]], [[TMP123]]
; CHECK-O3-NEXT: [[TMP125:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP124]], i1 false)
-; CHECK-O3-NEXT: [[TMP126:%.*]] = zext <16 x i16> [[TMP125]] to <16 x i32>
+; CHECK-O3-NEXT: [[TMP111:%.*]] = zext <16 x i16> [[TMP125]] to <16 x i32>
+; CHECK-O3-NEXT: [[TMP126:%.*]] = add <16 x i32> [[VEC_SUM_NEXT_14]], [[TMP111]]
; CHECK-O3-NEXT: [[TMP127:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP126]])
-; CHECK-O3-NEXT: [[OP_RDX_15:%.*]] = add i32 [[TMP127]], [[OP_RDX_14]]
-; CHECK-O3-NEXT: ret i32 [[OP_RDX_15]]
+; CHECK-O3-NEXT: ret i32 [[TMP127]]
;
; CHECK-LTO-LABEL: define dso_local i32 @test(
; CHECK-LTO-SAME: ptr noundef readonly captures(none) [[P1:%.*]], i32 noundef [[S_P1:%.*]], ptr noundef readonly captures(none) [[P2:%.*]], i32 noundef [[S_P2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-LTO-NEXT: [[ENTRY:.*:]]
-; CHECK-LTO-NEXT: [[IDX_EXT8:%.*]] = sext i32 [[S_P2]] to i64
-; CHECK-LTO-NEXT: [[IDX_EXT:%.*]] = sext i32 [[S_P1]] to i64
; CHECK-LTO-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[P1]], align 1, !tbaa [[CHAR_TBAA0:![0-9]+]]
; CHECK-LTO-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[TMP0]] to <16 x i16>
; CHECK-LTO-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[P2]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
; CHECK-LTO-NEXT: [[TMP4:%.*]] = sub nsw <16 x i16> [[TMP1]], [[TMP3]]
; CHECK-LTO-NEXT: [[TMP5:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP4]], i1 true)
-; CHECK-LTO-NEXT: [[TMP36:%.*]] = zext nneg <16 x i16> [[TMP5]] to <16 x i32>
-; CHECK-LTO-NEXT: [[TMP44:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP36]])
+; CHECK-LTO-NEXT: [[IDX_EXT:%.*]] = sext i32 [[S_P1]] to i64
; CHECK-LTO-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT: [[ADD_PTR9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr [[ADD_PTR]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[TMP6]] to <16 x i16>
+; CHECK-LTO-NEXT: [[IDX_EXT8:%.*]] = sext i32 [[S_P2]] to i64
+; CHECK-LTO-NEXT: [[ADD_PTR9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[ADD_PTR9]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[TMP8]] to <16 x i16>
; CHECK-LTO-NEXT: [[TMP10:%.*]] = sub nsw <16 x i16> [[TMP7]], [[TMP9]]
; CHECK-LTO-NEXT: [[TMP11:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP10]], i1 true)
-; CHECK-LTO-NEXT: [[TMP52:%.*]] = zext nneg <16 x i16> [[TMP11]] to <16 x i32>
-; CHECK-LTO-NEXT: [[TMP60:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP52]])
-; CHECK-LTO-NEXT: [[OP_RDX_1:%.*]] = add nuw nsw i32 [[TMP60]], [[TMP44]]
+; CHECK-LTO-NEXT: [[NARROW:%.*]] = add nuw nsw <16 x i16> [[TMP11]], [[TMP5]]
; CHECK-LTO-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT: [[ADD_PTR9_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[ADD_PTR_1]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP13:%.*]] = zext <16 x i8> [[TMP12]] to <16 x i16>
+; CHECK-LTO-NEXT: [[ADD_PTR9_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_1]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP15:%.*]] = zext <16 x i8> [[TMP14]] to <16 x i16>
; CHECK-LTO-NEXT: [[TMP16:%.*]] = sub nsw <16 x i16> [[TMP13]], [[TMP15]]
; CHECK-LTO-NEXT: [[TMP17:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP16]], i1 true)
-; CHECK-LTO-NEXT: [[TMP68:%.*]] = zext nneg <16 x i16> [[TMP17]] to <16 x i32>
-; CHECK-LTO-NEXT: [[TMP76:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP68]])
-; CHECK-LTO-NEXT: [[OP_RDX_2:%.*]] = add nuw nsw i32 [[OP_RDX_1]], [[TMP76]]
+; CHECK-LTO-NEXT: [[NARROW15:%.*]] = add nuw nsw <16 x i16> [[NARROW]], [[TMP17]]
; CHECK-LTO-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT: [[ADD_PTR9_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_1]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP18:%.*]] = load <16 x i8>, ptr [[ADD_PTR_2]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i16>
+; CHECK-LTO-NEXT: [[ADD_PTR9_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_1]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP20:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_2]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i16>
; CHECK-LTO-NEXT: [[TMP22:%.*]] = sub nsw <16 x i16> [[TMP19]], [[TMP21]]
; CHECK-LTO-NEXT: [[TMP23:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP22]], i1 true)
-; CHECK-LTO-NEXT: [[TMP84:%.*]] = zext nneg <16 x i16> [[TMP23]] to <16 x i32>
-; CHECK-LTO-NEXT: [[TMP92:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP84]])
-; CHECK-LTO-NEXT: [[OP_RDX_3:%.*]] = add nuw nsw i32 [[OP_RDX_2]], [[TMP92]]
+; CHECK-LTO-NEXT: [[NARROW16:%.*]] = add nuw nsw <16 x i16> [[NARROW15]], [[TMP23]]
; CHECK-LTO-NEXT: [[ADD_PTR_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT: [[ADD_PTR9_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_2]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP24:%.*]] = load <16 x i8>, ptr [[ADD_PTR_3]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i16>
+; CHECK-LTO-NEXT: [[ADD_PTR9_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_2]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP26:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_3]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i16>
; CHECK-LTO-NEXT: [[TMP28:%.*]] = sub nsw <16 x i16> [[TMP25]], [[TMP27]]
; CHECK-LTO-NEXT: [[TMP29:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP28]], i1 true)
-; CHECK-LTO-NEXT: [[TMP100:%.*]] = zext nneg <16 x i16> [[TMP29]] to <16 x i32>
-; CHECK-LTO-NEXT: [[TMP108:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP100]])
-; CHECK-LTO-NEXT: [[OP_RDX_4:%.*]] = add nuw nsw i32 [[OP_RDX_3]], [[TMP108]]
+; CHECK-LTO-NEXT: [[NARROW17:%.*]] = add nuw nsw <16 x i16> [[NARROW16]], [[TMP29]]
; CHECK-LTO-NEXT: [[ADD_PTR_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT: [[ADD_PTR9_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_3]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP30:%.*]] = load <16 x i8>, ptr [[ADD_PTR_4]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP31:%.*]] = zext <16 x i8> [[TMP30]] to <16 x i16>
+; CHECK-LTO-NEXT: [[ADD_PTR9_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_3]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP32:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_4]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i16>
; CHECK-LTO-NEXT: [[TMP34:%.*]] = sub nsw <16 x i16> [[TMP31]], [[TMP33]]
; CHECK-LTO-NEXT: [[TMP35:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP34]], i1 true)
-; CHECK-LTO-NEXT: [[TMP116:%.*]] = zext nneg <16 x i16> [[TMP35]] to <16 x i32>
-; CHECK-LTO-NEXT: [[TMP117:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP116]])
-; CHECK-LTO-NEXT: [[OP_RDX_5:%.*]] = add nuw nsw i32 [[OP_RDX_4]], [[TMP117]]
+; CHECK-LTO-NEXT: [[NARROW18:%.*]] = add nuw nsw <16 x i16> [[NARROW17]], [[TMP35]]
; CHECK-LTO-NEXT: [[ADD_PTR_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT: [[ADD_PTR9_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_4]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP37:%.*]] = load <16 x i8>, ptr [[ADD_PTR_5]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP38:%.*]] = zext <16 x i8> [[TMP37]] to <16 x i16>
+; CHECK-LTO-NEXT: [[ADD_PTR9_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_4]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP39:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_5]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i16>
; CHECK-LTO-NEXT: [[TMP41:%.*]] = sub nsw <16 x i16> [[TMP38]], [[TMP40]]
-; CHECK-LTO-NEXT: [[TMP42:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP41]], i1 true)
+; CHECK-LTO-NEXT: [[TMP44:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP41]], i1 true)
+; CHECK-LTO-NEXT: [[TMP42:%.*]] = add nuw nsw <16 x i16> [[TMP44]], [[NARROW18]]
; CHECK-LTO-NEXT: [[TMP43:%.*]] = zext nneg <16 x i16> [[TMP42]] to <16 x i32>
-; CHECK-LTO-NEXT: [[TMP118:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP43]])
-; CHECK-LTO-NEXT: [[OP_RDX_6:%.*]] = add i32 [[OP_RDX_5]], [[TMP118]]
; CHECK-LTO-NEXT: [[ADD_PTR_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT: [[ADD_PTR9_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_5]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP45:%.*]] = load <16 x i8>, ptr [[ADD_PTR_6]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP46:%.*]] = zext <16 x i8> [[TMP45]] to <16 x i16>
+; CHECK-LTO-NEXT: [[ADD_PTR9_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_5]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP47:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_6]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i16>
; CHECK-LTO-NEXT: [[TMP49:%.*]] = sub nsw <16 x i16> [[TMP46]], [[TMP48]]
; CHECK-LTO-NEXT: [[TMP50:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP49]], i1 true)
; CHECK-LTO-NEXT: [[TMP51:%.*]] = zext nneg <16 x i16> [[TMP50]] to <16 x i32>
-; CHECK-LTO-NEXT: [[TMP120:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP51]])
-; CHECK-LTO-NEXT: [[OP_RDX_7:%.*]] = add i32 [[OP_RDX_6]], [[TMP120]]
+; CHECK-LTO-NEXT: [[VEC_SUM_NEXT_7:%.*]] = add nuw nsw <16 x i32> [[TMP43]], [[TMP51]]
; CHECK-LTO-NEXT: [[ADD_PTR_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_6]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT: [[ADD_PTR9_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_6]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP53:%.*]] = load <16 x i8>, ptr [[ADD_PTR_7]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP54:%.*]] = zext <16 x i8> [[TMP53]] to <16 x i16>
+; CHECK-LTO-NEXT: [[ADD_PTR9_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_6]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP55:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_7]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP56:%.*]] = zext <16 x i8> [[TMP55]] to <16 x i16>
; CHECK-LTO-NEXT: [[TMP57:%.*]] = sub nsw <16 x i16> [[TMP54]], [[TMP56]]
; CHECK-LTO-NEXT: [[TMP58:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP57]], i1 true)
; CHECK-LTO-NEXT: [[TMP59:%.*]] = zext nneg <16 x i16> [[TMP58]] to <16 x i32>
-; CHECK-LTO-NEXT: [[TMP121:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP59]])
-; CHECK-LTO-NEXT: [[OP_RDX_8:%.*]] = add i32 [[OP_RDX_7]], [[TMP121]]
+; CHECK-LTO-NEXT: [[VEC_SUM_NEXT_8:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_7]], [[TMP59]]
; CHECK-LTO-NEXT: [[ADD_PTR_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_7]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT: [[ADD_PTR9_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_7]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP61:%.*]] = load <16 x i8>, ptr [[ADD_PTR_8]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP62:%.*]] = zext <16 x i8> [[TMP61]] to <16 x i16>
+; CHECK-LTO-NEXT: [[ADD_PTR9_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_7]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP63:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_8]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP64:%.*]] = zext <16 x i8> [[TMP63]] to <16 x i16>
; CHECK-LTO-NEXT: [[TMP65:%.*]] = sub nsw <16 x i16> [[TMP62]], [[TMP64]]
; CHECK-LTO-NEXT: [[TMP66:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP65]], i1 true)
; CHECK-LTO-NEXT: [[TMP67:%.*]] = zext nneg <16 x i16> [[TMP66]] to <16 x i32>
-; CHECK-LTO-NEXT: [[TMP122:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP67]])
-; CHECK-LTO-NEXT: [[OP_RDX_9:%.*]] = add i32 [[OP_RDX_8]], [[TMP122]]
+; CHECK-LTO-NEXT: [[VEC_SUM_NEXT_9:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_8]], [[TMP67]]
; CHECK-LTO-NEXT: [[ADD_PTR_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_8]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT: [[ADD_PTR9_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_8]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP69:%.*]] = load <16 x i8>, ptr [[ADD_PTR_9]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP70:%.*]] = zext <16 x i8> [[TMP69]] to <16 x i16>
+; CHECK-LTO-NEXT: [[ADD_PTR9_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_8]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP71:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_9]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP72:%.*]] = zext <16 x i8> [[TMP71]] to <16 x i16>
; CHECK-LTO-NEXT: [[TMP73:%.*]] = sub nsw <16 x i16> [[TMP70]], [[TMP72]]
; CHECK-LTO-NEXT: [[TMP74:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP73]], i1 true)
; CHECK-LTO-NEXT: [[TMP75:%.*]] = zext nneg <16 x i16> [[TMP74]] to <16 x i32>
-; CHECK-LTO-NEXT: [[TMP123:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP75]])
-; CHECK-LTO-NEXT: [[OP_RDX_10:%.*]] = add i32 [[OP_RDX_9]], [[TMP123]]
+; CHECK-LTO-NEXT: [[VEC_SUM_NEXT_10:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_9]], [[TMP75]]
; CHECK-LTO-NEXT: [[ADD_PTR_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_9]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT: [[ADD_PTR9_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_9]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP77:%.*]] = load <16 x i8>, ptr [[ADD_PTR_10]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP78:%.*]] = zext <16 x i8> [[TMP77]] to <16 x i16>
+; CHECK-LTO-NEXT: [[ADD_PTR9_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_9]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP79:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_10]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP80:%.*]] = zext <16 x i8> [[TMP79]] to <16 x i16>
; CHECK-LTO-NEXT: [[TMP81:%.*]] = sub nsw <16 x i16> [[TMP78]], [[TMP80]]
; CHECK-LTO-NEXT: [[TMP82:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP81]], i1 true)
; CHECK-LTO-NEXT: [[TMP83:%.*]] = zext nneg <16 x i16> [[TMP82]] to <16 x i32>
-; CHECK-LTO-NEXT: [[TMP124:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP83]])
-; CHECK-LTO-NEXT: [[OP_RDX_11:%.*]] = add i32 [[OP_RDX_10]], [[TMP124]]
+; CHECK-LTO-NEXT: [[VEC_SUM_NEXT_11:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_10]], [[TMP83]]
; CHECK-LTO-NEXT: [[ADD_PTR_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_10]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT: [[ADD_PTR9_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_10]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP85:%.*]] = load <16 x i8>, ptr [[ADD_PTR_11]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP86:%.*]] = zext <16 x i8> [[TMP85]] to <16 x i16>
+; CHECK-LTO-NEXT: [[ADD_PTR9_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_10]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP87:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_11]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP88:%.*]] = zext <16 x i8> [[TMP87]] to <16 x i16>
; CHECK-LTO-NEXT: [[TMP89:%.*]] = sub nsw <16 x i16> [[TMP86]], [[TMP88]]
; CHECK-LTO-NEXT: [[TMP90:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP89]], i1 true)
; CHECK-LTO-NEXT: [[TMP91:%.*]] = zext nneg <16 x i16> [[TMP90]] to <16 x i32>
-; CHECK-LTO-NEXT: [[TMP125:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP91]])
-; CHECK-LTO-NEXT: [[OP_RDX_12:%.*]] = add i32 [[OP_RDX_11]], [[TMP125]]
+; CHECK-LTO-NEXT: [[VEC_SUM_NEXT_12:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_11]], [[TMP91]]
; CHECK-LTO-NEXT: [[ADD_PTR_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_11]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT: [[ADD_PTR9_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_11]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP93:%.*]] = load <16 x i8>, ptr [[ADD_PTR_12]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP94:%.*]] = zext <16 x i8> [[TMP93]] to <16 x i16>
+; CHECK-LTO-NEXT: [[ADD_PTR9_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_11]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP95:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_12]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP96:%.*]] = zext <16 x i8> [[TMP95]] to <16 x i16>
; CHECK-LTO-NEXT: [[TMP97:%.*]] = sub nsw <16 x i16> [[TMP94]], [[TMP96]]
; CHECK-LTO-NEXT: [[TMP98:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP97]], i1 true)
; CHECK-LTO-NEXT: [[TMP99:%.*]] = zext nneg <16 x i16> [[TMP98]] to <16 x i32>
-; CHECK-LTO-NEXT: [[TMP126:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP99]])
-; CHECK-LTO-NEXT: [[OP_RDX_13:%.*]] = add i32 [[OP_RDX_12]], [[TMP126]]
+; CHECK-LTO-NEXT: [[VEC_SUM_NEXT_13:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_12]], [[TMP99]]
; CHECK-LTO-NEXT: [[ADD_PTR_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_12]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT: [[ADD_PTR9_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_12]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP101:%.*]] = load <16 x i8>, ptr [[ADD_PTR_13]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP102:%.*]] = zext <16 x i8> [[TMP101]] to <16 x i16>
+; CHECK-LTO-NEXT: [[ADD_PTR9_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_12]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP103:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_13]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP104:%.*]] = zext <16 x i8> [[TMP103]] to <16 x i16>
; CHECK-LTO-NEXT: [[TMP105:%.*]] = sub nsw <16 x i16> [[TMP102]], [[TMP104]]
; CHECK-LTO-NEXT: [[TMP106:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP105]], i1 true)
; CHECK-LTO-NEXT: [[TMP107:%.*]] = zext nneg <16 x i16> [[TMP106]] to <16 x i32>
-; CHECK-LTO-NEXT: [[TMP119:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP107]])
-; CHECK-LTO-NEXT: [[OP_RDX_14:%.*]] = add i32 [[OP_RDX_13]], [[TMP119]]
+; CHECK-LTO-NEXT: [[VEC_SUM_NEXT_14:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_13]], [[TMP107]]
; CHECK-LTO-NEXT: [[ADD_PTR_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_13]], i64 [[IDX_EXT]]
-; CHECK-LTO-NEXT: [[ADD_PTR9_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_13]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP109:%.*]] = load <16 x i8>, ptr [[ADD_PTR_14]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP110:%.*]] = zext <16 x i8> [[TMP109]] to <16 x i16>
+; CHECK-LTO-NEXT: [[ADD_PTR9_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_13]], i64 [[IDX_EXT8]]
; CHECK-LTO-NEXT: [[TMP111:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_14]], align 1, !tbaa [[CHAR_TBAA0]]
; CHECK-LTO-NEXT: [[TMP112:%.*]] = zext <16 x i8> [[TMP111]] to <16 x i16>
; CHECK-LTO-NEXT: [[TMP113:%.*]] = sub nsw <16 x i16> [[TMP110]], [[TMP112]]
; CHECK-LTO-NEXT: [[TMP114:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP113]], i1 true)
; CHECK-LTO-NEXT: [[TMP115:%.*]] = zext nneg <16 x i16> [[TMP114]] to <16 x i32>
-; CHECK-LTO-NEXT: [[TMP127:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP115]])
-; CHECK-LTO-NEXT: [[OP_RDX_15:%.*]] = add i32 [[OP_RDX_14]], [[TMP127]]
+; CHECK-LTO-NEXT: [[VEC_SUM_NEXT_15:%.*]] = add nuw nsw <16 x i32> [[VEC_SUM_NEXT_14]], [[TMP115]]
+; CHECK-LTO-NEXT: [[OP_RDX_15:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[VEC_SUM_NEXT_15]])
; CHECK-LTO-NEXT: ret i32 [[OP_RDX_15]]
;
entry:
More information about the llvm-commits
mailing list