[llvm] [LoopUnroll] Simplify reduction operations after a loop unroll (PR #84805)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 11 11:10:04 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Ricardo Jesus (rj-jesus)
<details>
<summary>Changes</summary>
Try to simplify reductions (e.g. chains of floating-point adds) into independent operations after unrolling a loop. The idea is to break simple reduction loops such as:
```
loop:
PN = PHI [SUM2, loop], ...
X = ...
SUM1 = ADD (X, PN)
Y = ...
SUM2 = ADD (Y, SUM1)
br loop
```
into independent sums of the form:
```
loop:
PN1 = PHI [SUM1, loop], ...
PN2 = PHI [SUM2, loop], ...
X = ...
SUM1 = ADD (X, PN1)
Y = ...
SUM2 = ADD (Y, PN2)
<Reductions>
br loop
```
where `<Reductions>` are new instructions used to compute the final values of the reduction from the partial sums we introduced, in this case:
```
<Reductions> =
PN.red = ADD (PN1, PN2)
SUM1.red = ADD (SUM1, PN2)
SUM2.red = ADD (SUM1, SUM2)
```
This is a very common pattern in unrolled loops that compute dot products (for example) and can help with vectorisation.
---
Patch is 147.60 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/84805.diff
9 Files Affected:
- (modified) llvm/lib/Transforms/Utils/LoopUnroll.cpp (+267)
- (added) llvm/test/CodeGen/AArch64/polybench-3mm.ll (+55)
- (modified) llvm/test/Transforms/LoopUnroll/AArch64/falkor-prefetch.ll (+20)
- (modified) llvm/test/Transforms/LoopUnroll/ARM/instr-size-costs.ll (+12-8)
- (modified) llvm/test/Transforms/LoopUnroll/X86/znver3.ll (+409-141)
- (modified) llvm/test/Transforms/LoopUnroll/runtime-loop5.ll (+13-7)
- (modified) llvm/test/Transforms/LoopUnroll/runtime-unroll-remainder.ll (+13-7)
- (added) llvm/test/Transforms/LoopUnroll/simplify-reductions.ll (+301)
- (modified) llvm/test/Transforms/PhaseOrdering/SystemZ/sub-xor.ll (+108-55)
``````````diff
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 6f0d000815726e..b14d05d642e275 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -84,6 +84,11 @@ STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");
STATISTIC(NumUnrolledNotLatch, "Number of loops unrolled without a conditional "
"latch (completely or otherwise)");
+static cl::opt<bool>
+UnrollSimplifyReductions("unroll-simplify-reductions", cl::init(true),
+ cl::Hidden, cl::desc("Try to simplify reductions "
+ "after unrolling a loop."));
+
static cl::opt<bool>
UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(false), cl::Hidden,
cl::desc("Allow runtime unrolled loops to be unrolled "
@@ -209,6 +214,258 @@ static bool isEpilogProfitable(Loop *L) {
return false;
}
+/// This function tries to break apart simple reduction loops like the one
+/// below:
+///
+/// loop:
+/// PN = PHI [SUM2, loop], ...
+/// X = ...
+/// SUM1 = ADD (X, PN)
+/// Y = ...
+/// SUM2 = ADD (Y, SUM1)
+/// br loop
+///
+/// into independent sums of the form:
+///
+/// loop:
+/// PN1 = PHI [SUM1, loop], ...
+/// PN2 = PHI [SUM2, loop], ...
+/// X = ...
+/// SUM1 = ADD (X, PN1)
+/// Y = ...
+/// SUM2 = ADD (Y, PN2)
+/// <Reductions>
+/// br loop
+///
+/// where <Reductions> are new instructions inserted to compute the final
+/// values of the reduction from the partial sums we introduced, in this case:
+///
+/// <Reductions> =
+/// PN.red = ADD (PN1, PN2)
+/// SUM1.red = ADD (SUM1, PN2)
+/// SUM2.red = ADD (SUM1, SUM2)
+///
+/// In practice in most cases only one or two of the reduced values are
+/// required outside the loop so most of the reduction instructions do not
+/// need to be added into the loop. Moreover, these instructions can be sunk
+/// from the loop which happens in later passes.
+///
+/// This is a very common pattern in unrolled loops that compute dot products
+/// (for example) and breaking apart the reduction chains can help greatly with
+/// vectorisation.
+static bool trySimplifyReductions(Instruction &I) {
+ // Check if I is a PHINode (potentially the start of a reduction chain).
+ // Note: For simplicity we only consider loops that consist of a single
+ // basic block that branches to itself.
+ BasicBlock *BB = I.getParent();
+ PHINode *PN = dyn_cast<PHINode>(&I);
+ if (!PN || PN->getBasicBlockIndex(BB) == -1)
+ return false;
+
+ // Attempt to construct a list of instructions that are chained together
+ // (i.e. that perform a reduction).
+ SmallVector<BinaryOperator *, 16> Ops;
+ for (Instruction *Cur = PN, *Next = nullptr; /* true */; Cur = Next,
+ Next = nullptr) {
+ // Try to find the next element in the reduction chain.
+ for (auto *U : Cur->users()) {
+ auto *Candidate = dyn_cast<Instruction>(U);
+ if (Candidate && Candidate->getParent() == BB) {
+ // If we've already found a candidate element for the chain and we find
+ // *another* candidate we bail out as this means the intermediate
+ // values of the reduction are needed within the loop, and so there is
+ // no point in breaking the reduction apart.
+ if (Next)
+ return false;
+ Next = Candidate;
+ }
+ }
+ // If we've reached the start, i.e. the next element in the chain would be
+ // the PN we started with, we are done.
+ if (Next == PN)
+ break;
+ // Else, check if we found a candidate at all and if so if it is a binary
+ // operator.
+ if (!Next || !isa<BinaryOperator>(Next))
+ return false;
+ // If everything checks out, add the new element to the chain.
+ Ops.push_back(cast<BinaryOperator>(Next));
+ }
+
+ // Ensure the reduction comprises at least two instructions, otherwise this
+ // is a trivial reduction of a single element that doesn't need to be
+ // simplified.
+ if (Ops.size() < 2)
+ return false;
+
+ LLVM_DEBUG(
+ dbgs() << "Found candidate reduction: " << I << "\n";
+ for (auto const *Op : Ops)
+ dbgs() << " | " << *Op << "\n";
+ );
+
+ // Ensure all instructions perform the same operation and that the operation
+ // is associative and commutative so that we can break the chain apart and
+ // reassociate the Ops.
+ Instruction::BinaryOps const Opcode = Ops[0]->getOpcode();
+ for (auto const *Op : Ops)
+ if (Op->getOpcode() != Opcode || !Op->isAssociative() ||
+ !Op->isCommutative())
+ return false;
+
+ // Define the neutral element of the reduction or bail out if we don't have
+ // one defined.
+ // TODO: This could be generalised to other operations (e.g. MUL's).
+ Value *NeutralElem = nullptr;
+ switch (Opcode) {
+ case Instruction::BinaryOps::Add:
+ case Instruction::BinaryOps::Or:
+ case Instruction::BinaryOps::Xor:
+ case Instruction::BinaryOps::FAdd:
+ NeutralElem = Constant::getNullValue(PN->getType());
+ break;
+ case Instruction::BinaryOps::And:
+ NeutralElem = Constant::getAllOnesValue(PN->getType());
+ break;
+ case Instruction::BinaryOps::Mul:
+ case Instruction::BinaryOps::FMul:
+ default:
+ return false;
+ }
+ assert(NeutralElem && "Neutral element of reduction undefined.");
+
+ // --------------------------------------------------------------------- //
+ // At this point Ops is a list of chained binary operations performing a //
+ // reduction that we know we can break apart. //
+ // --------------------------------------------------------------------- //
+
+ // For shorthand, let N be the length of the chain.
+ unsigned const N = Ops.size();
+ LLVM_DEBUG(dbgs() << "Simplifying reduction of length " << N << ".\n");
+
+ // Create new phi nodes for all but the first element in the chain.
+ SmallVector<PHINode *, 16> Phis{PN};
+ for (unsigned i = 1; i < N; i++) {
+ PHINode *NewPN = PHINode::Create(PN->getType(), PN->getNumIncomingValues(),
+ PN->getName());
+ // Copy incoming blocks from the first/original PN to the new Phi and set
+ // their incoming values to the neutral element of the reduction.
+ for (auto *IncomingBB : PN->blocks())
+ NewPN->addIncoming(NeutralElem, IncomingBB);
+ NewPN->insertAfter(Phis.back());
+ Phis.push_back(NewPN);
+ }
+
+ // Set the chained operands of the Ops to the Phis and the incoming values of
+ // the Phis (for this BB) to the Ops.
+ for (unsigned i = 0; i < N; i++) {
+ PHINode *Phi = Phis[i];
+ Instruction *Op = Ops[i];
+
+ // Find the index of the operand of Op to replace. The first Op reads its
+ // value from the first Phi node. The other Ops read their value from the
+ // previous Op.
+ Value *OperandToReplace = i == 0 ? cast<Value>(PN) : Ops[i-1];
+ unsigned OperandIdx = Op->getOperand(0) == OperandToReplace ? 0 : 1;
+ assert(Op->getOperand(OperandIdx) == OperandToReplace &&
+ "Operand mismatch. Perhaps a malformed chain?");
+
+ // Set the operand of Op to Phi and the incoming value of Phi for BB to Op.
+ Op->setOperand(OperandIdx, Phi);
+ Phi->setIncomingValueForBlock(BB, Op);
+ }
+
+ // Replace old uses of PN and Ops outside this BB with the updated totals.
+ // The "old" total corresponding to PN now corresponds to the sum of all
+ // Phis. Similarly, the old totals in Ops correspond to the sum of the
+ // partial results in the new Ops up to the index of the Op we want to
+ // compute, plus the sum of the Phis from that index onwards.
+ //
+ // More rigorously, the totals can be computed as follows.
+ // 1. Let k be an index in the list of length N+1 below of the variables we
+ // want to compute the new totals for:
+ // { PN, Ops[0], Ops[1], ... }
+ // 2. Let Sum(k) denote the new total to compute for the k-th variable in the
+ // list above. Then,
+ // Sum(0) = Sum(PN) = \sum_{0 <= i < N} Phis[i],
+ // Sum(1) = Sum(Ops[0]) = \sum_{0 <= i < 1} Ops[i] +
+ // \sum_{1 <= i < N} Phis[i],
+ // ...
+ // Sum(N) = Sum(Ops[N-1]) = \sum_{0 <= i < N} Ops[i].
+ // 3. More generally,
+ // Sum(k) = Sum(PN) if k == 0 else Sum(Ops[k-1])
+ // = \sum_{0 <= i < k} Ops[i] +
+ // \sum_{k <= i < N} Phis[i],
+ // for 0 <= k <= N.
+ // 4. Finally, if we name the sums in Ops and Phis separately, i.e.
+ // SOps(k) = \sum_{0 <= i < k} Ops[i],
+ // SPhis(k) = \sum_{k <= i < N} Phis[i],
+ // then
+ // Sum(k) = SOps(k) + SPhis(k), 0 <= k <= N.
+ // .
+
+ // Helper function to create a new binary op.
+ // Note: We copy the flags from Ops[0]. Could this be too permissive?
+ auto CreateBinOp = [&](Value *V1, Value *V2) {
+ auto Name = PN->getName()+".red";
+ return BinaryOperator::CreateWithCopiedFlags(Opcode, V1, V2, Ops[0],
+ Name, &BB->back());
+ };
+
+ // Compute the partial sums of the Ops:
+ // SOps[k] = \sum_{0 <= i < k} Ops[i], 0 <= k <= N.
+ // For 1 <= k <= N we have:
+ // SOps[k] = Ops[k-1] + \sum_{0 <= i < k-1} Ops[i]
+ // = Ops[k-1] + SOps[k-1],
+ // so if we compute SOps in order (i.e. from 0 to N) we can reuse partial
+ // results.
+ SmallVector<Value *, 16> SOps(N+1);
+ SOps[0] = nullptr; // alternatively we could use NeutralElem
+ SOps[1] = Ops.front();
+ for (unsigned k = 2; k <= N; k++)
+ SOps[k] = CreateBinOp(SOps[k-1], Ops[k-1]);
+
+ // Compute the partial sums of the Phis:
+ // SPhis[k] = \sum_{k <= i < N} Phis[i], 0 <= k <= N.
+ // Similarly, for 0 <= k <= N-1 we have:
+ // SPhis[k] = Phis[k] + \sum_{k+1 <= i < N} Phis[i]
+ // = Phis[k] + SPhis[k+1],
+ // so if we compute SPhis in reverse (i.e. from N down to 0) we can reuse the
+ // partial sums computed thus far.
+ SmallVector<Value *, 16> SPhis(N+1);
+ SPhis[N] = nullptr; // alternatively we could use NeutralElem
+ SPhis[N-1] = Phis.back();
+ for (signed k = N-2; k >= 0; k--)
+ SPhis[k] = CreateBinOp(SPhis[k+1], Phis[k]);
+
+ // Finally, compute the total sums for PN and Ops from:
+ // Sums[k] = SOps[k] + SPhis[k], 0 <= k <= N.
+ // These sums might be dead so we add them to a weak tracking vector for
+ // cleanup afterwards.
+ SmallVector<WeakTrackingVH, 16> Sums(N+1);
+ for (unsigned k = 0; k <= N; k++) {
+ // Pick the Op we want to compute the new total for.
+ Value *Op = k == 0 ? cast<Value>(PN) : Ops[k-1];
+
+ Value *SOp = SOps[k], *SPhi = SPhis[k];
+ if (SOp && SPhi)
+ Sums[k] = CreateBinOp(SOp, SPhi);
+ else if (SOp)
+ Sums[k] = SOp;
+ else
+ Sums[k] = SPhi;
+
+ // Replace uses of the old total with the new total.
+ Op->replaceUsesOutsideBlock(Sums[k], BB);
+ }
+
+ // Drop dead totals. In case the totals *are* used they could and should be
+ // sunk, but this happens in later passes so we don't bother doing it here.
+ RecursivelyDeleteTriviallyDeadInstructionsPermissive(Sums);
+
+ return true;
+}
+
/// Perform some cleanup and simplifications on loops after unrolling. It is
/// useful to simplify the IV's in the new loop, as well as do a quick
/// simplify/dce pass of the instructions.
@@ -272,6 +529,16 @@ void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
// have a phi which (potentially indirectly) uses instructions later in
// the block we're iterating through.
RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);
+ // Try to simplify reductions (e.g. chains of floating-point adds) into
+ // independent operations (see more at trySimplifyReductions). This is a
+ // very common pattern in unrolled loops that compute dot products (for
+ // example).
+ //
+ // We do this outside the loop over the instructions above to let
+ // instsimplify kick in before trying to apply this transform.
+ if (UnrollSimplifyReductions)
+ for (PHINode &PN : BB->phis())
+ trySimplifyReductions(PN);
}
}
diff --git a/llvm/test/CodeGen/AArch64/polybench-3mm.ll b/llvm/test/CodeGen/AArch64/polybench-3mm.ll
new file mode 100644
index 00000000000000..309fa0e9a305f7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/polybench-3mm.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: opt -passes=loop-unroll,instcombine -unroll-count=2 %s | llc --mattr=,+neon | FileCheck %s
+
+target triple = "aarch64"
+
+; This is a reduced example adapted from the Polybench 3MM kernel.
+; We are doing something similar to:
+; double dot = 0.0;
+; for (long k = 0; k < 1000; k++)
+; dot += A[k] * B[k*nb];
+; return dot;
+
+define double @test(ptr %A, ptr %B, i64 %nb) {
+; CHECK-LABEL: test:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi d0, #0000000000000000
+; CHECK-NEXT: movi d1, #0000000000000000
+; CHECK-NEXT: lsl x8, x2, #4
+; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: .LBB0_1: // %loop
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x10, x0, x9, lsl #3
+; CHECK-NEXT: ldr d2, [x1]
+; CHECK-NEXT: ldr d5, [x1, x2, lsl #3]
+; CHECK-NEXT: add x9, x9, #2
+; CHECK-NEXT: add x1, x1, x8
+; CHECK-NEXT: ldp d3, d4, [x10]
+; CHECK-NEXT: cmp x9, #1000
+; CHECK-NEXT: fmadd d0, d2, d3, d0
+; CHECK-NEXT: fmadd d1, d5, d4, d1
+; CHECK-NEXT: b.ne .LBB0_1
+; CHECK-NEXT: // %bb.2: // %exit
+; CHECK-NEXT: fadd d0, d0, d1
+; CHECK-NEXT: ret
+entry:
+ br label %loop
+
+loop:
+ %k = phi i64 [ %k.next, %loop ], [ 0, %entry ]
+ %dot = phi double [ %dot.next, %loop ], [ 0.000000e+00, %entry ]
+ %A.gep = getelementptr inbounds double, ptr %A, i64 %k
+ %A.val = load double, ptr %A.gep, align 8
+ %B.idx = mul nsw i64 %k, %nb
+ %B.gep = getelementptr inbounds double, ptr %B, i64 %B.idx
+ %B.val = load double, ptr %B.gep, align 8
+ %fmul = fmul fast double %B.val, %A.val
+ %dot.next = fadd fast double %fmul, %dot
+ %k.next = add nuw nsw i64 %k, 1
+ %cmp = icmp eq i64 %k.next, 1000
+ br i1 %cmp, label %exit, label %loop
+
+exit:
+ %dot.next.lcssa = phi double [ %dot.next, %loop ]
+ ret double %dot.next.lcssa
+}
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/falkor-prefetch.ll b/llvm/test/Transforms/LoopUnroll/AArch64/falkor-prefetch.ll
index 045b1c72321a97..874a4aa800c411 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/falkor-prefetch.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/falkor-prefetch.ll
@@ -73,6 +73,13 @@ exit:
; NOHWPF-LABEL: loop2:
; NOHWPF-NEXT: phi
; NOHWPF-NEXT: phi
+; NOHWPF-NEXT: phi
+; NOHWPF-NEXT: phi
+; NOHWPF-NEXT: phi
+; NOHWPF-NEXT: phi
+; NOHWPF-NEXT: phi
+; NOHWPF-NEXT: phi
+; NOHWPF-NEXT: phi
; NOHWPF-NEXT: getelementptr
; NOHWPF-NEXT: load
; NOHWPF-NEXT: add
@@ -106,6 +113,13 @@ exit:
; NOHWPF-NEXT: add
; NOHWPF-NEXT: add
; NOHWPF-NEXT: icmp
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: add
+; NOHWPF-NEXT: add
; NOHWPF-NEXT: br
; NOHWPF-NEXT-LABEL: exit2:
;
@@ -113,6 +127,9 @@ exit:
; CHECK-LABEL: loop2:
; CHECK-NEXT: phi
; CHECK-NEXT: phi
+; CHECK-NEXT: phi
+; CHECK-NEXT: phi
+; CHECK-NEXT: phi
; CHECK-NEXT: getelementptr
; CHECK-NEXT: load
; CHECK-NEXT: add
@@ -130,6 +147,9 @@ exit:
; CHECK-NEXT: add
; CHECK-NEXT: add
; CHECK-NEXT: icmp
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
; CHECK-NEXT: br
; CHECK-NEXT-LABEL: exit2:
diff --git a/llvm/test/Transforms/LoopUnroll/ARM/instr-size-costs.ll b/llvm/test/Transforms/LoopUnroll/ARM/instr-size-costs.ll
index 216bf489bc66ec..59208a6da76f62 100644
--- a/llvm/test/Transforms/LoopUnroll/ARM/instr-size-costs.ll
+++ b/llvm/test/Transforms/LoopUnroll/ARM/instr-size-costs.ll
@@ -195,14 +195,15 @@ define i32 @test_i32_select_optsize(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-V8-NEXT: br label [[LOOP:%.*]]
; CHECK-V8: loop:
; CHECK-V8-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[COUNT_1:%.*]], [[LOOP]] ]
-; CHECK-V8-NEXT: [[ACC:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ACC_NEXT_1:%.*]], [[LOOP]] ]
+; CHECK-V8-NEXT: [[ACC:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ACC_NEXT:%.*]], [[LOOP]] ]
+; CHECK-V8-NEXT: [[ACC1:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ACC_NEXT_1:%.*]], [[LOOP]] ]
; CHECK-V8-NEXT: [[ADDR_A:%.*]] = getelementptr i32, ptr [[A:%.*]], i32 [[IV]]
; CHECK-V8-NEXT: [[ADDR_B:%.*]] = getelementptr i32, ptr [[B:%.*]], i32 [[IV]]
; CHECK-V8-NEXT: [[DATA_A:%.*]] = load i32, ptr [[ADDR_A]], align 4
; CHECK-V8-NEXT: [[DATA_B:%.*]] = load i32, ptr [[ADDR_B]], align 4
; CHECK-V8-NEXT: [[UGT:%.*]] = icmp ugt i32 [[DATA_A]], [[DATA_B]]
; CHECK-V8-NEXT: [[UMAX:%.*]] = select i1 [[UGT]], i32 [[DATA_A]], i32 [[DATA_B]]
-; CHECK-V8-NEXT: [[ACC_NEXT:%.*]] = add i32 [[UMAX]], [[ACC]]
+; CHECK-V8-NEXT: [[ACC_NEXT]] = add i32 [[UMAX]], [[ACC]]
; CHECK-V8-NEXT: [[ADDR_C:%.*]] = getelementptr i32, ptr [[C:%.*]], i32 [[IV]]
; CHECK-V8-NEXT: store i32 [[UMAX]], ptr [[ADDR_C]], align 4
; CHECK-V8-NEXT: [[COUNT:%.*]] = add nuw nsw i32 [[IV]], 1
@@ -212,14 +213,15 @@ define i32 @test_i32_select_optsize(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-V8-NEXT: [[DATA_B_1:%.*]] = load i32, ptr [[ADDR_B_1]], align 4
; CHECK-V8-NEXT: [[UGT_1:%.*]] = icmp ugt i32 [[DATA_A_1]], [[DATA_B_1]]
; CHECK-V8-NEXT: [[UMAX_1:%.*]] = select i1 [[UGT_1]], i32 [[DATA_A_1]], i32 [[DATA_B_1]]
-; CHECK-V8-NEXT: [[ACC_NEXT_1]] = add i32 [[UMAX_1]], [[ACC_NEXT]]
+; CHECK-V8-NEXT: [[ACC_NEXT_1]] = add i32 [[UMAX_1]], [[ACC1]]
; CHECK-V8-NEXT: [[ADDR_C_1:%.*]] = getelementptr i32, ptr [[C]], i32 [[COUNT]]
; CHECK-V8-NEXT: store i32 [[UMAX_1]], ptr [[ADDR_C_1]], align 4
; CHECK-V8-NEXT: [[COUNT_1]] = add nuw nsw i32 [[IV]], 2
; CHECK-V8-NEXT: [[END_1:%.*]] = icmp ne i32 [[COUNT_1]], 100
+; CHECK-V8-NEXT: [[ACC_RED:%.*]] = add i32 [[ACC_NEXT]], [[ACC_NEXT_1]]
; CHECK-V8-NEXT: br i1 [[END_1]], label [[LOOP]], label [[EXIT:%.*]]
; CHECK-V8: exit:
-; CHECK-V8-NEXT: [[ACC_NEXT_LCSSA:%.*]] = phi i32 [ [[ACC_NEXT_1]], [[LOOP]] ]
+; CHECK-V8-NEXT: [[ACC_NEXT_LCSSA:%.*]] = phi i32 [ [[ACC_RED]], [[LOOP]] ]
; CHECK-V8-NEXT: ret i32 [[ACC_NEXT_LCSSA]]
;
entry:
@@ -251,14 +253,15 @@ define i32 @test_i32_select_minsize(ptr %a, ptr %b, ptr %c) #1 {
; CHECK-V8-NEXT: br label [[LOOP:%.*]]
; CHECK-V8: loop:
; CHECK-V8-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[COUNT_1:%.*]], [[LOOP]] ]
-; CHECK-V8-NEXT: [[ACC:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ACC_NEXT_1:%.*]], [[LOOP]] ]
+; CHECK-V8-NEXT: [[ACC:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ACC_NEXT:%.*]], [[LOOP]] ]
+; CHECK-V8-NEXT: [[ACC1:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ACC_NEXT_1:%.*]], [[LOOP]] ]
; CHECK-V8-NEXT: [[ADDR_A:%.*]] = getelementptr i32, ptr [[A:%.*]], i32 [[IV]]
; CHECK-V8-NEXT: [[ADDR_B:%.*]] = getelementptr i32, ptr [[B:%.*]], i32 [[IV]]
; CHECK-V8-NEXT: [[DATA_A:%.*]] = load i32, ptr [[ADDR_A]], align 4
; CHECK-V8-NEXT: [[DATA_B:%.*]] = load i32, ptr [[ADDR_B]], align 4
; CHECK-V8-NEXT: [[UGT:%.*]] = icmp ugt i32 [[DATA_A]], [[DATA_B]]
; CHECK-V8-NEXT: [[UMAX:%.*]] = select i1 [[UGT]], i32 [[DATA_A]], i32 [[DATA_B]]
-; CHECK-V8-NEXT: [[ACC_NEXT:%.*]] = add i32 [[UMAX]], [[ACC]]
+; CHECK-V8-NEXT: [[ACC_NEXT]] = add i32 [[UMAX]], [[ACC]]
; CHECK-V8-NEXT: [[ADDR_C:%.*]] = getelementptr i32, ptr [[C:%.*]], i32 [[IV]]
; CHECK-V8-NEXT: store i32 [[UMAX]], ptr [[ADDR_C]], align 4
; CHECK-V8-NEXT: [[COUNT:%.*]] = add nuw nsw i32 [[IV]], 1
@@ -268,14 +271,15 @@ define i32 @test_i32_select_minsize(ptr %a, ptr %b, ptr %c) #1 {
; CHECK-V8-NEXT: [[DATA_B_1:%.*]] = load i32, ptr [[ADDR_B_1]], align 4
; CHECK-V8-NEXT: [[UGT_1:%.*]] = icmp ugt i32 [[DATA_A_1]], [[DATA_B_1]]
; CHECK-V8-NEXT: [[UMAX_1:%.*]] = select i1 [[UGT_1]], i32 [[DATA_A_1]], i32 [[DATA_B_1]]
-; CHECK-V8-NEXT: [[ACC_NEXT_1]] = add i32 [[UMAX_1]], [[ACC_NEXT]]
+; CHECK-V8-NEXT: [[ACC_NEXT_1]] = add i32 [[UMAX_1]], [[ACC1]]
; CHECK-V8-NEXT: [[ADDR_C_1:%.*]] = getelementptr i32, ptr [[C]], i32 [[COUNT]]
; CHECK-V8-NEXT: store i32 [[UMAX_1]], ptr [[ADDR_C_1]], align 4
; CHECK-V8-NEXT: [[COUNT_1]] = add nuw nsw i32 [[IV]], 2
; CHECK-V8-NEXT:...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/84805
More information about the llvm-commits
mailing list