[llvm-branch-commits] [llvm] PR for llvm/llvm-project#79800 (PR #79813)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Jan 29 03:52:33 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-llvm-transforms
Author: None (llvmbot)
Changes
resolves llvm/llvm-project#79800
---
Full diff: https://github.com/llvm/llvm-project/pull/79813.diff
2 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+56-6)
- (added) llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll (+217)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6ca93e15719fb27..dd596c567cd4824 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1957,6 +1957,8 @@ class GeneratedRTChecks {
bool CostTooHigh = false;
const bool AddBranchWeights;
+ Loop *OuterLoop = nullptr;
+
public:
GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
TargetTransformInfo *TTI, const DataLayout &DL,
@@ -2053,6 +2055,9 @@ class GeneratedRTChecks {
DT->eraseNode(SCEVCheckBlock);
LI->removeBlock(SCEVCheckBlock);
}
+
+ // Outer loop is used as part of the later cost calculations.
+ OuterLoop = L->getParentLoop();
}
InstructionCost getCost() {
@@ -2076,16 +2081,61 @@ class GeneratedRTChecks {
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
RTCheckCost += C;
}
- if (MemCheckBlock)
+ if (MemCheckBlock) {
+ InstructionCost MemCheckCost = 0;
for (Instruction &I : *MemCheckBlock) {
if (MemCheckBlock->getTerminator() == &I)
continue;
InstructionCost C =
TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
- RTCheckCost += C;
+ MemCheckCost += C;
}
+ // If the runtime memory checks are being created inside an outer loop
+ // we should find out if these checks are outer loop invariant. If so,
+ // the checks will likely be hoisted out and so the effective cost will
+ // reduce according to the outer loop trip count.
+ if (OuterLoop) {
+ ScalarEvolution *SE = MemCheckExp.getSE();
+ // TODO: If profitable, we could refine this further by analysing every
+ // individual memory check, since there could be a mixture of loop
+ // variant and invariant checks that mean the final condition is
+ // variant.
+ const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
+ if (SE->isLoopInvariant(Cond, OuterLoop)) {
+ // It seems reasonable to assume that we can reduce the effective
+ // cost of the checks even when we know nothing about the trip
+ // count. Assume that the outer loop executes at least twice.
+ unsigned BestTripCount = 2;
+
+ // If exact trip count is known use that.
+ if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
+ BestTripCount = SmallTC;
+ else if (LoopVectorizeWithBlockFrequency) {
+ // Else use profile data if available.
+ if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
+ BestTripCount = *EstimatedTC;
+ }
+
+ InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
+
+ // Let's ensure the cost is always at least 1.
+ NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
+ (InstructionCost::CostType)1);
+
+ LLVM_DEBUG(dbgs()
+ << "We expect runtime memory checks to be hoisted "
+ << "out of the outer loop. Cost reduced from "
+ << MemCheckCost << " to " << NewMemCheckCost << '\n');
+
+ MemCheckCost = NewMemCheckCost;
+ }
+ }
+
+ RTCheckCost += MemCheckCost;
+ }
+
if (SCEVCheckBlock || MemCheckBlock)
LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
<< "\n");
@@ -2144,8 +2194,8 @@ class GeneratedRTChecks {
BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
// Create new preheader for vector loop.
- if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
- PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
+ if (OuterLoop)
+ OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
SCEVCheckBlock->getTerminator()->eraseFromParent();
SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
@@ -2179,8 +2229,8 @@ class GeneratedRTChecks {
DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
MemCheckBlock->moveBefore(LoopVectorPreHeader);
- if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
- PL->addBasicBlockToLoop(MemCheckBlock, *LI);
+ if (OuterLoop)
+ OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
BranchInst &BI =
*BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
new file mode 100644
index 000000000000000..8a796bb3065b196
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
@@ -0,0 +1,217 @@
+; REQUIRES: asserts
+; RUN: opt -p loop-vectorize -debug-only=loop-vectorize -S -disable-output < %s 2>&1 | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @no_outer_loop(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %off, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'no_outer_loop'
+; CHECK: Calculating cost of runtime checks:
+; CHECK-NOT: We expect runtime memory checks to be hoisted out of the outer loop.
+; CHECK: Total cost of runtime checks: 4
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+ br label %inner.loop
+
+inner.loop:
+ %inner.iv = phi i64 [ 0, %entry ], [ %inner.iv.next, %inner.loop ]
+ %add.us = add nuw nsw i64 %inner.iv, %off
+ %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+ %0 = load i8, ptr %arrayidx.us, align 1
+ %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+ %1 = load i8, ptr %arrayidx7.us, align 1
+ %add9.us = add i8 %1, %0
+ store i8 %add9.us, ptr %arrayidx7.us, align 1
+ %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+ %exitcond.not = icmp eq i64 %inner.iv.next, %n
+ br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+ ret void
+}
+
+define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
+; CHECK: Calculating cost of runtime checks:
+; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 3
+; CHECK: Total cost of runtime checks: 3
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+ br label %outer.loop
+
+outer.loop:
+ %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+ %mul.us = mul nsw i64 %outer.iv, %n
+ br label %inner.loop
+
+inner.loop:
+ %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+ %add.us = add nuw nsw i64 %inner.iv, %mul.us
+ %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+ %0 = load i8, ptr %arrayidx.us, align 1
+ %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+ %1 = load i8, ptr %arrayidx7.us, align 1
+ %add9.us = add i8 %1, %0
+ store i8 %add9.us, ptr %arrayidx7.us, align 1
+ %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+ %exitcond.not = icmp eq i64 %inner.iv.next, %n
+ br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+ %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+ %exitcond27.not = icmp eq i64 %outer.iv.next, %m
+ br i1 %exitcond27.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+ ret void
+}
+
+
+define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
+; CHECK: Calculating cost of runtime checks:
+; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
+; CHECK: Total cost of runtime checks: 2
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+ br label %outer.loop
+
+outer.loop:
+ %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+ %mul.us = mul nsw i64 %outer.iv, %n
+ br label %inner.loop
+
+inner.loop:
+ %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+ %add.us = add nuw nsw i64 %inner.iv, %mul.us
+ %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+ %0 = load i8, ptr %arrayidx.us, align 1
+ %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+ %1 = load i8, ptr %arrayidx7.us, align 1
+ %add9.us = add i8 %1, %0
+ store i8 %add9.us, ptr %arrayidx7.us, align 1
+ %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+ %exitcond.not = icmp eq i64 %inner.iv.next, %n
+ br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+ %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+ %exitcond26.not = icmp eq i64 %outer.iv.next, 3
+ br i1 %exitcond26.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+ ret void
+}
+
+
+define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
+; CHECK: Calculating cost of runtime checks:
+; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 1
+; CHECK: Total cost of runtime checks: 1
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+ br label %outer.loop
+
+outer.loop:
+ %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+ %mul.us = mul nsw i64 %outer.iv, %n
+ br label %inner.loop
+
+inner.loop:
+ %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+ %add.us = add nuw nsw i64 %inner.iv, %mul.us
+ %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+ %0 = load i8, ptr %arrayidx.us, align 1
+ %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+ %1 = load i8, ptr %arrayidx7.us, align 1
+ %add9.us = add i8 %1, %0
+ store i8 %add9.us, ptr %arrayidx7.us, align 1
+ %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+ %exitcond.not = icmp eq i64 %inner.iv.next, %n
+ br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+ %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+ %exitcond26.not = icmp eq i64 %outer.iv.next, 64
+ br i1 %exitcond26.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+ ret void
+}
+
+
+define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3'
+; CHECK: Calculating cost of runtime checks:
+; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
+; CHECK: Total cost of runtime checks: 2
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+ br label %outer.loop
+
+outer.loop:
+ %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+ %mul.us = mul nsw i64 %outer.iv, %n
+ br label %inner.loop
+
+inner.loop:
+ %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+ %add.us = add nuw nsw i64 %inner.iv, %mul.us
+ %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+ %0 = load i8, ptr %arrayidx.us, align 1
+ %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+ %1 = load i8, ptr %arrayidx7.us, align 1
+ %add9.us = add i8 %1, %0
+ store i8 %add9.us, ptr %arrayidx7.us, align 1
+ %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+ %exitcond.not = icmp eq i64 %inner.iv.next, %n
+ br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+ %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+ %exitcond26.not = icmp eq i64 %outer.iv.next, %m
+ br i1 %exitcond26.not, label %outer.exit, label %outer.loop, !prof !0
+
+outer.exit:
+ ret void
+}
+
+
+define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks'
+; CHECK: Calculating cost of runtime checks:
+; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
+; CHECK: Total cost of runtime checks: 2
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:4
+entry:
+ br label %outer.loop
+
+outer.loop:
+ %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %inner.exit ]
+ %0 = mul nsw i64 %outer.iv, %n
+ br label %inner.loop
+
+inner.loop:
+ %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ]
+ %1 = add nuw nsw i64 %iv.inner, %0
+ %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %1
+ %2 = load i32, ptr %arrayidx.us, align 4
+ %arrayidx8.us = getelementptr inbounds i32, ptr %dst, i64 %1
+ %3 = load i32, ptr %arrayidx8.us, align 4
+ %add9.us = add nsw i32 %3, %2
+ store i32 %add9.us, ptr %arrayidx8.us, align 4
+ %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+ %inner.exit.cond = icmp eq i64 %iv.inner.next, %n
+ br i1 %inner.exit.cond, label %inner.exit, label %inner.loop
+
+inner.exit:
+ %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+ %outer.exit.cond = icmp eq i64 %outer.iv.next, 3
+ br i1 %outer.exit.cond, label %outer.exit, label %outer.loop
+
+outer.exit:
+ ret void
+}
+
+
+!0 = !{!"branch_weights", i32 10, i32 20}
``````````
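As a minimal standalone sketch (not the patch itself), the amortisation heuristic the diff adds can be summarised as follows; plain integers stand in for `InstructionCost` and the function name `effectiveMemCheckCost` is hypothetical:

```cpp
#include <algorithm>
#include <cstdint>

// Effective cost of outer-loop-invariant runtime memory checks, assuming they
// are hoisted out of the outer loop and amortised over its iterations.
uint64_t effectiveMemCheckCost(uint64_t MemCheckCost, uint64_t BestTripCount) {
  // BestTripCount is the exact outer trip count if known, otherwise a
  // profile-based estimate, otherwise the conservative default of 2.
  uint64_t NewCost = MemCheckCost / BestTripCount;
  // The amortised cost is never allowed to drop below 1.
  return std::max<uint64_t>(NewCost, 1);
}
```

This matches the expectations in the new test: a memory-check cost of 6 becomes 2 for an outer trip count of 3 (exact or PGO-estimated) and 1 for a trip count of 64.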
https://github.com/llvm/llvm-project/pull/79813