[llvm] [LSR] Don't count conditional loads/stores as enabling pre/post-index (PR #159573)
John Brawn via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 6 10:17:52 PDT 2025
https://github.com/john-brawn-arm updated https://github.com/llvm/llvm-project/pull/159573
>From e093aa2d395f53e4f6d51afc383d8697536d52ae Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn at arm.com>
Date: Thu, 19 Jun 2025 11:08:35 +0100
Subject: [PATCH 1/5] [LSR] Don't count conditional loads/stores as enabling
pre/post-index
When a load/store is conditionally executed in a loop, it isn't a
candidate for pre/post-index addressing, as the increment of the
address would only happen on those loop iterations where the
load/store is executed.
Detect this and only discount the AddRec cost when the load/store is
unconditional.
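
For illustration, here is a rough sketch of the kind of loop this affects
(hand-written, with illustrative names, not taken verbatim from the tests
below). Pre/post-indexed addressing folds the pointer increment into the
load/store itself, so if that load/store is guarded by a branch the pointer
only advances on iterations that take the branch, while the induction
variable advances on every iteration:

define i32 @conditional_sum(ptr %p, ptr %q, i64 %n) {
entry:
  br label %for.body

for.body:
  %idx = phi i64 [ 0, %entry ], [ %idx.next, %for.inc ]
  %ret = phi i32 [ 0, %entry ], [ %ret.next, %for.inc ]
  %paddr = getelementptr inbounds i32, ptr %p, i64 %idx
  %pval = load i32, ptr %paddr, align 4   ; unconditional: pre/post-index is fine
  %tobool = icmp eq i32 %pval, 0
  br i1 %tobool, label %for.inc, label %if.then

if.then:
  %qaddr = getelementptr inbounds i32, ptr %q, i64 %idx
  %qval = load i32, ptr %qaddr, align 4   ; conditional: folding the +4 increment into
  %add = add i32 %ret, %qval              ; this load would skip the increment whenever
  br label %for.inc                       ; the branch above goes straight to for.inc

for.inc:
  %ret.next = phi i32 [ %add, %if.then ], [ %ret, %for.body ]
  %idx.next = add nuw nsw i64 %idx, 1
  %cmp = icmp slt i64 %idx.next, %n
  br i1 %cmp, label %for.body, label %exit

exit:
  ret i32 %ret.next
}

With this change the conditional load keeps plain offset addressing, as the
updated prefer-all.ll test below checks.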
---
.../Transforms/Scalar/LoopStrengthReduce.cpp | 32 ++++++++++++++++---
.../LoopStrengthReduce/AArch64/prefer-all.ll | 6 ++--
2 files changed, 29 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 1a279b6198182..22ab5820b7ec1 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1278,6 +1278,7 @@ struct LSRFixup {
LSRFixup() = default;
bool isUseFullyOutsideLoop(const Loop *L) const;
+ bool isUseUnconditional(const Loop *L) const;
void print(raw_ostream &OS) const;
void dump() const;
@@ -1318,6 +1319,11 @@ class LSRUse {
/// the loop, in which case some special-case heuristics may be used.
bool AllFixupsOutsideLoop = true;
+ /// This records whether all of the fixups using this LSRUse are unconditional
+ /// within the loop, meaning they will be executed in every iteration of the
+ /// loop.
+ bool AllFixupsUnconditional = true;
+
/// RigidFormula is set to true to guarantee that this use will be associated
/// with a single formula--the one that initially matched. Some SCEV
/// expressions cannot be expanded. This allows LSR to consider the registers
@@ -1422,15 +1428,19 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
const SCEV *Start;
const SCEVConstant *Step;
- if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step))))
+ if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step)))) {
// If the step size matches the base offset, we could use pre-indexed
// addressing.
- if (((AMK & TTI::AMK_PreIndexed) && F.BaseOffset.isFixed() &&
- Step->getAPInt() == F.BaseOffset.getFixedValue()) ||
- ((AMK & TTI::AMK_PostIndexed) && !isa<SCEVConstant>(Start) &&
- SE->isLoopInvariant(Start, L)))
+ bool CanPreIndex = (AMK & TTI::AMK_PreIndexed) && F.BaseOffset.isFixed() &&
+ Step->getAPInt() == F.BaseOffset.getFixedValue();
+ bool CanPostIndex = (AMK & TTI::AMK_PostIndexed) && !isa<SCEVConstant>(Start) &&
+ SE->isLoopInvariant(Start, L);
+ // We can only pre or post index when the load/store is unconditional.
+ if ((CanPreIndex || CanPostIndex) && LU.AllFixupsUnconditional)
LoopCost = 0;
+ }
}
+
// If the loop counts down to zero and we'll be using a hardware loop then
// the addrec will be combined into the hardware loop instruction.
if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() &&
@@ -1647,6 +1657,12 @@ bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
return !L->contains(UserInst);
}
+/// Test whether this fixup is for an instruction that's unconditional, i.e.
+/// it's executed in every loop iteration.
+bool LSRFixup::isUseUnconditional(const Loop *L) const {
+ return isGuaranteedToExecuteForEveryIteration(UserInst, L);
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LSRFixup::print(raw_ostream &OS) const {
OS << "UserInst=";
@@ -1783,6 +1799,9 @@ void LSRUse::print(raw_ostream &OS) const {
if (AllFixupsOutsideLoop)
OS << ", all-fixups-outside-loop";
+ if (AllFixupsUnconditional)
+ OS << ", all-fixups-unconditional";
+
if (WidestFixupType)
OS << ", widest fixup type: " << *WidestFixupType;
}
@@ -3607,6 +3626,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
LF.PostIncLoops = TmpPostIncLoops;
LF.Offset = Offset;
LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+ LU.AllFixupsUnconditional &= LF.isUseUnconditional(L);
// Create SCEV as Formula for calculating baseline cost
if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
@@ -3803,6 +3823,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
LF.OperandValToReplace = U;
LF.Offset = Offset;
LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+ LU.AllFixupsUnconditional &= LF.isUseUnconditional(L);
if (!LU.WidestFixupType ||
SE.getTypeSizeInBits(LU.WidestFixupType) <
SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
@@ -4940,6 +4961,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
+ LUThatHas->AllFixupsUnconditional &= LU.AllFixupsUnconditional;
// Transfer the fixups of LU to LUThatHas.
for (LSRFixup &Fixup : LU.Fixups) {
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll
index db30fd23b0c9d..065a6c8b980f8 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll
@@ -119,8 +119,6 @@ for.end:
; We can't use postindex addressing on the conditional load of qval and can't
; convert the loop condition to a compare with zero, so we should instead use
; offset addressing.
-; FIXME: Currently we don't notice the load of qval is conditional, and attempt
-; postindex addressing anyway.
define i32 @conditional_load(ptr %p, ptr %q, ptr %n) {
; CHECK-LABEL: define i32 @conditional_load(
; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[N:%.*]]) {
@@ -128,7 +126,6 @@ define i32 @conditional_load(ptr %p, ptr %q, ptr %n) {
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], %[[FOR_INC:.*]] ], [ [[P]], %[[ENTRY]] ]
-; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_INC]] ], [ [[Q]], %[[ENTRY]] ]
; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ [[IDX_NEXT:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[RET_NEXT:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[PVAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4
@@ -136,6 +133,8 @@ define i32 @conditional_load(ptr %p, ptr %q, ptr %n) {
; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label %[[FOR_INC]], label %[[IF_THEN:.*]]
; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[IDX]], 2
+; CHECK-NEXT: [[LSR_IV:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP0]]
; CHECK-NEXT: [[QVAL:%.*]] = load i32, ptr [[LSR_IV]], align 4
; CHECK-NEXT: [[ADD:%.*]] = add i32 [[RET]], [[QVAL]]
; CHECK-NEXT: br label %[[FOR_INC]]
@@ -143,7 +142,6 @@ define i32 @conditional_load(ptr %p, ptr %q, ptr %n) {
; CHECK-NEXT: [[RET_NEXT]] = phi i32 [ [[ADD]], %[[IF_THEN]] ], [ [[RET]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[IDX_NEXT]] = add nuw nsw i64 [[IDX]], 1
; CHECK-NEXT: [[NVAL:%.*]] = load volatile i64, ptr [[N]], align 8
-; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 4
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[IDX_NEXT]], [[NVAL]]
; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
>From cf3be4e30f5b6bf4f8e17c490375729912f1879a Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn at arm.com>
Date: Thu, 18 Sep 2025 14:34:12 +0100
Subject: [PATCH 2/5] clang-format
---
llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 22ab5820b7ec1..26d8e6127b370 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1431,10 +1431,12 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step)))) {
// If the step size matches the base offset, we could use pre-indexed
// addressing.
- bool CanPreIndex = (AMK & TTI::AMK_PreIndexed) && F.BaseOffset.isFixed() &&
- Step->getAPInt() == F.BaseOffset.getFixedValue();
- bool CanPostIndex = (AMK & TTI::AMK_PostIndexed) && !isa<SCEVConstant>(Start) &&
- SE->isLoopInvariant(Start, L);
+ bool CanPreIndex = (AMK & TTI::AMK_PreIndexed) &&
+ F.BaseOffset.isFixed() &&
+ Step->getAPInt() == F.BaseOffset.getFixedValue();
+ bool CanPostIndex = (AMK & TTI::AMK_PostIndexed) &&
+ !isa<SCEVConstant>(Start) &&
+ SE->isLoopInvariant(Start, L);
// We can only pre or post index when the load/store is unconditional.
if ((CanPreIndex || CanPostIndex) && LU.AllFixupsUnconditional)
LoopCost = 0;
>From 01c8ed1a0e47ffaa7ffe50f913477d2ad9357c20 Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn at arm.com>
Date: Thu, 18 Sep 2025 15:48:58 +0100
Subject: [PATCH 3/5] Use m_scev_APInt
---
llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 26d8e6127b370..23b2bc70f560e 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1427,13 +1427,13 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
const SCEV *Start;
- const SCEVConstant *Step;
- if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step)))) {
+ const APInt *Step;
+ if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_scev_APInt(Step)))) {
// If the step size matches the base offset, we could use pre-indexed
// addressing.
bool CanPreIndex = (AMK & TTI::AMK_PreIndexed) &&
F.BaseOffset.isFixed() &&
- Step->getAPInt() == F.BaseOffset.getFixedValue();
+ *Step == F.BaseOffset.getFixedValue();
bool CanPostIndex = (AMK & TTI::AMK_PostIndexed) &&
!isa<SCEVConstant>(Start) &&
SE->isLoopInvariant(Start, L);
>From 671aa5c879af5ae234f827a58c49829932bc6039 Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn at arm.com>
Date: Thu, 18 Sep 2025 16:09:39 +0100
Subject: [PATCH 4/5] Update minloop.ll test
---
.../Thumb2/LowOverheadLoops/minloop.ll | 70 ++++++++++---------
1 file changed, 37 insertions(+), 33 deletions(-)
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll
index 9c36bae6fac13..ec257bcf123f3 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll
@@ -6,77 +6,81 @@ define void @arm_min_q31(ptr nocapture readonly %pSrc, i32 %blockSize, ptr nocap
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: .pad #4
+; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: ldr.w r12, [r0]
; CHECK-NEXT: subs.w r9, r1, #1
; CHECK-NEXT: beq .LBB0_3
; CHECK-NEXT: @ %bb.1: @ %while.body.preheader
-; CHECK-NEXT: and r8, r9, #3
+; CHECK-NEXT: and r6, r9, #3
; CHECK-NEXT: subs r7, r1, #2
; CHECK-NEXT: cmp r7, #3
; CHECK-NEXT: bhs .LBB0_4
; CHECK-NEXT: @ %bb.2:
-; CHECK-NEXT: movs r6, #0
-; CHECK-NEXT: b .LBB0_6
+; CHECK-NEXT: mov.w r10, #0
+; CHECK-NEXT: cbnz r6, .LBB0_7
+; CHECK-NEXT: b .LBB0_10
; CHECK-NEXT: .LBB0_3:
-; CHECK-NEXT: movs r6, #0
+; CHECK-NEXT: mov.w r10, #0
; CHECK-NEXT: b .LBB0_10
; CHECK-NEXT: .LBB0_4: @ %while.body.preheader.new
; CHECK-NEXT: bic r7, r9, #3
-; CHECK-NEXT: movs r6, #1
+; CHECK-NEXT: str r6, [sp] @ 4-byte Spill
; CHECK-NEXT: subs r7, #4
+; CHECK-NEXT: movs r6, #1
+; CHECK-NEXT: mov.w r8, #0
+; CHECK-NEXT: mov.w r10, #0
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
-; CHECK-NEXT: movs r6, #0
-; CHECK-NEXT: movs r7, #4
; CHECK-NEXT: .LBB0_5: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr r10, [r0, #16]!
-; CHECK-NEXT: sub.w r9, r9, #4
-; CHECK-NEXT: ldrd r5, r4, [r0, #-12]
-; CHECK-NEXT: ldr r11, [r0, #-4]
+; CHECK-NEXT: ldr r11, [r0, #16]!
+; CHECK-NEXT: ldrd r5, r7, [r0, #-12]
+; CHECK-NEXT: ldr r4, [r0, #-4]
; CHECK-NEXT: cmp r12, r5
-; CHECK-NEXT: it gt
-; CHECK-NEXT: subgt r6, r7, #3
; CHECK-NEXT: csel r5, r5, r12, gt
-; CHECK-NEXT: cmp r5, r4
+; CHECK-NEXT: csinc r6, r10, r8, le
+; CHECK-NEXT: cmp r5, r7
; CHECK-NEXT: it gt
-; CHECK-NEXT: subgt r6, r7, #2
-; CHECK-NEXT: csel r5, r4, r5, gt
-; CHECK-NEXT: cmp r5, r11
+; CHECK-NEXT: addgt.w r6, r8, #2
+; CHECK-NEXT: csel r7, r7, r5, gt
+; CHECK-NEXT: cmp r7, r4
; CHECK-NEXT: it gt
-; CHECK-NEXT: subgt r6, r7, #1
-; CHECK-NEXT: csel r5, r11, r5, gt
-; CHECK-NEXT: cmp r5, r10
-; CHECK-NEXT: csel r6, r7, r6, gt
-; CHECK-NEXT: add.w r7, r7, #4
-; CHECK-NEXT: csel r12, r10, r5, gt
+; CHECK-NEXT: addgt.w r6, r8, #3
+; CHECK-NEXT: csel r7, r4, r7, gt
+; CHECK-NEXT: add.w r8, r8, #4
+; CHECK-NEXT: cmp r7, r11
+; CHECK-NEXT: csel r10, r8, r6, gt
+; CHECK-NEXT: csel r12, r11, r7, gt
; CHECK-NEXT: le lr, .LBB0_5
-; CHECK-NEXT: .LBB0_6: @ %while.end.loopexit.unr-lcssa
-; CHECK-NEXT: cmp.w r8, #0
-; CHECK-NEXT: beq .LBB0_10
-; CHECK-NEXT: @ %bb.7: @ %while.body.epil
+; CHECK-NEXT: @ %bb.6: @ %while.end.loopexit.unr-lcssa.loopexit
+; CHECK-NEXT: ldr r6, [sp] @ 4-byte Reload
+; CHECK-NEXT: sub.w r9, r9, r8
+; CHECK-NEXT: cbz r6, .LBB0_10
+; CHECK-NEXT: .LBB0_7: @ %while.body.epil
; CHECK-NEXT: ldr r7, [r0, #4]
; CHECK-NEXT: sub.w r1, r1, r9
; CHECK-NEXT: cmp r12, r7
-; CHECK-NEXT: csel r6, r1, r6, gt
+; CHECK-NEXT: csel r10, r1, r10, gt
; CHECK-NEXT: csel r12, r7, r12, gt
-; CHECK-NEXT: cmp.w r8, #1
+; CHECK-NEXT: cmp r6, #1
; CHECK-NEXT: beq .LBB0_10
; CHECK-NEXT: @ %bb.8: @ %while.body.epil.1
; CHECK-NEXT: ldr r7, [r0, #8]
; CHECK-NEXT: cmp r12, r7
-; CHECK-NEXT: csinc r6, r6, r1, le
+; CHECK-NEXT: csinc r10, r10, r1, le
; CHECK-NEXT: csel r12, r7, r12, gt
-; CHECK-NEXT: cmp.w r8, #2
+; CHECK-NEXT: cmp r6, #2
; CHECK-NEXT: beq .LBB0_10
; CHECK-NEXT: @ %bb.9: @ %while.body.epil.2
; CHECK-NEXT: ldr r0, [r0, #12]
; CHECK-NEXT: cmp r12, r0
; CHECK-NEXT: it gt
-; CHECK-NEXT: addgt r6, r1, #2
+; CHECK-NEXT: addgt.w r10, r1, #2
; CHECK-NEXT: csel r12, r0, r12, gt
; CHECK-NEXT: .LBB0_10: @ %while.end
; CHECK-NEXT: str.w r12, [r2]
-; CHECK-NEXT: str r6, [r3]
+; CHECK-NEXT: str.w r10, [r3]
+; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%0 = load i32, ptr %pSrc, align 4
>From a08955ccb9f4e9adea7794882e9222baccc67b39 Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn at arm.com>
Date: Fri, 3 Oct 2025 16:03:57 +0100
Subject: [PATCH 5/5] Alternate approach to determining if a fixup is always
executed
Instead of using isGuaranteedToExecuteForEveryIteration, check that the block
containing the instruction dominates the block containing the IV increment.
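
As a rough sketch of what this distinguishes (hand-written, with assumed block
and value names, not one of the tests added below): a fixup whose user is in
%for.body is treated as unconditional because %for.body dominates %for.inc,
while a fixup in %if.then is not:

define void @dominance_sketch(ptr %p, i64 %n) {
entry:
  br label %for.body

for.body:                            ; dominates for.inc -> unconditional fixup
  %idx = phi i64 [ 0, %entry ], [ %idx.next, %for.inc ]
  %addr = getelementptr inbounds i32, ptr %p, i64 %idx
  %v = load i32, ptr %addr, align 4
  %c = icmp sgt i32 %v, 0
  br i1 %c, label %if.then, label %for.inc

if.then:                             ; does not dominate for.inc -> conditional fixup
  store i32 0, ptr %addr, align 4
  br label %for.inc

for.inc:                             ; block containing the IV increment
  %idx.next = add nuw nsw i64 %idx, 1
  %done = icmp eq i64 %idx.next, %n
  br i1 %done, label %exit, label %for.body

exit:
  ret void
}

The dominance check also accepts cases like the new early_exit_load test below,
where the load sits in the same block as the increment and so runs on every
iteration that reaches the increment, even though an earlier block can exit the
loop.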
---
.../Transforms/Scalar/LoopStrengthReduce.cpp | 20 +--
.../LoopStrengthReduce/AArch64/prefer-all.ll | 138 ++++++++++++++++++
2 files changed, 149 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 23b2bc70f560e..f02ca1367002a 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1278,7 +1278,6 @@ struct LSRFixup {
LSRFixup() = default;
bool isUseFullyOutsideLoop(const Loop *L) const;
- bool isUseUnconditional(const Loop *L) const;
void print(raw_ostream &OS) const;
void dump() const;
@@ -1659,12 +1658,6 @@ bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
return !L->contains(UserInst);
}
-/// Test whether this fixup is for an instruction that's unconditional, i.e.
-/// it's executed in every loop iteration.
-bool LSRFixup::isUseUnconditional(const Loop *L) const {
- return isGuaranteedToExecuteForEveryIteration(UserInst, L);
-}
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LSRFixup::print(raw_ostream &OS) const {
OS << "UserInst=";
@@ -2234,6 +2227,7 @@ class LSRInstance {
void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
void CountRegisters(const Formula &F, size_t LUIdx);
bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
+ bool IsFixupExecutedEachIncrement(const LSRFixup &LF) const;
void CollectLoopInvariantFixupsAndFormulae();
@@ -3628,7 +3622,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
LF.PostIncLoops = TmpPostIncLoops;
LF.Offset = Offset;
LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
- LU.AllFixupsUnconditional &= LF.isUseUnconditional(L);
+ LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
// Create SCEV as Formula for calculating baseline cost
if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
@@ -3702,6 +3696,14 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
return true;
}
+/// Test whether this fixup will be executed each time the corresponding IV
+/// increment instruction is executed.
+bool LSRInstance::IsFixupExecutedEachIncrement(const LSRFixup &LF) const {
+ // If the fixup block dominates the IV increment block then there is no path
+ // through the loop to the increment that doesn't pass through the fixup.
+ return DT.dominates(LF.UserInst->getParent(), IVIncInsertPos->getParent());
+}
+
/// Check for other uses of loop-invariant values which we're tracking. These
/// other uses will pin these values in registers, making them less profitable
/// for elimination.
@@ -3825,7 +3827,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
LF.OperandValToReplace = U;
LF.Offset = Offset;
LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
- LU.AllFixupsUnconditional &= LF.isUseUnconditional(L);
+ LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
if (!LU.WidestFixupType ||
SE.getTypeSizeInBits(LU.WidestFixupType) <
SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll
index 065a6c8b980f8..1944a9c800355 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll
@@ -174,3 +174,141 @@ for.inc:
exit:
ret i32 %ret.next
}
+
+; We can use postindex addressing for both loads here, even though the second
+; may not be executed on every loop iteration.
+define i32 @early_exit_load(ptr %p, ptr %q, ptr %n) {
+; CHECK-LABEL: define i32 @early_exit_load(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], %[[FOR_INC:.*]] ], [ [[P]], %[[ENTRY]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_INC]] ], [ [[Q]], %[[ENTRY]] ]
+; CHECK-NEXT: [[RET_PHI:%.*]] = phi i32 [ [[ADD:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ [[IDX_NEXT:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[PVAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[PVAL]], 0
+; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
+; CHECK-NEXT: br i1 [[CMP1]], label %[[FOR_INC]], label %[[EXIT:.*]]
+; CHECK: [[FOR_INC]]:
+; CHECK-NEXT: [[QVAL:%.*]] = load i32, ptr [[LSR_IV]], align 4
+; CHECK-NEXT: [[ADD]] = add nsw i32 [[QVAL]], [[RET_PHI]]
+; CHECK-NEXT: [[IDX_NEXT]] = add nuw nsw i64 [[IDX]], 1
+; CHECK-NEXT: [[NVAL:%.*]] = load volatile i64, ptr [[N]], align 8
+; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 4
+; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i64 [[IDX_NEXT]], [[NVAL]]
+; CHECK-NEXT: br i1 [[CMP2]], label %[[FOR_BODY]], label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[RET_PHI]], %[[FOR_BODY]] ], [ [[ADD]], %[[FOR_INC]] ]
+; CHECK-NEXT: ret i32 [[RET]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %ret.phi = phi i32 [ %add, %for.inc ], [ 0, %entry ]
+ %idx = phi i64 [ %idx.next, %for.inc ], [ 0, %entry ]
+ %paddr = getelementptr inbounds nuw i32, ptr %p, i64 %idx
+ %pval = load i32, ptr %paddr, align 4
+ %cmp1 = icmp eq i32 %pval, 0
+ br i1 %cmp1, label %for.inc, label %exit
+
+for.inc:
+ %qaddr = getelementptr inbounds nuw i32, ptr %q, i64 %idx
+ %qval = load i32, ptr %qaddr, align 4
+ %add = add nsw i32 %qval, %ret.phi
+ %idx.next = add nuw nsw i64 %idx, 1
+ %nval = load volatile i64, ptr %n, align 8
+ %cmp2 = icmp slt i64 %idx.next, %nval
+ br i1 %cmp2, label %for.body, label %exit
+
+exit:
+ %ret = phi i32 [ %ret.phi, %for.body ], [ %add, %for.inc ]
+ ret i32 %ret
+}
+
+; The control-flow before and after the load of qval shouldn't prevent postindex
+; addressing from happening.
+; FIXME: We choose postindex addressing, but the scevgep is placed in for.inc so
+; during codegen we will fail to actually generate a postindex load.
+define void @middle_block_load(ptr %p, ptr %q, i64 %n) {
+; CHECK-LABEL: define void @middle_block_load(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[LSR_IV2:%.*]] = phi ptr [ [[SCEVGEP3:%.*]], %[[FOR_INC:.*]] ], [ [[P]], %[[ENTRY]] ]
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_INC]] ], [ [[Q]], %[[ENTRY]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], %[[FOR_INC]] ], [ [[N]], %[[ENTRY]] ]
+; CHECK-NEXT: [[PVAL:%.*]] = load i32, ptr [[LSR_IV2]], align 4
+; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[PVAL]], 0
+; CHECK-NEXT: [[SCEVGEP3]] = getelementptr i8, ptr [[LSR_IV2]], i64 4
+; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN1:.*]], label %[[IF_ELSE1:.*]]
+; CHECK: [[IF_THEN1]]:
+; CHECK-NEXT: tail call void @otherfn1()
+; CHECK-NEXT: br label %[[IF_END:.*]]
+; CHECK: [[IF_ELSE1]]:
+; CHECK-NEXT: tail call void @otherfn2()
+; CHECK-NEXT: br label %[[IF_END]]
+; CHECK: [[IF_END]]:
+; CHECK-NEXT: [[QVAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4
+; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[QVAL]], 0
+; CHECK-NEXT: br i1 [[CMP2]], label %[[IF_THEN2:.*]], label %[[IF_ELSE2:.*]]
+; CHECK: [[IF_THEN2]]:
+; CHECK-NEXT: tail call void @otherfn1()
+; CHECK-NEXT: br label %[[FOR_INC]]
+; CHECK: [[IF_ELSE2]]:
+; CHECK-NEXT: tail call void @otherfn2()
+; CHECK-NEXT: br label %[[FOR_INC]]
+; CHECK: [[FOR_INC]]:
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
+; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
+; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT: br i1 [[CMP3]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %idx = phi i64 [ %idx.next, %for.inc ], [ 0, %entry ]
+ %paddr = getelementptr inbounds nuw i32, ptr %p, i64 %idx
+ %pval = load i32, ptr %paddr, align 4
+ %cmp1 = icmp sgt i32 %pval, 0
+ br i1 %cmp1, label %if.then1, label %if.else1
+
+if.then1:
+ tail call void @otherfn1()
+ br label %if.end
+
+if.else1:
+ tail call void @otherfn2()
+ br label %if.end
+
+if.end:
+ %qaddr = getelementptr inbounds nuw i32, ptr %q, i64 %idx
+ %qval = load i32, ptr %qaddr, align 4
+ %cmp2 = icmp sgt i32 %qval, 0
+ br i1 %cmp2, label %if.then2, label %if.else2
+
+if.then2:
+ tail call void @otherfn1()
+ br label %for.inc
+
+if.else2:
+ tail call void @otherfn2()
+ br label %for.inc
+
+for.inc:
+ %idx.next = add nuw nsw i64 %idx, 1
+ %cmp3 = icmp eq i64 %idx.next, %n
+ br i1 %cmp3, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+declare dso_local void @otherfn1()
+declare dso_local void @otherfn2()