[llvm] [LSR] Add an addressing mode that considers all addressing modes (PR #158110)
John Brawn via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 11 09:29:39 PDT 2025
https://github.com/john-brawn-arm created https://github.com/llvm/llvm-project/pull/158110
The way that loop strength reduction works is that the target has to decide up front whether it wants its addressing to be preindexed, postindexed, or neither. This choice affects:
* Which potential solutions we generate
* Whether we consider a pre/post index load/store as costing an AddRec or not.
None of these choices is a good fit for either AArch64 or ARM, where both preindexed and postindexed addressing are typically free:
* If we pick None then we count pre/post-index addressing as costing one AddRec more than is correct, so we don't pick them when we should.
* If we pick PreIndexed or PostIndexed then we get the correct cost for that addressing type, but still get it wrong for the other, and we also exclude potential solutions using offset addressing that could have a lower cost.
This patch adds an "all" addressing mode that causes all potential solutions to be generated and counts both pre- and post-index as having an AddRecCost of zero. Unfortunately this reveals problems elsewhere in how we calculate costs, which need to be fixed before we can make use of it.
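For reference, the way a target would eventually opt in is through its existing hook for the preferred addressing mode. A minimal sketch of what that could look like (hypothetical, not part of this patch; no target is switched over here) is:

  // Hypothetical target override: ask LSR to enumerate every kind of
  // solution and treat both pre- and post-indexed addressing as free,
  // leaving the choice to the cost model.
  TTI::AddressingModeKind
  AArch64TTIImpl::getPreferredAddressingMode(const Loop *L,
                                             ScalarEvolution *SE) const {
    return TTI::AMK_All;
  }

In the meantime the new mode can be exercised with -lsr-preferred-addressing-mode=all, which is what the added test does.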
From 3432dd04d0a711c22b2c36a70f708248dc159c39 Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn at arm.com>
Date: Fri, 5 Sep 2025 17:26:55 +0100
Subject: [PATCH] [LSR] Add an addressing mode that considers all addressing
modes
The way that loop strength reduction works is that the target has to
upfront decide whether it wants its addressing to be preindex,
postindex, or neither. This choice affects:
* Which potential solutions we generate
* Whether we consider a pre/post index load/store as costing an
AddRec or not.
None of these choices are a good fit for either AArch64 or ARM, where
both preindex and postindex addressing are typically free:
* If we pick None then we count pre/post index addressing as costing
one addrec more than is correct so we don't pick them when we
should.
* If we pick PreIndexed or PostIndexed then we get the correct cost
for that addressing type, but still get it wrong for the other and
also exclude potential solutions using offset addressing that could
have less cost.
This patch adds an "all" addressing mode that causes all potential
solutions to be generated and counts both pre and postindex as having
AddRecCost of zero. Unfortunately this reveals problems elsewhere in
how we calculate the cost of things that need to be fixed before we
can make use of it.
---
.../llvm/Analysis/TargetTransformInfo.h | 3 +-
.../Transforms/Scalar/LoopStrengthReduce.cpp | 36 ++--
.../LoopStrengthReduce/AArch64/prefer-all.ll | 178 ++++++++++++++++++
3 files changed, 199 insertions(+), 18 deletions(-)
create mode 100644 llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index a5e98bb7bc137..0ff1d2e634319 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -799,7 +799,8 @@ class TargetTransformInfo {
enum AddressingModeKind {
AMK_PreIndexed,
AMK_PostIndexed,
- AMK_None
+ AMK_None,
+ AMK_All
};
/// Return the preferred addressing mode LSR should make efforts to generate.
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index e3ef9d8680b53..4a09dc87ef682 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -167,17 +167,15 @@ static cl::opt<bool> FilterSameScaledReg(
" with the same ScaledReg and Scale"));
static cl::opt<TTI::AddressingModeKind> PreferredAddresingMode(
- "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
- cl::desc("A flag that overrides the target's preferred addressing mode."),
- cl::values(clEnumValN(TTI::AMK_None,
- "none",
- "Don't prefer any addressing mode"),
- clEnumValN(TTI::AMK_PreIndexed,
- "preindexed",
- "Prefer pre-indexed addressing mode"),
- clEnumValN(TTI::AMK_PostIndexed,
- "postindexed",
- "Prefer post-indexed addressing mode")));
+ "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
+ cl::desc("A flag that overrides the target's preferred addressing mode."),
+ cl::values(
+ clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"),
+ clEnumValN(TTI::AMK_PreIndexed, "preindexed",
+ "Prefer pre-indexed addressing mode"),
+ clEnumValN(TTI::AMK_PostIndexed, "postindexed",
+ "Prefer post-indexed addressing mode"),
+ clEnumValN(TTI::AMK_All, "all", "Consider all addressing modes")));
static cl::opt<unsigned> ComplexityLimit(
"lsr-complexity-limit", cl::Hidden,
@@ -1404,7 +1402,8 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
// for now LSR only handles innermost loops).
if (AR->getLoop() != L) {
// If the AddRec exists, consider it's register free and leave it alone.
- if (isExistingPhi(AR, *SE) && AMK != TTI::AMK_PostIndexed)
+ if (isExistingPhi(AR, *SE) && AMK != TTI::AMK_PostIndexed &&
+ AMK != TTI::AMK_All)
return;
// It is bad to allow LSR for current loop to add induction variables
@@ -1427,10 +1426,11 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step))))
// If the step size matches the base offset, we could use pre-indexed
// addressing.
- if ((AMK == TTI::AMK_PreIndexed && F.BaseOffset.isFixed() &&
+ if (((AMK == TTI::AMK_PreIndexed || AMK == TTI::AMK_All) &&
+ F.BaseOffset.isFixed() &&
Step->getAPInt() == F.BaseOffset.getFixedValue()) ||
- (AMK == TTI::AMK_PostIndexed && !isa<SCEVConstant>(Start) &&
- SE->isLoopInvariant(Start, L)))
+ ((AMK == TTI::AMK_PostIndexed || AMK == TTI::AMK_All) &&
+ !isa<SCEVConstant>(Start) && SE->isLoopInvariant(Start, L)))
LoopCost = 0;
}
// If the loop counts down to zero and we'll be using a hardware loop then
@@ -4147,7 +4147,8 @@ void LSRInstance::GenerateConstantOffsetsImpl(
// means that a single pre-indexed access can be generated to become the new
// base pointer for each iteration of the loop, resulting in no extra add/sub
// instructions for pointer updating.
- if (AMK == TTI::AMK_PreIndexed && LU.Kind == LSRUse::Address) {
+ if ((AMK == TTI::AMK_PreIndexed || AMK == TTI::AMK_All) &&
+ LU.Kind == LSRUse::Address) {
const APInt *StepInt;
if (match(G, m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt)))) {
int64_t Step = StepInt->isNegative() ? StepInt->getSExtValue()
@@ -5437,7 +5438,8 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
// This can sometimes (notably when trying to favour postinc) lead to
// sub-optimial decisions. There it is best left to the cost modelling to
// get correct.
- if (AMK != TTI::AMK_PostIndexed || LU.Kind != LSRUse::Address) {
+ if ((AMK != TTI::AMK_PostIndexed && AMK != TTI::AMK_All) ||
+ LU.Kind != LSRUse::Address) {
int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
for (const SCEV *Reg : ReqRegs) {
if ((F.ScaledReg && F.ScaledReg == Reg) ||
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll
new file mode 100644
index 0000000000000..db30fd23b0c9d
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/prefer-all.ll
@@ -0,0 +1,178 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=aarch64-none-elf -loop-reduce -lsr-preferred-addressing-mode=all < %s | FileCheck %s
+
+define i32 @postindex_loop(ptr %p, i64 %n) {
+; CHECK-LABEL: define i32 @postindex_loop(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_BODY]] ], [ [[P]], %[[ENTRY]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[N]], %[[ENTRY]] ]
+; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[ADD:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4
+; CHECK-NEXT: [[ADD]] = add i32 [[RET]], [[VAL]]
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
+; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %idx = phi i64 [ %idx.next, %for.body ], [ 0, %entry ]
+ %ret = phi i32 [ %add, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx
+ %val = load i32, ptr %arrayidx, align 4
+ %add = add i32 %ret, %val
+ %idx.next = add nuw nsw i64 %idx, 1
+ %exitcond = icmp eq i64 %idx.next, %n
+ br i1 %exitcond, label %exit, label %for.body
+
+exit:
+ ret i32 %add
+}
+
+; Preindex saves a setup instruction compared to postindex
+; FIXME: We currently don't recognize that preindex is possible here
+define i32 @preindex_loop(ptr %p, i64 %n) {
+; CHECK-LABEL: define i32 @preindex_loop(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr nuw i8, ptr [[P]], i64 4
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], %[[FOR_BODY]] ], [ [[SCEVGEP]], %[[ENTRY]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[N]], %[[ENTRY]] ]
+; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[ADD:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4
+; CHECK-NEXT: [[ADD]] = add i32 [[RET]], [[VAL]]
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
+; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %idx = phi i64 [ %idx.next, %for.body ], [ 0, %entry ]
+ %ret = phi i32 [ %add, %for.body ], [ 0, %entry ]
+ %idx.next = add nuw nsw i64 %idx, 1
+ %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx.next
+ %val = load i32, ptr %arrayidx, align 4
+ %add = add i32 %ret, %val
+ %exitcond = icmp eq i64 %idx.next, %n
+ br i1 %exitcond, label %exit, label %for.body
+
+exit:
+ ret i32 %add
+}
+
+; We should use offset addressing here as postindex uses an extra register.
+; FIXME: We currently use postindex as we don't realize the load of val2 is also
+; a use of p that needs it to be live in the loop.
+define i64 @offset_loop(ptr %p, i64 %n) {
+; CHECK-LABEL: define i64 @offset_loop(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_BODY]] ], [ [[P]], %[[ENTRY]] ]
+; CHECK-NEXT: [[RET:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IDX_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[VAL1:%.*]] = load i64, ptr [[LSR_IV]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i64, ptr [[P]], i64 [[VAL1]]
+; CHECK-NEXT: [[VAL2:%.*]] = load i64, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ADD]] = add i64 [[VAL2]], [[RET]]
+; CHECK-NEXT: [[IDX_NEXT]] = add nuw nsw i64 [[IDX]], 1
+; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 8
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IDX_NEXT]], [[VAL1]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_END:.*]], label %[[FOR_BODY]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: ret i64 [[ADD]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %ret = phi i64 [ 0, %entry ], [ %add, %for.body ]
+ %idx = phi i64 [ 0, %entry ], [ %idx.next, %for.body ]
+ %arrayidx1 = getelementptr inbounds nuw i64, ptr %p, i64 %idx
+ %val1 = load i64, ptr %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds nuw i64, ptr %p, i64 %val1
+ %val2 = load i64, ptr %arrayidx2, align 4
+ %add = add i64 %val2, %ret
+ %idx.next = add nuw nsw i64 %idx, 1
+ %cmp = icmp eq i64 %idx.next, %val1
+ br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+ ret i64 %add
+}
+
+; We can't use postindex addressing on the conditional load of qval and can't
+; convert the loop condition to a compare with zero, so we should instead use
+; offset addressing.
+; FIXME: Currently we don't notice the load of qval is conditional, and attempt
+; postindex addressing anyway.
+define i32 @conditional_load(ptr %p, ptr %q, ptr %n) {
+; CHECK-LABEL: define i32 @conditional_load(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], %[[FOR_INC:.*]] ], [ [[P]], %[[ENTRY]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_INC]] ], [ [[Q]], %[[ENTRY]] ]
+; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ [[IDX_NEXT:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[RET_NEXT:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[PVAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[PVAL]], 0
+; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label %[[FOR_INC]], label %[[IF_THEN:.*]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: [[QVAL:%.*]] = load i32, ptr [[LSR_IV]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[RET]], [[QVAL]]
+; CHECK-NEXT: br label %[[FOR_INC]]
+; CHECK: [[FOR_INC]]:
+; CHECK-NEXT: [[RET_NEXT]] = phi i32 [ [[ADD]], %[[IF_THEN]] ], [ [[RET]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[IDX_NEXT]] = add nuw nsw i64 [[IDX]], 1
+; CHECK-NEXT: [[NVAL:%.*]] = load volatile i64, ptr [[N]], align 8
+; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[IDX_NEXT]], [[NVAL]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i32 [[RET_NEXT]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %idx = phi i64 [ %idx.next, %for.inc ], [ 0, %entry ]
+ %ret = phi i32 [ %ret.next, %for.inc ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx
+ %pval = load i32, ptr %arrayidx, align 4
+ %tobool.not = icmp eq i32 %pval, 0
+ br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:
+ %arrayidx1 = getelementptr inbounds nuw i32, ptr %q, i64 %idx
+ %qval = load i32, ptr %arrayidx1, align 4
+ %add = add i32 %ret, %qval
+ br label %for.inc
+
+for.inc:
+ %ret.next = phi i32 [ %add, %if.then ], [ %ret, %for.body ]
+ %idx.next = add nuw nsw i64 %idx, 1
+ %nval = load volatile i64, ptr %n, align 8
+ %cmp = icmp slt i64 %idx.next, %nval
+ br i1 %cmp, label %for.body, label %exit
+
+exit:
+ ret i32 %ret.next
+}