[llvm] [AArch64] Allow forcing unrolling of small loops (PR #167488)
Vladi Krapp via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 11 03:04:44 PST 2025
https://github.com/VladiKrapp-Arm created https://github.com/llvm/llvm-project/pull/167488
- Introduce the -aarch64-force-unroll-threshold option; when a loop's computed cost is below this value, UP.Force is set to true (default 0 keeps current behaviour); see the example invocation below
- Add an AArch64 loop-unroll regression test that runs once at the default threshold and once with the flag raised, confirming that the loop is unrolled only when the threshold forces it
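For reference, the RUN lines in the added test exercise the flag directly through opt; a minimal invocation looks like this (and, although not exercised by the test, the option should also be reachable from the clang driver via -mllvm, since it is a plain cl::opt):

  opt -passes=loop-unroll -unroll-runtime \
      -aarch64-force-unroll-threshold=500 input.ll -S

  # presumed equivalent from the clang driver:
  clang -O2 -mllvm -aarch64-force-unroll-threshold=500 file.c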
From 27299970ec72852896ccd57a17fd796a522f695f Mon Sep 17 00:00:00 2001
From: Vladi Krapp <vladi.krapp at arm.com>
Date: Mon, 10 Nov 2025 13:23:49 +0000
Subject: [PATCH] [AArch64] Allow forcing unrolling of small loops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Introduce the -aarch64-force-unroll-threshold option; when a loop's
cost is below this value, UP.Force is set to true (default 0 keeps
current behaviour)
- Add an AArch64 loop-unroll regression test that runs once at the
default threshold and once with the flag raised, confirming that the
loop is unrolled only when the threshold forces it
---
.../AArch64/AArch64TargetTransformInfo.cpp | 14 +++
.../AArch64/force-unroll-threshold.ll | 90 +++++++++++++++++++
2 files changed, 104 insertions(+)
create mode 100644 llvm/test/Transforms/LoopUnroll/AArch64/force-unroll-threshold.ll
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 197aae6e03cb1..79ad532f73efc 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -77,6 +77,10 @@ static cl::opt<unsigned> DMBLookaheadThreshold(
"dmb-lookahead-threshold", cl::init(10), cl::Hidden,
cl::desc("The number of instructions to search for a redundant dmb"));
+static cl::opt<int> Aarch64ForceUnrollThreshold(
+ "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
+ cl::desc("Threshold for forced unrolling of small loops in AArch64"));
+
namespace {
class TailFoldingOption {
// These bitfields will only ever be set to something non-zero in operator=,
@@ -5250,6 +5254,7 @@ void AArch64TTIImpl::getUnrollingPreferences(
// inlining. Don't unroll auto-vectorized loops either, though do allow
// unrolling of the scalar remainder.
bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
+ InstructionCost Cost = 0;
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
// Both auto-vectorized loops and the scalar remainder have the
@@ -5264,6 +5269,10 @@ void AArch64TTIImpl::getUnrollingPreferences(
continue;
return;
}
+
+ SmallVector<const Value *, 4> Operands(I.operand_values());
+ Cost += getInstructionCost(&I, Operands,
+ TargetTransformInfo::TCK_SizeAndLatency);
}
}
@@ -5310,6 +5319,11 @@ void AArch64TTIImpl::getUnrollingPreferences(
UP.UnrollAndJam = true;
UP.UnrollAndJamInnerLoopThreshold = 60;
}
+
+ // Forcing unrolling of small loops can be very useful because of the
+ // branch-taken cost of the backedge.
+ if (Cost < Aarch64ForceUnrollThreshold)
+ UP.Force = true;
}
void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/force-unroll-threshold.ll b/llvm/test/Transforms/LoopUnroll/AArch64/force-unroll-threshold.ll
new file mode 100644
index 0000000000000..986df8bed8462
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/force-unroll-threshold.ll
@@ -0,0 +1,90 @@
+; RUN: opt -passes=loop-unroll -S -unroll-runtime %s | FileCheck %s --check-prefix=NOFORCE
+; RUN: opt -passes=loop-unroll -S -unroll-runtime -aarch64-force-unroll-threshold=500 %s | FileCheck %s --check-prefix=FORCE
+
+; The loop has a small runtime upper bound (at most four iterations) but a
+; relatively expensive body. With runtime unrolling enabled, the cost model
+; still leaves the loop rolled. Raising the AArch64 force threshold overrides
+; that decision, and the loop is unrolled.
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @force_small_loop(ptr nocapture %a, ptr nocapture %b, i32 %n) {
+entry:
+ br label %loop
+
+; NOFORCE-LABEL: @force_small_loop(
+; NOFORCE: loop:
+; NOFORCE: br i1 %cond, label %body, label %exit
+; NOFORCE: body:
+; NOFORCE: store i32 %mix15, ptr %ptrb, align 4
+; NOFORCE: latch:
+; NOFORCE: br i1 %cmp2, label %loop, label %exit
+; NOFORCE: ret void
+; NOFORCE-NOT: loop.1:
+;
+; FORCE-LABEL: @force_small_loop(
+; FORCE: loop:
+; FORCE: br i1 %cond, label %body, label %exit
+; FORCE: loop.1:
+; FORCE: br i1 true, label %body.1, label %exit
+; FORCE: body.1:
+; FORCE: store i32 %mix15.1, ptr %ptrb.1, align 4
+; FORCE: latch.1:
+; FORCE: br i1 %cmp2.1, label %loop, label %exit
+; FORCE: ret void
+
+loop:
+ %i = phi i32 [ 0, %entry ], [ %inc, %latch ]
+ %ptra = getelementptr inbounds i32, ptr %a, i32 %i
+ %pa = load i32, ptr %ptra, align 4
+ %tmp0 = mul nsw i32 %pa, %pa
+ %tmp1 = add nsw i32 %tmp0, %pa
+ %tmp2 = shl i32 %tmp1, 1
+ %tmp3 = ashr i32 %tmp2, 1
+ %tmp4 = xor i32 %tmp3, %pa
+ %tmp5 = add nsw i32 %tmp4, 7
+ %tmp6 = mul nsw i32 %tmp5, 5
+ %tmp7 = add nsw i32 %tmp6, %tmp4
+ %tmp8 = mul nsw i32 %tmp7, %tmp3
+ %tmp9 = add nsw i32 %tmp8, %tmp7
+ %tmp10 = xor i32 %tmp9, %tmp6
+ %tmp11 = add nsw i32 %tmp10, %tmp8
+ %tmp12 = mul nsw i32 %tmp11, 9
+ %tmp13 = add nsw i32 %tmp12, %tmp10
+ %tmp14 = xor i32 %tmp13, %tmp11
+ %cond = icmp ult i32 %i, %n
+ br i1 %cond, label %body, label %exit
+
+body:
+ %ptrb = getelementptr inbounds i32, ptr %b, i32 %i
+ %pb = load i32, ptr %ptrb, align 4
+ %sum = add nsw i32 %pb, %tmp14
+ %diff = sub nsw i32 %sum, %pa
+ %mix1 = mul nsw i32 %diff, 3
+ %mix2 = add nsw i32 %mix1, %tmp3
+ %mix3 = xor i32 %mix2, %diff
+ %mix4 = add nsw i32 %mix3, %tmp0
+ %mix5 = mul nsw i32 %mix4, 11
+ %mix6 = add nsw i32 %mix5, %mix2
+ %mix7 = xor i32 %mix6, %mix5
+ %mix8 = add nsw i32 %mix7, %mix3
+ %mix9 = mul nsw i32 %mix8, 13
+ %mix10 = add nsw i32 %mix9, %mix8
+ %mix11 = xor i32 %mix10, %mix7
+ %mix12 = add nsw i32 %mix11, %mix6
+ %mix13 = mul nsw i32 %mix12, 17
+ %mix14 = add nsw i32 %mix13, %mix9
+ %mix15 = xor i32 %mix14, %mix10
+ store i32 %mix15, ptr %ptrb, align 4
+ br label %latch
+
+latch:
+ %inc = add nuw nsw i32 %i, 1
+ %cmp.limit = icmp ult i32 %n, 4
+ %upper = select i1 %cmp.limit, i32 %n, i32 4
+ %cmp2 = icmp ult i32 %inc, %upper
+ br i1 %cmp2, label %loop, label %exit
+
+exit:
+ ret void
+}
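A back-of-the-envelope illustration of the branch-taken-cost rationale behind the new UP.Force comment (the numbers are purely illustrative, not taken from any real AArch64 cost model): a loop that runs 4 iterations with a body costing 3 units and a taken backedge branch costing 1 unit spends roughly 4 * (3 + 1) = 16 units rolled, but only about 4 * 3 = 12 units fully unrolled, a 25% saving. The smaller the body, the larger the fraction of each iteration the backedge branch represents, which is why the threshold specifically targets small loops.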