[llvm] Initial commit (PR #184546)
Justin Fargnoli via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 3 21:08:22 PST 2026
https://github.com/justinfargnoli updated https://github.com/llvm/llvm-project/pull/184546
From f8280acade69922938a4af2537f30a3a37e49657 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 4 Mar 2026 05:08:10 +0000
Subject: [PATCH] Initial commit
---
.../llvm/Analysis/TargetTransformInfo.h | 4 +
.../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 1 +
llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 34 +++++++-
.../NVPTX/loop-dependent-local-array.ll | 84 +++++++++++++++++++
4 files changed, 122 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/Transforms/LoopUnroll/NVPTX/loop-dependent-local-array.ll
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 18ae6a005d972..37aa78fe120b6 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -732,6 +732,10 @@ class TargetTransformInfo {
bool RuntimeUnrollMultiExit;
/// Allow unrolling to add parallel reduction phis.
bool AddAdditionalAccumulators;
+ /// Multiplier applied to the full-unroll Threshold when the loop contains
+ /// loop-dependent accesses to alloca-backed arrays whose elimination
+ /// would benefit from full unrolling. Defaults to 1 (no boost).
+ unsigned LoopDependentMemoryAccessThresholdMultiplier = 1;
};
/// Get target-customized preferences for the generic loop unrolling
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index c1fe9300785a3..a4239982b3172 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -537,6 +537,7 @@ void NVPTXTTIImpl::getUnrollingPreferences(
// beneficial.
UP.Partial = UP.Runtime = true;
UP.PartialThreshold = UP.Threshold / 4;
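+ // Local (alloca-backed) memory is comparatively expensive on NVPTX, so
+ // spend extra full-unroll budget when unrolling can expose constant
+ // indices into a local array and let SROA delete it.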
+ UP.LoopDependentMemoryAccessThresholdMultiplier = 8;
}
void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 3e2ed34b3c67d..6dbba163bd1ad 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -32,6 +32,7 @@
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
@@ -843,6 +844,34 @@ shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo,
return std::nullopt;
}
+/// Return true if \p L contains a load or store to an alloca whose address
+/// varies with the loop. Fully unrolling such a loop can let SROA eliminate
+/// the alloca entirely, since every access then has a constant index.
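+/// For example, with an alloca-backed array indexed by the IV:
+///   %p = getelementptr inbounds [4 x i32], ptr %arr, i32 0, i32 %iv
+///   store i32 %iv, ptr %p
+/// the store's address is not loop-invariant, so this returns true.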
+static bool hasLoopDependentArrayAccess(const Loop *L, ScalarEvolution &SE) {
+ for (BasicBlock *BB : L->blocks()) {
+ for (Instruction &I : *BB) {
+ Value *Ptr = nullptr;
+ if (auto *LI = dyn_cast<LoadInst>(&I))
+ Ptr = LI->getPointerOperand();
+ else if (auto *SI = dyn_cast<StoreInst>(&I))
+ Ptr = SI->getPointerOperand();
+ else
+ continue;
+
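+ // Look through GEPs, casts, and PHIs to the object(s) the pointer is
+ // based on; only accesses rooted at an alloca matter here, since those
+ // are the ones SROA can promote once the loop is unrolled.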
+ SmallVector<const Value *, 4> Objects;
+ getUnderlyingObjects(Ptr, Objects, /*LI=*/nullptr, /*MaxLookup=*/10);
+ for (const Value *Obj : Objects) {
+ if (!isa<AllocaInst>(Obj))
+ continue;
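+ // Only count the access if its address actually varies within this
+ // loop; a loop-invariant pointer into an alloca gains nothing from
+ // unrolling.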
+ const SCEV *PtrSCEV = SE.getSCEV(Ptr);
+ if (SE.getLoopDisposition(PtrSCEV, L) != ScalarEvolution::LoopInvariant)
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
static std::optional<unsigned> shouldFullUnroll(
Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT,
ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
@@ -855,7 +884,10 @@ static std::optional<unsigned> shouldFullUnroll(
// When computing the unrolled size, note that BEInsns are not replicated
// like the rest of the loop body.
- if (UCE.getUnrolledLoopSize(UP) < UP.Threshold)
+ unsigned Threshold = UP.Threshold;
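+ // Give the threshold a target-specified boost when full unrolling is
+ // expected to turn loop-varying alloca accesses into constant-index
+ // ones that SROA can then eliminate.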
+ if (hasLoopDependentArrayAccess(L, SE))
+ Threshold *= UP.LoopDependentMemoryAccessThresholdMultiplier;
+ if (UCE.getUnrolledLoopSize(UP) < Threshold)
return FullUnrollTripCount;
// The loop isn't that small, but we still can fully unroll it if that
diff --git a/llvm/test/Transforms/LoopUnroll/NVPTX/loop-dependent-local-array.ll b/llvm/test/Transforms/LoopUnroll/NVPTX/loop-dependent-local-array.ll
new file mode 100644
index 0000000000000..4b04eae8f9a5c
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/NVPTX/loop-dependent-local-array.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -mtriple=nvptx64 -passes=loop-unroll -unroll-threshold=10 \
+; RUN: -unroll-partial-threshold=0 -unroll-max-iteration-count-to-analyze=0 \
+; RUN: -S | FileCheck %s
+
+; A loop with an IV-dependent store to an alloca should be fully unrolled
+; because NVPTX sets LoopDependentMemoryAccessThresholdMultiplier = 8,
+; boosting the effective threshold from 10 to 80.
+; Loop size is 5, trip count is 4, so unrolled size = 4*5 - 3*2 = 14 > 10 but < 80.
+;
+define void @iv_dependent_alloca(ptr %out) {
+; CHECK-LABEL: define void @iv_dependent_alloca(
+; CHECK-SAME: ptr [[OUT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ARR:%.*]] = alloca [4 x i32], align 4
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: store i32 0, ptr [[ARR]], align 4
+; CHECK-NEXT: [[PTR_1:%.*]] = getelementptr inbounds [4 x i32], ptr [[ARR]], i32 0, i32 1
+; CHECK-NEXT: store i32 1, ptr [[PTR_1]], align 4
+; CHECK-NEXT: [[PTR_2:%.*]] = getelementptr inbounds [4 x i32], ptr [[ARR]], i32 0, i32 2
+; CHECK-NEXT: store i32 2, ptr [[PTR_2]], align 4
+; CHECK-NEXT: [[PTR_3:%.*]] = getelementptr inbounds [4 x i32], ptr [[ARR]], i32 0, i32 3
+; CHECK-NEXT: store i32 3, ptr [[PTR_3]], align 4
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ARR]], align 4
+; CHECK-NEXT: store i32 [[VAL]], ptr [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %arr = alloca [4 x i32], align 4
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+ %ptr = getelementptr inbounds [4 x i32], ptr %arr, i32 0, i32 %iv
+ store i32 %iv, ptr %ptr, align 4
+ %iv.next = add nuw nsw i32 %iv, 1
+ %exitcond = icmp eq i32 %iv.next, 4
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ %val = load i32, ptr %arr, align 4
+ store i32 %val, ptr %out, align 4
+ ret void
+}
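+
+; Note: after full unrolling every access into %arr has a constant index, so
+; a later SROA run should be able to forward the stored values and delete the
+; alloca, reducing this function to a single `store i32 0, ptr %out`.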
+
+; A loop without any alloca access should NOT get the threshold boost.
+; Loop size is 8, trip count is 4, unrolled size = 4*8 - 3*2 = 26 > threshold 10.
+;
+define void @no_alloca(ptr %src, ptr %dst) {
+; CHECK-LABEL: define void @no_alloca(
+; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IDX:%.*]] = sext i32 [[IV]] to i64
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IDX]]
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[GEP]], align 4
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IDX]]
+; CHECK-NEXT: store i32 [[VAL]], ptr [[GEP2]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 4
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+ %idx = sext i32 %iv to i64
+ %gep = getelementptr inbounds i32, ptr %src, i64 %idx
+ %val = load i32, ptr %gep, align 4
+ %gep2 = getelementptr inbounds i32, ptr %dst, i64 %idx
+ store i32 %val, ptr %gep2, align 4
+ %iv.next = add nuw nsw i32 %iv, 1
+ %exitcond = icmp eq i32 %iv.next, 4
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}