[llvm] [NVPTX] Add NVPTXIncreaseAligmentPass to improve vectorization (PR #144958)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 19 14:25:00 PDT 2025
https://github.com/AlexMaclean created https://github.com/llvm/llvm-project/pull/144958
This change adds simple pass that looks at local memory arrays that are statically sized and sets an appropriate alignment for them. This enables vectorization of loads/stores to these arrays if not explicitly specified by the client.
>From 9e8242f7566040bb2e45da3a163bf5ba719e8401 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Thu, 19 Jun 2025 15:29:12 +0000
Subject: [PATCH] [NVPTX] Add NVPTXIncreaseAligmentPass to improve
vectorization
---
llvm/lib/Target/NVPTX/CMakeLists.txt | 1 +
llvm/lib/Target/NVPTX/NVPTX.h | 7 +
.../Target/NVPTX/NVPTXIncreaseAlignment.cpp | 128 ++++++++++++++++++
llvm/lib/Target/NVPTX/NVPTXPassRegistry.def | 1 +
llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 2 +
.../CodeGen/NVPTX/increase-local-align.ll | 85 ++++++++++++
6 files changed, 224 insertions(+)
create mode 100644 llvm/lib/Target/NVPTX/NVPTXIncreaseAlignment.cpp
create mode 100644 llvm/test/CodeGen/NVPTX/increase-local-align.ll
diff --git a/llvm/lib/Target/NVPTX/CMakeLists.txt b/llvm/lib/Target/NVPTX/CMakeLists.txt
index 693f0d0b35edc..9d91100d35b3a 100644
--- a/llvm/lib/Target/NVPTX/CMakeLists.txt
+++ b/llvm/lib/Target/NVPTX/CMakeLists.txt
@@ -26,6 +26,7 @@ set(NVPTXCodeGen_sources
NVPTXISelLowering.cpp
NVPTXLowerAggrCopies.cpp
NVPTXLowerAlloca.cpp
+ NVPTXIncreaseAlignment.cpp
NVPTXLowerArgs.cpp
NVPTXLowerUnreachable.cpp
NVPTXMCExpr.cpp
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index b7c5a0a5c9983..a0d5cc5fd8e87 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -55,6 +55,7 @@ FunctionPass *createNVPTXTagInvariantLoadsPass();
MachineFunctionPass *createNVPTXPeephole();
MachineFunctionPass *createNVPTXProxyRegErasurePass();
MachineFunctionPass *createNVPTXForwardParamsPass();
+FunctionPass *createNVPTXIncreaseLocalAlignmentPass();
void initializeNVVMReflectLegacyPassPass(PassRegistry &);
void initializeGenericToNVVMLegacyPassPass(PassRegistry &);
@@ -76,6 +77,7 @@ void initializeNVPTXAAWrapperPassPass(PassRegistry &);
void initializeNVPTXExternalAAWrapperPass(PassRegistry &);
void initializeNVPTXPeepholePass(PassRegistry &);
void initializeNVPTXTagInvariantLoadLegacyPassPass(PassRegistry &);
+void initializeNVPTXIncreaseLocalAlignmentLegacyPassPass(PassRegistry &);
struct NVVMIntrRangePass : PassInfoMixin<NVVMIntrRangePass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
@@ -111,6 +113,11 @@ struct NVPTXTagInvariantLoadsPass : PassInfoMixin<NVPTXTagInvariantLoadsPass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
+struct NVPTXIncreaseLocalAlignmentPass
+ : PassInfoMixin<NVPTXIncreaseLocalAlignmentPass> {
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
namespace NVPTX {
enum DrvInterface {
NVCL,
diff --git a/llvm/lib/Target/NVPTX/NVPTXIncreaseAlignment.cpp b/llvm/lib/Target/NVPTX/NVPTXIncreaseAlignment.cpp
new file mode 100644
index 0000000000000..cf6d52e0d9fd9
--- /dev/null
+++ b/llvm/lib/Target/NVPTX/NVPTXIncreaseAlignment.cpp
@@ -0,0 +1,128 @@
+//===-- NVPTXIncreaseAlignment.cpp - Increase alignment for local arrays --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A simple pass that looks at local memory arrays that are statically
+// sized and sets an appropriate alignment for them. This enables vectorization
+// of loads/stores to these arrays if not explicitly specified by the client.
+//
+// TODO: Ideally we should do a bin-packing of local arrays to maximize
+// alignments while minimizing holes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+ MaxLocalArrayAlignment("nvptx-use-max-local-array-alignment",
+ cl::init(false), cl::Hidden,
+ cl::desc("Use maximum alignment for local memory"));
+
+static constexpr Align MaxPTXArrayAlignment = Align::Constant<16>();
+
+/// Get the maximum useful alignment for an array. This is more likely to
+/// produce holes in the local memory.
+///
+/// Choose an alignment large enough that the entire array could be loaded with
+/// a single vector load (if possible). Cap the alignment at MaxPTXArrayAlignment.
+static Align getAggressiveArrayAlignment(const unsigned ArraySize) {
+ return std::min(MaxPTXArrayAlignment, Align(PowerOf2Ceil(ArraySize)));
+}
+
+/// Get the alignment of arrays that reduces the chances of leaving holes when
+/// arrays are allocated within a contiguous memory buffer (like shared memory
+/// and stack). Holes are still possible before and after the array allocation.
+///
+/// Choose the largest alignment such that the array size is a multiple of the
+/// alignment. If all elements of the buffer are allocated in order of
+/// alignment (higher to lower) no holes will be left.
+static Align getConservativeArrayAlignment(const unsigned ArraySize) {
+ return commonAlignment(MaxPTXArrayAlignment, ArraySize);
+}
+
+/// Find a better alignment for local arrays
+static bool updateAllocaAlignment(const DataLayout &DL,
+ AllocaInst *Alloca) {
+ // Looking for statically sized local arrays
+ if (!Alloca->isStaticAlloca())
+ return false;
+
+ // For now, we only support array allocas
+ if (!(Alloca->isArrayAllocation() || Alloca->getAllocatedType()->isArrayTy()))
+ return false;
+
+ const auto ArraySize = Alloca->getAllocationSize(DL);
+ if (!(ArraySize && ArraySize->isFixed()))
+ return false;
+
+ const auto ArraySizeValue = ArraySize->getFixedValue();
+ const Align PreferredAlignment =
+ MaxLocalArrayAlignment ? getAggressiveArrayAlignment(ArraySizeValue)
+ : getConservativeArrayAlignment(ArraySizeValue);
+
+ if (PreferredAlignment > Alloca->getAlign()) {
+ Alloca->setAlignment(PreferredAlignment);
+ return true;
+ }
+
+ return false;
+}
+
+static bool runSetLocalArrayAlignment(Function &F) {
+ bool Changed = false;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+
+ BasicBlock &EntryBB = F.getEntryBlock();
+ for (Instruction &I : EntryBB)
+ if (AllocaInst *Alloca = dyn_cast<AllocaInst>(&I))
+ Changed |= updateAllocaAlignment(DL, Alloca);
+
+ return Changed;
+}
+
+
+namespace {
+struct NVPTXIncreaseLocalAlignmentLegacyPass : public FunctionPass {
+ static char ID;
+ NVPTXIncreaseLocalAlignmentLegacyPass() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+};
+} // namespace
+
+char NVPTXIncreaseLocalAlignmentLegacyPass::ID = 0;
+INITIALIZE_PASS(NVPTXIncreaseLocalAlignmentLegacyPass, "nvptx-increase-local-alignment",
+ "Increase alignment for statically sized alloca arrays", false,
+ false)
+
+FunctionPass *llvm::createNVPTXIncreaseLocalAlignmentPass() {
+ return new NVPTXIncreaseLocalAlignmentLegacyPass();
+}
+
+bool NVPTXIncreaseLocalAlignmentLegacyPass::runOnFunction(Function &F) {
+ return runSetLocalArrayAlignment(F);
+}
+
+PreservedAnalyses
+NVPTXIncreaseLocalAlignmentPass::run(Function &F, FunctionAnalysisManager &AM) {
+ bool Changed = runSetLocalArrayAlignment(F);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
index ee37c9826012c..827cb7bba7018 100644
--- a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
+++ b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
@@ -40,4 +40,5 @@ FUNCTION_PASS("nvvm-intr-range", NVVMIntrRangePass())
FUNCTION_PASS("nvptx-copy-byval-args", NVPTXCopyByValArgsPass())
FUNCTION_PASS("nvptx-lower-args", NVPTXLowerArgsPass(*this))
FUNCTION_PASS("nvptx-tag-invariant-loads", NVPTXTagInvariantLoadsPass())
+FUNCTION_PASS("nvptx-increase-local-alignment", NVPTXIncreaseLocalAlignmentPass())
#undef FUNCTION_PASS
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 85d28a703a4cb..e7ad39449dc2f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -391,6 +391,8 @@ void NVPTXPassConfig::addIRPasses() {
// but EarlyCSE can do neither of them.
if (getOptLevel() != CodeGenOptLevel::None) {
addEarlyCSEOrGVNPass();
+ // Increase alignment for local arrays to improve vectorization.
+ addPass(createNVPTXIncreaseLocalAlignmentPass());
if (!DisableLoadStoreVectorizer)
addPass(createLoadStoreVectorizerPass());
addPass(createSROAPass());
diff --git a/llvm/test/CodeGen/NVPTX/increase-local-align.ll b/llvm/test/CodeGen/NVPTX/increase-local-align.ll
new file mode 100644
index 0000000000000..605c4b5b2b77d
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/increase-local-align.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=nvptx-increase-local-alignment < %s | FileCheck %s --check-prefixes=COMMON,DEFAULT
+; RUN: opt -S -passes=nvptx-increase-local-alignment -nvptx-use-max-local-array-alignment < %s | FileCheck %s --check-prefixes=COMMON,MAX
+target triple = "nvptx64-nvidia-cuda"
+
+define void @test1() {
+; COMMON-LABEL: define void @test1() {
+; COMMON-NEXT: [[A:%.*]] = alloca i8, align 1
+; COMMON-NEXT: ret void
+;
+ %a = alloca i8, align 1
+ ret void
+}
+
+define void @test2() {
+; DEFAULT-LABEL: define void @test2() {
+; DEFAULT-NEXT: [[A:%.*]] = alloca [63 x i8], align 1
+; DEFAULT-NEXT: ret void
+;
+; MAX-LABEL: define void @test2() {
+; MAX-NEXT: [[A:%.*]] = alloca [63 x i8], align 16
+; MAX-NEXT: ret void
+;
+ %a = alloca [63 x i8], align 1
+ ret void
+}
+
+define void @test3() {
+; COMMON-LABEL: define void @test3() {
+; COMMON-NEXT: [[A:%.*]] = alloca [64 x i8], align 16
+; COMMON-NEXT: ret void
+;
+ %a = alloca [64 x i8], align 1
+ ret void
+}
+
+define void @test4() {
+; DEFAULT-LABEL: define void @test4() {
+; DEFAULT-NEXT: [[A:%.*]] = alloca i8, i32 63, align 1
+; DEFAULT-NEXT: ret void
+;
+; MAX-LABEL: define void @test4() {
+; MAX-NEXT: [[A:%.*]] = alloca i8, i32 63, align 16
+; MAX-NEXT: ret void
+;
+ %a = alloca i8, i32 63, align 1
+ ret void
+}
+
+define void @test5() {
+; COMMON-LABEL: define void @test5() {
+; COMMON-NEXT: [[A:%.*]] = alloca i8, i32 64, align 16
+; COMMON-NEXT: ret void
+;
+ %a = alloca i8, i32 64, align 1
+ ret void
+}
+
+define void @test6() {
+; COMMON-LABEL: define void @test6() {
+; COMMON-NEXT: [[A:%.*]] = alloca i8, align 32
+; COMMON-NEXT: ret void
+;
+ %a = alloca i8, align 32
+ ret void
+}
+
+define void @test7() {
+; COMMON-LABEL: define void @test7() {
+; COMMON-NEXT: [[A:%.*]] = alloca i32, align 2
+; COMMON-NEXT: ret void
+;
+ %a = alloca i32, align 2
+ ret void
+}
+
+define void @test8() {
+; COMMON-LABEL: define void @test8() {
+; COMMON-NEXT: [[A:%.*]] = alloca [2 x i32], align 8
+; COMMON-NEXT: ret void
+;
+ %a = alloca [2 x i32], align 2
+ ret void
+}
+
More information about the llvm-commits
mailing list