[llvm] [AMDGPU][PromoteAlloca] Don't stop when an alloca is too big to promote (PR #93466)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Mon May 27 05:52:48 PDT 2024
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/93466
>From 7de93557b909c49927de217671d2414c88f66b28 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 27 May 2024 14:44:45 +0200
Subject: [PATCH 1/2] [AMDGPU][PromoteAlloca] Don't stop when an alloca is too
big to promote
When I rewrote this, I made a mistake in the control flow. I thought we could just stop promoting if an alloca is too big to vectorize, but we can't. Other allocas in the list may be promotable and fit within the budget.
Fixes SWDEV-455343
---
.../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 30 +++++++-------
.../AMDGPU/promote-alloca-budget-exhausted.ll | 41 +++++++++++++++++++
2 files changed, 57 insertions(+), 14 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-budget-exhausted.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index c0846b123d187..fbda8f973db99 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -333,22 +333,24 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
bool Changed = false;
for (AllocaInst *AI : Allocas) {
const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType());
- if (AllocaCost > VectorizationBudget) {
- LLVM_DEBUG(dbgs() << " Alloca too big for vectorization: " << *AI
- << "\n");
- return Changed;
+ // First, check if we have enough budget to vectorize this alloca.
+ if (AllocaCost <= VectorizationBudget) {
+ // If we do, attempt vectorization, otherwise, fall through and try
+ // promoting to LDS instead.
+ if (tryPromoteAllocaToVector(*AI)) {
+ Changed = true;
+ assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
+ "Underflow!");
+ VectorizationBudget -= AllocaCost;
+ LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
+ << VectorizationBudget << "\n");
+ continue;
+ }
+ } else {
+ LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:" << AllocaCost << ", budget:" << VectorizationBudget << "): " << *AI << "\n");
}
- if (tryPromoteAllocaToVector(*AI)) {
- Changed = true;
- assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
- "Underflow!");
- VectorizationBudget -= AllocaCost;
- LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
- << VectorizationBudget << "\n");
- if (VectorizationBudget == 0)
- break;
- } else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
+ if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
Changed = true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-budget-exhausted.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-budget-exhausted.ll
new file mode 100644
index 0000000000000..e13ab421dfdb7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-budget-exhausted.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca-to-vector-limit=128 -passes=amdgpu-promote-alloca-to-vector %s -o - | FileCheck %s
+
+; Check that when we see an alloca that's too big to vectorize given the remaining budget,
+; we don't give up and we keep looking for other allocas to vectorize.
+
+define amdgpu_kernel void @simple_users_scores() {
+; CHECK-LABEL: define amdgpu_kernel void @simple_users_scores(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[MANYUSERS:%.*]] = alloca [64 x i64], align 4, addrspace(5)
+; CHECK-NEXT: [[MANYUSERS_1:%.*]] = getelementptr i8, ptr addrspace(5) [[MANYUSERS]], i64 2
+; CHECK-NEXT: [[V0:%.*]] = load i8, ptr addrspace(5) [[MANYUSERS_1]], align 1
+; CHECK-NEXT: [[V0_EXT:%.*]] = zext i8 [[V0]] to i64
+; CHECK-NEXT: store i64 [[V0_EXT]], ptr addrspace(5) [[MANYUSERS_1]], align 8
+; CHECK-NEXT: [[MANYUSERS_2:%.*]] = getelementptr i8, ptr addrspace(5) [[MANYUSERS]], i64 1
+; CHECK-NEXT: [[V1:%.*]] = load i8, ptr addrspace(5) [[MANYUSERS_2]], align 1
+; CHECK-NEXT: [[V1_EXT:%.*]] = zext i8 [[V0]] to i64
+; CHECK-NEXT: store i64 [[V1_EXT]], ptr addrspace(5) [[MANYUSERS_2]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ ; should get a score of 1
+ %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+ ; should get a score of 4 and be visited first.
+ %manyusers = alloca [64 x i64], align 4, addrspace(5)
+
+ store i64 42, ptr addrspace(5) %simpleuser
+
+ %manyusers.1 = getelementptr i8, ptr addrspace(5) %manyusers, i64 2
+ %v0 = load i8, ptr addrspace(5) %manyusers.1
+ %v0.ext = zext i8 %v0 to i64
+ store i64 %v0.ext, ptr addrspace(5) %manyusers.1
+
+ %manyusers.2 = getelementptr i8, ptr addrspace(5) %manyusers, i64 1
+ %v1 = load i8, ptr addrspace(5) %manyusers.2
+ %v1.ext = zext i8 %v0 to i64
+ store i64 %v1.ext, ptr addrspace(5) %manyusers.2
+
+ ret void
+}
>From 4b43b2d448ab5ce87436c87d4aff92f54c136961 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 27 May 2024 14:52:38 +0200
Subject: [PATCH 2/2] clang-format
---
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index fbda8f973db99..33474e7de0188 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -347,7 +347,9 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
continue;
}
} else {
- LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:" << AllocaCost << ", budget:" << VectorizationBudget << "): " << *AI << "\n");
+ LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
+ << AllocaCost << ", budget:" << VectorizationBudget
+ << "): " << *AI << "\n");
}
if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
More information about the llvm-commits
mailing list