[llvm] [AMDGPU][PromoteAlloca] Don't stop when an alloca is too big to promote (PR #93466)

Mon May 27 05:52:48 PDT 2024

https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/93466

>From 7de93557b909c49927de217671d2414c88f66b28 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 27 May 2024 14:44:45 +0200
Subject: [PATCH 1/2] [AMDGPU][PromoteAlloca] Don't stop when an alloca is too
 big to promote

When I rewrote this, I made a mistake in the control flow. I thought we could just stop promoting if an alloca is too big to vectorize, but we can't. Other allocas in the list may be promotable and fit within the budget;

Fixes SWDEV-455343
---
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 30 +++++++-------
 .../AMDGPU/promote-alloca-budget-exhausted.ll | 41 +++++++++++++++++++
 2 files changed, 57 insertions(+), 14 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-budget-exhausted.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index c0846b123d187..fbda8f973db99 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -333,22 +333,24 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
   bool Changed = false;
   for (AllocaInst *AI : Allocas) {
     const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType());
-    if (AllocaCost > VectorizationBudget) {
-      LLVM_DEBUG(dbgs() << "  Alloca too big for vectorization: " << *AI
-                        << "\n");
-      return Changed;
+    // First, check if we have enough budget to vectorize this alloca.
+    if (AllocaCost <= VectorizationBudget) {
+      // If we do, attempt vectorization, otherwise, fall through and try
+      // promoting to LDS instead.
+      if (tryPromoteAllocaToVector(*AI)) {
+        Changed = true;
+        assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
+               "Underflow!");
+        VectorizationBudget -= AllocaCost;
+        LLVM_DEBUG(dbgs() << "  Remaining vectorization budget:"
+                          << VectorizationBudget << "\n");
+        continue;
+      }
+    } else {
+      LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:" << AllocaCost << ", budget:" << VectorizationBudget << "): " << *AI << "\n");
     }
 
-    if (tryPromoteAllocaToVector(*AI)) {
-      Changed = true;
-      assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
-             "Underflow!");
-      VectorizationBudget -= AllocaCost;
-      LLVM_DEBUG(dbgs() << "  Remaining vectorization budget:"
-                        << VectorizationBudget << "\n");
-      if (VectorizationBudget == 0)
-        break;
-    } else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
+    if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
       Changed = true;
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-budget-exhausted.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-budget-exhausted.ll
new file mode 100644
index 0000000000000..e13ab421dfdb7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-budget-exhausted.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca-to-vector-limit=128 -passes=amdgpu-promote-alloca-to-vector %s -o - | FileCheck %s
+
+; Check that when we see an alloca that's too big to vectorize given the remaining budget,
+; we don't give up and we keep looking for other allocas to vectorize.
+
+define amdgpu_kernel void @simple_users_scores() {
+; CHECK-LABEL: define amdgpu_kernel void @simple_users_scores(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[MANYUSERS:%.*]] = alloca [64 x i64], align 4, addrspace(5)
+; CHECK-NEXT:    [[MANYUSERS_1:%.*]] = getelementptr i8, ptr addrspace(5) [[MANYUSERS]], i64 2
+; CHECK-NEXT:    [[V0:%.*]] = load i8, ptr addrspace(5) [[MANYUSERS_1]], align 1
+; CHECK-NEXT:    [[V0_EXT:%.*]] = zext i8 [[V0]] to i64
+; CHECK-NEXT:    store i64 [[V0_EXT]], ptr addrspace(5) [[MANYUSERS_1]], align 8
+; CHECK-NEXT:    [[MANYUSERS_2:%.*]] = getelementptr i8, ptr addrspace(5) [[MANYUSERS]], i64 1
+; CHECK-NEXT:    [[V1:%.*]] = load i8, ptr addrspace(5) [[MANYUSERS_2]], align 1
+; CHECK-NEXT:    [[V1_EXT:%.*]] = zext i8 [[V0]] to i64
+; CHECK-NEXT:    store i64 [[V1_EXT]], ptr addrspace(5) [[MANYUSERS_2]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  ; should get a score of 1
+  %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+  ; should get a score of 4 and be visited first.
+  %manyusers = alloca [64 x i64], align 4, addrspace(5)
+
+  store i64 42, ptr addrspace(5) %simpleuser
+
+  %manyusers.1 = getelementptr i8, ptr addrspace(5) %manyusers, i64 2
+  %v0 = load i8, ptr addrspace(5)  %manyusers.1
+  %v0.ext = zext i8 %v0 to i64
+  store i64 %v0.ext, ptr addrspace(5) %manyusers.1
+
+  %manyusers.2 = getelementptr i8, ptr addrspace(5) %manyusers, i64 1
+  %v1 = load i8, ptr addrspace(5)  %manyusers.2
+  %v1.ext = zext i8 %v0 to i64
+  store i64 %v1.ext, ptr addrspace(5) %manyusers.2
+
+  ret void
+}

>From 4b43b2d448ab5ce87436c87d4aff92f54c136961 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 27 May 2024 14:52:38 +0200
Subject: [PATCH 2/2] clang-format

---
 llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index fbda8f973db99..33474e7de0188 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -347,7 +347,9 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
         continue;
       }
     } else {
-      LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:" << AllocaCost << ", budget:" << VectorizationBudget << "): " << *AI << "\n");
+      LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
+                        << AllocaCost << ", budget:" << VectorizationBudget
+                        << "): " << *AI << "\n");
     }
 
     if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))