[llvm] 0e73bbd - [AMDGPU][PromoteAlloca] Don't stop when an alloca is too big to promote (#93466)

Mon May 27 23:05:54 PDT 2024

Author: Pierre van Houtryve
Date: 2024-05-28T08:05:50+02:00
New Revision: 0e73bbd3450cca0bb383335fae4120f08da5be7b

URL: https://github.com/llvm/llvm-project/commit/0e73bbd3450cca0bb383335fae4120f08da5be7b
DIFF: https://github.com/llvm/llvm-project/commit/0e73bbd3450cca0bb383335fae4120f08da5be7b.diff

LOG: [AMDGPU][PromoteAlloca] Don't stop when an alloca is too big to promote (#93466)

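When I rewrote this, I made a mistake in the control flow. I thought we
could just stop promoting once an alloca is too big to vectorize, but we
can't: other allocas in the list may still be promotable and fit within
the remaining budget, and the alloca that is too big to vectorize should
still get a chance to be promoted to LDS.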

Fixes SWDEV-455343

Added: 
    llvm/test/CodeGen/AMDGPU/promote-alloca-budget-exhausted.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index c0846b123d187..33474e7de0188 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -333,22 +333,26 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
   bool Changed = false;
   for (AllocaInst *AI : Allocas) {
     const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType());
-    if (AllocaCost > VectorizationBudget) {
-      LLVM_DEBUG(dbgs() << "  Alloca too big for vectorization: " << *AI
-                        << "\n");
-      return Changed;
+    // First, check if we have enough budget to vectorize this alloca.
+    if (AllocaCost <= VectorizationBudget) {
+      // If we do, attempt vectorization, otherwise, fall through and try
+      // promoting to LDS instead.
+      if (tryPromoteAllocaToVector(*AI)) {
+        Changed = true;
+        assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
+               "Underflow!");
+        VectorizationBudget -= AllocaCost;
+        LLVM_DEBUG(dbgs() << "  Remaining vectorization budget:"
+                          << VectorizationBudget << "\n");
+        continue;
+      }
+    } else {
+      LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
+                        << AllocaCost << ", budget:" << VectorizationBudget
+                        << "): " << *AI << "\n");
     }
 
-    if (tryPromoteAllocaToVector(*AI)) {
-      Changed = true;
-      assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
-             "Underflow!");
-      VectorizationBudget -= AllocaCost;
-      LLVM_DEBUG(dbgs() << "  Remaining vectorization budget:"
-                        << VectorizationBudget << "\n");
-      if (VectorizationBudget == 0)
-        break;
-    } else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
+    if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
       Changed = true;
   }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-budget-exhausted.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-budget-exhausted.ll
new file mode 100644
index 0000000000000..e13ab421dfdb7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-budget-exhausted.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca-to-vector-limit=128 -passes=amdgpu-promote-alloca-to-vector %s -o - | FileCheck %s
+
+; Check that when we see an alloca that's too big to vectorize given the remaining budget,
+; we don't give up and we keep looking for other allocas to vectorize.
+
+define amdgpu_kernel void @simple_users_scores() {
+; CHECK-LABEL: define amdgpu_kernel void @simple_users_scores(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[MANYUSERS:%.*]] = alloca [64 x i64], align 4, addrspace(5)
+; CHECK-NEXT:    [[MANYUSERS_1:%.*]] = getelementptr i8, ptr addrspace(5) [[MANYUSERS]], i64 2
+; CHECK-NEXT:    [[V0:%.*]] = load i8, ptr addrspace(5) [[MANYUSERS_1]], align 1
+; CHECK-NEXT:    [[V0_EXT:%.*]] = zext i8 [[V0]] to i64
+; CHECK-NEXT:    store i64 [[V0_EXT]], ptr addrspace(5) [[MANYUSERS_1]], align 8
+; CHECK-NEXT:    [[MANYUSERS_2:%.*]] = getelementptr i8, ptr addrspace(5) [[MANYUSERS]], i64 1
+; CHECK-NEXT:    [[V1:%.*]] = load i8, ptr addrspace(5) [[MANYUSERS_2]], align 1
+; CHECK-NEXT:    [[V1_EXT:%.*]] = zext i8 [[V0]] to i64
+; CHECK-NEXT:    store i64 [[V1_EXT]], ptr addrspace(5) [[MANYUSERS_2]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  ; should get a score of 1
+  %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+  ; should get a score of 4 and be visited first.
+  %manyusers = alloca [64 x i64], align 4, addrspace(5)
+
+  store i64 42, ptr addrspace(5) %simpleuser
+
+  %manyusers.1 = getelementptr i8, ptr addrspace(5) %manyusers, i64 2
+  %v0 = load i8, ptr addrspace(5)  %manyusers.1
+  %v0.ext = zext i8 %v0 to i64
+  store i64 %v0.ext, ptr addrspace(5) %manyusers.1
+
+  %manyusers.2 = getelementptr i8, ptr addrspace(5) %manyusers, i64 1
+  %v1 = load i8, ptr addrspace(5)  %manyusers.2
+  %v1.ext = zext i8 %v0 to i64
+  store i64 %v1.ext, ptr addrspace(5) %manyusers.2
+
+  ret void
+}