[llvm] [AMDGPU] Fix PromoteAlloca size check of alloca for store (PR #72528)

Fri Nov 17 15:41:51 PST 2023

https://github.com/bcahoon updated https://github.com/llvm/llvm-project/pull/72528

>From 833ed7ea953ec34cef070391013b9fe671d1c626 Mon Sep 17 00:00:00 2001
From: Brendon Cahoon <brendon.cahoon at amd.com>
Date: Wed, 15 Nov 2023 18:37:10 -0600
Subject: [PATCH 1/2] [AMDGPU] Fix PromoteAlloca size check of alloca for store

When storing a subvector, too many elements were written when the
size of the alloca is smaller than the size of the vector store.
This patch uses the minimum of the alloca vector's element count and
the stored vector's element count as the number of elements to store.
---
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp |  4 +++-
 .../CodeGen/AMDGPU/promote-alloca-subvecs.ll  | 20 +++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 29591ddd669c95e..9293f8954cfe2b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -469,6 +469,7 @@ static Value *promoteAllocaUserToVector(
       assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
       const unsigned NumWrittenElts =
           AccessSize / DL.getTypeStoreSize(VecEltTy);
+      const unsigned NumVecElts = VectorTy->getNumElements();
       auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts);
       assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
 
@@ -480,7 +481,8 @@ static Value *promoteAllocaUserToVector(
       Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
 
       Value *CurVec = GetOrLoadCurrentVectorValue();
-      for (unsigned K = 0; K < NumWrittenElts; ++K) {
+      for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
+           K < NumElts; ++K) {
         Value *CurIdx =
             Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
         CurVec = Builder.CreateInsertElement(
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
index 765aa250a48f4f0..d8e228e27a8360c 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
@@ -458,3 +458,23 @@ finally:
   %load = load <4 x i16>, ptr addrspace(5) %ptr.2, align 2
   ret <4 x i16> %load
 }
+
+
+; Check the case when the alloca is smaller than the vector size.
+define void @math_kernel3(<4 x i32> %store) {
+; CHECK-LABEL: define void @math_kernel3
+; CHECK-SAME: (<4 x i32> [[STORE:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x i32> [[STORE]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <3 x i32> undef, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[STORE]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i32> [[TMP1]], i32 [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[STORE]], i64 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <3 x i32> [[TMP3]], i32 [[TMP4]], i32 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %res = alloca <3 x i32>, align 16, addrspace(5)
+  store <4 x i32> %store, ptr addrspace(5) %res, align 16
+  ret void
+}

>From b807981d4dd4f0e70f243df6065c5d3489dc33fd Mon Sep 17 00:00:00 2001
From: Brendon Cahoon <brendon.cahoon at amd.com>
Date: Fri, 17 Nov 2023 17:31:49 -0600
Subject: [PATCH 2/2] fixup! [AMDGPU] Fix PromoteAlloca size check of alloca
 for store

---
 .../CodeGen/AMDGPU/promote-alloca-subvecs.ll  | 21 ++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
index d8e228e27a8360c..7c5410004ed5b75 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
@@ -461,20 +461,27 @@ finally:
 
 
 ; Check the case when the alloca is smaller than the vector size.
-define void @math_kernel3(<4 x i32> %store) {
-; CHECK-LABEL: define void @math_kernel3
-; CHECK-SAME: (<4 x i32> [[STORE:%.*]]) {
+define void @test_smaller_alloca_store(<4 x i32> %store1, <4 x i32> %store2) {
+; CHECK-LABEL: define void @test_smaller_alloca_store
+; CHECK-SAME: (<4 x i32> [[STORE1:%.*]], <4 x i32> [[STORE2:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x i32> [[STORE]], i64 0
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x i32> [[STORE1]], i64 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <3 x i32> undef, i32 [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[STORE]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[STORE1]], i64 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i32> [[TMP1]], i32 [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[STORE]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[STORE1]], i64 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <3 x i32> [[TMP3]], i32 [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[STORE2]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <3 x i32> [[TMP5]], i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[STORE2]], i64 1
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <3 x i32> [[TMP7]], i32 [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[STORE2]], i64 2
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <3 x i32> [[TMP9]], i32 [[TMP10]], i32 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
   %res = alloca <3 x i32>, align 16, addrspace(5)
-  store <4 x i32> %store, ptr addrspace(5) %res, align 16
+  store <4 x i32> %store1, ptr addrspace(5) %res, align 16
+  store <4 x i32> %store2, ptr addrspace(5) %res, align 16
   ret void
 }