[llvm] 8ad39fb - [Attributor][FIX] Heap2Stack needs to use the alloca AS

Wed Feb 16 13:58:51 PST 2022

Author: Johannes Doerfert
Date: 2022-02-16T15:58:32-06:00
New Revision: 8ad39fbaf23893b3384cafa0f179d35dcf3c672b

URL: https://github.com/llvm/llvm-project/commit/8ad39fbaf23893b3384cafa0f179d35dcf3c672b
DIFF: https://github.com/llvm/llvm-project/commit/8ad39fbaf23893b3384cafa0f179d35dcf3c672b.diff

LOG: [Attributor][FIX] Heap2Stack needs to use the alloca AS

When we move an allocation from the heap to the stack we need to
allocate it in the alloca AS and then cast the result. This also
prevents us from inserting the alloca after the allocation call but
rather right before.

Fixes https://github.com/llvm/llvm-project/issues/53858

Added: 
    

Modified: 
    llvm/lib/Transforms/IPO/AttributorAttributes.cpp
    llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll
    llvm/test/Transforms/OpenMP/spmdization.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 8e90ac352791..0ad64dbb4595 100644

--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -32,6 +32,7 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Assumptions.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -6031,13 +6032,13 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
       else
         A.emitRemark<OptimizationRemark>(AI.CB, "HeapToStack", Remark);
 
+      const DataLayout &DL = A.getInfoCache().getDL();
       Value *Size;
       Optional<APInt> SizeAPI = getSize(A, *this, AI);
       if (SizeAPI.hasValue()) {
         Size = ConstantInt::get(AI.CB->getContext(), *SizeAPI);
       } else {
         LLVMContext &Ctx = AI.CB->getContext();
-        auto &DL = A.getInfoCache().getDL();
         ObjectSizeOpts Opts;
         ObjectSizeOffsetEvaluator Eval(DL, TLI, Ctx, Opts);
         SizeOffsetEvalType SizeOffsetPair = Eval.compute(AI.CB);
@@ -6057,14 +6058,14 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
             max(Alignment, MaybeAlign(AlignmentAPI.getValue().getZExtValue()));
       }
 
-      unsigned AS = cast<PointerType>(AI.CB->getType())->getAddressSpace();
-      Instruction *Alloca =
-          new AllocaInst(Type::getInt8Ty(F->getContext()), AS, Size, Alignment,
-                         "", AI.CB->getNextNode());
+      // TODO: Hoist the alloca towards the function entry.
+      unsigned AS = DL.getAllocaAddrSpace();
+      Instruction *Alloca = new AllocaInst(Type::getInt8Ty(F->getContext()), AS,
+                                           Size, Alignment, "", AI.CB);
 
       if (Alloca->getType() != AI.CB->getType())
-        Alloca = new BitCastInst(Alloca, AI.CB->getType(), "malloc_bc",
-                                 Alloca->getNextNode());
+        Alloca = BitCastInst::CreatePointerBitCastOrAddrSpaceCast(
+            Alloca, AI.CB->getType(), "malloc_cast", AI.CB);
 
       auto *I8Ty = Type::getInt8Ty(F->getContext());
       auto *InitVal = getInitialValueOfAllocation(AI.CB, TLI, I8Ty);

diff  --git a/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll b/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll
index 0f207e402759..5ee0a6892ac6 100644
--- a/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll
+++ b/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll
@@ -4,7 +4,12 @@
 ; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal  -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM
 ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal  -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM
 
+; FIXME: amdgpu doesn't claim malloc is a thing, so the test is somewhat
+; useless except the __kmpc_alloc_shared part which now also covers the important
+; part this test was initially designed for, make sure the "is freed" check is
+; not sufficient on a GPU.
 target triple = "amdgcn-amd-amdhsa"
+target datalayout = "A5"
 
 declare noalias i8* @malloc(i64)
 
@@ -20,6 +25,7 @@ declare void @no_sync_func(i8* nocapture %p) nofree nosync willreturn
 
 declare void @nofree_func(i8* nocapture %p) nofree  nosync willreturn
 
+declare void @usei8(i8* %p)
 declare void @foo(i32* %p)
 
 declare void @foo_nounw(i32* %p) nounwind nofree
@@ -663,6 +669,43 @@ define void @test16d(i8 %v, i8** %P) {
   store i8* %1, i8** %P
   ret void
 }
+
+declare i8* @__kmpc_alloc_shared(i64)
+declare void @__kmpc_free_shared(i8* nocapture, i64)
+
+define void @test17() {
+; IS________OPM-LABEL: define {{[^@]+}}@test17() {
+; IS________OPM-NEXT:    [[TMP1:%.*]] = tail call noalias i8* @__kmpc_alloc_shared(i64 noundef 4)
+; IS________OPM-NEXT:    tail call void @usei8(i8* noalias nocapture nofree [[TMP1]]) #[[ATTR6:[0-9]+]]
+; IS________OPM-NEXT:    tail call void @__kmpc_free_shared(i8* noalias nocapture [[TMP1]], i64 noundef 4)
+; IS________OPM-NEXT:    ret void
+;
+; IS________NPM-LABEL: define {{[^@]+}}@test17() {
+; IS________NPM-NEXT:    [[TMP1:%.*]] = alloca i8, i64 4, align 1, addrspace(5)
+; IS________NPM-NEXT:    [[MALLOC_CAST:%.*]] = addrspacecast i8 addrspace(5)* [[TMP1]] to i8*
+; IS________NPM-NEXT:    tail call void @usei8(i8* noalias nocapture nofree [[MALLOC_CAST]]) #[[ATTR6:[0-9]+]]
+; IS________NPM-NEXT:    ret void
+;
+  %1 = tail call noalias i8* @__kmpc_alloc_shared(i64 4)
+  tail call void @usei8(i8* nocapture nofree %1) willreturn nounwind nosync
+  tail call void @__kmpc_free_shared(i8* %1, i64 4)
+  ret void
+}
+
+define void @test17b() {
+; CHECK-LABEL: define {{[^@]+}}@test17b() {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call noalias i8* @__kmpc_alloc_shared(i64 noundef 4)
+; CHECK-NEXT:    tail call void @usei8(i8* nofree [[TMP1]]) #[[ATTR6:[0-9]+]]
+; CHECK-NEXT:    tail call void @__kmpc_free_shared(i8* nocapture [[TMP1]], i64 noundef 4)
+; CHECK-NEXT:    ret void
+;
+  %1 = tail call noalias i8* @__kmpc_alloc_shared(i64 4)
+  tail call void @usei8(i8* nofree %1) willreturn nounwind nosync
+  tail call void @__kmpc_free_shared(i8* %1, i64 4)
+  ret void
+}
+
+
 ;.
 ; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind willreturn }
 ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nofree nosync willreturn }
@@ -670,4 +713,5 @@ define void @test16d(i8 %v, i8** %P) {
 ; CHECK: attributes #[[ATTR3]] = { noreturn }
 ; CHECK: attributes #[[ATTR4:[0-9]+]] = { argmemonly nofree nosync nounwind willreturn }
 ; CHECK: attributes #[[ATTR5]] = { nounwind }
+; CHECK: attributes #[[ATTR6]] = { nosync nounwind willreturn }
 ;.

diff  --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll
index 07ed024cb35b..752ccff9354a 100644
--- a/llvm/test/Transforms/OpenMP/spmdization.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization.ll
@@ -678,8 +678,9 @@ define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias
 ; AMDGPU-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; AMDGPU-NEXT:  entry:
 ; AMDGPU-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
-; AMDGPU-NEXT:    [[TMP0:%.*]] = alloca i8, i64 4, align 4
-; AMDGPU-NEXT:    [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32*
+; AMDGPU-NEXT:    [[TMP0:%.*]] = alloca i8, i64 4, align 4, addrspace(5)
+; AMDGPU-NEXT:    [[MALLOC_CAST:%.*]] = addrspacecast i8 addrspace(5)* [[TMP0]] to i8*
+; AMDGPU-NEXT:    [[X_ON_STACK:%.*]] = bitcast i8* [[MALLOC_CAST]] to i32*
 ; AMDGPU-NEXT:    call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR5]]
 ; AMDGPU-NEXT:    br label [[FOR_COND:%.*]]
 ; AMDGPU:       for.cond:
@@ -722,8 +723,9 @@ define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias
 ; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
 ; AMDGPU-DISABLED-NEXT:  entry:
 ; AMDGPU-DISABLED-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
-; AMDGPU-DISABLED-NEXT:    [[TMP0:%.*]] = alloca i8, i64 4, align 4
-; AMDGPU-DISABLED-NEXT:    [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32*
+; AMDGPU-DISABLED-NEXT:    [[TMP0:%.*]] = alloca i8, i64 4, align 4, addrspace(5)
+; AMDGPU-DISABLED-NEXT:    [[MALLOC_CAST:%.*]] = addrspacecast i8 addrspace(5)* [[TMP0]] to i8*
+; AMDGPU-DISABLED-NEXT:    [[X_ON_STACK:%.*]] = bitcast i8* [[MALLOC_CAST]] to i32*
 ; AMDGPU-DISABLED-NEXT:    call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR5]]
 ; AMDGPU-DISABLED-NEXT:    br label [[FOR_COND:%.*]]
 ; AMDGPU-DISABLED:       for.cond: