[llvm] 15f54dd - AMDGPU: Account for usage HIP-style dynamic LDS

Wed Jan 19 10:06:18 PST 2022

Author: Yaxun (Sam) Liu
Date: 2022-01-19T13:05:29-05:00
New Revision: 15f54dd5e496bcb5ffb8e06020f2046b0ef23c76

URL: https://github.com/llvm/llvm-project/commit/15f54dd5e496bcb5ffb8e06020f2046b0ef23c76
DIFF: https://github.com/llvm/llvm-project/commit/15f54dd5e496bcb5ffb8e06020f2046b0ef23c76.diff

LOG: AMDGPU: Account for usage HIP-style dynamic LDS

Disable promotion of allocas to LDS when HIP-style dynamic LDS is used, since
its size is unknown at compile time.

Patch by: Siu Chi Chan

Reviewed by: Matt Arsenault, Yaxun Liu

Differential Revision: https://reviews.llvm.org/D117494

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
    llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index f9a9fe403ff6..2d8126a49327 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -789,6 +789,17 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
     Align Alignment =
         DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
     uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
+
+    // HIP uses an extern unsized array in local address space for dynamically
+    // allocated shared memory.  In that case, we have to disable the promotion.
+    if (GV->hasExternalLinkage() && AllocSize == 0) {
+      LocalMemLimit = 0;
+      LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
+                           "local memory. Promoting to local memory "
+                           "disabled.\n");
+      return false;
+    }
+
     AllocatedSizes.emplace_back(AllocSize, Alignment);
   }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
index 67e3141b4204..8ac82ffa251c 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
@@ -5,6 +5,7 @@ target datalayout = "A5"
 
 @all_lds = internal unnamed_addr addrspace(3) global [16384 x i32] undef, align 4
 @some_lds = internal unnamed_addr addrspace(3) global [32 x i32] undef, align 4
+@some_dynamic_lds = external hidden addrspace(3) global [0 x i32], align 4
 
 @initializer_user_some = addrspace(1) global i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), align 4
 @initializer_user_all = addrspace(1) global i32 ptrtoint ([16384 x i32] addrspace(3)* @all_lds to i32), align 4
@@ -62,6 +63,33 @@ entry:
   ret void
 }
 
+; Has a constant expression use through a single level of constant
+; expression, but usage of dynamic LDS should block promotion
+
+; IR-LABEL: @constant_expression_uses_some_dynamic_lds(
+; IR: alloca
+
+; ASM-LABEL: {{^}}constant_expression_uses_some_dynamic_lds:
+; ASM: .amdhsa_group_segment_fixed_size 0{{$}}
+define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+entry:
+  %stack = alloca [4 x i32], align 4, addrspace(5)
+  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
+  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
+  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
+  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
+  store i32 9, i32 addrspace(5)* %gep0
+  store i32 10, i32 addrspace(5)* %gep1
+  store i32 99, i32 addrspace(5)* %gep2
+  store i32 43, i32 addrspace(5)* %gep3
+  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
+  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
+  store i32 %load, i32 addrspace(1)* %out
+  %gep_dyn_lds =  getelementptr inbounds [0 x i32], [0 x i32]* addrspacecast ([0 x i32] addrspace(3)* @some_dynamic_lds to [0 x i32]*), i64 0, i64 0
+  store i32 1234, i32* %gep_dyn_lds, align 4
+  ret void
+}
+
 declare void @callee(i8*)
 
 ; IR-LABEL: @constant_expression_uses_all_lds_multi_level(
@@ -111,6 +139,29 @@ entry:
   ret void
 }
 
+; IR-LABEL: @constant_expression_uses_some_dynamic_lds_multi_level(
+; IR: alloca
+
+; ASM-LABEL: {{^}}constant_expression_uses_some_dynamic_lds_multi_level:
+; ASM: .amdhsa_group_segment_fixed_size 0{{$}}
+define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+entry:
+  %stack = alloca [4 x i32], align 4, addrspace(5)
+  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
+  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
+  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
+  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
+  store i32 9, i32 addrspace(5)* %gep0
+  store i32 10, i32 addrspace(5)* %gep1
+  store i32 99, i32 addrspace(5)* %gep2
+  store i32 43, i32 addrspace(5)* %gep3
+  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
+  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
+  store i32 %load, i32 addrspace(1)* %out
+  call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([0 x i32], [0 x i32] addrspace(3)* @some_dynamic_lds, i32 0, i32 0) to i8 addrspace(3)*) to i8*))
+  ret void
+}
+
 ; IR-LABEL: @constant_expression_uses_some_lds_global_initializer(
 ; IR-NOT: alloca
 ; IR: llvm.amdgcn.workitem.id