[llvm] 310d32c - [AMDGPU] Fix promote alloca which is already vector

Mon May 11 14:52:45 PDT 2020

Author: Stanislav Mekhanoshin
Date: 2020-05-11T14:52:31-07:00
New Revision: 310d32cb80a611e6384a921e85607fea05841f26

URL: https://github.com/llvm/llvm-project/commit/310d32cb80a611e6384a921e85607fea05841f26
DIFF: https://github.com/llvm/llvm-project/commit/310d32cb80a611e6384a921e85607fea05841f26.diff

LOG: [AMDGPU] Fix promote alloca which is already vector

Do not touch loads and stores whose type is already a vector.
Previously the pass was unable to see these loads and stores
because they were hidden behind bitcasts.

Differential Revision: https://reviews.llvm.org/D79738

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
    llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 0c71eee3a4f8..44fa99f162a9 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -468,7 +468,7 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) {
     IRBuilder<> Builder(Inst);
     switch (Inst->getOpcode()) {
     case Instruction::Load: {
-      if (Inst->getType() == AllocaTy)
+      if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy())
         break;
 
       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
@@ -486,7 +486,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) {
     }
     case Instruction::Store: {
       StoreInst *SI = cast<StoreInst>(Inst);
-      if (SI->getValueOperand()->getType() == AllocaTy)
+      if (SI->getValueOperand()->getType() == AllocaTy ||
+          SI->getValueOperand()->getType()->isVectorTy())
         break;
 
       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);

diff  --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
index b9a9850586e1..9c7b2fcb8c9a 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
@@ -345,6 +345,110 @@ entry:
   ret void
 }
 
+; OPT-LABEL: @bitcast_vector_to_vector(
+; OPT-NOT:   alloca
+; OPT:       store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(1)* %out, align 16
+
+; GCN-LABEL: {{^}}bitcast_vector_to_vector:
+; GCN: v_mov_b32_e32 v0, 1
+; GCN: v_mov_b32_e32 v1, 2
+; GCN: v_mov_b32_e32 v2, 3
+; GCN: v_mov_b32_e32 v3, 4
+
+; GCN: ScratchSize: 0
+
+define amdgpu_kernel void @bitcast_vector_to_vector(<4 x i32> addrspace(1)* %out)  {
+.entry:
+  %alloca = alloca <4 x float>, align 16, addrspace(5)
+  %cast = bitcast <4 x float> addrspace(5)* %alloca to <4 x i32> addrspace(5)*
+  store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %cast
+  %load = load <4 x i32>, <4 x i32> addrspace(5)* %cast, align 16
+  store <4 x i32> %load, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @vector_bitcast_from_alloca_array(
+; OPT-NOT:   alloca
+; OPT:       store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(1)* %out, align 16
+
+; GCN-LABEL: {{^}}vector_bitcast_from_alloca_array:
+; GCN: v_mov_b32_e32 v0, 1
+; GCN: v_mov_b32_e32 v1, 2
+; GCN: v_mov_b32_e32 v2, 3
+; GCN: v_mov_b32_e32 v3, 4
+
+; GCN: ScratchSize: 0
+
+define amdgpu_kernel void @vector_bitcast_from_alloca_array(<4 x i32> addrspace(1)* %out)  {
+.entry:
+  %alloca = alloca [4 x float], align 16, addrspace(5)
+  %cast = bitcast [4 x float] addrspace(5)* %alloca to <4 x i32> addrspace(5)*
+  store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %cast
+  %load = load <4 x i32>, <4 x i32> addrspace(5)* %cast, align 16
+  store <4 x i32> %load, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @vector_bitcast_to_array_from_alloca_array(
+; OPT-NOT:   alloca
+; OPT:      %out.repack = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 0
+; OPT-NEXT: store i32 1, i32 addrspace(1)* %out.repack, align 4
+; OPT-NEXT: %out.repack1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 1
+; OPT-NEXT: store i32 2, i32 addrspace(1)* %out.repack1, align 4
+; OPT-NEXT: %out.repack2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 2
+; OPT-NEXT: store i32 3, i32 addrspace(1)* %out.repack2, align 4
+; OPT-NEXT: %out.repack3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 3
+; OPT-NEXT: store i32 4, i32 addrspace(1)* %out.repack3, align 4
+
+; GCN-LABEL: {{^}}vector_bitcast_to_array_from_alloca_array:
+; GCN: v_mov_b32_e32 v0, 1
+; GCN: v_mov_b32_e32 v1, 2
+; GCN: v_mov_b32_e32 v2, 3
+; GCN: v_mov_b32_e32 v3, 4
+
+; GCN: ScratchSize: 0
+
+define amdgpu_kernel void @vector_bitcast_to_array_from_alloca_array([4 x i32] addrspace(1)* %out)  {
+.entry:
+  %alloca = alloca [4 x float], align 16, addrspace(5)
+  %cast = bitcast [4 x float] addrspace(5)* %alloca to [4 x i32] addrspace(5)*
+  store [4 x i32] [i32 1, i32 2, i32 3, i32 4], [4 x i32] addrspace(5)* %cast
+  %load = load [4 x i32], [4 x i32] addrspace(5)* %cast, align 16
+  store [4 x i32] %load, [4 x i32] addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @vector_bitcast_to_struct_from_alloca_array(
+; OPT-NOT:   alloca
+; OPT:      %out.repack = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 0
+; OPT-NEXT: store i32 1, i32 addrspace(1)* %out.repack, align 4
+; OPT-NEXT: %out.repack1 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 1
+; OPT-NEXT: store i32 2, i32 addrspace(1)* %out.repack1, align 4
+; OPT-NEXT: %out.repack2 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 2
+; OPT-NEXT: store i32 3, i32 addrspace(1)* %out.repack2, align 4
+; OPT-NEXT: %out.repack3 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 3
+; OPT-NEXT: store i32 4, i32 addrspace(1)* %out.repack3, align 4
+
+; GCN-LABEL: {{^}}vector_bitcast_to_struct_from_alloca_array:
+; GCN: v_mov_b32_e32 v0, 1
+; GCN: v_mov_b32_e32 v1, 2
+; GCN: v_mov_b32_e32 v2, 3
+; GCN: v_mov_b32_e32 v3, 4
+
+; GCN: ScratchSize: 0
+
+%struct.v4 = type { i32, i32, i32, i32 }
+
+define amdgpu_kernel void @vector_bitcast_to_struct_from_alloca_array(%struct.v4 addrspace(1)* %out)  {
+.entry:
+  %alloca = alloca [4 x float], align 16, addrspace(5)
+  %cast = bitcast [4 x float] addrspace(5)* %alloca to %struct.v4 addrspace(5)*
+  store %struct.v4 { i32 1, i32 2, i32 3, i32 4 }, %struct.v4 addrspace(5)* %cast
+  %load = load %struct.v4, %struct.v4 addrspace(5)* %cast, align 16
+  store %struct.v4 %load, %struct.v4 addrspace(1)* %out
+  ret void
+}
+
 declare void @llvm.lifetime.start.p5i8(i64 immarg, i8 addrspace(5)* nocapture)
 
 declare void @llvm.lifetime.end.p5i8(i64 immarg, i8 addrspace(5)* nocapture)