[PATCH] D103225: [AMDGPU] Replace non-kernel function uses of LDS globals by pointers.

Wed Jun 9 08:20:36 PDT 2021

hsmhsm marked an inline comment as done.
hsmhsm added inline comments.

================
Comment at: llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-selected_functions.ll:41
+; CHECK:   %0 = load i16, i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2
+; CHECK:   %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
+; CHECK:   %2 = bitcast i8 addrspace(3)* %1 to [2 x i32] addrspace(3)*
----------------
hsmhsm wrote:
> rampitec wrote:
> > We had problems with null before. Instruction selection will happily allocate it overlapping with first non-null variable. That's how AMDGPULowerModuleLDSPass turned to use struct instead of null. Did you check we are not introducing this again in the final ISA?
> The null represent the base address of LDS memory.  My understanding is that, earlier, instruction selection had gone for toss because of the bug where we were not actually allocating memory at all for llvm.amdgcn.module.lds, but we were accessing it at address 0. And, instruction selection was allocating address 0 for some other lds (since this address was not occupied), so there was an issue. 
> 
> But, nevertheless, let me look into ISA and see if we are fine or any issue because of null.
Further, I looked into ISA, it looks good to me. For example consider below input code.

```
@lds.1 = internal addrspace(3) global i32 undef, align 4
@lds.2 = internal addrspace(3) global i64 undef, align 4

define internal void @func_uses_lds() {
entry:
  store i32 7, i32 addrspace(3)* @lds.1
  store i64 31, i64 addrspace(3)* @lds.2
  ret void
}

define protected amdgpu_kernel void @kernel_reaches_lds() {
entry:
  call void @func_uses_lds()
  ret void
}
```

output from pointer-replacement is: 

```
@lds.1 = internal addrspace(3) global i32 undef, align 4
@lds.2 = internal addrspace(3) global i64 undef, align 4
@lds.1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
@lds.2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2

define internal void @func_uses_lds() {
entry:
  %0 = load i16, i16 addrspace(3)* @lds.2.ptr, align 2
  %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
  %2 = bitcast i8 addrspace(3)* %1 to i64 addrspace(3)*
  %3 = load i16, i16 addrspace(3)* @lds.1.ptr, align 2
  %4 = getelementptr i8, i8 addrspace(3)* null, i16 %3
  %5 = bitcast i8 addrspace(3)* %4 to i32 addrspace(3)*
  store i32 7, i32 addrspace(3)* %5, align 4
  store i64 31, i64 addrspace(3)* %2, align 4
  ret void
}

define protected amdgpu_kernel void @kernel_reaches_lds() {
entry:
  %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %1 = icmp eq i32 %0, 0
  br i1 %1, label %2, label %3

2:                                                ; preds = %entry
  store i16 ptrtoint (i64 addrspace(3)* @lds.2 to i16), i16 addrspace(3)* @lds.2.ptr, align 2
  store i16 ptrtoint (i32 addrspace(3)* @lds.1 to i16), i16 addrspace(3)* @lds.1.ptr, align 2
  br label %3

3:                                                ; preds = %entry, %2
  call void @llvm.amdgcn.wave.barrier()
  call void @func_uses_lds()
  ret void
}
```

output from module (kernel) lds lowering is:

```
%llvm.amdgcn.module.lds.t = type { i16, i16 }
%llvm.amdgcn.kernel.kernel_reaches_lds.lds.t = type { i64, i32 }

@llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 2
@llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
@llvm.amdgcn.kernel.kernel_reaches_lds.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kernel_reaches_lds.lds.t undef, align 8

define internal void @func_uses_lds() {
entry:
  %0 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2
  %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
  %2 = bitcast i8 addrspace(3)* %1 to i64 addrspace(3)*
  %3 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 2
  %4 = getelementptr i8, i8 addrspace(3)* null, i16 %3
  %5 = bitcast i8 addrspace(3)* %4 to i32 addrspace(3)*
  store i32 7, i32 addrspace(3)* %5, align 4
  store i64 31, i64 addrspace(3)* %2, align 4
  ret void
}

define protected amdgpu_kernel void @kernel_reaches_lds() {
entry:
  call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
  %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %1 = icmp eq i32 %0, 0
  br i1 %1, label %2, label %5

2:                                                ; preds = %entry
  %3 = ptrtoint i64 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.kernel_reaches_lds.lds.t, %llvm.amdgcn.kernel.kernel_reaches_lds.lds.t addrspace(3)* @llvm.amdgcn.kernel.kernel_reaches_lds.lds, i32 0, i32 0) to i16
  store i16 %3, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2
  %4 = ptrtoint i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.kernel_reaches_lds.lds.t, %llvm.amdgcn.kernel.kernel_reaches_lds.lds.t addrspace(3)* @llvm.amdgcn.kernel.kernel_reaches_lds.lds, i32 0, i32 1) to i16
  store i16 %4, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 2
  br label %5

5:                                                ; preds = %entry, %2
  call void @llvm.amdgcn.wave.barrier()
  call void @func_uses_lds()
  ret void
}
```

what it fundamentally means is: 

1.  address 0 holds the address of @lds.1, we need to load address of @lds.1 from address 0, and then we need to store value 7 to @lds.1.
2.  address 0 + 2 (offset 2) holds the address of @lds.2, we need to load address of @lds.2 from address 0 + 2, and then we need to store value 31 to @lds.2. That is what happening in the below ISA which is produced for above input ir.

```
func_uses_lds:                          ; @func_uses_lds
; %bb.0:                                ; %entry
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        v_mov_b32_e32 v1, 0
        ds_read_i16 v2, v1
        ds_read_i16 v3, v1 offset:2
        v_mov_b32_e32 v4, 7
        v_mov_b32_e32 v0, 31
        s_waitcnt lgkmcnt(1)
        ds_write_b32 v2, v4
        s_waitcnt lgkmcnt(1)
        ds_write_b64 v3, v[0:1]
        s_waitcnt lgkmcnt(0)
        s_setpc_b64 s[30:31]
```

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D103225/new/

https://reviews.llvm.org/D103225