[PATCH] D89980: [hip] Remove kernel argument coercion.

Tue Nov 3 08:49:31 PST 2020

hliao added a comment.

The code could be simply converted to a kernel one following the same pattern:

  struct S {
          float *p;
          float a[64];
          int n;
  };

  __global__ void kernel(S s) {
          *s.p = s.a[s.n];
  }

Here's the LLVM IR after frontend

  define protected amdgpu_kernel void @_Z6kernel1S(%struct.S.coerce %0) #2 {
    %2 = alloca %struct.S, align 8, addrspace(5)
    %3 = addrspacecast %struct.S addrspace(5)* %2 to %struct.S*
    %4 = bitcast %struct.S* %3 to %struct.S.coerce*
    %5 = getelementptr inbounds %struct.S.coerce, %struct.S.coerce* %4, i32 0, i32 0
    %6 = extractvalue %struct.S.coerce %0, 0
    store float addrspace(1)* %6, float addrspace(1)** %5, align 8
    %7 = getelementptr inbounds %struct.S.coerce, %struct.S.coerce* %4, i32 0, i32 1
    %8 = extractvalue %struct.S.coerce %0, 1
    store [64 x float] %8, [64 x float]* %7, align 8
    %9 = getelementptr inbounds %struct.S.coerce, %struct.S.coerce* %4, i32 0, i32 2
    %10 = extractvalue %struct.S.coerce %0, 2
    store i32 %10, i32* %9, align 8
    %11 = getelementptr inbounds %struct.S, %struct.S* %3, i32 0, i32 1
    %12 = getelementptr inbounds %struct.S, %struct.S* %3, i32 0, i32 2
    %13 = load i32, i32* %12, align 8, !tbaa !12
    %14 = sext i32 %13 to i64
    %15 = getelementptr inbounds [64 x float], [64 x float]* %11, i64 0, i64 %14
    %16 = load float, float* %15, align 4, !tbaa !14
    %17 = getelementptr inbounds %struct.S, %struct.S* %3, i32 0, i32 0
    %18 = load float*, float** %17, align 8, !tbaa !16
    store float %16, float* %18, align 4, !tbaa !14
    ret void
  }

and here's the optimized IR before codegen

  target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7"
  target triple = "amdgcn-amd-amdhsa"

  %struct.S.coerce = type { float addrspace(1)*, [64 x float], i32 }
  %struct.S = type { float*, [64 x float], i32 }

  ; Function Attrs: nofree norecurse nounwind writeonly
  define protected amdgpu_kernel void @_Z6kernel1S(%struct.S.coerce %0) local_unnamed_addr #0 {
    %2 = alloca %struct.S, align 8, addrspace(5)
    %3 = bitcast %struct.S addrspace(5)* %2 to float addrspace(1)* addrspace(5)*
    %4 = extractvalue %struct.S.coerce %0, 0
    store float addrspace(1)* %4, float addrspace(1)* addrspace(5)* %3, align 8
    %5 = extractvalue %struct.S.coerce %0, 1
    %6 = extractvalue [64 x float] %5, 0
    %7 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 0
    store float %6, float addrspace(5)* %7, align 8
    %8 = extractvalue [64 x float] %5, 1
    %9 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 1
    store float %8, float addrspace(5)* %9, align 4
    %10 = extractvalue [64 x float] %5, 2
    %11 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 2
    store float %10, float addrspace(5)* %11, align 8
    %12 = extractvalue [64 x float] %5, 3
    %13 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 3
    store float %12, float addrspace(5)* %13, align 4
    %14 = extractvalue [64 x float] %5, 4
    %15 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 4
    store float %14, float addrspace(5)* %15, align 8
    %16 = extractvalue [64 x float] %5, 5
    %17 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 5
    store float %16, float addrspace(5)* %17, align 4
    %18 = extractvalue [64 x float] %5, 6
    %19 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 6
    store float %18, float addrspace(5)* %19, align 8
    %20 = extractvalue [64 x float] %5, 7
    %21 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 7
    store float %20, float addrspace(5)* %21, align 4
    %22 = extractvalue [64 x float] %5, 8
    %23 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 8
    store float %22, float addrspace(5)* %23, align 8
    %24 = extractvalue [64 x float] %5, 9
    %25 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 9
    store float %24, float addrspace(5)* %25, align 4
    %26 = extractvalue [64 x float] %5, 10
    %27 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 10
    store float %26, float addrspace(5)* %27, align 8
    %28 = extractvalue [64 x float] %5, 11
    %29 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 11
    store float %28, float addrspace(5)* %29, align 4
    %30 = extractvalue [64 x float] %5, 12
    %31 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 12
    store float %30, float addrspace(5)* %31, align 8
    %32 = extractvalue [64 x float] %5, 13
    %33 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 13
    store float %32, float addrspace(5)* %33, align 4
    %34 = extractvalue [64 x float] %5, 14
    %35 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 14
    store float %34, float addrspace(5)* %35, align 8
    %36 = extractvalue [64 x float] %5, 15
    %37 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 15
    store float %36, float addrspace(5)* %37, align 4
    %38 = extractvalue [64 x float] %5, 16
    %39 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 16
    store float %38, float addrspace(5)* %39, align 8
    %40 = extractvalue [64 x float] %5, 17
    %41 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 17
    store float %40, float addrspace(5)* %41, align 4
    %42 = extractvalue [64 x float] %5, 18
    %43 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 18
    store float %42, float addrspace(5)* %43, align 8
    %44 = extractvalue [64 x float] %5, 19
    %45 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 19
    store float %44, float addrspace(5)* %45, align 4
    %46 = extractvalue [64 x float] %5, 20
    %47 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 20
    store float %46, float addrspace(5)* %47, align 8
    %48 = extractvalue [64 x float] %5, 21
    %49 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 21
    store float %48, float addrspace(5)* %49, align 4
    %50 = extractvalue [64 x float] %5, 22
    %51 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 22
    store float %50, float addrspace(5)* %51, align 8
    %52 = extractvalue [64 x float] %5, 23
    %53 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 23
    store float %52, float addrspace(5)* %53, align 4
    %54 = extractvalue [64 x float] %5, 24
    %55 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 24
    store float %54, float addrspace(5)* %55, align 8
    %56 = extractvalue [64 x float] %5, 25
    %57 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 25
    store float %56, float addrspace(5)* %57, align 4
    %58 = extractvalue [64 x float] %5, 26
    %59 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 26
    store float %58, float addrspace(5)* %59, align 8
    %60 = extractvalue [64 x float] %5, 27
    %61 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 27
    store float %60, float addrspace(5)* %61, align 4
    %62 = extractvalue [64 x float] %5, 28
    %63 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 28
    store float %62, float addrspace(5)* %63, align 8
    %64 = extractvalue [64 x float] %5, 29
    %65 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 29
    store float %64, float addrspace(5)* %65, align 4
    %66 = extractvalue [64 x float] %5, 30
    %67 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 30
    store float %66, float addrspace(5)* %67, align 8
    %68 = extractvalue [64 x float] %5, 31
    %69 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 31
    store float %68, float addrspace(5)* %69, align 4
    %70 = extractvalue [64 x float] %5, 32
    %71 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 32
    store float %70, float addrspace(5)* %71, align 8
    %72 = extractvalue [64 x float] %5, 33
    %73 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 33
    store float %72, float addrspace(5)* %73, align 4
    %74 = extractvalue [64 x float] %5, 34
    %75 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 34
    store float %74, float addrspace(5)* %75, align 8
    %76 = extractvalue [64 x float] %5, 35
    %77 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 35
    store float %76, float addrspace(5)* %77, align 4
    %78 = extractvalue [64 x float] %5, 36
    %79 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 36
    store float %78, float addrspace(5)* %79, align 8
    %80 = extractvalue [64 x float] %5, 37
    %81 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 37
    store float %80, float addrspace(5)* %81, align 4
    %82 = extractvalue [64 x float] %5, 38
    %83 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 38
    store float %82, float addrspace(5)* %83, align 8
    %84 = extractvalue [64 x float] %5, 39
    %85 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 39
    store float %84, float addrspace(5)* %85, align 4
    %86 = extractvalue [64 x float] %5, 40
    %87 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 40
    store float %86, float addrspace(5)* %87, align 8
    %88 = extractvalue [64 x float] %5, 41
    %89 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 41
    store float %88, float addrspace(5)* %89, align 4
    %90 = extractvalue [64 x float] %5, 42
    %91 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 42
    store float %90, float addrspace(5)* %91, align 8
    %92 = extractvalue [64 x float] %5, 43
    %93 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 43
    store float %92, float addrspace(5)* %93, align 4
    %94 = extractvalue [64 x float] %5, 44
    %95 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 44
    store float %94, float addrspace(5)* %95, align 8
    %96 = extractvalue [64 x float] %5, 45
    %97 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 45
    store float %96, float addrspace(5)* %97, align 4
    %98 = extractvalue [64 x float] %5, 46
    %99 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 46
    store float %98, float addrspace(5)* %99, align 8
    %100 = extractvalue [64 x float] %5, 47
    %101 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 47
    store float %100, float addrspace(5)* %101, align 4
    %102 = extractvalue [64 x float] %5, 48
    %103 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 48
    store float %102, float addrspace(5)* %103, align 8
    %104 = extractvalue [64 x float] %5, 49
    %105 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 49
    store float %104, float addrspace(5)* %105, align 4
    %106 = extractvalue [64 x float] %5, 50
    %107 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 50
    store float %106, float addrspace(5)* %107, align 8
    %108 = extractvalue [64 x float] %5, 51
    %109 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 51
    store float %108, float addrspace(5)* %109, align 4
    %110 = extractvalue [64 x float] %5, 52
    %111 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 52
    store float %110, float addrspace(5)* %111, align 8
    %112 = extractvalue [64 x float] %5, 53
    %113 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 53
    store float %112, float addrspace(5)* %113, align 4
    %114 = extractvalue [64 x float] %5, 54
    %115 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 54
    store float %114, float addrspace(5)* %115, align 8
    %116 = extractvalue [64 x float] %5, 55
    %117 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 55
    store float %116, float addrspace(5)* %117, align 4
    %118 = extractvalue [64 x float] %5, 56
    %119 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 56
    store float %118, float addrspace(5)* %119, align 8
    %120 = extractvalue [64 x float] %5, 57
    %121 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 57
    store float %120, float addrspace(5)* %121, align 4
    %122 = extractvalue [64 x float] %5, 58
    %123 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 58
    store float %122, float addrspace(5)* %123, align 8
    %124 = extractvalue [64 x float] %5, 59
    %125 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 59
    store float %124, float addrspace(5)* %125, align 4
    %126 = extractvalue [64 x float] %5, 60
    %127 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 60
    store float %126, float addrspace(5)* %127, align 8
    %128 = extractvalue [64 x float] %5, 61
    %129 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 61
    store float %128, float addrspace(5)* %129, align 4
    %130 = extractvalue [64 x float] %5, 62
    %131 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 62
    store float %130, float addrspace(5)* %131, align 8
    %132 = extractvalue [64 x float] %5, 63
    %133 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 63
    store float %132, float addrspace(5)* %133, align 4
    %134 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 2
    %135 = extractvalue %struct.S.coerce %0, 2
    store i32 %135, i32 addrspace(5)* %134, align 8
    %136 = getelementptr inbounds %struct.S, %struct.S addrspace(5)* %2, i32 0, i32 1, i32 %135
    %137 = bitcast float addrspace(5)* %136 to i32 addrspace(5)*
    %138 = load i32, i32 addrspace(5)* %137, align 4, !tbaa !4
    %139 = bitcast %struct.S addrspace(5)* %2 to i32* addrspace(5)*
    %140 = load i32*, i32* addrspace(5)* %139, align 8, !tbaa !8
    store i32 %138, i32* %140, align 4, !tbaa !4
    ret void
  }

and here's the optimized after this patch, the `alloca` is eliminated.

  target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7"
  target triple = "amdgcn-amd-amdhsa"

  %struct.S = type { float*, [64 x float], i32 }

  ; Function Attrs: nofree norecurse nounwind writeonly
  define protected amdgpu_kernel void @_Z6kernel1S(%struct.S addrspace(4)* nocapture readonly byref(%struct.S) align 8 %0) local_unnamed_addr #0 {
    %2 = getelementptr %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 2
    %3 = load i32, i32 addrspace(4)* %2, align 8, !tbaa !5
    %4 = sext i32 %3 to i64
    %5 = getelementptr %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 1, i64 %4
    %6 = load float, float addrspace(4)* %5, align 4, !tbaa !11
    %7 = getelementptr %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 0
    %8 = load float*, float* addrspace(4)* %7, align 8, !tbaa !13
    store float %6, float* %8, align 4, !tbaa !11
    ret void
  }

In D89980#2371290 <https://reviews.llvm.org/D89980#2371290>, @arsenm wrote:

> In D89980#2371270 <https://reviews.llvm.org/D89980#2371270>, @hliao wrote:
>
>> In D89980#2368506 <https://reviews.llvm.org/D89980#2368506>, @arsenm wrote:
>>
>>> I think this is a dead end approach. I don't see the connection to the original problem you are trying to solve. Can you send me an IR testcase that this is supposed to help?
>>
>> That's probably commonly known. If we pass an aggregate parameter directly by value and dynamically index it late, that `alloca` cannot be promoted as that aggregate value in LLVM IR cannot be dynamically indexed. For example,
>>
>>   struct S {
>>      int a[100];
>>      int n;
>>   };
>>   
>>   int foo(S s) {
>>     return s.a[s.n];
>>   }
>
> This example is not a kernel
>
>> If the underlying ABI chooses to pass `s` directly by value, we have the following pseudo IR.
>>
>>   %s = alloca S
>>   ; store `s` value into %s as the parameter is treated as a local variable by filling its initial value from LLVM IR parameter.
>>   ...
>>   ; regular parameter access through %s with dynamic indices
>>
>> that `store` from the parameter from LLVM IR is an aggregate value store. Later, when %s is to be promoted, as it's once dynamically indexed, we cannot promote it as dynamic index on aggregate values is not representable in LLVM IR.
>>
>> In contrast, if a parameter is passed by value indirectly, that `store` is replaced with a `memcpy`. It's straightforward to promote '%s' as they are all memory operands of the same layout.
>>
>> If you need detailed IR, I may post here for your reference.
>
> I need an actual source and IR example.  I think you are describing the missing promotion of pointers inside byref arguments. We need better promotion here, not eliminate it. It needs to cast the byref pointer, or cast the pointers inside the struct when accessed

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D89980/new/

https://reviews.llvm.org/D89980