[PATCH] D112492: [CUDA][HIP] Allow comdat for kernels

Wed Nov 10 08:36:16 PST 2021

yaxunl added a comment.

I did an experiment regarding the ICF issue and it seems not to affect kernel stub.

  #include "hip/hip_runtime.h"

  template<typename T>
  void bar(T x) { }

  template<typename T>
  __global__ void foo(T x) {}

  int main() {
    foo<<<1,1>>>(1);
    printf("%p\n", foo<int>);
    printf("%p\n", foo<float>);
    printf("%p\n", bar<int>);
    printf("%p\n", bar<float>);
  }

If I pass `-Wl,/opt:noicf`, I got

  00007FF622A01100
  00007FF622A01170
  00007FF622A01360
  00007FF622A01370

By default, I got

  00007FF693521100
  00007FF693521170
  00007FF693521360
  00007FF693521360

This indicates bar<int> and bar<float> are folded but kernel stubs are not folded.

I also tried `-Wl,/opt:icf=10`, and kernel stubs are still not folded.

For HIP, since the kernel stub passes a unique kernel symbol to the internal kernel launching API, you may think the kernel stubs are not folded because they are not identical.

To imitate the CUDA case, where the address of kernel stub function itself is passed to the internal kernel launching API, I used the original patch of this review, where the kernel stub function passes the address of itself to the internal kernel launching API, therefore in a sense, the kernel stubs are all the same. Still, the kernel stubs are not folded.

Looking at the assembly of the kernel stub function:

  ; foo<int>
  .seh_proc "??$foo at H@@YAXH at Z"
  # %bb.0:
          pushq   %rsi
          .seh_pushreg %rsi
          pushq   %rdi
          .seh_pushreg %rdi
          subq    $120, %rsp
          .seh_stackalloc 120
          .seh_endprologue
          movl    %ecx, 60(%rsp)
          leaq    60(%rsp), %rax
          movq    %rax, 64(%rsp)
          leaq    104(%rsp), %rsi
          leaq    88(%rsp), %rdi
          leaq    80(%rsp), %r8
          leaq    72(%rsp), %r9
          movq    %rsi, %rcx
          movq    %rdi, %rdx
          callq   __hipPopCallConfiguration
          movq    80(%rsp), %rax
          movq    72(%rsp), %rcx
          movq    %rcx, 40(%rsp)
          movq    %rax, 32(%rsp)
          leaq    "??$foo at H@@YAXH at Z"(%rip), %rcx
          leaq    64(%rsp), %r9
          movq    %rsi, %rdx
          movq    %rdi, %r8
          callq   hipLaunchKernel
          nop
          addq    $120, %rsp
          popq    %rdi
          popq    %rsi
          retq
          .seh_endproc

  ; foo<float>
  .seh_proc "??$foo at M@@YAXM at Z"
  # %bb.0:
          pushq   %rsi
          .seh_pushreg %rsi
          pushq   %rdi
          .seh_pushreg %rdi
          subq    $120, %rsp
          .seh_stackalloc 120
          .seh_endprologue
          movss   %xmm0, 60(%rsp)
          leaq    60(%rsp), %rax
          movq    %rax, 64(%rsp)
          leaq    104(%rsp), %rsi
          leaq    88(%rsp), %rdi
          leaq    80(%rsp), %r8
          leaq    72(%rsp), %r9
          movq    %rsi, %rcx
          movq    %rdi, %rdx
          callq   __hipPopCallConfiguration
          movq    80(%rsp), %rax
          movq    72(%rsp), %rcx
          movq    %rcx, 40(%rsp)
          movq    %rax, 32(%rsp)
          leaq    "??$foo at M@@YAXM at Z"(%rip), %rcx
          leaq    64(%rsp), %r9
          movq    %rsi, %rdx
          movq    %rdi, %r8
          callq   hipLaunchKernel
          nop
          addq    $120, %rsp
          popq    %rdi
          popq    %rsi
          retq
          .seh_endproc

I think they are not folded because link.exe is smart enough to treat them as not identical comdat functions. I think we may stop worrying about the ICF foading kernel stubs.

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D112492/new/

https://reviews.llvm.org/D112492