[PATCH] D116879: [llvm] Allow auto-vectorization of sincos() using libmvec

Mon Jan 10 12:37:02 PST 2022

tim.schmielau added a comment.

I have beefed up my testcase to demonstrate why I had to choose the `_ZGVdN4vvv_sincos()` variant for correctness, even though `_ZGVdN4vl8l8_sincos()` would be preferable from a performance perspective:
We have no control over which pointers the user passes in on different loop iterations.

sincosarr.cpp:

  #include <math.h>

  // For each i in [0, size), computes the sine and cosine of
  // phases[indices[i]] and stores them to sines[indices[i]] and
  // cosines[indices[i]]. The output addresses are derived from the
  // caller-supplied index table, so consecutive iterations need not write to
  // contiguous or evenly strided locations.
  void sincos_arr(double* sines, double* cosines, double* phases, int* indices, int size) {
  // Requests an unroll count of 1, i.e. asks the compiler not to unroll the loop.
  #pragma unroll 1
      for (int i=0; i<size; i++) {
          // Both result pointers are offset by the same gathered index.
          sincos(phases[indices[i]], sines+indices[i], cosines+indices[i]);
      }
  }

main.cpp:

  #include <stdio.h>
  #include <math.h>

  void sincos_arr(double* sins, double* coses, double* phases, int* indices, int size);

  int main()
  {
      const int count = 32;
      int indices[count];
      double phases[count], sins[count], coses[count];

      // Phase k is simply k. The index table is a Fibonacci sequence reduced
      // modulo count, so it revisits some slots and never reaches others.
      for (int k = 0; k < count; ++k) {
          phases[k] = k;
          if (k < 2)
              indices[k] = 1;
          else
              indices[k] = (indices[k-2] + indices[k-1]) % count;
      }

      sincos_arr(sins, coses, phases, indices, count);

      // Print the scalar sin()/cos() results next to the values stored by
      // sincos_arr() for every slot named in the index table.
      for (int k = 0; k < count; ++k) {
          const int slot = indices[k];
          const double phase = phases[slot];
          printf("sin(%2d) == %10f == %10f | cos(%2d) == %10f == %10f\n",
                 slot, sin(phase), sins[slot],
                 slot, cos(phase), coses[slot]);
      }
      return 0;
  }

Inner loop x86 assembly from `clang++ -march=core-avx2 -fveclib=libmvec -O2 -S sincosarr.cpp`:

      .p2align    4, 0x90
  .LBB0_4:                                # =>This Inner Loop Header: Depth=1
      vpmovsxdq   (%r14,%r12), %ymm1
      vpextrq $1, %xmm1, %rax
      vextracti128    $1, %ymm1, %xmm0
      vpextrq $1, %xmm0, %rcx
      vmovq   %xmm0, %rdx
      vmovsd  (%rbx,%rdx,8), %xmm0            # xmm0 = mem[0],zero
      vmovhps (%rbx,%rcx,8), %xmm0, %xmm0     # xmm0 = xmm0[0,1],mem[0,1]
      vmovq   %xmm1, %rcx
      vmovsd  (%rbx,%rcx,8), %xmm2            # xmm2 = mem[0],zero
      vmovhps (%rbx,%rax,8), %xmm2, %xmm2     # xmm2 = xmm2[0,1],mem[0,1]
      vinsertf128 $1, %xmm0, %ymm2, %ymm0
      vpsllq  $3, %ymm1, %ymm2
      vpaddq  48(%rsp), %ymm2, %ymm1          # 32-byte Folded Reload
      vpaddq  16(%rsp), %ymm2, %ymm2          # 32-byte Folded Reload
      callq   _ZGVdN4vvv_sincos
      addq    $16, %r12
      cmpq    %r12, %r15
      jne .LBB0_4

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D116879/new/

https://reviews.llvm.org/D116879