<html>

    <head>

      <base href="https://llvm.org/bugs/" />

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW --- - Very high overhead when calling sin() with int64_t"

   href="https://llvm.org/bugs/show_bug.cgi?id=25277">25277</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>Very high overhead when calling sin() with int64_t

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>clang

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>3.7

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>normal

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>LLVM Codegen

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedclangbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>yyc1992@gmail.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr>

        <tr>

          <th>Classification</th>

          <td>Unclassified

          </td>

        </tr></table>

      <p>

        <div>

        <pre>In the following code, when compiling with `clang++ -O2` or `-Ofast`, test_i64

takes almost twice as long as test_f64 with similar arguments. (The `extern

"C"` etc. are used to benchmark it in Julia. The inline asm is to make sure the

compiler doesn't optimize the whole thing away in `-Ofast` and doesn't seem to

have a performance impact otherwise.)

```

#include &lt;math.h&gt;

#include &lt;stddef.h&gt;

#include &lt;stdint.h&gt;

double __attribute__((noinline))

g(int64_t a)

{

    asm volatile("" ::: "memory");

    return sin(a);

}

double __attribute__((noinline))

g(double a)

{

    asm volatile("" ::: "memory");

    return sin(a);

}

template&lt;typename T&gt;

static inline void

f(T a, size_t n)

{

    for (size_t i = 0;i < n;i++) {

        g(a);

    }

}

extern "C" void

test_i64(int64_t a, size_t n)

{

    f(a, n);

}

extern "C" void

test_f64(double a, size_t n)

{

    f(a, n);

}

```

On my machine, each loop takes ~ 9ns for double and ~ 14ns for int64_t. Gcc

doesn't seem to have this issue.

Gcc asm,

```

00000000000007a0 &lt;_Z1gl&gt;:

 7a0:   66 0f ef c0             pxor   %xmm0,%xmm0

 7a4:   f2 48 0f 2a c7          cvtsi2sd %rdi,%xmm0

 7a9:   e9 e2 fe ff ff          jmpq   690 &lt;sin@plt&gt;

 7ae:   66 90                   xchg   %ax,%ax

00000000000007b0 &lt;_Z1gd&gt;:

 7b0:   e9 db fe ff ff          jmpq   690 &lt;sin@plt&gt;

 7b5:   90                      nop

 7b6:   66 2e 0f 1f 84 00 00    nopw   %cs:0x0(%rax,%rax,1)

 7bd:   00 00 00 

00000000000007c0 &lt;test_i64&gt;:

 7c0:   48 85 f6                test   %rsi,%rsi

 7c3:   74 28                   je     7ed &lt;test_i64+0x2d&gt;

 7c5:   41 54                   push   %r12

 7c7:   49 89 fc                mov    %rdi,%r12

 7ca:   55                      push   %rbp

 7cb:   48 89 f5                mov    %rsi,%rbp

 7ce:   53                      push   %rbx

 7cf:   31 db                   xor    %ebx,%ebx

 7d1:   0f 1f 80 00 00 00 00    nopl   0x0(%rax)

 7d8:   4c 89 e7                mov    %r12,%rdi

 7db:   48 83 c3 01             add    $0x1,%rbx

 7df:   e8 9c fe ff ff          callq  680 &lt;_Z1gl@plt&gt;

 7e4:   48 39 dd                cmp    %rbx,%rbp

 7e7:   75 ef                   jne    7d8 &lt;test_i64+0x18&gt;

 7e9:   5b                      pop    %rbx

 7ea:   5d                      pop    %rbp

 7eb:   41 5c                   pop    %r12

 7ed:   f3 c3                   repz retq 

 7ef:   90                      nop

00000000000007f0 &lt;test_f64&gt;:

 7f0:   55                      push   %rbp

 7f1:   53                      push   %rbx

 7f2:   48 83 ec 18             sub    $0x18,%rsp

 7f6:   48 85 ff                test   %rdi,%rdi

 7f9:   f2 0f 11 44 24 08       movsd  %xmm0,0x8(%rsp)

 7ff:   74 23                   je     824 &lt;test_f64+0x34&gt;

 801:   48 89 fd                mov    %rdi,%rbp

 804:   31 db                   xor    %ebx,%ebx

 806:   66 2e 0f 1f 84 00 00    nopw   %cs:0x0(%rax,%rax,1)

 80d:   00 00 00 

 810:   f2 0f 10 44 24 08       movsd  0x8(%rsp),%xmm0

 816:   48 83 c3 01             add    $0x1,%rbx

 81a:   e8 51 fe ff ff          callq  670 &lt;_Z1gd@plt&gt;

 81f:   48 39 dd                cmp    %rbx,%rbp

 822:   75 ec                   jne    810 &lt;test_f64+0x20&gt;

 824:   48 83 c4 18             add    $0x18,%rsp

 828:   5b                      pop    %rbx

 829:   5d                      pop    %rbp

 82a:   c3                      retq   

```

clang asm

```

0000000000000750 &lt;_Z1gl&gt;:

 750:   f2 48 0f 2a c7          cvtsi2sd %rdi,%xmm0

 755:   e9 e6 fe ff ff          jmpq   640 &lt;sin@plt&gt;

 75a:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)

0000000000000760 &lt;_Z1gd&gt;:

 760:   e9 db fe ff ff          jmpq   640 &lt;sin@plt&gt;

 765:   66 66 2e 0f 1f 84 00    data16 nopw %cs:0x0(%rax,%rax,1)

 76c:   00 00 00 00 

0000000000000770 &lt;test_i64&gt;:

 770:   41 56                   push   %r14

 772:   53                      push   %rbx

 773:   50                      push   %rax

 774:   48 89 f3                mov    %rsi,%rbx

 777:   49 89 fe                mov    %rdi,%r14

 77a:   48 85 db                test   %rbx,%rbx

 77d:   74 0e                   je     78d &lt;test_i64+0x1d&gt;

 77f:   90                      nop

 780:   4c 89 f7                mov    %r14,%rdi

 783:   e8 a8 fe ff ff          callq  630 &lt;_Z1gl@plt&gt;

 788:   48 ff cb                dec    %rbx

 78b:   75 f3                   jne    780 &lt;test_i64+0x10&gt;

 78d:   48 83 c4 08             add    $0x8,%rsp

 791:   5b                      pop    %rbx

 792:   41 5e                   pop    %r14

 794:   c3                      retq   

 795:   66 66 2e 0f 1f 84 00    data16 nopw %cs:0x0(%rax,%rax,1)

 79c:   00 00 00 00 

00000000000007a0 &lt;test_f64&gt;:

 7a0:   53                      push   %rbx

 7a1:   48 83 ec 10             sub    $0x10,%rsp

 7a5:   48 89 fb                mov    %rdi,%rbx

 7a8:   f2 0f 11 44 24 08       movsd  %xmm0,0x8(%rsp)

 7ae:   48 85 db                test   %rbx,%rbx

 7b1:   74 1d                   je     7d0 &lt;test_f64+0x30&gt;

 7b3:   66 66 66 66 2e 0f 1f    data16 data16 data16 nopw %cs:0x0(%rax,%rax,1)

 7ba:   84 00 00 00 00 00 

 7c0:   f2 0f 10 44 24 08       movsd  0x8(%rsp),%xmm0

 7c6:   e8 55 fe ff ff          callq  620 &lt;_Z1gd@plt&gt;

 7cb:   48 ff cb                dec    %rbx

 7ce:   75 f0                   jne    7c0 &lt;test_f64+0x20&gt;

 7d0:   48 83 c4 10             add    $0x10,%rsp

 7d4:   5b                      pop    %rbx

 7d5:   c3                      retq   

```

One noticeable difference is that gcc clears the xmm0 register before doing the

i64-&gt;f64 conversion (not sure if that's the important difference).

(The same happens in a JIT environment as well, but it seems to be x86 (or x64)

specific...)</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>