[llvm-bugs] [Bug 38217] New: Clang/LLVM optimizes division and modulo worse than MSVC, part 2

Wed Jul 18 19:58:10 PDT 2018

https://bugs.llvm.org/show_bug.cgi?id=38217

            Bug ID: 38217
           Summary: Clang/LLVM optimizes division and modulo worse than
                    MSVC, part 2
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Windows NT
            Status: NEW
          Severity: normal
          Priority: P
         Component: Scalar Optimizations
          Assignee: unassignedbugs at nondot.org
          Reporter: sfinae at hotmail.com
                CC: llvm-bugs at lists.llvm.org

Created attachment 20570
  --> https://bugs.llvm.org/attachment.cgi?id=20570&action=edit
Test case

This appears to be a different bug than
https://bugs.llvm.org/show_bug.cgi?id=37983 "Clang/LLVM optimizes division and
modulo worse than MSVC" (which is probably a duplicate of
https://bugs.llvm.org/show_bug.cgi?id=23106 "Division followed by modulo
generates longer machine code than vice versa") because it involves modulo
followed by division.

This affects the Ryu algorithm for printing floating-point numbers
(https://github.com/ulfjack/ryu ) and therefore affects C++17 floating-point
std::to_chars().

I observe that MSVC's codegen is unaffected by WORKAROUND, while Clang/LLVM
generates less assembly code when WORKAROUND is defined (and that shorter code
is faster when profiled in the real algorithm).

Here's a Godbolt link demonstrating the codegen difference (this isn't
Windows-specific): https://godbolt.org/g/uX1AD8

C:\Temp\TESTING_X64>cl
Microsoft (R) C/C++ Optimizing Compiler Version 19.16.26504 for x64
Copyright (C) Microsoft Corporation.  All rights reserved.

usage: cl [ option... ] filename... [ /link linkoption... ]

C:\Temp\TESTING_X64>clang-cl -m64 -v
clang version 6.0.0 (tags/RELEASE_600/final)
Target: x86_64-pc-windows-msvc
Thread model: posix
InstalledDir: S:\msvc\src\vctools\NonShip\ClangLLVM\bin

C:\Temp\TESTING_X64>type d2s.cpp
#include <stdint.h>
#include <string.h>

static const char DIGIT_TABLE[] = // "00".."99" concatenated: the two decimal digits of k start at offset 2 * k
"0001020304050607080910111213141516171819"
"2021222324252627282930313233343536373839"
"4041424344454647484950515253545556575859"
"6061626364656667686970717273747576777879"
"8081828384858687888990919293949596979899";

void d2s_buffered(uint64_t output, char * result) { // writes decimal digits of `output`, four per pass, backwards from `result`; digits left once output < 10000 are not emitted here
    uint32_t i = 0; // number of digit characters written so far

    while (output >= 10000) { // each pass peels off the low four decimal digits; c equals output % 10000 under either #if branch
#ifdef WORKAROUND
        const uint32_t c = (uint32_t) (output - 10000 * (output / 10000));
#else
        const uint32_t c = (uint32_t) (output % 10000);
#endif

        output /= 10000;
        const uint32_t c0 = (c % 100) << 1; // byte offset into DIGIT_TABLE for the low two digits of c
        const uint32_t c1 = (c / 100) << 1; // byte offset into DIGIT_TABLE for the high two digits of c
        memcpy(result - i - 1, DIGIT_TABLE + c0, 2); // low pair lands in result[-i-1] and result[-i]
        memcpy(result - i - 3, DIGIT_TABLE + c1, 2); // high pair lands in result[-i-3] and result[-i-2]
        i += 4; // four characters were emitted this pass
    }
}

C:\Temp\TESTING_X64>cl /EHsc /nologo /W4 /MT /O2 /c d2s.cpp /FAsc /Famsvc_workaround.cod /DWORKAROUND
d2s.cpp

C:\Temp\TESTING_X64>cl /EHsc /nologo /W4 /MT /O2 /c d2s.cpp /FAsc /Famsvc_modulo.cod
d2s.cpp

C:\Temp\TESTING_X64>git diff msvc_workaround.cod msvc_modulo.cod

diff --git a/msvc_workaround.cod b/msvc_modulo.cod
index 1be1419..aff234c 100644
--- a/msvc_workaround.cod
+++ b/msvc_modulo.cod
@@ -86,11 +86,11 @@ $LL2 at d2s_buffer:

 ; 15   : #ifdef WORKAROUND
 ; 16   :         const uint32_t c = (uint32_t) (output - 10000 * (output / 10000));
+; 17   : #else^M
+; 18   :         const uint32_t c = (uint32_t) (output % 10000);^M

   00030        48 8b c7         mov     rax, rdi

-; 17   : #else
-; 18   :         const uint32_t c = (uint32_t) (output % 10000);
 ; 19   : #endif
 ; 20   :
 ; 21   :         output /= 10000;

C:\Temp\TESTING_X64>clang-cl -m64 /EHsc /nologo /W4 /MT /O2 /c d2s.cpp /FA /Faclang_workaround.asm /DWORKAROUND

C:\Temp\TESTING_X64>clang-cl -m64 /EHsc /nologo /W4 /MT /O2 /c d2s.cpp /FA /Faclang_modulo.asm

C:\Temp\TESTING_X64>git diff clang_workaround.asm clang_modulo.asm
diff --git a/clang_workaround.asm b/clang_modulo.asm
index 2a8cb49..6026638 100644
--- a/clang_workaround.asm
+++ b/clang_modulo.asm
@@ -29,19 +29,22 @@
        movq    %r9, %rax
        mulq    %r10
        shrq    $11, %rdx
-       imulq   $-10000, %rdx, %rax     # imm = 0xD8F0
-       addq    %r9, %rax
-       movl    %eax, %esi
-       imulq   $1374389535, %rsi, %rsi # imm = 0x51EB851F
-       shrq    $37, %rsi
-       imull   $100, %esi, %edi
-       subl    %edi, %eax
+       imulq   $10000, %rdx, %rax      # imm = 0x2710
+       movq    %r9, %rsi
+       subq    %rax, %rsi
+       imulq   $1374389535, %rsi, %rax # imm = 0x51EB851F
+       movq    %rax, %rdi
+       shrq    $37, %rdi
+       imull   $100, %edi, %edi
+       subl    %edi, %esi
+       shrq    $36, %rax
+       andl    $510, %eax              # imm = 0x1FE
        movl    %ecx, %edi
        movq    %r8, %rbx
        subq    %rdi, %rbx
-       movzwl  (%r11,%rax,2), %eax
-       movw    %ax, -1(%rbx)
-       movzwl  (%r11,%rsi,2), %eax
+       movzwl  (%r11,%rsi,2), %esi
+       movw    %si, -1(%rbx)
+       movzwl  (%rax,%r11), %eax
        movw    %ax, -3(%rbx)
        addl    $4, %ecx
        cmpq    $99999999, %r9          # imm = 0x5F5E0FF

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20180719/543bccb3/attachment.html>