[llvm-dev] Abysmal performance of 64/64-bit integer division

Sun Sep 6 04:33:39 PDT 2020

<https://compiler-rt.llvm.org/index.html> boasts:

| The builtins library provides optimized implementations of this
| and other low-level routines, either in target-independent C form,
| or as a heavily-optimized assembly.

Compile the following source with both GCC -m32 -O3 and clang -m32 -O3,
run both programs at least on an AMD Ryzen and on different Intel Core
processors, and compare the measured execution times ... then remove
every occurance of the word "optimized" on the above web page.

--- 64-bit.c ---
// Copyright © 2018-2020, Stefan Kanthak <stefan.kanthak at nexgo.de>

#include <stdint.h>
#include <stdio.h>
#include <time.h>

extern
uint64_t __udivmoddi4(uint64_t dividend, uint64_t divisor, uint64_t *remainder);

__attribute__((noinline))
uint64_t __unopdi4(uint64_t dividend, uint64_t divisor, uint64_t *remainder)
{
    if (remainder != NULL)
        *remainder = divisor;

    return dividend;
}

__attribute__((always_inline))
uint64_t lfsr64(void)
{
    // 64-bit linear feedback shift register (Galois form) using
    //  primitive polynomial 0xAD93D23594C935A9 (CRC-64 "Jones"),
    //   initialised with 2**64 / golden ratio

    static uint64_t lfsr = 0x9E3779B97F4A7C15;
    const  uint64_t sign = (int64_t) lfsr >> 63;

    return lfsr = (lfsr << 1) ^ (0xAD93D23594C935A9 & sign);
}

__attribute__((always_inline))
uint32_t lfsr32(void)
{
    // 32-bit linear feedback shift register (Galois form) using
    //  primitive polynomial 0xDB710641 (CRC-32 IEEE),
    //   initialised with 2**32 / golden ratio

    static uint32_t lfsr = 0x9E3779B9;
    const  uint32_t sign = (int32_t) lfsr >> 31;

    return lfsr = (lfsr << 1) ^ (0xDB710641 & sign);
}

int main(void)
{
    clock_t t0, t1, t2, tt;
    uint32_t n;
    uint64_t dividend, divisor = ~0, remainder;
    volatile uint64_t quotient;

    t0 = clock();

    for (n = 500000000u; n > 0u; --n)
    {
        dividend = lfsr64();
        quotient = __unopdi4(dividend, divisor, NULL);
        divisor = lfsr32();
        quotient = __unopdi4(dividend, divisor, &remainder);
    }

    t1 = clock();

    for (n = 500000000u; n > 0u; --n)
    {
        dividend = lfsr64();
        quotient = __udivmoddi4(dividend, divisor, NULL);
        divisor = lfsr32();
        quotient = __udivmoddi4(dividend, divisor, &remainder);
    }

    t2 = clock();
    tt = t2 - t0;
    t2 -= t1;
    t1 -= t0;
    t0 = t2 - t1;

    printf("\n"
           "__unopdi4()       %4lu.%06lu       0\n"
           "__udivmoddi4()    %4lu.%06lu    %4lu.%06lu\n"
           "                  %4lu.%06lu nano-seconds\n",
           t1 / CLOCKS_PER_SEC, (t1 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           t2 / CLOCKS_PER_SEC, (t2 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           t0 / CLOCKS_PER_SEC, (t0 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           tt / CLOCKS_PER_SEC, (tt % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC);
}
--- EOF ---

On an AMD Ryzen 5 3600 this demonstrates a factor 11, which but can be
"raised" to 15 if need be.