[llvm-dev] Disastrous performance of 128/128-bit integer division

Sun Sep 6 04:29:40 PDT 2020

<https://compiler-rt.llvm.org/index.html> boasts:

| The builtins library provides optimized implementations of this
| and other low-level routines, either in target-independent C form,
| or as a heavily-optimized assembly.

Compile the following source with both GCC -m64 -O3 and clang -m64 -O3,
run both programs at least on an AMD Ryzen and on different Intel Core
processors, compare the measured execution times ... then remove every
occurance of the word "optimized" on the above web page.

--- 128-bit.c ---
// Copyright © 2018-2020, Stefan Kanthak <stefan.kanthak at nexgo.de>

#include <stdint.h>
#include <stdio.h>
#include <time.h>

extern
__uint128_t __udivmodti4(__uint128_t dividend, __uint128_t divisor, __uint128_t *remainder);

__attribute__((noinline))
__uint128_t __unopti4(__uint128_t dividend, __uint128_t divisor, __uint128_t *remainder)
{
    if (remainder != NULL)
        *remainder = divisor;

    return dividend;
}

__attribute__((always_inline))
__uint128_t lfsr128(void)
{
    // 128-bit linear feedback shift register (Galois form) using
    //  primitive polynomial 0x5DB2B62B0C5F8E1B:D8CCE715FCB2726D,
    //   initialised with 2**128 / golden ratio

    const  __uint128_t poly = (__uint128_t) 0x5DB2B62B0C5F8E1B << 64 | 0xD8CCE715FCB2726D;
    static __uint128_t lfsr = (__uint128_t) 0x9E3779B97F4A7C15 << 64 | 0xF39CC0605CEDC834;
    const  __uint128_t sign = (__int128_t) lfsr >> 127;

    return lfsr = (lfsr << 1) ^ (poly & sign);
}

__attribute__((always_inline))
uint64_t lfsr64(void)
{
    // 64-bit linear feedback shift register (Galois form) using
    //  primitive polynomial 0xAD93D23594C935A9 (CRC-64 "Jones"),
    //   initialised with 2**64 / golden ratio

    static uint64_t lfsr = 0x9E3779B97F4A7C15;
    const  uint64_t sign = (int64_t) lfsr >> 63;

    return lfsr = (lfsr << 1) ^ (0xAD93D23594C935A9 & sign);
}

__attribute__((always_inline))
uint32_t lfsr32(void)
{
    // 32-bit linear feedback shift register (Galois form) using
    //  primitive polynomial 0xDB710641 (CRC-32 IEEE),
    //   initialised with 2**32 / golden ratio

    static uint32_t lfsr = 0x9E3779B9;
    const  uint32_t sign = (int32_t) lfsr >> 31;

    return lfsr = (lfsr << 1) ^ (0xDB710641 & sign);
}

int main(void)
{
    clock_t t0, t1, t2, tt;
    uint32_t n;
    __uint128_t dividend, divisor = ~0, remainder;
    volatile __uint128_t quotient;

    t0 = clock();

    for (n = 500000000u; n > 0u; --n)
    {
        dividend = lfsr128();
        quotient = __unopti4(dividend, divisor, NULL);
        divisor = lfsr64();
        quotient = __unopti4(dividend, divisor, &remainder);
    }

    t1 = clock();

    for (n = 500000000u; n > 0u; --n)
    {
        dividend = lfsr128();
        quotient = __udivmodti4(dividend, divisor, NULL);
        divisor = lfsr64();
        quotient = __udivmodti4(dividend, divisor, &remainder);
    }

    t2 = clock();
    tt = t2 - t0;
    t2 -= t1;
    t1 -= t0;
    t0 = t2 - t1;

    printf("\n"
           "__unopti4()       %4lu.%06lu       0\n"
           "__udivmodti4()    %4lu.%06lu    %4lu.%06lu\n"
           "                  %4lu.%06lu nano-seconds\n",
           t1 / CLOCKS_PER_SEC, (t1 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           t2 / CLOCKS_PER_SEC, (t2 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           t0 / CLOCKS_PER_SEC, (t0 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           tt / CLOCKS_PER_SEC, (tt % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC);

    t0 = clock();

    for (n = 500000000u; n > 0u; --n)
    {
        dividend = lfsr128();
        quotient = __unopti4(dividend, divisor, NULL);
        divisor = lfsr32();
        quotient = __unopti4(dividend, divisor, &remainder);
    }

    t1 = clock();

    for (n = 500000000u; n > 0u; --n)
    {
        dividend = lfsr128();
        quotient = __udivmodti4(dividend, divisor, NULL);
        divisor = lfsr32();
        quotient = __udivmodti4(dividend, divisor, &remainder);
    }

    t2 = clock();
    tt = t2 - t0;
    t2 -= t1;
    t1 -= t0;
    t0 = t2 - t1;

    printf("\n"
           "__unopti4()       %4lu.%06lu       0\n"
           "__udivmodti4()    %4lu.%06lu    %4lu.%06lu\n"
           "                  %4lu.%06lu nano-seconds\n",
           t1 / CLOCKS_PER_SEC, (t1 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           t2 / CLOCKS_PER_SEC, (t2 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           t0 / CLOCKS_PER_SEC, (t0 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           tt / CLOCKS_PER_SEC, (tt % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC);

    t0 = clock();

    for (n = 500000000u; n > 0u; --n)
    {
        dividend = lfsr64();
        quotient = __unopti4(dividend, divisor, NULL);
        divisor = lfsr32();
        quotient = __unopti4(dividend, divisor, &remainder);
    }

    t1 = clock();

    for (n = 500000000u; n > 0u; --n)
    {
        dividend = lfsr64();
        quotient = __udivmodti4(dividend, divisor, NULL);
        divisor = lfsr32();
        quotient = __udivmodti4(dividend, divisor, &remainder);
    }

    t2 = clock();
    tt = t2 - t0;
    t2 -= t1;
    t1 -= t0;
    t0 = t2 - t1;

    printf("\n"
           "__unopti4()       %4lu.%06lu       0\n"
           "__udivmodti4()    %4lu.%06lu    %4lu.%06lu\n"
           "                  %4lu.%06lu nano-seconds\n",
           t1 / CLOCKS_PER_SEC, (t1 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           t2 / CLOCKS_PER_SEC, (t2 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           t0 / CLOCKS_PER_SEC, (t0 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
           tt / CLOCKS_PER_SEC, (tt % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC);
}
--- EOF ---

AMD Ryzen 5 3600 6-Core Processor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

                    GCC/libgcc                    LLVM/clang/compiler-rt

__unopti4()          1.668976       0              1.695685       0
__udivmodti4()      12.617999      10.949023     167.891651     166.195966
                    14.286975 nano-seconds       169.587336 nano-seconds

__unopti4()          1.760362       0              1.708451       0
__udivmodti4()      18.046460      16.286098     246.065291     244.356840
                    19.806822 nano-seconds       247.773742 nano-seconds

__unopti4()          1.248846       0              1.463868       0
__udivmodti4()       7.155582       5.906736      10.833658       9.369790
                     8.404428 nano-seconds        12.297526 nano-seconds

The factor 15 demonstrated here can be bumped up^Wdown to 30 if need be!