[llvm-dev] Disastrous performance of 128/128-bit integer division
Stefan Kanthak via llvm-dev
llvm-dev at lists.llvm.org
Sun Sep 6 04:29:40 PDT 2020
<https://compiler-rt.llvm.org/index.html> boasts:
| The builtins library provides optimized implementations of this
| and other low-level routines, either in target-independent C form,
| or as a heavily-optimized assembly.
Compile the following source with both GCC -m64 -O3 and clang -m64 -O3,
run both programs at least on an AMD Ryzen and on different Intel Core
processors, compare the measured execution times ... then remove every
occurance of the word "optimized" on the above web page.
--- 128-bit.c ---
// Copyright © 2018-2020, Stefan Kanthak <stefan.kanthak at nexgo.de>
#include <stdint.h>
#include <stdio.h>
#include <time.h>
extern
__uint128_t __udivmodti4(__uint128_t dividend, __uint128_t divisor, __uint128_t *remainder);
__attribute__((noinline))
__uint128_t __unopti4(__uint128_t dividend, __uint128_t divisor, __uint128_t *remainder)
{
if (remainder != NULL)
*remainder = divisor;
return dividend;
}
__attribute__((always_inline))
__uint128_t lfsr128(void)
{
// 128-bit linear feedback shift register (Galois form) using
// primitive polynomial 0x5DB2B62B0C5F8E1B:D8CCE715FCB2726D,
// initialised with 2**128 / golden ratio
const __uint128_t poly = (__uint128_t) 0x5DB2B62B0C5F8E1B << 64 | 0xD8CCE715FCB2726D;
static __uint128_t lfsr = (__uint128_t) 0x9E3779B97F4A7C15 << 64 | 0xF39CC0605CEDC834;
const __uint128_t sign = (__int128_t) lfsr >> 127;
return lfsr = (lfsr << 1) ^ (poly & sign);
}
__attribute__((always_inline))
uint64_t lfsr64(void)
{
// 64-bit linear feedback shift register (Galois form) using
// primitive polynomial 0xAD93D23594C935A9 (CRC-64 "Jones"),
// initialised with 2**64 / golden ratio
static uint64_t lfsr = 0x9E3779B97F4A7C15;
const uint64_t sign = (int64_t) lfsr >> 63;
return lfsr = (lfsr << 1) ^ (0xAD93D23594C935A9 & sign);
}
__attribute__((always_inline))
uint32_t lfsr32(void)
{
// 32-bit linear feedback shift register (Galois form) using
// primitive polynomial 0xDB710641 (CRC-32 IEEE),
// initialised with 2**32 / golden ratio
static uint32_t lfsr = 0x9E3779B9;
const uint32_t sign = (int32_t) lfsr >> 31;
return lfsr = (lfsr << 1) ^ (0xDB710641 & sign);
}
int main(void)
{
clock_t t0, t1, t2, tt;
uint32_t n;
__uint128_t dividend, divisor = ~0, remainder;
volatile __uint128_t quotient;
t0 = clock();
for (n = 500000000u; n > 0u; --n)
{
dividend = lfsr128();
quotient = __unopti4(dividend, divisor, NULL);
divisor = lfsr64();
quotient = __unopti4(dividend, divisor, &remainder);
}
t1 = clock();
for (n = 500000000u; n > 0u; --n)
{
dividend = lfsr128();
quotient = __udivmodti4(dividend, divisor, NULL);
divisor = lfsr64();
quotient = __udivmodti4(dividend, divisor, &remainder);
}
t2 = clock();
tt = t2 - t0;
t2 -= t1;
t1 -= t0;
t0 = t2 - t1;
printf("\n"
"__unopti4() %4lu.%06lu 0\n"
"__udivmodti4() %4lu.%06lu %4lu.%06lu\n"
" %4lu.%06lu nano-seconds\n",
t1 / CLOCKS_PER_SEC, (t1 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
t2 / CLOCKS_PER_SEC, (t2 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
t0 / CLOCKS_PER_SEC, (t0 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
tt / CLOCKS_PER_SEC, (tt % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC);
t0 = clock();
for (n = 500000000u; n > 0u; --n)
{
dividend = lfsr128();
quotient = __unopti4(dividend, divisor, NULL);
divisor = lfsr32();
quotient = __unopti4(dividend, divisor, &remainder);
}
t1 = clock();
for (n = 500000000u; n > 0u; --n)
{
dividend = lfsr128();
quotient = __udivmodti4(dividend, divisor, NULL);
divisor = lfsr32();
quotient = __udivmodti4(dividend, divisor, &remainder);
}
t2 = clock();
tt = t2 - t0;
t2 -= t1;
t1 -= t0;
t0 = t2 - t1;
printf("\n"
"__unopti4() %4lu.%06lu 0\n"
"__udivmodti4() %4lu.%06lu %4lu.%06lu\n"
" %4lu.%06lu nano-seconds\n",
t1 / CLOCKS_PER_SEC, (t1 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
t2 / CLOCKS_PER_SEC, (t2 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
t0 / CLOCKS_PER_SEC, (t0 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
tt / CLOCKS_PER_SEC, (tt % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC);
t0 = clock();
for (n = 500000000u; n > 0u; --n)
{
dividend = lfsr64();
quotient = __unopti4(dividend, divisor, NULL);
divisor = lfsr32();
quotient = __unopti4(dividend, divisor, &remainder);
}
t1 = clock();
for (n = 500000000u; n > 0u; --n)
{
dividend = lfsr64();
quotient = __udivmodti4(dividend, divisor, NULL);
divisor = lfsr32();
quotient = __udivmodti4(dividend, divisor, &remainder);
}
t2 = clock();
tt = t2 - t0;
t2 -= t1;
t1 -= t0;
t0 = t2 - t1;
printf("\n"
"__unopti4() %4lu.%06lu 0\n"
"__udivmodti4() %4lu.%06lu %4lu.%06lu\n"
" %4lu.%06lu nano-seconds\n",
t1 / CLOCKS_PER_SEC, (t1 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
t2 / CLOCKS_PER_SEC, (t2 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
t0 / CLOCKS_PER_SEC, (t0 % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC,
tt / CLOCKS_PER_SEC, (tt % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC);
}
--- EOF ---
AMD Ryzen 5 3600 6-Core Processor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
GCC/libgcc LLVM/clang/compiler-rt
__unopti4() 1.668976 0 1.695685 0
__udivmodti4() 12.617999 10.949023 167.891651 166.195966
14.286975 nano-seconds 169.587336 nano-seconds
__unopti4() 1.760362 0 1.708451 0
__udivmodti4() 18.046460 16.286098 246.065291 244.356840
19.806822 nano-seconds 247.773742 nano-seconds
__unopti4() 1.248846 0 1.463868 0
__udivmodti4() 7.155582 5.906736 10.833658 9.369790
8.404428 nano-seconds 12.297526 nano-seconds
The factor 15 demonstrated here can be bumped up^Wdown to 30 if need be!
More information about the llvm-dev
mailing list