[llvm-bugs] [Bug 41929] New: [MIPS] All word-sized multiplies against constants are transformed into shift + add

Fri May 17 18:05:24 PDT 2019

https://bugs.llvm.org/show_bug.cgi?id=41929

            Bug ID: 41929
           Summary: [MIPS] All word-sized multiplies against constants are
                    transformed into shift + add
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Windows NT
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Backend: MIPS
          Assignee: unassignedbugs at nondot.org
          Reporter: husseydevin at gmail.com
                CC: llvm-bugs at lists.llvm.org

https://github.com/llvm-mirror/llvm/blob/36cbe77a6eee47bfaebd17a713decac4433d6ae8/lib/Target/Mips/MipsSEISelLowering.cpp#L718

I've tested this code and the logic seems fishy. From the looks of it, this
always returns true for integers that match the word size.

For example, take this code.

unsigned multiply(unsigned x)
{
    return x * 0xAAAAAAAA; // 0b10101010101010101010101010101010
}

Clearly, with alternating 1's and 0's, turning this into a shift and add is
ridiculous. Unless you're Clang.

clang -O3 --target=mipsel-linux-gnu -S multiply.c

multiply:
        sll     $1, $4, 1
        sll     $2, $4, 3
        sll     $3, $4, 29
        sll     $5, $4, 27
        sll     $6, $4, 25
        sll     $7, $4, 23
        sll     $8, $4, 21
        sll     $9, $4, 19
        sll     $10, $4, 17
        addu    $1, $2, $1
        sll     $2, $4, 5
        addu    $1, $2, $1
        sll     $2, $4, 7
        addu    $1, $2, $1
        sll     $2, $4, 9
        addu    $1, $2, $1
        sll     $2, $4, 11
        addu    $1, $2, $1
        sll     $2, $4, 13
        addu    $1, $2, $1
        sll     $2, $4, 31
        sll     $4, $4, 15
        addu    $1, $4, $1
        addu    $1, $10, $1
        addu    $1, $9, $1
        addu    $1, $8, $1
        addu    $1, $7, $1
        addu    $1, $6, $1
        addu    $1, $5, $1
        addu    $1, $3, $1
        jr      $ra
        addu    $2, $2, $1

According to the code, it should stop at 8 steps. However, that is 31 steps,
far more than the latency of multiplication unless I am reading the
documentation incorrectly.

I would expect something like this:

unsigned multiply(unsigned x)
{
    unsigned a = 0xAAAAAAAA;
    __asm__("" : "+r" (a)); // disable constant propagation
    return x * a;
}

multiply:
        lui     $1, 43690
        ori     $2, $1, 43690
        jr      $ra
        mul     $2, $2, $4

It is even worse on mips64:

unsigned long long multiply64(unsigned long long x)
{
    return x * 0xAAAAAAAAAAAAAAAAULL;
}

clang -O3 -S --target=mips64el-linux-gnu multiply.c

multiply64:                             # @multiply64
        daddiu  $sp, $sp, -80
        sd      $gp, 72($sp)            # 8-byte Folded Spill
        sd      $23, 64($sp)            # 8-byte Folded Spill
        sd      $22, 56($sp)            # 8-byte Folded Spill
        sd      $21, 48($sp)            # 8-byte Folded Spill
        sd      $20, 40($sp)            # 8-byte Folded Spill
        sd      $19, 32($sp)            # 8-byte Folded Spill
        sd      $18, 24($sp)            # 8-byte Folded Spill
        sd      $17, 16($sp)            # 8-byte Folded Spill
        sd      $16, 8($sp)             # 8-byte Folded Spill
        dsll    $1, $4, 1
        dsll    $2, $4, 3
        dsll    $3, $4, 61
        dsll    $5, $4, 59
        dsll    $6, $4, 57
        dsll    $7, $4, 55
        dsll    $8, $4, 53
        dsll    $9, $4, 51
        dsll    $10, $4, 49
        dsll    $11, $4, 47
        dsll    $12, $4, 45
        dsll    $13, $4, 43
        dsll    $14, $4, 41
        dsll    $15, $4, 39
        dsll    $24, $4, 37
        dsll    $25, $4, 35
        dsll    $16, $4, 33
        dsll    $17, $4, 31
        dsll    $18, $4, 29
        dsll    $19, $4, 27
        dsll    $20, $4, 25
        dsll    $21, $4, 23
        dsll    $22, $4, 21
        dsll    $23, $4, 19
        dsll    $gp, $4, 17
        daddu   $1, $2, $1
        dsll    $2, $4, 5
        daddu   $1, $2, $1
        dsll    $2, $4, 7
        daddu   $1, $2, $1
        dsll    $2, $4, 9
        daddu   $1, $2, $1
        dsll    $2, $4, 11
        daddu   $1, $2, $1
        dsll    $2, $4, 13
        daddu   $1, $2, $1
        dsll    $2, $4, 63
        dsll    $4, $4, 15
        daddu   $1, $4, $1
        daddu   $1, $gp, $1
        ld      $gp, 72($sp)            # 8-byte Folded Reload
        daddu   $1, $23, $1
        ld      $23, 64($sp)            # 8-byte Folded Reload
        daddu   $1, $22, $1
        ld      $22, 56($sp)            # 8-byte Folded Reload
        daddu   $1, $21, $1
        ld      $21, 48($sp)            # 8-byte Folded Reload
        daddu   $1, $20, $1
        ld      $20, 40($sp)            # 8-byte Folded Reload
        daddu   $1, $19, $1
        ld      $19, 32($sp)            # 8-byte Folded Reload
        daddu   $1, $18, $1
        ld      $18, 24($sp)            # 8-byte Folded Reload
        daddu   $1, $17, $1
        ld      $17, 16($sp)            # 8-byte Folded Reload
        daddu   $1, $16, $1
        ld      $16, 8($sp)             # 8-byte Folded Reload
        daddu   $1, $25, $1
        daddu   $1, $24, $1
        daddu   $1, $15, $1
        daddu   $1, $14, $1
        daddu   $1, $13, $1
        daddu   $1, $12, $1
        daddu   $1, $11, $1
        daddu   $1, $10, $1
        daddu   $1, $9, $1
        daddu   $1, $8, $1
        daddu   $1, $7, $1
        daddu   $1, $6, $1
        daddu   $1, $5, $1
        daddu   $1, $3, $1
        daddu   $2, $2, $1
        jr      $ra
        daddiu  $sp, $sp, 80

This can have pretty nasty effects on binary size: 

xxhash's code (https://github.com/Cyan4973/xxHash), which uses plenty of
multiplies against constants, results in a binary that is more than 2x larger
than GCC 8 with size optimizations.

mipsel-linux-gnu-gcc-8 -Os -c xxhash.c
stat -c "%s" xxhash.o
30736
clang-9 -Oz -c --target=mipsel-linux-gnu xxhash.c
stat -c "%s" xxhash.o
70080

A look at the assembly will show that every multiply is converted to shifts,
and the only multu instructions are for when we need the high bits of the
result (i.e. long multiply).

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20190518/c76c4417/attachment.html>