[llvm-bugs] [Bug 44593] New: [AArch64] Unnecessary spills in fully unrolled loop

via llvm-bugs llvm-bugs at lists.llvm.org
Sun Jan 19 17:14:54 PST 2020


https://bugs.llvm.org/show_bug.cgi?id=44593

            Bug ID: 44593
           Summary: [AArch64] Unnecessary spills in fully unrolled loop
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Loop Optimizer
          Assignee: unassignedbugs at nondot.org
          Reporter: cesarb at cesarb.eti.br
                CC: llvm-bugs at lists.llvm.org

The following code has several unnecessary spills and reloads:

    #define LEN 32

    #include <stddef.h>
    #include <stdint.h>

    struct buf {
        uint8_t buf[LEN];
    };

    uint8_t fn(struct buf *a, struct buf *b) {
        uint8_t tmp = 0;
        for (size_t i = 0; i < LEN; i++)
            tmp |= a->buf[i] ^ b->buf[i];
        return tmp;
    }

(The original code, in Rust, is at
https://github.com/cesarb/constant_time_eq/blob/0.1.5/src/lib.rs)

The code generated by clang 9.0.0 (from godbolt.org, with -O3
--target=aarch64-unknown-linux-gnu) is:

fn:                                     // @fn
        sub     sp, sp, #128            // =128
        stp     x28, x27, [sp, #32]     // 16-byte Folded Spill
        stp     x26, x25, [sp, #48]     // 16-byte Folded Spill
        stp     x24, x23, [sp, #64]     // 16-byte Folded Spill
        stp     x22, x21, [sp, #80]     // 16-byte Folded Spill
        stp     x20, x19, [sp, #96]     // 16-byte Folded Spill
        stp     x29, x30, [sp, #112]    // 16-byte Folded Spill
        ldrb    w8, [x0]
        ldrb    w9, [x1]
        ldrb    w10, [x0, #1]
        ldrb    w11, [x1, #1]
        ldrb    w12, [x0, #2]
        ldrb    w13, [x1, #2]
        ldrb    w14, [x0, #3]
        ldrb    w15, [x1, #3]
        ldrb    w16, [x0, #4]
        ldrb    w17, [x1, #4]
        ldrb    w18, [x0, #5]
        ldrb    w2, [x1, #5]
        eor     w8, w9, w8
        eor     w9, w11, w10
        ldrb    w30, [x0, #14]
        stp     w9, w8, [sp, #24]       // 8-byte Folded Spill
        ldrb    w8, [x1, #14]
        ldrb    w10, [x0, #15]
        eor     w9, w13, w12
        ldrb    w12, [x1, #15]
        str     w9, [sp, #20]           // 4-byte Folded Spill
        eor     w9, w15, w14
        str     w9, [sp, #16]           // 4-byte Folded Spill
        eor     w9, w17, w16
        str     w9, [sp, #12]           // 4-byte Folded Spill
        eor     w9, w2, w18
        str     w9, [sp, #8]            // 4-byte Folded Spill
        eor     w17, w8, w30
        eor     w15, w12, w10
        ldrb    w11, [x0, #29]
        ldrb    w10, [x1, #29]
        ldrb    w9, [x0, #30]
        ldrb    w8, [x1, #30]
        ldrb    w3, [x0, #6]
        eor     w10, w10, w11
        ldrb    w4, [x1, #6]
        eor     w8, w8, w9
        ldp     w11, w9, [sp, #24]      // 8-byte Folded Reload
        ldrb    w5, [x0, #7]
        ldrb    w6, [x1, #7]
        ldrb    w7, [x0, #8]
        orr     w9, w11, w9
        ldr     w11, [sp, #20]          // 4-byte Folded Reload
        ldrb    w19, [x1, #8]
        ldrb    w20, [x0, #9]
        ldrb    w21, [x1, #9]
        orr     w9, w11, w9
        ldr     w11, [sp, #16]          // 4-byte Folded Reload
        ldrb    w22, [x0, #10]
        ldrb    w23, [x1, #10]
        eor     w2, w4, w3
        orr     w9, w11, w9
        ldr     w11, [sp, #12]          // 4-byte Folded Reload
        ldrb    w24, [x0, #11]
        ldrb    w25, [x1, #11]
        eor     w4, w6, w5
        orr     w9, w11, w9
        ldr     w11, [sp, #8]           // 4-byte Folded Reload
        ldrb    w26, [x0, #12]
        ldrb    w27, [x1, #12]
        eor     w6, w19, w7
        orr     w9, w11, w9
        orr     w9, w2, w9
        orr     w9, w4, w9
        ldrb    w28, [x0, #13]
        ldrb    w29, [x1, #13]
        eor     w19, w21, w20
        orr     w9, w6, w9
        eor     w21, w23, w22
        orr     w9, w19, w9
        ldrb    w14, [x0, #16]
        ldrb    w16, [x1, #16]
        eor     w23, w25, w24
        orr     w9, w21, w9
        ldrb    w18, [x0, #17]
        ldrb    w3, [x1, #17]
        ldrb    w5, [x0, #18]
        ldrb    w7, [x1, #18]
        eor     w25, w27, w26
        orr     w9, w23, w9
        eor     w27, w29, w28
        orr     w9, w25, w9
        orr     w9, w27, w9
        ldrb    w20, [x0, #19]
        ldrb    w22, [x1, #19]
        ldrb    w24, [x0, #20]
        ldrb    w26, [x1, #20]
        ldrb    w28, [x0, #21]
        ldrb    w29, [x1, #21]
        ldrb    w12, [x0, #22]
        eor     w14, w16, w14
        ldrb    w16, [x1, #22]
        orr     w9, w17, w9
        eor     w18, w3, w18
        ldrb    w3, [x0, #23]
        eor     w5, w7, w5
        ldrb    w7, [x1, #23]
        orr     w9, w15, w9
        orr     w9, w14, w9
        orr     w9, w18, w9
        eor     w20, w22, w20
        ldrb    w22, [x0, #24]
        eor     w24, w26, w24
        ldrb    w26, [x1, #24]
        eor     w28, w29, w28
        ldrb    w29, [x0, #25]
        eor     w13, w16, w12
        ldrb    w16, [x1, #25]
        orr     w9, w5, w9
        eor     w3, w7, w3
        ldrb    w7, [x0, #26]
        ldrb    w30, [x1, #26]
        orr     w9, w20, w9
        orr     w9, w24, w9
        orr     w9, w28, w9
        eor     w22, w26, w22
        eor     w16, w16, w29
        ldrb    w26, [x0, #27]
        ldrb    w29, [x1, #27]
        orr     w9, w13, w9
        eor     w7, w30, w7
        ldrb    w30, [x0, #28]
        ldrb    w12, [x1, #28]
        orr     w9, w3, w9
        orr     w9, w22, w9
        orr     w9, w16, w9
        eor     w26, w29, w26
        orr     w9, w7, w9
        ldrb    w11, [x0, #31]
        ldrb    w13, [x1, #31]
        eor     w12, w12, w30
        orr     w9, w26, w9
        orr     w9, w12, w9
        ldp     x29, x30, [sp, #112]    // 16-byte Folded Reload
        ldp     x20, x19, [sp, #96]     // 16-byte Folded Reload
        ldp     x22, x21, [sp, #80]     // 16-byte Folded Reload
        ldp     x24, x23, [sp, #64]     // 16-byte Folded Reload
        ldp     x26, x25, [sp, #48]     // 16-byte Folded Reload
        ldp     x28, x27, [sp, #32]     // 16-byte Folded Reload
        orr     w9, w10, w9
        orr     w8, w8, w9
        eor     w9, w13, w11
        orr     w0, w9, w8
        add     sp, sp, #128            // =128
        ret

It should be possible to avoid all of these spills in the middle of the
function. The original Rust code also has 16-element and 64-element
versions. With 16 elements (#define LEN 16 in the C code) the spills do
not happen, but only because there happen to be enough registers; with 64
elements (#define LEN 64) they do not happen because the code is
vectorized using the NEON registers instead. GCC uses the NEON registers
for all three versions of the code.

The expected result would be to either vectorize the 32-element and
16-element versions as well (as both GCC and x86-64 clang do), or at
least to reorder the operations in the fully unrolled loop so that no
spills are needed.
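
For illustration, here is a hand-written NEON version of the 32-byte case
(only a sketch using standard arm_neon.h intrinsics, with an arbitrary
name fn_neon; it is not a claim about the exact code GCC emits). It keeps
the whole computation in vector registers, so no general-purpose
registers need to be spilled:

    #include <arm_neon.h>
    #include <stdint.h>

    struct buf {
        uint8_t buf[32];    /* same layout as above, with LEN == 32 */
    };

    uint8_t fn_neon(struct buf *a, struct buf *b) {
        /* XOR the buffers 16 bytes at a time and OR the partial results. */
        uint8x16_t acc = veorq_u8(vld1q_u8(a->buf),      vld1q_u8(b->buf));
        uint8x16_t hi  = veorq_u8(vld1q_u8(a->buf + 16), vld1q_u8(b->buf + 16));
        acc = vorrq_u8(acc, hi);

        /* OR-reduce the 16 byte lanes down to a single byte. */
        uint8x8_t r = vorr_u8(vget_low_u8(acc), vget_high_u8(acc));
        r = vorr_u8(r, vext_u8(r, r, 4));
        r = vorr_u8(r, vext_u8(r, r, 2));
        r = vorr_u8(r, vext_u8(r, r, 1));
        return vget_lane_u8(r, 0);
    }

This computes the same value as the scalar loop in the report (the OR of
all the per-byte XORs); it is just the vectorized form of the same
reduction.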

-- 
You are receiving this mail because:
You are on the CC list for the bug.