<html>
<head>
<base href="https://bugs.llvm.org/">
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW - [AArch64] Unnecessary spills in fully unrolled loop"
href="https://bugs.llvm.org/show_bug.cgi?id=44593">44593</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>[AArch64] Unnecessary spills in fully unrolled loop
</td>
</tr>
<tr>
<th>Product</th>
<td>libraries
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>Linux
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>enhancement
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>Loop Optimizer
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>cesarb@cesarb.eti.br
</td>
</tr>
<tr>
<th>CC</th>
<td>llvm-bugs@lists.llvm.org
</td>
</tr></table>
<p>
<div>
<pre>The following code has several unnecessary spills and reloads:

#define LEN 32

#include <stddef.h>
#include <stdint.h>

struct buf {
    uint8_t buf[LEN];
};

uint8_t fn(struct buf *a, struct buf *b) {
    uint8_t tmp = 0;
    for (size_t i = 0; i < LEN; i++)
        tmp |= a->buf[i] ^ b->buf[i];
    return tmp;
}

(The original code, in Rust, is at
<a href="https://github.com/cesarb/constant_time_eq/blob/0.1.5/src/lib.rs">https://github.com/cesarb/constant_time_eq/blob/0.1.5/src/lib.rs</a>)

The generated code on clang 9.0.0 (from godbolt.org with -O3
--target=aarch64-unknown-linux-gnu) is:

fn: // @fn
sub sp, sp, #128 // =128
stp x28, x27, [sp, #32] // 16-byte Folded Spill
stp x26, x25, [sp, #48] // 16-byte Folded Spill
stp x24, x23, [sp, #64] // 16-byte Folded Spill
stp x22, x21, [sp, #80] // 16-byte Folded Spill
stp x20, x19, [sp, #96] // 16-byte Folded Spill
stp x29, x30, [sp, #112] // 16-byte Folded Spill
ldrb w8, [x0]
ldrb w9, [x1]
ldrb w10, [x0, #1]
ldrb w11, [x1, #1]
ldrb w12, [x0, #2]
ldrb w13, [x1, #2]
ldrb w14, [x0, #3]
ldrb w15, [x1, #3]
ldrb w16, [x0, #4]
ldrb w17, [x1, #4]
ldrb w18, [x0, #5]
ldrb w2, [x1, #5]
eor w8, w9, w8
eor w9, w11, w10
ldrb w30, [x0, #14]
stp w9, w8, [sp, #24] // 8-byte Folded Spill
ldrb w8, [x1, #14]
ldrb w10, [x0, #15]
eor w9, w13, w12
ldrb w12, [x1, #15]
str w9, [sp, #20] // 4-byte Folded Spill
eor w9, w15, w14
str w9, [sp, #16] // 4-byte Folded Spill
eor w9, w17, w16
str w9, [sp, #12] // 4-byte Folded Spill
eor w9, w2, w18
str w9, [sp, #8] // 4-byte Folded Spill
eor w17, w8, w30
eor w15, w12, w10
ldrb w11, [x0, #29]
ldrb w10, [x1, #29]
ldrb w9, [x0, #30]
ldrb w8, [x1, #30]
ldrb w3, [x0, #6]
eor w10, w10, w11
ldrb w4, [x1, #6]
eor w8, w8, w9
ldp w11, w9, [sp, #24] // 8-byte Folded Reload
ldrb w5, [x0, #7]
ldrb w6, [x1, #7]
ldrb w7, [x0, #8]
orr w9, w11, w9
ldr w11, [sp, #20] // 4-byte Folded Reload
ldrb w19, [x1, #8]
ldrb w20, [x0, #9]
ldrb w21, [x1, #9]
orr w9, w11, w9
ldr w11, [sp, #16] // 4-byte Folded Reload
ldrb w22, [x0, #10]
ldrb w23, [x1, #10]
eor w2, w4, w3
orr w9, w11, w9
ldr w11, [sp, #12] // 4-byte Folded Reload
ldrb w24, [x0, #11]
ldrb w25, [x1, #11]
eor w4, w6, w5
orr w9, w11, w9
ldr w11, [sp, #8] // 4-byte Folded Reload
ldrb w26, [x0, #12]
ldrb w27, [x1, #12]
eor w6, w19, w7
orr w9, w11, w9
orr w9, w2, w9
orr w9, w4, w9
ldrb w28, [x0, #13]
ldrb w29, [x1, #13]
eor w19, w21, w20
orr w9, w6, w9
eor w21, w23, w22
orr w9, w19, w9
ldrb w14, [x0, #16]
ldrb w16, [x1, #16]
eor w23, w25, w24
orr w9, w21, w9
ldrb w18, [x0, #17]
ldrb w3, [x1, #17]
ldrb w5, [x0, #18]
ldrb w7, [x1, #18]
eor w25, w27, w26
orr w9, w23, w9
eor w27, w29, w28
orr w9, w25, w9
orr w9, w27, w9
ldrb w20, [x0, #19]
ldrb w22, [x1, #19]
ldrb w24, [x0, #20]
ldrb w26, [x1, #20]
ldrb w28, [x0, #21]
ldrb w29, [x1, #21]
ldrb w12, [x0, #22]
eor w14, w16, w14
ldrb w16, [x1, #22]
orr w9, w17, w9
eor w18, w3, w18
ldrb w3, [x0, #23]
eor w5, w7, w5
ldrb w7, [x1, #23]
orr w9, w15, w9
orr w9, w14, w9
orr w9, w18, w9
eor w20, w22, w20
ldrb w22, [x0, #24]
eor w24, w26, w24
ldrb w26, [x1, #24]
eor w28, w29, w28
ldrb w29, [x0, #25]
eor w13, w16, w12
ldrb w16, [x1, #25]
orr w9, w5, w9
eor w3, w7, w3
ldrb w7, [x0, #26]
ldrb w30, [x1, #26]
orr w9, w20, w9
orr w9, w24, w9
orr w9, w28, w9
eor w22, w26, w22
eor w16, w16, w29
ldrb w26, [x0, #27]
ldrb w29, [x1, #27]
orr w9, w13, w9
eor w7, w30, w7
ldrb w30, [x0, #28]
ldrb w12, [x1, #28]
orr w9, w3, w9
orr w9, w22, w9
orr w9, w16, w9
eor w26, w29, w26
orr w9, w7, w9
ldrb w11, [x0, #31]
ldrb w13, [x1, #31]
eor w12, w12, w30
orr w9, w26, w9
orr w9, w12, w9
ldp x29, x30, [sp, #112] // 16-byte Folded Reload
ldp x20, x19, [sp, #96] // 16-byte Folded Reload
ldp x22, x21, [sp, #80] // 16-byte Folded Reload
ldp x24, x23, [sp, #64] // 16-byte Folded Reload
ldp x26, x25, [sp, #48] // 16-byte Folded Reload
ldp x28, x27, [sp, #32] // 16-byte Folded Reload
orr w9, w10, w9
orr w8, w8, w9
eor w9, w13, w11
orr w0, w9, w8
add sp, sp, #128 // =128
ret

It should be possible to avoid all these spills in the middle. The Rust code
also has a 16-element and a 64-element version. With 16 elements (#define LEN
16 in the C code) the spills do not happen, but only because there are enough
registers; with 64 elements (#define LEN 64 in the C code) the spills do not
happen because the code uses the NEON registers instead. GCC uses the NEON
registers for all three versions of the code.

The expected result would be to either also vectorize the 32-element and
16-element versions (as both GCC and x86-64 clang do), or at least reorder the
operations in the fully unrolled loop so that there are no unnecessary
spills.</pre>
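<p>For illustration (not part of the original report): a minimal sketch of how
the 32-byte case could be kept entirely in NEON registers with hand-written
intrinsics, assuming the struct buf definition above and that arm_neon.h is
available. The name fn_neon and the final lane-reduction sequence are
hypothetical; this shows one possible shape, not the exact code GCC or clang
would emit.</p>
<pre>#include <stdint.h>
#include <arm_neon.h>

/* Hypothetical hand-vectorized variant of fn() for LEN == 32; all of the
   XOR/OR work stays in NEON registers, so no GPR spills are needed. */
uint8_t fn_neon(const struct buf *a, const struct buf *b) {
    /* XOR the two 16-byte halves of each buffer, then OR the results. */
    uint8x16_t lo  = veorq_u8(vld1q_u8(a->buf),      vld1q_u8(b->buf));
    uint8x16_t hi  = veorq_u8(vld1q_u8(a->buf + 16), vld1q_u8(b->buf + 16));
    uint8x16_t acc = vorrq_u8(lo, hi);

    /* Horizontal OR of the 16 lanes down to a single byte. */
    uint64x2_t v = vreinterpretq_u64_u8(acc);
    uint64_t x = vgetq_lane_u64(v, 0) | vgetq_lane_u64(v, 1);
    x |= x >> 32;
    x |= x >> 16;
    x |= x >> 8;
    return (uint8_t)x;
}</pre>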
</div>
</p>
</body>
</html>