[llvm-bugs] [Bug 43539] New: #pragma clang loop vectorize(disable) should actually disable vectorization

via llvm-bugs llvm-bugs at lists.llvm.org
Wed Oct 2 16:45:14 PDT 2019


https://bugs.llvm.org/show_bug.cgi?id=43539

            Bug ID: 43539
           Summary: #pragma clang loop vectorize(disable) should actually
                    disable vectorization
           Product: clang
           Version: trunk
          Hardware: Other
                OS: Linux
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: LLVM Codegen
          Assignee: unassignedclangbugs at nondot.org
          Reporter: husseydevin at gmail.com
                CC: llvm-bugs at lists.llvm.org, neeilans at live.com,
                    richard-llvm at metafoo.co.uk

**#pragma clang loop vectorize(disable) is useless.**

It only works on tiny loops; anything even moderately complicated is
vectorized regardless. (My guess is that the cost model overrides it.)

For example:

#include <stdint.h>
#include <stddef.h>

uint64_t some_func(uint64_t *p, size_t len)
{
    uint64_t sum1 = 0, sum2 = 0;
    len &= ~(size_t)3; // round len down to a multiple of 4

    // This should force scalar code for the loop below, but it doesn't.
#pragma clang loop vectorize(disable)
    for (size_t i = 0; i < len; i++) {
        sum1 += *p++ ^ 12345; // two independent accumulators,
        sum2 += *p++ ^ 12345; // two elements consumed per iteration
    }
    return sum1 ^ sum2;
}
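
For reference, I build with roughly the following (the target triple and
file name are just my local setup; any AArch64 target at -O3 should
reproduce this):

clang --target=aarch64-linux-gnu -O3 -S some_func.c -o -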


I'd expect the following for AArch64 at -O3 (this is the actual output
when SIMD is disabled entirely with -march=armv8-a+nosimd):

some_func:
// %bb.0:
        mov     x8, xzr
        ands    x10, x1, #0xfffffffffffffffc
        b.eq    .LBB0_4
// %bb.1:
        mov     x9, xzr
        mov     x11, xzr
        mov     x12, xzr
        add     x13, x0, #16
        mov     w14, #12345
.LBB0_2:
        ldp     x15, x16, [x13, #-16]
        ldp     x17, x0, [x13], #32
        subs    x10, x10, #2
        eor     x15, x15, x14
        eor     x17, x17, x14
        eor     x16, x16, x14
        eor     x0, x0, x14
        add     x11, x15, x11
        add     x12, x17, x12
        add     x8, x16, x8
        add     x9, x0, x9
        b.ne    .LBB0_2
// %bb.3:
        add     x10, x12, x11
        add     x8, x9, x8
.LBB0_4:
        eor     x0, x8, x10
        ret


However, what I actually get is this clearly vectorized loop:

some_func:
// %bb.0:
        ands    x8, x1, #0xfffffffffffffffc
        b.eq    .LBB0_4
// %bb.1:
        mov     w10, #12345
        add     x9, x0, #16
        movi    v0.2d, #0000000000000000
        dup     v1.2d, x10
        movi    v2.2d, #0000000000000000
.LBB0_2:
        ldp     q3, q4, [x9, #-16]
        subs    x8, x8, #2
        add     x9, x9, #32
        eor     v3.16b, v3.16b, v1.16b
        eor     v4.16b, v4.16b, v1.16b
        add     v2.2d, v3.2d, v2.2d
        add     v0.2d, v4.2d, v0.2d
        b.ne    .LBB0_2
// %bb.3:
        add     v0.2d, v0.2d, v2.2d
        fmov    x8, d0
        mov     x9, v0.d[1]
        eor     x0, x9, x8
        ret
.LBB0_4:
        movi    v0.2d, #0000000000000000
        fmov    x8, d0
        mov     x9, v0.d[1]
        eor     x0, x9, x8
        ret

Similar output is produced for SSE2, 32-bit NEON, and other targets.
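
For what it's worth, if it is the loop and SLP vectorizers kicking in here,
the file-wide flags -fno-vectorize and -fno-slp-vectorize should suppress
them, but that is a blunt, whole-translation-unit switch, not the per-loop
control the pragma promises:

clang --target=aarch64-linux-gnu -O3 -fno-vectorize -fno-slp-vectorize -S some_func.c -o -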


#pragma clang loop vectorize(disable) needs to completely shut off
vectorization for the loop.

Need I say more?
