<html>

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - #pragma clang loop vectorize(disable) should actually disable vectorization"

   href="https://bugs.llvm.org/show_bug.cgi?id=43539">43539</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>#pragma clang loop vectorize(disable) should actually disable vectorization

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>clang

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>Other

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>enhancement

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>LLVM Codegen

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedclangbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>husseydevin@gmail.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org, neeilans@live.com, richard-llvm@metafoo.co.uk

          </td>

        </tr></table>

      <p>

        <div>

        <pre>**#pragma clang loop vectorize(disable) is useless.**

It only works on tiny loops. Anything relatively complicated will be vectorized

regardless. 

(I'm guessing that the cost model overrides it):

#include &lt;stdint.h&gt;

#include &lt;stddef.h&gt;

uint64_t some_func(uint64_t *p, size_t len)

{

    uint64_t sum1 = 0, sum2 = 0;

    len &amp;= ~(size_t)3; // always multiple of 4

#pragma clang loop vectorize(disable)

    for (size_t i = 0; i &lt; len; i++) {

        sum1 += *p++ ^ 12345;

        sum2 += *p++ ^ 12345;

    }

    return sum1 ^ sum2;

}

I'd expect this for aarch64 with -O3 (actual output from

-march=armv8-a+nosimd):

some_func:

// %bb.0:

        mov     x8, xzr

        ands    x10, x1, #0xfffffffffffffffc

        b.eq    .LBB0_4

// %bb.1:

        mov     x9, xzr

        mov     x11, xzr

        mov     x12, xzr

        add     x13, x0, #16

        mov     w14, #12345

.LBB0_2:

        ldp     x15, x16, [x13, #-16]

        ldp     x17, x0, [x13], #32

        subs    x10, x10, #2

        eor     x15, x15, x14

        eor     x17, x17, x14

        eor     x16, x16, x14

        eor     x0, x0, x14

        add     x11, x15, x11

        add     x12, x17, x12

        add     x8, x16, x8

        add     x9, x0, x9

        b.ne    .LBB0_2

// %bb.3:

        add     x10, x12, x11

        add     x8, x9, x8

.LBB0_4:

        eor     x0, x8, x10

        ret

However, I get this, a clearly vectorized loop:

some_func:

// %bb.0:

        ands    x8, x1, #0xfffffffffffffffc

        b.eq    .LBB0_4

// %bb.1:

        mov     w10, #12345

        add     x9, x0, #16

        movi    v0.2d, #0000000000000000

        dup     v1.2d, x10

        movi    v2.2d, #0000000000000000

.LBB0_2:

        ldp     q3, q4, [x9, #-16]

        subs    x8, x8, #2

        add     x9, x9, #32

        eor     v3.16b, v3.16b, v1.16b

        eor     v4.16b, v4.16b, v1.16b

        add     v2.2d, v3.2d, v2.2d

        add     v0.2d, v4.2d, v0.2d

        b.ne    .LBB0_2

// %bb.3:

        add     v0.2d, v0.2d, v2.2d

        fmov    x8, d0

        mov     x9, v0.d[1]

        eor     x0, x9, x8

        ret

.LBB0_4:

        movi    v0.2d, #0000000000000000

        fmov    x8, d0

        mov     x9, v0.d[1]

        eor     x0, x9, x8

        ret

Similar things are output on SSE2, NEON32, etc.

#pragma clang loop vectorize(disable) needs to completely shut off

vectorization for the loop.

Need I say more?</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>