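<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Bug 43539 - #pragma clang loop vectorize(disable) should actually disable vectorization</title>
<base href="https://bugs.llvm.org/">
</head>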
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW - #pragma clang loop vectorize(disable) should actually disable vectorization"
href="https://bugs.llvm.org/show_bug.cgi?id=43539">43539</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>#pragma clang loop vectorize(disable) should actually disable vectorization
</td>
</tr>
<tr>
<th>Product</th>
<td>clang
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>Other
</td>
</tr>
<tr>
<th>OS</th>
<td>Linux
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>enhancement
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>LLVM Codegen
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedclangbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>husseydevin@gmail.com
</td>
</tr>
<tr>
<th>CC</th>
<td>llvm-bugs@lists.llvm.org, neeilans@live.com, richard-llvm@metafoo.co.uk
</td>
</tr></table>
<p>
<div>
<pre>**#pragma clang loop vectorize(disable) is useless.**
It only works on tiny loops. Anything relatively complicated will be vectorized
regardless.
(I'm guessing that the cost model overrides it):
#include &lt;stdint.h&gt;
#include &lt;stddef.h&gt;
uint64_t some_func(uint64_t *p, size_t len)
{
uint64_t sum1 = 0, sum2 = 0;
len &= ~(size_t)3; // always multiple of 4
#pragma clang loop vectorize(disable)
for (size_t i = 0; i &lt; len; i++) {
sum1 += *p++ ^ 12345;
sum2 += *p++ ^ 12345;
}
return sum1 ^ sum2;
}
I'd expect this for aarch64 with -O3 (actual output from
-march=armv8-a+nosimd):
some_func:
// %bb.0:
mov x8, xzr
ands x10, x1, #0xfffffffffffffffc
b.eq .LBB0_4
// %bb.1:
mov x9, xzr
mov x11, xzr
mov x12, xzr
add x13, x0, #16
mov w14, #12345
.LBB0_2:
ldp x15, x16, [x13, #-16]
ldp x17, x0, [x13], #32
subs x10, x10, #2
eor x15, x15, x14
eor x17, x17, x14
eor x16, x16, x14
eor x0, x0, x14
add x11, x15, x11
add x12, x17, x12
add x8, x16, x8
add x9, x0, x9
b.ne .LBB0_2
// %bb.3:
add x10, x12, x11
add x8, x9, x8
.LBB0_4:
eor x0, x8, x10
ret
However, I get this, a clearly vectorized loop:
some_func:
// %bb.0:
ands x8, x1, #0xfffffffffffffffc
b.eq .LBB0_4
// %bb.1:
mov w10, #12345
add x9, x0, #16
movi v0.2d, #0000000000000000
dup v1.2d, x10
movi v2.2d, #0000000000000000
.LBB0_2:
ldp q3, q4, [x9, #-16]
subs x8, x8, #2
add x9, x9, #32
eor v3.16b, v3.16b, v1.16b
eor v4.16b, v4.16b, v1.16b
add v2.2d, v3.2d, v2.2d
add v0.2d, v4.2d, v0.2d
b.ne .LBB0_2
// %bb.3:
add v0.2d, v0.2d, v2.2d
fmov x8, d0
mov x9, v0.d[1]
eor x0, x9, x8
ret
.LBB0_4:
movi v0.2d, #0000000000000000
fmov x8, d0
mov x9, v0.d[1]
eor x0, x9, x8
ret
Similar things are output on SSE2, NEON32, etc.
#pragma clang loop vectorize(disable) needs to completely shut off
vectorization for the loop.
Need I say more?</pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>