<html>
<head>
<base href="https://bugs.llvm.org/">
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW - Missed SLP vectorization with umin/umax"
href="https://bugs.llvm.org/show_bug.cgi?id=46968">46968</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>Missed SLP vectorization with umin/umax
</td>
</tr>
<tr>
<th>Product</th>
<td>libraries
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>Linux
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>enhancement
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>Common Code Generator Code
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>david.bolvansky@gmail.com
</td>
</tr>
<tr>
<th>CC</th>
<td>llvm-bugs@lists.llvm.org
</td>
</tr></table>
<p>
<div>
<pre>Hot code - LightPixel - from Firefox (rasterflood-svg benchmark could be
faster):
#include&lt;stdint.h&gt;
#include&lt;stddef.h&gt;
const ptrdiff_t B8G8R8A8_COMPONENT_BYTEOFFSET_B = 0;
const ptrdiff_t B8G8R8A8_COMPONENT_BYTEOFFSET_G = 1;
const ptrdiff_t B8G8R8A8_COMPONENT_BYTEOFFSET_R = 2;
const ptrdiff_t B8G8R8A8_COMPONENT_BYTEOFFSET_A = 3;
static const int sInputIntPrecisionBits = 15;
static const int sOutputIntPrecisionBits = 15;
static const int sCacheIndexPrecisionBits = 7;
static inline unsigned umax(unsigned a, unsigned b) {
return a > b ? a : b;
}
static inline unsigned umin(unsigned a, unsigned b) {
return a > b ? b : a;
}
void foo(uint8_t components[4], uint32_t specularNHi, uint32_t aColor) {
components[B8G8R8A8_COMPONENT_BYTEOFFSET_B] =
umin((specularNHi * components[B8G8R8A8_COMPONENT_BYTEOFFSET_B]) >>
sOutputIntPrecisionBits,
255U);
components[B8G8R8A8_COMPONENT_BYTEOFFSET_G] =
umin((specularNHi *components[B8G8R8A8_COMPONENT_BYTEOFFSET_G]) >>
sOutputIntPrecisionBits,
255U);
components[B8G8R8A8_COMPONENT_BYTEOFFSET_R] =
umin((specularNHi * components[B8G8R8A8_COMPONENT_BYTEOFFSET_R]) >>
sOutputIntPrecisionBits,
255U);
components[B8G8R8A8_COMPONENT_BYTEOFFSET_A] =
umax(components[B8G8R8A8_COMPONENT_BYTEOFFSET_B],
umax(components[B8G8R8A8_COMPONENT_BYTEOFFSET_G],
components[B8G8R8A8_COMPONENT_BYTEOFFSET_R]));
}
-O3 -mavx2 - We got: List vectorization was possible but not beneficial with
cost 0 >= 0
foo(unsigned char*, unsigned int, unsigned int): #
@foo(unsigned char*, unsigned int, unsigned int)
movzx eax, byte ptr [rdi]
imul eax, esi
shr eax, 15
cmp eax, 255
mov r8d, 255
cmovae eax, r8d
mov byte ptr [rdi], al
movzx edx, byte ptr [rdi + 1]
imul edx, esi
shr edx, 15
cmp edx, 255
cmovae edx, r8d
mov byte ptr [rdi + 1], dl
movzx ecx, byte ptr [rdi + 2]
imul ecx, esi
shr ecx, 15
cmp ecx, 255
cmovae ecx, r8d
mov byte ptr [rdi + 2], cl
cmp edx, ecx
cmova ecx, edx
cmp eax, ecx
cmova ecx, eax
mov byte ptr [rdi + 3], cl
ret
-O3 -mavx512f - Clang partially vectorizes it:
.LCPI0_0:
.long 255 # 0xff
foo(unsigned char*, unsigned int, unsigned int): #
@foo(unsigned char*, unsigned int, unsigned int)
movzx eax, byte ptr [rdi]
imul eax, esi
shr eax, 15
cmp eax, 255
mov ecx, 255
cmovb ecx, eax
mov byte ptr [rdi], cl
movzx eax, word ptr [rdi + 1]
vmovd xmm0, eax
vpmovzxbd xmm0, xmm0 # xmm0 =
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
vmovd xmm1, esi
vpbroadcastd xmm1, xmm1
vpmulld xmm0, xmm1, xmm0
vpbroadcastd xmm1, dword ptr [rip + .LCPI0_0] # xmm1 =
[255,255,255,255]
vpsrld xmm0, xmm0, 15
vpminud xmm0, xmm0, xmm1
vmovd eax, xmm0
mov byte ptr [rdi + 1], al
vpextrd edx, xmm0, 1
mov byte ptr [rdi + 2], dl
cmp eax, edx
cmova edx, eax
cmp ecx, edx
cmova edx, ecx
mov byte ptr [rdi + 3], dl
ret
Godbolt: -Rpass-missed=vec*</pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>