<html>
<head>
<base href="https://bugs.llvm.org/">
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW - Vectorization of 8 byte load prevents load merging"
href="https://bugs.llvm.org/show_bug.cgi?id=42708">42708</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>Vectorization of 8 byte load prevents load merging
</td>
</tr>
<tr>
<th>Product</th>
<td>libraries
</td>
</tr>
<tr>
<th>Version</th>
<td>8.0
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>Linux
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>enhancement
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>Scalar Optimizations
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>chfast@gmail.com
</td>
</tr>
<tr>
<th>CC</th>
<td>llvm-bugs@lists.llvm.org
</td>
</tr></table>
<p>
<div>
<pre>The following C++ load64le() function implements loading of a little-endian
uint64_t.
using uint64_t = unsigned long;
uint64_t load64le(const unsigned char *bytes) noexcept {
return uint64_t{bytes[0]} | (uint64_t{bytes[1]} << 8) |
(uint64_t{bytes[2]} << 16) | (uint64_t{bytes[3]} << 24) |
(uint64_t{bytes[4]} << 32) | (uint64_t{bytes[5]} << 40) |
(uint64_t{bytes[6]} << 48) | (uint64_t{bytes[7]} << 56);
}
Before clang 8 this is properly optimized to a single mov instruction.
But starting from clang 8, when AVX2 support is enabled (-mavx2),
the shift left in the code is vectorized and we get a pretty crazy result:
-O2 -g0 -mavx2
.LCPI0_0:
.quad 8 # 0x8
.quad 16 # 0x10
.quad 24 # 0x18
.quad 32 # 0x20
load64le(unsigned char const*): # @load64le(unsigned
char const*)
# %bb.0:
vpmovzxbq ymm0, dword ptr [rdi + 1] # ymm0 =
mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
vpsllvq ymm0, ymm0, ymmword ptr [rip + .LCPI0_0]
movzx eax, byte ptr [rdi]
movzx ecx, byte ptr [rdi + 5]
shl rcx, 40
movzx edx, byte ptr [rdi + 6]
shl rdx, 48
or rdx, rcx
movzx ecx, byte ptr [rdi + 7]
shl rcx, 56
or rcx, rdx
or rcx, rax
vextracti128 xmm1, ymm0, 1
vpor xmm0, xmm0, xmm1
vpshufd xmm1, xmm0, 78 # xmm1 = xmm0[2,3,0,1]
vpor xmm0, xmm0, xmm1
vmovq rax, xmm0
or rax, rcx
vzeroupper
ret
# -- End function
Looking at the LLVM IR, here is the unoptimized one:
-O2 -g0 -mavx2 -emit-llvm -Xclang -disable-llvm-optzns
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nounwind uwtable
define dso_local i64 @_Z8load64lePKh(i8*) #0 {
%2 = alloca i8*, align 8
store i8* %0, i8** %2, align 8, !tbaa !2
%3 = load i8*, i8** %2, align 8, !tbaa !2
%4 = getelementptr inbounds i8, i8* %3, i64 0
%5 = load i8, i8* %4, align 1, !tbaa !6
%6 = zext i8 %5 to i64
%7 = load i8*, i8** %2, align 8, !tbaa !2
%8 = getelementptr inbounds i8, i8* %7, i64 1
%9 = load i8, i8* %8, align 1, !tbaa !6
%10 = zext i8 %9 to i64
%11 = shl i64 %10, 8
%12 = or i64 %6, %11
%13 = load i8*, i8** %2, align 8, !tbaa !2
%14 = getelementptr inbounds i8, i8* %13, i64 2
%15 = load i8, i8* %14, align 1, !tbaa !6
%16 = zext i8 %15 to i64
%17 = shl i64 %16, 16
%18 = or i64 %12, %17
%19 = load i8*, i8** %2, align 8, !tbaa !2
%20 = getelementptr inbounds i8, i8* %19, i64 3
%21 = load i8, i8* %20, align 1, !tbaa !6
%22 = zext i8 %21 to i64
%23 = shl i64 %22, 24
%24 = or i64 %18, %23
%25 = load i8*, i8** %2, align 8, !tbaa !2
%26 = getelementptr inbounds i8, i8* %25, i64 4
%27 = load i8, i8* %26, align 1, !tbaa !6
%28 = zext i8 %27 to i64
%29 = shl i64 %28, 32
%30 = or i64 %24, %29
%31 = load i8*, i8** %2, align 8, !tbaa !2
%32 = getelementptr inbounds i8, i8* %31, i64 5
%33 = load i8, i8* %32, align 1, !tbaa !6
%34 = zext i8 %33 to i64
%35 = shl i64 %34, 40
%36 = or i64 %30, %35
%37 = load i8*, i8** %2, align 8, !tbaa !2
%38 = getelementptr inbounds i8, i8* %37, i64 6
%39 = load i8, i8* %38, align 1, !tbaa !6
%40 = zext i8 %39 to i64
%41 = shl i64 %40, 48
%42 = or i64 %36, %41
%43 = load i8*, i8** %2, align 8, !tbaa !2
%44 = getelementptr inbounds i8, i8* %43, i64 7
%45 = load i8, i8* %44, align 1, !tbaa !6
%46 = zext i8 %45 to i64
%47 = shl i64 %46, 56
%48 = or i64 %42, %47
ret i64 %48
}
And the vectorized one:
-O2 -g0 -mavx2 -emit-llvm
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: norecurse nounwind readonly uwtable
define dso_local i64 @_Z8load64lePKh(i8* nocapture readonly) local_unnamed_addr
#0 {
%2 = load i8, i8* %0, align 1, !tbaa !2
%3 = zext i8 %2 to i64
%4 = getelementptr inbounds i8, i8* %0, i64 1
%5 = bitcast i8* %4 to <4 x i8>*
%6 = load <4 x i8>, <4 x i8>* %5, align 1, !tbaa !2
%7 = zext <4 x i8> %6 to <4 x i64>
%8 = shl nuw nsw &lt;4 x i64&gt; %7, &lt;i64 8, i64 16, i64 24, i64 32&gt;
%9 = getelementptr inbounds i8, i8* %0, i64 5
%10 = load i8, i8* %9, align 1, !tbaa !2
%11 = zext i8 %10 to i64
%12 = shl nuw nsw i64 %11, 40
%13 = getelementptr inbounds i8, i8* %0, i64 6
%14 = load i8, i8* %13, align 1, !tbaa !2
%15 = zext i8 %14 to i64
%16 = shl nuw nsw i64 %15, 48
%17 = getelementptr inbounds i8, i8* %0, i64 7
%18 = load i8, i8* %17, align 1, !tbaa !2
%19 = zext i8 %18 to i64
%20 = shl nuw i64 %19, 56
%21 = shufflevector &lt;4 x i64&gt; %8, &lt;4 x i64&gt; undef, &lt;4 x i32&gt; &lt;i32 2, i32 3,
i32 undef, i32 undef&gt;
%22 = or <4 x i64> %8, %21
%23 = shufflevector &lt;4 x i64&gt; %22, &lt;4 x i64&gt; undef, &lt;4 x i32&gt; &lt;i32 1, i32
undef, i32 undef, i32 undef&gt;
%24 = or <4 x i64> %22, %23
%25 = extractelement <4 x i64> %24, i32 0
%26 = or i64 %25, %12
%27 = or i64 %26, %16
%28 = or i64 %27, %20
%29 = or i64 %28, %3
ret i64 %29
}
This indicates that the "load merge" is happening in CodeGen. Perhaps a good
fix for the issue would be to merge loads in LLVM IR.</pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>