[LLVMbugs] [Bug 21711] New: [X86][AVX] separate stores are not being merged into a single 256bit store.
bugzilla-daemon at llvm.org
Tue Dec 2 10:41:39 PST 2014
http://llvm.org/bugs/show_bug.cgi?id=21711
Bug ID: 21711
Summary: [X86][AVX] separate stores are not being merged into a
single 256bit store.
Product: libraries
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: normal
Priority: P
Component: Backend: X86
Assignee: unassignedbugs at nondot.org
Reporter: andrea.dibiagio at gmail.com
CC: llvmbugs at cs.uiuc.edu
Classification: Unclassified
This is similar to bug 21709. However, bug 21709 is about 256-bit loads; this one is about store instructions.
Here is the source code:
////// foo.c //////
#include <x86intrin.h>
void f1(__m128 v, float *ptr) {
ptr[0] = v[0];
ptr[1] = v[1];
ptr[2] = v[2];
ptr[3] = v[3];
}
void f2(__m256 v, float *ptr) {
ptr[0] = v[0];
ptr[1] = v[1];
ptr[2] = v[2];
ptr[3] = v[3];
ptr[4] = v[4];
ptr[5] = v[5];
ptr[6] = v[6];
ptr[7] = v[7];
}
void f3(__m256 v, __m128 *ptr) {
ptr[0] = (__m128) __builtin_shufflevector(v, v, 0, 1, 2, 3);
ptr[1] = (__m128) __builtin_shufflevector(v, v, 4, 5, 6, 7);
}
/////////////
$ clang foo.c -march=btver2 -O2 -S -o - -emit-llvm
Function 'f1' is correctly vectorized into:
define void @f1(<4 x float> %v, float* nocapture %ptr) {
entry:
%0 = bitcast float* %ptr to <4 x float>*
store <4 x float> %v, <4 x float>* %0, align 4, !tbaa !1
ret void
}
This helps the backend efficiently generate a single vmovups for it.
(Here is the codegen for 'f1' with `llc -mcpu=btver2`.)
f1:
vmovups %xmm0, (%rdi)
retq
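For reference, output like the above can be reproduced with a pipeline along the following lines (the exact invocation here is an assumption, not quoted from the report):
$ clang foo.c -march=btver2 -O2 -S -emit-llvm -o - | llc -mcpu=btver2 -o -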
--
However, function 'f2' doesn't get vectorized and we end up with the following
IR:
define void @f2(<8 x float> %v, float* nocapture %ptr) {
entry:
%vecext = extractelement <8 x float> %v, i32 0
store float %vecext, float* %ptr, align 4, !tbaa !1
%vecext1 = extractelement <8 x float> %v, i32 1
%arrayidx2 = getelementptr inbounds float* %ptr, i64 1
store float %vecext1, float* %arrayidx2, align 4, !tbaa !1
%vecext3 = extractelement <8 x float> %v, i32 2
%arrayidx4 = getelementptr inbounds float* %ptr, i64 2
store float %vecext3, float* %arrayidx4, align 4, !tbaa !1
%vecext5 = extractelement <8 x float> %v, i32 3
%arrayidx6 = getelementptr inbounds float* %ptr, i64 3
store float %vecext5, float* %arrayidx6, align 4, !tbaa !1
%vecext7 = extractelement <8 x float> %v, i32 4
%arrayidx8 = getelementptr inbounds float* %ptr, i64 4
store float %vecext7, float* %arrayidx8, align 4, !tbaa !1
%vecext9 = extractelement <8 x float> %v, i32 5
%arrayidx10 = getelementptr inbounds float* %ptr, i64 5
store float %vecext9, float* %arrayidx10, align 4, !tbaa !1
%vecext11 = extractelement <8 x float> %v, i32 6
%arrayidx12 = getelementptr inbounds float* %ptr, i64 6
store float %vecext11, float* %arrayidx12, align 4, !tbaa !1
%vecext13 = extractelement <8 x float> %v, i32 7
%arrayidx14 = getelementptr inbounds float* %ptr, i64 7
store float %vecext13, float* %arrayidx14, align 4, !tbaa !1
ret void
}
The assembly generated for function 'f2' is therefore quite horrible:
f2:
vmovss %xmm0, (%rdi)
vextractps $1, %xmm0, 4(%rdi)
vextractps $2, %xmm0, 8(%rdi)
vextractps $3, %xmm0, 12(%rdi)
vextractf128 $1, %ymm0, %xmm0
vmovss %xmm0, 16(%rdi)
vextractps $1, %xmm0, 20(%rdi)
vextractps $2, %xmm0, 24(%rdi)
vextractps $3, %xmm0, 28(%rdi)
vzeroupper
retq
On AVX targets with feature FastUAMem and !SlowUAMem32, the codegen should instead be a single unaligned 256-bit store:
vmovups %ymm0, (%rdi)
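For comparison, here is a sketch of the IR we would ideally want for 'f2', written by analogy with 'f1' (this exact form is an assumption, not actual compiler output):
define void @f2(<8 x float> %v, float* nocapture %ptr) {
entry:
; single wide store; the pointer is only known to be 4-byte aligned
%0 = bitcast float* %ptr to <8 x float>*
store <8 x float> %v, <8 x float>* %0, align 4, !tbaa !1
ret void
}
Given this form, the backend can select the single vmovups above on targets where unaligned 32-byte stores are fast.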
Function 'f3' is already vectorized, and its LLVM IR looks much better than that of 'f2'.
define void @f3(<8 x float> %v, <4 x float>* nocapture %ptr) {
entry:
%shuffle = shufflevector <8 x float> %v, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
store <4 x float> %shuffle, <4 x float>* %ptr, align 16, !tbaa !5
%shuffle1 = shufflevector <8 x float> %v, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%arrayidx2 = getelementptr inbounds <4 x float>* %ptr, i64 1
store <4 x float> %shuffle1, <4 x float>* %arrayidx2, align 16, !tbaa !5
ret void
}
However, the compiler should further vectorize that code by coalescing the two stores.
With -mcpu=btver2, llc generates the following assembly for function 'f3'.
f3:
vmovaps %xmm0, (%rdi)
vextractf128 $1, %ymm0, 16(%rdi)
vzeroupper
retq
Ideally, for btver2 (and other !SlowUAMem32 targets) we should get a 256-bit
store:
vmovups %ymm0, (%rdi)
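That 256-bit store would fall out naturally if the two stores were already coalesced at the IR level. Here is a sketch of what the coalesced IR for 'f3' could look like (hypothetical, written by analogy with 'f1'; not actual compiler output):
define void @f3(<8 x float> %v, <4 x float>* nocapture %ptr) {
entry:
; the two 16-byte stores collapse into a single 32-byte store of the whole vector
%0 = bitcast <4 x float>* %ptr to <8 x float>*
store <8 x float> %v, <8 x float>* %0, align 16, !tbaa !5
ret void
}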