[LLVMbugs] [Bug 21711] New: [X86][AVX] separate stores are not being merged into a single 256bit store.
bugzilla-daemon at llvm.org
Tue Dec 2 10:41:39 PST 2014
http://llvm.org/bugs/show_bug.cgi?id=21711
Bug ID: 21711
Summary: [X86][AVX] separate stores are not being merged into a
single 256bit store.
Product: libraries
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: normal
Priority: P
Component: Backend: X86
Assignee: unassignedbugs at nondot.org
Reporter: andrea.dibiagio at gmail.com
CC: llvmbugs at cs.uiuc.edu
Classification: Unclassified
This is similar to bug 21709. However, bug 21709 is about 256-bit loads; this one is about store instructions.
Here is the source code:
////// foo.c //////
#include <x86intrin.h>
void f1(__m128 v, float *ptr) {
ptr[0] = v[0];
ptr[1] = v[1];
ptr[2] = v[2];
ptr[3] = v[3];
}
void f2(__m256 v, float *ptr) {
ptr[0] = v[0];
ptr[1] = v[1];
ptr[2] = v[2];
ptr[3] = v[3];
ptr[4] = v[4];
ptr[5] = v[5];
ptr[6] = v[6];
ptr[7] = v[7];
}
void f3(__m256 v, __m128 *ptr) {
ptr[0] = (__m128) __builtin_shufflevector(v, v, 0, 1, 2, 3);
ptr[1] = (__m128) __builtin_shufflevector(v, v, 4, 5, 6, 7);
}
/////////////
$ clang foo.c -march=btver2 -O2 -S -o - -emit-llvm
Function 'f1' is correctly vectorized into:
define void @f1(<4 x float> %v, float* nocapture %ptr) {
entry:
%0 = bitcast float* %ptr to <4 x float>*
store <4 x float> %v, <4 x float>* %0, align 4, !tbaa !1
ret void
}
This helps the backend efficiently generate a single vmovups for it.
(Here is the codegen for 'f1' with `llc -mcpu=btver2`.)
f1:
vmovups %xmm0, (%rdi)
retq
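For reference, output like the above can be reproduced with a pipeline along the following lines (the exact invocation here is an assumption, not quoted from the report):
$ clang foo.c -march=btver2 -O2 -S -emit-llvm -o - | llc -mcpu=btver2 -o -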
--
However, function 'f2' doesn't get vectorized and we end up with the following
IR:
define void @f2(<8 x float> %v, float* nocapture %ptr) {
entry:
%vecext = extractelement <8 x float> %v, i32 0
store float %vecext, float* %ptr, align 4, !tbaa !1
%vecext1 = extractelement <8 x float> %v, i32 1
%arrayidx2 = getelementptr inbounds float* %ptr, i64 1
store float %vecext1, float* %arrayidx2, align 4, !tbaa !1
%vecext3 = extractelement <8 x float> %v, i32 2
%arrayidx4 = getelementptr inbounds float* %ptr, i64 2
store float %vecext3, float* %arrayidx4, align 4, !tbaa !1
%vecext5 = extractelement <8 x float> %v, i32 3
%arrayidx6 = getelementptr inbounds float* %ptr, i64 3
store float %vecext5, float* %arrayidx6, align 4, !tbaa !1
%vecext7 = extractelement <8 x float> %v, i32 4
%arrayidx8 = getelementptr inbounds float* %ptr, i64 4
store float %vecext7, float* %arrayidx8, align 4, !tbaa !1
%vecext9 = extractelement <8 x float> %v, i32 5
%arrayidx10 = getelementptr inbounds float* %ptr, i64 5
store float %vecext9, float* %arrayidx10, align 4, !tbaa !1
%vecext11 = extractelement <8 x float> %v, i32 6
%arrayidx12 = getelementptr inbounds float* %ptr, i64 6
store float %vecext11, float* %arrayidx12, align 4, !tbaa !1
%vecext13 = extractelement <8 x float> %v, i32 7
%arrayidx14 = getelementptr inbounds float* %ptr, i64 7
store float %vecext13, float* %arrayidx14, align 4, !tbaa !1
ret void
}
The assembly generated for function 'f2' is therefore quite horrible:
f2:
vmovss %xmm0, (%rdi)
vextractps $1, %xmm0, 4(%rdi)
vextractps $2, %xmm0, 8(%rdi)
vextractps $3, %xmm0, 12(%rdi)
vextractf128 $1, %ymm0, %xmm0
vmovss %xmm0, 16(%rdi)
vextractps $1, %xmm0, 20(%rdi)
vextractps $2, %xmm0, 24(%rdi)
vextractps $3, %xmm0, 28(%rdi)
vzeroupper
retq
On AVX targets with feature FastUAMem and !SlowUAMem32, the codegen should instead be a single unaligned 256-bit store:
vmovups %ymm0, (%rdi)
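For comparison, here is a sketch of the IR we would ideally want for 'f2', written by analogy with 'f1' (this exact form is an assumption, not actual compiler output):
define void @f2(<8 x float> %v, float* nocapture %ptr) {
entry:
; single wide store; the pointer is only known to be 4-byte aligned
%0 = bitcast float* %ptr to <8 x float>*
store <8 x float> %v, <8 x float>* %0, align 4, !tbaa !1
ret void
}
Given this form, the backend can select the single vmovups above on targets where unaligned 32-byte stores are fast.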
Function 'f3' is already vectorized, and its LLVM IR looks much better than that of 'f2'.
define void @f3(<8 x float> %v, <4 x float>* nocapture %ptr) {
entry:
%shuffle = shufflevector <8 x float> %v, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
store <4 x float> %shuffle, <4 x float>* %ptr, align 16, !tbaa !5
%shuffle1 = shufflevector <8 x float> %v, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%arrayidx2 = getelementptr inbounds <4 x float>* %ptr, i64 1
store <4 x float> %shuffle1, <4 x float>* %arrayidx2, align 16, !tbaa !5
ret void
}
However, the compiler should further vectorize that code by coalescing the two stores.
With -mcpu=btver2, llc generates the following assembly for function 'f3'.
f3:
vmovaps %xmm0, (%rdi)
vextractf128 $1, %ymm0, 16(%rdi)
vzeroupper
retq
Ideally, for btver2 (and other !SlowUAMem32 targets) we should get a 256-bit
store:
vmovups %ymm0, (%rdi)
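That 256-bit store would fall out naturally if the two stores were already coalesced at the IR level. Here is a sketch of what the coalesced IR for 'f3' could look like (hypothetical, written by analogy with 'f1'; not actual compiler output):
define void @f3(<8 x float> %v, <4 x float>* nocapture %ptr) {
entry:
; the two 16-byte stores collapse into a single 32-byte store of the whole vector
%0 = bitcast <4 x float>* %ptr to <8 x float>*
store <8 x float> %v, <8 x float>* %0, align 16, !tbaa !5
ret void
}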