<html>
<head>
<base href="http://llvm.org/bugs/" />
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW --- - [X86][AVX] separate stores are not being merged into a single 256bit store."
href="http://llvm.org/bugs/show_bug.cgi?id=21711">21711</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>[X86][AVX] separate stores are not being merged into a single 256bit store.
</td>
</tr>
<tr>
<th>Product</th>
<td>libraries
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>Linux
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>normal
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>Backend: X86
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>andrea.dibiagio@gmail.com
</td>
</tr>
<tr>
<th>CC</th>
<td>llvmbugs@cs.uiuc.edu
</td>
</tr>
<tr>
<th>Classification</th>
<td>Unclassified
</td>
</tr></table>
<p>
<div>
<pre>This is similar to <a class="bz_bug_link
bz_status_NEW "
title="NEW --- - [X86][AVX] separate 2x128bit loads are not being merged into a single 256bit load."
href="show_bug.cgi?id=21709">bug 21709</a>.
However, <a class="bz_bug_link
bz_status_NEW "
title="NEW --- - [X86][AVX] separate 2x128bit loads are not being merged into a single 256bit load."
href="show_bug.cgi?id=21709">bug 21709</a> is about 256-bit loads; this one is about store
instructions.
Here is the source code:
////// foo.c //////
#include <x86intrin.h>
void f1(__m128 v, float *ptr) {
ptr[0] = v[0];
ptr[1] = v[1];
ptr[2] = v[2];
ptr[3] = v[3];
}
void f2(__m256 v, float *ptr) {
ptr[0] = v[0];
ptr[1] = v[1];
ptr[2] = v[2];
ptr[3] = v[3];
ptr[4] = v[4];
ptr[5] = v[5];
ptr[6] = v[6];
ptr[7] = v[7];
}
void f3(__m256 v, __m128 *ptr) {
ptr[0] = (__m128) __builtin_shufflevector(v, v, 0, 1, 2, 3);
ptr[1] = (__m128) __builtin_shufflevector(v, v, 4, 5, 6, 7);
}
/////////////
$ clang foo.c -march=btver2 -O2 -S -o - -emit-llvm
Function 'f1' is correctly vectorized into:
define void @f1(<4 x float> %v, float* nocapture %ptr) {
entry:
%0 = bitcast float* %ptr to <4 x float>*
store <4 x float> %v, <4 x float>* %0, align 4, !tbaa !1
ret void
}
This lets the backend generate a single vmovups for the whole store.
(Here is the codegen for 'f1' with `llc -mcpu=btver2`.)
f1:
vmovups %xmm0, (%rdi)
retq
--
However, function 'f2' doesn't get vectorized and we end up with the following
IR:
define void @f2(<8 x float> %v, float* nocapture %ptr) {
entry:
%vecext = extractelement <8 x float> %v, i32 0
store float %vecext, float* %ptr, align 4, !tbaa !1
%vecext1 = extractelement <8 x float> %v, i32 1
%arrayidx2 = getelementptr inbounds float* %ptr, i64 1
store float %vecext1, float* %arrayidx2, align 4, !tbaa !1
%vecext3 = extractelement <8 x float> %v, i32 2
%arrayidx4 = getelementptr inbounds float* %ptr, i64 2
store float %vecext3, float* %arrayidx4, align 4, !tbaa !1
%vecext5 = extractelement <8 x float> %v, i32 3
%arrayidx6 = getelementptr inbounds float* %ptr, i64 3
store float %vecext5, float* %arrayidx6, align 4, !tbaa !1
%vecext7 = extractelement <8 x float> %v, i32 4
%arrayidx8 = getelementptr inbounds float* %ptr, i64 4
store float %vecext7, float* %arrayidx8, align 4, !tbaa !1
%vecext9 = extractelement <8 x float> %v, i32 5
%arrayidx10 = getelementptr inbounds float* %ptr, i64 5
store float %vecext9, float* %arrayidx10, align 4, !tbaa !1
%vecext11 = extractelement <8 x float> %v, i32 6
%arrayidx12 = getelementptr inbounds float* %ptr, i64 6
store float %vecext11, float* %arrayidx12, align 4, !tbaa !1
%vecext13 = extractelement <8 x float> %v, i32 7
%arrayidx14 = getelementptr inbounds float* %ptr, i64 7
store float %vecext13, float* %arrayidx14, align 4, !tbaa !1
ret void
}
The assembly generated for function 'f2' is therefore quite horrible:
f2:
vmovss %xmm0, (%rdi)
vextractps $1, %xmm0, 4(%rdi)
vextractps $2, %xmm0, 8(%rdi)
vextractps $3, %xmm0, 12(%rdi)
vextractf128 $1, %ymm0, %xmm0
vmovss %xmm0, 16(%rdi)
vextractps $1, %xmm0, 20(%rdi)
vextractps $2, %xmm0, 24(%rdi)
vextractps $3, %xmm0, 28(%rdi)
vzeroupper
retq
On AVX targets with feature FastUAMem and !SlowUAMem32, it should instead be:
vmovups %ymm0, (%rdi)
Function 'f3' is already vectorized and the LLVM IR looks much better than the
one from 'f2'.
define void @f3(<8 x float> %v, <4 x float>* nocapture %ptr) {
entry:
%shuffle = shufflevector <8 x float> %v, <8 x float> undef, <4 x i32> <i32 0,
i32 1, i32 2, i32 3>
store <4 x float> %shuffle, <4 x float>* %ptr, align 16, !tbaa !5
%shuffle1 = shufflevector <8 x float> %v, <8 x float> undef, <4 x i32> <i32
4, i32 5, i32 6, i32 7>
%arrayidx2 = getelementptr inbounds <4 x float>* %ptr, i64 1
store <4 x float> %shuffle1, <4 x float>* %arrayidx2, align 16, !tbaa !5
ret void
}
The compiler should, however, vectorize that code further by coalescing the two
stores into a single 256-bit store.
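For reference, the fully coalesced IR would look something like this (a sketch
in the same typed-pointer syntax as above, assuming the two consecutive
<4 x float> stores may legally be widened; it mirrors the merged form of 'f1'):

```llvm
define void @f3(<8 x float> %v, <4 x float>* nocapture %ptr) {
entry:
  %0 = bitcast <4 x float>* %ptr to <8 x float>*
  store <8 x float> %v, <8 x float>* %0, align 16, !tbaa !5
  ret void
}
```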
With -mcpu=btver2, llc generates the following assembly for function 'f3'.
f3:
vmovaps %xmm0, (%rdi)
vextractf128 $1, %ymm0, 16(%rdi)
vzeroupper
retq
Ideally, for btver2 (and other !SlowUAMem32 targets) we should get a 256-bit
store:
vmovups %ymm0, (%rdi)</pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>