<!doctype html>
<html lang="en">
    <head>
      <!-- Encoding is declared first so it is known before any text is parsed. -->
      <meta charset="utf-8">
      <title>Bug 23349 - [SSE] scalar intrinsics don't fold loads</title>
      <!-- Relative URLs in this message resolve against the bug tracker. -->
      <base href="https://llvm.org/bugs/">
    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW --- - [SSE] scalar intrinsics don't fold loads"

   href="https://llvm.org/bugs/show_bug.cgi?id=23349">23349</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>[SSE] scalar intrinsics don't fold loads

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>All

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>normal

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Backend: X86

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>spatel+llvm@rotateright.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvmbugs@cs.uiuc.edu

          </td>

        </tr>

        <tr>

          <th>Classification</th>

          <td>Unclassified

          </td>

        </tr></table>

      <p>

        <div>

        <pre>#include <xmmintrin.h>

__m128 add_fold(__m128 x, float *y) {

        __m128 ld = _mm_load_ss(y);

        return _mm_add_ss(x, ld);

}

or IR test case for llc:

define <4 x float> @add_fold(<4 x float> %x, float* %y) #0 {

  %0 = load float, float* %y, align 1, !tbaa !2

  %vecext1.i = extractelement <4 x float> %x, i32 0

  %add.i = fadd float %vecext1.i, %0

  %vecins.i = insertelement <4 x float> %x, float %add.i, i32 0

  ret <4 x float> %vecins.i

}

----------------------------------------------------------------------------

The load should be folded into the math op, but:

$ clang -O1 addfold.c -S -o -

...

    movss    (%rdi), %xmm1           ## xmm1 = mem[0],zero,zero,zero

    addss    %xmm1, %xmm0

----------------------------------------------------------------------------

The load does get folded with non-intrinsic C (IR won't have extract/insert in

this case) or with vector ops:

float add_fold2(float x, float *y) {

        return x + *y;

}

__m128 add_fold3(__m128 x, float *y) {

        __m128 ld = _mm_load_ps(y);

        return _mm_add_ps(x, ld);

}

...

    addss    (%rdi), %xmm0

...

    addps    (%rdi), %xmm0</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>