<html>

    <head>

      <base href="https://llvm.org/bugs/" />

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW --- - Missed optimisation - horizontal max for vectors is not optimised."

   href="https://llvm.org/bugs/show_bug.cgi?id=23116">23116</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>Missed optimisation - horizontal max for vectors is not optimised.

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>new-bugs

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Windows NT

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>normal

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>new bugs

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>nick@indigorenderer.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvmbugs@cs.uiuc.edu

          </td>

        </tr>

        <tr>

          <th>Classification</th>

          <td>Unclassified

          </td>

        </tr></table>

      <p>

        <div>

        <pre>Computing the horizontal max (or min, etc.) of a vector is not optimised to

faster code.

For example, the C code:

#include &lt;immintrin.h&gt;

inline float max(float a, float b)

{

    return a > b ? a : b;

}

float findMax(__m256 v)

{

     return max(max(max(max(max(max(max(v[0], v[1]), v[2]), v[3]), v[4]),

v[5]), v[6]), v[7]);

}

Is compiled by Clang 3.7/trunk to:

findMax(float __vector(8)):                        # @findMax(float

__vector(8))

    vmovshdup    %xmm0, %xmm1    # xmm1 = xmm0[1,1,3,3]

    vmaxss    %xmm1, %xmm0, %xmm1

    vpermilpd    $1, %xmm0, %xmm2 # xmm2 = xmm0[1,0]

    vmaxss    %xmm2, %xmm1, %xmm1

    vpermilps    $231, %xmm0, %xmm2 # xmm2 = xmm0[3,1,2,3]

    vmaxss    %xmm2, %xmm1, %xmm1

    vextractf128    $1, %ymm0, %xmm0

    vmaxss    %xmm0, %xmm1, %xmm1

    vmovshdup    %xmm0, %xmm2    # xmm2 = xmm0[1,1,3,3]

    vmaxss    %xmm2, %xmm1, %xmm1

    vpermilpd    $1, %xmm0, %xmm2 # xmm2 = xmm0[1,0]

    vmaxss    %xmm2, %xmm1, %xmm1

    vpermilps    $231, %xmm0, %xmm0 # xmm0 = xmm0[3,1,2,3]

    vmaxss    %xmm0, %xmm1, %xmm0

    vzeroupper

    retq

Which is basically 7 vmaxss's in serial (slow).

It could be optimised to something like:

float findMax(__m256 v)

{

    __m128 a = _mm256_extractf128_ps(v, 0);

    __m128 b = _mm256_extractf128_ps(v, 1);

    __m128 c = _mm_max_ps(a, b);

    return max(max(c[0], c[1]), max(c[2], c[3]));

}

Which compiles to:

findMax(float __vector(8)):                        # @findMax(float

__vector(8))

    vextractf128    $1, %ymm0, %xmm1

    vmaxps    %xmm1, %xmm0, %xmm0

    vmovshdup    %xmm0, %xmm1    # xmm1 = xmm0[1,1,3,3]

    vmaxss    %xmm1, %xmm0, %xmm1

    vpermilpd    $1, %xmm0, %xmm2 # xmm2 = xmm0[1,0]

    vpermilps    $231, %xmm0, %xmm0 # xmm0 = xmm0[3,1,2,3]

    vmaxss    %xmm0, %xmm2, %xmm0

    vmaxss    %xmm0, %xmm1, %xmm0

    vzeroupper

    retq

See <a href="http://goo.gl/jM3KNz">http://goo.gl/jM3KNz</a> for the code.</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>