<html>

    <head>

      <base href="https://llvm.org/bugs/" />

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW --- - [x86] fmax/fmin calls should be optimized and inlined"

   href="https://llvm.org/bugs/show_bug.cgi?id=24475">24475</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>[x86] fmax/fmin calls should be optimized and inlined

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>All

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>normal

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Backend: X86

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>spatel+llvm@rotateright.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr>

        <tr>

          <th>Classification</th>

          <td>Unclassified

          </td>

        </tr></table>

      <p>

        <div>

        <pre>This is a follow-on to <a class="bz_bug_link 

          bz_status_NEW "

   title="NEW --- - [x86] fmax/fmin calls are not optimized with finite/fast math"

   href="show_bug.cgi?id=24314">bug 24314</a> (optimizing fmax/fmin with relaxed FP

constraints) and was originally suggested by Hal Finkel in

<a href="http://reviews.llvm.org/D11866">http://reviews.llvm.org/D11866</a>.

We should always be able to inline an optimized version of fmax/fmin - even

without the help of any FP relaxation settings.

x86 doesn't have HW min/max instructions that provide the behavior specified by

IEEE or the C standard with respect to NaN operands. Some architectures like

AArch64 have instructions that provide the exact behavior, so that should be an

easy match.

In reviewing the x86 C library implementations on Linux and Mac, I think we can

do better. Both of those use branches to handle the NaN cases. Here's a

branchless implementation with as few as 3 ops (cmpunordss / blendvps / maxss). 

I think this is compliant with the standards:

#include &lt;immintrin.h&gt;

#include &lt;stdio.h&gt;

#include &lt;string.h&gt;

#include &lt;math.h&gt;

#define HAS_SSE41 0

/*
 * Branchless fmaxf() built from SSE scalar intrinsics.
 *
 * Returns the larger of f1 and f2. If exactly one argument is a NaN, the
 * other argument is returned. If both are NaNs, the NaN passed as f2 is
 * returned.
 *
 * Set HAS_SSE41 to non-zero to use a single blendvps for the select step;
 * otherwise the select is emulated with and/andnot/or.
 */
float branchless_fmax(float f1, float f2) {
        __m128 v1 = _mm_set_ss(f1);
        __m128 v2 = _mm_set_ss(f2);

        // Is the 1st arg a NaN? Comparing it "unordered" against itself
        // produces the select mask used below.
        __m128 isnan1 = _mm_cmpunord_ss(v1, v1);

#if HAS_SSE41
        // If yes, replace it with the 2nd arg.
        __m128 nonan1 = _mm_blendv_ps(v1, v2, isnan1);
#else
        // A select (variable blend) the pre-SSE41 way.
        // (The temporaries are not named `and`/`andn`: `and` is an operator
        // token in C++ and a macro in <iso646.h>.)
        __m128 from_v2 = _mm_and_ps(isnan1, v2);
        __m128 from_v1 = _mm_andnot_ps(isnan1, v1);
        __m128 nonan1 = _mm_or_ps(from_v2, from_v1);
#endif

        // If the 1st arg is a NaN, this is a max of the 2nd arg against
        // itself.
        // If the 2nd arg is a NaN, this returns the 1st (if it wasn't a NaN).
        // If both args were NaN, this returns the NaN value of the 2nd arg
        // because that's the 2nd parameter to the max and that's how x86 HW
        // works.
        // If neither arg is a NaN, this is what we always wanted: the max of
        // the args!
        __m128 maxnum = _mm_max_ss(v2, nonan1);

        float result;
        _mm_store_ss(&result, maxnum);
        return result;
}

/*
 * Compares the C library's fmaxf() against branchless_fmax() for one pair
 * of inputs and prints a line if the two results are not bit-identical.
 *
 * i1, i2: raw bit patterns that are reinterpreted as the two float
 *         arguments (this lets callers pass specific NaN payloads).
 *
 * Nothing is printed when both implementations agree.
 */
void fmax_compare(int i1, int i2) {
        float f1, f2;

        // Reinterpret the integer bit patterns as floats. memcpy avoids the
        // aliasing problems of a pointer cast; sizeof replaces the magic 4.
        memcpy(&f1, &i1, sizeof f1);
        memcpy(&f2, &i2, sizeof f2);

        float max1 = fmaxf(f1, f2);
        float max2 = branchless_fmax(f1, f2);

        // Compare as bit patterns rather than with ==, so that NaN results
        // (which never compare equal as floats) can be checked as well.
        int m1, m2;
        memcpy(&m1, &max1, sizeof m1);
        memcpy(&m2, &max2, sizeof m2);

        if (m1 != m2)
                // %x expects an unsigned int, so cast the signed values.
                printf("%x %x: fmax = %f (%x), my_fmax = %f (%x)\n",
                        (unsigned)i1, (unsigned)i2,
                        max1, (unsigned)m1, max2, (unsigned)m2);
}

int main() {

        int i1,i2;

        // both nan

        i1 = 0x7f800001;

        i2 = 0x7f800002;

        fmax_compare(i1, i2);

        fmax_compare(i2, i1);

        // both numbers

        i1 = 0x3f8f0000;

        i2 = 0x3f800000;

        fmax_compare(i1, i2);

        fmax_compare(i2, i1);

        // nan, number

        i1 = 0x7f800001;

        i2 = 0x3f800000;

        fmax_compare(i1, i2);

        // number, nan

        fmax_compare(i2, i1);

        return 0;

}</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>