[LLVMbugs] [Bug 21385] New: optimize reciprocals with fast-math (x86)

Sat Oct 25 15:57:27 PDT 2014

http://llvm.org/bugs/show_bug.cgi?id=21385

            Bug ID: 21385
           Summary: optimize reciprocals with fast-math (x86)
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: All
            Status: NEW
          Severity: normal
          Priority: P
         Component: Backend: X86
          Assignee: unassignedbugs at nondot.org
          Reporter: spatel+llvm at rotateright.com
                CC: llvmbugs at cs.uiuc.edu
    Classification: Unclassified

This is the sibling to bug 20900 (square root estimates).

We could be generating RCPSS / RCPPS instead of divisions when -ffast-math
allows it.

I thought this would be simpler than square root, but we'll need to again
measure the performance gains per CPU to decide where to enable it, and we may
need to offer a 2nd Newton-Raphson refinement implementation for x86 that lacks
FMA.

I've also written a test program (below) that tries to confirm the goodness of
the estimate. It shows that one N-R step is inadequate to get us to 1 ULP of
the IEEE result on both Intel Sandy Bridge and AMD Jaguar. 

Most unfortunately, we get the wrong answer for a reciprocal of 1.0f! This is
why RCPPS isn't generated with gcc -ffast-math.

Both Intel and AMD manuals say that the estimate instruction provides almost
12-bits of accuracy, and one N-R step should double that accuracy to 24-bits
which is good enough for a single-precision float, but no...that "almost
12-bits" is definitely not good enough. We need two N-R steps.

Test program:

#include <xmmintrin.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
float recip_ieee(float x) { return 1.0f/x; }

float recip_est(float x) {
    float est;
    __m128 vecX = { x,0,0,0 };
    __m128 vecEst = _mm_rcp_ss(vecX);
        _mm_store_ss(&est, vecEst);    
    return est;     
}

float recip_est_NR1(float x) {
    float est;
    __m128 vecX = { x,0,0,0 };
    __m128 vecEst = _mm_rcp_ss(vecX);
        _mm_store_ss(&est, vecEst);    

    est = est + est * (1 - est * x);
    return est;     
}

float recip_est_NR2(float x) {
    float est;
    __m128 vecX = { x,0,0,0 };
    __m128 vecEst = _mm_rcp_ss(vecX);
        _mm_store_ss(&est, vecEst);    

    est = est + est * (1 - est * x);
    est = est + est * (1 - est * x);
    return est;     
}

int main() {
    int i;
    unsigned int tests = 0;
    unsigned int not_exact_NR1 = 0;
    unsigned int not_exact_NR2 = 0;
    unsigned int bad_NR1 = 0;
    unsigned int bad_NR2 = 0;
    // ignore denorm inputs (anything less than 0x00800000)
    // ignore denorm outputs (any input over 0x7e800000)
    // ignore inf (0x7f800000)
    // ignore nans (anything more than 0x7f800000)
    // ignore negative (anything more than 0x80000000)
    for (i = 0x00800000; i < 0x7e800000; i++) {
        float f, ieee, est, est_nr1, est_nr2;
        int ieee_int, est_int, est_nr1_int, est_nr2_int;

        tests++;
        memcpy(&f, &i, 4);
        ieee = recip_ieee(f);
//        est = recip_est(f);
        est_nr1 = recip_est_NR1(f);
        est_nr2 = recip_est_NR2(f);
        memcpy(&ieee_int, &ieee, 4);
//        memcpy(&est_int, &est, 4);
        memcpy(&est_nr1_int, &est_nr1, 4);
        memcpy(&est_nr2_int, &est_nr2, 4);

//        if (i % 0x01000000 == 0) printf("0x%08x\n", i); // just a status
print...

        if (ieee != est_nr1) {

            not_exact_NR1++;
            if (abs(ieee_int - est_nr1_int) > 1) {
                bad_NR1++;
//                printf("0x%08x: %e, ieee = %e (0x%08x), est_nr1 = %e
(0x%08x), est_nr2 = %e (0x%08x)\n",
//                     i, f, ieee, ieee_int, est_nr1, est_nr1_int, est_nr2,
est_nr2_int);
            }
        }
        if (ieee != est_nr2) {
            not_exact_NR2++;
            if (abs(ieee_int - est_nr2_int) > 1) {
                bad_NR2++;
            }
        }
    }
    printf("Total tests = %d\n", tests);
    printf("Inexact results with one N-R step = %d\n", not_exact_NR1++); 
    printf("Results off by > 1 ULP with one N-R step = %d\n", bad_NR1);
    printf("Inexact results with two N-R steps = %d\n", not_exact_NR2++); 
    printf("Results off by > 1 ULP with two N-R steps = %d\n", bad_NR2);
    return 0;
}

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20141025/0d93727e/attachment.html>