<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href=https://github.com/llvm/llvm-project/issues/105807>105807</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            Clang's "__builtin_reduce_min" algorithm for float vectors needs improvement when setting "-msse4.1".
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
            clang
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          WiwilZ
      </td>
    </tr>
</table>

<pre>
    &lt;https://godbolt.org/z/vW7j9Tene&gt;

Taking an f32x4 vector as an example:

"__builtin_reduce_min" currently emits the "min" + "cmpunord" instruction sequence 3 times:
```cpp
// f32x4: a 16-byte GNU vector of float, i.e. four 32-bit float lanes.
using f32x4 [[gnu::vector_size(16)]] = float;
// Baseline under discussion: reduces the lanes of `v` to a single float by
// delegating to Clang's __builtin_reduce_min. The assembly Clang generates
// for this function is the listing that follows.
float ReduceMin(f32x4 v) { 
 return __builtin_reduce_min(v); 
}
```
```asm
ReduceMin(float vector[4]):
        movaps  xmm1, xmm0
        movaps  xmm2, xmm0
 movshdup        xmm3, xmm0
        cmpunordss      xmm0, xmm0
 movaps  xmm4, xmm0
        andps   xmm4, xmm3
        minss   xmm3, xmm1
        shufps  xmm1, xmm1, 255
        movhlps xmm2, xmm2
 andnps  xmm0, xmm3
        orps    xmm0, xmm4
        movaps  xmm3, xmm2
        minss   xmm3, xmm0
        cmpunordss      xmm0, xmm0
 movaps  xmm4, xmm0
        andnps  xmm4, xmm3
        andps xmm0, xmm2
        orps    xmm0, xmm4
        movaps  xmm2, xmm1
 minss   xmm2, xmm0
        cmpunordss      xmm0, xmm0
 movaps  xmm3, xmm0
        andnps  xmm3, xmm2
        andps   xmm0, xmm1
        orps    xmm0, xmm3
        ret
```
but the same result can be computed with only 2 such sequences:
```cpp
// Proposed alternative to ReduceMin: computes the minimum of the four lanes
// of `a` with two min/cmpunord/blend steps (one packed, one scalar) instead
// of three. In each step, when the lane being tested is NaN, the other
// operand is selected.
// Lane lists in the comments below are written from the highest lane to the
// lowest; an empty slot means that lane's value is not used afterwards.
float MyReduceMin(__m128 a) {
    const __m128 v1 = _mm_movehdup_ps(a);  // {a[3], a[3], a[1], a[1]}
    // Step 1 (packed): pairwise minimum of (a[2], a[3]) and (a[0], a[1]).
    // {, isnan(a[2]) ? a[3] : min(a[2], a[3]), , isnan(a[0]) ? a[1] : min(a[0], a[1])}
    const __m128 min64 = _mm_blendv_ps(_mm_min_ps(v1, a), v1, _mm_cmpunord_ps(a, a)); 
    // Bring the lane-2 partial result down to lane 0 so it can be combined
    // with the lane-0 partial result.
    const __m128 v2 = _mm_shuffle_ps(min64, min64, 2);  // {, , , min64[2]}
    // Step 2 (scalar, lane 0 only): combine the two partial results.
    // {, , , isnan(min64[0]) ? min64[2] : min(min64[0], min64[2])}
    const __m128 min32 = _mm_blendv_ps(_mm_min_ss(v2, min64), v2,  _mm_cmpunord_ss(min64, min64)); 
    // The final result lives in lane 0; return it as a scalar float.
    return _mm_cvtss_f32(min32);
}
```
```asm
MyReduceMin(float vector[4]):
        movshdup        xmm2, xmm0
        movaps  xmm1, xmm2
        minps   xmm1, xmm0
        xorps   xmm3, xmm3
 cmpunordps      xmm0, xmm3
        blendvps        xmm1, xmm2, xmm0
 movaps  xmm3, xmm1
        shufps  xmm3, xmm1, 2
        movaps xmm2, xmm3
        minss   xmm2, xmm1
        cmpunordss      xmm1, xmm1
        movaps  xmm0, xmm1
        blendvps        xmm2, xmm3, xmm0
        movaps  xmm0, xmm2
        ret
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJy0V02PozgQ_TXOxeoIypDAgUOns7nNHFYjjbQXZMAJnsU2wobpmV-_MoaE8LG9H5pWK2Dnufzq2X7lUK35TTKWoPCEwvOOtqZUTfKVf-fVH7tMFT8SRN5KY2qNyCuCC4LLTRWZqsxeNTcEl58ILt3X47f4C5MMkd-Qd0beq_v8Qv9k2JQMXwm8B7hjuVENphpTidk7FXXFELxNRyCANM1aXhku04YVbc5SwSUCwK1mGiOAoUllYVu5qFupmsJ2calN0-aGK6kxwYYLph_xD577z-va9bSay9tArU__dJOtzZK8Oqap5j8Zgsg_IIhReEbhGSNyxtdKUYPIyYXpW_j3nusnyy0askUQY3Q8YQfDDTNtI_F6epFFIzKC0fE8Yz1rUi1cz9O8PRPHHYWnwFK2UQdt8fAnVEdrjfG7ED6CN_v0NhEwQwjV6bJo6xH6LgRZDzKujNZ3pLcMNs4TrAehsrCAKYLMuHKp9TMR_xmhy_Y6y7d_Qhgu0i6rWk_ThgFBZSGHGN46D9X0RKeIYFNVMgu_ncsvElXOEGRN9klw-B_JwmxZJlnOd9d_ynJDqkmWG3JPNpe3vnVW8pwp1TCzelKz1mBucE4lzhh2R73ARmFwvrTfciV3iD_9mB7sNBU-RJgOjvJgkCupDR6-7vzenVIhUqE6Zk9pWmsEER28BTv_tiEoCk-k94c3PH_35--jGdkZHyFGCAy2aOf-TD9jRC73kBiRfr17Eg76PGNsm2OXtxHJX0byFnzjJ5pPwgguD8Fdm6xisuicMr1WXLpG57uIjpRrWcC4G-9ijqCJYS8XA-4TWv-5VswN77nYEPcXWK7OKMoDN2i3vRJz-JqW01ATPadDFjN-oCuBv9VV97rCJF-nbd_zLK5eU2cp8lhF7dDOaJ1eCbiBZBDy31bQ56P2z2vovAxueNlKtV26_mhDG_X4ffChiZWNNjTqVy-tcuZUbn1G3IzQh-a6XVPJU01dTX4yxXb1npeJ7VrgryMnrDfsfEWCB6-PVm-jFC4LwK5ISBGTmO5Y4h8hCLwwDsNdmRQHYEXus2OcHeKMsSg-HGIaZPkxgDD2gx1PwIPAi4B4MYn9cB9HWViEcVTEfuhfIUCBxwTl1b6qOmEv4DuudcsS3wsj77iraMYq3d_lAfKKyhsCe4p3TWIHvGTtTaPAq7g2-hHCcFOx5M3Bj_0de-sOTqubargpBb6qBk_PisaSsUJjLupGdUwwafD3kkmsmTH2no0AXoTWLNj7CGC_a5sqmf2u4KZss32uBIKLZTc8XupGfWO5QXDps9UILkPCXQJ_BQAA__8h04Y6">