[llvm-bugs] [Bug 31933] New: AVX512: LLVM generates poor quality code involving masks

Fri Feb 10 12:36:27 PST 2017

https://llvm.org/bugs/show_bug.cgi?id=31933

            Bug ID: 31933
           Summary: AVX512: LLVM generates poor quality code involving
                    masks
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: All
            Status: NEW
          Severity: normal
          Priority: P
         Component: Backend: X86
          Assignee: unassignedbugs at nondot.org
          Reporter: wenzel.jakob at epfl.ch
                CC: llvm-bugs at lists.llvm.org
    Classification: Unclassified

Clang/LLVM (trunk) seems to have issues generating code involving bitwise
arithmetic of AVX512-style mask registers with sizes other than 16 bits.
Consider the following two versions of a (somewhat contrived) program that
computes three masks and then ORs them together:

#include <immintrin.h>

/*
 * Version 1: use _mm512_kor
 *
 * Computes three 8-bit lane masks from _CMP_LT_OQ comparisons of the
 * 8 x double vectors (a vs. b, b vs. c, c vs. a), ORs them together with the
 * _mm512_kor intrinsic, and uses the combined mask to blend a and b.
 *
 * NOTE(review): Intel's intrinsics guide declares _mm512_kor on __mmask16, so
 * the __mmask8 operands here are presumably widened at each call -- confirm.
 */
__m512d test1(__m512d a, __m512d b, __m512d c) {
    __mmask8 m1 = _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ);
    __mmask8 m2 = _mm512_cmp_pd_mask(b, c, _CMP_LT_OQ);
    __mmask8 m3 = _mm512_cmp_pd_mask(c, a, _CMP_LT_OQ);

    /* Combine the three masks through the mask-register intrinsic. */
    __mmask8 m = _mm512_kor(_mm512_kor(m1, m2), m3);

    return _mm512_mask_blend_pd(m, a, b);
}

/*
 * Version 2: use operator|
 *
 * Same computation as version 1 (three _CMP_LT_OQ comparison masks ORed
 * together, then used to blend a and b), except that the masks are combined
 * with the plain C bitwise-OR operator on the __mmask8 values instead of the
 * _mm512_kor intrinsic.
 */
__m512d test2(__m512d a, __m512d b, __m512d c) {
    __mmask8 m1 = _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ);
    __mmask8 m2 = _mm512_cmp_pd_mask(b, c, _CMP_LT_OQ);
    __mmask8 m3 = _mm512_cmp_pd_mask(c, a, _CMP_LT_OQ);

    /* Combine the three masks with ordinary integer OR. */
    __mmask8 m = m1 | m2 | m3;

    return _mm512_mask_blend_pd(m, a, b);
}

Version 1 (with _mm512_kor) generates the following code. Note the many
transitions between general-purpose registers (e*x) and mask registers (k*),
the moves with zero extension, etc.:

# Compiler output for test1 (Intel operand order: destination first).
# Every compare writes k0, so each result is immediately copied out to a
# general-purpose register; the values are then zero-extended from 8 bits and
# moved back into mask registers for the two korw instructions.
__Z5test1Dv8_dS_S_:
    # Three compares, each result spilled from k0 to eax / ecx / edx.
    vcmplt_oqpd    k0, zmm0, zmm1
    kmovw    eax, k0
    vcmplt_oqpd    k0, zmm1, zmm2
    kmovw    ecx, k0
    vcmplt_oqpd    k0, zmm2, zmm0
    kmovw    edx, k0
    # Zero-extend the first two 8-bit masks and move them back to k-registers.
    movzx    eax, al
    movzx    ecx, cl
    kmovw    k0, ecx
    kmovw    k1, eax
    korw    k0, k1, k0
    # Same round trip for the third mask, then the second OR.
    movzx    eax, dl
    kmovw    k1, eax
    korw    k0, k0, k1
    # The combined mask goes k0 -> eax -> k1 before it predicates the blend.
    kmovw    eax, k0
    kmovw    k1, eax
    vblendmpd    zmm0 {k1}, zmm0, zmm1
    ret

Version 2 is better but still not ideal due to the unnecessary transitions
between mask registers and regular registers.

# Compiler output for test2 (Intel operand order: destination first).
# The three compare results are still copied from k0 into general-purpose
# registers, but the OR is done there with two byte-sized `or` instructions
# and only the final value is moved back into a mask register.
__Z5test2Dv8_dS_S_:
    # Three compares, each result spilled from k0 to eax / ecx / edx.
    vcmplt_oqpd    k0, zmm0, zmm1
    kmovw    eax, k0
    vcmplt_oqpd    k0, zmm1, zmm2
    kmovw    ecx, k0
    vcmplt_oqpd    k0, zmm2, zmm0
    kmovw    edx, k0
    # cl accumulates the OR of all three masks.
    or    cl, al
    or    cl, dl
    # Single transfer back to a mask register to predicate the blend.
    kmovw    k1, ecx
    vblendmpd    zmm0 {k1}, zmm0, zmm1
    ret

What I would have expected to see is this, but it doesn't seem that Clang/LLVM
is able to generate it:

    # Expected: each compare writes its own mask register and the ORs are done
    # with korw, so no value ever leaves the k-registers.
    vcmplt_oqpd    k0, zmm0, zmm1
    vcmplt_oqpd    k1, zmm1, zmm2
    vcmplt_oqpd    k2, zmm2, zmm0
    korw    k0, k0, k1
    # The final OR targets k1 rather than k0: k0 cannot be selected as a write
    # mask (that encoding means "no masking"), so the blend must be predicated
    # on one of k1-k7.
    korw    k1, k0, k2
    vblendmpd    zmm0 {k1}, zmm0, zmm1

I think that either version 1 or version 2 (or both) should compile to this
sequence. It would be great if this could be fixed.

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20170210/9d9bf295/attachment.html>