[llvm-bugs] [Bug 27942] New: Matrix std::inner_product optimization issues

via llvm-bugs llvm-bugs at lists.llvm.org
Mon May 30 16:35:54 PDT 2016


https://llvm.org/bugs/show_bug.cgi?id=27942

            Bug ID: 27942
           Summary: Matrix std::inner_product optimization issues
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Backend: X86
          Assignee: unassignedbugs at nondot.org
          Reporter: peet.kat at yandex.com
                CC: llvm-bugs at lists.llvm.org
    Classification: Unclassified

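Clang/LLVM generates inefficient code for a 3x3 float matrix multiplication
(Matrix::operator*) whose elements are computed with std::inner_product.
As the listings below show, clang does not inline std::inner_product:
operator* makes three out-of-line calls per loop iteration, copies the iterator
arguments through the stack before each call, and the callee keeps its
assertion checks (calls to __assert_fail). g++ instead emits straight-line
mulss/addss code with no calls or branches.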

Reproduction code:
https://gist.github.com/cynecx/e1b44a67e74bac257ed9e632c982fc7f

The code clang generates:

===================================

Matrix<(unsigned short)3, (unsigned short)3, float> Matrix<(unsigned short)3,
(unsigned short)3, float>::operator*<(unsigned short)3>(Matrix<(unsigned
short)3, (unsigned short)3, float> const&): # @Matrix<(unsigned short)3,
(unsigned short)3, float> Matrix<(unsigned short)3, (unsigned short)3,
float>::operator*<(unsigned short)3>(Matrix<(unsigned short)3, (unsigned
short)3, float> const&)
        push    r15
        push    r14
        push    r13
        push    r12
        push    rbx
        sub     rsp, 160
        mov     r14, rdx
        mov     r13, rsi
        mov     r15, rdi
        xorps   xmm0, xmm0
        movups  xmmword ptr [r15 + 16], xmm0
        movups  xmmword ptr [r15], xmm0
        mov     dword ptr [r15 + 32], 0
        lea     r12, [rsp + 98]
        xor     ebx, ebx
.LBB1_1:                                # %_ZN6MatrixILt3ELt3EfE14GetRowAccessorEt.exit1
        mov     qword ptr [rsp + 136], r13
        mov     qword ptr [rsp + 144], rbx
        mov     word ptr [rsp + 152], 0
        mov     qword ptr [rsp + 112], r13
        mov     qword ptr [rsp + 120], rbx
        mov     word ptr [rsp + 128], 3
        mov     qword ptr [rsp + 88], r14
        mov     word ptr [rsp + 96], 0
        mov     ax, word ptr [rsp + 86]
        mov     word ptr [r12 + 4], ax
        mov     eax, dword ptr [rsp + 82]
        mov     dword ptr [r12], eax
        mov     word ptr [rsp + 104], 0
        mov     rax, qword ptr [rsp + 104]
        mov     qword ptr [rsp + 64], rax
        movups  xmm0, xmmword ptr [rsp + 88]
        movups  xmmword ptr [rsp + 48], xmm0
        mov     rax, qword ptr [rsp + 128]
        mov     qword ptr [rsp + 40], rax
        movups  xmm0, xmmword ptr [rsp + 112]
        movups  xmmword ptr [rsp + 24], xmm0
        mov     rax, qword ptr [rsp + 152]
        mov     qword ptr [rsp + 16], rax
        movups  xmm0, xmmword ptr [rsp + 136]
        movups  xmmword ptr [rsp], xmm0
        xorps   xmm0, xmm0
        call    float std::inner_product<Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned
short)3>, Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixColumnAccessor<std::array<float, 9ul> const>, (unsigned short)3>,
float>(Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3>,
Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3>,
Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixColumnAccessor<std::array<float, 9ul> const>, (unsigned short)3>,
float)
        movss   dword ptr [r15 + 4*rbx], xmm0
        mov     qword ptr [rsp + 136], r13
        mov     qword ptr [rsp + 144], rbx
        mov     word ptr [rsp + 152], 0
        mov     qword ptr [rsp + 112], r13
        mov     qword ptr [rsp + 120], rbx
        mov     word ptr [rsp + 128], 3
        mov     qword ptr [rsp + 88], r14
        mov     word ptr [rsp + 96], 1
        mov     ax, word ptr [rsp + 86]
        mov     word ptr [r12 + 4], ax
        mov     eax, dword ptr [rsp + 82]
        mov     dword ptr [r12], eax
        mov     word ptr [rsp + 104], 0
        mov     rax, qword ptr [rsp + 104]
        mov     qword ptr [rsp + 64], rax
        movups  xmm0, xmmword ptr [rsp + 88]
        movups  xmmword ptr [rsp + 48], xmm0
        mov     rax, qword ptr [rsp + 128]
        mov     qword ptr [rsp + 40], rax
        movups  xmm0, xmmword ptr [rsp + 112]
        movups  xmmword ptr [rsp + 24], xmm0
        mov     rax, qword ptr [rsp + 152]
        mov     qword ptr [rsp + 16], rax
        movups  xmm0, xmmword ptr [rsp + 136]
        movups  xmmword ptr [rsp], xmm0
        xorps   xmm0, xmm0
        call    float std::inner_product<Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned
short)3>, Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixColumnAccessor<std::array<float, 9ul> const>, (unsigned short)3>,
float>(Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3>,
Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3>,
Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixColumnAccessor<std::array<float, 9ul> const>, (unsigned short)3>,
float)
        movss   dword ptr [r15 + 4*rbx + 4], xmm0
        mov     qword ptr [rsp + 136], r13
        mov     qword ptr [rsp + 144], rbx
        mov     word ptr [rsp + 152], 0
        mov     qword ptr [rsp + 112], r13
        mov     qword ptr [rsp + 120], rbx
        mov     word ptr [rsp + 128], 3
        mov     qword ptr [rsp + 88], r14
        mov     word ptr [rsp + 96], 2
        mov     ax, word ptr [rsp + 86]
        mov     word ptr [r12 + 4], ax
        mov     eax, dword ptr [rsp + 82]
        mov     dword ptr [r12], eax
        mov     word ptr [rsp + 104], 0
        mov     rax, qword ptr [rsp + 104]
        mov     qword ptr [rsp + 64], rax
        movups  xmm0, xmmword ptr [rsp + 88]
        movups  xmmword ptr [rsp + 48], xmm0
        mov     rax, qword ptr [rsp + 128]
        mov     qword ptr [rsp + 40], rax
        movups  xmm0, xmmword ptr [rsp + 112]
        movups  xmmword ptr [rsp + 24], xmm0
        mov     rax, qword ptr [rsp + 152]
        mov     qword ptr [rsp + 16], rax
        movups  xmm0, xmmword ptr [rsp + 136]
        movups  xmmword ptr [rsp], xmm0
        xorps   xmm0, xmm0
        call    float std::inner_product<Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned
short)3>, Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixColumnAccessor<std::array<float, 9ul> const>, (unsigned short)3>,
float>(Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3>,
Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3>,
Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixColumnAccessor<std::array<float, 9ul> const>, (unsigned short)3>,
float)
        movss   dword ptr [r15 + 4*rbx + 8], xmm0
        add     rbx, 3
        cmp     rbx, 9
        jne     .LBB1_1
        mov     rax, r15
        add     rsp, 160
        pop     rbx
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        ret

float std::inner_product<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3>,
Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixColumnAccessor<std::array<float, 9ul> const>, (unsigned short)3>,
float>(Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3>,
Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3>,
Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixColumnAccessor<std::array<float, 9ul> const>, (unsigned short)3>,
float): # @float std::inner_product<Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned
short)3>, Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixColumnAccessor<std::array<float, 9ul> const>, (unsigned short)3>,
float>(Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3>,
Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3>,
Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixColumnAccessor<std::array<float, 9ul> const>, (unsigned short)3>,
float)
        push    rbx
        lea     rax, [rsp + 64]
        lea     rcx, [rsp + 16]
        mov     rbx, qword ptr [rsp + 16]
        mov     r11, qword ptr [rsp + 24]
        mov     rsi, qword ptr [rsp + 48]
        movzx   r9d, word ptr [rsp + 56]
        movzx   r8d, word ptr [rsp + 72]
        mov     dx, word ptr [rsp + 32]
        movzx   edi, dx
        cmp     rbx, qword ptr [rsp + 40]
        je      .LBB2_1
        cmp     r11, rsi
        jne     .LBB2_17
.LBB2_13:                               # %_ZNK6MatrixILt3ELt3EfE22MatrixAccessorIteratorINS0_17MatrixRowAccessorISt5arrayIfLm9EEEELt3EEneERKS6_.exit.thread.us14
        movzx   esi, dx
        cmp     esi, 2
        ja      .LBB2_23
        movzx   esi, word ptr [rax + 16]
        cmp     esi, 2
        ja      .LBB2_24
        inc     edx
        mov     word ptr [rcx + 16], dx
        movzx   edi, dx
        cmp     edi, 3
        ja      .LBB2_25
        inc     esi
        mov     word ptr [rax + 16], si
        movzx   esi, si
        cmp     esi, 4
        jb      .LBB2_13
        jmp     .LBB2_21
.LBB2_17:                               # %Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned
short)3>::operator!=(Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3> const&)
const [clone .exit] [clone .thread]
        movzx   esi, dx
        cmp     esi, 3
        jae     .LBB2_23
        movzx   esi, word ptr [rax + 16]
        cmp     esi, 3
        jae     .LBB2_24
        inc     edx
        mov     word ptr [rcx + 16], dx
        movzx   edi, dx
        cmp     edi, 4
        jae     .LBB2_25
        inc     esi
        mov     word ptr [rax + 16], si
        movzx   esi, si
        cmp     esi, 4
        jb      .LBB2_17
        jmp     .LBB2_21
.LBB2_1:                                # %.split.us
        cmp     r11, rsi
        jne     .LBB2_8
        mov     r10, qword ptr [rsp + 64]
        lea     rsi, [rbx + 4*r11]
.LBB2_3:                                # %Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned
short)3>::operator!=(Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3> const&)
const [clone .exit] [clone .us] [clone .us]
        movzx   edx, di
        cmp     r9d, edx
        je      .LBB2_22
        movzx   edx, di
        cmp     edx, 2
        ja      .LBB2_23
        movzx   edx, word ptr [rax + 16]
        cmp     rdx, 2
        ja      .LBB2_24
        movss   xmm1, dword ptr [rsi + 4*rdi] # xmm1 = mem[0],zero,zero,zero
        lea     rbx, [rdx + 2*rdx]
        add     rbx, r8
        movss   xmm2, dword ptr [r10 + 4*rbx] # xmm2 = mem[0],zero,zero,zero
        lea     ebx, [rdi + 1]
        mov     word ptr [rcx + 16], bx
        movzx   ebx, bx
        cmp     ebx, 3
        ja      .LBB2_25
        mulss   xmm1, xmm2
        addss   xmm0, xmm1
        inc     edx
        mov     word ptr [rax + 16], dx
        movzx   edx, dx
        inc     rdi
        cmp     edx, 4
        jb      .LBB2_3
        jmp     .LBB2_21
.LBB2_8:                                # %Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned
short)3>::operator!=(Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3> const&)
const [clone .exit] [clone .thread] [clone .us]
        movzx   esi, dx
        cmp     esi, 2
        ja      .LBB2_23
        movzx   esi, word ptr [rax + 16]
        cmp     esi, 2
        ja      .LBB2_24
        inc     edx
        mov     word ptr [rcx + 16], dx
        movzx   edi, dx
        cmp     edi, 3
        ja      .LBB2_25
        inc     esi
        mov     word ptr [rax + 16], si
        movzx   esi, si
        cmp     esi, 4
        jb      .LBB2_8
.LBB2_21:                               # %.us-lcssa8.us
        mov     edi, .L.str.3
        mov     esi, .L.str.1
        mov     edx, 36
        mov     ecx, .L__PRETTY_FUNCTION__._ZN6MatrixILt3ELt3EfE22MatrixAccessorIteratorINS0_20MatrixColumnAccessorIKSt5arrayIfLm9EEEELt3EEppEv
        call    __assert_fail
.LBB2_22:                               # %.us-lcssa.us
        pop     rbx
        ret
.LBB2_23:                               # %.us-lcssa5.us
        mov     edi, .L.str.2
        mov     esi, .L.str.1
        mov     edx, 87
        mov     ecx, .L__PRETTY_FUNCTION__._ZNK6MatrixILt3ELt3EfE17MatrixRowAccessorISt5arrayIfLm9EEEixEt
        call    __assert_fail
.LBB2_24:                               # %.us-lcssa6.us
        mov     edi, .L.str
        mov     esi, .L.str.1
        mov     edx, 134
        mov     ecx, .L__PRETTY_FUNCTION__._ZNK6MatrixILt3ELt3EfE20MatrixColumnAccessorIKSt5arrayIfLm9EEEixEt
        call    __assert_fail
.LBB2_25:                               # %.us-lcssa7.us
        mov     edi, .L.str.3
        mov     esi, .L.str.1
        mov     edx, 36
        mov     ecx, .L__PRETTY_FUNCTION__._ZN6MatrixILt3ELt3EfE22MatrixAccessorIteratorINS0_17MatrixRowAccessorISt5arrayIfLm9EEEELt3EEppEv
        call    __assert_fail

===================================

The code g++ generates:

Matrix<(unsigned short)3, (unsigned short)3, float> Matrix<(unsigned short)3,
(unsigned short)3, float>::operator*<(unsigned short)3>(Matrix<(unsigned
short)3, (unsigned short)3, float> const&):
        movss   xmm8, DWORD PTR [rsi]
        mov     rax, rdi
        movaps  xmm5, xmm8
        movss   xmm2, DWORD PTR [rdx]
        pxor    xmm0, xmm0
        mulss   xmm5, xmm2
        movss   xmm7, DWORD PTR [rsi+4]
        movaps  xmm4, xmm7
        movss   xmm1, DWORD PTR [rdx+12]
        movaps  xmm11, xmm8
        mulss   xmm4, xmm1
        movss   xmm3, DWORD PTR [rsi+8]
        addss   xmm5, xmm0
        movss   xmm10, DWORD PTR [rdx+24]
        movss   xmm6, DWORD PTR [rdx+4]
        mulss   xmm11, xmm6
        movaps  xmm9, xmm7
        movaps  xmm14, xmm2
        addss   xmm5, xmm4
        movaps  xmm4, xmm3
        movss   xmm12, DWORD PTR [rsi+12]
        mulss   xmm4, xmm10
        addss   xmm11, xmm0
        mulss   xmm14, xmm12
        movaps  xmm13, xmm1
        addss   xmm4, xmm5
        movss   xmm5, DWORD PTR [rdx+16]
        mulss   xmm9, xmm5
        addss   xmm14, xmm0
        movss   DWORD PTR [rdi], xmm4
        addss   xmm11, xmm9
        movaps  xmm9, xmm3
        movss   xmm4, DWORD PTR [rdx+28]
        mulss   xmm9, xmm4
        addss   xmm9, xmm11
        movss   DWORD PTR [rdi+4], xmm9
        movss   xmm9, DWORD PTR [rdx+8]
        mulss   xmm8, xmm9
        movaps  xmm11, xmm8
        movss   xmm8, DWORD PTR [rdx+20]
        mulss   xmm7, xmm8
        addss   xmm11, xmm0
        addss   xmm11, xmm7
        movss   xmm7, DWORD PTR [rdx+32]
        mulss   xmm3, xmm7
        addss   xmm3, xmm11
        movss   xmm11, DWORD PTR [rsi+16]
        mulss   xmm13, xmm11
        movss   DWORD PTR [rdi+8], xmm3
        addss   xmm14, xmm13
        movaps  xmm13, xmm10
        movss   xmm3, DWORD PTR [rsi+20]
        mulss   xmm13, xmm3
        addss   xmm13, xmm14
        movaps  xmm14, xmm6
        mulss   xmm14, xmm12
        mulss   xmm12, xmm9
        movss   DWORD PTR [rdi+12], xmm13
        movaps  xmm13, xmm5
        addss   xmm14, xmm0
        mulss   xmm13, xmm11
        addss   xmm12, xmm0
        mulss   xmm11, xmm8
        addss   xmm14, xmm13
        movaps  xmm13, xmm4
        addss   xmm11, xmm12
        mulss   xmm13, xmm3
        mulss   xmm3, xmm7
        addss   xmm13, xmm14
        addss   xmm3, xmm11
        movss   DWORD PTR [rdi+16], xmm13
        movss   DWORD PTR [rdi+20], xmm3
        movss   xmm3, DWORD PTR [rsi+24]
        mulss   xmm2, xmm3
        mulss   xmm6, xmm3
        mulss   xmm3, xmm9
        movaps  xmm11, xmm2
        movss   xmm2, DWORD PTR [rsi+28]
        addss   xmm6, xmm0
        mulss   xmm1, xmm2
        addss   xmm11, xmm0
        mulss   xmm5, xmm2
        addss   xmm0, xmm3
        mulss   xmm2, xmm8
        addss   xmm11, xmm1
        movss   xmm1, DWORD PTR [rsi+32]
        mulss   xmm10, xmm1
        addss   xmm5, xmm6
        mulss   xmm4, xmm1
        addss   xmm2, xmm0
        mulss   xmm1, xmm7
        addss   xmm10, xmm11
        addss   xmm4, xmm5
        addss   xmm1, xmm2
        movss   DWORD PTR [rdi+24], xmm10
        movss   DWORD PTR [rdi+28], xmm4
        movss   DWORD PTR [rdi+32], xmm1
        ret

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20160530/5e1838af/attachment-0001.html>


More information about the llvm-bugs mailing list