[llvm-bugs] [Bug 27942] New: Matrix std::inner_product optimization issues
via llvm-bugs
llvm-bugs at lists.llvm.org
Mon May 30 16:35:54 PDT 2016
https://llvm.org/bugs/show_bug.cgi?id=27942
Bug ID: 27942
Summary: Matrix std::inner_product optimization issues
Product: libraries
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: enhancement
Priority: P
Component: Backend: X86
Assignee: unassignedbugs at nondot.org
Reporter: peet.kat at yandex.com
CC: llvm-bugs at lists.llvm.org
Classification: Unclassified
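Clang/LLVM generates inefficient code for the inner product of two matrices: in the listing below, operator* keeps three out-of-line calls to std::inner_product per loop iteration (and that function retains its __assert_fail paths), whereas g++ inlines and fully unrolls the same code into straight-line mulss/addss instructions with no calls.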
Reproduction code:
https://gist.github.com/cynecx/e1b44a67e74bac257ed9e632c982fc7f
The code clang generates:
===================================
Matrix<(unsigned short)3, (unsigned short)3, float> Matrix<(unsigned short)3,
(unsigned short)3, float>::operator*<(unsigned short)3>(Matrix<(unsigned
short)3, (unsigned short)3, float> const&): # @Matrix<(unsigned short)3,
(unsigned short)3, float> Matrix<(unsigned short)3, (unsigned short)3,
float>::operator*<(unsigned short)3>(Matrix<(unsigned short)3, (unsigned
short)3, float> const&)
push r15
push r14
push r13
push r12
push rbx
sub rsp, 160
mov r14, rdx
mov r13, rsi
mov r15, rdi
xorps xmm0, xmm0
movups xmmword ptr [r15 + 16], xmm0
movups xmmword ptr [r15], xmm0
mov dword ptr [r15 + 32], 0
lea r12, [rsp + 98]
xor ebx, ebx
.LBB1_1: # %_ZN6MatrixILt3ELt3EfE14GetRowAccessorEt.exit1
mov qword ptr [rsp + 136], r13
mov qword ptr [rsp + 144], rbx
mov word ptr [rsp + 152], 0
mov qword ptr [rsp + 112], r13
mov qword ptr [rsp + 120], rbx
mov word ptr [rsp + 128], 3
mov qword ptr [rsp + 88], r14
mov word ptr [rsp + 96], 0
mov ax, word ptr [rsp + 86]
mov word ptr [r12 + 4], ax
mov eax, dword ptr [rsp + 82]
mov dword ptr [r12], eax
mov word ptr [rsp + 104], 0
mov rax, qword ptr [rsp + 104]
mov qword ptr [rsp + 64], rax
movups xmm0, xmmword ptr [rsp + 88]
movups xmmword ptr [rsp + 48], xmm0
mov rax, qword ptr [rsp + 128]
mov qword ptr [rsp + 40], rax
movups xmm0, xmmword ptr [rsp + 112]
movups xmmword ptr [rsp + 24], xmm0
mov rax, qword ptr [rsp + 152]
mov qword ptr [rsp + 16], rax
movups xmm0, xmmword ptr [rsp + 136]
movups xmmword ptr [rsp], xmm0
xorps xmm0, xmm0
call float std::inner_product<Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned
short)3>, Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixColumnAccessor<std::array<float, 9ul> const>, (unsigned short)3>,
float>(Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3>,
Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3>,
Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixColumnAccessor<std::array<float, 9ul> const>, (unsigned short)3>,
float)
movss dword ptr [r15 + 4*rbx], xmm0
mov qword ptr [rsp + 136], r13
mov qword ptr [rsp + 144], rbx
mov word ptr [rsp + 152], 0
mov qword ptr [rsp + 112], r13
mov qword ptr [rsp + 120], rbx
mov word ptr [rsp + 128], 3
mov qword ptr [rsp + 88], r14
mov word ptr [rsp + 96], 1
mov ax, word ptr [rsp + 86]
mov word ptr [r12 + 4], ax
mov eax, dword ptr [rsp + 82]
mov dword ptr [r12], eax
mov word ptr [rsp + 104], 0
mov rax, qword ptr [rsp + 104]
mov qword ptr [rsp + 64], rax
movups xmm0, xmmword ptr [rsp + 88]
movups xmmword ptr [rsp + 48], xmm0
mov rax, qword ptr [rsp + 128]
mov qword ptr [rsp + 40], rax
movups xmm0, xmmword ptr [rsp + 112]
movups xmmword ptr [rsp + 24], xmm0
mov rax, qword ptr [rsp + 152]
mov qword ptr [rsp + 16], rax
movups xmm0, xmmword ptr [rsp + 136]
movups xmmword ptr [rsp], xmm0
xorps xmm0, xmm0
call float std::inner_product<Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned
short)3>, Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixColumnAccessor<std::array<float, 9ul> const>, (unsigned short)3>,
float>(Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3>,
Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3>,
Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixColumnAccessor<std::array<float, 9ul> const>, (unsigned short)3>,
float)
movss dword ptr [r15 + 4*rbx + 4], xmm0
mov qword ptr [rsp + 136], r13
mov qword ptr [rsp + 144], rbx
mov word ptr [rsp + 152], 0
mov qword ptr [rsp + 112], r13
mov qword ptr [rsp + 120], rbx
mov word ptr [rsp + 128], 3
mov qword ptr [rsp + 88], r14
mov word ptr [rsp + 96], 2
mov ax, word ptr [rsp + 86]
mov word ptr [r12 + 4], ax
mov eax, dword ptr [rsp + 82]
mov dword ptr [r12], eax
mov word ptr [rsp + 104], 0
mov rax, qword ptr [rsp + 104]
mov qword ptr [rsp + 64], rax
movups xmm0, xmmword ptr [rsp + 88]
movups xmmword ptr [rsp + 48], xmm0
mov rax, qword ptr [rsp + 128]
mov qword ptr [rsp + 40], rax
movups xmm0, xmmword ptr [rsp + 112]
movups xmmword ptr [rsp + 24], xmm0
mov rax, qword ptr [rsp + 152]
mov qword ptr [rsp + 16], rax
movups xmm0, xmmword ptr [rsp + 136]
movups xmmword ptr [rsp], xmm0
xorps xmm0, xmm0
call float std::inner_product<Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned
short)3>, Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixColumnAccessor<std::array<float, 9ul> const>, (unsigned short)3>,
float>(Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3>,
Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3>,
Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixColumnAccessor<std::array<float, 9ul> const>, (unsigned short)3>,
float)
movss dword ptr [r15 + 4*rbx + 8], xmm0
add rbx, 3
cmp rbx, 9
jne .LBB1_1
mov rax, r15
add rsp, 160
pop rbx
pop r12
pop r13
pop r14
pop r15
ret
float std::inner_product<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3>,
Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixColumnAccessor<std::array<float, 9ul> const>, (unsigned short)3>,
float>(Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3>,
Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3>,
Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixColumnAccessor<std::array<float, 9ul> const>, (unsigned short)3>,
float): # @float std::inner_product<Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned
short)3>, Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixColumnAccessor<std::array<float, 9ul> const>, (unsigned short)3>,
float>(Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3>,
Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3>,
Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixColumnAccessor<std::array<float, 9ul> const>, (unsigned short)3>,
float)
push rbx
lea rax, [rsp + 64]
lea rcx, [rsp + 16]
mov rbx, qword ptr [rsp + 16]
mov r11, qword ptr [rsp + 24]
mov rsi, qword ptr [rsp + 48]
movzx r9d, word ptr [rsp + 56]
movzx r8d, word ptr [rsp + 72]
mov dx, word ptr [rsp + 32]
movzx edi, dx
cmp rbx, qword ptr [rsp + 40]
je .LBB2_1
cmp r11, rsi
jne .LBB2_17
.LBB2_13: # %_ZNK6MatrixILt3ELt3EfE22MatrixAccessorIteratorINS0_17MatrixRowAccessorISt5arrayIfLm9EEEELt3EEneERKS6_.exit.thread.us14
movzx esi, dx
cmp esi, 2
ja .LBB2_23
movzx esi, word ptr [rax + 16]
cmp esi, 2
ja .LBB2_24
inc edx
mov word ptr [rcx + 16], dx
movzx edi, dx
cmp edi, 3
ja .LBB2_25
inc esi
mov word ptr [rax + 16], si
movzx esi, si
cmp esi, 4
jb .LBB2_13
jmp .LBB2_21
.LBB2_17: # %Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned
short)3>::operator!=(Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3> const&)
const [clone .exit] [clone .thread]
movzx esi, dx
cmp esi, 3
jae .LBB2_23
movzx esi, word ptr [rax + 16]
cmp esi, 3
jae .LBB2_24
inc edx
mov word ptr [rcx + 16], dx
movzx edi, dx
cmp edi, 4
jae .LBB2_25
inc esi
mov word ptr [rax + 16], si
movzx esi, si
cmp esi, 4
jb .LBB2_17
jmp .LBB2_21
.LBB2_1: # %.split.us
cmp r11, rsi
jne .LBB2_8
mov r10, qword ptr [rsp + 64]
lea rsi, [rbx + 4*r11]
.LBB2_3: # %Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned
short)3>::operator!=(Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3> const&)
const [clone .exit] [clone .us] [clone .us]
movzx edx, di
cmp r9d, edx
je .LBB2_22
movzx edx, di
cmp edx, 2
ja .LBB2_23
movzx edx, word ptr [rax + 16]
cmp rdx, 2
ja .LBB2_24
movss xmm1, dword ptr [rsi + 4*rdi] # xmm1 = mem[0],zero,zero,zero
lea rbx, [rdx + 2*rdx]
add rbx, r8
movss xmm2, dword ptr [r10 + 4*rbx] # xmm2 = mem[0],zero,zero,zero
lea ebx, [rdi + 1]
mov word ptr [rcx + 16], bx
movzx ebx, bx
cmp ebx, 3
ja .LBB2_25
mulss xmm1, xmm2
addss xmm0, xmm1
inc edx
mov word ptr [rax + 16], dx
movzx edx, dx
inc rdi
cmp edx, 4
jb .LBB2_3
jmp .LBB2_21
.LBB2_8: # %Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned
short)3, float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned
short)3>::operator!=(Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixAccessorIterator<Matrix<(unsigned short)3, (unsigned short)3,
float>::MatrixRowAccessor<std::array<float, 9ul> >, (unsigned short)3> const&)
const [clone .exit] [clone .thread] [clone .us]
movzx esi, dx
cmp esi, 2
ja .LBB2_23
movzx esi, word ptr [rax + 16]
cmp esi, 2
ja .LBB2_24
inc edx
mov word ptr [rcx + 16], dx
movzx edi, dx
cmp edi, 3
ja .LBB2_25
inc esi
mov word ptr [rax + 16], si
movzx esi, si
cmp esi, 4
jb .LBB2_8
.LBB2_21: # %.us-lcssa8.us
mov edi, .L.str.3
mov esi, .L.str.1
mov edx, 36
mov ecx, .L__PRETTY_FUNCTION__._ZN6MatrixILt3ELt3EfE22MatrixAccessorIteratorINS0_20MatrixColumnAccessorIKSt5arrayIfLm9EEEELt3EEppEv
call __assert_fail
.LBB2_22: # %.us-lcssa.us
pop rbx
ret
.LBB2_23: # %.us-lcssa5.us
mov edi, .L.str.2
mov esi, .L.str.1
mov edx, 87
mov ecx, .L__PRETTY_FUNCTION__._ZNK6MatrixILt3ELt3EfE17MatrixRowAccessorISt5arrayIfLm9EEEixEt
call __assert_fail
.LBB2_24: # %.us-lcssa6.us
mov edi, .L.str
mov esi, .L.str.1
mov edx, 134
mov ecx, .L__PRETTY_FUNCTION__._ZNK6MatrixILt3ELt3EfE20MatrixColumnAccessorIKSt5arrayIfLm9EEEixEt
call __assert_fail
.LBB2_25: # %.us-lcssa7.us
mov edi, .L.str.3
mov esi, .L.str.1
mov edx, 36
mov ecx, .L__PRETTY_FUNCTION__._ZN6MatrixILt3ELt3EfE22MatrixAccessorIteratorINS0_17MatrixRowAccessorISt5arrayIfLm9EEEELt3EEppEv
call __assert_fail
===================================
The code g++ generates:
Matrix<(unsigned short)3, (unsigned short)3, float> Matrix<(unsigned short)3,
(unsigned short)3, float>::operator*<(unsigned short)3>(Matrix<(unsigned
short)3, (unsigned short)3, float> const&):
movss xmm8, DWORD PTR [rsi]
mov rax, rdi
movaps xmm5, xmm8
movss xmm2, DWORD PTR [rdx]
pxor xmm0, xmm0
mulss xmm5, xmm2
movss xmm7, DWORD PTR [rsi+4]
movaps xmm4, xmm7
movss xmm1, DWORD PTR [rdx+12]
movaps xmm11, xmm8
mulss xmm4, xmm1
movss xmm3, DWORD PTR [rsi+8]
addss xmm5, xmm0
movss xmm10, DWORD PTR [rdx+24]
movss xmm6, DWORD PTR [rdx+4]
mulss xmm11, xmm6
movaps xmm9, xmm7
movaps xmm14, xmm2
addss xmm5, xmm4
movaps xmm4, xmm3
movss xmm12, DWORD PTR [rsi+12]
mulss xmm4, xmm10
addss xmm11, xmm0
mulss xmm14, xmm12
movaps xmm13, xmm1
addss xmm4, xmm5
movss xmm5, DWORD PTR [rdx+16]
mulss xmm9, xmm5
addss xmm14, xmm0
movss DWORD PTR [rdi], xmm4
addss xmm11, xmm9
movaps xmm9, xmm3
movss xmm4, DWORD PTR [rdx+28]
mulss xmm9, xmm4
addss xmm9, xmm11
movss DWORD PTR [rdi+4], xmm9
movss xmm9, DWORD PTR [rdx+8]
mulss xmm8, xmm9
movaps xmm11, xmm8
movss xmm8, DWORD PTR [rdx+20]
mulss xmm7, xmm8
addss xmm11, xmm0
addss xmm11, xmm7
movss xmm7, DWORD PTR [rdx+32]
mulss xmm3, xmm7
addss xmm3, xmm11
movss xmm11, DWORD PTR [rsi+16]
mulss xmm13, xmm11
movss DWORD PTR [rdi+8], xmm3
addss xmm14, xmm13
movaps xmm13, xmm10
movss xmm3, DWORD PTR [rsi+20]
mulss xmm13, xmm3
addss xmm13, xmm14
movaps xmm14, xmm6
mulss xmm14, xmm12
mulss xmm12, xmm9
movss DWORD PTR [rdi+12], xmm13
movaps xmm13, xmm5
addss xmm14, xmm0
mulss xmm13, xmm11
addss xmm12, xmm0
mulss xmm11, xmm8
addss xmm14, xmm13
movaps xmm13, xmm4
addss xmm11, xmm12
mulss xmm13, xmm3
mulss xmm3, xmm7
addss xmm13, xmm14
addss xmm3, xmm11
movss DWORD PTR [rdi+16], xmm13
movss DWORD PTR [rdi+20], xmm3
movss xmm3, DWORD PTR [rsi+24]
mulss xmm2, xmm3
mulss xmm6, xmm3
mulss xmm3, xmm9
movaps xmm11, xmm2
movss xmm2, DWORD PTR [rsi+28]
addss xmm6, xmm0
mulss xmm1, xmm2
addss xmm11, xmm0
mulss xmm5, xmm2
addss xmm0, xmm3
mulss xmm2, xmm8
addss xmm11, xmm1
movss xmm1, DWORD PTR [rsi+32]
mulss xmm10, xmm1
addss xmm5, xmm6
mulss xmm4, xmm1
addss xmm2, xmm0
mulss xmm1, xmm7
addss xmm10, xmm11
addss xmm4, xmm5
addss xmm1, xmm2
movss DWORD PTR [rdi+24], xmm10
movss DWORD PTR [rdi+28], xmm4
movss DWORD PTR [rdi+32], xmm1
ret
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20160530/5e1838af/attachment-0001.html>
More information about the llvm-bugs
mailing list