[LLVMdev] trunk's optimizer generates slower code than 3.5
Jack Howarth
howarth.mailing.lists at gmail.com
Sat Feb 14 08:19:52 PST 2015
Do any of the build-bots routinely run the SciMark v2.0 benchmark?
If so, might not an examination of those logs reveal the commit range
at which the optimizations in that benchmark degraded?
Jack
On Sat, Feb 14, 2015 at 11:13 AM, Jack Howarth
<howarth.mailing.lists at gmail.com> wrote:
> The regressions in the performance of generated code, introduced
> by the llvm 3.6 release, don't seem to be limited to this "8 queens
> puzzle" solver test case. See...
>
> http://www.phoronix.com/scan.php?page=article&item=llvm-clang-3.5-3.6-rc1&num=1
>
> where a big hit in the performance of the Sparse Matrix Multiply test
> of the SciMark v2.0 benchmark was observed, as well as in others.
> Do you really want to release 3.6 with this level of performance regression?
> Jack
>
> On Fri, Feb 13, 2015 at 2:47 PM, Jack Howarth
> <howarth.mailing.lists at gmail.com> wrote:
>> Also confirmed with the llvm 3.5.1 release and the llvm 3.6 release
>> branch on x86_64-apple-darwin14...
>>
>> % clang-3.5 -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector
>> -fno-exceptions -o 8 8.c
>> % time ./8 9
>> 352 solutions
>> 3.603u 0.002s 0:03.60 100.0% 0+0k 0+0io 2pf+0w
>> % time ./8 10
>> 724 solutions
>> 104.217u 0.059s 1:44.30 99.9% 0+0k 0+0io 2pf+0w
>>
>> % clang-3.6 -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector
>> -fno-exceptions -o 8 8.c
>> % time ./8 9
>> 352 solutions
>> 4.050u 0.001s 0:04.05 100.0% 0+0k 0+0io 2pf+0w
>> % time ./8 10
>> 724 solutions
>> 114.808u 0.041s 1:54.86 99.9% 0+0k 0+0io 2pf+0w
>>
>> On Fri, Feb 13, 2015 at 3:37 AM, 191919 <191919 at gmail.com> wrote:
>>> I submitted the problem report to clang's bugzilla but no one seems to
>>> care so I have to send it to the mailing list.
>>>
>>> clang 3.7 svn (trunk 229055 at the time I reported this problem)
>>> generates slower code than 3.5 (Apple LLVM version 6.0
>>> (clang-600.0.56) (based on LLVM 3.5svn)) for the following code.
>>>
>>> It is an "8 queens puzzle" solver written as an educational example. As
>>> compiled by both clang 3.5 and 3.7, it gave the correct answer, but
>>> clang 3.5 generates code which runs 20% faster than 3.6/3.7.
>>>
>>> ##########################################
>>> # clang 3.5 which comes with Xcode 6.1.1
>>> ##########################################
>>> $ clang -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector
>>> -fno-exceptions -o 8 8.c
>>> $ time ./8 9 # 9 queens
>>> 352 solutions
>>> ./8 9 1.63s user 0.00s system 99% cpu 1.632 total
>>> $ time ./8 10 # 10 queens
>>> 724 solutions
>>> ./8 10 45.11s user 0.01s system 99% cpu 45.121 total
>>>
>>> ##########################################
>>> # clang 3.7 svn trunk
>>> ##########################################
>>> $ /opt/bin/clang -O3 -mssse3 -fomit-frame-pointer -fno-stack-protector
>>> -fno-exceptions -o 8 8.c
>>> $ time ./8 9 # 9 queens
>>> 352 solutions
>>> ./8 9 2.07s user 0.00s system 99% cpu 2.078 total
>>> $ time ./8 10 # 10 queens
>>> 724 solutions
>>> ./8 10 56.63s user 0.02s system 99% cpu 56.650 total
>>>
>>> The source code is below; I have also attached the executable files as
>>> well as the assembly code files produced by IDA for clang 3.5 and 3.6.
>>>
>>> The performance is even worse when compiling as 32-bit code while
>>> gcc-4.9.2 is not affected.
>>>
>>> ########## clang-3.5
>>> $ clang -m32 -O3 -fomit-frame-pointer -fno-stack-protector
>>> -fno-exceptions -o 8 8.c
>>> $ time ./8 9
>>> 352 solutions
>>> ./8 9 1.95s user 0.00s system 99% cpu 1.950 total
>>>
>>> ########## clang-3.7
>>> $ /opt/bin/clang -m32 -O3 -fomit-frame-pointer -fno-stack-protector
>>> -fno-exceptions -o 8 8.c
>>> $ time ./8 9
>>> 352 solutions
>>> ./8 9 2.48s user 0.00s system 99% cpu 2.480 total
>>>
>>> ######### gcc-4.9.2
>>> $ /opt/bin/gcc -m32 -O3 -fomit-frame-pointer -fno-stack-protector
>>> -fno-exceptions -o 8 8.c
>>> $ time ./8 9
>>> 352 solutions
>>> ./8 9 1.44s user 0.00s system 99% cpu 1.442 total
>>>
>>>
>>> ```
>>> #include <stdio.h>
>>> #include <stdlib.h>
>>>
>>> static inline int validate(int* a, int d)
>>> {
>>> int i, j, x;
>>> for (i = 0; i < d; ++i)
>>> {
>>> for (j = i+1, x = 1; j < d; ++j, ++x)
>>> {
>>> const int d = a[i] - a[j];
>>> if (d == 0 || d == -x || d == x) return 0;
>>> }
>>> }
>>> return 1;
>>> }
>>>
>>> static inline int solve(int d)
>>> {
>>> int r = 0;
>>> int* a = (int*) calloc(sizeof(int), d+1);
>>> int p = d - 1;
>>>
>>> for (;;)
>>> {
>>> a[p]++;
>>>
>>> if (a[p] > d-1)
>>> {
>>> int bp = p - 1;
>>> while (bp >= 0)
>>> {
>>> a[bp]++;
>>> if (a[bp] <= d-1) break;
>>> a[bp] = 0;
>>> --bp;
>>> }
>>> if (bp < 0)
>>> break;
>>> a[p] = 0;
>>> }
>>> if (validate(a, d))
>>> {
>>> ++r;
>>> }
>>> }
>>>
>>> free(a);
>>> return r;
>>> }
>>>
>>> int main(int argc, char** argv)
>>> {
>>> if (argc != 2) return -1;
>>> int r = solve((int) strtol(argv[1], NULL, 10));
>>> printf("%d solutions\n", r);
>>> }
>>> ```
>>>
>>> clang 3.5's result:
>>>
>>> ```
>>> public _main
>>> _main proc near
>>>
>>> var_48 = qword ptr -48h
>>> var_40 = qword ptr -40h
>>> var_34 = dword ptr -34h
>>>
>>> push rbp
>>> push r15
>>> push r14
>>> push r13
>>> push r12
>>> push rbx
>>> sub rsp, 18h
>>> mov ebx, 0FFFFFFFFh
>>> cmp edi, 2
>>> jnz loc_100000F29
>>> mov rdi, [rsi+8] ; char *
>>> xor r14d, r14d
>>> xor esi, esi ; char **
>>> mov edx, 0Ah ; int
>>> call _strtol
>>> mov r15, rax
>>> shl rax, 20h
>>> mov rsi, offset __mh_execute_header
>>> add rsi, rax
>>> sar rsi, 20h ; size_t
>>> mov edi, 4 ; size_t
>>> call _calloc
>>> lea edx, [r15-1]
>>> movsxd r8, edx
>>> mov ecx, r15d
>>> add ecx, 0FFFFFFFEh
>>> js loc_100000DFA
>>> test r15d, r15d
>>> mov r11d, [rax+r8*4]
>>> jle loc_100000EAE
>>> mov ecx, r15d
>>> add ecx, 0FFFFFFFEh
>>> mov [rsp+48h+var_34], ecx
>>> movsxd rcx, ecx
>>> lea rcx, [rax+rcx*4]
>>> mov [rsp+48h+var_40], rcx
>>> lea rcx, [rax+4]
>>> mov [rsp+48h+var_48], rcx
>>> xor r14d, r14d
>>> jmp short loc_100000D33
>>> ; ---------------------------------------------------------------------------
>>> align 10h
>>>
>>> loc_100000D30: ; CODE XREF: _main+129 j
>>> ; _main+131 j ...
>>> add r14d, ebx
>>>
>>> loc_100000D33: ; CODE XREF: _main+92 j
>>> cmp r11d, edx
>>> lea edi, [r11+1]
>>> mov [rax+r8*4], edi
>>> mov rcx, [rsp+48h+var_40]
>>> mov esi, [rsp+48h+var_34]
>>> mov r11d, edi
>>> jl short loc_100000D84
>>> nop dword ptr [rax+00h]
>>>
>>> loc_100000D50: ; CODE XREF: _main+DA j
>>> mov edi, [rcx]
>>> lea ebp, [rdi+1]
>>> mov [rcx], ebp
>>> cmp edi, edx
>>> jl short loc_100000D71
>>> mov dword ptr [rcx], 0
>>> add rcx, 0FFFFFFFFFFFFFFFCh
>>> test esi, esi
>>> lea esi, [rsi-1]
>>> jg short loc_100000D50
>>> jmp loc_100000F0E
>>> ; ---------------------------------------------------------------------------
>>>
>>> loc_100000D71: ; CODE XREF: _main+C9 j
>>> test esi, esi
>>> js loc_100000F0E
>>> mov dword ptr [rax+r8*4], 0
>>> xor r11d, r11d
>>>
>>> loc_100000D84: ; CODE XREF: _main+BA j
>>> cmp r15d, 1
>>> mov esi, 0
>>> mov r9, [rsp+48h+var_48]
>>> mov r12d, 1
>>> jle short loc_100000DF0
>>>
>>> loc_100000D99: ; CODE XREF: _main+15E j
>>> mov r10d, [rax+rsi*4]
>>> mov ecx, 0FFFFFFFFh
>>> mov edi, 1
>>> mov r13, r9
>>> nop word ptr [rax+rax+00h]
>>>
>>> loc_100000DB0: ; CODE XREF: _main+14F j
>>> xor ebx, ebx
>>> mov ebp, r10d
>>> sub ebp, [r13+0]
>>> jz loc_100000D30
>>> cmp ecx, ebp
>>> jz loc_100000D30
>>> cmp edi, ebp
>>> jz loc_100000D30
>>> add r13, 4
>>> inc rdi
>>> dec ecx
>>> mov ebx, edi
>>> add ebx, esi
>>> cmp ebx, r15d
>>> jl short loc_100000DB0
>>> inc r12
>>> add r9, 4
>>> inc rsi
>>> cmp r12d, r15d
>>> jl short loc_100000D99
>>>
>>> loc_100000DF0: ; CODE XREF: _main+107 j
>>> mov ebx, 1
>>> jmp loc_100000D30
>>> ; ---------------------------------------------------------------------------
>>>
>>> loc_100000DFA: ; CODE XREF: _main+5E j
>>> mov ecx, [rax+r8*4]
>>> lea r9d, [rcx+1]
>>> mov [rax+r8*4], r9d
>>> cmp ecx, r8d
>>> jge loc_100000F0E
>>> lea r12, [rax+4]
>>> xor r14d, r14d
>>> db 2Eh
>>> nop word ptr [rax+rax+00000000h]
>>>
>>> loc_100000E20: ; CODE XREF: _main+216 j
>>> test r15d, r15d
>>> setle cl
>>> cmp r15d, 2
>>> jl short loc_100000E90
>>> test cl, cl
>>> mov r13d, 0
>>> mov r11, r12
>>> mov r10d, 1
>>> jnz short loc_100000E90
>>>
>>> loc_100000E3F: ; CODE XREF: _main+1F0 j
>>> mov edi, [rax+r13*4]
>>> mov edx, 0FFFFFFFFh
>>> mov ecx, 1
>>> mov rsi, r11
>>>
>>> loc_100000E50: ; CODE XREF: _main+1E1 j
>>> xor ebx, ebx
>>> mov ebp, edi
>>> sub ebp, [rsi]
>>> jz short loc_100000E95
>>> cmp edx, ebp
>>> jz short loc_100000E95
>>> cmp ecx, ebp
>>> jz short loc_100000E95
>>> add rsi, 4
>>> inc rcx
>>> dec edx
>>> mov ebx, ecx
>>> add ebx, r13d
>>> cmp ebx, r15d
>>> jl short loc_100000E50
>>> inc r10
>>> add r11, 4
>>> inc r13
>>> cmp r10d, r15d
>>> jl short loc_100000E3F
>>> db 66h, 66h, 66h, 66h, 2Eh
>>> nop word ptr [rax+rax+00000000h]
>>>
>>> loc_100000E90: ; CODE XREF: _main+19A j
>>> ; _main+1AD j
>>> mov ebx, 1
>>>
>>> loc_100000E95: ; CODE XREF: _main+1C6 j
>>> ; _main+1CA j ...
>>> add r14d, ebx
>>> cmp r9d, r8d
>>> lea ecx, [r9+1]
>>> mov [rax+r8*4], ecx
>>> mov r9d, ecx
>>> jl loc_100000E20
>>> jmp short loc_100000F0E
>>> ; ---------------------------------------------------------------------------
>>>
>>> loc_100000EAE: ; CODE XREF: _main+6B j
>>> add r15d, 0FFFFFFFEh
>>> movsxd rcx, r15d
>>> lea rcx, [rax+rcx*4]
>>> xor r14d, r14d
>>> jmp short loc_100000EC6
>>> ; ---------------------------------------------------------------------------
>>> align 20h
>>>
>>> loc_100000EC0: ; CODE XREF: _main+247 j
>>> ; _main+27C j
>>> inc r14d
>>> mov r11d, ebp
>>>
>>> loc_100000EC6: ; CODE XREF: _main+22C j
>>> lea ebp, [r11+1]
>>> mov [rax+r8*4], ebp
>>> cmp r11d, r8d
>>> mov rsi, rcx
>>> mov edi, r15d
>>> jl short loc_100000EC0
>>> nop dword ptr [rax+00000000h]
>>>
>>> loc_100000EE0: ; CODE XREF: _main+26A j
>>> mov ebp, [rsi]
>>> lea ebx, [rbp+1]
>>> mov [rsi], ebx
>>> cmp ebp, edx
>>> jl short loc_100000EFE
>>> mov dword ptr [rsi], 0
>>> add rsi, 0FFFFFFFFFFFFFFFCh
>>> test edi, edi
>>> lea edi, [rdi-1]
>>> jg short loc_100000EE0
>>> jmp short loc_100000F0E
>>> ; ---------------------------------------------------------------------------
>>>
>>> loc_100000EFE: ; CODE XREF: _main+259 j
>>> test edi, edi
>>> js short loc_100000F0E
>>> mov dword ptr [rax+r8*4], 0
>>> xor ebp, ebp
>>> jmp short loc_100000EC0
>>> ; ---------------------------------------------------------------------------
>>>
>>> loc_100000F0E: ; CODE XREF: _main+DC j
>>> ; _main+E3 j ...
>>> mov rdi, rax ; void *
>>> call _free
>>> lea rdi, aDSolutions ; "%d solutions\n"
>>> xor ebx, ebx
>>> xor eax, eax
>>> mov esi, r14d
>>> call _printf
>>>
>>> loc_100000F29: ; CODE XREF: _main+16 j
>>> mov eax, ebx
>>> add rsp, 18h
>>> pop rbx
>>> pop r12
>>> pop r13
>>> pop r14
>>> pop r15
>>> pop rbp
>>> retn
>>> _main endp
>>> ```
>>>
>>> clang 3.6's result:
>>>
>>> ```
>>> public _main
>>> _main proc near
>>>
>>> var_60 = qword ptr -60h
>>> var_58 = qword ptr -58h
>>> var_50 = qword ptr -50h
>>> var_48 = qword ptr -48h
>>> var_40 = qword ptr -40h
>>> var_38 = qword ptr -38h
>>>
>>> push rbp
>>> push r15
>>> push r14
>>> push r13
>>> push r12
>>> push rbx
>>> sub rsp, 38h
>>> mov ebx, 0FFFFFFFFh
>>> cmp edi, 2
>>> jnz loc_100000F23
>>> mov rbx, offset __mh_execute_header
>>> mov rdi, [rsi+8] ; char *
>>> xor r13d, r13d
>>> xor esi, esi ; char **
>>> mov edx, 0Ah ; int
>>> call _strtol
>>> mov r14, rax
>>> shl rax, 20h
>>> mov [rsp+68h+var_38], rax
>>> lea rsi, [rax+rbx]
>>> sar rsi, 20h ; size_t
>>> mov edi, 4 ; size_t
>>> call _calloc
>>> lea r11d, [r14-1]
>>> movsxd r12, r11d
>>> mov [rsp+68h+var_40], r12
>>> movsxd rcx, r14d
>>> mov [rsp+68h+var_50], rcx
>>> add ecx, 0FFFFFFFEh
>>> js loc_100000E1A
>>> mov ecx, r14d
>>> add ecx, 0FFFFFFFEh
>>> movsxd rcx, ecx
>>> inc rcx
>>> mov [rsp+68h+var_58], rcx
>>> mov rcx, rax
>>> add rcx, 4
>>> mov [rsp+68h+var_60], rcx
>>> xor ebp, ebp
>>> jmp short loc_100000D17
>>> ; ---------------------------------------------------------------------------
>>> align 10h
>>>
>>> loc_100000D10: ; CODE XREF: _main+15B j
>>> ; _main+163 j ...
>>> mov rbp, [rsp+68h+var_48]
>>> add ebp, edi
>>>
>>> loc_100000D17: ; CODE XREF: _main+93 j
>>> cmp r13d, r11d
>>> lea edx, [r13+1]
>>> mov [rax+r12*4], edx
>>> mov rcx, [rsp+68h+var_58]
>>> mov r13d, edx
>>> jl short loc_100000D6B
>>> nop dword ptr [rax+00h]
>>>
>>> loc_100000D30: ; CODE XREF: _main+DE j
>>> mov edx, [rax+rcx*4-4]
>>> lea esi, [rdx+1]
>>> mov [rax+rcx*4-4], esi
>>> cmp edx, r11d
>>> jl short loc_100000D60
>>> mov dword ptr [rax+rcx*4-4], 0
>>> dec rcx
>>> test rcx, rcx
>>> jg short loc_100000D30
>>> jmp loc_100000F09
>>> ; ---------------------------------------------------------------------------
>>> align 20h
>>>
>>> loc_100000D60: ; CODE XREF: _main+CE j
>>> mov dword ptr [rax+r12*4], 0
>>> xor r13d, r13d
>>>
>>> loc_100000D6B: ; CODE XREF: _main+BA j
>>> mov [rsp+68h+var_48], rbp
>>> test r14d, r14d
>>> setle cl
>>> mov rdx, offset __mh_execute_header
>>> lea rdx, [rdx+1]
>>> cmp [rsp+68h+var_38], rdx
>>> jl loc_100000E10
>>> test cl, cl
>>> mov edx, 0
>>> mov r10, [rsp+68h+var_60]
>>> mov r9d, 1
>>> jnz short loc_100000E10
>>>
>>> loc_100000DA3: ; CODE XREF: _main+195 j
>>> mov esi, [rax+rdx*4]
>>> mov r15d, 0FFFFFFFFh
>>> mov r8d, 1
>>> mov rcx, r10
>>> db 66h, 66h, 2Eh
>>> nop dword ptr [rax+rax+00000000h]
>>>
>>> loc_100000DC0: ; CODE XREF: _main+184 j
>>> mov ebx, [rcx]
>>> mov ebp, esi
>>> sub ebp, ebx
>>> xor edi, edi
>>> cmp r8d, ebp
>>> jz loc_100000D10
>>> cmp esi, ebx
>>> jz loc_100000D10
>>> cmp r15d, ebp
>>> jz loc_100000D10
>>> add rcx, 4
>>> inc r8
>>> dec r15d
>>> mov edi, r8d
>>> add edi, edx
>>> cmp edi, r14d
>>> jl short loc_100000DC0
>>> inc r9
>>> add r10, 4
>>> inc rdx
>>> cmp r9, [rsp+68h+var_50]
>>> jl short loc_100000DA3
>>> nop word ptr [rax+rax+00000000h]
>>>
>>> loc_100000E10: ; CODE XREF: _main+119 j
>>> ; _main+131 j
>>> mov edi, 1
>>> jmp loc_100000D10
>>> ; ---------------------------------------------------------------------------
>>>
>>> loc_100000E1A: ; CODE XREF: _main+6E j
>>> test r14d, r14d
>>> jle loc_100000F00
>>> mov dword ptr [rax+r12*4], 1
>>> xor ebp, ebp
>>> cmp r14d, 2
>>> jl loc_100000F09
>>> mov rcx, rax
>>> add rcx, 4
>>> mov [rsp+68h+var_48], rcx
>>> xor ebp, ebp
>>> mov r15d, 1
>>> nop dword ptr [rax+rax+00h]
>>>
>>> loc_100000E50: ; CODE XREF: _main+288 j
>>> mov rbx, rbp
>>> mov rcx, offset __mh_execute_header
>>> cmp [rsp+68h+var_38], rcx
>>> mov edx, 0
>>> mov r13, [rsp+68h+var_48]
>>> mov r8d, 1
>>> mov r9d, 1
>>> jle short loc_100000EE0
>>>
>>> loc_100000E7A: ; CODE XREF: _main+25A j
>>> mov r12d, [rax+rdx*4]
>>> mov edi, 0FFFFFFFFh
>>> mov ecx, 1
>>> mov rsi, r13
>>> nop dword ptr [rax+rax+00h]
>>>
>>> loc_100000E90: ; CODE XREF: _main+249 j
>>> mov r10d, [rsi]
>>> mov ebp, r12d
>>> sub ebp, r10d
>>> xor r9d, r9d
>>> cmp ecx, ebp
>>> jz short loc_100000EE0
>>> cmp r12d, r10d
>>> jz short loc_100000EE0
>>> cmp edi, ebp
>>> jz short loc_100000EE0
>>> add rsi, 4
>>> inc rcx
>>> dec edi
>>> mov ebp, ecx
>>> add ebp, edx
>>> cmp ebp, r14d
>>> jl short loc_100000E90
>>> inc r8
>>> add r13, 4
>>> inc rdx
>>> cmp r8, [rsp+68h+var_50]
>>> jl short loc_100000E7A
>>> mov r9d, 1
>>> db 66h, 66h, 66h, 66h, 2Eh
>>> nop word ptr [rax+rax+00000000h]
>>>
>>> loc_100000EE0: ; CODE XREF: _main+208 j
>>> ; _main+22E j ...
>>> mov rbp, rbx
>>> add ebp, r9d
>>> cmp r15d, r11d
>>> lea ecx, [r15+1]
>>> mov rdx, [rsp+68h+var_40]
>>> mov [rax+rdx*4], ecx
>>> mov r15d, ecx
>>> jl loc_100000E50
>>> jmp short loc_100000F09
>>> ; ---------------------------------------------------------------------------
>>>
>>> loc_100000F00: ; CODE XREF: _main+1AD j
>>> xor ebp, ebp
>>> test r11d, r11d
>>> cmovns ebp, r11d
>>>
>>> loc_100000F09: ; CODE XREF: _main+E0 j
>>> ; _main+1C1 j ...
>>> mov rdi, rax ; void *
>>> call _free
>>> lea rdi, aDSolutions ; "%d solutions\n"
>>> xor ebx, ebx
>>> xor eax, eax
>>> mov esi, ebp
>>> call _printf
>>>
>>> loc_100000F23: ; CODE XREF: _main+16 j
>>> mov eax, ebx
>>> add rsp, 38h
>>> pop rbx
>>> pop r12
>>> pop r13
>>> pop r14
>>> pop r15
>>> pop rbp
>>> retn
>>> _main endp
>>> ```
>>>
>>> gcc-4.9.2's result:
>>> ```
>>>
>>> _main proc near
>>>
>>> var_48 = qword ptr -48h
>>> var_40 = dword ptr -40h
>>> var_3C = dword ptr -3Ch
>>>
>>> cmp edi, 2
>>> jz short loc_100000D69
>>> or eax, 0FFFFFFFFh
>>> retn
>>> ; ---------------------------------------------------------------------------
>>>
>>> loc_100000D69: ; CODE XREF: _main+3 j
>>> push r15
>>> mov edx, 0Ah ; int
>>> push r14
>>> push r13
>>> push r12
>>> push rbp
>>> push rbx
>>> sub rsp, 18h
>>> mov rdi, [rsi+8] ; char *
>>> xor esi, esi ; char **
>>> call _strtol
>>> mov edi, 4 ; size_t
>>> lea esi, [rax+1]
>>> mov r14, rax
>>> mov ebx, eax
>>> lea r15d, [r14-2]
>>> movsxd rsi, esi ; size_t
>>> call _calloc
>>> mov [rsp+48h+var_3C], 0
>>> mov rdi, rax ; void *
>>> lea eax, [r14-1]
>>> cdqe
>>> lea r13, [rdi+rax*4]
>>> movsxd rax, r15d
>>> mov ebp, [r13+0]
>>> shl rax, 2
>>> lea r12, [rdi+rax]
>>> lea rax, [rdi+rax-4]
>>> mov [rsp+48h+var_48], rax
>>> mov eax, r14d
>>> lea r14d, [r14+1]
>>> nop word ptr [rax+rax+00h]
>>> nop word ptr [rax+rax+00h]
>>>
>>> loc_100000DE0: ; CODE XREF: _main+12B j
>>> ; _main+155 j ...
>>> add ebp, 1
>>> cmp ebx, ebp
>>> mov [r13+0], ebp
>>> jg short loc_100000E62
>>> test r15d, r15d
>>> js short loc_100000E33
>>> mov ecx, [r12]
>>> lea edx, [rcx+1]
>>> cmp ebx, edx
>>> mov [r12], edx
>>> jg short loc_100000E58
>>> mov r8, r12
>>> mov rcx, [rsp+48h+var_48]
>>> mov esi, r15d
>>> jmp short loc_100000E24
>>> ; ---------------------------------------------------------------------------
>>> align 10h
>>>
>>> loc_100000E10: ; CODE XREF: _main+D1 j
>>> mov edx, [rcx]
>>> sub r8, 4
>>> sub rcx, 4
>>> add edx, 1
>>> mov [rcx+4], edx
>>> cmp ebx, edx
>>> jg short loc_100000E58
>>>
>>> loc_100000E24: ; CODE XREF: _main+A9 j
>>> sub esi, 1
>>> mov dword ptr [r8], 0
>>> cmp esi, 0FFFFFFFFh
>>> jnz short loc_100000E10
>>>
>>> loc_100000E33: ; CODE XREF: _main+8E j
>>> call _free
>>> mov esi, [rsp+48h+var_3C]
>>> add rsp, 18h
>>> xor eax, eax
>>> pop rbx
>>> lea rdi, aDSolutions ; "%d solutions\n"
>>> pop rbp
>>> pop r12
>>> pop r13
>>> pop r14
>>> pop r15
>>> jmp _printf
>>> ; ---------------------------------------------------------------------------
>>>
>>> loc_100000E58: ; CODE XREF: _main+9D j
>>> ; _main+C2 j
>>> mov dword ptr [r13+0], 0
>>> xor ebp, ebp
>>>
>>> loc_100000E62: ; CODE XREF: _main+89 j
>>> test ebx, ebx
>>> jle loc_100000EE6
>>> lea r11, [rdi+8]
>>> xor r10d, r10d
>>>
>>> loc_100000E71: ; CODE XREF: _main+184 j
>>> add r10d, 1
>>> cmp r10d, eax
>>> jz short loc_100000EE6
>>> mov r8d, [r11-8]
>>> mov edx, r8d
>>> sub edx, [r11-4]
>>> add edx, 1
>>> cmp edx, 2
>>> jbe loc_100000DE0
>>> mov r9d, r14d
>>> mov rcx, r11
>>> mov edx, 1
>>> mov [rsp+48h+var_40], r10d
>>> sub r9d, r10d
>>> jmp short loc_100000ED3
>>> ; ---------------------------------------------------------------------------
>>> align 10h
>>>
>>> loc_100000EB0: ; CODE XREF: _main+179 j
>>> mov esi, r8d
>>> sub esi, [rcx]
>>> jz loc_100000DE0
>>> mov r10d, esi
>>> add rcx, 4
>>> add r10d, edx
>>> jz loc_100000DE0
>>> cmp esi, edx
>>> jz loc_100000DE0
>>>
>>> loc_100000ED3: ; CODE XREF: _main+144 j
>>> add edx, 1
>>> cmp edx, r9d
>>> jnz short loc_100000EB0
>>> mov r10d, [rsp+48h+var_40]
>>> add r11, 4
>>> jmp short loc_100000E71
>>> ; ---------------------------------------------------------------------------
>>>
>>> loc_100000EE6: ; CODE XREF: _main+104 j
>>> ; _main+118 j
>>> add [rsp+48h+var_3C], 1
>>> jmp loc_100000DE0
>>> _main endp
>>> ```
>>>
>>> MSVC 10.0's result:
>>>
>>> ```
>>>
>>> _main proc near ; CODE XREF: ___tmainCRTStartup+106 p
>>>
>>> var_80 = dword ptr -80h
>>> var_7C = dword ptr -7Ch
>>> var_78 = dword ptr -78h
>>> var_74 = dword ptr -74h
>>> var_70 = dword ptr -70h
>>> var_6C = dword ptr -6Ch
>>> var_68 = dword ptr -68h
>>> var_64 = dword ptr -64h
>>> var_60 = dword ptr -60h
>>> var_5C = dword ptr -5Ch
>>> argc = dword ptr 8
>>> argv = dword ptr 0Ch
>>> envp = dword ptr 10h
>>>
>>> push ebp
>>> mov ebp, esp
>>> and esp, 0FFFFFF80h
>>> push esi
>>> push edi
>>> push ebx
>>> sub esp, 74h
>>> push 3
>>> call sub_4080F0
>>> add esp, 4
>>> stmxcsr [esp+80h+var_80]
>>> or [esp+80h+var_80], 8000h
>>> ldmxcsr [esp+80h+var_80]
>>> cmp [ebp+argc], 2
>>> jz short loc_40103A
>>> mov eax, 0FFFFFFFFh
>>> add esp, 74h
>>> pop ebx
>>> pop edi
>>> pop esi
>>> mov esp, ebp
>>> pop ebp
>>> retn
>>> ; ---------------------------------------------------------------------------
>>>
>>> loc_40103A: ; CODE XREF: _main+29 j
>>> call ds:GetTickCount
>>> mov esi, eax
>>> mov eax, [ebp+argv]
>>> push dword ptr [eax+4] ; char *
>>> call _atoi
>>> mov edi, eax
>>> lea eax, [edi+1]
>>> push eax ; size_t
>>> push 4 ; size_t
>>> call _calloc
>>> add esp, 0Ch
>>> mov ecx, [eax+edi*4-4]
>>> lea edx, [edi-1]
>>> mov [esp+80h+var_6C], ecx
>>> xor ebx, ebx
>>> mov [esp+80h+var_7C], ebx
>>> lea ecx, [eax+edi*4]
>>> mov [esp+80h+var_74], ecx
>>> lea ecx, [edi-2]
>>> mov [esp+80h+var_70], ecx
>>> mov [esp+80h+var_60], edx
>>> mov [esp+80h+var_80], esi
>>> mov ecx, [esp+80h+var_6C]
>>>
>>> loc_401087: ; CODE XREF: _main+142 j
>>> ; _main+193 j
>>> mov edx, [esp+80h+var_60]
>>> inc ecx
>>> mov [eax+edi*4-4], ecx
>>> cmp edi, [eax+edx*4]
>>> jg short loc_4010DC
>>> mov esi, [esp+80h+var_70]
>>> test esi, esi
>>> js short loc_4010CE
>>> xor edx, edx
>>> mov [esp+80h+var_78], eax
>>> xor ebx, ebx
>>> mov eax, [esp+80h+var_74]
>>>
>>> loc_4010A9: ; CODE XREF: _main+C8 j
>>> mov ecx, [eax+ebx*4-8]
>>> inc ecx
>>> cmp ecx, edi
>>> jl loc_40117A
>>> inc edx
>>> lea esi, [ebx+edi-3]
>>> mov dword ptr [eax+ebx*4-8], 0
>>> dec ebx
>>> cmp edx, [esp+80h+var_60]
>>> jb short loc_4010A9
>>> mov eax, [esp+80h+var_78]
>>>
>>> loc_4010CE: ; CODE XREF: _main+9B j
>>> ; _main+186 j
>>> test esi, esi
>>> jl short loc_401147
>>> mov dword ptr [eax+edi*4-4], 0
>>> xor ecx, ecx
>>>
>>> loc_4010DC: ; CODE XREF: _main+93 j
>>> test edi, edi
>>> jle short loc_40113E
>>> mov [esp+80h+var_6C], ecx
>>> xor edx, edx
>>> mov [esp+80h+var_5C], edi
>>>
>>> loc_4010EA: ; CODE XREF: _main+132 j
>>> lea ecx, [edx+1]
>>> mov ebx, ecx
>>> mov esi, ebx
>>> cmp ecx, [esp+80h+var_5C]
>>> jge short loc_401130
>>> mov edx, [eax+edx*4]
>>> mov edi, 1
>>> mov [esp+80h+var_64], esi
>>> mov [esp+80h+var_68], ecx
>>>
>>> loc_401107: ; CODE XREF: _main+122 j
>>> mov esi, [eax+ebx*4]
>>> cmp edx, esi
>>> jz short loc_40118B
>>> sub esi, edx
>>> mov ecx, esi
>>> neg ecx
>>> cmp edi, ecx
>>> jz short loc_40118B
>>> cmp esi, edi
>>> jz short loc_40118B
>>> inc ebx
>>> inc edi
>>> cmp ebx, [esp+80h+var_5C]
>>> jl short loc_401107
>>> mov ecx, [esp+80h+var_68]
>>> mov esi, [esp+80h+var_64]
>>> cmp ecx, [esp+80h+var_5C]
>>>
>>> loc_401130: ; CODE XREF: _main+F5 j
>>> mov edx, esi
>>> jl short loc_4010EA
>>> xchg ax, ax
>>> mov ecx, [esp+80h+var_6C]
>>> mov edi, [esp+80h+var_5C]
>>>
>>> loc_40113E: ; CODE XREF: _main+DE j
>>> inc [esp+80h+var_7C]
>>> jmp loc_401087
>>> ; ---------------------------------------------------------------------------
>>>
>>> loc_401147: ; CODE XREF: _main+D0 j
>>> mov ebx, [esp+80h+var_7C]
>>> mov esi, [esp+80h+var_80]
>>> push eax ; void *
>>> call _free
>>> add esp, 4
>>> call ds:GetTickCount
>>> sub eax, esi
>>> push eax
>>> push ebx
>>> push offset aDSolutionsInDM ; "%d solutions in %d msecs.\n"
>>> call _printf
>>> xor eax, eax
>>> add esp, 80h
>>> pop ebx
>>> pop edi
>>> pop esi
>>> mov esp, ebp
>>> pop ebp
>>> retn
>>> ; ---------------------------------------------------------------------------
>>>
>>> loc_40117A: ; CODE XREF: _main+B0 j
>>> mov edx, [esp+80h+var_74]
>>> mov eax, [esp+80h+var_78]
>>> mov [edx+ebx*4-8], ecx
>>> jmp loc_4010CE
>>> ; ---------------------------------------------------------------------------
>>>
>>> loc_40118B: ; CODE XREF: _main+10C j
>>> ; _main+116 j ...
>>> mov ecx, [esp+80h+var_6C]
>>> mov edi, [esp+80h+var_5C]
>>> jmp loc_401087
>>> _main endp
>>> ```
>>> _______________________________________________
>>> LLVM Developers mailing list
>>> LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu
>>> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
More information about the llvm-dev
mailing list