[PATCH] D42607: [LoopStrengthReduce, x86] don't add cost for a cmp that will be macro-fused (PR35681)
Sanjay Patel via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 30 12:17:26 PST 2018
spatel added a comment.
Thanks @evstupac and @venkataramanan.kumar.llvm for the feedback. I micro-benchmarked the maxArray code in its -O2 unrolled form from PR28343 (see below for object dump with address offsets), but as expected, I can't measure any perf difference on Haswell.
It does shrink the loop from 134 bytes to 97 bytes though.
Baseline (doesn't account for macro-fusion, so fewer instructions are considered better):
0000000100001000 movq $-0x80000, %rax
0000000100001007 nopw (%rax,%rax)
0000000100001010 movupd 0x80000(%rsi,%rax), %xmm0
0000000100001019 movupd 0x80000(%rdi,%rax), %xmm1
0000000100001022 maxpd %xmm1, %xmm0
0000000100001026 movupd 0x80010(%rdi,%rax), %xmm1
000000010000102f movupd 0x80020(%rdi,%rax), %xmm2
0000000100001038 movupd 0x80030(%rdi,%rax), %xmm3
0000000100001041 movupd %xmm0, 0x80000(%rdi,%rax)
000000010000104a movupd 0x80010(%rsi,%rax), %xmm0
0000000100001053 maxpd %xmm1, %xmm0
0000000100001057 movupd %xmm0, 0x80010(%rdi,%rax)
0000000100001060 movupd 0x80020(%rsi,%rax), %xmm0
0000000100001069 maxpd %xmm2, %xmm0
000000010000106d movupd 0x80030(%rsi,%rax), %xmm1
0000000100001076 movupd %xmm0, 0x80020(%rdi,%rax)
000000010000107f maxpd %xmm3, %xmm1
0000000100001083 movupd %xmm1, 0x80030(%rdi,%rax)
000000010000108c addq $0x40, %rax
0000000100001090 jne 0x100001010
0000000100001096 retq
With macro-fusion accounted for in the cost calculation (one extra instruction, but allows smaller constant offsets):
0000000100001000 xorl %eax, %eax
0000000100001002 nopw %cs:(%rax,%rax)
000000010000100c nopl (%rax)
0000000100001010 movupd (%rsi,%rax,8), %xmm0
0000000100001015 movupd 0x10(%rsi,%rax,8), %xmm1
000000010000101b movupd (%rdi,%rax,8), %xmm2
0000000100001020 maxpd %xmm2, %xmm0
0000000100001024 movupd 0x10(%rdi,%rax,8), %xmm2
000000010000102a maxpd %xmm2, %xmm1
000000010000102e movupd 0x20(%rdi,%rax,8), %xmm2
0000000100001034 movupd 0x30(%rdi,%rax,8), %xmm3
000000010000103a movupd %xmm0, (%rdi,%rax,8)
000000010000103f movupd %xmm1, 0x10(%rdi,%rax,8)
0000000100001045 movupd 0x20(%rsi,%rax,8), %xmm0
000000010000104b maxpd %xmm2, %xmm0
000000010000104f movupd 0x30(%rsi,%rax,8), %xmm1
0000000100001055 maxpd %xmm3, %xmm1
0000000100001059 movupd %xmm0, 0x20(%rdi,%rax,8)
000000010000105f movupd %xmm1, 0x30(%rdi,%rax,8)
0000000100001065 addq $0x8, %rax
0000000100001069 cmpq $0x10000, %rax
000000010000106f jne 0x100001010
0000000100001071 retq
https://reviews.llvm.org/D42607
More information about the llvm-commits mailing list