[PATCH] D63692: [LSR] Improved code generation for Zero Compare loops
Joan LLuch via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 26 02:11:52 PDT 2019
joanlluch added a comment.
Hi Craig, thanks for commenting.
Yes, I was actually compiling for -Oz, but the differences when using -Os are even bigger. Let me try to explain every case.
- For -Oz, the compiler indeed generates expensive push/pop instructions in an attempt to reduce code size. However, after the patch is applied these instructions are removed without increasing code size. I have checked this in several scenarios and the resulting code is either the same size or smaller. This is because the patch reduces the overall number of instructions. In the examples above, the resulting code size is identical, and it's even reduced in some other cases. The size is identical here because the sequence:
pushl $-8 ## encoding: [0x6a,0xf8]
popl %edi ## encoding: [0x5f]
subl 12(%ebp), %edi ## encoding: [0x2b,0x7d,0x0c]
is replaced by:
movl 12(%ebp), %edi ## encoding: [0x8b,0x7d,0x0c]
addl $8, %edi ## encoding: [0x83,0xc7,0x08]
It's a total of 6 machine code bytes in both cases, but one fewer instruction after the patch.
This is just one example; there are many cases where both the code size and the number of instructions are reduced.
- Now, if we compile the same source with the -Os flag, we get even further improvements. This is the resulting code, before and after the patch, for the same source code:
Before:
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 12
.globl _LSRTestA ## -- Begin function LSRTestA
_LSRTestA: ## @LSRTestA
.cfi_startproc
## %bb.0: ## %entry
pushl %ebp ## encoding: [0x55]
.cfi_def_cfa_offset 8
.cfi_offset %ebp, -8
movl %esp, %ebp ## encoding: [0x89,0xe5]
.cfi_def_cfa_register %ebp
pushl %edi ## encoding: [0x57]
pushl %esi ## encoding: [0x56]
subl $16, %esp ## encoding: [0x83,0xec,0x10]
.cfi_offset %esi, -16
.cfi_offset %edi, -12
movl 12(%ebp), %eax ## encoding: [0x8b,0x45,0x0c]
cmpl $-8, %eax ## encoding: [0x83,0xf8,0xf8]
je LBB0_3 ## encoding: [0x74,A]
## fixup A - offset: 1, value: LBB0_3-1, kind: FK_PCRel_1
## %bb.1: ## %while.body.preheader
movl 8(%ebp), %esi ## encoding: [0x8b,0x75,0x08]
movl $-8, %edi ## encoding: [0xbf,0xf8,0xff,0xff,0xff]
subl %eax, %edi ## encoding: [0x29,0xc7]
LBB0_2: ## %while.body
## =>This Inner Loop Header: Depth=1
movl %esi, (%esp) ## encoding: [0x89,0x34,0x24]
calll _bar ## encoding: [0xe8,A,A,A,A]
## fixup A - offset: 1, value: _bar-4, kind: FK_PCRel_4
incl %edi ## encoding: [0x47]
jne LBB0_2 ## encoding: [0x75,A]
## fixup A - offset: 1, value: LBB0_2-1, kind: FK_PCRel_1
LBB0_3: ## %while.end
addl $16, %esp ## encoding: [0x83,0xc4,0x10]
popl %esi ## encoding: [0x5e]
popl %edi ## encoding: [0x5f]
popl %ebp ## encoding: [0x5d]
retl ## encoding: [0xc3]
.cfi_endproc
## -- End function
.globl _LSRTestB ## -- Begin function LSRTestB
_LSRTestB: ## @LSRTestB
.cfi_startproc
## %bb.0: ## %entry
pushl %ebp ## encoding: [0x55]
.cfi_def_cfa_offset 8
.cfi_offset %ebp, -8
movl %esp, %ebp ## encoding: [0x89,0xe5]
.cfi_def_cfa_register %ebp
pushl %edi ## encoding: [0x57]
pushl %esi ## encoding: [0x56]
subl $16, %esp ## encoding: [0x83,0xec,0x10]
.cfi_offset %esi, -16
.cfi_offset %edi, -12
movl 12(%ebp), %eax ## encoding: [0x8b,0x45,0x0c]
cmpl $-8, %eax ## encoding: [0x83,0xf8,0xf8]
je LBB1_3 ## encoding: [0x74,A]
## fixup A - offset: 1, value: LBB1_3-1, kind: FK_PCRel_1
## %bb.1: ## %for.body.preheader
movl 8(%ebp), %esi ## encoding: [0x8b,0x75,0x08]
movl $-8, %edi ## encoding: [0xbf,0xf8,0xff,0xff,0xff]
subl %eax, %edi ## encoding: [0x29,0xc7]
LBB1_2: ## %for.body
## =>This Inner Loop Header: Depth=1
movl %esi, (%esp) ## encoding: [0x89,0x34,0x24]
calll _bar ## encoding: [0xe8,A,A,A,A]
## fixup A - offset: 1, value: _bar-4, kind: FK_PCRel_4
decl %edi ## encoding: [0x4f]
jne LBB1_2 ## encoding: [0x75,A]
## fixup A - offset: 1, value: LBB1_2-1, kind: FK_PCRel_1
LBB1_3: ## %for.end
addl $16, %esp ## encoding: [0x83,0xc4,0x10]
popl %esi ## encoding: [0x5e]
popl %edi ## encoding: [0x5f]
popl %ebp ## encoding: [0x5d]
retl ## encoding: [0xc3]
.cfi_endproc
After:
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 12
.globl _LSRTestA ## -- Begin function LSRTestA
_LSRTestA: ## @LSRTestA
.cfi_startproc
## %bb.0: ## %entry
pushl %ebp ## encoding: [0x55]
.cfi_def_cfa_offset 8
.cfi_offset %ebp, -8
movl %esp, %ebp ## encoding: [0x89,0xe5]
.cfi_def_cfa_register %ebp
pushl %edi ## encoding: [0x57]
pushl %esi ## encoding: [0x56]
subl $16, %esp ## encoding: [0x83,0xec,0x10]
.cfi_offset %esi, -16
.cfi_offset %edi, -12
movl 12(%ebp), %esi ## encoding: [0x8b,0x75,0x0c]
addl $8, %esi ## encoding: [0x83,0xc6,0x08]
je LBB0_3 ## encoding: [0x74,A]
## fixup A - offset: 1, value: LBB0_3-1, kind: FK_PCRel_1
## %bb.1: ## %while.body.preheader
movl 8(%ebp), %edi ## encoding: [0x8b,0x7d,0x08]
LBB0_2: ## %while.body
## =>This Inner Loop Header: Depth=1
movl %edi, (%esp) ## encoding: [0x89,0x3c,0x24]
calll _bar ## encoding: [0xe8,A,A,A,A]
## fixup A - offset: 1, value: _bar-4, kind: FK_PCRel_4
decl %esi ## encoding: [0x4e]
jne LBB0_2 ## encoding: [0x75,A]
## fixup A - offset: 1, value: LBB0_2-1, kind: FK_PCRel_1
LBB0_3: ## %while.end
addl $16, %esp ## encoding: [0x83,0xc4,0x10]
popl %esi ## encoding: [0x5e]
popl %edi ## encoding: [0x5f]
popl %ebp ## encoding: [0x5d]
retl ## encoding: [0xc3]
.cfi_endproc
## -- End function
.globl _LSRTestB ## -- Begin function LSRTestB
_LSRTestB: ## @LSRTestB
.cfi_startproc
## %bb.0: ## %entry
pushl %ebp ## encoding: [0x55]
.cfi_def_cfa_offset 8
.cfi_offset %ebp, -8
movl %esp, %ebp ## encoding: [0x89,0xe5]
.cfi_def_cfa_register %ebp
pushl %edi ## encoding: [0x57]
pushl %esi ## encoding: [0x56]
subl $16, %esp ## encoding: [0x83,0xec,0x10]
.cfi_offset %esi, -16
.cfi_offset %edi, -12
movl 12(%ebp), %esi ## encoding: [0x8b,0x75,0x0c]
addl $8, %esi ## encoding: [0x83,0xc6,0x08]
je LBB1_3 ## encoding: [0x74,A]
## fixup A - offset: 1, value: LBB1_3-1, kind: FK_PCRel_1
## %bb.1: ## %for.body.preheader
movl 8(%ebp), %edi ## encoding: [0x8b,0x7d,0x08]
LBB1_2: ## %for.body
## =>This Inner Loop Header: Depth=1
movl %edi, (%esp) ## encoding: [0x89,0x3c,0x24]
calll _bar ## encoding: [0xe8,A,A,A,A]
## fixup A - offset: 1, value: _bar-4, kind: FK_PCRel_4
incl %esi ## encoding: [0x46]
jne LBB1_2 ## encoding: [0x75,A]
## fixup A - offset: 1, value: LBB1_2-1, kind: FK_PCRel_1
LBB1_3: ## %for.end
addl $16, %esp ## encoding: [0x83,0xc4,0x10]
popl %esi ## encoding: [0x5e]
popl %edi ## encoding: [0x5f]
popl %ebp ## encoding: [0x5d]
retl ## encoding: [0xc3]
.cfi_endproc
So in this case we get an improvement of 2 fewer instructions and 7 fewer bytes per function.
The same sort of improvement is obtained with -O3 and the remaining optimization levels.
Please let me know if you need me to perform some specific test.
Thanks,
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D63692/new/
https://reviews.llvm.org/D63692
More information about the llvm-commits
mailing list