[PATCH] D63692: [LSR] Improved code generation for Zero Compare loops

Wed Jun 26 02:11:52 PDT 2019

joanlluch added a comment.

Hi Craig, thanks for commenting.
Yes, I was actually compiling for -Oz, but the differences when using -Os are even bigger. Let me try to explain every case.

- For -Oz, the compiler indeed generates expensive push/pop instructions in an attempt to reduce code size. However, after the patch is applied these instructions are removed without increasing code size. I have checked this in several scenarios, and the resulting code is either the same size or smaller, because the patch reduces the overall number of instructions. In the examples above the resulting code size is identical, and in some other cases it is even reduced. The size is identical here because the sequence:

  	pushl	$-8                     ## encoding: [0x6a,0xf8]
  	popl	%edi                    ## encoding: [0x5f]
  	subl	12(%ebp), %edi          ## encoding: [0x2b,0x7d,0x0c]

is replaced by:

  	movl	12(%ebp), %edi          ## encoding: [0x8b,0x7d,0x0c]
  	addl	$8, %edi                ## encoding: [0x83,0xc7,0x08]

It's a total of 6 machine code bytes in both cases, but one fewer instruction after the patch.
This is just an example, there are many cases where both the code size and number of instructions are reduced.

- Now, if we compile the same code with the -Os flag we get even further improvements. This is the resulting code before and after the patch for the same source code:

Before:

  	.section	__TEXT,__text,regular,pure_instructions
  	.macosx_version_min 10, 12
  	.globl	_LSRTestA               ## -- Begin function LSRTestA
  _LSRTestA:                              ## @LSRTestA
  	.cfi_startproc
  ## %bb.0:                               ## %entry
  	pushl	%ebp                    ## encoding: [0x55]
  	.cfi_def_cfa_offset 8
  	.cfi_offset %ebp, -8
  	movl	%esp, %ebp              ## encoding: [0x89,0xe5]
  	.cfi_def_cfa_register %ebp
  	pushl	%edi                    ## encoding: [0x57]
  	pushl	%esi                    ## encoding: [0x56]
  	subl	$16, %esp               ## encoding: [0x83,0xec,0x10]
  	.cfi_offset %esi, -16
  	.cfi_offset %edi, -12
  	movl	12(%ebp), %eax          ## encoding: [0x8b,0x45,0x0c]
  	cmpl	$-8, %eax               ## encoding: [0x83,0xf8,0xf8]
  	je	LBB0_3                  ## encoding: [0x74,A]
                                          ##   fixup A - offset: 1, value: LBB0_3-1, kind: FK_PCRel_1
  ## %bb.1:                               ## %while.body.preheader
  	movl	8(%ebp), %esi           ## encoding: [0x8b,0x75,0x08]
  	movl	$-8, %edi               ## encoding: [0xbf,0xf8,0xff,0xff,0xff]
  	subl	%eax, %edi              ## encoding: [0x29,0xc7]
  LBB0_2:                                 ## %while.body
                                          ## =>This Inner Loop Header: Depth=1
  	movl	%esi, (%esp)            ## encoding: [0x89,0x34,0x24]
  	calll	_bar                    ## encoding: [0xe8,A,A,A,A]
                                          ##   fixup A - offset: 1, value: _bar-4, kind: FK_PCRel_4
  	incl	%edi                    ## encoding: [0x47]
  	jne	LBB0_2                  ## encoding: [0x75,A]
                                          ##   fixup A - offset: 1, value: LBB0_2-1, kind: FK_PCRel_1
  LBB0_3:                                 ## %while.end
  	addl	$16, %esp               ## encoding: [0x83,0xc4,0x10]
  	popl	%esi                    ## encoding: [0x5e]
  	popl	%edi                    ## encoding: [0x5f]
  	popl	%ebp                    ## encoding: [0x5d]
  	retl                            ## encoding: [0xc3]
  	.cfi_endproc
                                          ## -- End function
  	.globl	_LSRTestB               ## -- Begin function LSRTestB
  _LSRTestB:                              ## @LSRTestB
  	.cfi_startproc
  ## %bb.0:                               ## %entry
  	pushl	%ebp                    ## encoding: [0x55]
  	.cfi_def_cfa_offset 8
  	.cfi_offset %ebp, -8
  	movl	%esp, %ebp              ## encoding: [0x89,0xe5]
  	.cfi_def_cfa_register %ebp
  	pushl	%edi                    ## encoding: [0x57]
  	pushl	%esi                    ## encoding: [0x56]
  	subl	$16, %esp               ## encoding: [0x83,0xec,0x10]
  	.cfi_offset %esi, -16
  	.cfi_offset %edi, -12
  	movl	12(%ebp), %eax          ## encoding: [0x8b,0x45,0x0c]
  	cmpl	$-8, %eax               ## encoding: [0x83,0xf8,0xf8]
  	je	LBB1_3                  ## encoding: [0x74,A]
                                          ##   fixup A - offset: 1, value: LBB1_3-1, kind: FK_PCRel_1
  ## %bb.1:                               ## %for.body.preheader
  	movl	8(%ebp), %esi           ## encoding: [0x8b,0x75,0x08]
  	movl	$-8, %edi               ## encoding: [0xbf,0xf8,0xff,0xff,0xff]
  	subl	%eax, %edi              ## encoding: [0x29,0xc7]
  LBB1_2:                                 ## %for.body
                                          ## =>This Inner Loop Header: Depth=1
  	movl	%esi, (%esp)            ## encoding: [0x89,0x34,0x24]
  	calll	_bar                    ## encoding: [0xe8,A,A,A,A]
                                          ##   fixup A - offset: 1, value: _bar-4, kind: FK_PCRel_4
  	decl	%edi                    ## encoding: [0x4f]
  	jne	LBB1_2                  ## encoding: [0x75,A]
                                          ##   fixup A - offset: 1, value: LBB1_2-1, kind: FK_PCRel_1
  LBB1_3:                                 ## %for.end
  	addl	$16, %esp               ## encoding: [0x83,0xc4,0x10]
  	popl	%esi                    ## encoding: [0x5e]
  	popl	%edi                    ## encoding: [0x5f]
  	popl	%ebp                    ## encoding: [0x5d]
  	retl                            ## encoding: [0xc3]
  	.cfi_endproc

After:

  	.section	__TEXT,__text,regular,pure_instructions
  	.macosx_version_min 10, 12
  	.globl	_LSRTestA               ## -- Begin function LSRTestA
  _LSRTestA:                              ## @LSRTestA
  	.cfi_startproc
  ## %bb.0:                               ## %entry
  	pushl	%ebp                    ## encoding: [0x55]
  	.cfi_def_cfa_offset 8
  	.cfi_offset %ebp, -8
  	movl	%esp, %ebp              ## encoding: [0x89,0xe5]
  	.cfi_def_cfa_register %ebp
  	pushl	%edi                    ## encoding: [0x57]
  	pushl	%esi                    ## encoding: [0x56]
  	subl	$16, %esp               ## encoding: [0x83,0xec,0x10]
  	.cfi_offset %esi, -16
  	.cfi_offset %edi, -12
  	movl	12(%ebp), %esi          ## encoding: [0x8b,0x75,0x0c]
  	addl	$8, %esi                ## encoding: [0x83,0xc6,0x08]
  	je	LBB0_3                  ## encoding: [0x74,A]
                                          ##   fixup A - offset: 1, value: LBB0_3-1, kind: FK_PCRel_1
  ## %bb.1:                               ## %while.body.preheader
  	movl	8(%ebp), %edi           ## encoding: [0x8b,0x7d,0x08]
  LBB0_2:                                 ## %while.body
                                          ## =>This Inner Loop Header: Depth=1
  	movl	%edi, (%esp)            ## encoding: [0x89,0x3c,0x24]
  	calll	_bar                    ## encoding: [0xe8,A,A,A,A]
                                          ##   fixup A - offset: 1, value: _bar-4, kind: FK_PCRel_4
  	decl	%esi                    ## encoding: [0x4e]
  	jne	LBB0_2                  ## encoding: [0x75,A]
                                          ##   fixup A - offset: 1, value: LBB0_2-1, kind: FK_PCRel_1
  LBB0_3:                                 ## %while.end
  	addl	$16, %esp               ## encoding: [0x83,0xc4,0x10]
  	popl	%esi                    ## encoding: [0x5e]
  	popl	%edi                    ## encoding: [0x5f]
  	popl	%ebp                    ## encoding: [0x5d]
  	retl                            ## encoding: [0xc3]
  	.cfi_endproc
                                          ## -- End function
  	.globl	_LSRTestB               ## -- Begin function LSRTestB
  _LSRTestB:                              ## @LSRTestB
  	.cfi_startproc
  ## %bb.0:                               ## %entry
  	pushl	%ebp                    ## encoding: [0x55]
  	.cfi_def_cfa_offset 8
  	.cfi_offset %ebp, -8
  	movl	%esp, %ebp              ## encoding: [0x89,0xe5]
  	.cfi_def_cfa_register %ebp
  	pushl	%edi                    ## encoding: [0x57]
  	pushl	%esi                    ## encoding: [0x56]
  	subl	$16, %esp               ## encoding: [0x83,0xec,0x10]
  	.cfi_offset %esi, -16
  	.cfi_offset %edi, -12
  	movl	12(%ebp), %esi          ## encoding: [0x8b,0x75,0x0c]
  	addl	$8, %esi                ## encoding: [0x83,0xc6,0x08]
  	je	LBB1_3                  ## encoding: [0x74,A]
                                          ##   fixup A - offset: 1, value: LBB1_3-1, kind: FK_PCRel_1
  ## %bb.1:                               ## %for.body.preheader
  	movl	8(%ebp), %edi           ## encoding: [0x8b,0x7d,0x08]
  LBB1_2:                                 ## %for.body
                                          ## =>This Inner Loop Header: Depth=1
  	movl	%edi, (%esp)            ## encoding: [0x89,0x3c,0x24]
  	calll	_bar                    ## encoding: [0xe8,A,A,A,A]
                                          ##   fixup A - offset: 1, value: _bar-4, kind: FK_PCRel_4
  	incl	%esi                    ## encoding: [0x46]
  	jne	LBB1_2                  ## encoding: [0x75,A]
                                          ##   fixup A - offset: 1, value: LBB1_2-1, kind: FK_PCRel_1
  LBB1_3:                                 ## %for.end
  	addl	$16, %esp               ## encoding: [0x83,0xc4,0x10]
  	popl	%esi                    ## encoding: [0x5e]
  	popl	%edi                    ## encoding: [0x5f]
  	popl	%ebp                    ## encoding: [0x5d]
  	retl                            ## encoding: [0xc3]
  	.cfi_endproc

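So in this case we get an improvement of 2 fewer instructions and 7 fewer bytes per function: the `cmpl $-8, %eax` / `movl $-8, %edi` / `subl %eax, %edi` sequence (3 + 5 + 2 = 10 bytes) is replaced by a single `addl $8, %esi` (3 bytes).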

The same sort of improvement is obtained with -O3 and the remaining optimization levels.
Please let me know if you need me to perform some specific test.

Thanks,

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D63692/new/

https://reviews.llvm.org/D63692