[PATCH] D63692: [LSR] Improved code generation for Zero Compare loops

Joan LLuch via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 26 00:57:29 PDT 2019


joanlluch added a comment.

Hi shchenz,

Thanks for taking the time to review this. The assertion crash you reported on SystemZ is now fixed.

I stated in the description that I fixed the codegen tests for X86 and ARM, but I'm waiting for feedback on the other platforms because I'm not as familiar with them. It would be easy for me to just run "update_llc_test_checks.py" and forget about it, but I don't think that's what the community would want to see. On the other hand, I understand you can help with the PowerPC platform, so maybe you can look at that.

This patch improves code generation not because it chooses a particular induction variable direction (increment or decrement), but because it chooses the less costly or more natural one. Below are a couple of examples showing improvements for both positive and negative increments:

  void bar(int a);
  
  void LSRTestA( int a, unsigned amount )
  {
    amount += 8;
    while ( amount-- )
      bar(a);
  }
  
  void LSRTestB( int a, unsigned amount )
  {
    amount += 8;
    for ( ; amount != 0 ; amount++ )
      bar(a);
  }

Before:

  	.section	__TEXT,__text,regular,pure_instructions
  	.macosx_version_min 10, 12
  	.globl	_LSRTestA
  _LSRTestA:
  	.cfi_startproc
  	pushl	%ebp
  	.cfi_def_cfa_offset 8
  	.cfi_offset %ebp, -8
  	movl	%esp, %ebp
  	.cfi_def_cfa_register %ebp
  	pushl	%edi
  	pushl	%esi
  	subl	$16, %esp
  	.cfi_offset %esi, -16
  	.cfi_offset %edi, -12
  	movl	8(%ebp), %esi
  	pushl	$-8
  	popl	%edi
  	subl	12(%ebp), %edi
  	jmp	LBB0_1
  LBB0_2:
  	movl	%esi, (%esp)
  	calll	_bar
  	incl	%edi
  LBB0_1:
  	testl	%edi, %edi
  	jne	LBB0_2
  	addl	$16, %esp
  	popl	%esi
  	popl	%edi
  	popl	%ebp
  	retl
  	.cfi_endproc
  
  	.globl	_LSRTestB
  _LSRTestB:
  	.cfi_startproc
  	pushl	%ebp
  	.cfi_def_cfa_offset 8
  	.cfi_offset %ebp, -8
  	movl	%esp, %ebp
  	.cfi_def_cfa_register %ebp
  	pushl	%edi
  	pushl	%esi
  	subl	$16, %esp
  	.cfi_offset %esi, -16
  	.cfi_offset %edi, -12
  	movl	8(%ebp), %esi
  	pushl	$-8
  	popl	%edi
  	subl	12(%ebp), %edi
  	jmp	LBB1_1
  LBB1_2:
  	movl	%esi, (%esp)
  	calll	_bar
  	decl	%edi
  LBB1_1:
  	testl	%edi, %edi
  	jne	LBB1_2
  	addl	$16, %esp
  	popl	%esi
  	popl	%edi
  	popl	%ebp
  	retl
  	.cfi_endproc

After:

  	.section	__TEXT,__text,regular,pure_instructions
  	.macosx_version_min 10, 12
  	.globl	_LSRTestA
  _LSRTestA:
  	.cfi_startproc
  	pushl	%ebp
  	.cfi_def_cfa_offset 8
  	.cfi_offset %ebp, -8
  	movl	%esp, %ebp
  	.cfi_def_cfa_register %ebp
  	pushl	%edi
  	pushl	%esi
  	subl	$16, %esp
  	.cfi_offset %esi, -16
  	.cfi_offset %edi, -12
  	movl	8(%ebp), %esi
  	movl	12(%ebp), %edi
  	addl	$8, %edi
  	jmp	LBB0_1
  LBB0_2:
  	decl	%edi
  	movl	%esi, (%esp)
  	calll	_bar
  LBB0_1:
  	testl	%edi, %edi
  	jne	LBB0_2
  	addl	$16, %esp
  	popl	%esi
  	popl	%edi
  	popl	%ebp
  	retl
  	.cfi_endproc
  
  	.globl	_LSRTestB
  _LSRTestB:
  	.cfi_startproc
  	pushl	%ebp
  	.cfi_def_cfa_offset 8
  	.cfi_offset %ebp, -8
  	movl	%esp, %ebp
  	.cfi_def_cfa_register %ebp
  	pushl	%edi
  	pushl	%esi
  	subl	$16, %esp
  	.cfi_offset %esi, -16
  	.cfi_offset %edi, -12
  	movl	8(%ebp), %esi
  	movl	12(%ebp), %edi
  	addl	$8, %edi
  	jmp	LBB1_1
  LBB1_2:
  	movl	%esi, (%esp)
  	calll	_bar
  	incl	%edi
  LBB1_1:
  	testl	%edi, %edi
  	jne	LBB1_2
  	addl	$16, %esp
  	popl	%esi
  	popl	%edi
  	popl	%ebp
  	retl
  	.cfi_endproc

In these examples, the potentially expensive push/pop/sub sequence used to materialize the negated loop count is replaced by a plain mov and add.
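
To make the difference more explicit, here is a conceptual C rendering of what the two versions of LSRTestA compute. This is not compiler output; the _before/_after names and the explicit induction variable i are only illustrative:

  void bar(int a);
  
  /* Before: LSR counts up from the negated trip count, so -(amount + 8)
     must be materialized first (pushl $-8 ; popl %edi ; subl 12(%ebp), %edi). */
  void LSRTestA_before( int a, unsigned amount )
  {
    unsigned i = 0u - (amount + 8);
    while ( i != 0 ) {
      bar(a);
      i++;                           /* incl %edi */
    }
  }
  
  /* After: the natural down-counting induction variable is kept, so the
     start value is simply amount + 8 (movl 12(%ebp), %edi ; addl $8, %edi). */
  void LSRTestA_after( int a, unsigned amount )
  {
    unsigned i = amount + 8;
    while ( i != 0 ) {
      i--;                           /* decl %edi */
      bar(a);
    }
  }

Keeping the down-counting form avoids materializing the negated start value, which is exactly where the extra push/pop/sub sequence came from.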

Regarding PowerPC, I'm not that familiar with it. If it needs the loop count to always be positive to take advantage of the hardware, then I suggest you propose a patch for that; maybe we can work together if you agree. My current proposal does not particularly favour either count direction, but given equal cost it tends to honour the one specified in the source code. So at the end of the day, I think that even for PowerPC it should still result in an overall improvement, because most loops count in the positive direction anyway. Please correct me if I am wrong.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D63692/new/

https://reviews.llvm.org/D63692




