[llvm-dev] Code optimisation regression(?) in loops compiled with the -Oz flag
Florian Hahn via llvm-dev
llvm-dev at lists.llvm.org
Wed Sep 25 01:07:04 PDT 2019
Hi
> On Sep 25, 2019, at 08:31, Joan Lluch via llvm-dev <llvm-dev at lists.llvm.org> wrote:
>
> Hi All,
>
> This simple loop code
>
> void loopTest()
> {
> for ( int i = 0 ; i<10 ; i++ ) {
> ftest();
> }
> }
>
> gets converted into this when compiled with the -Os flag:
>
> ; Function Attrs: nounwind optsize uwtable
> define void @loopTest() local_unnamed_addr #0 {
> entry:
> br label %for.body
>
> for.cond.cleanup: ; preds = %for.body
> ret void
>
> for.body: ; preds = %for.body, %entry
> %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
> %call = tail call i32 (...) @ftest() #2
> %inc = add nuw nsw i32 %i.03, 1
> %exitcond = icmp eq i32 %inc, 10
> br i1 %exitcond, label %for.cond.cleanup, label %for.body
> }
>
> For the x86 architecture, this gets compiled into this
>
> .section __TEXT,__text,regular,pure_instructions
> .macosx_version_min 10, 12
> .globl _loopTest ## -- Begin function loopTest
> _loopTest: ## @loopTest
> .cfi_startproc
> ## %bb.0: ## %entry
> pushq %rbp ## encoding: [0x55]
> .cfi_def_cfa_offset 16
> .cfi_offset %rbp, -16
> movq %rsp, %rbp ## encoding: [0x48,0x89,0xe5]
> .cfi_def_cfa_register %rbp
> pushq %rbx ## encoding: [0x53]
> pushq %rax ## encoding: [0x50]
> .cfi_offset %rbx, -24
> movl $10, %ebx ## encoding: [0xbb,0x0a,0x00,0x00,0x00]
> LBB0_1: ## %for.body
> ## =>This Inner Loop Header: Depth=1
> xorl %eax, %eax ## encoding: [0x31,0xc0]
> callq _ftest ## encoding: [0xe8,A,A,A,A]
> ## fixup A - offset: 1, value: _ftest-4, kind: reloc_branch_4byte_pcrel
> decl %ebx ## encoding: [0xff,0xcb]
> jne LBB0_1 ## encoding: [0x75,A]
> ## fixup A - offset: 1, value: LBB0_1-1, kind: FK_PCRel_1
> ## %bb.2: ## %for.cond.cleanup
> addq $8, %rsp ## encoding: [0x48,0x83,0xc4,0x08]
> popq %rbx ## encoding: [0x5b]
> popq %rbp ## encoding: [0x5d]
> retq ## encoding: [0xc3]
> .cfi_endproc
> ## -- End function
>
>
>
> The same code compiled with -Oz results in the following:
>
> ; Function Attrs: minsize nounwind optsize uwtable
> define void @loopTest() local_unnamed_addr #0 {
> entry:
> br label %for.cond
>
> for.cond: ; preds = %for.body, %entry
> %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
> %exitcond = icmp eq i32 %i.0, 10
> br i1 %exitcond, label %for.cond.cleanup, label %for.body
>
> for.cond.cleanup: ; preds = %for.cond
> ret void
>
> for.body: ; preds = %for.cond
> %call = tail call i32 (...) @ftest() #2
> %inc = add nuw nsw i32 %i.0, 1
> br label %for.cond
> }
>
> and this
>
> .section __TEXT,__text,regular,pure_instructions
> .macosx_version_min 10, 12
> .globl _loopTest ## -- Begin function loopTest
> _loopTest: ## @loopTest
> .cfi_startproc
> ## %bb.0: ## %entry
> pushq %rbp ## encoding: [0x55]
> .cfi_def_cfa_offset 16
> .cfi_offset %rbp, -16
> movq %rsp, %rbp ## encoding: [0x48,0x89,0xe5]
> .cfi_def_cfa_register %rbp
> pushq %rbx ## encoding: [0x53]
> pushq %rax ## encoding: [0x50]
> .cfi_offset %rbx, -24
> pushq $10 ## encoding: [0x6a,0x0a]
> popq %rbx ## encoding: [0x5b]
> LBB0_1: ## %for.cond
> ## =>This Inner Loop Header: Depth=1
> testl %ebx, %ebx ## encoding: [0x85,0xdb]
> je LBB0_2 ## encoding: [0x74,A]
> ## fixup A - offset: 1, value: LBB0_2-1, kind: FK_PCRel_1
> ## %bb.3: ## %for.body
> ## in Loop: Header=BB0_1 Depth=1
> xorl %eax, %eax ## encoding: [0x31,0xc0]
> callq _ftest ## encoding: [0xe8,A,A,A,A]
> ## fixup A - offset: 1, value: _ftest-4, kind: reloc_branch_4byte_pcrel
> decl %ebx ## encoding: [0xff,0xcb]
> jmp LBB0_1 ## encoding: [0xeb,A]
> ## fixup A - offset: 1, value: LBB0_1-1, kind: FK_PCRel_1
> LBB0_2: ## %for.cond.cleanup
> addq $8, %rsp ## encoding: [0x48,0x83,0xc4,0x08]
> popq %rbx ## encoding: [0x5b]
> popq %rbp ## encoding: [0x5d]
> retq ## encoding: [0xc3]
> .cfi_endproc
> ## -- End function
>
> The resulting loop body code for -Oz is longer than for -Os. This is because the loop exit comparison is performed at the beginning of the loop, resulting in an additional jump instruction and a missed opportunity to fold the exit condition with the induction-variable (IV) decrement.
>
The problem here is that with -Oz, LoopRotate is *very* conservative. Currently it is not clever enough to realize that the loop guard would always be true.
I have a patch that fixes that: https://reviews.llvm.org/D61683. There were some code-size improvements, as well as regressions. I just need to find some time to look into the causes of the regressions.
Cheers,
Florian
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20190925/5f9caf9f/attachment-0001.html>
More information about the llvm-dev
mailing list