[PATCH] D63972: [PowerPC] Do the Early Return for the li and unconditional branch
Zhang Kang via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 3 08:41:25 PDT 2019
ZhangKang added a comment.
In D63972#1565274 <https://reviews.llvm.org/D63972#1565274>, @efriedma wrote:
> This seems like simple tail duplication, which the target-independent taildup pass should handle. Can you give an example which taildup doesn't handle?
@efriedma, sorry for the late reply. I spent some time finding a case and investigating the reason.
The `tail duplication` pass targets tail BBs that are reached by an unconditional branch. Most of the cases I wrote manually can be optimized by the `tail duplication` pass.
But there are still some cases that can't be optimized by the `tail duplication` pass, because the pattern is only created after the `tail duplication` pass has run.
The case below is from SPEC; I have narrowed down the original case. SPEC also has many other cases that trigger the early-ret I wrote, even after the `tail duplication` pass has run:
; ModuleID = 'HashXMLCh.ll'
source_filename = "HashXMLCh.cpp"
target datalayout = "e-m:e-i64:64-n32:64"
target triple = "powerpc64le-unknown-linux-gnu"
%"class.xercesc_2_7::HashXMLCh" = type { %"class.xercesc_2_7::HashBase" }
%"class.xercesc_2_7::HashBase" = type { i32 (...)** }
; Function Attrs: norecurse nounwind readonly
define dso_local zeroext i1 @_ZN11xercesc_2_79HashXMLCh6equalsEPKvS2_(%"class.xercesc_2_7::HashXMLCh"* nocapture readnone %this, i8* readonly %key1, i8* readonly %key2) unnamed_addr #0 align 2 {
entry:
%0 = bitcast i8* %key1 to i16*
%1 = bitcast i8* %key2 to i16*
%cmp.i = icmp eq i8* %key1, null
%cmp1.i = icmp eq i8* %key2, null
%or.cond.i = or i1 %cmp.i, %cmp1.i
br i1 %or.cond.i, label %if.then.i, label %while.cond.preheader.i
while.cond.preheader.i: ; preds = %entry
%2 = load i16, i16* %0, align 2, !tbaa !2
%3 = load i16, i16* %1, align 2, !tbaa !2
%cmp926.i = icmp eq i16 %2, %3
br i1 %cmp926.i, label %while.body.i, label %_ZN11xercesc_2_79XMLString6equalsEPKtS2_.exit
if.then.i: ; preds = %entry
br i1 %cmp.i, label %lor.lhs.false3.i, label %land.lhs.true.i
land.lhs.true.i: ; preds = %if.then.i
%4 = load i16, i16* %0, align 2, !tbaa !2
%tobool.i = icmp eq i16 %4, 0
br i1 %tobool.i, label %lor.lhs.false3.i, label %_ZN11xercesc_2_79XMLString6equalsEPKtS2_.exit
lor.lhs.false3.i: ; preds = %land.lhs.true.i, %if.then.i
br i1 %cmp1.i, label %if.else.i, label %land.lhs.true5.i
land.lhs.true5.i: ; preds = %lor.lhs.false3.i
%5 = load i16, i16* %1, align 2, !tbaa !2
%tobool6.i = icmp eq i16 %5, 0
br i1 %tobool6.i, label %if.else.i, label %_ZN11xercesc_2_79XMLString6equalsEPKtS2_.exit
if.else.i: ; preds = %land.lhs.true5.i, %lor.lhs.false3.i
br label %_ZN11xercesc_2_79XMLString6equalsEPKtS2_.exit
while.body.i: ; preds = %while.cond.preheader.i, %if.end12.i
%6 = phi i16 [ %7, %if.end12.i ], [ %2, %while.cond.preheader.i ]
%psz2.028.i = phi i16* [ %incdec.ptr13.i, %if.end12.i ], [ %1, %while.cond.preheader.i ]
%psz1.027.i = phi i16* [ %incdec.ptr.i, %if.end12.i ], [ %0, %while.cond.preheader.i ]
%tobool10.i = icmp eq i16 %6, 0
br i1 %tobool10.i, label %_ZN11xercesc_2_79XMLString6equalsEPKtS2_.exit, label %if.end12.i
if.end12.i: ; preds = %while.body.i
%incdec.ptr.i = getelementptr inbounds i16, i16* %psz1.027.i, i64 1
%incdec.ptr13.i = getelementptr inbounds i16, i16* %psz2.028.i, i64 1
%7 = load i16, i16* %incdec.ptr.i, align 2, !tbaa !2
%8 = load i16, i16* %incdec.ptr13.i, align 2, !tbaa !2
%cmp9.i = icmp eq i16 %7, %8
br i1 %cmp9.i, label %while.body.i, label %_ZN11xercesc_2_79XMLString6equalsEPKtS2_.exit
_ZN11xercesc_2_79XMLString6equalsEPKtS2_.exit: ; preds = %if.end12.i, %while.body.i, %if.else.i, %land.lhs.true5.i, %land.lhs.true.i, %while.cond.preheader.i
%retval.0.i = phi i1 [ true, %if.else.i ], [ false, %land.lhs.true.i ], [ false, %land.lhs.true5.i ], [ false, %while.cond.preheader.i ], [ true, %while.body.i ], [ false, %if.end12.i ]
ret i1 %retval.0.i
}
attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 9.0.0 (git@github.ibm.com:compiler/llvm-project.git ab758ba128c46ba30cad058b89991852f7be5543)"}
!2 = !{!3, !3, i64 0}
!3 = !{!"short", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C++ TBAA"}
Without this patch, we get the following assembly:
1 .text
2 .abiversion 2
3 .file "HashXMLCh.cpp"
4 .globl _ZN11xercesc_2_79HashXMLCh6equalsEPKvS2_ # -- Begin function _ZN11xercesc_2_79HashXMLCh6equalsEPKvS2_
5 .p2align 4
6 .type _ZN11xercesc_2_79HashXMLCh6equalsEPKvS2_,@function
7 _ZN11xercesc_2_79HashXMLCh6equalsEPKvS2_: # @_ZN11xercesc_2_79HashXMLCh6equalsEPKvS2_
8 .Lfunc_begin0:
9 # %bb.0: # %entry
10 cmpdi 1, 4, 0
11 cmpdi 5, 0
12 cror 20, 6, 2
13 bc 4, 20, .LBB0_6
14 # %bb.1: # %if.then.i
15 bc 12, 6, .LBB0_3
16 # %bb.2: # %land.lhs.true.i
17 lhz 4, 0(4)
18 li 3, 0
19 cmplwi 1, 4, 0
20 bnelr 1
21 .LBB0_3: # %lor.lhs.false3.i
22 bc 12, 2, .LBB0_5
23 # %bb.4: # %land.lhs.true5.i
24 lhz 4, 0(5)
25 li 3, 0
26 cmplwi 4, 0
27 bnelr 0
28 .LBB0_5: # %if.else.i
29 b .LBB0_10
30 .LBB0_6: # %while.cond.preheader.i
31 lhz 8, 0(4)
32 lhz 6, 0(5)
33 li 3, 0
34 cmplw 8, 6
35 bnelr 0
36 # %bb.7: # %while.body.i.preheader
37 addi 6, 5, 2
38 addi 7, 4, 2
39 .p2align 4
40 .LBB0_8: # %while.body.i
41 # =>This Inner Loop Header: Depth=1
42 andi. 8, 8, 65535
43 beq 0, .LBB0_10
44 # %bb.9: # %if.end12.i
45 # in Loop: Header=BB0_8 Depth=1
46 addi 5, 5, 2
47 addi 4, 4, 2
48 lhz 8, 0(4)
49 lhz 9, 0(5)
50 addi 6, 6, 2
51 addi 7, 7, 2
52 cmplw 8, 9
53 beq 0, .LBB0_8
54 blr
55 .LBB0_10:
56 li 3, 1
57 blr
58 .long 0
59 .quad 0
60 .Lfunc_end0:
61 .size _ZN11xercesc_2_79HashXMLCh6equalsEPKvS2_, .Lfunc_end0-.Lfunc_begin0
62 # -- End function
63
64 .ident "clang version 9.0.0 (git@github.ibm.com:compiler/llvm-project.git ab758ba128c46ba30cad058b89991852f7be5543)"
65 .section ".note.GNU-stack","",@progbits
We can see that line 29, `b .LBB0_10`, is an unconditional branch to `.LBB0_10`.
Below is `.LBB0_10`:
55 .LBB0_10:
56 li 3, 1
57 blr
This patch optimizes line 29 to:
29 li 3, 1
30 blr
Line 29, `b .LBB0_10`, only appears after the `branch-folder` pass has run, and the `branch-folder` pass runs after the `tail duplication` pass.
Improving conditional branches to an early return, such as line 43 `beq 0, .LBB0_10`, will be my next patch. This patch only improves the unconditional branch.
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D63972/new/
https://reviews.llvm.org/D63972
More information about the llvm-commits
mailing list