[PATCH] D63972: [PowerPC] Do the Early Return for the li and unconditional branch

Wed Jul 3 08:41:25 PDT 2019

ZhangKang added a comment.

In D63972#1565274 <https://reviews.llvm.org/D63972#1565274>, @efriedma wrote:

> This seems like simple tail duplication, which the target-independent taildup pass should handle.  Can you give an example which taildup doesn't handle?

@efriedma, I am sorry for the late reply. I spent some time finding a case and investigating the reason.
The `tail duplication` pass handles tail BBs that are reached by an unconditional branch. Most of the cases I wrote manually can be optimized by the `tail duplication` pass.
But there are still some cases that can't be optimized by the `tail duplication` pass, because the pattern is only created after the `tail duplication` pass has run.

The case below is from SPEC; I have narrowed down the original case. SPEC also has many other cases that can trigger the early-ret I wrote after the `tail duplication` pass has run:

  ; ModuleID = 'HashXMLCh.ll'
  source_filename = "HashXMLCh.cpp"
  target datalayout = "e-m:e-i64:64-n32:64"
  target triple = "powerpc64le-unknown-linux-gnu"
  %"class.xercesc_2_7::HashXMLCh" = type { %"class.xercesc_2_7::HashBase" }
  %"class.xercesc_2_7::HashBase" = type { i32 (...)** }
  ; Function Attrs: norecurse nounwind readonly
  define dso_local zeroext i1 @_ZN11xercesc_2_79HashXMLCh6equalsEPKvS2_(%"class.xercesc_2_7::HashXMLCh"* nocapture readnone %this, i8* readonly %key1, i8* readonly %key2) unnamed_addr #0 align 2 {
    %0 = bitcast i8* %key1 to i16*
    %1 = bitcast i8* %key2 to i16*
    %cmp.i = icmp eq i8* %key1, null
    %cmp1.i = icmp eq i8* %key2, null
    %or.cond.i = or i1 %cmp.i, %cmp1.i
    br i1 %or.cond.i, label %if.then.i, label %while.cond.preheader.i
  while.cond.preheader.i:                           ; preds = %entry
    %2 = load i16, i16* %0, align 2, !tbaa !2
    %3 = load i16, i16* %1, align 2, !tbaa !2
    %cmp926.i = icmp eq i16 %2, %3
    br i1 %cmp926.i, label %while.body.i, label %_ZN11xercesc_2_79XMLString6equalsEPKtS2_.exit
  if.then.i:                                        ; preds = %entry
    br i1 %cmp.i, label %lor.lhs.false3.i, label %land.lhs.true.i
  land.lhs.true.i:                                  ; preds = %if.then.i
    %4 = load i16, i16* %0, align 2, !tbaa !2
    %tobool.i = icmp eq i16 %4, 0
    br i1 %tobool.i, label %lor.lhs.false3.i, label %_ZN11xercesc_2_79XMLString6equalsEPKtS2_.exit
  lor.lhs.false3.i:                                 ; preds = %land.lhs.true.i, %if.then.i
    br i1 %cmp1.i, label %if.else.i, label %land.lhs.true5.i
  land.lhs.true5.i:                                 ; preds = %lor.lhs.false3.i
    %5 = load i16, i16* %1, align 2, !tbaa !2
    %tobool6.i = icmp eq i16 %5, 0
    br i1 %tobool6.i, label %if.else.i, label %_ZN11xercesc_2_79XMLString6equalsEPKtS2_.exit
  if.else.i:                                        ; preds = %land.lhs.true5.i, %lor.lhs.false3.i
    br label %_ZN11xercesc_2_79XMLString6equalsEPKtS2_.exit
  while.body.i:                                     ; preds = %while.cond.preheader.i, %if.end12.i
    %6 = phi i16 [ %7, %if.end12.i ], [ %2, %while.cond.preheader.i ]
    %psz2.028.i = phi i16* [ %incdec.ptr13.i, %if.end12.i ], [ %1, %while.cond.preheader.i ]
    %psz1.027.i = phi i16* [ %incdec.ptr.i, %if.end12.i ], [ %0, %while.cond.preheader.i ]
    %tobool10.i = icmp eq i16 %6, 0
    br i1 %tobool10.i, label %_ZN11xercesc_2_79XMLString6equalsEPKtS2_.exit, label %if.end12.i
  if.end12.i:                                       ; preds = %while.body.i
    %incdec.ptr.i = getelementptr inbounds i16, i16* %psz1.027.i, i64 1
    %incdec.ptr13.i = getelementptr inbounds i16, i16* %psz2.028.i, i64 1
    %7 = load i16, i16* %incdec.ptr.i, align 2, !tbaa !2
    %8 = load i16, i16* %incdec.ptr13.i, align 2, !tbaa !2
    %cmp9.i = icmp eq i16 %7, %8
    br i1 %cmp9.i, label %while.body.i, label %_ZN11xercesc_2_79XMLString6equalsEPKtS2_.exit
  _ZN11xercesc_2_79XMLString6equalsEPKtS2_.exit:    ; preds = %if.end12.i, %while.body.i, %if.else.i, %land.lhs.true5.i, %land.lhs.true.i, %while.cond.preheader.i
    %retval.0.i = phi i1 [ true, %if.else.i ], [ false, %land.lhs.true.i ], [ false, %land.lhs.true5.i ], [ false, %while.cond.preheader.i ], [ true, %while.body.i ], [ false, %if.end12.i ]
    ret i1 %retval.0.i
  attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" }
  !llvm.module.flags = !{!0}
  !llvm.ident = !{!1}
  !0 = !{i32 1, !"wchar_size", i32 4}
  !1 = !{!"clang version 9.0.0 (git at github.ibm.com:compiler/llvm-project.git ab758ba128c46ba30cad058b89991852f7be5543)"}
  !2 = !{!3, !3, i64 0}
  !3 = !{!"short", !4, i64 0}
  !4 = !{!"omnipotent char", !5, i64 0}
  !5 = !{!"Simple C++ TBAA"}

Without this patch, we get the assembly below:

  1   .text
    2   .abiversion 2
    3   .file "HashXMLCh.cpp"
    4   .globl  _ZN11xercesc_2_79HashXMLCh6equalsEPKvS2_ # -- Begin function _ZN11xercesc_2_79HashXMLCh6equalsEPKvS2_
    5   .p2align  4
    6   .type _ZN11xercesc_2_79HashXMLCh6equalsEPKvS2_, @function
    7 _ZN11xercesc_2_79HashXMLCh6equalsEPKvS2_: # @_ZN11xercesc_2_79HashXMLCh6equalsEPKvS2_
    8 .Lfunc_begin0:
    9 # %bb.0:                                # %entry
   10   cmpdi 1, 4, 0
   11   cmpdi 5, 0
   12   cror 20, 6, 2
   13   bc 4, 20, .LBB0_6
   14 # %bb.1:                                # %if.then.i
   15   bc 12, 6, .LBB0_3
   16 # %bb.2:                                # %land.lhs.true.i
   17   lhz 4, 0(4)
   18   li 3, 0
   19   cmplwi 1, 4, 0
   20   bnelr 1
   21 .LBB0_3:                                # %lor.lhs.false3.i
   22   bc 12, 2, .LBB0_5
   23 # %bb.4:                                # %land.lhs.true5.i
   24   lhz 4, 0(5)
   25   li 3, 0
   26   cmplwi  4, 0
   27   bnelr 0
   28 .LBB0_5:                                # %if.else.i
   29   b .LBB0_10
   30 .LBB0_6:                                # %while.cond.preheader.i
   31   lhz 8, 0(4)
   32   lhz 6, 0(5)
   33   li 3, 0
   34   cmplw 8, 6
   35   bnelr 0
   36 # %bb.7:                                # %while.body.i.preheader
   37   addi 6, 5, 2
   38   addi 7, 4, 2
   39   .p2align  4
   40 .LBB0_8:                                # %while.body.i
   41                                         # =>This Inner Loop Header: Depth=1
   42   andi. 8, 8, 65535
   43   beq 0, .LBB0_10
   44 # %bb.9:                                # %if.end12.i
   45                                         #   in Loop: Header=BB0_8 Depth=1
   46   addi 5, 5, 2
   47   addi 4, 4, 2
   48   lhz 8, 0(4)
   49   lhz 9, 0(5)
   50   addi 6, 6, 2
   51   addi 7, 7, 2
   52   cmplw 8, 9
   53   beq 0, .LBB0_8
   54   blr
   55 .LBB0_10:
   56   li 3, 1
   57   blr
   58   .long 0
   59   .quad 0
   60 .Lfunc_end0:
   61   .size _ZN11xercesc_2_79HashXMLCh6equalsEPKvS2_, .Lfunc_end0-.Lfunc_begin0
   62                                         # -- End function
   64   .ident  "clang version 9.0.0 (git at github.ibm.com:compiler/llvm-project.git ab758ba128c46ba30cad058b89991852f7be5543)"
   65   .section  ".note.GNU-stack","",@progbits

We can see that line 29, `b .LBB0_10`, is an unconditional branch to `.LBB0_10`.
Below is the `.LBB0_10` block:

  55 .LBB0_10:
  56     li 3, 1
  57   blr

This patch will optimize line 29 to:

  29   li 3, 1
   30   blr

Line 29, `b .LBB0_10`, is created after running the `branch-folder` pass. The `branch-folder` pass runs after the `tail duplication` pass.

Improving conditional branches to early-ret, such as line 43 `beq 0, .LBB0_10`, will be my next patch. This patch only improves the unconditional branch.



