[llvm] 129c911 - Include static prof data when collecting loop BBs

Bill Wendling via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 19 11:34:03 PST 2020


Author: Bill Wendling
Date: 2020-02-19T11:33:48-08:00
New Revision: 129c911efaa492790c251b3eb18e4db36b55cbc5

URL: https://github.com/llvm/llvm-project/commit/129c911efaa492790c251b3eb18e4db36b55cbc5
DIFF: https://github.com/llvm/llvm-project/commit/129c911efaa492790c251b3eb18e4db36b55cbc5.diff

LOG: Include static prof data when collecting loop BBs

Summary:
If the programmer adds static profile data to a branch---i.e. uses
"__builtin_expect()" or similar---then we should honor it. Otherwise,
"__builtin_expect()" is ignored in crucial situations. So we trust that
the programmer knows what they're doing until proven wrong.

Subscribers: hiraditya, JDevlieghere, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D74809

Added: 
    llvm/test/CodeGen/X86/block-placement-2.ll

Modified: 
    llvm/include/llvm/CodeGen/MachineLoopInfo.h
    llvm/lib/CodeGen/MachineBlockPlacement.cpp
    llvm/lib/CodeGen/MachineLoopInfo.cpp
    llvm/test/CodeGen/Hexagon/prof-early-if.ll
    llvm/test/CodeGen/X86/block-placement.ll
    llvm/test/CodeGen/X86/move_latch_to_loop_top.ll
    llvm/test/CodeGen/X86/ragreedy-bug.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/MachineLoopInfo.h b/llvm/include/llvm/CodeGen/MachineLoopInfo.h
index 8a93f91ae54d..7d2273827b3f 100644
--- a/llvm/include/llvm/CodeGen/MachineLoopInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineLoopInfo.h
@@ -67,6 +67,10 @@ class MachineLoop : public LoopBase<MachineBasicBlock, MachineLoop> {
   /// it returns an unknown location.
   DebugLoc getStartLoc() const;
 
+  /// Returns true if a machine loop has blocks that have static profiling
+  /// information---e.g. from '__builtin_expect()'.
+  bool hasStaticProfInfo() const;
+
   void dump() const;
 
 private:

diff  --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 31491513bd96..5ce43906dea1 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -2506,9 +2506,14 @@ MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
   // its frequency and the frequency of the loop block. When it is too small,
   // don't add it to the loop chain. If there are outer loops, then this block
   // will be merged into the first outer loop chain for which this block is not
-  // cold anymore. This needs precise profile data and we only do this when
-  // profile data is available.
-  if (F->getFunction().hasProfileData() || ForceLoopColdBlock) {
+  // cold anymore.
+  //
+  // If a block uses static profiling data (e.g. from '__builtin_expect()'),
+  // then the programmer is explicitly telling us which paths are hot and cold.
+  // There's no reason for the compiler to believe otherwise, unless
+  // '-fprofile-use' is specified.
+  if (F->getFunction().hasProfileData() || ForceLoopColdBlock ||
+      L.hasStaticProfInfo()) {
     BlockFrequency LoopFreq(0);
     for (auto LoopPred : L.getHeader()->predecessors())
       if (!L.contains(LoopPred))

diff  --git a/llvm/lib/CodeGen/MachineLoopInfo.cpp b/llvm/lib/CodeGen/MachineLoopInfo.cpp
index 0c1439da9b29..67916fbe722e 100644
--- a/llvm/lib/CodeGen/MachineLoopInfo.cpp
+++ b/llvm/lib/CodeGen/MachineLoopInfo.cpp
@@ -111,6 +111,13 @@ DebugLoc MachineLoop::getStartLoc() const {
   return DebugLoc();
 }
 
+bool MachineLoop::hasStaticProfInfo() const {
+  return llvm::any_of(blocks(), [](const MachineBasicBlock *MBB){
+    const BasicBlock *BB = MBB->getBasicBlock();
+    return BB && BB->getTerminator()->hasMetadata(LLVMContext::MD_prof);
+  });
+}
+
 MachineBasicBlock *
 MachineLoopInfo::findLoopPreheader(MachineLoop *L,
                                    bool SpeculativePreheader) const {

diff  --git a/llvm/test/CodeGen/Hexagon/prof-early-if.ll b/llvm/test/CodeGen/Hexagon/prof-early-if.ll
index b0f21110b7de..a5215a9b351f 100644
--- a/llvm/test/CodeGen/Hexagon/prof-early-if.ll
+++ b/llvm/test/CodeGen/Hexagon/prof-early-if.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -O2 -march=hexagon < %s | FileCheck %s
 ; Rely on the comments generated by llc. Check that "if.then" was not predicated.
-; CHECK: b5
 ; CHECK: b2
 ; CHECK-NOT: if{{.*}}memd
+; CHECK: b5
 
 %s.0 = type { [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [3 x i32], [24 x i32], [8 x %s.1], [5 x i32] }
 %s.1 = type { i32, i32 }

diff  --git a/llvm/test/CodeGen/X86/block-placement-2.ll b/llvm/test/CodeGen/X86/block-placement-2.ll
new file mode 100644
index 000000000000..2006dec266d5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/block-placement-2.ll
@@ -0,0 +1,162 @@
+
+; RUN: llc -mtriple=i686-linux -pre-RA-sched=source < %s | FileCheck %s
+; RUN: opt -disable-output -debugify < %s
+
+; This was derived from the Linux kernel. The __builtin_expect was ignored
+; which pushed the hot block "if.else" out of the critical path choosing
+; instead the cold block "if.then23". The cold block should be moved towards
+; the bottom.
+
+; CHECK-LABEL: test1:
+; CHECK:       %for.inc
+; CHECK:       %if.end18
+; CHECK:       %if.else
+; CHECK:       %if.end.i.i
+; CHECK:       %if.end8.i.i
+; CHECK:       %if.then23
+; CHECK:       ret
+
+%struct.hlist_bl_node = type { %struct.hlist_bl_node*, %struct.hlist_bl_node** }
+%struct.dentry = type { i32, %struct.inode, %struct.hlist_bl_node, %struct.dentry*, %struct.inode, %struct.inode*, [32 x i8], %struct.inode, %struct.dentry_operations* }
+%struct.inode = type { i32 }
+%struct.dentry_operations = type { i32 (%struct.dentry*, i32)*, i32 (%struct.dentry*, i32)*, i32 (%struct.dentry*, %struct.inode*)*, i32 (%struct.dentry*, i32, i8*)* }
+%struct.anon.2 = type { i32, i32 }
+
+define %struct.dentry* @test1(%struct.dentry* readonly %parent, i8* %name, i32* nocapture %seqp, i64 %param1) {
+entry:
+  %tobool135 = icmp eq i64 %param1, 0
+  br i1 %tobool135, label %cleanup63, label %do.body4.lr.ph
+
+do.body4.lr.ph:                                   ; preds = %entry
+  %d_op = getelementptr inbounds %struct.dentry, %struct.dentry* %parent, i64 0, i32 8
+  %shr = lshr i64 %param1, 32
+  %conv49 = trunc i64 %shr to i32
+  br label %do.body4
+
+do.body4:                                         ; preds = %for.inc, %do.body4.lr.ph
+  %node.0.in136 = phi i64 [ %param1, %do.body4.lr.ph ], [ %tmp35, %for.inc ]
+  %node.0 = inttoptr i64 %node.0.in136 to %struct.hlist_bl_node*
+  %add.ptr = getelementptr %struct.hlist_bl_node, %struct.hlist_bl_node* %node.0, i64 -1, i32 1
+  %tmp6 = bitcast %struct.hlist_bl_node*** %add.ptr to %struct.dentry*
+  %tmp7 = getelementptr inbounds %struct.dentry, %struct.dentry* %tmp6, i64 0, i32 1, i32 0
+  %tmp8 = load volatile i32, i32* %tmp7, align 4
+  call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"()
+  %d_parent = getelementptr inbounds %struct.hlist_bl_node**, %struct.hlist_bl_node*** %add.ptr, i64 3
+  %tmp9 = bitcast %struct.hlist_bl_node*** %d_parent to %struct.dentry**
+  %tmp10 = load %struct.dentry*, %struct.dentry** %tmp9, align 8
+  %cmp133 = icmp eq %struct.dentry* %tmp10, %parent
+  br i1 %cmp133, label %if.end14.lr.ph, label %for.inc
+
+if.end14.lr.ph:                                   ; preds = %do.body4
+  %tmp11 = getelementptr inbounds %struct.hlist_bl_node**, %struct.hlist_bl_node*** %add.ptr, i64 2
+  %d_name43 = getelementptr inbounds %struct.hlist_bl_node**, %struct.hlist_bl_node*** %add.ptr, i64 4
+  %hash = bitcast %struct.hlist_bl_node*** %d_name43 to i32*
+  %tmp12 = bitcast %struct.hlist_bl_node*** %d_name43 to %struct.anon.2*
+  %len = getelementptr inbounds %struct.anon.2, %struct.anon.2* %tmp12, i64 0, i32 1
+  %name31 = getelementptr inbounds %struct.hlist_bl_node**, %struct.hlist_bl_node*** %add.ptr, i64 5
+  %tmp13 = bitcast %struct.hlist_bl_node*** %name31 to i8**
+  br label %if.end14
+
+if.end14:                                         ; preds = %cleanup, %if.end14.lr.ph
+  %and.i100134.in = phi i32 [ %tmp8, %if.end14.lr.ph ], [ undef, %cleanup ]
+  %and.i100134 = and i32 %and.i100134.in, -2
+  %tmp14 = load %struct.hlist_bl_node**, %struct.hlist_bl_node*** %tmp11, align 8
+  %tobool.i.i = icmp eq %struct.hlist_bl_node** %tmp14, null
+  br i1 %tobool.i.i, label %for.inc, label %if.end18
+
+if.end18:                                         ; preds = %if.end14
+  %tmp15 = load i32, i32* %seqp, align 8
+  %tmp16 = and i32 %tmp15, 2
+  %tobool22 = icmp eq i32 %tmp16, 0
+  br i1 %tobool22, label %if.else, label %if.then23, !prof !0, !misexpect !1
+
+if.then23:                                        ; preds = %if.end18
+  %tmp17 = load i32, i32* %hash, align 8
+  %cmp25 = icmp eq i32 %tmp17, 42
+  br i1 %cmp25, label %if.end28, label %for.inc
+
+if.end28:                                         ; preds = %if.then23
+  %tmp18 = load i32, i32* %len, align 4
+  %tmp19 = load i8*, i8** %tmp13, align 8
+  call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"()
+  %tmp20 = load i32, i32* %tmp7, align 4
+  %cmp.i.i101 = icmp eq i32 %tmp20, %and.i100134
+  br i1 %cmp.i.i101, label %if.end36, label %cleanup
+
+if.end36:                                         ; preds = %if.end28
+  %tmp21 = load %struct.dentry_operations*, %struct.dentry_operations** %d_op, align 8
+  %d_compare = getelementptr inbounds %struct.dentry_operations, %struct.dentry_operations* %tmp21, i64 0, i32 3
+  %tmp22 = load i32 (%struct.dentry*, i32, i8*)*, i32 (%struct.dentry*, i32, i8*)** %d_compare, align 8
+  %call37 = call i32 %tmp22(%struct.dentry* %tmp6, i32 %tmp18, i8* %name)
+  %cmp38 = icmp eq i32 %call37, 0
+  br i1 %cmp38, label %cleanup56, label %for.inc
+
+cleanup:                                          ; preds = %if.end28
+  %tmp24 = load %struct.dentry*, %struct.dentry** %tmp9, align 8
+  %cmp = icmp eq %struct.dentry* null, %parent
+  br i1 %cmp, label %if.end14, label %for.inc
+
+if.else:                                          ; preds = %if.end18
+  %hash_len44 = bitcast %struct.hlist_bl_node*** %d_name43 to i64*
+  %tmp25 = load i64, i64* %hash_len44, align 8
+  %cmp45 = icmp eq i64 %tmp25, %param1
+  br i1 %cmp45, label %if.end48, label %for.inc
+
+if.end48:                                         ; preds = %if.else
+  %tmp26 = bitcast %struct.hlist_bl_node*** %name31 to i64*
+  %tmp27 = load volatile i64, i64* %tmp26, align 8
+  %tmp28 = inttoptr i64 %tmp27 to i8*
+  br label %for.cond.i.i
+
+for.cond.i.i:                                     ; preds = %if.end8.i.i, %if.end48
+  %tcount.addr.0.i.i = phi i32 [ %conv49, %if.end48 ], [ %sub.i.i, %if.end8.i.i ]
+  %ct.addr.0.i.i = phi i8* [ %name, %if.end48 ], [ %add.ptr9.i.i, %if.end8.i.i ]
+  %cs.addr.0.i.i = phi i8* [ %tmp28, %if.end48 ], [ %add.ptr.i.i, %if.end8.i.i ]
+  %tmp29 = bitcast i8* %cs.addr.0.i.i to i64*
+  %tmp30 = load i64, i64* %tmp29, align 8
+  %tmp31 = bitcast i8* %ct.addr.0.i.i to i64*
+  %tmp32 = call { i64, i64 } asm "1:\09mov $2,$0\0A2:\0A.section .fixup,\22ax\22\0A3:\09lea $2,$1\0A\09and $3,$1\0A\09mov ($1),$0\0A\09leal $2,%ecx\0A\09andl $4,%ecx\0A\09shll $$3,%ecx\0A\09shr %cl,$0\0A\09jmp 2b\0A.previous\0A .pushsection \22__ex_table\22,\22a\22\0A .balign 4\0A .long (1b) - .\0A .long (3b) - .\0A .long (ex_handler_default) - .\0A .popsection\0A", "=&r,=&{cx},*m,i,i,~{dirflag},~{fpsr},~{flags}"(i64* %tmp31, i64 -8, i64 7)
+  %cmp.i.i = icmp ult i32 %tcount.addr.0.i.i, 8
+  %asmresult.i.le.i.le.i.le = extractvalue { i64, i64 } %tmp32, 0
+  br i1 %cmp.i.i, label %dentry_cmp.exit, label %if.end.i.i
+
+if.end.i.i:                                       ; preds = %for.cond.i.i
+  %cmp3.i.i = icmp eq i64 %tmp30, %asmresult.i.le.i.le.i.le
+  br i1 %cmp3.i.i, label %if.end8.i.i, label %for.inc, !prof !0, !misexpect !1
+
+if.end8.i.i:                                      ; preds = %if.end.i.i
+  %add.ptr.i.i = getelementptr i8, i8* %cs.addr.0.i.i, i64 8
+  %add.ptr9.i.i = getelementptr i8, i8* %ct.addr.0.i.i, i64 8
+  %sub.i.i = add i32 %tcount.addr.0.i.i, -8
+  %tobool12.i.i = icmp eq i32 %sub.i.i, 0
+  br i1 %tobool12.i.i, label %cleanup56, label %for.cond.i.i
+
+dentry_cmp.exit:                                  ; preds = %for.cond.i.i
+  %asmresult.i.le.i.le.i.le.le = extractvalue { i64, i64 } %tmp32, 0
+  %mul.i.i = shl nuw nsw i32 %tcount.addr.0.i.i, 3
+  %sh_prom.i.i = zext i32 %mul.i.i to i64
+  %shl.i.i = shl nsw i64 -1, %sh_prom.i.i
+  %neg.i.i = xor i64 %shl.i.i, -1
+  %xor.i.i = xor i64 %asmresult.i.le.i.le.i.le.le, %tmp30
+  %and.i.i = and i64 %xor.i.i, %neg.i.i
+  %tobool15.i.i = icmp eq i64 %and.i.i, 0
+  br i1 %tobool15.i.i, label %cleanup56, label %for.inc
+
+cleanup56:                                        ; preds = %dentry_cmp.exit, %if.end8.i.i, %if.end36
+  %tmp33 = bitcast %struct.hlist_bl_node*** %add.ptr to %struct.dentry*
+  store i32 %and.i100134, i32* %seqp, align 4
+  br label %cleanup63
+
+for.inc:                                          ; preds = %dentry_cmp.exit, %if.end.i.i, %if.else, %cleanup, %if.end36, %if.then23, %if.end14, %do.body4
+  %tmp34 = inttoptr i64 %node.0.in136 to i64*
+  %tmp35 = load volatile i64, i64* %tmp34, align 8
+  %tobool = icmp eq i64 %tmp35, 0
+  br i1 %tobool, label %cleanup63, label %do.body4
+
+cleanup63:                                        ; preds = %for.inc, %cleanup56, %entry
+  %retval.2 = phi %struct.dentry* [ %tmp33, %cleanup56 ], [ null, %entry ], [ null, %for.inc ]
+  ret %struct.dentry* %retval.2
+}
+
+!0 = !{!"branch_weights", i32 2000, i32 1}
+!1 = !{!"misexpect", i64 1, i64 2000, i64 1}

diff  --git a/llvm/test/CodeGen/X86/block-placement.ll b/llvm/test/CodeGen/X86/block-placement.ll
index 258cc2031ae8..6c93bcbdff2a 100644
--- a/llvm/test/CodeGen/X86/block-placement.ll
+++ b/llvm/test/CodeGen/X86/block-placement.ll
@@ -1502,9 +1502,9 @@ define i32 @not_rotate_if_extra_branch(i32 %count) {
 ; CHECK: %.header
 ; CHECK: %.middle
 ; CHECK: %.backedge
-; CHECK: %.slow
 ; CHECK: %.bailout
 ; CHECK: %.stop
+; CHECK: %.slow
 .entry:
   %sum.0 = shl nsw i32 %count, 1
   br label %.header

diff  --git a/llvm/test/CodeGen/X86/move_latch_to_loop_top.ll b/llvm/test/CodeGen/X86/move_latch_to_loop_top.ll
index d86ec9c8129d..0141593ce0f1 100644
--- a/llvm/test/CodeGen/X86/move_latch_to_loop_top.ll
+++ b/llvm/test/CodeGen/X86/move_latch_to_loop_top.ll
@@ -173,8 +173,8 @@ exit:
 ;CHECK:       %header
 ;CHECK:       %true
 ;CHECK:       %latch
-;CHECK:       %false
 ;CHECK:       %exit
+;CHECK:       %false
 define i32 @test4(i32 %t, i32* %p) {
 entry:
   br label %header

diff  --git a/llvm/test/CodeGen/X86/ragreedy-bug.ll b/llvm/test/CodeGen/X86/ragreedy-bug.ll
index 7a7c98fba4eb..7a82459db00f 100644
--- a/llvm/test/CodeGen/X86/ragreedy-bug.ll
+++ b/llvm/test/CodeGen/X86/ragreedy-bug.ll
@@ -10,6 +10,8 @@
 ; Mem-move
 ; CHECK-NEXT: movl
 ; CHECK-NEXT: andl
+; CHECK-NEXT: LBB0
+; CHECK-NEXT: in Loop
 ; CHECK-NEXT: testl
 ; CHECK-NEXT: jne
 ; CHECK: cond.true.i.i217
@@ -17,20 +19,20 @@
 ; Mem-move
 ; CHECK-NEXT: movl
 ; CHECK-NEXT: andl
+; CHECK-NEXT: LBB0
+; CHECK-NEXT: in Loop
 ; CHECK-NEXT: testl
 ; CHECK-NEXT: je
 ; CHECK: cond.false.i.i
 ; CHECK: maskrune
 ; CHECK-NEXT: movzbl
 ; CHECK-NEXT: movzbl
-; CHECK-NEXT: testl
-; CHECK-NEXT: je
+; CHECK-NEXT: jmp
 ; CHECK: cond.false.i.i219
 ; CHECK: maskrune
 ; CHECK-NEXT: movzbl
 ; CHECK-NEXT: movzbl
-; CHECK-NEXT: testl
-; CHECK-NEXT: jne
+; CHECK-NEXT: jmp
 
 %struct.List_o_links_struct = type { i32, i32, i32, %struct.List_o_links_struct* }
 %struct.Connector_struct = type { i16, i16, i8, i8, %struct.Connector_struct*, i8* }


        


More information about the llvm-commits mailing list