[llvm] r263969 - AMDGPU/SI: Fix threshold calculation for branching when exec is zero

Tom Stellard via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 21 11:57:00 PDT 2016


Author: tstellar
Date: Mon Mar 21 13:56:58 2016
New Revision: 263969

URL: http://llvm.org/viewvc/llvm-project?rev=263969&view=rev
Log:
AMDGPU/SI: Fix threshold calculation for branching when exec is zero

Summary:
When control flow is implemented using the exec mask, the compiler will
insert branch instructions to skip over the masked section when exec is
zero, provided the section contains more than a certain number of
instructions.

The previous code would only count instructions in successor blocks,
and this patch modifies the code to start counting instructions in all
blocks between the start and end of the branch.

Reviewers: nhaehnle, arsenm

Subscribers: arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D18282

Modified:
    llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
    llvm/trunk/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll

Modified: llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp?rev=263969&r1=263968&r2=263969&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp Mon Mar 21 13:56:58 2016
@@ -130,10 +130,12 @@ bool SILowerControlFlow::shouldSkip(Mach
 
   unsigned NumInstr = 0;
 
-  for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty();
-       MBB = *MBB->succ_begin()) {
+  for (MachineFunction::iterator MBBI = MachineFunction::iterator(From),
+                                 ToI = MachineFunction::iterator(To); MBBI != ToI; ++MBBI) {
 
-    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+    MachineBasicBlock &MBB = *MBBI;
+
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
          NumInstr < SkipThreshold && I != E; ++I) {
 
       if (I->isBundle() || !I->isBundled()) {

Modified: llvm/trunk/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll?rev=263969&r1=263968&r2=263969&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll Mon Mar 21 13:56:58 2016
@@ -24,5 +24,39 @@ out:
   ret void
 }
 
+;CHECK-LABEL: {{^}}test2:
+;CHECK: s_and_saveexec_b64
+;CHECK: s_xor_b64
+;CHECK-NEXT: s_cbranch_execz
+define void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+main_body:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %cc = icmp eq i32 %tid, 0
+  br i1 %cc, label %done1, label %if
+
+if:
+  %cmp = icmp eq i32 %a, 0
+  br i1 %cmp, label %done0, label %loop_body
+
+loop_body:
+  %counter = phi i32 [ 0, %if ], [0, %done0], [ %incr, %loop_body ]
+
+  ; Prevent the loop from being optimized out
+  call void asm sideeffect "", "" ()
+
+  %incr = add i32 %counter, 1
+  %lc = icmp sge i32 %incr, 1000
+  br i1 %lc, label %done1, label %loop_body
+
+done0:
+  %cmp0 = icmp eq i32 %b, 0
+  br i1 %cmp0, label %done1, label %loop_body
+
+done1:
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
 attributes #0 = { "ShaderType"="0" }
 attributes #1 = { nounwind readonly }




More information about the llvm-commits mailing list