[llvm] 790c75c - [AMDGPU] Add SI_EARLY_TERMINATE_SCC0 for early terminating shader

Carl Ritson via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 12 20:29:41 PST 2021


Author: Carl Ritson
Date: 2021-01-13T13:29:05+09:00
New Revision: 790c75c16373d37846c8433a69efd9b0d5e4ad12

URL: https://github.com/llvm/llvm-project/commit/790c75c16373d37846c8433a69efd9b0d5e4ad12
DIFF: https://github.com/llvm/llvm-project/commit/790c75c16373d37846c8433a69efd9b0d5e4ad12.diff

LOG: [AMDGPU] Add SI_EARLY_TERMINATE_SCC0 for early terminating shader

Add a pseudo instruction that allows early termination of a pixel shader
anywhere, based on the value of SCC.  The intention is to use this
when a mask of live lanes is updated, e.g. the live-lane mask in the
WQM pass.  This facilitates early termination of shaders even when
EXEC is incomplete, e.g. in non-uniform control flow.

Reviewed By: foad

Differential Revision: https://reviews.llvm.org/D88777
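
For context, a rough sketch of how a producing pass (e.g. the WQM pass
mentioned above) might emit the new pseudo after updating its live-lane
mask.  This is illustrative only and not part of this change; LiveMaskReg,
KillMask and the insertion point are assumed, and the usual AMDGPU target
includes (SIInstrInfo.h etc.) are taken as given:

  // Rough sketch only, not part of this commit.
  void emitEarlyTerminate(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator Before,
                          const DebugLoc &DL, const SIInstrInfo *TII,
                          Register LiveMaskReg, Register KillMask) {
    // Clear the newly killed lanes from the live mask.  SALU logical ops
    // set SCC to 0 when the result is zero, i.e. when no lanes remain.
    BuildMI(MBB, Before, DL, TII->get(AMDGPU::S_ANDN2_B64), LiveMaskReg)
        .addReg(LiveMaskReg)
        .addReg(KillMask);
    // SIInsertSkips expands this into an S_CBRANCH_SCC0 to an early-exit
    // block (null export + s_endpgm for pixel shaders).
    BuildMI(MBB, Before, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
  }

The MIR test added below (early-term.mir) shows the corresponding
expansion produced by the si-insert-skips pass.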

Added: 
    llvm/test/CodeGen/AMDGPU/early-term.mir

Modified: 
    llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
    llvm/lib/Target/AMDGPU/SIInstructions.td

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
index eb2e12f2dcda..e80325bddc43 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -49,6 +49,7 @@ class SIInsertSkips : public MachineFunctionPass {
                   DebugLoc DL);
 
   bool kill(MachineInstr &MI);
+  void earlyTerm(MachineInstr &MI);
 
   bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
 
@@ -145,19 +146,22 @@ bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) {
   return true;
 }
 
-static void generatePsEndPgm(MachineBasicBlock &MBB,
-                             MachineBasicBlock::iterator I, DebugLoc DL,
-                             const SIInstrInfo *TII) {
-  // Generate "null export; s_endpgm".
-  BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
-      .addImm(AMDGPU::Exp::ET_NULL)
-      .addReg(AMDGPU::VGPR0, RegState::Undef)
-      .addReg(AMDGPU::VGPR0, RegState::Undef)
-      .addReg(AMDGPU::VGPR0, RegState::Undef)
-      .addReg(AMDGPU::VGPR0, RegState::Undef)
-      .addImm(1)  // vm
-      .addImm(0)  // compr
-      .addImm(0); // en
+static void generateEndPgm(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator I, DebugLoc DL,
+                           const SIInstrInfo *TII, bool IsPS) {
+  // "null export"
+  if (IsPS) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
+        .addImm(AMDGPU::Exp::ET_NULL)
+        .addReg(AMDGPU::VGPR0, RegState::Undef)
+        .addReg(AMDGPU::VGPR0, RegState::Undef)
+        .addReg(AMDGPU::VGPR0, RegState::Undef)
+        .addReg(AMDGPU::VGPR0, RegState::Undef)
+        .addImm(1)  // vm
+        .addImm(0)  // compr
+        .addImm(0); // en
+  }
+  // s_endpgm
   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
 }
 
@@ -169,7 +173,9 @@ void SIInsertSkips::ensureEarlyExitBlock(MachineBasicBlock &MBB,
   if (!EarlyExitBlock) {
     EarlyExitBlock = MF->CreateMachineBasicBlock();
     MF->insert(MF->end(), EarlyExitBlock);
-    generatePsEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII);
+    generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII,
+                   MF->getFunction().getCallingConv() ==
+                       CallingConv::AMDGPU_PS);
     EarlyExitClearsExec = false;
   }
 
@@ -178,7 +184,6 @@ void SIInsertSkips::ensureEarlyExitBlock(MachineBasicBlock &MBB,
     unsigned Mov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     Register Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
     auto ExitI = EarlyExitBlock->getFirstNonPHI();
-    assert(ExitI->getOpcode() == AMDGPU::EXP_DONE);
     BuildMI(*EarlyExitBlock, ExitI, DL, TII->get(Mov), Exec).addImm(0);
     EarlyExitClearsExec = true;
   }
@@ -224,7 +229,7 @@ void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
       I == MBB.end() && !llvm::is_contained(MBB.successors(), &*NextBBI);
 
   if (NoSuccessor) {
-    generatePsEndPgm(MBB, I, DL, TII);
+    generateEndPgm(MBB, I, DL, TII, true);
   } else {
     ensureEarlyExitBlock(MBB, false);
 
@@ -368,6 +373,23 @@ bool SIInsertSkips::kill(MachineInstr &MI) {
   }
 }
 
+void SIInsertSkips::earlyTerm(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  const DebugLoc DL = MI.getDebugLoc();
+
+  ensureEarlyExitBlock(MBB, true);
+
+  auto BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0))
+                      .addMBB(EarlyExitBlock);
+  auto Next = std::next(MI.getIterator());
+
+  if (Next != MBB.end() && !Next->isTerminator())
+    splitBlock(MBB, *BranchMI, MDT);
+
+  MBB.addSuccessor(EarlyExitBlock);
+  MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
+}
+
 // Returns true if a branch over the block was inserted.
 bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
                                    MachineBasicBlock &SrcMBB) {
@@ -393,6 +415,7 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
   SkipThreshold = SkipThresholdFlag;
 
   SmallVector<MachineInstr *, 4> KillInstrs;
+  SmallVector<MachineInstr *, 4> EarlyTermInstrs;
   bool MadeChange = false;
 
   for (MachineBasicBlock &MBB : MF) {
@@ -451,18 +474,29 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
         }
         break;
 
+      case AMDGPU::SI_EARLY_TERMINATE_SCC0:
+        EarlyTermInstrs.push_back(&MI);
+        break;
+
       default:
         break;
       }
     }
   }
 
+  for (MachineInstr *Instr : EarlyTermInstrs) {
+    // Early termination in GS does nothing
+    if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS)
+      earlyTerm(*Instr);
+    Instr->eraseFromParent();
+  }
   for (MachineInstr *Kill : KillInstrs) {
     skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()),
                Kill->getDebugLoc());
     Kill->eraseFromParent();
   }
   KillInstrs.clear();
+  EarlyTermInstrs.clear();
   EarlyExitBlock = nullptr;
 
   return MadeChange;

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 82b1039002fe..43bd80c62d80 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -321,6 +321,14 @@ def SI_IF_BREAK : CFPseudoInstSI <
   let isReMaterializable = 1;
 }
 
+// Branch to the early termination block of the shader if SCC is 0.
+// This uses SCC from a previous SALU operation, i.e. the update of
+// a mask of live lanes after a kill/demote operation.
+// Only valid in pixel shaders.
+def SI_EARLY_TERMINATE_SCC0 : SPseudoInstSI <(outs), (ins)> {
+  let Uses = [EXEC,SCC];
+}
+
 let Uses = [EXEC] in {
 
 multiclass PseudoInstKill <dag ins> {

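As the comment on SI_EARLY_TERMINATE_SCC0 above notes, the pseudo consumes
SCC produced by a preceding SALU update of the live-lane mask, so nothing
between that update and the pseudo may clobber SCC.  A small hypothetical
helper that checks this invariant within a single block (illustrative only,
not part of this change) might look like:

  // Illustrative sketch only: verify that SCC survives from the live-mask
  // update to the SI_EARLY_TERMINATE_SCC0 that reads it.  Both instructions
  // are assumed to be in the same basic block.
  static bool sccReachesEarlyTerminate(const MachineInstr &MaskUpdate,
                                       const MachineInstr &EarlyTerm) {
    for (auto I = std::next(MaskUpdate.getIterator()),
              E = EarlyTerm.getIterator();
         I != E; ++I)
      if (I->definesRegister(AMDGPU::SCC))
        return false; // SCC clobbered before the early-terminate pseudo.
    return true;
  }
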
diff --git a/llvm/test/CodeGen/AMDGPU/early-term.mir b/llvm/test/CodeGen/AMDGPU/early-term.mir
new file mode 100644
index 000000000000..361a4ebefcc6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/early-term.mir
@@ -0,0 +1,268 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-insert-skips -verify-machineinstrs  %s -o - | FileCheck %s
+
+--- |
+  define amdgpu_ps void @early_term_scc0_end_block() {
+    ret void
+  }
+
+  define amdgpu_ps void @early_term_scc0_next_terminator() {
+    ret void
+  }
+
+  define amdgpu_ps void @early_term_scc0_in_block() {
+    ret void
+  }
+
+  define amdgpu_ps void @early_term_scc0_with_kill() {
+    ret void
+  }
+
+  define amdgpu_gs void @early_term_scc0_gs() {
+    ret void
+  }
+
+  define amdgpu_cs void @early_term_scc0_cs() {
+    ret void
+  }
+...
+
+---
+name: early_term_scc0_end_block
+tracksRegLiveness: true
+liveins:
+  - { reg: '$sgpr0' }
+  - { reg: '$sgpr1' }
+body: |
+  ; CHECK-LABEL: name: early_term_scc0_end_block
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.1(0x80000000), %bb.2(0x00000000)
+  ; CHECK:   liveins: $sgpr0, $sgpr1
+  ; CHECK:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK:   dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc
+  ; CHECK:   S_CBRANCH_SCC0 %bb.2, implicit $scc
+  ; CHECK: bb.1:
+  ; CHECK:   liveins: $vgpr0
+  ; CHECK:   EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+  ; CHECK:   S_ENDPGM 0
+  ; CHECK: bb.2:
+  ; CHECK:   $exec_lo = S_MOV_B32 0
+  ; CHECK:   EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec
+  ; CHECK:   S_ENDPGM 0
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+    successors: %bb.1
+
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc
+    SI_EARLY_TERMINATE_SCC0 implicit $scc, implicit $exec
+
+  bb.1:
+    liveins: $vgpr0
+    EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name: early_term_scc0_next_terminator
+tracksRegLiveness: true
+liveins:
+  - { reg: '$sgpr0' }
+  - { reg: '$sgpr1' }
+body: |
+  ; CHECK-LABEL: name: early_term_scc0_next_terminator
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.2(0x80000000), %bb.3(0x00000000)
+  ; CHECK:   liveins: $sgpr0, $sgpr1
+  ; CHECK:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK:   dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc
+  ; CHECK:   S_CBRANCH_SCC0 %bb.3, implicit $scc
+  ; CHECK:   S_BRANCH %bb.2
+  ; CHECK: bb.1:
+  ; CHECK:   successors: %bb.2(0x80000000)
+  ; CHECK:   $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+  ; CHECK: bb.2:
+  ; CHECK:   liveins: $vgpr0
+  ; CHECK:   EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+  ; CHECK:   S_ENDPGM 0
+  ; CHECK: bb.3:
+  ; CHECK:   $exec_lo = S_MOV_B32 0
+  ; CHECK:   EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec
+  ; CHECK:   S_ENDPGM 0
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+    successors: %bb.2
+
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc
+    SI_EARLY_TERMINATE_SCC0 implicit $scc, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.1:
+    successors: %bb.2
+    $vgpr0 = V_MOV_B32_e32 1, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.2:
+    liveins: $vgpr0
+    EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name: early_term_scc0_in_block
+tracksRegLiveness: true
+liveins:
+  - { reg: '$sgpr0' }
+  - { reg: '$sgpr1' }
+body: |
+  ; CHECK-LABEL: name: early_term_scc0_in_block
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; CHECK:   liveins: $sgpr0, $sgpr1
+  ; CHECK:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK:   dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc
+  ; CHECK:   S_CBRANCH_SCC0 %bb.2, implicit $scc
+  ; CHECK: bb.3:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   liveins: $vgpr0, $scc
+  ; CHECK:   $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+  ; CHECK: bb.1:
+  ; CHECK:   liveins: $vgpr0, $vgpr1
+  ; CHECK:   EXP 1, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
+  ; CHECK:   EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+  ; CHECK:   S_ENDPGM 0
+  ; CHECK: bb.2:
+  ; CHECK:   $exec_lo = S_MOV_B32 0
+  ; CHECK:   EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec
+  ; CHECK:   S_ENDPGM 0
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+    successors: %bb.1
+
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc
+    SI_EARLY_TERMINATE_SCC0 implicit $scc, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+
+  bb.1:
+    liveins: $vgpr0, $vgpr1
+    EXP 1, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
+    EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name: early_term_scc0_with_kill
+tracksRegLiveness: true
+liveins:
+  - { reg: '$sgpr0' }
+  - { reg: '$sgpr1' }
+  - { reg: '$vgpr2' }
+body: |
+  ; CHECK-LABEL: name: early_term_scc0_with_kill
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.1(0x80000000), %bb.3(0x00000000)
+  ; CHECK:   liveins: $sgpr0, $sgpr1, $vgpr2
+  ; CHECK:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK:   V_CMPX_LE_F32_nosdst_e32 0, killed $vgpr2, implicit-def $exec, implicit $mode, implicit $exec
+  ; CHECK:   S_CBRANCH_EXECZ %bb.3, implicit $exec
+  ; CHECK: bb.1:
+  ; CHECK:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
+  ; CHECK:   liveins: $sgpr0, $sgpr1, $vgpr0
+  ; CHECK:   dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc
+  ; CHECK:   S_CBRANCH_SCC0 %bb.3, implicit $scc
+  ; CHECK: bb.4:
+  ; CHECK:   successors: %bb.2(0x80000000)
+  ; CHECK:   liveins: $vgpr0, $scc
+  ; CHECK:   $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+  ; CHECK: bb.2:
+  ; CHECK:   liveins: $vgpr0, $vgpr1
+  ; CHECK:   EXP 1, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
+  ; CHECK:   EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+  ; CHECK:   S_ENDPGM 0
+  ; CHECK: bb.3:
+  ; CHECK:   $exec_lo = S_MOV_B32 0
+  ; CHECK:   EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec
+  ; CHECK:   S_ENDPGM 0
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $vgpr2
+    successors: %bb.1
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    SI_KILL_F32_COND_IMM_TERMINATOR killed $vgpr2, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+
+  bb.1:
+    liveins: $sgpr0, $sgpr1, $vgpr0
+    successors: %bb.2
+    dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc
+    SI_EARLY_TERMINATE_SCC0 implicit $scc, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+
+  bb.2:
+    liveins: $vgpr0, $vgpr1
+    EXP 1, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
+    EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
+    S_ENDPGM 0
+...
+
+---
+name: early_term_scc0_gs
+tracksRegLiveness: true
+liveins:
+  - { reg: '$sgpr0' }
+  - { reg: '$sgpr1' }
+body: |
+  ; CHECK-LABEL: name: early_term_scc0_gs
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   liveins: $sgpr0, $sgpr1
+  ; CHECK:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK:   dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc
+  ; CHECK: bb.1:
+  ; CHECK:   liveins: $vgpr0
+  ; CHECK:   S_ENDPGM 0
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+    successors: %bb.1
+
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc
+    SI_EARLY_TERMINATE_SCC0 implicit $scc, implicit $exec
+
+  bb.1:
+    liveins: $vgpr0
+    S_ENDPGM 0
+...
+
+---
+name: early_term_scc0_cs
+tracksRegLiveness: true
+liveins:
+  - { reg: '$sgpr0' }
+  - { reg: '$sgpr1' }
+body: |
+  ; CHECK-LABEL: name: early_term_scc0_cs
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.1(0x80000000), %bb.2(0x00000000)
+  ; CHECK:   liveins: $sgpr0, $sgpr1
+  ; CHECK:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK:   dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc
+  ; CHECK:   S_CBRANCH_SCC0 %bb.2, implicit $scc
+  ; CHECK: bb.1:
+  ; CHECK:   liveins: $vgpr0
+  ; CHECK:   S_ENDPGM 0
+  ; CHECK: bb.2:
+  ; CHECK:   $exec_lo = S_MOV_B32 0
+  ; CHECK:   S_ENDPGM 0
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+    successors: %bb.1
+
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc
+    SI_EARLY_TERMINATE_SCC0 implicit $scc, implicit $exec
+
+  bb.1:
+    liveins: $vgpr0
+    S_ENDPGM 0
+...

