[llvm] 2e87ed8 - [AMDGPU] WQM: Allow insertion of exact mode transition as terminator

Carl Ritson via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 1 22:01:36 PDT 2023


Author: Carl Ritson
Date: 2023-06-02T14:00:54+09:00
New Revision: 2e87ed80b23ad5c30a85762aa879739fa1e9cea0

URL: https://github.com/llvm/llvm-project/commit/2e87ed80b23ad5c30a85762aa879739fa1e9cea0
DIFF: https://github.com/llvm/llvm-project/commit/2e87ed80b23ad5c30a85762aa879739fa1e9cea0.diff

LOG: [AMDGPU] WQM: Allow insertion of exact mode transition as terminator

Allow WQM pass to insert transitions to exact mode among block
terminators, instead of forcing them to occur before terminators.

This should not yield any functional change, but allows block
splitting of control flow, such as that in D145329.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D151797

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SIInstructions.td
    llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 3ddd0b3b8389..2c3f0ebae21c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1953,6 +1953,18 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.setDesc(get(AMDGPU::S_AND_B32));
     break;
 
+  case AMDGPU::S_AND_SAVEEXEC_B64_term:
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
+    break;
+
+  case AMDGPU::S_AND_SAVEEXEC_B32_term:
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
+    break;
+
   case AMDGPU::V_MOV_B64_PSEUDO: {
     Register Dst = MI.getOperand(0).getReg();
     Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
@@ -2730,11 +2742,13 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
     case AMDGPU::S_OR_B64_term:
     case AMDGPU::S_ANDN2_B64_term:
     case AMDGPU::S_AND_B64_term:
+    case AMDGPU::S_AND_SAVEEXEC_B64_term:
     case AMDGPU::S_MOV_B32_term:
     case AMDGPU::S_XOR_B32_term:
     case AMDGPU::S_OR_B32_term:
     case AMDGPU::S_ANDN2_B32_term:
     case AMDGPU::S_AND_B32_term:
+    case AMDGPU::S_AND_SAVEEXEC_B32_term:
       break;
     case AMDGPU::SI_IF:
     case AMDGPU::SI_ELSE:

diff  --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 31914cc07eea..17fbce3e2edc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -306,6 +306,7 @@ def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
 def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
 def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
 def S_AND_B64_term : WrapTerminatorInst<S_AND_B64>;
+def S_AND_SAVEEXEC_B64_term : WrapTerminatorInst<S_AND_SAVEEXEC_B64>;
 }
 
 let WaveSizePredicate = isWave32 in {
@@ -314,6 +315,7 @@ def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
 def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
 def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
 def S_AND_B32_term : WrapTerminatorInst<S_AND_B32>;
+def S_AND_SAVEEXEC_B32_term : WrapTerminatorInst<S_AND_SAVEEXEC_B32>;
 }
 
 

diff  --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index e22b950ce897..7b5f91e922d8 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -158,10 +158,11 @@ class SIWholeQuadMode : public MachineFunctionPass {
   MachinePostDominatorTree *PDT;
 
   unsigned AndOpc;
+  unsigned AndTermOpc;
   unsigned AndN2Opc;
   unsigned XorOpc;
   unsigned AndSaveExecOpc;
-  unsigned OrSaveExecOpc;
+  unsigned AndSaveExecTermOpc;
   unsigned WQMOpc;
   Register Exec;
   Register LiveMaskReg;
@@ -1206,13 +1207,25 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator Before,
                               Register SaveWQM) {
+  bool IsTerminator = Before == MBB.end();
+  if (!IsTerminator) {
+    auto FirstTerm = MBB.getFirstTerminator();
+    if (FirstTerm != MBB.end()) {
+      SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
+      SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
+      IsTerminator = BeforeIdx > FirstTermIdx;
+    }
+  }
+
   MachineInstr *MI;
 
   if (SaveWQM) {
-    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM)
+    unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
              .addReg(LiveMaskReg);
   } else {
-    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec)
+    unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec)
              .addReg(Exec)
              .addReg(LiveMaskReg);
   }
@@ -1365,9 +1378,6 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
         Needs = StateExact | StateWQM | StateStrict;
       }
 
-      if (MI.isTerminator() && OutNeeds == StateExact)
-        Needs = StateExact;
-
       ++Next;
     } else {
       // End of basic block
@@ -1591,18 +1601,20 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
 
   if (ST->isWave32()) {
     AndOpc = AMDGPU::S_AND_B32;
+    AndTermOpc = AMDGPU::S_AND_B32_term;
     AndN2Opc = AMDGPU::S_ANDN2_B32;
     XorOpc = AMDGPU::S_XOR_B32;
     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
-    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
+    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
     WQMOpc = AMDGPU::S_WQM_B32;
     Exec = AMDGPU::EXEC_LO;
   } else {
     AndOpc = AMDGPU::S_AND_B64;
+    AndTermOpc = AMDGPU::S_AND_B64_term;
     AndN2Opc = AMDGPU::S_ANDN2_B64;
     XorOpc = AMDGPU::S_XOR_B64;
     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
-    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
+    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
     WQMOpc = AMDGPU::S_WQM_B64;
     Exec = AMDGPU::EXEC;
   }


        


More information about the llvm-commits mailing list