[llvm] [AMDGPU] Fix interaction between WQM and llvm.amdgcn.init.exec (PR #93680)

Wed May 29 06:15:50 PDT 2024

https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/93680

- **[AMDGPU] Reduce use of continue in SIWholeQuadMode. NFC.**
- **[AMDGPU] Move INIT_EXEC lowering from SILowerControlFlow to SIWholeQuadMode**
- **[AMDGPU] New test for WQM and llvm.amdgcn.init.exec**
- **[AMDGPU] Fix interaction between WQM and llvm.amdgcn.init.exec**


>From d9d4ba67399ff0e7bb2604a51758d0ab3c5ff770 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 29 May 2024 10:42:40 +0100
Subject: [PATCH 1/4] [AMDGPU] Reduce use of continue in SIWholeQuadMode. NFC.

---
 llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 92 +++++++++-------------
 1 file changed, 36 insertions(+), 56 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index ea8109bbee9ae..09dc1c781e2f3 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -278,11 +278,10 @@ LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
 
     for (const MachineInstr &MI : *BII.first) {
       auto III = Instructions.find(&MI);
-      if (III == Instructions.end())
-        continue;
-
-      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
-             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
+      if (III != Instructions.end()) {
+        dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
+               << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
+      }
     }
   }
 }
@@ -455,10 +454,8 @@ void SIWholeQuadMode::markOperand(const MachineInstr &MI,
     for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
       LiveRange &LR = LIS->getRegUnit(Unit);
       const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
-      if (!Value)
-        continue;
-
-      markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
+      if (Value)
+        markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
     }
   }
 }
@@ -499,19 +496,16 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
 
       if (TII->isWQM(Opcode)) {
         // If LOD is not supported WQM is not needed.
-        if (!ST->hasExtendedImageInsts())
-          continue;
         // Only generate implicit WQM if implicit derivatives are required.
         // This avoids inserting unintended WQM if a shader type without
         // implicit derivatives uses an image sampling instruction.
-        if (!HasImplicitDerivatives)
-          continue;
-        // Sampling instructions don't need to produce results for all pixels
-        // in a quad, they just require all inputs of a quad to have been
-        // computed for derivatives.
-        markInstructionUses(MI, StateWQM, Worklist);
-        GlobalFlags |= StateWQM;
-        continue;
+        if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
+          // Sampling instructions don't need to produce results for all pixels
+          // in a quad, they just require all inputs of a quad to have been
+          // computed for derivatives.
+          markInstructionUses(MI, StateWQM, Worklist);
+          GlobalFlags |= StateWQM;
+        }
       } else if (Opcode == AMDGPU::WQM) {
         // The WQM intrinsic requires its output to have all the helper lanes
         // correct, so we need it to be in WQM.
@@ -520,7 +514,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
       } else if (Opcode == AMDGPU::SOFT_WQM) {
         LowerToCopyInstrs.push_back(&MI);
         SoftWQMInstrs.push_back(&MI);
-        continue;
       } else if (Opcode == AMDGPU::STRICT_WWM) {
         // The STRICT_WWM intrinsic doesn't make the same guarantee, and plus
         // it needs to be executed in WQM or Exact so that its copy doesn't
@@ -528,7 +521,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         markInstructionUses(MI, StateStrictWWM, Worklist);
         GlobalFlags |= StateStrictWWM;
         LowerToMovInstrs.push_back(&MI);
-        continue;
       } else if (Opcode == AMDGPU::STRICT_WQM ||
                  TII->isDualSourceBlendEXP(MI)) {
         // STRICT_WQM is similar to STRICTWWM, but instead of enabling all
@@ -551,7 +543,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
           GlobalFlags |= StateExact;
           III.Disabled = StateWQM | StateStrict;
         }
-        continue;
       } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
                  Opcode == AMDGPU::DS_PARAM_LOAD ||
                  Opcode == AMDGPU::LDS_DIRECT_LOAD ||
@@ -561,7 +552,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         InstrInfo &II = Instructions[&MI];
         II.Needs |= StateStrictWQM;
         GlobalFlags |= StateStrictWQM;
-        continue;
       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                  Opcode == AMDGPU::V_SET_INACTIVE_B64) {
         III.Disabled = StateStrict;
@@ -574,7 +564,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
           }
         }
         SetInactiveInstrs.push_back(&MI);
-        continue;
       } else if (TII->isDisableWQM(MI)) {
         BBI.Needs |= StateExact;
         if (!(BBI.InNeeds & StateExact)) {
@@ -583,40 +572,33 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         }
         GlobalFlags |= StateExact;
         III.Disabled = StateWQM | StateStrict;
-        continue;
-      } else {
-        if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
-          LiveMaskQueries.push_back(&MI);
-        } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
-                   Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
-                   Opcode == AMDGPU::SI_DEMOTE_I1) {
-          KillInstrs.push_back(&MI);
-          BBI.NeedsLowering = true;
-        } else if (WQMOutputs) {
-          // The function is in machine SSA form, which means that physical
-          // VGPRs correspond to shader inputs and outputs. Inputs are
-          // only used, outputs are only defined.
-          // FIXME: is this still valid?
-          for (const MachineOperand &MO : MI.defs()) {
-            if (!MO.isReg())
-              continue;
-
-            Register Reg = MO.getReg();
-
-            if (!Reg.isVirtual() &&
-                TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
-              Flags = StateWQM;
-              break;
-            }
+      } else if (Opcode == AMDGPU::SI_PS_LIVE ||
+                 Opcode == AMDGPU::SI_LIVE_MASK) {
+        LiveMaskQueries.push_back(&MI);
+      } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
+                 Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
+                 Opcode == AMDGPU::SI_DEMOTE_I1) {
+        KillInstrs.push_back(&MI);
+        BBI.NeedsLowering = true;
+      } else if (WQMOutputs) {
+        // The function is in machine SSA form, which means that physical
+        // VGPRs correspond to shader inputs and outputs. Inputs are
+        // only used, outputs are only defined.
+        // FIXME: is this still valid?
+        for (const MachineOperand &MO : MI.defs()) {
+          Register Reg = MO.getReg();
+          if (Reg.isPhysical() &&
+              TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
+            Flags = StateWQM;
+            break;
           }
         }
-
-        if (!Flags)
-          continue;
       }
 
-      markInstruction(MI, Flags, Worklist);
-      GlobalFlags |= Flags;
+      if (Flags) {
+        markInstruction(MI, Flags, Worklist);
+        GlobalFlags |= Flags;
+      }
     }
   }
 
@@ -1568,8 +1550,6 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
       SplitPoint = lowerKillF32(*MBB, *MI);
       break;
-    default:
-      continue;
     }
     if (SplitPoint)
       splitBlock(MBB, SplitPoint);

>From 5f5282e328cb2843728f55195a79559f4e5728f7 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 29 May 2024 13:04:52 +0100
Subject: [PATCH 2/4] [AMDGPU] Move INIT_EXEC lowering from SILowerControlFlow
 to SIWholeQuadMode

NFCI; this just preserves SI_INIT_EXEC and SI_INIT_EXEC_FROM_INPUT
instructions a little longer so that we can reliably identify them in
SIWholeQuadMode.
---
 llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 103 ------------------
 llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp    | 103 +++++++++++++++++-
 2 files changed, 102 insertions(+), 104 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index f178324dbbe24..5dc3457b5bfae 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -103,8 +103,6 @@ class SILowerControlFlow : public MachineFunctionPass {
 
   MachineBasicBlock *emitEndCf(MachineInstr &MI);
 
-  void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI);
-
   void findMaskOperands(MachineInstr &MI, unsigned OpNo,
                         SmallVectorImpl<MachineOperand> &Src) const;
 
@@ -709,95 +707,6 @@ MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) {
   return SplitBB;
 }
 
-void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
-                                       MachineInstr &MI) {
-  MachineFunction &MF = *MBB->getParent();
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  bool IsWave32 = ST.isWave32();
-
-  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
-    // This should be before all vector instructions.
-    MachineInstr *InitMI = BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
-            TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec)
-        .addImm(MI.getOperand(0).getImm());
-    if (LIS) {
-      LIS->RemoveMachineInstrFromMaps(MI);
-      LIS->InsertMachineInstrInMaps(*InitMI);
-    }
-    MI.eraseFromParent();
-    return;
-  }
-
-  // Extract the thread count from an SGPR input and set EXEC accordingly.
-  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
-  //
-  // S_BFE_U32 count, input, {shift, 7}
-  // S_BFM_B64 exec, count, 0
-  // S_CMP_EQ_U32 count, 64
-  // S_CMOV_B64 exec, -1
-  Register InputReg = MI.getOperand(0).getReg();
-  MachineInstr *FirstMI = &*MBB->begin();
-  if (InputReg.isVirtual()) {
-    MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
-    assert(DefInstr && DefInstr->isCopy());
-    if (DefInstr->getParent() == MBB) {
-      if (DefInstr != FirstMI) {
-        // If the `InputReg` is defined in current block, we also need to
-        // move that instruction to the beginning of the block.
-        DefInstr->removeFromParent();
-        MBB->insert(FirstMI, DefInstr);
-        if (LIS)
-          LIS->handleMove(*DefInstr);
-      } else {
-        // If first instruction is definition then move pointer after it.
-        FirstMI = &*std::next(FirstMI->getIterator());
-      }
-    }
-  }
-
-  // Insert instruction sequence at block beginning (before vector operations).
-  const DebugLoc DL = MI.getDebugLoc();
-  const unsigned WavefrontSize = ST.getWavefrontSize();
-  const unsigned Mask = (WavefrontSize << 1) - 1;
-  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
-                   .addReg(InputReg)
-                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
-  if (LV)
-    LV->recomputeForSingleDefVirtReg(InputReg);
-  auto BfmMI =
-      BuildMI(*MBB, FirstMI, DL,
-              TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
-          .addReg(CountReg)
-          .addImm(0);
-  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
-                   .addReg(CountReg, RegState::Kill)
-                   .addImm(WavefrontSize);
-  if (LV)
-    LV->getVarInfo(CountReg).Kills.push_back(CmpMI);
-  auto CmovMI =
-      BuildMI(*MBB, FirstMI, DL,
-              TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
-              Exec)
-          .addImm(-1);
-
-  if (!LIS) {
-    MI.eraseFromParent();
-    return;
-  }
-
-  LIS->RemoveMachineInstrFromMaps(MI);
-  MI.eraseFromParent();
-
-  LIS->InsertMachineInstrInMaps(*BfeMI);
-  LIS->InsertMachineInstrInMaps(*BfmMI);
-  LIS->InsertMachineInstrInMaps(*CmpMI);
-  LIS->InsertMachineInstrInMaps(*CmovMI);
-
-  RecomputeRegs.insert(InputReg);
-  LIS->createAndComputeVirtRegInterval(CountReg);
-}
-
 bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
   for (auto &I : MBB.instrs()) {
     if (!I.isDebugInstr() && !I.isUnconditionalBranch())
@@ -927,18 +836,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
         SplitMBB = process(MI);
         Changed = true;
         break;
-
-      // FIXME: find a better place for this
-      case AMDGPU::SI_INIT_EXEC:
-      case AMDGPU::SI_INIT_EXEC_FROM_INPUT:
-        lowerInitExec(MBB, MI);
-        if (LIS)
-          LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
-        Changed = true;
-        break;
-
-      default:
-        break;
       }
 
       if (SplitMBB != MBB) {
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 09dc1c781e2f3..f57faa86e90ca 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -177,6 +177,7 @@ class SIWholeQuadMode : public MachineFunctionPass {
   SmallVector<MachineInstr *, 4> LowerToMovInstrs;
   SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
   SmallVector<MachineInstr *, 4> KillInstrs;
+  SmallVector<MachineInstr *, 4> InitExecInstrs;
 
   void printInfo();
 
@@ -223,6 +224,8 @@ class SIWholeQuadMode : public MachineFunctionPass {
   void lowerLiveMaskQueries();
   void lowerCopyInstrs();
   void lowerKillInstrs(bool IsWQM);
+  void lowerInitExec(MachineInstr &MI);
+  void lowerInitExecInstrs();
 
 public:
   static char ID;
@@ -580,6 +583,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                  Opcode == AMDGPU::SI_DEMOTE_I1) {
         KillInstrs.push_back(&MI);
         BBI.NeedsLowering = true;
+      } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
+                 Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
+        InitExecInstrs.push_back(&MI);
       } else if (WQMOutputs) {
         // The function is in machine SSA form, which means that physical
         // VGPRs correspond to shader inputs and outputs. Inputs are
@@ -1556,6 +1562,97 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
   }
 }
 
+void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
+  MachineBasicBlock *MBB = MI.getParent();
+  bool IsWave32 = ST->isWave32();
+
+  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
+    // This should be before all vector instructions.
+    MachineInstr *InitMI =
+        BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
+                TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
+                Exec)
+            .addImm(MI.getOperand(0).getImm());
+    if (LIS) {
+      LIS->RemoveMachineInstrFromMaps(MI);
+      LIS->InsertMachineInstrInMaps(*InitMI);
+    }
+    MI.eraseFromParent();
+    return;
+  }
+
+  // Extract the thread count from an SGPR input and set EXEC accordingly.
+  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
+  //
+  // S_BFE_U32 count, input, {shift, 7}
+  // S_BFM_B64 exec, count, 0
+  // S_CMP_EQ_U32 count, 64
+  // S_CMOV_B64 exec, -1
+  Register InputReg = MI.getOperand(0).getReg();
+  MachineInstr *FirstMI = &*MBB->begin();
+  if (InputReg.isVirtual()) {
+    MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
+    assert(DefInstr && DefInstr->isCopy());
+    if (DefInstr->getParent() == MBB) {
+      if (DefInstr != FirstMI) {
+        // If the `InputReg` is defined in current block, we also need to
+        // move that instruction to the beginning of the block.
+        DefInstr->removeFromParent();
+        MBB->insert(FirstMI, DefInstr);
+        if (LIS)
+          LIS->handleMove(*DefInstr);
+      } else {
+        // If first instruction is definition then move pointer after it.
+        FirstMI = &*std::next(FirstMI->getIterator());
+      }
+    }
+  }
+
+  // Insert instruction sequence at block beginning (before vector operations).
+  const DebugLoc DL = MI.getDebugLoc();
+  const unsigned WavefrontSize = ST->getWavefrontSize();
+  const unsigned Mask = (WavefrontSize << 1) - 1;
+  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
+                   .addReg(InputReg)
+                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
+  auto BfmMI =
+      BuildMI(*MBB, FirstMI, DL,
+              TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
+          .addReg(CountReg)
+          .addImm(0);
+  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
+                   .addReg(CountReg, RegState::Kill)
+                   .addImm(WavefrontSize);
+  auto CmovMI =
+      BuildMI(*MBB, FirstMI, DL,
+              TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
+              Exec)
+          .addImm(-1);
+
+  if (!LIS) {
+    MI.eraseFromParent();
+    return;
+  }
+
+  LIS->RemoveMachineInstrFromMaps(MI);
+  MI.eraseFromParent();
+
+  LIS->InsertMachineInstrInMaps(*BfeMI);
+  LIS->InsertMachineInstrInMaps(*BfmMI);
+  LIS->InsertMachineInstrInMaps(*CmpMI);
+  LIS->InsertMachineInstrInMaps(*CmovMI);
+
+  LIS->removeInterval(InputReg);
+  LIS->createAndComputeVirtRegInterval(InputReg);
+  LIS->createAndComputeVirtRegInterval(CountReg);
+}
+
+void SIWholeQuadMode::lowerInitExecInstrs() {
+  for (MachineInstr *MI : InitExecInstrs)
+    lowerInitExec(*MI);
+}
+
 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
                     << " ------------- \n");
@@ -1567,6 +1664,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   LowerToCopyInstrs.clear();
   LowerToMovInstrs.clear();
   KillInstrs.clear();
+  InitExecInstrs.clear();
   StateTransition.clear();
 
   ST = &MF.getSubtarget<GCNSubtarget>();
@@ -1605,11 +1703,14 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
 
   // Shader is simple does not need any state changes or any complex lowering
   if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
-      LowerToMovInstrs.empty() && KillInstrs.empty()) {
+      LowerToMovInstrs.empty() && KillInstrs.empty() &&
+      InitExecInstrs.empty()) {
     lowerLiveMaskQueries();
     return !LiveMaskQueries.empty();
   }
 
+  lowerInitExecInstrs();
+
   MachineBasicBlock &Entry = MF.front();
   MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
 

>From 2448b0e301d2fb3dc64e014ab521fb2fd66ac23f Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 29 May 2024 14:13:37 +0100
Subject: [PATCH 3/4] [AMDGPU] New test for WQM and llvm.amdgcn.init.exec

---
 llvm/test/CodeGen/AMDGPU/wqm.ll | 46 +++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 95dfb12c8dbae..15279fea91960 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -3312,6 +3312,52 @@ main_body:
   ret void
 }
 
+; Test the interaction between wqm and llvm.amdgcn.init.exec.
+define amdgpu_gs void @wqm_init_exec() {
+; GFX9-W64-LABEL: wqm_init_exec:
+; GFX9-W64:       ; %bb.0: ; %bb
+; GFX9-W64-NEXT:    s_mov_b64 exec, -1
+; GFX9-W64-NEXT:    s_mov_b32 s0, 0
+; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-W64-NEXT:    s_mov_b32 s1, s0
+; GFX9-W64-NEXT:    s_mov_b32 s2, s0
+; GFX9-W64-NEXT:    s_mov_b32 s3, s0
+; GFX9-W64-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-W64-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
+; GFX9-W64-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $exec
+; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-W64-NEXT:    ds_write_b32 v0, v1
+; GFX9-W64-NEXT:    s_endpgm
+;
+; GFX10-W32-LABEL: wqm_init_exec:
+; GFX10-W32:       ; %bb.0: ; %bb
+; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
+; GFX10-W32-NEXT:    s_mov_b32 exec_lo, -1
+; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-W32-NEXT:    s_mov_b32 s0, 0
+; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-W32-NEXT:    s_mov_b32 s2, s0
+; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s1
+; GFX10-W32-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-W32-NEXT:    v_mov_b32_e32 v4, s0
+; GFX10-W32-NEXT:    s_mov_b32 s1, s0
+; GFX10-W32-NEXT:    s_mov_b32 s3, s0
+; GFX10-W32-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-W32-NEXT:    ds_write_b32 v0, v4
+; GFX10-W32-NEXT:    s_endpgm
+bb:
+  call void @llvm.amdgcn.init.exec(i64 -1)
+  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> zeroinitializer, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
+  %i = call i32 @llvm.amdgcn.wqm.i32(i32 0)
+  store i32 %i, i32 addrspace(3)* null, align 4
+  ret void
+}
+
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
 declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
 

>From d996f0a4887a7e4dd6d179b64e575a04eafe8ff7 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 29 May 2024 13:55:55 +0100
Subject: [PATCH 4/4] [AMDGPU] Fix interaction between WQM and
 llvm.amdgcn.init.exec

Whole quad mode requires inserting a copy of the initial EXEC mask. In a
function that also uses llvm.amdgcn.init.exec, insert the COPY after
initializing EXEC.
---
 llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 24 ++++++++++++++++------
 llvm/test/CodeGen/AMDGPU/wqm.ll            |  2 +-
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index f57faa86e90ca..fe8ee0a4855c8 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -225,7 +225,7 @@ class SIWholeQuadMode : public MachineFunctionPass {
   void lowerCopyInstrs();
   void lowerKillInstrs(bool IsWQM);
   void lowerInitExec(MachineInstr &MI);
-  void lowerInitExecInstrs();
+  MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry);
 
 public:
   static char ID;
@@ -1648,9 +1648,23 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
   LIS->createAndComputeVirtRegInterval(CountReg);
 }
 
-void SIWholeQuadMode::lowerInitExecInstrs() {
-  for (MachineInstr *MI : InitExecInstrs)
+/// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry
+/// for instructions that depend on EXEC.
+MachineBasicBlock::iterator
+SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry) {
+  MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();
+
+  for (MachineInstr *MI : InitExecInstrs) {
+    // Try to handle undefined cases gracefully:
+    // - multiple INIT_EXEC instructions
+    // - INIT_EXEC instructions not in the entry block
+    if (MI->getParent() == &Entry)
+      InsertPt = std::next(MI->getIterator());
+
     lowerInitExec(*MI);
+  }
+
+  return InsertPt;
 }
 
 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
@@ -1709,10 +1723,8 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
     return !LiveMaskQueries.empty();
   }
 
-  lowerInitExecInstrs();
-
   MachineBasicBlock &Entry = MF.front();
-  MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
+  MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry);
 
   // Store a copy of the original live mask when required
   if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 15279fea91960..f3d5e557a409e 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -3334,8 +3334,8 @@ define amdgpu_gs void @wqm_init_exec() {
 ;
 ; GFX10-W32-LABEL: wqm_init_exec:
 ; GFX10-W32:       ; %bb.0: ; %bb
-; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, -1
+; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-W32-NEXT:    s_mov_b32 s0, 0
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo