[llvm] [amdgpu] Add llvm.amdgcn.init.whole.wave intrinsic (PR #105822)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 23 05:29:56 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-llvm-ir
Author: Diana Picus (rovka)
Changes:
This intrinsic is meant to be used in functions that have a "tail" that needs to run with all lanes enabled. The tail may contain complex control flow that makes it unsuitable for the existing WWM intrinsics. Instead, we pretend that the function starts with all lanes enabled, branch into the actual body of the function only for the lanes that were meant to run it, and finally have all lanes rejoin to run the tail.
As such, the intrinsic returns the EXEC mask for the body of the function, and it is meant to be used only as part of a very limited pattern (for now only in amdgpu_cs_chain functions):
```
entry:
  %func_exec = call i1 @llvm.amdgcn.init.whole.wave()
  br i1 %func_exec, label %func, label %tail

func:
  ; ... stuff that should run with the actual EXEC mask
  br label %tail

tail:
  ; ... stuff that runs with all the lanes enabled;
  ; can contain more than one basic block
```
It's an error to use the result of this intrinsic for anything other than a branch (unfortunately, checking this in the verifier is non-trivial, because SIAnnotateControlFlow will introduce a call to amdgcn.if between the intrinsic and the branch).
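For reference, this is roughly the shape the pattern takes after SIAnnotateControlFlow in a wave32 function (a sketch rather than the pass's verbatim output; the `%if`/`%cond`/`%mask` names are illustrative):
```
entry:
  %func_exec = call i1 @llvm.amdgcn.init.whole.wave()
  ; SIAnnotateControlFlow routes the branch condition through amdgcn.if,
  ; which also produces the mask that end.cf will restore.
  %if = call { i1, i32 } @llvm.amdgcn.if.i32(i1 %func_exec)
  %cond = extractvalue { i1, i32 } %if, 0
  %mask = extractvalue { i1, i32 } %if, 1
  br i1 %cond, label %func, label %tail

func:
  ; ...
  br label %tail

tail:
  call void @llvm.amdgcn.end.cf.i32(i32 %mask)
  ; ...
```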
Since the usage pattern is very strict, the backend can optimize away the intrinsic and the branch following it (in practice EXEC will already contain the correct value when entering the function, because it will have been set by the llvm.amdgcn.cs.chain intrinsic before jumping in). The removal is done early, in `finalizeLowering`; however, the fact that the function conceptually started in whole wave mode is recorded in the machine function info (`hasInitWholeWave`). This is useful in prolog/epilog insertion, where we can skip saving the inactive lanes of CSRs (since if the function started with all lanes active, there are no inactive lanes to preserve).
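To illustrate the CSR point (a hedged wave32 sketch with made-up registers and offsets, not output from this patch): a chain function that makes a chain call would normally preserve the inactive lanes of its WWM callee-saved VGPRs with a sequence like the one below, which becomes unnecessary when the function starts in whole wave mode:
```
s_or_saveexec_b32 s4, -1         ; temporarily enable all lanes
scratch_store_b32 off, v40, s32  ; spill the CSR, including its inactive lanes
s_mov_b32 exec_lo, s4            ; restore the real mask
```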
Some of the generated code could still use more optimization (#98864 might help with some of that). One important thing for front-ends to note: for now it's recommended to avoid phis in `tail` of large structs/vectors where not all elements are modified by `shader` - prefer small phis and build the aggregate in `tail` (see the `basic` vs `phi-whole-struct` test cases, and the sketch below).
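A minimal sketch of the recommended shape (names and types are illustrative; it assumes only the first two elements of a four-element struct are modified in `func`):
```
tail:
  ; Small phis only for the elements that %func actually changes...
  %x = phi i32 [ %x.in, %entry ], [ %x.out, %func ]
  %y = phi i32 [ %y.in, %entry ], [ %y.out, %func ]
  ; ...then rebuild the aggregate here, rather than using a single phi
  ; of the whole { i32, i32, i32, i32 } value.
  %agg.0 = insertvalue { i32, i32, i32, i32 } %vgpr.in, i32 %x, 0
  %agg.1 = insertvalue { i32, i32, i32, i32 } %agg.0, i32 %y, 1
```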
---
Patch is 107.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/105822.diff
21 Files Affected:
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+14)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+5)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+10)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h (+1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h (+5)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td (+1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+3)
- (modified) llvm/lib/Target/AMDGPU/SIFrameLowering.cpp (+8-4)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+133)
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+8)
- (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h (+3)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll (+1545)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll (+254)
- (modified) llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir (+29)
- (modified) llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll (+2)
- (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll (+1)
- (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll (+1)
- (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll (+1)
- (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir (+4)
- (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll (+4)
``````````diff
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index dc13a35c66f9ab..a552b758a1ace7 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -208,6 +208,20 @@ def int_amdgcn_init_exec_from_input : Intrinsic<[],
[IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback,
IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
+// Sets the function into whole-wave-mode and returns whether the lane was
+// active when entering the function. A branch depending on this return will
+// revert the EXEC mask to what it was when entering the function, thus
+// resulting in a no-op. This pattern is used to optimize branches when function
+// tails need to be run in whole-wave-mode. It may also have other consequences
+// (mostly related to WWM CSR handling) that differentiate it from using
+// a plain `amdgcn.init.exec -1`.
+//
+// Can only be used in functions with the `amdgpu_cs_chain` calling convention.
+// Using this intrinsic without immediately branching on its return value is an
+// error.
+def int_amdgcn_init_whole_wave : Intrinsic<[llvm_i1_ty], [], [
+ IntrHasSideEffects, IntrNoMem, IntrNoDuplicate, IntrConvergent]>;
+
def int_amdgcn_wavefrontsize :
ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 0daaf6b6576030..380dc7d3312f32 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2738,6 +2738,11 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
SelectDSBvhStackIntrinsic(N);
return;
+ case Intrinsic::amdgcn_init_whole_wave:
+ CurDAG->getMachineFunction()
+ .getInfo<SIMachineFunctionInfo>()
+ ->setInitWholeWave();
+ break;
}
SelectCode(N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 17071970ca4bfe..06192f74e1e0ed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1772,6 +1772,14 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
+bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
+ MachineFunction *MF = MI.getParent()->getParent();
+ SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
+
+ MFInfo->setInitWholeWave();
+ return selectImpl(MI, *CoverageInfo);
+}
+
bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
if (TM.getOptLevel() > CodeGenOptLevel::None) {
unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
@@ -2099,6 +2107,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
return selectDSAppendConsume(I, true);
case Intrinsic::amdgcn_ds_consume:
return selectDSAppendConsume(I, false);
+ case Intrinsic::amdgcn_init_whole_wave:
+ return selectInitWholeWave(I);
case Intrinsic::amdgcn_s_barrier:
return selectSBarrier(I);
case Intrinsic::amdgcn_raw_buffer_load_lds:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 207cd67f0eda0e..2ddee05d096b23 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -120,6 +120,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
+ bool selectInitWholeWave(MachineInstr &MI) const;
bool selectSBarrier(MachineInstr &MI) const;
bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 7efb7f825348e3..b1022e48b8d34f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -67,6 +67,8 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
// Kernel may need limited waves per EU for better performance.
bool WaveLimiter = false;
+ bool HasInitWholeWave = false;
+
public:
AMDGPUMachineFunction(const Function &F, const AMDGPUSubtarget &ST);
@@ -109,6 +111,9 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
return WaveLimiter;
}
+ bool hasInitWholeWave() const { return HasInitWholeWave; }
+ void setInitWholeWave() { HasInitWholeWave = true; }
+
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV) {
return allocateLDSGlobal(DL, GV, DynLDSAlign);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 69a1936a11fe05..5e7986734871cf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4979,6 +4979,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
break;
}
+ case Intrinsic::amdgcn_init_whole_wave:
case Intrinsic::amdgcn_live_mask: {
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
break;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 95c4859674ecc4..2cd5fb2b94285c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -329,6 +329,7 @@ def : SourceOfDivergence<int_amdgcn_mov_dpp>;
def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
def : SourceOfDivergence<int_amdgcn_update_dpp>;
def : SourceOfDivergence<int_amdgcn_writelane>;
+def : SourceOfDivergence<int_amdgcn_init_whole_wave>;
foreach intr = AMDGPUMFMAIntrinsics908 in
def : SourceOfDivergence<intr>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 7a9735790371a1..3fcc750646d6ab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1731,6 +1731,9 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
? DenormalMode::IEEE
: DenormalMode::PreserveSign;
+ if (YamlMFI.HasInitWholeWave)
+ MFI->setInitWholeWave();
+
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 8c951105101d96..dfdc7ad32b00c7 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1343,10 +1343,14 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
// Allocate spill slots for WWM reserved VGPRs.
// For chain functions, we only need to do this if we have calls to
- // llvm.amdgcn.cs.chain.
- bool IsChainWithoutCalls =
- FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
- if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
+ // llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
+ // chain functions do not return) and the function did not contain a call to
+ // llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
+ // when entering the function).
+ bool IsChainWithoutRestores =
+ FuncInfo->isChainFunction() &&
+ (!MF.getFrameInfo().hasTailCall() || FuncInfo->hasInitWholeWave());
+ if (!FuncInfo->isEntryFunction() && !IsChainWithoutRestores) {
for (Register Reg : FuncInfo->getWWMReservedRegs()) {
const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ecd4451c504727..b50c92df5479fc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15671,6 +15671,133 @@ static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
}
}
+static void removeInitWholeWaveBranch(MachineFunction &MF,
+ MachineRegisterInfo &MRI,
+ const SIInstrInfo *TII) {
+ // Remove SI_INIT_WHOLE_WAVE and the following SI_IF/END_CF and instead set
+ // EXEC to -1 at SI_END_CF.
+ auto IWWIt = find_if(MF.begin()->instrs(), [](const MachineInstr &MI) {
+ return MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE;
+ });
+ if (IWWIt == MF.begin()->instr_end())
+ return; // We've been here before (GISel runs finalizeLowering twice).
+
+ MachineInstr &If = *MRI.use_begin(IWWIt->getOperand(0).getReg())->getParent();
+ assert(If.getOpcode() == AMDGPU::SI_IF &&
+ "Unexpected user for init.whole.wave result");
+ assert(MRI.hasOneUse(IWWIt->getOperand(0).getReg()) &&
+ "Expected simple control flow");
+
+ MachineInstr &EndCf = *MRI.use_begin(If.getOperand(0).getReg())->getParent();
+ MachineBasicBlock *EndBB = EndCf.getParent();
+
+ // Update all the Phis: since we're removing a predecessor, we need to remove
+ // the corresponding pair of operands. However, we can't just drop the value
+ // coming from the 'if' block - that's going to be the value of the inactive
+ // lanes.
+ // %v = phi (%inactive, %if), (%active1, %shader1), ... (%activeN, %shaderN)
+ // should become
+ // %t = phi (%active1, %shader1), ... (%activeN, %shaderN)
+ // %v = v_set_inactive %t, %inactive
+ // Note that usually EndCf will be the first instruction after the phis and as
+ // such will serve as the end of the range when iterating over phis.
+ // Therefore, we shouldn't introduce any new instructions before it.
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ auto AfterEndCf = std::next(EndCf.getIterator());
+ for (auto &Phi : EndBB->phis()) {
+ Register PhiDest = Phi.getOperand(0).getReg();
+ const TargetRegisterClass *PhiRC = MRI.getRegClass(PhiDest);
+
+ Register NewPhiDest = MRI.createVirtualRegister(PhiRC);
+ Phi.getOperand(0).setReg(NewPhiDest);
+
+ unsigned InactiveOpIdx = 0;
+ for (unsigned I = 1; I < Phi.getNumOperands(); I += 2) {
+ if (Phi.getOperand(I + 1).getMBB() == If.getParent()) {
+ InactiveOpIdx = I;
+ break;
+ }
+ }
+ assert(InactiveOpIdx != 0 && "Broken phi?");
+
+ // At this point, the register class could be larger than 32 or 64, so we
+ // might have to use more than one V_SET_INACTIVE instruction.
+ unsigned Size = TRI.getRegSizeInBits(*PhiRC);
+ switch (Size) {
+ case 32:
+ BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
+ TII->get(AMDGPU::V_SET_INACTIVE_B32), PhiDest)
+ .addReg(NewPhiDest)
+ .add(Phi.getOperand(InactiveOpIdx));
+ break;
+ case 64:
+ BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
+ TII->get(AMDGPU::V_SET_INACTIVE_B64), PhiDest)
+ .addReg(NewPhiDest)
+ .add(Phi.getOperand(InactiveOpIdx));
+ break;
+ default: {
+ // For each 32-bit subregister of the register at InactiveOpIdx, insert
+ // a COPY to a new register, and a V_SET_INACTIVE_B32 using the
+ // corresponding subregisters of PhiDest and NewPhiDest.
+ // FIXME: There has to be a better way to iterate over this...
+ llvm::SmallVector<Register, 16> PhiSubRegs;
+ const unsigned SubRegIndices[] = {
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+ AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+ AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
+ AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
+ AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
+ AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
+ AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
+ AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31};
+ const unsigned NumSubRegs = Size / 32;
+ assert(sizeof(SubRegIndices) / sizeof(SubRegIndices[0]) >= NumSubRegs &&
+ "Not enough subregister indices");
+ for (unsigned I = 0; I != NumSubRegs; ++I) {
+ unsigned SubRegIdx = SubRegIndices[I];
+ Register InactiveSubReg =
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(), TII->get(AMDGPU::COPY),
+ InactiveSubReg)
+ .addReg(Phi.getOperand(InactiveOpIdx).getReg(), 0, SubRegIdx);
+
+ Register AllLanesSubReg =
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
+ TII->get(AMDGPU::V_SET_INACTIVE_B32), AllLanesSubReg)
+ .addReg(NewPhiDest, 0, SubRegIdx)
+ .addReg(InactiveSubReg);
+ PhiSubRegs.push_back(AllLanesSubReg);
+ }
+ // Now we need to combine the subregisters into the original register.
+ auto RegSequence = BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
+ TII->get(AMDGPU::REG_SEQUENCE), PhiDest);
+ for (unsigned I = 0; I < NumSubRegs; ++I) {
+ RegSequence.addReg(PhiSubRegs[I]);
+ RegSequence.addImm(SubRegIndices[I]);
+ }
+ break;
+ }
+ }
+
+ Phi.removeOperand(InactiveOpIdx + 1);
+ Phi.removeOperand(InactiveOpIdx);
+ }
+ If.getParent()->removeSuccessor(EndBB);
+
+ BuildMI(*EndBB, AfterEndCf, IWWIt->getDebugLoc(),
+ TII->get(MF.getSubtarget<GCNSubtarget>().isWave32()
+ ? AMDGPU::S_MOV_B32
+ : AMDGPU::S_MOV_B64),
+ TII->getRegisterInfo().getExec())
+ .addImm(-1);
+
+ EndCf.eraseFromParent();
+ If.eraseFromParent();
+ IWWIt->eraseFromParent();
+}
+
// Figure out which registers should be reserved for stack access. Only after
// the function is legalized do we know all of the non-spill stack objects or if
// calls are present.
@@ -15681,6 +15808,12 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
+ if (Info->hasInitWholeWave()) {
+ assert(Info->isChainFunction() &&
+ "init.whole.wave may only be used in chain functions");
+ removeInitWholeWaveBranch(MF, MRI, TII);
+ }
+
if (Info->isEntryFunction()) {
// Callable functions have fixed registers used for stack access.
reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 814d3182fb5df8..ffffa18e13c30d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -575,6 +575,14 @@ def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
let Defs = [EXEC];
}
+// Sets EXEC to all lanes and returns the previous EXEC.
+def SI_INIT_WHOLE_WAVE : SPseudoInstSI <
+ (outs SReg_1:$dst), (ins),
+ [(set i1:$dst, (int_amdgcn_init_whole_wave))]> {
+ let Defs = [EXEC];
+ let Uses = [EXEC];
+}
+
// Return for returning shaders to a shader variant epilog.
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
(outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 7af5e7388f841e..7cebfa29fe7b8d 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -289,6 +289,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
StringValue SGPRForEXECCopy;
StringValue LongBranchReservedReg;
+ bool HasInitWholeWave = false;
+
SIMachineFunctionInfo() = default;
SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
const TargetRegisterInfo &TRI,
@@ -336,6 +338,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
StringValue()); // Don't print out when it's empty.
YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
StringValue());
+ YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
}
};
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
new file mode 100644
index 00000000000000..6ad9e684273b28
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
@@ -0,0 +1,1545 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL12 %s
+; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL12 %s
+; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL10 %s
+; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL10 %s
+
+define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i32 inreg %exec, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 %x, i32 %y) {
+; GISEL12-LABEL: basic:
+; GISEL12: ; %bb.0: ; %entry
+; GISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL12-NEXT: s_wait_expcnt 0x0
+; GISEL12-NEXT: s_wait_samplecnt 0x0
+; GISEL12-NEXT: s_wait_bvhcnt 0x0
+; GISEL12-NEXT: s_wait_kmcnt 0x0
+; GISEL12-NEXT: s_mov_b32 s6, s3
+; GISEL12-NEXT: s_or_saveexec_b32 s3, -1
+; GISEL12-NEXT: v_dual_mov_b32 v0, v12 :: v_dual_mov_b32 v1, v13
+; GISEL12-NEXT: s_mov_b32 exec_lo, s3
+; GISEL12-NEXT: s_mov_b32 s7, s4
+; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GISEL12-NEXT: v_mov_b32_e32 v2, v0
+; GISEL12-NEXT: s_not_b32 exec_lo, exec_lo
+; GISEL12-NEXT: v_mov_b32_e32 v2, 0x47
+; GISEL12-NEXT: s_not_b32 exec_lo, exec_lo
+; GISEL12-NEXT: s_or_saveexec_b32 s3, -1
+; GISEL12-NEXT: v_cmp_ne_u32_e64 s4, 0, v2
+; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GISEL12-NEXT: v_mov_b32_e32 v2, s4
+; GISEL12-NEXT: s_mov_b32 exec_lo, s3
+; GISEL12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_add_nc_u32 v3, 42, v0
+; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GISEL12-NEXT: v_mov_b32_e32 v0, v3
+; GISEL12-NEXT: s_not_b32 exec_lo, exec_lo
+; GISEL12-NEXT: v_mov_b32_e32 v0, v0
+; GISEL12-NEXT: s_not_b32 exec_lo, exec_lo
+; GISEL12-NEXT: v_mov_b32_e32 v1, v4
+; GISEL12-NEXT: s_not_b32 exec_lo, exec_lo
+; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GISEL12-NEXT: v_mov_b32_e32 v1, v1
+; GISEL12-NEXT: s_not_b32 exec_lo, exec_lo
+; GISEL12-NEXT: s_mov_b32 exec_lo, -1
+; GISEL12-NEXT: v_dual_mov_b32 v10, v0 :: v_dual_mov_b32 v11, v1
+; GISEL12-NEXT: s_mov_b32 exec_lo, s5
+; GISEL12-NEXT: s_setpc_b64 s[6:7]
+;
+; DAGISEL12-LABEL: basic:
+; DAGISEL12: ; %bb.0: ; %entry
+; DAGISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL12-NEXT: s_wait_expcnt 0x0
+; DAGISEL12-NEXT: s_wait_samplecnt 0x0
+; DAGISEL12-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL12-NEXT: s_wait_kmcnt 0x0
+; DAGISEL12-NEXT: s_or_saveexec_b32 s6, -1
+; DAGISEL12-NEXT: v_dual_mov_b32 v0, v13 :: v_dual_mov_b32 v1, v12
+; DAGISEL12-NEXT: s_mov_b32 exec_lo, s6
+; DAGISEL12-NEXT: s_mov_b32 s7, s4
+; DAGISEL12-NEXT: s_mov_b32 s6, s3
+; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; DAGISEL12-NEXT: v_mov_b32_e32 v2, v1
+; DAGISEL12-NEXT: s_not_b32 exec_lo, exec_lo
+; DAGISEL12-NEXT: v_mov_b32_e32 v2, 0x47
+; DAGISEL12-NEXT: s_not_b32 exec_lo, exec_lo
+; DAGISEL12-NEXT: s_or_saveexec_b32 s3, -1
+; DAGISEL12-NEXT: v_...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/105822