[llvm] 20d8398 - [AMDGPU] ISel & PEI for whole wave functions (#145858)

via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 21 01:39:13 PDT 2025


Author: Diana Picus
Date: 2025-07-21T10:39:09+02:00
New Revision: 20d8398825a799008ae508d8463dbb9b11df81e7

URL: https://github.com/llvm/llvm-project/commit/20d8398825a799008ae508d8463dbb9b11df81e7
DIFF: https://github.com/llvm/llvm-project/commit/20d8398825a799008ae508d8463dbb9b11df81e7.diff

LOG: [AMDGPU] ISel & PEI for whole wave functions (#145858)

Whole wave functions are functions that will run with a full EXEC mask.
They will not be invoked directly, but instead will be launched by way
of a new intrinsic, `llvm.amdgcn.call.whole.wave` (to be added in
a future patch). These functions are meant as an alternative to the
`llvm.amdgcn.init.whole.wave` or `llvm.amdgcn.strict.wwm` intrinsics.

Whole wave functions will set EXEC to -1 in the prologue and restore the
original value of EXEC in the epilogue. They must have a special first
argument, `i1 %active`, which will be mapped to EXEC. They may have
either the default calling convention or amdgpu_gfx. The inactive lanes
need to be preserved for all registers used; the active lanes need to be
preserved only for the CSRs.

At the IR level, arguments to a whole wave function (other than
`%active`) contain poison in their inactive lanes. Likewise, the return
value for the inactive lanes is poison.
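
As a sketch, a minimal whole wave function (adapted from the tests added
in this patch) looks like this at the IR level; the selects give the
originally inactive lanes well-defined values before a whole-wave
operation is used:

  ; Inactive lanes of %a and %b are poison on entry; %active is the
  ; original EXEC mask as a per-lane i1.
  define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) {
    %x = select i1 %active, i32 %a, i32 5
    %y = select i1 %active, i32 %b, i32 3
    %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false)
    ret i32 %ret
  }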

This patch contains the following work:
* 2 new pseudos, SI_WHOLE_WAVE_FUNC_SETUP and SI_WHOLE_WAVE_FUNC_RETURN,
  used for managing the EXEC mask. SI_WHOLE_WAVE_FUNC_SETUP will return
  an SReg_1 representing `%active`, which needs to be passed into
  SI_WHOLE_WAVE_FUNC_RETURN (see the MIR sketch after this list).
* SelectionDAG support for generating these 2 new pseudos and the
  special handling of %active. Since the return may be in a different
  basic block, it's difficult to add the virtual reg for %active to
  SI_WHOLE_WAVE_FUNC_RETURN, so we initially generate an IMPLICIT_DEF
  which is later replaced via a custom inserter.
* Expansion of the 2 pseudos during prolog/epilog insertion. PEI also
  marks any used VGPRs as WWM registers, which are then spilled and
  restored with the usual logic.
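
As a sketch of the resulting MIR, here is a condensed wave32 version of
the GlobalISel output checked in isel-whole-wave-functions.ll below
(register names are illustrative); the value produced by the setup
pseudo is threaded into the return pseudo:

  %active:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
  ; ... body runs with EXEC = -1; %active selects the originally active lanes ...
  $vgpr0 = COPY %result
  SI_WHOLE_WAVE_FUNC_RETURN %active, implicit $vgpr0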

Future patches will include the `llvm.amdgcn.call.whole.wave` intrinsic
and a lot of optimization work (especially to reduce spills around
function calls).

---------

Co-authored-by: Matt Arsenault <Matthew.Arsenault at amd.com>
Co-authored-by: Shilei Tian <i at tianshilei.me>

Added: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir
    llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll
    llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll
    llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir
    llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll

Modified: 
    llvm/docs/AMDGPUUsage.rst
    llvm/include/llvm/AsmParser/LLToken.h
    llvm/include/llvm/IR/CallingConv.h
    llvm/lib/AsmParser/LLLexer.cpp
    llvm/lib/AsmParser/LLParser.cpp
    llvm/lib/IR/AsmWriter.cpp
    llvm/lib/IR/Function.cpp
    llvm/lib/IR/Verifier.cpp
    llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
    llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
    llvm/lib/Target/AMDGPU/AMDGPUGISel.td
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
    llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
    llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
    llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
    llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.h
    llvm/lib/Target/AMDGPU/SIInstructions.td
    llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
    llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
    llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
    llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
    llvm/test/Bitcode/compatibility.ll
    llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
    llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
    llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
    llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
    llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
    llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
    llvm/test/Verifier/amdgpu-cc.ll

Removed: 
    


################################################################################
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index c5b9bd9de66e1..19357635ecfc1 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1844,6 +1844,20 @@ The AMDGPU backend supports the following calling conventions:
                                      ..TODO::
                                      Describe.
 
+     ``amdgpu_gfx_whole_wave``       Used for AMD graphics targets. Functions with this calling convention
+                                     cannot be used as entry points. They must have an i1 as the first argument,
+                                     which will be mapped to the value of EXEC on entry into the function. Other
+                                     arguments will contain poison in their inactive lanes. Similarly, the return
+                                     value for the inactive lanes is poison.
+
+                                     The function will run with all lanes enabled, i.e. EXEC will be set to -1 in the
+                                     prologue and restored to its original value in the epilogue. The inactive lanes
+                                     will be preserved for all the registers used by the function. Active lanes
+                                     will only be preserved for the callee-saved registers.
+
+                                     In all other respects, functions with this calling convention behave like
+                                     ``amdgpu_gfx`` functions.
+
      ``amdgpu_gs``                   Used for Mesa/AMDPAL geometry shaders.
                                      ..TODO::
                                      Describe.

diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h
index c7e4bdf3ff811..a2311d2ac285d 100644
--- a/llvm/include/llvm/AsmParser/LLToken.h
+++ b/llvm/include/llvm/AsmParser/LLToken.h
@@ -181,6 +181,7 @@ enum Kind {
   kw_amdgpu_cs_chain_preserve,
   kw_amdgpu_kernel,
   kw_amdgpu_gfx,
+  kw_amdgpu_gfx_whole_wave,
   kw_tailcc,
   kw_m68k_rtdcc,
   kw_graalcc,

diff --git a/llvm/include/llvm/IR/CallingConv.h b/llvm/include/llvm/IR/CallingConv.h
index d68491eb5535c..ef761eb1aed73 100644
--- a/llvm/include/llvm/IR/CallingConv.h
+++ b/llvm/include/llvm/IR/CallingConv.h
@@ -284,6 +284,9 @@ namespace CallingConv {
     RISCV_VLSCall_32768 = 122,
     RISCV_VLSCall_65536 = 123,
 
+    // Calling convention for AMDGPU whole wave functions.
+    AMDGPU_Gfx_WholeWave = 124,
+
     /// The highest possible ID. Must be some 2^k - 1.
     MaxID = 1023
   };
@@ -294,8 +297,13 @@ namespace CallingConv {
 /// directly or indirectly via a call-like instruction.
 constexpr bool isCallableCC(CallingConv::ID CC) {
   switch (CC) {
+  // Called with special intrinsics:
+  // llvm.amdgcn.cs.chain
   case CallingConv::AMDGPU_CS_Chain:
   case CallingConv::AMDGPU_CS_ChainPreserve:
+  // llvm.amdgcn.call.whole.wave
+  case CallingConv::AMDGPU_Gfx_WholeWave:
+  // Hardware entry points:
   case CallingConv::AMDGPU_CS:
   case CallingConv::AMDGPU_ES:
   case CallingConv::AMDGPU_GS:

diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index ce813e1d7b1c4..520c6a00a9c07 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -679,6 +679,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(amdgpu_cs_chain_preserve);
   KEYWORD(amdgpu_kernel);
   KEYWORD(amdgpu_gfx);
+  KEYWORD(amdgpu_gfx_whole_wave);
   KEYWORD(tailcc);
   KEYWORD(m68k_rtdcc);
   KEYWORD(graalcc);

diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index b7f6950f679ef..00277757c0955 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -2272,6 +2272,9 @@ bool LLParser::parseOptionalCallingConv(unsigned &CC) {
     CC = CallingConv::AMDGPU_CS_ChainPreserve;
     break;
   case lltok::kw_amdgpu_kernel:  CC = CallingConv::AMDGPU_KERNEL; break;
+  case lltok::kw_amdgpu_gfx_whole_wave:
+    CC = CallingConv::AMDGPU_Gfx_WholeWave;
+    break;
   case lltok::kw_tailcc:         CC = CallingConv::Tail; break;
   case lltok::kw_m68k_rtdcc:     CC = CallingConv::M68k_RTD; break;
   case lltok::kw_graalcc:        CC = CallingConv::GRAAL; break;

diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 145ef10f28f35..3e40915b6a920 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -404,6 +404,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
     break;
   case CallingConv::AMDGPU_KERNEL: Out << "amdgpu_kernel"; break;
   case CallingConv::AMDGPU_Gfx:    Out << "amdgpu_gfx"; break;
+  case CallingConv::AMDGPU_Gfx_WholeWave:
+    Out << "amdgpu_gfx_whole_wave";
+    break;
   case CallingConv::M68k_RTD:      Out << "m68k_rtdcc"; break;
   case CallingConv::RISCV_VectorCall:
     Out << "riscv_vector_cc";

diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 7a03663e129dc..fc067459dcba3 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -1232,6 +1232,7 @@ bool llvm::CallingConv::supportsNonVoidReturnType(CallingConv::ID CC) {
   case CallingConv::AArch64_SVE_VectorCall:
   case CallingConv::WASM_EmscriptenInvoke:
   case CallingConv::AMDGPU_Gfx:
+  case CallingConv::AMDGPU_Gfx_WholeWave:
   case CallingConv::M68k_INTR:
   case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
   case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:

diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 9bd573e773610..e7b491e76724e 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -2979,6 +2979,16 @@ void Verifier::visitFunction(const Function &F) {
           "perfect forwarding!",
           &F);
     break;
+  case CallingConv::AMDGPU_Gfx_WholeWave:
+    Check(!F.arg_empty() && F.arg_begin()->getType()->isIntegerTy(1),
+          "Calling convention requires first argument to be i1", &F);
+    Check(!F.arg_begin()->hasInRegAttr(),
+          "Calling convention requires first argument to not be inreg", &F);
+    Check(!F.isVarArg(),
+          "Calling convention does not support varargs or "
+          "perfect forwarding!",
+          &F);
+    break;
   }
 
   // Check that the argument values match the function type for this function...

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 14101e57f5143..3d8d274f06246 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -374,8 +374,10 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
     return true;
   }
 
-  unsigned ReturnOpc =
-      IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::SI_RETURN;
+  const bool IsWholeWave = MFI->isWholeWaveFunction();
+  unsigned ReturnOpc = IsWholeWave ? AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN
+                       : IsShader  ? AMDGPU::SI_RETURN_TO_EPILOG
+                                   : AMDGPU::SI_RETURN;
   auto Ret = B.buildInstrNoInsert(ReturnOpc);
 
   if (!FLI.CanLowerReturn)
@@ -383,6 +385,9 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
   else if (!lowerReturnVal(B, Val, VRegs, Ret))
     return false;
 
+  if (IsWholeWave)
+    addOriginalExecToReturn(B.getMF(), Ret);
+
   // TODO: Handle CalleeSavedRegsViaCopy.
 
   B.insertInstr(Ret);
@@ -632,6 +637,17 @@ bool AMDGPUCallLowering::lowerFormalArguments(
     if (DL.getTypeStoreSize(Arg.getType()) == 0)
       continue;
 
+    if (Info->isWholeWaveFunction() && Idx == 0) {
+      assert(VRegs[Idx].size() == 1 && "Expected only one register");
+
+      // The first argument for whole wave functions is the original EXEC value.
+      B.buildInstr(AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
+          .addDef(VRegs[Idx][0]);
+
+      ++Idx;
+      continue;
+    }
+
     const bool InReg = Arg.hasAttribute(Attribute::InReg);
 
     if (Arg.hasAttribute(Attribute::SwiftSelf) ||
@@ -1347,6 +1363,7 @@ bool AMDGPUCallLowering::lowerTailCall(
   SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
 
   if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
+      Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave &&
       !AMDGPU::isChainCC(Info.CallConv)) {
     // With a fixed ABI, allocate fixed registers before user arguments.
     if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
@@ -1524,7 +1541,8 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
   // after the ordinary user argument registers.
   SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
 
-  if (Info.CallConv != CallingConv::AMDGPU_Gfx) {
+  if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
+      Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
     // With a fixed ABI, allocate fixed registers before user arguments.
     if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
       return false;
@@ -1592,3 +1610,11 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
 
   return true;
 }
+
+void AMDGPUCallLowering::addOriginalExecToReturn(
+    MachineFunction &MF, MachineInstrBuilder &Ret) const {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const MachineInstr *Setup = TII->getWholeWaveFunctionSetup(MF);
+  Ret.addReg(Setup->getOperand(0).getReg());
+}

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
index a6e801f2a547b..e0033d59d10bb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -37,6 +37,9 @@ class AMDGPUCallLowering final : public CallLowering {
   bool lowerReturnVal(MachineIRBuilder &B, const Value *Val,
                       ArrayRef<Register> VRegs, MachineInstrBuilder &Ret) const;
 
+  void addOriginalExecToReturn(MachineFunction &MF,
+                               MachineInstrBuilder &Ret) const;
+
 public:
   AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
 

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 2bfd56f9f3554..891d362503f15 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -315,6 +315,10 @@ def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>;
 def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>;
 def : GINodeEquiv<G_AMDGPU_S_BUFFER_PREFETCH, SIsbuffer_prefetch>;
 
+def : GINodeEquiv<G_AMDGPU_WHOLE_WAVE_FUNC_SETUP, AMDGPUwhole_wave_setup>;
+// G_AMDGPU_WHOLE_WAVE_FUNC_RETURN is simpler than AMDGPUwhole_wave_return,
+// so we don't mark it as equivalent.
+
 class GISelSop2Pat <
   SDPatternOperator node,
   Instruction inst,

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 3d040fb705a8d..b037cdd5393ea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1143,6 +1143,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
   case CallingConv::Cold:
     return CC_AMDGPU_Func;
   case CallingConv::AMDGPU_Gfx:
+  case CallingConv::AMDGPU_Gfx_WholeWave:
     return CC_SI_Gfx;
   case CallingConv::AMDGPU_KERNEL:
   case CallingConv::SPIR_KERNEL:
@@ -1168,6 +1169,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
   case CallingConv::AMDGPU_LS:
     return RetCC_SI_Shader;
   case CallingConv::AMDGPU_Gfx:
+  case CallingConv::AMDGPU_Gfx_WholeWave:
     return RetCC_SI_Gfx;
   case CallingConv::C:
   case CallingConv::Fast:
@@ -5875,6 +5877,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
   NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
   NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
+  NODE_NAME_CASE(WHOLE_WAVE_SETUP)
+  NODE_NAME_CASE(WHOLE_WAVE_RETURN)
   }
   return nullptr;
 }

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 4e8c6c7ea3b27..39bb0adfc1a17 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -608,6 +608,12 @@ enum NodeType : unsigned {
   BUFFER_ATOMIC_FMAX,
   BUFFER_ATOMIC_COND_SUB_U32,
   LAST_MEMORY_OPCODE = BUFFER_ATOMIC_COND_SUB_U32,
+
+  // Set up a whole wave function.
+  WHOLE_WAVE_SETUP,
+
+  // Return from a whole wave function.
+  WHOLE_WAVE_RETURN,
 };
 
 } // End namespace AMDGPUISD

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index ce58e93a15207..e305f08925cc6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -348,6 +348,17 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",
 
 def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
 
+// Marks the entry into a whole wave function.
+def AMDGPUwhole_wave_setup : SDNode<
+  "AMDGPUISD::WHOLE_WAVE_SETUP", SDTypeProfile<1, 0, [SDTCisInt<0>]>,
+  [SDNPHasChain, SDNPSideEffect]>;
+
+// Marks the return from a whole wave function.
+def AMDGPUwhole_wave_return : SDNode<
+  "AMDGPUISD::WHOLE_WAVE_RETURN", SDTNone,
+  [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+>;
+
 // SI+ export
 def AMDGPUExportOp : SDTypeProfile<0, 8, [
   SDTCisInt<0>,       // i8 tgt

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index d161c035ac295..8975486caa770 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4160,6 +4160,10 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
     return true;
   case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
     return selectWaveAddress(I);
+  case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
+    I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
+    return true;
+  }
   case AMDGPU::G_STACKRESTORE:
     return selectStackRestore(I);
   case AMDGPU::G_PHI:

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index bf2f37bddb9ed..b54cccead9781 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5540,6 +5540,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case AMDGPU::G_PREFETCH:
     OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
     break;
+  case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP:
+  case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN:
+    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
+    break;
   }
 
   return getInstructionMapping(/*ID*/1, /*Cost*/1,

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index bbed828b4fed3..c4a3be44fc72d 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -3206,7 +3206,7 @@ bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
   // Check entry priority at each export (as there will only be a few).
   // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
   bool Changed = false;
-  if (CC != CallingConv::AMDGPU_Gfx)
+  if (CC != CallingConv::AMDGPU_Gfx && CC != CallingConv::AMDGPU_Gfx_WholeWave)
     Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
 
   auto NextMI = std::next(It);

diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 6a3867937d57f..11552b3a9a438 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -946,8 +946,18 @@ static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
 
   initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
 
-  ScratchExecCopy = findScratchNonCalleeSaveRegister(
-      MRI, LiveUnits, *TRI.getWaveMaskRegClass());
+  if (FuncInfo->isWholeWaveFunction()) {
+    // Whole wave functions already have a copy of the original EXEC mask that
+    // we can use.
+    assert(IsProlog && "Epilog should look at return, not setup");
+    ScratchExecCopy =
+        TII->getWholeWaveFunctionSetup(MF)->getOperand(0).getReg();
+    assert(ScratchExecCopy && "Couldn't find copy of EXEC");
+  } else {
+    ScratchExecCopy = findScratchNonCalleeSaveRegister(
+        MRI, LiveUnits, *TRI.getWaveMaskRegClass());
+  }
+
   if (!ScratchExecCopy)
     report_fatal_error("failed to find free scratch register");
 
@@ -996,10 +1006,15 @@ void SIFrameLowering::emitCSRSpillStores(
       };
 
   StoreWWMRegisters(WWMScratchRegs);
+
+  auto EnableAllLanes = [&]() {
+    unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
+  };
+
   if (!WWMCalleeSavedRegs.empty()) {
     if (ScratchExecCopy) {
-      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
+      EnableAllLanes();
     } else {
       ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                                              /*IsProlog*/ true,
@@ -1008,7 +1023,18 @@ void SIFrameLowering::emitCSRSpillStores(
   }
 
   StoreWWMRegisters(WWMCalleeSavedRegs);
-  if (ScratchExecCopy) {
+  if (FuncInfo->isWholeWaveFunction()) {
+    // SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose, so we can remove
+    // it now. If we have already saved some WWM CSR registers, then the EXEC is
+    // already -1 and we don't need to do anything else. Otherwise, set EXEC to
+    // -1 here.
+    if (!ScratchExecCopy)
+      buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true,
+                           /*EnableInactiveLanes*/ true);
+    else if (WWMCalleeSavedRegs.empty())
+      EnableAllLanes();
+    TII->getWholeWaveFunctionSetup(MF)->eraseFromParent();
+  } else if (ScratchExecCopy) {
     // FIXME: Split block and make terminator.
     unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
@@ -1083,11 +1109,6 @@ void SIFrameLowering::emitCSRSpillRestores(
   Register ScratchExecCopy;
   SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
   FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
-  if (!WWMScratchRegs.empty())
-    ScratchExecCopy =
-        buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
-                             /*IsProlog*/ false, /*EnableInactiveLanes*/ true);
-
   auto RestoreWWMRegisters =
       [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
         for (const auto &Reg : WWMRegs) {
@@ -1098,6 +1119,36 @@ void SIFrameLowering::emitCSRSpillRestores(
         }
       };
 
+  if (FuncInfo->isWholeWaveFunction()) {
+    // For whole wave functions, the EXEC is already -1 at this point.
+    // Therefore, we can restore the CSR WWM registers right away.
+    RestoreWWMRegisters(WWMCalleeSavedRegs);
+
+    // The original EXEC is the first operand of the return instruction.
+    const MachineInstr &Return = MBB.instr_back();
+    assert(Return.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN &&
+           "Unexpected return inst");
+    Register OrigExec = Return.getOperand(0).getReg();
+
+    if (!WWMScratchRegs.empty()) {
+      unsigned XorOpc = ST.isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;
+      BuildMI(MBB, MBBI, DL, TII->get(XorOpc), TRI.getExec())
+          .addReg(OrigExec)
+          .addImm(-1);
+      RestoreWWMRegisters(WWMScratchRegs);
+    }
+
+    // Restore original EXEC.
+    unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addReg(OrigExec);
+    return;
+  }
+
+  if (!WWMScratchRegs.empty()) {
+    ScratchExecCopy =
+        buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
+                             /*IsProlog=*/false, /*EnableInactiveLanes=*/true);
+  }
   RestoreWWMRegisters(WWMScratchRegs);
   if (!WWMCalleeSavedRegs.empty()) {
     if (ScratchExecCopy) {
@@ -1634,6 +1685,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
         NeedExecCopyReservedReg = true;
       else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
                MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
+               MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
                (MFI->isChainFunction() &&
                 TII->isChainCallOpcode(MI.getOpcode()))) {
         // We expect all return to be the same size.
@@ -1662,6 +1714,21 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
   if (MFI->isEntryFunction())
     return;
 
+  if (MFI->isWholeWaveFunction()) {
+    // In practice, all the VGPRs are WWM registers, and we will need to save at
+    // least their inactive lanes. Add them to WWMReservedRegs.
+    assert(!NeedExecCopyReservedReg &&
+           "Whole wave functions can use the reg mapped for their i1 argument");
+
+    // FIXME: Be more efficient!
+    for (MCRegister Reg : AMDGPU::VGPR_32RegClass)
+      if (MF.getRegInfo().isPhysRegModified(Reg)) {
+        MFI->reserveWWMRegister(Reg);
+        MF.begin()->addLiveIn(Reg);
+      }
+    MF.begin()->sortUniqueLiveIns();
+  }
+
   // Remove any VGPRs used in the return value because these do not need to be saved.
   // This prevents CSR restore from clobbering return VGPRs.
   if (ReturnMI) {

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0c76ff2ec5ea7..d4e3fa71ada85 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2260,7 +2260,8 @@ SDValue SITargetLowering::getPreloadedValue(
   const ArgDescriptor WorkGroupIDZ =
       ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
   if (Subtarget->hasArchitectedSGPRs() &&
-      (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
+      (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx ||
+       CC == CallingConv::AMDGPU_Gfx_WholeWave)) {
     switch (PVID) {
     case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
       Reg = &WorkGroupIDX;
@@ -2942,12 +2943,15 @@ SDValue SITargetLowering::LowerFormalArguments(
     if (!Subtarget->enableFlatScratch())
       assert(!UserSGPRInfo.hasFlatScratchInit());
     if ((CallConv != CallingConv::AMDGPU_CS &&
-         CallConv != CallingConv::AMDGPU_Gfx) ||
+         CallConv != CallingConv::AMDGPU_Gfx &&
+         CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
         !Subtarget->hasArchitectedSGPRs())
       assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
              !Info->hasWorkGroupIDZ());
   }
 
+  bool IsWholeWaveFunc = Info->isWholeWaveFunction();
+
   if (CallConv == CallingConv::AMDGPU_PS) {
     processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
 
@@ -2988,7 +2992,8 @@ SDValue SITargetLowering::LowerFormalArguments(
   } else if (IsKernel) {
     assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
   } else {
-    Splits.append(Ins.begin(), Ins.end());
+    Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
+                  Ins.end());
   }
 
   if (IsKernel)
@@ -3019,6 +3024,13 @@ SDValue SITargetLowering::LowerFormalArguments(
 
   SmallVector<SDValue, 16> Chains;
 
+  if (IsWholeWaveFunc) {
+    SDValue Setup = DAG.getNode(AMDGPUISD::WHOLE_WAVE_SETUP, DL,
+                                {MVT::i1, MVT::Other}, Chain);
+    InVals.push_back(Setup.getValue(0));
+    Chains.push_back(Setup.getValue(1));
+  }
+
   // FIXME: This is the minimum kernel argument alignment. We should improve
   // this to the maximum alignment of the arguments.
   //
@@ -3026,7 +3038,8 @@ SDValue SITargetLowering::LowerFormalArguments(
   // kern arg offset.
   const Align KernelArgBaseAlign = Align(16);
 
-  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+  for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
+       ++i) {
     const ISD::InputArg &Arg = Ins[i];
     if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
       InVals.push_back(DAG.getPOISON(Arg.VT));
@@ -3374,7 +3387,9 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
 
   unsigned Opc = AMDGPUISD::ENDPGM;
   if (!IsWaveEnd)
-    Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE;
+    Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
+          : IsShader                  ? AMDGPUISD::RETURN_TO_EPILOG
+                                      : AMDGPUISD::RET_GLUE;
   return DAG.getNode(Opc, DL, MVT::Other, RetOps);
 }
 
@@ -3876,7 +3891,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
   CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
 
-  if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
+  if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
+      CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
     // With a fixed ABI, allocate fixed registers before user arguments.
     passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
   }
@@ -5890,6 +5906,18 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     MI.eraseFromParent();
     return SplitBB;
   }
+  case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
+    assert(MFI->isWholeWaveFunction());
+
+    // During ISel, it's difficult to propagate the original EXEC mask to use as
+    // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
+    MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
+    assert(Setup && "Couldn't find SI_WHOLE_WAVE_FUNC_SETUP");
+    Register OriginalExec = Setup->getOperand(0).getReg();
+    MF->getRegInfo().clearKillFlags(OriginalExec);
+    MI.getOperand(0).setReg(OriginalExec);
+    return BB;
+  }
   default:
     if (TII->isImage(MI) || TII->isMUBUF(MI)) {
       if (!MI.mayStore())

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 2af0a575a8885..9faf4974e3fd6 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1812,6 +1812,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
   //   with knowledge of the called routines.
   if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
       MI.getOpcode() == AMDGPU::SI_RETURN ||
+      MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
       MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
       (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
     Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c8935f0cb6034..e2a2525d909bd 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2472,6 +2472,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
     break;
   }
+  case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
   case AMDGPU::SI_RETURN: {
     const MachineFunction *MF = MBB.getParent();
     const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
@@ -5757,6 +5758,19 @@ void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
     Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
 }
 
+MachineInstr *
+SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const {
+  assert(MF.getInfo<SIMachineFunctionInfo>()->isWholeWaveFunction() &&
+         "Not a whole wave func");
+  MachineBasicBlock &MBB = *MF.begin();
+  for (MachineInstr &MI : MBB)
+    if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
+        MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
+      return &MI;
+
+  llvm_unreachable("Couldn't find SI_WHOLE_WAVE_FUNC_SETUP instruction");
+}
+
 static const TargetRegisterClass *
 adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
                           const MachineRegisterInfo &MRI,

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 5e92921f3ea21..800ea9ab50b85 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1215,6 +1215,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
                    MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
                    Register Reg, SlotIndexes *Indexes = nullptr) const;
 
+  MachineInstr *getWholeWaveFunctionSetup(MachineFunction &MF) const;
+
   /// Return the correct register class for \p OpNo.  For target-specific
   /// instructions, this will return the register class that has been defined
   /// in tablegen.  For generic instructions, like REG_SEQUENCE it will return

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 991d9f83e92e4..2230a431a0f26 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -644,6 +644,32 @@ def SI_INIT_WHOLE_WAVE : SPseudoInstSI <
   let isConvergent = 1;
 }
 
+// Sets EXEC to all lanes and returns the previous EXEC.
+def SI_WHOLE_WAVE_FUNC_SETUP : SPseudoInstSI <
+  (outs SReg_1:$dst), (ins), [(set i1:$dst, (AMDGPUwhole_wave_setup))]> {
+  let Defs = [EXEC];
+  let Uses = [EXEC];
+
+  let isConvergent = 1;
+}
+
+// Restores the previous EXEC and otherwise behaves entirely like a SI_RETURN.
+def SI_WHOLE_WAVE_FUNC_RETURN : SPseudoInstSI <
+  (outs), (ins SReg_1:$orig_exec)> {
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let isReturn = 1;
+  let SchedRW = [WriteBranch];
+
+  // We're going to use custom handling to set the $orig_exec to the correct value.
+  let usesCustomInserter = 1;
+}
+
+// Generate a SI_WHOLE_WAVE_FUNC_RETURN pseudo with a placeholder for its
+// argument. It will be filled in by the custom inserter.
+def : GCNPat<
+  (AMDGPUwhole_wave_return), (SI_WHOLE_WAVE_FUNC_RETURN (i1 (IMPLICIT_DEF)))>;
+
 // Return for returning shaders to a shader variant epilog.
 def SI_RETURN_TO_EPILOG : SPseudoInstSI <
   (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
@@ -4300,6 +4326,20 @@ def G_AMDGPU_S_MUL_I64_I32 : AMDGPUGenericInstruction {
   let hasSideEffects = 0;
 }
 
+def G_AMDGPU_WHOLE_WAVE_FUNC_SETUP : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$origExec);
+  let InOperandList = (ins);
+  let isConvergent = 1;
+}
+
+def G_AMDGPU_WHOLE_WAVE_FUNC_RETURN : AMDGPUGenericInstruction {
+  let OutOperandList = (outs);
+  let InOperandList = (ins type0:$origExec);
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let isReturn = 1;
+}
+
 // This is equivalent to the G_INTRINSIC*, but the operands may have
 // been legalized depending on the subtarget requirements.
 def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction {

diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 8c2e9b620ad16..f0be204cd9bdb 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -51,7 +51,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
       WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
       PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
       WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
-      GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) {
+      GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0),
+      IsWholeWaveFunction(F.getCallingConv() ==
+                          CallingConv::AMDGPU_Gfx_WholeWave) {
   const GCNSubtarget &ST = *STI;
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
   WavesPerEU = ST.getWavesPerEU(F);
@@ -99,7 +101,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
 
     ImplicitArgPtr = false;
   } else if (!isEntryFunction()) {
-    if (CC != CallingConv::AMDGPU_Gfx)
+    if (CC != CallingConv::AMDGPU_Gfx &&
+        CC != CallingConv::AMDGPU_Gfx_WholeWave)
       ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
 
     FrameOffsetReg = AMDGPU::SGPR33;
@@ -732,6 +735,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
       PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
       MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
       Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
+      IsWholeWaveFunction(MFI.isWholeWaveFunction()),
       DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()),
       ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) {
   for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
@@ -778,6 +782,7 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
   HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
   BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
   ReturnsVoid = YamlMFI.ReturnsVoid;
+  IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;
 
   if (YamlMFI.ScavengeFI) {
     auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo());

diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 274a60adb8d07..08b0206d244fb 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -298,6 +298,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
   StringValue LongBranchReservedReg;
 
   bool HasInitWholeWave = false;
+  bool IsWholeWaveFunction = false;
 
   unsigned DynamicVGPRBlockSize = 0;
   unsigned ScratchReservedForDynamicVGPRs = 0;
@@ -356,6 +357,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
     YamlIO.mapOptional("dynamicVGPRBlockSize", MFI.DynamicVGPRBlockSize, false);
     YamlIO.mapOptional("scratchReservedForDynamicVGPRs",
                        MFI.ScratchReservedForDynamicVGPRs, 0);
+    YamlIO.mapOptional("isWholeWaveFunction", MFI.IsWholeWaveFunction, false);
   }
 };
 
@@ -565,6 +567,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   // the serialization easier.
   ReservedRegSet WWMReservedRegs;
 
+  bool IsWholeWaveFunction = false;
+
   using PrologEpilogSGPRSpill =
       std::pair<Register, PrologEpilogSGPRSaveRestoreInfo>;
   // To track the SGPR spill method used for a CSR SGPR register during
@@ -670,6 +674,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
     return WWMReservedRegs.contains(Reg);
   }
 
+  bool isWholeWaveFunction() const { return IsWholeWaveFunction; }
+
   ArrayRef<PrologEpilogSGPRSpill> getPrologEpilogSGPRSpills() const {
     assert(is_sorted(PrologEpilogSGPRSpills, llvm::less_first()));
     return PrologEpilogSGPRSpills;

diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index fa2b8db6ba55a..84cfa878276fd 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -407,6 +407,7 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
     return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
                                : CSR_AMDGPU_SaveList;
   case CallingConv::AMDGPU_Gfx:
+  case CallingConv::AMDGPU_Gfx_WholeWave:
     return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
                                : CSR_AMDGPU_SI_Gfx_SaveList;
   case CallingConv::AMDGPU_CS_ChainPreserve:
@@ -433,6 +434,7 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
     return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
                                : CSR_AMDGPU_RegMask;
   case CallingConv::AMDGPU_Gfx:
+  case CallingConv::AMDGPU_Gfx_WholeWave:
     return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
                                : CSR_AMDGPU_SI_Gfx_RegMask;
   case CallingConv::AMDGPU_CS_Chain:

diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index c9d2c286bf237..2d344f41ff790 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1423,7 +1423,8 @@ constexpr bool isShader(CallingConv::ID CC) {
 
 LLVM_READNONE
 constexpr bool isGraphics(CallingConv::ID CC) {
-  return isShader(CC) || CC == CallingConv::AMDGPU_Gfx;
+  return isShader(CC) || CC == CallingConv::AMDGPU_Gfx ||
+         CC == CallingConv::AMDGPU_Gfx_WholeWave;
 }
 
 LLVM_READNONE

diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index e464470143e52..fd6253daa327a 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -44,6 +44,7 @@ static const char *getStageName(CallingConv::ID CC) {
   case CallingConv::AMDGPU_LS:
     return ".ls";
   case CallingConv::AMDGPU_Gfx:
+  case CallingConv::AMDGPU_Gfx_WholeWave:
     llvm_unreachable("Callable shader has no hardware stage");
   default:
     return ".cs";

diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll
index 9cf3fdbe550b4..0b5ce08c00a23 100644
--- a/llvm/test/Bitcode/compatibility.ll
+++ b/llvm/test/Bitcode/compatibility.ll
@@ -564,6 +564,10 @@ declare riscv_vls_cc(32768) void @riscv_vls_cc_32768()
 ; CHECK: declare riscv_vls_cc(32768) void @riscv_vls_cc_32768()
 declare riscv_vls_cc(65536) void @riscv_vls_cc_65536()
 ; CHECK: declare riscv_vls_cc(65536) void @riscv_vls_cc_65536()
+declare cc124 void @f.cc124(i1)
+; CHECK: declare amdgpu_gfx_whole_wave void @f.cc124(i1)
+declare amdgpu_gfx_whole_wave void @f.amdgpu_gfx_whole_wave(i1)
+; CHECK: declare amdgpu_gfx_whole_wave void @f.amdgpu_gfx_whole_wave(i1)
 declare cc1023 void @f.cc1023()
 ; CHECK: declare cc1023 void @f.cc1023()
 

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir
new file mode 100644
index 0000000000000..beca901945753
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir
@@ -0,0 +1,40 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+---
+name:            basic_test
+legalized:       true
+machineFunctionInfo:
+  isWholeWaveFunction: true
+body:             |
+  bb.1:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: basic_test
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:vcc(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+    ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY]], [[COPY2]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
+    ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY1]], [[COPY3]]
+    ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), [[SELECT]](s32), [[SELECT1]](s32), 1, 1, 1, 0
+    ; CHECK-NEXT: $vgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+    ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0
+    %1:_(s32) = COPY $vgpr0
+    %2:_(s32) = COPY $vgpr1
+    %0:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP
+    %12:_(s32) = G_CONSTANT i32 5
+    %11:_(s32) = G_SELECT %0(s1), %1, %12
+    %14:_(s32) = G_CONSTANT i32 3
+    %13:_(s32) = G_SELECT %0(s1), %2, %14
+    %15:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), %11(s32), %13(s32), 1, 1, 1, 0
+    $vgpr0 = COPY %15(s32)
+    G_AMDGPU_WHOLE_WAVE_FUNC_RETURN %0(s1), implicit $vgpr0
+
+...

diff --git a/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll
new file mode 100644
index 0000000000000..b68786b579dd2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll
@@ -0,0 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -stop-after=irtranslator -verify-machineinstrs < %s | FileCheck %s
+
+define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) {
+  ; CHECK-LABEL: name: basic_test
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK-NEXT:   [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+  ; CHECK-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY]], [[C]]
+  ; CHECK-NEXT:   [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY1]], [[C1]]
+  ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), [[SELECT]](s32), [[SELECT1]](s32), 1, 1, 1, 0
+  ; CHECK-NEXT:   $vgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+  ; CHECK-NEXT:   G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0
+  %x = select i1 %active, i32 %a, i32 5
+  %y = select i1 %active, i32 %b, i32 3
+  %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false)
+  ret i32 %ret
+}
+
+; Make sure we don't crash if %active is not used at all.
+define amdgpu_gfx_whole_wave i32 @unused_active(i1 %active, i32 %a, i32 %b) {
+  ; CHECK-LABEL: name: unused_active
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK-NEXT:   [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 14
+  ; CHECK-NEXT:   $vgpr0 = COPY [[C]](s32)
+  ; CHECK-NEXT:   G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0
+  ret i32 14
+}
+
+define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) {
+  ; CHECK-LABEL: name: multiple_blocks
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK-NEXT:   [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP
+  ; CHECK-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
+  ; CHECK-NEXT:   [[INT:%[0-9]+]]:_(s1), [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), [[ICMP]](s1)
+  ; CHECK-NEXT:   G_BRCOND [[INT]](s1), %bb.2
+  ; CHECK-NEXT:   G_BR %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.if.then:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.if.end:
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI [[COPY1]](s32), %bb.1, [[ADD]](s32), %bb.2
+  ; CHECK-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT1]](s32)
+  ; CHECK-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY]], [[PHI]]
+  ; CHECK-NEXT:   $vgpr0 = COPY [[SELECT]](s32)
+  ; CHECK-NEXT:   G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0
+  %c = icmp eq i32 %a, %b
+  br i1 %c, label %if.then, label %if.end
+
+if.then:                                          ; preds = %0
+  %d = add i32 %a, %b
+  br label %if.end
+
+if.end:
+  %f = phi i32 [ %d, %if.then ], [ %b, %0 ]
+  %e = select i1 %active, i32 %a, i32 %f
+  ret i32 %e
+}
+
+define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
+  ; CHECK-LABEL: name: ret_64
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; CHECK-NEXT:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK-NEXT:   [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+  ; CHECK-NEXT:   [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[MV]], [[C]]
+  ; CHECK-NEXT:   [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[MV1]], [[C1]]
+  ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), [[SELECT]](s64), [[SELECT1]](s64), 1, 1, 1, 0
+  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INTRINSIC_CONVERGENT]](s64)
+  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK-NEXT:   G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0, implicit $vgpr1
+  %x = select i1 %active, i64 %a, i64 5
+  %y = select i1 %active, i64 %b, i64 3
+  %ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false)
+  ret i64 %ret
+}

diff --git a/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll
new file mode 100644
index 0000000000000..3450d63ff7b4a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll
@@ -0,0 +1,191 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -stop-after=finalize-isel < %s | FileCheck --check-prefix=DAGISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -stop-after=finalize-isel < %s | FileCheck --check-prefix=GISEL %s
+
+define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) {
+  ; DAGISEL-LABEL: name: basic_test
+  ; DAGISEL: bb.0 (%ir-block.0):
+  ; DAGISEL-NEXT:   liveins: $vgpr0, $vgpr1
+  ; DAGISEL-NEXT: {{  $}}
+  ; DAGISEL-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; DAGISEL-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; DAGISEL-NEXT:   [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+  ; DAGISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5
+  ; DAGISEL-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_]], 0, [[COPY1]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+  ; DAGISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 3
+  ; DAGISEL-NEXT:   [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_1]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+  ; DAGISEL-NEXT:   [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], killed [[V_CNDMASK_B32_e64_1]], 1, 1, 1, 0, implicit $exec
+  ; DAGISEL-NEXT:   $vgpr0 = COPY [[V_MOV_B32_dpp]]
+  ; DAGISEL-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; DAGISEL-NEXT:   SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0
+  ;
+  ; GISEL-LABEL: name: basic_test
+  ; GISEL: bb.1 (%ir-block.0):
+  ; GISEL-NEXT:   liveins: $vgpr0, $vgpr1
+  ; GISEL-NEXT: {{  $}}
+  ; GISEL-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GISEL-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GISEL-NEXT:   [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+  ; GISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5
+  ; GISEL-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; GISEL-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY2]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+  ; GISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 3
+  ; GISEL-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; GISEL-NEXT:   [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY3]], 0, [[COPY1]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+  ; GISEL-NEXT:   [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], [[V_CNDMASK_B32_e64_1]], 1, 1, 1, 0, implicit $exec
+  ; GISEL-NEXT:   $vgpr0 = COPY [[V_MOV_B32_dpp]]
+  ; GISEL-NEXT:   SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0
+  %x = select i1 %active, i32 %a, i32 5
+  %y = select i1 %active, i32 %b, i32 3
+  %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false)
+  ret i32 %ret
+}
+
+; Make sure we don't crash if %active is not used at all.
+define amdgpu_gfx_whole_wave i32 @unused_active(i1 %active, i32 %a, i32 %b) {
+  ; DAGISEL-LABEL: name: unused_active
+  ; DAGISEL: bb.0 (%ir-block.0):
+  ; DAGISEL-NEXT:   [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+  ; DAGISEL-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 14, implicit $exec
+  ; DAGISEL-NEXT:   $vgpr0 = COPY [[V_MOV_B32_e32_]]
+  ; DAGISEL-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; DAGISEL-NEXT:   SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0
+  ;
+  ; GISEL-LABEL: name: unused_active
+  ; GISEL: bb.1 (%ir-block.0):
+  ; GISEL-NEXT:   liveins: $vgpr0, $vgpr1
+  ; GISEL-NEXT: {{  $}}
+  ; GISEL-NEXT:   [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+  ; GISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 14
+  ; GISEL-NEXT:   $vgpr0 = COPY [[S_MOV_B32_]]
+  ; GISEL-NEXT:   SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0
+  ret i32 14
+}
+
+define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) {
+  ; DAGISEL-LABEL: name: multiple_blocks
+  ; DAGISEL: bb.0 (%ir-block.0):
+  ; DAGISEL-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; DAGISEL-NEXT:   liveins: $vgpr0, $vgpr1
+  ; DAGISEL-NEXT: {{  $}}
+  ; DAGISEL-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; DAGISEL-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; DAGISEL-NEXT:   [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+  ; DAGISEL-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[SI_WHOLE_WAVE_FUNC_SETUP]]
+  ; DAGISEL-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY1]], [[COPY]], implicit $exec
+  ; DAGISEL-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; DAGISEL-NEXT:   S_BRANCH %bb.1
+  ; DAGISEL-NEXT: {{  $}}
+  ; DAGISEL-NEXT: bb.1.if.then:
+  ; DAGISEL-NEXT:   successors: %bb.2(0x80000000)
+  ; DAGISEL-NEXT: {{  $}}
+  ; DAGISEL-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], [[COPY]], 0, implicit $exec
+  ; DAGISEL-NEXT: {{  $}}
+  ; DAGISEL-NEXT: bb.2.if.end:
+  ; DAGISEL-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, [[V_ADD_U32_e64_]], %bb.1
+  ; DAGISEL-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; DAGISEL-NEXT:   [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]]
+  ; DAGISEL-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[PHI]], 0, [[COPY1]], [[COPY3]], implicit $exec
+  ; DAGISEL-NEXT:   $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
+  ; DAGISEL-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; DAGISEL-NEXT:   SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0
+  ;
+  ; GISEL-LABEL: name: multiple_blocks
+  ; GISEL: bb.1 (%ir-block.0):
+  ; GISEL-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GISEL-NEXT:   liveins: $vgpr0, $vgpr1
+  ; GISEL-NEXT: {{  $}}
+  ; GISEL-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GISEL-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GISEL-NEXT:   [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+  ; GISEL-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec
+  ; GISEL-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GISEL-NEXT:   S_BRANCH %bb.2
+  ; GISEL-NEXT: {{  $}}
+  ; GISEL-NEXT: bb.2.if.then:
+  ; GISEL-NEXT:   successors: %bb.3(0x80000000)
+  ; GISEL-NEXT: {{  $}}
+  ; GISEL-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
+  ; GISEL-NEXT: {{  $}}
+  ; GISEL-NEXT: bb.3.if.end:
+  ; GISEL-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.1, [[V_ADD_U32_e64_]], %bb.2
+  ; GISEL-NEXT:   SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GISEL-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[PHI]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+  ; GISEL-NEXT:   $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
+  ; GISEL-NEXT:   SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0
+  %c = icmp eq i32 %a, %b
+  br i1 %c, label %if.then, label %if.end
+
+if.then:                                          ; preds = %0
+  %d = add i32 %a, %b
+  br label %if.end
+
+if.end:
+  %f = phi i32 [ %d, %if.then ], [ %b, %0 ]
+  %e = select i1 %active, i32 %a, i32 %f
+  ret i32 %e
+}
+
+define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
+  ; DAGISEL-LABEL: name: ret_64
+  ; DAGISEL: bb.0 (%ir-block.0):
+  ; DAGISEL-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; DAGISEL-NEXT: {{  $}}
+  ; DAGISEL-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; DAGISEL-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; DAGISEL-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; DAGISEL-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; DAGISEL-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+  ; DAGISEL-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; DAGISEL-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; DAGISEL-NEXT:   [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+  ; DAGISEL-NEXT:   [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[SI_WHOLE_WAVE_FUNC_SETUP]]
+  ; DAGISEL-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; DAGISEL-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; DAGISEL-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_]], 0, killed [[COPY5]], [[COPY4]], implicit $exec
+  ; DAGISEL-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; DAGISEL-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5
+  ; DAGISEL-NEXT:   [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_1]], 0, killed [[COPY6]], [[COPY4]], implicit $exec
+  ; DAGISEL-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+  ; DAGISEL-NEXT:   [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_]], 0, killed [[COPY7]], [[COPY4]], implicit $exec
+  ; DAGISEL-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+  ; DAGISEL-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 3
+  ; DAGISEL-NEXT:   [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_2]], 0, killed [[COPY8]], [[COPY4]], implicit $exec
+  ; DAGISEL-NEXT:   [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_1]], killed [[V_CNDMASK_B32_e64_3]], 1, 1, 1, 0, implicit $exec
+  ; DAGISEL-NEXT:   [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], killed [[V_CNDMASK_B32_e64_2]], 1, 1, 1, 0, implicit $exec
+  ; DAGISEL-NEXT:   $vgpr0 = COPY [[V_MOV_B32_dpp]]
+  ; DAGISEL-NEXT:   $vgpr1 = COPY [[V_MOV_B32_dpp1]]
+  ; DAGISEL-NEXT:   [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; DAGISEL-NEXT:   SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0, implicit $vgpr1
+  ;
+  ; GISEL-LABEL: name: ret_64
+  ; GISEL: bb.1 (%ir-block.0):
+  ; GISEL-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GISEL-NEXT: {{  $}}
+  ; GISEL-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GISEL-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GISEL-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GISEL-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GISEL-NEXT:   [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+  ; GISEL-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
+  ; GISEL-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GISEL-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+  ; GISEL-NEXT:   [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_1]], 0, [[COPY1]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+  ; GISEL-NEXT:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
+  ; GISEL-NEXT:   [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GISEL-NEXT:   [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_2]], 0, [[COPY2]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+  ; GISEL-NEXT:   [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_3]], 0, [[COPY3]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+  ; GISEL-NEXT:   [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], [[V_CNDMASK_B32_e64_2]], 1, 1, 1, 0, implicit $exec
+  ; GISEL-NEXT:   [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_1]], [[V_CNDMASK_B32_e64_3]], 1, 1, 1, 0, implicit $exec
+  ; GISEL-NEXT:   $vgpr0 = COPY [[V_MOV_B32_dpp]]
+  ; GISEL-NEXT:   $vgpr1 = COPY [[V_MOV_B32_dpp1]]
+  ; GISEL-NEXT:   SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0, implicit $vgpr1
+  %x = select i1 %active, i64 %a, i64 5
+  %y = select i1 %active, i64 %b, i64 3
+  %ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false)
+  ret i64 %ret
+}
+

diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir b/llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir
new file mode 100644
index 0000000000000..93f489170cea0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir
@@ -0,0 +1,448 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=prologepilog -o - %s | FileCheck %s
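+#
+# The tests below check that prolog/epilog insertion expands
+# SI_WHOLE_WAVE_FUNC_SETUP into a save of the original EXEC followed by
+# spills of the VGPRs used in the function, and SI_WHOLE_WAVE_FUNC_RETURN
+# into the matching reloads and the restore of the original EXEC.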
+
+---
+name:            save_inactive_lanes_non_csr_vgpr
+alignment:       1
+tracksRegLiveness: true
+noPhis:          true
+isSSA:           false
+noVRegs:         true
+hasFakeUses:     false
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    1
+  isCalleeSavedInfoValid: true
+machineFunctionInfo:
+  maxKernArgAlign: 1
+  frameOffsetReg:  '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+  returnsVoid:     false
+  occupancy:       16
+  sgprForEXECCopy: '$sgpr105'
+  isWholeWaveFunction: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: save_inactive_lanes_non_csr_vgpr
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+    ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1
+    ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 14, implicit $exec
+    ; CHECK-NEXT: $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc
+    ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
+    ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0
+    ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0
+    renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 14, implicit $exec
+    SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0
+
+...
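+# Note that S_XOR_B32 $sgpr0, -1 in the epilogue above switches EXEC to
+# exactly the lanes that were inactive on entry, so only their VGPR
+# contents are reloaded before the original EXEC is restored.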
+---
+name:            save_all_lanes_csr_vgpr
+alignment:       1
+tracksRegLiveness: true
+noPhis:          true
+isSSA:           false
+noVRegs:         true
+hasFakeUses:     false
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    1
+  isCalleeSavedInfoValid: true
+machineFunctionInfo:
+  maxKernArgAlign: 1
+  frameOffsetReg:  '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+  returnsVoid:     false
+  occupancy:       16
+  sgprForEXECCopy: '$sgpr105'
+  isWholeWaveFunction: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: save_all_lanes_csr_vgpr
+    ; CHECK: liveins: $vgpr40
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $sgpr0 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+    ; CHECK-NEXT: $vgpr40 = V_MOV_B32_e32 14, implicit $exec
+    ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+    ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0
+    ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0
+    renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+    $vgpr40 = V_MOV_B32_e32 14, implicit $exec
+    SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0
+
+...
+---
+name:            save_csr_sgpr_to_non_csr_vgpr
+alignment:       1
+tracksRegLiveness: true
+noPhis:          true
+isSSA:           false
+noVRegs:         true
+hasFakeUses:     false
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    1
+  isCalleeSavedInfoValid: true
+machineFunctionInfo:
+  maxKernArgAlign: 1
+  frameOffsetReg:  '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+  returnsVoid:     false
+  occupancy:       16
+  sgprForEXECCopy: '$sgpr105'
+  isWholeWaveFunction: true
+body:             |
+  bb.0:
+    liveins: $sgpr20, $vgpr191
+    ; CHECK-LABEL: name: save_csr_sgpr_to_non_csr_vgpr
+    ; CHECK: liveins: $sgpr20, $vgpr191, $vgpr192
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr192, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+    ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1
+    ; CHECK-NEXT: $vgpr192 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr192
+    ; CHECK-NEXT: $sgpr20 = S_MOV_B32 14, implicit $exec
+    ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr192, 0
+    ; CHECK-NEXT: $exec_lo = S_XOR_B32 $vcc_lo, -1, implicit-def $scc
+    ; CHECK-NEXT: $vgpr192 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+    ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo
+    ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo
+    $vgpr192 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr192
+    renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+    $sgpr20 = S_MOV_B32 14, implicit $exec
+    $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr192, 0
+    SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo
+
+...
+---
+name:            save_csr_sgpr_to_csr_vgpr
+alignment:       1
+tracksRegLiveness: true
+noPhis:          true
+isSSA:           false
+noVRegs:         true
+hasFakeUses:     false
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    1
+  isCalleeSavedInfoValid: true
+machineFunctionInfo:
+  maxKernArgAlign: 1
+  frameOffsetReg:  '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+  returnsVoid:     false
+  occupancy:       16
+  sgprForEXECCopy: '$sgpr105'
+  isWholeWaveFunction: true
+body:             |
+  bb.0:
+    liveins: $sgpr20, $vgpr191
+    ; CHECK-LABEL: name: save_csr_sgpr_to_csr_vgpr
+    ; CHECK: liveins: $sgpr20, $vgpr191
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $vcc_lo = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr191, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+    ; CHECK-NEXT: $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191
+    ; CHECK-NEXT: $sgpr20 = S_MOV_B32 14, implicit $exec
+    ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0
+    ; CHECK-NEXT: $vgpr191 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+    ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo
+    ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo
+    $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191
+    renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+    $sgpr20 = S_MOV_B32 14, implicit $exec
+    $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0
+    SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo
+
+...
+---
+name:            vgpr_and_sgpr_csr
+alignment:       1
+tracksRegLiveness: true
+noPhis:          true
+isSSA:           false
+noVRegs:         true
+hasFakeUses:     false
+tracksDebugUserValues: true
+liveins:
+  - { reg: '$vgpr0' }
+  - { reg: '$vgpr1' }
+frameInfo:
+  maxAlignment:    4
+  isCalleeSavedInfoValid: true
+machineFunctionInfo:
+  maxKernArgAlign: 1
+  hasSpilledSGPRs: true
+  frameOffsetReg:  '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+  returnsVoid:     false
+  occupancy:       16
+  spillPhysVGPRs:
+    - '$vgpr191'
+  wwmReservedRegs:
+    - '$vgpr191'
+  isWholeWaveFunction: true
+body:             |
+  bb.0:
+    liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr191
+
+    ; CHECK-LABEL: name: vgpr_and_sgpr_csr
+    ; CHECK: liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr40, $vgpr49
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr49, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5)
+    ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+    ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr0
+    ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20
+    ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40
+    ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
+    ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
+    ; CHECK-NEXT: $exec_lo = S_XOR_B32 $vcc_lo, -1, implicit-def $scc
+    ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+    ; CHECK-NEXT: $vgpr49 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5)
+    ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo
+    ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo
+    $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191
+    renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+    S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20
+    S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40
+    $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0
+    SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo
+
+...
+---
+name:            split_orig_exec
+alignment:       1
+tracksRegLiveness: true
+noPhis:          true
+isSSA:           false
+noVRegs:         true
+hasFakeUses:     false
+tracksDebugUserValues: true
+liveins:
+  - { reg: '$vgpr0' }
+  - { reg: '$vgpr1' }
+frameInfo:
+  maxAlignment:    4
+  isCalleeSavedInfoValid: true
+machineFunctionInfo:
+  maxKernArgAlign: 1
+  hasSpilledSGPRs: true
+  frameOffsetReg:  '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+  returnsVoid:     false
+  occupancy:       16
+  spillPhysVGPRs:
+    - '$vgpr191'
+  wwmReservedRegs:
+    - '$vgpr191'
+  isWholeWaveFunction: true
+body:             |
+  bb.0:
+    liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr191
+
+    ; CHECK-LABEL: name: split_orig_exec
+    ; CHECK: liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr40, $vgpr49
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr49, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5)
+    ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+    ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr0
+    ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20
+    ; CHECK-NEXT: $sgpr3 = COPY $vcc_lo
+    ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40
+    ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
+    ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
+    ; CHECK-NEXT: $exec_lo = S_XOR_B32 $sgpr3, -1, implicit-def $scc
+    ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+    ; CHECK-NEXT: $vgpr49 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5)
+    ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr3
+    ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr3
+    $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191
+    renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+    S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20
+    $sgpr3 = COPY $vcc_lo
+    S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40
+    $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0
+    SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr3
+
+...
+---
+name:            vgpr_superregs
+alignment:       1
+tracksRegLiveness: true
+noPhis:          true
+isSSA:           false
+noVRegs:         true
+hasFakeUses:     false
+tracksDebugUserValues: true
+frameInfo:
+  maxAlignment:    1
+  isCalleeSavedInfoValid: true
+machineFunctionInfo:
+  maxKernArgAlign: 1
+  frameOffsetReg:  '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+  returnsVoid:     false
+  occupancy:       16
+  sgprForEXECCopy: '$sgpr105'
+  isWholeWaveFunction: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: vgpr_superregs
+    ; CHECK: liveins: $vgpr0, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr40, $vgpr41, $vgpr42
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5)
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5)
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.4, addrspace 5)
+    ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.5, addrspace 5)
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr41, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.6, addrspace 5)
+    ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr42, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.7, addrspace 5)
+    ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 14, implicit $exec
+    ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr40_vgpr41_vgpr42
+    ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.5, addrspace 5)
+    ; CHECK-NEXT: $vgpr41 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.6, addrspace 5)
+    ; CHECK-NEXT: $vgpr42 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.7, addrspace 5)
+    ; CHECK-NEXT: $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc
+    ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
+    ; CHECK-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
+    ; CHECK-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5)
+    ; CHECK-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5)
+    ; CHECK-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.4, addrspace 5)
+    ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0
+    ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0
+    renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 14, implicit $exec
+    S_NOP 0, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr40_vgpr41_vgpr42
+    SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0
+
+...
+---
+name:            dont_restore_used_vgprs
+alignment:       1
+tracksRegLiveness: true
+noPhis:          true
+isSSA:           false
+noVRegs:         true
+hasFakeUses:     false
+tracksDebugUserValues: true
+liveins:
+  - { reg: '$vgpr0' }
+  - { reg: '$vgpr20' }
+  - { reg: '$vgpr40' }
+frameInfo:
+  maxAlignment:    1
+  isCalleeSavedInfoValid: true
+machineFunctionInfo:
+  maxKernArgAlign: 1
+  frameOffsetReg:  '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+  returnsVoid:     false
+  occupancy:       16
+  sgprForEXECCopy: '$sgpr105'
+  isWholeWaveFunction: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr20, $vgpr40
+
+    ; CHECK-LABEL: name: dont_restore_used_vgprs
+    ; CHECK: liveins: $vgpr0, $vgpr20, $vgpr40
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; CHECK-NEXT: S_NOP 0, implicit $vgpr0, implicit $vgpr20, implicit $vgpr40
+    ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0
+    ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0
+    renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+    S_NOP 0, implicit $vgpr0, implicit $vgpr20, implicit $vgpr40
+    SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0
+
+...
+---
+name:            multiple_blocks
+alignment:       1
+tracksRegLiveness: true
+noPhis:          true
+isSSA:           false
+noVRegs:         true
+hasFakeUses:     false
+tracksDebugUserValues: true
+liveins:
+  - { reg: '$vgpr0' }
+  - { reg: '$vgpr1' }
+frameInfo:
+  maxAlignment:    1
+  isCalleeSavedInfoValid: true
+machineFunctionInfo:
+  maxKernArgAlign: 1
+  frameOffsetReg:  '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+  returnsVoid:     false
+  occupancy:       16
+  sgprForEXECCopy: '$sgpr105'
+  isWholeWaveFunction: true
+body:             |
+  ; CHECK-LABEL: name: multiple_blocks
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+  ; CHECK-NEXT:   SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+  ; CHECK-NEXT:   $exec_lo = S_MOV_B32 -1
+  ; CHECK-NEXT:   $sgpr1 = S_MOV_B32 $exec_lo
+  ; CHECK-NEXT:   V_CMPX_EQ_U32_nosdst_e64 $vgpr0, $vgpr1, implicit-def $exec, implicit $exec
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT:   liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $vgpr1 = V_ADD_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $exec_lo = S_OR_B32 $exec_lo, killed renamable $sgpr1, implicit-def $scc
+  ; CHECK-NEXT:   renamable $vgpr0 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr0, $vcc_lo, implicit $exec
+  ; CHECK-NEXT:   $exec_lo = S_XOR_B32 $vcc_lo, -1, implicit-def $scc
+  ; CHECK-NEXT:   $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
+  ; CHECK-NEXT:   $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
+  ; CHECK-NEXT:   $exec_lo = S_MOV_B32 $vcc_lo
+  ; CHECK-NEXT:   SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo, implicit $vgpr0
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $vgpr0, $vgpr1
+
+    renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+    $sgpr1 = S_MOV_B32 $exec_lo
+    V_CMPX_EQ_U32_nosdst_e64 $vgpr0, $vgpr1, implicit-def $exec, implicit $exec
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1
+
+    renamable $vgpr1 = V_ADD_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec
+
+  bb.2:
+    liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1
+
+    $exec_lo = S_OR_B32 $exec_lo, killed renamable $sgpr1, implicit-def $scc
+    renamable $vgpr0 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr0, $vcc_lo, implicit $exec
+    SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo, implicit $vgpr0
+
+...

diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
new file mode 100644
index 0000000000000..53d02925fb1c2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
@@ -0,0 +1,2414 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL64 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL64 %s
+
+; Make sure the i1 %active is passed through EXEC.
+; The EXEC mask should be set to -1 for the duration of the function
+; and restored to its original value in the epilogue.
+; We will also need to restore the inactive lanes for any allocated VGPRs.
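+; On wave32 the expected shape is roughly (with vcc_lo holding the
+; original EXEC):
+;   s_xor_saveexec_b32 vcc_lo, -1   ; save EXEC, enable the inactive lanes
+;   <spill the used VGPRs>          ; preserve the inactive lanes' values
+;   s_mov_b32 exec_lo, -1           ; run the body with all lanes active
+;   <function body>
+;   s_xor_b32 exec_lo, vcc_lo, -1   ; re-enable only the inactive lanes
+;   <reload the used VGPRs>
+;   s_mov_b32 exec_lo, vcc_lo       ; restore the original EXEC mask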
+define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: basic_test:
+; DAGISEL:       ; %bb.0:
+; DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT:    s_wait_expcnt 0x0
+; DAGISEL-NEXT:    s_wait_samplecnt 0x0
+; DAGISEL-NEXT:    s_wait_bvhcnt 0x0
+; DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; DAGISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
+; DAGISEL-NEXT:    s_clause 0x1
+; DAGISEL-NEXT:    scratch_store_b32 off, v0, s32
+; DAGISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT:    s_wait_alu 0xfffe
+; DAGISEL-NEXT:    v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1
+; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
+; DAGISEL-NEXT:    s_clause 0x1
+; DAGISEL-NEXT:    scratch_load_b32 v0, off, s32
+; DAGISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, vcc_lo
+; DAGISEL-NEXT:    s_wait_loadcnt 0x0
+; DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: basic_test:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT:    s_wait_expcnt 0x0
+; GISEL-NEXT:    s_wait_samplecnt 0x0
+; GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
+; GISEL-NEXT:    s_clause 0x1
+; GISEL-NEXT:    scratch_store_b32 off, v0, s32
+; GISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
+; GISEL-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-NEXT:    s_wait_alu 0xfffe
+; GISEL-NEXT:    v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
+; GISEL-NEXT:    s_clause 0x1
+; GISEL-NEXT:    scratch_load_b32 v0, off, s32
+; GISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; GISEL-NEXT:    s_mov_b32 exec_lo, vcc_lo
+; GISEL-NEXT:    s_wait_loadcnt 0x0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: basic_test:
+; DAGISEL64:       ; %bb.0:
+; DAGISEL64-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT:    s_wait_expcnt 0x0
+; DAGISEL64-NEXT:    s_wait_samplecnt 0x0
+; DAGISEL64-NEXT:    s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT:    s_wait_kmcnt 0x0
+; DAGISEL64-NEXT:    s_xor_saveexec_b64 vcc, -1
+; DAGISEL64-NEXT:    s_clause 0x1
+; DAGISEL64-NEXT:    scratch_store_b32 off, v0, s32
+; DAGISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL64-NEXT:    s_mov_b64 exec, -1
+; DAGISEL64-NEXT:    s_wait_alu 0xfffe
+; DAGISEL64-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
+; DAGISEL64-NEXT:    v_cndmask_b32_e32 v1, 3, v1, vcc
+; DAGISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL64-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL64-NEXT:    s_xor_b64 exec, vcc, -1
+; DAGISEL64-NEXT:    s_clause 0x1
+; DAGISEL64-NEXT:    scratch_load_b32 v0, off, s32
+; DAGISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL64-NEXT:    s_mov_b64 exec, vcc
+; DAGISEL64-NEXT:    s_wait_loadcnt 0x0
+; DAGISEL64-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: basic_test:
+; GISEL64:       ; %bb.0:
+; GISEL64-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT:    s_wait_expcnt 0x0
+; GISEL64-NEXT:    s_wait_samplecnt 0x0
+; GISEL64-NEXT:    s_wait_bvhcnt 0x0
+; GISEL64-NEXT:    s_wait_kmcnt 0x0
+; GISEL64-NEXT:    s_xor_saveexec_b64 vcc, -1
+; GISEL64-NEXT:    s_clause 0x1
+; GISEL64-NEXT:    scratch_store_b32 off, v0, s32
+; GISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:4
+; GISEL64-NEXT:    s_mov_b64 exec, -1
+; GISEL64-NEXT:    s_wait_alu 0xfffe
+; GISEL64-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
+; GISEL64-NEXT:    v_cndmask_b32_e32 v1, 3, v1, vcc
+; GISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL64-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL64-NEXT:    s_xor_b64 exec, vcc, -1
+; GISEL64-NEXT:    s_clause 0x1
+; GISEL64-NEXT:    scratch_load_b32 v0, off, s32
+; GISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; GISEL64-NEXT:    s_mov_b64 exec, vcc
+; GISEL64-NEXT:    s_wait_loadcnt 0x0
+; GISEL64-NEXT:    s_setpc_b64 s[30:31]
+  %x = select i1 %active, i32 %a, i32 5
+  %y = select i1 %active, i32 %b, i32 3
+  %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false)
+  ret i32 %ret
+}
+
+; Make sure we don't crash if there's only one use of %active.
+define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: single_use_of_active:
+; DAGISEL:       ; %bb.0:
+; DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT:    s_wait_expcnt 0x0
+; DAGISEL-NEXT:    s_wait_samplecnt 0x0
+; DAGISEL-NEXT:    s_wait_bvhcnt 0x0
+; DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; DAGISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
+; DAGISEL-NEXT:    s_clause 0x1
+; DAGISEL-NEXT:    scratch_store_b32 off, v0, s32
+; DAGISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT:    s_wait_alu 0xfffe
+; DAGISEL-NEXT:    v_cndmask_b32_e32 v1, 17, v1, vcc_lo
+; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
+; DAGISEL-NEXT:    s_clause 0x1
+; DAGISEL-NEXT:    scratch_load_b32 v0, off, s32
+; DAGISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, vcc_lo
+; DAGISEL-NEXT:    s_wait_loadcnt 0x0
+; DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: single_use_of_active:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT:    s_wait_expcnt 0x0
+; GISEL-NEXT:    s_wait_samplecnt 0x0
+; GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
+; GISEL-NEXT:    s_clause 0x1
+; GISEL-NEXT:    scratch_store_b32 off, v0, s32
+; GISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
+; GISEL-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-NEXT:    s_wait_alu 0xfffe
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, 17, v1, vcc_lo
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
+; GISEL-NEXT:    s_clause 0x1
+; GISEL-NEXT:    scratch_load_b32 v0, off, s32
+; GISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; GISEL-NEXT:    s_mov_b32 exec_lo, vcc_lo
+; GISEL-NEXT:    s_wait_loadcnt 0x0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: single_use_of_active:
+; DAGISEL64:       ; %bb.0:
+; DAGISEL64-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT:    s_wait_expcnt 0x0
+; DAGISEL64-NEXT:    s_wait_samplecnt 0x0
+; DAGISEL64-NEXT:    s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT:    s_wait_kmcnt 0x0
+; DAGISEL64-NEXT:    s_xor_saveexec_b64 vcc, -1
+; DAGISEL64-NEXT:    s_clause 0x1
+; DAGISEL64-NEXT:    scratch_store_b32 off, v0, s32
+; DAGISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL64-NEXT:    s_mov_b64 exec, -1
+; DAGISEL64-NEXT:    s_wait_alu 0xfffe
+; DAGISEL64-NEXT:    v_cndmask_b32_e32 v1, 17, v1, vcc
+; DAGISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL64-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL64-NEXT:    s_xor_b64 exec, vcc, -1
+; DAGISEL64-NEXT:    s_clause 0x1
+; DAGISEL64-NEXT:    scratch_load_b32 v0, off, s32
+; DAGISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL64-NEXT:    s_mov_b64 exec, vcc
+; DAGISEL64-NEXT:    s_wait_loadcnt 0x0
+; DAGISEL64-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: single_use_of_active:
+; GISEL64:       ; %bb.0:
+; GISEL64-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT:    s_wait_expcnt 0x0
+; GISEL64-NEXT:    s_wait_samplecnt 0x0
+; GISEL64-NEXT:    s_wait_bvhcnt 0x0
+; GISEL64-NEXT:    s_wait_kmcnt 0x0
+; GISEL64-NEXT:    s_xor_saveexec_b64 vcc, -1
+; GISEL64-NEXT:    s_clause 0x1
+; GISEL64-NEXT:    scratch_store_b32 off, v0, s32
+; GISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:4
+; GISEL64-NEXT:    s_mov_b64 exec, -1
+; GISEL64-NEXT:    s_wait_alu 0xfffe
+; GISEL64-NEXT:    v_cndmask_b32_e32 v1, 17, v1, vcc
+; GISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL64-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL64-NEXT:    s_xor_b64 exec, vcc, -1
+; GISEL64-NEXT:    s_clause 0x1
+; GISEL64-NEXT:    scratch_load_b32 v0, off, s32
+; GISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; GISEL64-NEXT:    s_mov_b64 exec, vcc
+; GISEL64-NEXT:    s_wait_loadcnt 0x0
+; GISEL64-NEXT:    s_setpc_b64 s[30:31]
+  %y = select i1 %active, i32 %b, i32 17
+  %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %a, i32 %y, i32 1, i32 1, i32 1, i1 false)
+  ret i32 %ret
+}
+
+; Make sure we don't crash if %active is not used at all.
+define amdgpu_gfx_whole_wave i32 @unused_active(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: unused_active:
+; DAGISEL:       ; %bb.0:
+; DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT:    s_wait_expcnt 0x0
+; DAGISEL-NEXT:    s_wait_samplecnt 0x0
+; DAGISEL-NEXT:    s_wait_bvhcnt 0x0
+; DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; DAGISEL-NEXT:    scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT:    v_mov_b32_e32 v0, 14
+; DAGISEL-NEXT:    s_wait_alu 0xfffe
+; DAGISEL-NEXT:    s_xor_b32 exec_lo, s0, -1
+; DAGISEL-NEXT:    scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; DAGISEL-NEXT:    s_wait_loadcnt 0x0
+; DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: unused_active:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT:    s_wait_expcnt 0x0
+; GISEL-NEXT:    s_wait_samplecnt 0x0
+; GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GISEL-NEXT:    scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GISEL-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-NEXT:    v_mov_b32_e32 v0, 14
+; GISEL-NEXT:    s_wait_alu 0xfffe
+; GISEL-NEXT:    s_xor_b32 exec_lo, s0, -1
+; GISEL-NEXT:    scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GISEL-NEXT:    s_wait_loadcnt 0x0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: unused_active:
+; DAGISEL64:       ; %bb.0:
+; DAGISEL64-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT:    s_wait_expcnt 0x0
+; DAGISEL64-NEXT:    s_wait_samplecnt 0x0
+; DAGISEL64-NEXT:    s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT:    s_wait_kmcnt 0x0
+; DAGISEL64-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; DAGISEL64-NEXT:    scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; DAGISEL64-NEXT:    s_mov_b64 exec, -1
+; DAGISEL64-NEXT:    v_mov_b32_e32 v0, 14
+; DAGISEL64-NEXT:    s_wait_alu 0xfffe
+; DAGISEL64-NEXT:    s_xor_b64 exec, s[0:1], -1
+; DAGISEL64-NEXT:    scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; DAGISEL64-NEXT:    s_mov_b64 exec, s[0:1]
+; DAGISEL64-NEXT:    s_wait_loadcnt 0x0
+; DAGISEL64-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: unused_active:
+; GISEL64:       ; %bb.0:
+; GISEL64-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT:    s_wait_expcnt 0x0
+; GISEL64-NEXT:    s_wait_samplecnt 0x0
+; GISEL64-NEXT:    s_wait_bvhcnt 0x0
+; GISEL64-NEXT:    s_wait_kmcnt 0x0
+; GISEL64-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GISEL64-NEXT:    scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GISEL64-NEXT:    s_mov_b64 exec, -1
+; GISEL64-NEXT:    v_mov_b32_e32 v0, 14
+; GISEL64-NEXT:    s_wait_alu 0xfffe
+; GISEL64-NEXT:    s_xor_b64 exec, s[0:1], -1
+; GISEL64-NEXT:    scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GISEL64-NEXT:    s_mov_b64 exec, s[0:1]
+; GISEL64-NEXT:    s_wait_loadcnt 0x0
+; GISEL64-NEXT:    s_setpc_b64 s[30:31]
+  ret i32 14
+}
+
+; For any used VGPRs (including those used for SGPR spills), we need to restore the inactive lanes.
+; For CSR VGPRs, we need to restore all lanes.
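+; In other words, the non-CSR VGPRs (v0, v1, v2, v49 below) are spilled and
+; reloaded only while EXEC covers the inactive lanes, whereas the CSR v40 is
+; spilled while all lanes are enabled, so its active lanes are preserved too.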
+define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: csr:
+; DAGISEL:       ; %bb.0:
+; DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT:    s_wait_expcnt 0x0
+; DAGISEL-NEXT:    s_wait_samplecnt 0x0
+; DAGISEL-NEXT:    s_wait_bvhcnt 0x0
+; DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; DAGISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
+; DAGISEL-NEXT:    s_clause 0x3
+; DAGISEL-NEXT:    scratch_store_b32 off, v2, s32
+; DAGISEL-NEXT:    scratch_store_b32 off, v0, s32 offset:4
+; DAGISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:8
+; DAGISEL-NEXT:    scratch_store_b32 off, v49, s32 offset:16
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT:    scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill
+; DAGISEL-NEXT:    ;;#ASMSTART
+; DAGISEL-NEXT:    ; clobber CSR
+; DAGISEL-NEXT:    ;;#ASMEND
+; DAGISEL-NEXT:    v_writelane_b32 v2, s20, 0
+; DAGISEL-NEXT:    ;;#ASMSTART
+; DAGISEL-NEXT:    ; clobber non-CSR
+; DAGISEL-NEXT:    ;;#ASMEND
+; DAGISEL-NEXT:    scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload
+; DAGISEL-NEXT:    s_wait_alu 0xfffe
+; DAGISEL-NEXT:    v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1
+; DAGISEL-NEXT:    v_readlane_b32 s20, v2, 0
+; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; DAGISEL-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
+; DAGISEL-NEXT:    s_clause 0x3
+; DAGISEL-NEXT:    scratch_load_b32 v2, off, s32
+; DAGISEL-NEXT:    scratch_load_b32 v0, off, s32 offset:4
+; DAGISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:8
+; DAGISEL-NEXT:    scratch_load_b32 v49, off, s32 offset:16
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, vcc_lo
+; DAGISEL-NEXT:    s_wait_loadcnt 0x0
+; DAGISEL-NEXT:    s_wait_alu 0xf1ff
+; DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: csr:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT:    s_wait_expcnt 0x0
+; GISEL-NEXT:    s_wait_samplecnt 0x0
+; GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    scratch_store_b32 off, v2, s32
+; GISEL-NEXT:    scratch_store_b32 off, v0, s32 offset:4
+; GISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:8
+; GISEL-NEXT:    scratch_store_b32 off, v49, s32 offset:16
+; GISEL-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-NEXT:    scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill
+; GISEL-NEXT:    ;;#ASMSTART
+; GISEL-NEXT:    ; clobber CSR
+; GISEL-NEXT:    ;;#ASMEND
+; GISEL-NEXT:    v_writelane_b32 v2, s20, 0
+; GISEL-NEXT:    ;;#ASMSTART
+; GISEL-NEXT:    ; clobber non-CSR
+; GISEL-NEXT:    ;;#ASMEND
+; GISEL-NEXT:    scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload
+; GISEL-NEXT:    s_wait_alu 0xfffe
+; GISEL-NEXT:    v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1
+; GISEL-NEXT:    v_readlane_b32 s20, v2, 0
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GISEL-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    scratch_load_b32 v2, off, s32
+; GISEL-NEXT:    scratch_load_b32 v0, off, s32 offset:4
+; GISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:8
+; GISEL-NEXT:    scratch_load_b32 v49, off, s32 offset:16
+; GISEL-NEXT:    s_mov_b32 exec_lo, vcc_lo
+; GISEL-NEXT:    s_wait_loadcnt 0x0
+; GISEL-NEXT:    s_wait_alu 0xf1ff
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: csr:
+; DAGISEL64:       ; %bb.0:
+; DAGISEL64-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT:    s_wait_expcnt 0x0
+; DAGISEL64-NEXT:    s_wait_samplecnt 0x0
+; DAGISEL64-NEXT:    s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT:    s_wait_kmcnt 0x0
+; DAGISEL64-NEXT:    s_xor_saveexec_b64 vcc, -1
+; DAGISEL64-NEXT:    s_clause 0x3
+; DAGISEL64-NEXT:    scratch_store_b32 off, v2, s32
+; DAGISEL64-NEXT:    scratch_store_b32 off, v0, s32 offset:4
+; DAGISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:8
+; DAGISEL64-NEXT:    scratch_store_b32 off, v49, s32 offset:16
+; DAGISEL64-NEXT:    s_mov_b64 exec, -1
+; DAGISEL64-NEXT:    scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill
+; DAGISEL64-NEXT:    ;;#ASMSTART
+; DAGISEL64-NEXT:    ; clobber CSR
+; DAGISEL64-NEXT:    ;;#ASMEND
+; DAGISEL64-NEXT:    v_writelane_b32 v2, s20, 0
+; DAGISEL64-NEXT:    ;;#ASMSTART
+; DAGISEL64-NEXT:    ; clobber non-CSR
+; DAGISEL64-NEXT:    ;;#ASMEND
+; DAGISEL64-NEXT:    scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload
+; DAGISEL64-NEXT:    s_wait_alu 0xfffe
+; DAGISEL64-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
+; DAGISEL64-NEXT:    v_cndmask_b32_e32 v1, 3, v1, vcc
+; DAGISEL64-NEXT:    v_readlane_b32 s20, v2, 0
+; DAGISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; DAGISEL64-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL64-NEXT:    s_xor_b64 exec, vcc, -1
+; DAGISEL64-NEXT:    s_clause 0x3
+; DAGISEL64-NEXT:    scratch_load_b32 v2, off, s32
+; DAGISEL64-NEXT:    scratch_load_b32 v0, off, s32 offset:4
+; DAGISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:8
+; DAGISEL64-NEXT:    scratch_load_b32 v49, off, s32 offset:16
+; DAGISEL64-NEXT:    s_mov_b64 exec, vcc
+; DAGISEL64-NEXT:    s_wait_loadcnt 0x0
+; DAGISEL64-NEXT:    s_wait_alu 0xf1ff
+; DAGISEL64-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: csr:
+; GISEL64:       ; %bb.0:
+; GISEL64-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT:    s_wait_expcnt 0x0
+; GISEL64-NEXT:    s_wait_samplecnt 0x0
+; GISEL64-NEXT:    s_wait_bvhcnt 0x0
+; GISEL64-NEXT:    s_wait_kmcnt 0x0
+; GISEL64-NEXT:    s_xor_saveexec_b64 vcc, -1
+; GISEL64-NEXT:    s_clause 0x3
+; GISEL64-NEXT:    scratch_store_b32 off, v2, s32
+; GISEL64-NEXT:    scratch_store_b32 off, v0, s32 offset:4
+; GISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:8
+; GISEL64-NEXT:    scratch_store_b32 off, v49, s32 offset:16
+; GISEL64-NEXT:    s_mov_b64 exec, -1
+; GISEL64-NEXT:    scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill
+; GISEL64-NEXT:    ;;#ASMSTART
+; GISEL64-NEXT:    ; clobber CSR
+; GISEL64-NEXT:    ;;#ASMEND
+; GISEL64-NEXT:    v_writelane_b32 v2, s20, 0
+; GISEL64-NEXT:    ;;#ASMSTART
+; GISEL64-NEXT:    ; clobber non-CSR
+; GISEL64-NEXT:    ;;#ASMEND
+; GISEL64-NEXT:    scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload
+; GISEL64-NEXT:    s_wait_alu 0xfffe
+; GISEL64-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
+; GISEL64-NEXT:    v_cndmask_b32_e32 v1, 3, v1, vcc
+; GISEL64-NEXT:    v_readlane_b32 s20, v2, 0
+; GISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GISEL64-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL64-NEXT:    s_xor_b64 exec, vcc, -1
+; GISEL64-NEXT:    s_clause 0x3
+; GISEL64-NEXT:    scratch_load_b32 v2, off, s32
+; GISEL64-NEXT:    scratch_load_b32 v0, off, s32 offset:4
+; GISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:8
+; GISEL64-NEXT:    scratch_load_b32 v49, off, s32 offset:16
+; GISEL64-NEXT:    s_mov_b64 exec, vcc
+; GISEL64-NEXT:    s_wait_loadcnt 0x0
+; GISEL64-NEXT:    s_wait_alu 0xf1ff
+; GISEL64-NEXT:    s_setpc_b64 s[30:31]
+  %x = select i1 %active, i32 %a, i32 5
+  %y = select i1 %active, i32 %b, i32 3
+  call void asm sideeffect "; clobber CSR", "~{v40},~{s48}"()
+  call void asm sideeffect "; clobber non-CSR", "~{v49},~{s20}"()
+  %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false)
+  ret i32 %ret
+}
+
+; Save and restore all lanes of v40.
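+; Since no other VGPRs are used, s_or_saveexec with -1 saves EXEC and enables
+; all lanes in one step, so no separate inactive-lane spill sequence is needed.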
+define amdgpu_gfx_whole_wave void @csr_vgpr_only(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: csr_vgpr_only:
+; DAGISEL:       ; %bb.0:
+; DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT:    s_wait_expcnt 0x0
+; DAGISEL-NEXT:    s_wait_samplecnt 0x0
+; DAGISEL-NEXT:    s_wait_bvhcnt 0x0
+; DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; DAGISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; DAGISEL-NEXT:    scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; DAGISEL-NEXT:    ;;#ASMSTART
+; DAGISEL-NEXT:    ; clobber CSR VGPR
+; DAGISEL-NEXT:    ;;#ASMEND
+; DAGISEL-NEXT:    scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; DAGISEL-NEXT:    s_wait_alu 0xfffe
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; DAGISEL-NEXT:    s_wait_loadcnt 0x0
+; DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: csr_vgpr_only:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT:    s_wait_expcnt 0x0
+; GISEL-NEXT:    s_wait_samplecnt 0x0
+; GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    s_or_saveexec_b32 s0, -1
+; GISEL-NEXT:    scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GISEL-NEXT:    ;;#ASMSTART
+; GISEL-NEXT:    ; clobber CSR VGPR
+; GISEL-NEXT:    ;;#ASMEND
+; GISEL-NEXT:    scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GISEL-NEXT:    s_wait_alu 0xfffe
+; GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GISEL-NEXT:    s_wait_loadcnt 0x0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: csr_vgpr_only:
+; DAGISEL64:       ; %bb.0:
+; DAGISEL64-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT:    s_wait_expcnt 0x0
+; DAGISEL64-NEXT:    s_wait_samplecnt 0x0
+; DAGISEL64-NEXT:    s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT:    s_wait_kmcnt 0x0
+; DAGISEL64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; DAGISEL64-NEXT:    scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; DAGISEL64-NEXT:    ;;#ASMSTART
+; DAGISEL64-NEXT:    ; clobber CSR VGPR
+; DAGISEL64-NEXT:    ;;#ASMEND
+; DAGISEL64-NEXT:    scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; DAGISEL64-NEXT:    s_wait_alu 0xfffe
+; DAGISEL64-NEXT:    s_mov_b64 exec, s[0:1]
+; DAGISEL64-NEXT:    s_wait_loadcnt 0x0
+; DAGISEL64-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: csr_vgpr_only:
+; GISEL64:       ; %bb.0:
+; GISEL64-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT:    s_wait_expcnt 0x0
+; GISEL64-NEXT:    s_wait_samplecnt 0x0
+; GISEL64-NEXT:    s_wait_bvhcnt 0x0
+; GISEL64-NEXT:    s_wait_kmcnt 0x0
+; GISEL64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GISEL64-NEXT:    scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GISEL64-NEXT:    ;;#ASMSTART
+; GISEL64-NEXT:    ; clobber CSR VGPR
+; GISEL64-NEXT:    ;;#ASMEND
+; GISEL64-NEXT:    scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GISEL64-NEXT:    s_wait_alu 0xfffe
+; GISEL64-NEXT:    s_mov_b64 exec, s[0:1]
+; GISEL64-NEXT:    s_wait_loadcnt 0x0
+; GISEL64-NEXT:    s_setpc_b64 s[30:31]
+  call void asm sideeffect "; clobber CSR VGPR", "~{v40}"()
+  ret void
+}
+
+define amdgpu_gfx_whole_wave void @sgpr_spill_only(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: sgpr_spill_only:
+; DAGISEL:       ; %bb.0:
+; DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT:    s_wait_expcnt 0x0
+; DAGISEL-NEXT:    s_wait_samplecnt 0x0
+; DAGISEL-NEXT:    s_wait_bvhcnt 0x0
+; DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; DAGISEL-NEXT:    scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT:    v_writelane_b32 v0, s68, 0
+; DAGISEL-NEXT:    ;;#ASMSTART
+; DAGISEL-NEXT:    ; clobber CSR SGPR
+; DAGISEL-NEXT:    ;;#ASMEND
+; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT:    v_readlane_b32 s68, v0, 0
+; DAGISEL-NEXT:    s_wait_alu 0xfffe
+; DAGISEL-NEXT:    s_xor_b32 exec_lo, s0, -1
+; DAGISEL-NEXT:    scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; DAGISEL-NEXT:    s_wait_loadcnt 0x0
+; DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: sgpr_spill_only:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT:    s_wait_expcnt 0x0
+; GISEL-NEXT:    s_wait_samplecnt 0x0
+; GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GISEL-NEXT:    scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GISEL-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-NEXT:    v_writelane_b32 v0, s68, 0
+; GISEL-NEXT:    ;;#ASMSTART
+; GISEL-NEXT:    ; clobber CSR SGPR
+; GISEL-NEXT:    ;;#ASMEND
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT:    v_readlane_b32 s68, v0, 0
+; GISEL-NEXT:    s_wait_alu 0xfffe
+; GISEL-NEXT:    s_xor_b32 exec_lo, s0, -1
+; GISEL-NEXT:    scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GISEL-NEXT:    s_wait_loadcnt 0x0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: sgpr_spill_only:
+; DAGISEL64:       ; %bb.0:
+; DAGISEL64-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT:    s_wait_expcnt 0x0
+; DAGISEL64-NEXT:    s_wait_samplecnt 0x0
+; DAGISEL64-NEXT:    s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT:    s_wait_kmcnt 0x0
+; DAGISEL64-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; DAGISEL64-NEXT:    scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; DAGISEL64-NEXT:    s_mov_b64 exec, -1
+; DAGISEL64-NEXT:    v_writelane_b32 v0, s68, 0
+; DAGISEL64-NEXT:    ;;#ASMSTART
+; DAGISEL64-NEXT:    ; clobber CSR SGPR
+; DAGISEL64-NEXT:    ;;#ASMEND
+; DAGISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL64-NEXT:    v_readlane_b32 s68, v0, 0
+; DAGISEL64-NEXT:    s_wait_alu 0xfffe
+; DAGISEL64-NEXT:    s_xor_b64 exec, s[0:1], -1
+; DAGISEL64-NEXT:    scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; DAGISEL64-NEXT:    s_mov_b64 exec, s[0:1]
+; DAGISEL64-NEXT:    s_wait_loadcnt 0x0
+; DAGISEL64-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: sgpr_spill_only:
+; GISEL64:       ; %bb.0:
+; GISEL64-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT:    s_wait_expcnt 0x0
+; GISEL64-NEXT:    s_wait_samplecnt 0x0
+; GISEL64-NEXT:    s_wait_bvhcnt 0x0
+; GISEL64-NEXT:    s_wait_kmcnt 0x0
+; GISEL64-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GISEL64-NEXT:    scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GISEL64-NEXT:    s_mov_b64 exec, -1
+; GISEL64-NEXT:    v_writelane_b32 v0, s68, 0
+; GISEL64-NEXT:    ;;#ASMSTART
+; GISEL64-NEXT:    ; clobber CSR SGPR
+; GISEL64-NEXT:    ;;#ASMEND
+; GISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL64-NEXT:    v_readlane_b32 s68, v0, 0
+; GISEL64-NEXT:    s_wait_alu 0xfffe
+; GISEL64-NEXT:    s_xor_b64 exec, s[0:1], -1
+; GISEL64-NEXT:    scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GISEL64-NEXT:    s_mov_b64 exec, s[0:1]
+; GISEL64-NEXT:    s_wait_loadcnt 0x0
+; GISEL64-NEXT:    s_setpc_b64 s[30:31]
+  call void asm sideeffect "; clobber CSR SGPR", "~{s68}"()
+  ret void
+}
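+; Note how the CSR SGPR never touches memory directly: s68 is parked in lane 0
+; of a WWM VGPR (v_writelane_b32 v0, s68, 0 above), and only that VGPR's
+; inactive lanes go through scratch. A hypothetical clobber of two CSR SGPRs
+; would simply take another lane of the same VGPR:
+;
+;   v_writelane_b32 v0, s68, 0
+;   v_writelane_b32 v0, s69, 1
+;   ; ... body ...
+;   v_readlane_b32 s69, v0, 1
+;   v_readlane_b32 s68, v0, 0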
+
+define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: multiple_blocks:
+; DAGISEL:       ; %bb.0:
+; DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT:    s_wait_expcnt 0x0
+; DAGISEL-NEXT:    s_wait_samplecnt 0x0
+; DAGISEL-NEXT:    s_wait_bvhcnt 0x0
+; DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; DAGISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
+; DAGISEL-NEXT:    s_clause 0x1
+; DAGISEL-NEXT:    scratch_store_b32 off, v0, s32
+; DAGISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; DAGISEL-NEXT:    s_mov_b32 s1, exec_lo
+; DAGISEL-NEXT:    v_cmpx_eq_u32_e64 v0, v1
+; DAGISEL-NEXT:  ; %bb.1: ; %if.then
+; DAGISEL-NEXT:    v_add_nc_u32_e32 v1, v0, v1
+; DAGISEL-NEXT:  ; %bb.2: ; %if.end
+; DAGISEL-NEXT:    s_wait_alu 0xfffe
+; DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; DAGISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
+; DAGISEL-NEXT:    s_clause 0x1
+; DAGISEL-NEXT:    scratch_load_b32 v0, off, s32
+; DAGISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, vcc_lo
+; DAGISEL-NEXT:    s_wait_loadcnt 0x0
+; DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: multiple_blocks:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT:    s_wait_expcnt 0x0
+; GISEL-NEXT:    s_wait_samplecnt 0x0
+; GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
+; GISEL-NEXT:    s_clause 0x1
+; GISEL-NEXT:    scratch_store_b32 off, v0, s32
+; GISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
+; GISEL-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GISEL-NEXT:    v_cmpx_eq_u32_e64 v0, v1
+; GISEL-NEXT:  ; %bb.1: ; %if.then
+; GISEL-NEXT:    v_add_nc_u32_e32 v1, v0, v1
+; GISEL-NEXT:  ; %bb.2: ; %if.end
+; GISEL-NEXT:    s_wait_alu 0xfffe
+; GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
+; GISEL-NEXT:    s_clause 0x1
+; GISEL-NEXT:    scratch_load_b32 v0, off, s32
+; GISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; GISEL-NEXT:    s_mov_b32 exec_lo, vcc_lo
+; GISEL-NEXT:    s_wait_loadcnt 0x0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: multiple_blocks:
+; DAGISEL64:       ; %bb.0:
+; DAGISEL64-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT:    s_wait_expcnt 0x0
+; DAGISEL64-NEXT:    s_wait_samplecnt 0x0
+; DAGISEL64-NEXT:    s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT:    s_wait_kmcnt 0x0
+; DAGISEL64-NEXT:    s_xor_saveexec_b64 vcc, -1
+; DAGISEL64-NEXT:    s_clause 0x1
+; DAGISEL64-NEXT:    scratch_store_b32 off, v0, s32
+; DAGISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL64-NEXT:    s_mov_b64 exec, -1
+; DAGISEL64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; DAGISEL64-NEXT:    s_mov_b64 s[2:3], exec
+; DAGISEL64-NEXT:    v_cmpx_eq_u32_e64 v0, v1
+; DAGISEL64-NEXT:  ; %bb.1: ; %if.then
+; DAGISEL64-NEXT:    v_add_nc_u32_e32 v1, v0, v1
+; DAGISEL64-NEXT:  ; %bb.2: ; %if.end
+; DAGISEL64-NEXT:    s_wait_alu 0xfffe
+; DAGISEL64-NEXT:    s_or_b64 exec, exec, s[2:3]
+; DAGISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL64-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; DAGISEL64-NEXT:    s_xor_b64 exec, vcc, -1
+; DAGISEL64-NEXT:    s_clause 0x1
+; DAGISEL64-NEXT:    scratch_load_b32 v0, off, s32
+; DAGISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL64-NEXT:    s_mov_b64 exec, vcc
+; DAGISEL64-NEXT:    s_wait_loadcnt 0x0
+; DAGISEL64-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: multiple_blocks:
+; GISEL64:       ; %bb.0:
+; GISEL64-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT:    s_wait_expcnt 0x0
+; GISEL64-NEXT:    s_wait_samplecnt 0x0
+; GISEL64-NEXT:    s_wait_bvhcnt 0x0
+; GISEL64-NEXT:    s_wait_kmcnt 0x0
+; GISEL64-NEXT:    s_xor_saveexec_b64 vcc, -1
+; GISEL64-NEXT:    s_clause 0x1
+; GISEL64-NEXT:    scratch_store_b32 off, v0, s32
+; GISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:4
+; GISEL64-NEXT:    s_mov_b64 exec, -1
+; GISEL64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL64-NEXT:    s_mov_b64 s[2:3], exec
+; GISEL64-NEXT:    v_cmpx_eq_u32_e64 v0, v1
+; GISEL64-NEXT:  ; %bb.1: ; %if.then
+; GISEL64-NEXT:    v_add_nc_u32_e32 v1, v0, v1
+; GISEL64-NEXT:  ; %bb.2: ; %if.end
+; GISEL64-NEXT:    s_wait_alu 0xfffe
+; GISEL64-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL64-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL64-NEXT:    s_xor_b64 exec, vcc, -1
+; GISEL64-NEXT:    s_clause 0x1
+; GISEL64-NEXT:    scratch_load_b32 v0, off, s32
+; GISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; GISEL64-NEXT:    s_mov_b64 exec, vcc
+; GISEL64-NEXT:    s_wait_loadcnt 0x0
+; GISEL64-NEXT:    s_setpc_b64 s[30:31]
+  %c = icmp eq i32 %a, %b
+  br i1 %c, label %if.then, label %if.end
+
+if.then:                                          ; preds = %0
+  %d = add i32 %a, %b
+  br label %if.end
+
+if.end:
+  %f = phi i32 [ %d, %if.then ], [ %b, %0 ]
+  %e = select i1 %active, i32 %a, i32 %f
+  ret i32 %e
+}
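+; %active stays live across the divergent branch above: s_xor_saveexec writes
+; the entry EXEC into vcc, the if.then/if.end re-convergence uses a separate
+; mask register (s1 in wave32, s[2:3] in wave64), and the final
+; `select i1 %active` lowers to a v_cndmask against the saved entry mask.
+; A reduced IR sketch of the same shape (illustrative only, single block):
+;
+;   define amdgpu_gfx_whole_wave i32 @active_select(i1 %active, i32 %a, i32 %b) {
+;     %sum = add i32 %a, %b
+;     %e = select i1 %active, i32 %a, i32 %sum
+;     ret i32 %e
+;   }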
+
+define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
+; DAGISEL-LABEL: ret_64:
+; DAGISEL:       ; %bb.0:
+; DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT:    s_wait_expcnt 0x0
+; DAGISEL-NEXT:    s_wait_samplecnt 0x0
+; DAGISEL-NEXT:    s_wait_bvhcnt 0x0
+; DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; DAGISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
+; DAGISEL-NEXT:    s_clause 0x3
+; DAGISEL-NEXT:    scratch_store_b32 off, v0, s32
+; DAGISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL-NEXT:    scratch_store_b32 off, v2, s32 offset:8
+; DAGISEL-NEXT:    scratch_store_b32 off, v3, s32 offset:12
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT:    s_wait_alu 0xfffe
+; DAGISEL-NEXT:    v_dual_cndmask_b32 v1, 0, v1 :: v_dual_cndmask_b32 v0, 5, v0
+; DAGISEL-NEXT:    v_dual_cndmask_b32 v2, 3, v2 :: v_dual_cndmask_b32 v3, 0, v3
+; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; DAGISEL-NEXT:    v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL-NEXT:    v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
+; DAGISEL-NEXT:    s_clause 0x3
+; DAGISEL-NEXT:    scratch_load_b32 v0, off, s32
+; DAGISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL-NEXT:    scratch_load_b32 v2, off, s32 offset:8
+; DAGISEL-NEXT:    scratch_load_b32 v3, off, s32 offset:12
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, vcc_lo
+; DAGISEL-NEXT:    s_wait_loadcnt 0x0
+; DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: ret_64:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT:    s_wait_expcnt 0x0
+; GISEL-NEXT:    s_wait_samplecnt 0x0
+; GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    scratch_store_b32 off, v0, s32
+; GISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
+; GISEL-NEXT:    scratch_store_b32 off, v2, s32 offset:8
+; GISEL-NEXT:    scratch_store_b32 off, v3, s32 offset:12
+; GISEL-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-NEXT:    s_wait_alu 0xfffe
+; GISEL-NEXT:    v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 0, v1
+; GISEL-NEXT:    v_dual_cndmask_b32 v2, 3, v2 :: v_dual_cndmask_b32 v3, 0, v3
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-NEXT:    v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL-NEXT:    v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
+; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    scratch_load_b32 v0, off, s32
+; GISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; GISEL-NEXT:    scratch_load_b32 v2, off, s32 offset:8
+; GISEL-NEXT:    scratch_load_b32 v3, off, s32 offset:12
+; GISEL-NEXT:    s_mov_b32 exec_lo, vcc_lo
+; GISEL-NEXT:    s_wait_loadcnt 0x0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: ret_64:
+; DAGISEL64:       ; %bb.0:
+; DAGISEL64-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT:    s_wait_expcnt 0x0
+; DAGISEL64-NEXT:    s_wait_samplecnt 0x0
+; DAGISEL64-NEXT:    s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT:    s_wait_kmcnt 0x0
+; DAGISEL64-NEXT:    s_xor_saveexec_b64 vcc, -1
+; DAGISEL64-NEXT:    s_clause 0x3
+; DAGISEL64-NEXT:    scratch_store_b32 off, v0, s32
+; DAGISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL64-NEXT:    scratch_store_b32 off, v2, s32 offset:8
+; DAGISEL64-NEXT:    scratch_store_b32 off, v3, s32 offset:12
+; DAGISEL64-NEXT:    s_mov_b64 exec, -1
+; DAGISEL64-NEXT:    s_wait_alu 0xfffe
+; DAGISEL64-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; DAGISEL64-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
+; DAGISEL64-NEXT:    v_cndmask_b32_e32 v2, 3, v2, vcc
+; DAGISEL64-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
+; DAGISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; DAGISEL64-NEXT:    v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL64-NEXT:    v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL64-NEXT:    s_xor_b64 exec, vcc, -1
+; DAGISEL64-NEXT:    s_clause 0x3
+; DAGISEL64-NEXT:    scratch_load_b32 v0, off, s32
+; DAGISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL64-NEXT:    scratch_load_b32 v2, off, s32 offset:8
+; DAGISEL64-NEXT:    scratch_load_b32 v3, off, s32 offset:12
+; DAGISEL64-NEXT:    s_mov_b64 exec, vcc
+; DAGISEL64-NEXT:    s_wait_loadcnt 0x0
+; DAGISEL64-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: ret_64:
+; GISEL64:       ; %bb.0:
+; GISEL64-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT:    s_wait_expcnt 0x0
+; GISEL64-NEXT:    s_wait_samplecnt 0x0
+; GISEL64-NEXT:    s_wait_bvhcnt 0x0
+; GISEL64-NEXT:    s_wait_kmcnt 0x0
+; GISEL64-NEXT:    s_xor_saveexec_b64 vcc, -1
+; GISEL64-NEXT:    s_clause 0x3
+; GISEL64-NEXT:    scratch_store_b32 off, v0, s32
+; GISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:4
+; GISEL64-NEXT:    scratch_store_b32 off, v2, s32 offset:8
+; GISEL64-NEXT:    scratch_store_b32 off, v3, s32 offset:12
+; GISEL64-NEXT:    s_mov_b64 exec, -1
+; GISEL64-NEXT:    s_wait_alu 0xfffe
+; GISEL64-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
+; GISEL64-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GISEL64-NEXT:    v_cndmask_b32_e32 v2, 3, v2, vcc
+; GISEL64-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
+; GISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL64-NEXT:    v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL64-NEXT:    v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL64-NEXT:    s_xor_b64 exec, vcc, -1
+; GISEL64-NEXT:    s_clause 0x3
+; GISEL64-NEXT:    scratch_load_b32 v0, off, s32
+; GISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; GISEL64-NEXT:    scratch_load_b32 v2, off, s32 offset:8
+; GISEL64-NEXT:    scratch_load_b32 v3, off, s32 offset:12
+; GISEL64-NEXT:    s_mov_b64 exec, vcc
+; GISEL64-NEXT:    s_wait_loadcnt 0x0
+; GISEL64-NEXT:    s_setpc_b64 s[30:31]
+  %x = select i1 %active, i64 %a, i64 5
+  %y = select i1 %active, i64 %b, i64 3
+  %ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false)
+  ret i64 %ret
+}
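+; 64-bit values are handled as register pairs here: each select on i64 becomes
+; two 32-bit v_cndmask operations against the saved entry mask, with the
+; constants split into low/high halves, and llvm.amdgcn.update.dpp.i64 expands
+; to two v_mov_b32_dpp instructions with identical DPP controls:
+;
+;   %x = select i1 %active, i64 %a, i64 5  ->  cndmask v0, 5, v0 / cndmask v1, 0, v1
+;   %y = select i1 %active, i64 %b, i64 3  ->  cndmask v2, 3, v2 / cndmask v3, 0, v3
+;
+; The wave64 runs differ only in using the full 64-bit mask (vcc rather than
+; vcc_lo) and the single-issue v_cndmask_b32 encoding.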
+
+define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i32> inreg %v4i32, float inreg %float, ptr addrspace(5) inreg %ptr, ptr addrspace(5) inreg %ptr2) {
+; DAGISEL-LABEL: inreg_args:
+; DAGISEL:       ; %bb.0:
+; DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT:    s_wait_expcnt 0x0
+; DAGISEL-NEXT:    s_wait_samplecnt 0x0
+; DAGISEL-NEXT:    s_wait_bvhcnt 0x0
+; DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; DAGISEL-NEXT:    s_clause 0x5
+; DAGISEL-NEXT:    scratch_store_b32 off, v0, s32
+; DAGISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL-NEXT:    scratch_store_b32 off, v2, s32 offset:8
+; DAGISEL-NEXT:    scratch_store_b32 off, v3, s32 offset:12
+; DAGISEL-NEXT:    scratch_store_b32 off, v4, s32 offset:16
+; DAGISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:20
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s9
+; DAGISEL-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; DAGISEL-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; DAGISEL-NEXT:    scratch_store_b32 off, v4, s10
+; DAGISEL-NEXT:    s_clause 0x1
+; DAGISEL-NEXT:    scratch_store_b128 off, v[0:3], s11
+; DAGISEL-NEXT:    scratch_store_b32 off, v5, s11
+; DAGISEL-NEXT:    s_wait_alu 0xfffe
+; DAGISEL-NEXT:    s_xor_b32 exec_lo, s0, -1
+; DAGISEL-NEXT:    s_clause 0x5
+; DAGISEL-NEXT:    scratch_load_b32 v0, off, s32
+; DAGISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL-NEXT:    scratch_load_b32 v2, off, s32 offset:8
+; DAGISEL-NEXT:    scratch_load_b32 v3, off, s32 offset:12
+; DAGISEL-NEXT:    scratch_load_b32 v4, off, s32 offset:16
+; DAGISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:20
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; DAGISEL-NEXT:    s_wait_loadcnt 0x0
+; DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: inreg_args:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT:    s_wait_expcnt 0x0
+; GISEL-NEXT:    s_wait_samplecnt 0x0
+; GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    s_xor_saveexec_b32 s34, -1
+; GISEL-NEXT:    s_clause 0x5
+; GISEL-NEXT:    scratch_store_b32 off, v0, s32
+; GISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
+; GISEL-NEXT:    scratch_store_b32 off, v2, s32 offset:8
+; GISEL-NEXT:    scratch_store_b32 off, v3, s32 offset:12
+; GISEL-NEXT:    scratch_store_b32 off, v4, s32 offset:16
+; GISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:20
+; GISEL-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-NEXT:    s_mov_b32 s0, s5
+; GISEL-NEXT:    s_mov_b32 s1, s6
+; GISEL-NEXT:    s_mov_b32 s2, s7
+; GISEL-NEXT:    s_mov_b32 s3, s8
+; GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GISEL-NEXT:    s_wait_alu 0xfffe
+; GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GISEL-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GISEL-NEXT:    v_mov_b32_e32 v5, s9
+; GISEL-NEXT:    scratch_store_b32 off, v4, s10
+; GISEL-NEXT:    s_clause 0x1
+; GISEL-NEXT:    scratch_store_b128 off, v[0:3], s11
+; GISEL-NEXT:    scratch_store_b32 off, v5, s11
+; GISEL-NEXT:    s_xor_b32 exec_lo, s34, -1
+; GISEL-NEXT:    s_clause 0x5
+; GISEL-NEXT:    scratch_load_b32 v0, off, s32
+; GISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; GISEL-NEXT:    scratch_load_b32 v2, off, s32 offset:8
+; GISEL-NEXT:    scratch_load_b32 v3, off, s32 offset:12
+; GISEL-NEXT:    scratch_load_b32 v4, off, s32 offset:16
+; GISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:20
+; GISEL-NEXT:    s_mov_b32 exec_lo, s34
+; GISEL-NEXT:    s_wait_loadcnt 0x0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: inreg_args:
+; DAGISEL64:       ; %bb.0:
+; DAGISEL64-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT:    s_wait_expcnt 0x0
+; DAGISEL64-NEXT:    s_wait_samplecnt 0x0
+; DAGISEL64-NEXT:    s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT:    s_wait_kmcnt 0x0
+; DAGISEL64-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; DAGISEL64-NEXT:    s_clause 0x5
+; DAGISEL64-NEXT:    scratch_store_b32 off, v0, s32
+; DAGISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL64-NEXT:    scratch_store_b32 off, v2, s32 offset:8
+; DAGISEL64-NEXT:    scratch_store_b32 off, v3, s32 offset:12
+; DAGISEL64-NEXT:    scratch_store_b32 off, v4, s32 offset:16
+; DAGISEL64-NEXT:    scratch_store_b32 off, v5, s32 offset:20
+; DAGISEL64-NEXT:    s_mov_b64 exec, -1
+; DAGISEL64-NEXT:    v_mov_b32_e32 v4, s4
+; DAGISEL64-NEXT:    v_mov_b32_e32 v0, s5
+; DAGISEL64-NEXT:    v_mov_b32_e32 v1, s6
+; DAGISEL64-NEXT:    v_mov_b32_e32 v2, s7
+; DAGISEL64-NEXT:    v_mov_b32_e32 v3, s8
+; DAGISEL64-NEXT:    v_mov_b32_e32 v5, s9
+; DAGISEL64-NEXT:    scratch_store_b32 off, v4, s10
+; DAGISEL64-NEXT:    s_clause 0x1
+; DAGISEL64-NEXT:    scratch_store_b128 off, v[0:3], s11
+; DAGISEL64-NEXT:    scratch_store_b32 off, v5, s11
+; DAGISEL64-NEXT:    s_wait_alu 0xfffe
+; DAGISEL64-NEXT:    s_xor_b64 exec, s[0:1], -1
+; DAGISEL64-NEXT:    s_clause 0x5
+; DAGISEL64-NEXT:    scratch_load_b32 v0, off, s32
+; DAGISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL64-NEXT:    scratch_load_b32 v2, off, s32 offset:8
+; DAGISEL64-NEXT:    scratch_load_b32 v3, off, s32 offset:12
+; DAGISEL64-NEXT:    scratch_load_b32 v4, off, s32 offset:16
+; DAGISEL64-NEXT:    scratch_load_b32 v5, off, s32 offset:20
+; DAGISEL64-NEXT:    s_mov_b64 exec, s[0:1]
+; DAGISEL64-NEXT:    s_wait_loadcnt 0x0
+; DAGISEL64-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: inreg_args:
+; GISEL64:       ; %bb.0:
+; GISEL64-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT:    s_wait_expcnt 0x0
+; GISEL64-NEXT:    s_wait_samplecnt 0x0
+; GISEL64-NEXT:    s_wait_bvhcnt 0x0
+; GISEL64-NEXT:    s_wait_kmcnt 0x0
+; GISEL64-NEXT:    s_xor_saveexec_b64 s[34:35], -1
+; GISEL64-NEXT:    s_clause 0x5
+; GISEL64-NEXT:    scratch_store_b32 off, v0, s32
+; GISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:4
+; GISEL64-NEXT:    scratch_store_b32 off, v2, s32 offset:8
+; GISEL64-NEXT:    scratch_store_b32 off, v3, s32 offset:12
+; GISEL64-NEXT:    scratch_store_b32 off, v4, s32 offset:16
+; GISEL64-NEXT:    scratch_store_b32 off, v5, s32 offset:20
+; GISEL64-NEXT:    s_mov_b64 exec, -1
+; GISEL64-NEXT:    s_mov_b32 s0, s5
+; GISEL64-NEXT:    s_mov_b32 s1, s6
+; GISEL64-NEXT:    s_mov_b32 s2, s7
+; GISEL64-NEXT:    s_mov_b32 s3, s8
+; GISEL64-NEXT:    v_mov_b32_e32 v4, s4
+; GISEL64-NEXT:    s_wait_alu 0xfffe
+; GISEL64-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL64-NEXT:    v_mov_b32_e32 v1, s1
+; GISEL64-NEXT:    v_mov_b32_e32 v2, s2
+; GISEL64-NEXT:    v_mov_b32_e32 v3, s3
+; GISEL64-NEXT:    v_mov_b32_e32 v5, s9
+; GISEL64-NEXT:    scratch_store_b32 off, v4, s10
+; GISEL64-NEXT:    s_clause 0x1
+; GISEL64-NEXT:    scratch_store_b128 off, v[0:3], s11
+; GISEL64-NEXT:    scratch_store_b32 off, v5, s11
+; GISEL64-NEXT:    s_xor_b64 exec, s[34:35], -1
+; GISEL64-NEXT:    s_clause 0x5
+; GISEL64-NEXT:    scratch_load_b32 v0, off, s32
+; GISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; GISEL64-NEXT:    scratch_load_b32 v2, off, s32 offset:8
+; GISEL64-NEXT:    scratch_load_b32 v3, off, s32 offset:12
+; GISEL64-NEXT:    scratch_load_b32 v4, off, s32 offset:16
+; GISEL64-NEXT:    scratch_load_b32 v5, off, s32 offset:20
+; GISEL64-NEXT:    s_mov_b64 exec, s[34:35]
+; GISEL64-NEXT:    s_wait_loadcnt 0x0
+; GISEL64-NEXT:    s_setpc_b64 s[30:31]
+  store i32 %i32, ptr addrspace(5) %ptr
+  store <4 x i32> %v4i32, ptr addrspace(5) %ptr2
+  store float %float, ptr addrspace(5) %ptr2
+  ret void
+}
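+; inreg arguments arrive in SGPRs and are only copied into VGPRs to feed the
+; scratch stores, so the whole-wave bracket still spills just the six VGPRs
+; used by those copies. The mapping checked above is:
+;
+;   %i32   -> s4      stored via v4     to %ptr  (s10)
+;   %v4i32 -> s[5:8]  stored via v[0:3] to %ptr2 (s11)
+;   %float -> s9      stored via v5     to %ptr2 (s11)
+;
+; GISEL picks s34/s[34:35] for the saved EXEC because it reuses s0-s3 for the
+; vector copies, while the DAG path keeps the mask in s0/s[0:1].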
+
+declare amdgpu_gfx <2 x half> @gfx_callee(<2 x half> %x, <2 x half> %y)
+
+define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 x half> %x, <2 x half> %y) {
+; DAGISEL-LABEL: call_gfx_from_whole_wave:
+; DAGISEL:       ; %bb.0:
+; DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT:    s_wait_expcnt 0x0
+; DAGISEL-NEXT:    s_wait_samplecnt 0x0
+; DAGISEL-NEXT:    s_wait_bvhcnt 0x0
+; DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; DAGISEL-NEXT:    s_mov_b32 s0, s33
+; DAGISEL-NEXT:    s_mov_b32 s33, s32
+; DAGISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    scratch_store_b32 off, v0, s33 offset:4
+; DAGISEL-NEXT:    scratch_store_b32 off, v1, s33 offset:8
+; DAGISEL-NEXT:    scratch_store_b32 off, v2, s33 offset:12
+; DAGISEL-NEXT:    scratch_store_b32 off, v3, s33 offset:16
+; DAGISEL-NEXT:    scratch_store_b32 off, v4, s33 offset:20
+; DAGISEL-NEXT:    scratch_store_b32 off, v5, s33 offset:24
+; DAGISEL-NEXT:    scratch_store_b32 off, v6, s33 offset:28
+; DAGISEL-NEXT:    scratch_store_b32 off, v7, s33 offset:32
+; DAGISEL-NEXT:    scratch_store_b32 off, v8, s33 offset:36
+; DAGISEL-NEXT:    scratch_store_b32 off, v9, s33 offset:40
+; DAGISEL-NEXT:    scratch_store_b32 off, v10, s33 offset:44
+; DAGISEL-NEXT:    scratch_store_b32 off, v11, s33 offset:48
+; DAGISEL-NEXT:    scratch_store_b32 off, v12, s33 offset:52
+; DAGISEL-NEXT:    scratch_store_b32 off, v13, s33 offset:56
+; DAGISEL-NEXT:    scratch_store_b32 off, v14, s33 offset:60
+; DAGISEL-NEXT:    scratch_store_b32 off, v15, s33 offset:64
+; DAGISEL-NEXT:    scratch_store_b32 off, v16, s33 offset:68
+; DAGISEL-NEXT:    scratch_store_b32 off, v17, s33 offset:72
+; DAGISEL-NEXT:    scratch_store_b32 off, v18, s33 offset:76
+; DAGISEL-NEXT:    scratch_store_b32 off, v19, s33 offset:80
+; DAGISEL-NEXT:    scratch_store_b32 off, v20, s33 offset:84
+; DAGISEL-NEXT:    scratch_store_b32 off, v21, s33 offset:88
+; DAGISEL-NEXT:    scratch_store_b32 off, v22, s33 offset:92
+; DAGISEL-NEXT:    scratch_store_b32 off, v23, s33 offset:96
+; DAGISEL-NEXT:    scratch_store_b32 off, v24, s33 offset:100
+; DAGISEL-NEXT:    scratch_store_b32 off, v25, s33 offset:104
+; DAGISEL-NEXT:    scratch_store_b32 off, v26, s33 offset:108
+; DAGISEL-NEXT:    scratch_store_b32 off, v27, s33 offset:112
+; DAGISEL-NEXT:    scratch_store_b32 off, v28, s33 offset:116
+; DAGISEL-NEXT:    scratch_store_b32 off, v29, s33 offset:120
+; DAGISEL-NEXT:    scratch_store_b32 off, v30, s33 offset:124
+; DAGISEL-NEXT:    scratch_store_b32 off, v31, s33 offset:128
+; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    scratch_store_b32 off, v32, s33 offset:132
+; DAGISEL-NEXT:    scratch_store_b32 off, v33, s33 offset:136
+; DAGISEL-NEXT:    scratch_store_b32 off, v34, s33 offset:140
+; DAGISEL-NEXT:    scratch_store_b32 off, v35, s33 offset:144
+; DAGISEL-NEXT:    scratch_store_b32 off, v36, s33 offset:148
+; DAGISEL-NEXT:    scratch_store_b32 off, v37, s33 offset:152
+; DAGISEL-NEXT:    scratch_store_b32 off, v38, s33 offset:156
+; DAGISEL-NEXT:    scratch_store_b32 off, v39, s33 offset:160
+; DAGISEL-NEXT:    scratch_store_b32 off, v48, s33 offset:164
+; DAGISEL-NEXT:    scratch_store_b32 off, v49, s33 offset:168
+; DAGISEL-NEXT:    scratch_store_b32 off, v50, s33 offset:172
+; DAGISEL-NEXT:    scratch_store_b32 off, v51, s33 offset:176
+; DAGISEL-NEXT:    scratch_store_b32 off, v52, s33 offset:180
+; DAGISEL-NEXT:    scratch_store_b32 off, v53, s33 offset:184
+; DAGISEL-NEXT:    scratch_store_b32 off, v54, s33 offset:188
+; DAGISEL-NEXT:    scratch_store_b32 off, v55, s33 offset:192
+; DAGISEL-NEXT:    scratch_store_b32 off, v64, s33 offset:196
+; DAGISEL-NEXT:    scratch_store_b32 off, v65, s33 offset:200
+; DAGISEL-NEXT:    scratch_store_b32 off, v66, s33 offset:204
+; DAGISEL-NEXT:    scratch_store_b32 off, v67, s33 offset:208
+; DAGISEL-NEXT:    scratch_store_b32 off, v68, s33 offset:212
+; DAGISEL-NEXT:    scratch_store_b32 off, v69, s33 offset:216
+; DAGISEL-NEXT:    scratch_store_b32 off, v70, s33 offset:220
+; DAGISEL-NEXT:    scratch_store_b32 off, v71, s33 offset:224
+; DAGISEL-NEXT:    scratch_store_b32 off, v80, s33 offset:228
+; DAGISEL-NEXT:    scratch_store_b32 off, v81, s33 offset:232
+; DAGISEL-NEXT:    scratch_store_b32 off, v82, s33 offset:236
+; DAGISEL-NEXT:    scratch_store_b32 off, v83, s33 offset:240
+; DAGISEL-NEXT:    scratch_store_b32 off, v84, s33 offset:244
+; DAGISEL-NEXT:    scratch_store_b32 off, v85, s33 offset:248
+; DAGISEL-NEXT:    scratch_store_b32 off, v86, s33 offset:252
+; DAGISEL-NEXT:    scratch_store_b32 off, v87, s33 offset:256
+; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    scratch_store_b32 off, v96, s33 offset:260
+; DAGISEL-NEXT:    scratch_store_b32 off, v97, s33 offset:264
+; DAGISEL-NEXT:    scratch_store_b32 off, v98, s33 offset:268
+; DAGISEL-NEXT:    scratch_store_b32 off, v99, s33 offset:272
+; DAGISEL-NEXT:    scratch_store_b32 off, v100, s33 offset:276
+; DAGISEL-NEXT:    scratch_store_b32 off, v101, s33 offset:280
+; DAGISEL-NEXT:    scratch_store_b32 off, v102, s33 offset:284
+; DAGISEL-NEXT:    scratch_store_b32 off, v103, s33 offset:288
+; DAGISEL-NEXT:    scratch_store_b32 off, v112, s33 offset:292
+; DAGISEL-NEXT:    scratch_store_b32 off, v113, s33 offset:296
+; DAGISEL-NEXT:    scratch_store_b32 off, v114, s33 offset:300
+; DAGISEL-NEXT:    scratch_store_b32 off, v115, s33 offset:304
+; DAGISEL-NEXT:    scratch_store_b32 off, v116, s33 offset:308
+; DAGISEL-NEXT:    scratch_store_b32 off, v117, s33 offset:312
+; DAGISEL-NEXT:    scratch_store_b32 off, v118, s33 offset:316
+; DAGISEL-NEXT:    scratch_store_b32 off, v119, s33 offset:320
+; DAGISEL-NEXT:    scratch_store_b32 off, v128, s33 offset:324
+; DAGISEL-NEXT:    scratch_store_b32 off, v129, s33 offset:328
+; DAGISEL-NEXT:    scratch_store_b32 off, v130, s33 offset:332
+; DAGISEL-NEXT:    scratch_store_b32 off, v131, s33 offset:336
+; DAGISEL-NEXT:    scratch_store_b32 off, v132, s33 offset:340
+; DAGISEL-NEXT:    scratch_store_b32 off, v133, s33 offset:344
+; DAGISEL-NEXT:    scratch_store_b32 off, v134, s33 offset:348
+; DAGISEL-NEXT:    scratch_store_b32 off, v135, s33 offset:352
+; DAGISEL-NEXT:    scratch_store_b32 off, v144, s33 offset:356
+; DAGISEL-NEXT:    scratch_store_b32 off, v145, s33 offset:360
+; DAGISEL-NEXT:    scratch_store_b32 off, v146, s33 offset:364
+; DAGISEL-NEXT:    scratch_store_b32 off, v147, s33 offset:368
+; DAGISEL-NEXT:    scratch_store_b32 off, v148, s33 offset:372
+; DAGISEL-NEXT:    scratch_store_b32 off, v149, s33 offset:376
+; DAGISEL-NEXT:    scratch_store_b32 off, v150, s33 offset:380
+; DAGISEL-NEXT:    scratch_store_b32 off, v151, s33 offset:384
+; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    scratch_store_b32 off, v160, s33 offset:388
+; DAGISEL-NEXT:    scratch_store_b32 off, v161, s33 offset:392
+; DAGISEL-NEXT:    scratch_store_b32 off, v162, s33 offset:396
+; DAGISEL-NEXT:    scratch_store_b32 off, v163, s33 offset:400
+; DAGISEL-NEXT:    scratch_store_b32 off, v164, s33 offset:404
+; DAGISEL-NEXT:    scratch_store_b32 off, v165, s33 offset:408
+; DAGISEL-NEXT:    scratch_store_b32 off, v166, s33 offset:412
+; DAGISEL-NEXT:    scratch_store_b32 off, v167, s33 offset:416
+; DAGISEL-NEXT:    scratch_store_b32 off, v176, s33 offset:420
+; DAGISEL-NEXT:    scratch_store_b32 off, v177, s33 offset:424
+; DAGISEL-NEXT:    scratch_store_b32 off, v178, s33 offset:428
+; DAGISEL-NEXT:    scratch_store_b32 off, v179, s33 offset:432
+; DAGISEL-NEXT:    scratch_store_b32 off, v180, s33 offset:436
+; DAGISEL-NEXT:    scratch_store_b32 off, v181, s33 offset:440
+; DAGISEL-NEXT:    scratch_store_b32 off, v182, s33 offset:444
+; DAGISEL-NEXT:    scratch_store_b32 off, v183, s33 offset:448
+; DAGISEL-NEXT:    scratch_store_b32 off, v192, s33 offset:452
+; DAGISEL-NEXT:    scratch_store_b32 off, v193, s33 offset:456
+; DAGISEL-NEXT:    scratch_store_b32 off, v194, s33 offset:460
+; DAGISEL-NEXT:    scratch_store_b32 off, v195, s33 offset:464
+; DAGISEL-NEXT:    scratch_store_b32 off, v196, s33 offset:468
+; DAGISEL-NEXT:    scratch_store_b32 off, v197, s33 offset:472
+; DAGISEL-NEXT:    scratch_store_b32 off, v198, s33 offset:476
+; DAGISEL-NEXT:    scratch_store_b32 off, v199, s33 offset:480
+; DAGISEL-NEXT:    scratch_store_b32 off, v208, s33 offset:484
+; DAGISEL-NEXT:    scratch_store_b32 off, v209, s33 offset:488
+; DAGISEL-NEXT:    scratch_store_b32 off, v210, s33 offset:492
+; DAGISEL-NEXT:    scratch_store_b32 off, v211, s33 offset:496
+; DAGISEL-NEXT:    scratch_store_b32 off, v212, s33 offset:500
+; DAGISEL-NEXT:    scratch_store_b32 off, v213, s33 offset:504
+; DAGISEL-NEXT:    scratch_store_b32 off, v214, s33 offset:508
+; DAGISEL-NEXT:    scratch_store_b32 off, v215, s33 offset:512
+; DAGISEL-NEXT:    s_clause 0xf
+; DAGISEL-NEXT:    scratch_store_b32 off, v224, s33 offset:516
+; DAGISEL-NEXT:    scratch_store_b32 off, v225, s33 offset:520
+; DAGISEL-NEXT:    scratch_store_b32 off, v226, s33 offset:524
+; DAGISEL-NEXT:    scratch_store_b32 off, v227, s33 offset:528
+; DAGISEL-NEXT:    scratch_store_b32 off, v228, s33 offset:532
+; DAGISEL-NEXT:    scratch_store_b32 off, v229, s33 offset:536
+; DAGISEL-NEXT:    scratch_store_b32 off, v230, s33 offset:540
+; DAGISEL-NEXT:    scratch_store_b32 off, v231, s33 offset:544
+; DAGISEL-NEXT:    scratch_store_b32 off, v240, s33 offset:548
+; DAGISEL-NEXT:    scratch_store_b32 off, v241, s33 offset:552
+; DAGISEL-NEXT:    scratch_store_b32 off, v242, s33 offset:556
+; DAGISEL-NEXT:    scratch_store_b32 off, v243, s33 offset:560
+; DAGISEL-NEXT:    scratch_store_b32 off, v244, s33 offset:564
+; DAGISEL-NEXT:    scratch_store_b32 off, v245, s33 offset:568
+; DAGISEL-NEXT:    scratch_store_b32 off, v246, s33 offset:572
+; DAGISEL-NEXT:    scratch_store_b32 off, v247, s33 offset:576
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; DAGISEL-NEXT:    s_wait_alu 0xfffe
+; DAGISEL-NEXT:    v_writelane_b32 v40, s0, 3
+; DAGISEL-NEXT:    v_mov_b32_e32 v2, v0
+; DAGISEL-NEXT:    v_swap_b32 v0, v1
+; DAGISEL-NEXT:    s_mov_b32 s1, gfx_callee@abs32@hi
+; DAGISEL-NEXT:    v_writelane_b32 v40, s4, 0
+; DAGISEL-NEXT:    s_mov_b32 s0, gfx_callee@abs32@lo
+; DAGISEL-NEXT:    s_addk_co_i32 s32, 0x250
+; DAGISEL-NEXT:    v_writelane_b32 v40, s30, 1
+; DAGISEL-NEXT:    v_writelane_b32 v40, s31, 2
+; DAGISEL-NEXT:    s_wait_alu 0xfffe
+; DAGISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT:    v_readlane_b32 s31, v40, 2
+; DAGISEL-NEXT:    v_readlane_b32 s30, v40, 1
+; DAGISEL-NEXT:    v_readlane_b32 s4, v40, 0
+; DAGISEL-NEXT:    v_readlane_b32 s0, v40, 3
+; DAGISEL-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; DAGISEL-NEXT:    s_mov_b32 s32, s33
+; DAGISEL-NEXT:    s_xor_b32 exec_lo, s4, -1
+; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    scratch_load_b32 v0, off, s33 offset:4
+; DAGISEL-NEXT:    scratch_load_b32 v1, off, s33 offset:8
+; DAGISEL-NEXT:    scratch_load_b32 v2, off, s33 offset:12
+; DAGISEL-NEXT:    scratch_load_b32 v3, off, s33 offset:16
+; DAGISEL-NEXT:    scratch_load_b32 v4, off, s33 offset:20
+; DAGISEL-NEXT:    scratch_load_b32 v5, off, s33 offset:24
+; DAGISEL-NEXT:    scratch_load_b32 v6, off, s33 offset:28
+; DAGISEL-NEXT:    scratch_load_b32 v7, off, s33 offset:32
+; DAGISEL-NEXT:    scratch_load_b32 v8, off, s33 offset:36
+; DAGISEL-NEXT:    scratch_load_b32 v9, off, s33 offset:40
+; DAGISEL-NEXT:    scratch_load_b32 v10, off, s33 offset:44
+; DAGISEL-NEXT:    scratch_load_b32 v11, off, s33 offset:48
+; DAGISEL-NEXT:    scratch_load_b32 v12, off, s33 offset:52
+; DAGISEL-NEXT:    scratch_load_b32 v13, off, s33 offset:56
+; DAGISEL-NEXT:    scratch_load_b32 v14, off, s33 offset:60
+; DAGISEL-NEXT:    scratch_load_b32 v15, off, s33 offset:64
+; DAGISEL-NEXT:    scratch_load_b32 v16, off, s33 offset:68
+; DAGISEL-NEXT:    scratch_load_b32 v17, off, s33 offset:72
+; DAGISEL-NEXT:    scratch_load_b32 v18, off, s33 offset:76
+; DAGISEL-NEXT:    scratch_load_b32 v19, off, s33 offset:80
+; DAGISEL-NEXT:    scratch_load_b32 v20, off, s33 offset:84
+; DAGISEL-NEXT:    scratch_load_b32 v21, off, s33 offset:88
+; DAGISEL-NEXT:    scratch_load_b32 v22, off, s33 offset:92
+; DAGISEL-NEXT:    scratch_load_b32 v23, off, s33 offset:96
+; DAGISEL-NEXT:    scratch_load_b32 v24, off, s33 offset:100
+; DAGISEL-NEXT:    scratch_load_b32 v25, off, s33 offset:104
+; DAGISEL-NEXT:    scratch_load_b32 v26, off, s33 offset:108
+; DAGISEL-NEXT:    scratch_load_b32 v27, off, s33 offset:112
+; DAGISEL-NEXT:    scratch_load_b32 v28, off, s33 offset:116
+; DAGISEL-NEXT:    scratch_load_b32 v29, off, s33 offset:120
+; DAGISEL-NEXT:    scratch_load_b32 v30, off, s33 offset:124
+; DAGISEL-NEXT:    scratch_load_b32 v31, off, s33 offset:128
+; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    scratch_load_b32 v32, off, s33 offset:132
+; DAGISEL-NEXT:    scratch_load_b32 v33, off, s33 offset:136
+; DAGISEL-NEXT:    scratch_load_b32 v34, off, s33 offset:140
+; DAGISEL-NEXT:    scratch_load_b32 v35, off, s33 offset:144
+; DAGISEL-NEXT:    scratch_load_b32 v36, off, s33 offset:148
+; DAGISEL-NEXT:    scratch_load_b32 v37, off, s33 offset:152
+; DAGISEL-NEXT:    scratch_load_b32 v38, off, s33 offset:156
+; DAGISEL-NEXT:    scratch_load_b32 v39, off, s33 offset:160
+; DAGISEL-NEXT:    scratch_load_b32 v48, off, s33 offset:164
+; DAGISEL-NEXT:    scratch_load_b32 v49, off, s33 offset:168
+; DAGISEL-NEXT:    scratch_load_b32 v50, off, s33 offset:172
+; DAGISEL-NEXT:    scratch_load_b32 v51, off, s33 offset:176
+; DAGISEL-NEXT:    scratch_load_b32 v52, off, s33 offset:180
+; DAGISEL-NEXT:    scratch_load_b32 v53, off, s33 offset:184
+; DAGISEL-NEXT:    scratch_load_b32 v54, off, s33 offset:188
+; DAGISEL-NEXT:    scratch_load_b32 v55, off, s33 offset:192
+; DAGISEL-NEXT:    scratch_load_b32 v64, off, s33 offset:196
+; DAGISEL-NEXT:    scratch_load_b32 v65, off, s33 offset:200
+; DAGISEL-NEXT:    scratch_load_b32 v66, off, s33 offset:204
+; DAGISEL-NEXT:    scratch_load_b32 v67, off, s33 offset:208
+; DAGISEL-NEXT:    scratch_load_b32 v68, off, s33 offset:212
+; DAGISEL-NEXT:    scratch_load_b32 v69, off, s33 offset:216
+; DAGISEL-NEXT:    scratch_load_b32 v70, off, s33 offset:220
+; DAGISEL-NEXT:    scratch_load_b32 v71, off, s33 offset:224
+; DAGISEL-NEXT:    scratch_load_b32 v80, off, s33 offset:228
+; DAGISEL-NEXT:    scratch_load_b32 v81, off, s33 offset:232
+; DAGISEL-NEXT:    scratch_load_b32 v82, off, s33 offset:236
+; DAGISEL-NEXT:    scratch_load_b32 v83, off, s33 offset:240
+; DAGISEL-NEXT:    scratch_load_b32 v84, off, s33 offset:244
+; DAGISEL-NEXT:    scratch_load_b32 v85, off, s33 offset:248
+; DAGISEL-NEXT:    scratch_load_b32 v86, off, s33 offset:252
+; DAGISEL-NEXT:    scratch_load_b32 v87, off, s33 offset:256
+; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    scratch_load_b32 v96, off, s33 offset:260
+; DAGISEL-NEXT:    scratch_load_b32 v97, off, s33 offset:264
+; DAGISEL-NEXT:    scratch_load_b32 v98, off, s33 offset:268
+; DAGISEL-NEXT:    scratch_load_b32 v99, off, s33 offset:272
+; DAGISEL-NEXT:    scratch_load_b32 v100, off, s33 offset:276
+; DAGISEL-NEXT:    scratch_load_b32 v101, off, s33 offset:280
+; DAGISEL-NEXT:    scratch_load_b32 v102, off, s33 offset:284
+; DAGISEL-NEXT:    scratch_load_b32 v103, off, s33 offset:288
+; DAGISEL-NEXT:    scratch_load_b32 v112, off, s33 offset:292
+; DAGISEL-NEXT:    scratch_load_b32 v113, off, s33 offset:296
+; DAGISEL-NEXT:    scratch_load_b32 v114, off, s33 offset:300
+; DAGISEL-NEXT:    scratch_load_b32 v115, off, s33 offset:304
+; DAGISEL-NEXT:    scratch_load_b32 v116, off, s33 offset:308
+; DAGISEL-NEXT:    scratch_load_b32 v117, off, s33 offset:312
+; DAGISEL-NEXT:    scratch_load_b32 v118, off, s33 offset:316
+; DAGISEL-NEXT:    scratch_load_b32 v119, off, s33 offset:320
+; DAGISEL-NEXT:    scratch_load_b32 v128, off, s33 offset:324
+; DAGISEL-NEXT:    scratch_load_b32 v129, off, s33 offset:328
+; DAGISEL-NEXT:    scratch_load_b32 v130, off, s33 offset:332
+; DAGISEL-NEXT:    scratch_load_b32 v131, off, s33 offset:336
+; DAGISEL-NEXT:    scratch_load_b32 v132, off, s33 offset:340
+; DAGISEL-NEXT:    scratch_load_b32 v133, off, s33 offset:344
+; DAGISEL-NEXT:    scratch_load_b32 v134, off, s33 offset:348
+; DAGISEL-NEXT:    scratch_load_b32 v135, off, s33 offset:352
+; DAGISEL-NEXT:    scratch_load_b32 v144, off, s33 offset:356
+; DAGISEL-NEXT:    scratch_load_b32 v145, off, s33 offset:360
+; DAGISEL-NEXT:    scratch_load_b32 v146, off, s33 offset:364
+; DAGISEL-NEXT:    scratch_load_b32 v147, off, s33 offset:368
+; DAGISEL-NEXT:    scratch_load_b32 v148, off, s33 offset:372
+; DAGISEL-NEXT:    scratch_load_b32 v149, off, s33 offset:376
+; DAGISEL-NEXT:    scratch_load_b32 v150, off, s33 offset:380
+; DAGISEL-NEXT:    scratch_load_b32 v151, off, s33 offset:384
+; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    scratch_load_b32 v160, off, s33 offset:388
+; DAGISEL-NEXT:    scratch_load_b32 v161, off, s33 offset:392
+; DAGISEL-NEXT:    scratch_load_b32 v162, off, s33 offset:396
+; DAGISEL-NEXT:    scratch_load_b32 v163, off, s33 offset:400
+; DAGISEL-NEXT:    scratch_load_b32 v164, off, s33 offset:404
+; DAGISEL-NEXT:    scratch_load_b32 v165, off, s33 offset:408
+; DAGISEL-NEXT:    scratch_load_b32 v166, off, s33 offset:412
+; DAGISEL-NEXT:    scratch_load_b32 v167, off, s33 offset:416
+; DAGISEL-NEXT:    scratch_load_b32 v176, off, s33 offset:420
+; DAGISEL-NEXT:    scratch_load_b32 v177, off, s33 offset:424
+; DAGISEL-NEXT:    scratch_load_b32 v178, off, s33 offset:428
+; DAGISEL-NEXT:    scratch_load_b32 v179, off, s33 offset:432
+; DAGISEL-NEXT:    scratch_load_b32 v180, off, s33 offset:436
+; DAGISEL-NEXT:    scratch_load_b32 v181, off, s33 offset:440
+; DAGISEL-NEXT:    scratch_load_b32 v182, off, s33 offset:444
+; DAGISEL-NEXT:    scratch_load_b32 v183, off, s33 offset:448
+; DAGISEL-NEXT:    scratch_load_b32 v192, off, s33 offset:452
+; DAGISEL-NEXT:    scratch_load_b32 v193, off, s33 offset:456
+; DAGISEL-NEXT:    scratch_load_b32 v194, off, s33 offset:460
+; DAGISEL-NEXT:    scratch_load_b32 v195, off, s33 offset:464
+; DAGISEL-NEXT:    scratch_load_b32 v196, off, s33 offset:468
+; DAGISEL-NEXT:    scratch_load_b32 v197, off, s33 offset:472
+; DAGISEL-NEXT:    scratch_load_b32 v198, off, s33 offset:476
+; DAGISEL-NEXT:    scratch_load_b32 v199, off, s33 offset:480
+; DAGISEL-NEXT:    scratch_load_b32 v208, off, s33 offset:484
+; DAGISEL-NEXT:    scratch_load_b32 v209, off, s33 offset:488
+; DAGISEL-NEXT:    scratch_load_b32 v210, off, s33 offset:492
+; DAGISEL-NEXT:    scratch_load_b32 v211, off, s33 offset:496
+; DAGISEL-NEXT:    scratch_load_b32 v212, off, s33 offset:500
+; DAGISEL-NEXT:    scratch_load_b32 v213, off, s33 offset:504
+; DAGISEL-NEXT:    scratch_load_b32 v214, off, s33 offset:508
+; DAGISEL-NEXT:    scratch_load_b32 v215, off, s33 offset:512
+; DAGISEL-NEXT:    s_clause 0xf
+; DAGISEL-NEXT:    scratch_load_b32 v224, off, s33 offset:516
+; DAGISEL-NEXT:    scratch_load_b32 v225, off, s33 offset:520
+; DAGISEL-NEXT:    scratch_load_b32 v226, off, s33 offset:524
+; DAGISEL-NEXT:    scratch_load_b32 v227, off, s33 offset:528
+; DAGISEL-NEXT:    scratch_load_b32 v228, off, s33 offset:532
+; DAGISEL-NEXT:    scratch_load_b32 v229, off, s33 offset:536
+; DAGISEL-NEXT:    scratch_load_b32 v230, off, s33 offset:540
+; DAGISEL-NEXT:    scratch_load_b32 v231, off, s33 offset:544
+; DAGISEL-NEXT:    scratch_load_b32 v240, off, s33 offset:548
+; DAGISEL-NEXT:    scratch_load_b32 v241, off, s33 offset:552
+; DAGISEL-NEXT:    scratch_load_b32 v242, off, s33 offset:556
+; DAGISEL-NEXT:    scratch_load_b32 v243, off, s33 offset:560
+; DAGISEL-NEXT:    scratch_load_b32 v244, off, s33 offset:564
+; DAGISEL-NEXT:    scratch_load_b32 v245, off, s33 offset:568
+; DAGISEL-NEXT:    scratch_load_b32 v246, off, s33 offset:572
+; DAGISEL-NEXT:    scratch_load_b32 v247, off, s33 offset:576
+; DAGISEL-NEXT:    s_mov_b32 exec_lo, s4
+; DAGISEL-NEXT:    s_mov_b32 s33, s0
+; DAGISEL-NEXT:    s_wait_loadcnt 0x0
+; DAGISEL-NEXT:    s_wait_alu 0xfffe
+; DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: call_gfx_from_whole_wave:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT:    s_wait_expcnt 0x0
+; GISEL-NEXT:    s_wait_samplecnt 0x0
+; GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    s_mov_b32 s0, s33
+; GISEL-NEXT:    s_mov_b32 s33, s32
+; GISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    scratch_store_b32 off, v0, s33 offset:4
+; GISEL-NEXT:    scratch_store_b32 off, v1, s33 offset:8
+; GISEL-NEXT:    scratch_store_b32 off, v2, s33 offset:12
+; GISEL-NEXT:    scratch_store_b32 off, v3, s33 offset:16
+; GISEL-NEXT:    scratch_store_b32 off, v4, s33 offset:20
+; GISEL-NEXT:    scratch_store_b32 off, v5, s33 offset:24
+; GISEL-NEXT:    scratch_store_b32 off, v6, s33 offset:28
+; GISEL-NEXT:    scratch_store_b32 off, v7, s33 offset:32
+; GISEL-NEXT:    scratch_store_b32 off, v8, s33 offset:36
+; GISEL-NEXT:    scratch_store_b32 off, v9, s33 offset:40
+; GISEL-NEXT:    scratch_store_b32 off, v10, s33 offset:44
+; GISEL-NEXT:    scratch_store_b32 off, v11, s33 offset:48
+; GISEL-NEXT:    scratch_store_b32 off, v12, s33 offset:52
+; GISEL-NEXT:    scratch_store_b32 off, v13, s33 offset:56
+; GISEL-NEXT:    scratch_store_b32 off, v14, s33 offset:60
+; GISEL-NEXT:    scratch_store_b32 off, v15, s33 offset:64
+; GISEL-NEXT:    scratch_store_b32 off, v16, s33 offset:68
+; GISEL-NEXT:    scratch_store_b32 off, v17, s33 offset:72
+; GISEL-NEXT:    scratch_store_b32 off, v18, s33 offset:76
+; GISEL-NEXT:    scratch_store_b32 off, v19, s33 offset:80
+; GISEL-NEXT:    scratch_store_b32 off, v20, s33 offset:84
+; GISEL-NEXT:    scratch_store_b32 off, v21, s33 offset:88
+; GISEL-NEXT:    scratch_store_b32 off, v22, s33 offset:92
+; GISEL-NEXT:    scratch_store_b32 off, v23, s33 offset:96
+; GISEL-NEXT:    scratch_store_b32 off, v24, s33 offset:100
+; GISEL-NEXT:    scratch_store_b32 off, v25, s33 offset:104
+; GISEL-NEXT:    scratch_store_b32 off, v26, s33 offset:108
+; GISEL-NEXT:    scratch_store_b32 off, v27, s33 offset:112
+; GISEL-NEXT:    scratch_store_b32 off, v28, s33 offset:116
+; GISEL-NEXT:    scratch_store_b32 off, v29, s33 offset:120
+; GISEL-NEXT:    scratch_store_b32 off, v30, s33 offset:124
+; GISEL-NEXT:    scratch_store_b32 off, v31, s33 offset:128
+; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    scratch_store_b32 off, v32, s33 offset:132
+; GISEL-NEXT:    scratch_store_b32 off, v33, s33 offset:136
+; GISEL-NEXT:    scratch_store_b32 off, v34, s33 offset:140
+; GISEL-NEXT:    scratch_store_b32 off, v35, s33 offset:144
+; GISEL-NEXT:    scratch_store_b32 off, v36, s33 offset:148
+; GISEL-NEXT:    scratch_store_b32 off, v37, s33 offset:152
+; GISEL-NEXT:    scratch_store_b32 off, v38, s33 offset:156
+; GISEL-NEXT:    scratch_store_b32 off, v39, s33 offset:160
+; GISEL-NEXT:    scratch_store_b32 off, v48, s33 offset:164
+; GISEL-NEXT:    scratch_store_b32 off, v49, s33 offset:168
+; GISEL-NEXT:    scratch_store_b32 off, v50, s33 offset:172
+; GISEL-NEXT:    scratch_store_b32 off, v51, s33 offset:176
+; GISEL-NEXT:    scratch_store_b32 off, v52, s33 offset:180
+; GISEL-NEXT:    scratch_store_b32 off, v53, s33 offset:184
+; GISEL-NEXT:    scratch_store_b32 off, v54, s33 offset:188
+; GISEL-NEXT:    scratch_store_b32 off, v55, s33 offset:192
+; GISEL-NEXT:    scratch_store_b32 off, v64, s33 offset:196
+; GISEL-NEXT:    scratch_store_b32 off, v65, s33 offset:200
+; GISEL-NEXT:    scratch_store_b32 off, v66, s33 offset:204
+; GISEL-NEXT:    scratch_store_b32 off, v67, s33 offset:208
+; GISEL-NEXT:    scratch_store_b32 off, v68, s33 offset:212
+; GISEL-NEXT:    scratch_store_b32 off, v69, s33 offset:216
+; GISEL-NEXT:    scratch_store_b32 off, v70, s33 offset:220
+; GISEL-NEXT:    scratch_store_b32 off, v71, s33 offset:224
+; GISEL-NEXT:    scratch_store_b32 off, v80, s33 offset:228
+; GISEL-NEXT:    scratch_store_b32 off, v81, s33 offset:232
+; GISEL-NEXT:    scratch_store_b32 off, v82, s33 offset:236
+; GISEL-NEXT:    scratch_store_b32 off, v83, s33 offset:240
+; GISEL-NEXT:    scratch_store_b32 off, v84, s33 offset:244
+; GISEL-NEXT:    scratch_store_b32 off, v85, s33 offset:248
+; GISEL-NEXT:    scratch_store_b32 off, v86, s33 offset:252
+; GISEL-NEXT:    scratch_store_b32 off, v87, s33 offset:256
+; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    scratch_store_b32 off, v96, s33 offset:260
+; GISEL-NEXT:    scratch_store_b32 off, v97, s33 offset:264
+; GISEL-NEXT:    scratch_store_b32 off, v98, s33 offset:268
+; GISEL-NEXT:    scratch_store_b32 off, v99, s33 offset:272
+; GISEL-NEXT:    scratch_store_b32 off, v100, s33 offset:276
+; GISEL-NEXT:    scratch_store_b32 off, v101, s33 offset:280
+; GISEL-NEXT:    scratch_store_b32 off, v102, s33 offset:284
+; GISEL-NEXT:    scratch_store_b32 off, v103, s33 offset:288
+; GISEL-NEXT:    scratch_store_b32 off, v112, s33 offset:292
+; GISEL-NEXT:    scratch_store_b32 off, v113, s33 offset:296
+; GISEL-NEXT:    scratch_store_b32 off, v114, s33 offset:300
+; GISEL-NEXT:    scratch_store_b32 off, v115, s33 offset:304
+; GISEL-NEXT:    scratch_store_b32 off, v116, s33 offset:308
+; GISEL-NEXT:    scratch_store_b32 off, v117, s33 offset:312
+; GISEL-NEXT:    scratch_store_b32 off, v118, s33 offset:316
+; GISEL-NEXT:    scratch_store_b32 off, v119, s33 offset:320
+; GISEL-NEXT:    scratch_store_b32 off, v128, s33 offset:324
+; GISEL-NEXT:    scratch_store_b32 off, v129, s33 offset:328
+; GISEL-NEXT:    scratch_store_b32 off, v130, s33 offset:332
+; GISEL-NEXT:    scratch_store_b32 off, v131, s33 offset:336
+; GISEL-NEXT:    scratch_store_b32 off, v132, s33 offset:340
+; GISEL-NEXT:    scratch_store_b32 off, v133, s33 offset:344
+; GISEL-NEXT:    scratch_store_b32 off, v134, s33 offset:348
+; GISEL-NEXT:    scratch_store_b32 off, v135, s33 offset:352
+; GISEL-NEXT:    scratch_store_b32 off, v144, s33 offset:356
+; GISEL-NEXT:    scratch_store_b32 off, v145, s33 offset:360
+; GISEL-NEXT:    scratch_store_b32 off, v146, s33 offset:364
+; GISEL-NEXT:    scratch_store_b32 off, v147, s33 offset:368
+; GISEL-NEXT:    scratch_store_b32 off, v148, s33 offset:372
+; GISEL-NEXT:    scratch_store_b32 off, v149, s33 offset:376
+; GISEL-NEXT:    scratch_store_b32 off, v150, s33 offset:380
+; GISEL-NEXT:    scratch_store_b32 off, v151, s33 offset:384
+; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    scratch_store_b32 off, v160, s33 offset:388
+; GISEL-NEXT:    scratch_store_b32 off, v161, s33 offset:392
+; GISEL-NEXT:    scratch_store_b32 off, v162, s33 offset:396
+; GISEL-NEXT:    scratch_store_b32 off, v163, s33 offset:400
+; GISEL-NEXT:    scratch_store_b32 off, v164, s33 offset:404
+; GISEL-NEXT:    scratch_store_b32 off, v165, s33 offset:408
+; GISEL-NEXT:    scratch_store_b32 off, v166, s33 offset:412
+; GISEL-NEXT:    scratch_store_b32 off, v167, s33 offset:416
+; GISEL-NEXT:    scratch_store_b32 off, v176, s33 offset:420
+; GISEL-NEXT:    scratch_store_b32 off, v177, s33 offset:424
+; GISEL-NEXT:    scratch_store_b32 off, v178, s33 offset:428
+; GISEL-NEXT:    scratch_store_b32 off, v179, s33 offset:432
+; GISEL-NEXT:    scratch_store_b32 off, v180, s33 offset:436
+; GISEL-NEXT:    scratch_store_b32 off, v181, s33 offset:440
+; GISEL-NEXT:    scratch_store_b32 off, v182, s33 offset:444
+; GISEL-NEXT:    scratch_store_b32 off, v183, s33 offset:448
+; GISEL-NEXT:    scratch_store_b32 off, v192, s33 offset:452
+; GISEL-NEXT:    scratch_store_b32 off, v193, s33 offset:456
+; GISEL-NEXT:    scratch_store_b32 off, v194, s33 offset:460
+; GISEL-NEXT:    scratch_store_b32 off, v195, s33 offset:464
+; GISEL-NEXT:    scratch_store_b32 off, v196, s33 offset:468
+; GISEL-NEXT:    scratch_store_b32 off, v197, s33 offset:472
+; GISEL-NEXT:    scratch_store_b32 off, v198, s33 offset:476
+; GISEL-NEXT:    scratch_store_b32 off, v199, s33 offset:480
+; GISEL-NEXT:    scratch_store_b32 off, v208, s33 offset:484
+; GISEL-NEXT:    scratch_store_b32 off, v209, s33 offset:488
+; GISEL-NEXT:    scratch_store_b32 off, v210, s33 offset:492
+; GISEL-NEXT:    scratch_store_b32 off, v211, s33 offset:496
+; GISEL-NEXT:    scratch_store_b32 off, v212, s33 offset:500
+; GISEL-NEXT:    scratch_store_b32 off, v213, s33 offset:504
+; GISEL-NEXT:    scratch_store_b32 off, v214, s33 offset:508
+; GISEL-NEXT:    scratch_store_b32 off, v215, s33 offset:512
+; GISEL-NEXT:    s_clause 0xf
+; GISEL-NEXT:    scratch_store_b32 off, v224, s33 offset:516
+; GISEL-NEXT:    scratch_store_b32 off, v225, s33 offset:520
+; GISEL-NEXT:    scratch_store_b32 off, v226, s33 offset:524
+; GISEL-NEXT:    scratch_store_b32 off, v227, s33 offset:528
+; GISEL-NEXT:    scratch_store_b32 off, v228, s33 offset:532
+; GISEL-NEXT:    scratch_store_b32 off, v229, s33 offset:536
+; GISEL-NEXT:    scratch_store_b32 off, v230, s33 offset:540
+; GISEL-NEXT:    scratch_store_b32 off, v231, s33 offset:544
+; GISEL-NEXT:    scratch_store_b32 off, v240, s33 offset:548
+; GISEL-NEXT:    scratch_store_b32 off, v241, s33 offset:552
+; GISEL-NEXT:    scratch_store_b32 off, v242, s33 offset:556
+; GISEL-NEXT:    scratch_store_b32 off, v243, s33 offset:560
+; GISEL-NEXT:    scratch_store_b32 off, v244, s33 offset:564
+; GISEL-NEXT:    scratch_store_b32 off, v245, s33 offset:568
+; GISEL-NEXT:    scratch_store_b32 off, v246, s33 offset:572
+; GISEL-NEXT:    scratch_store_b32 off, v247, s33 offset:576
+; GISEL-NEXT:    s_mov_b32 exec_lo, -1
+; GISEL-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GISEL-NEXT:    s_wait_alu 0xfffe
+; GISEL-NEXT:    v_writelane_b32 v40, s0, 3
+; GISEL-NEXT:    v_mov_b32_e32 v2, v0
+; GISEL-NEXT:    v_swap_b32 v0, v1
+; GISEL-NEXT:    s_mov_b32 s0, gfx_callee@abs32@lo
+; GISEL-NEXT:    v_writelane_b32 v40, s4, 0
+; GISEL-NEXT:    s_mov_b32 s1, gfx_callee@abs32@hi
+; GISEL-NEXT:    s_addk_co_i32 s32, 0x250
+; GISEL-NEXT:    v_writelane_b32 v40, s30, 1
+; GISEL-NEXT:    v_writelane_b32 v40, s31, 2
+; GISEL-NEXT:    s_wait_alu 0xfffe
+; GISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT:    v_readlane_b32 s31, v40, 2
+; GISEL-NEXT:    v_readlane_b32 s30, v40, 1
+; GISEL-NEXT:    v_readlane_b32 s4, v40, 0
+; GISEL-NEXT:    v_readlane_b32 s0, v40, 3
+; GISEL-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GISEL-NEXT:    s_mov_b32 s32, s33
+; GISEL-NEXT:    s_xor_b32 exec_lo, s4, -1
+; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    scratch_load_b32 v0, off, s33 offset:4
+; GISEL-NEXT:    scratch_load_b32 v1, off, s33 offset:8
+; GISEL-NEXT:    scratch_load_b32 v2, off, s33 offset:12
+; GISEL-NEXT:    scratch_load_b32 v3, off, s33 offset:16
+; GISEL-NEXT:    scratch_load_b32 v4, off, s33 offset:20
+; GISEL-NEXT:    scratch_load_b32 v5, off, s33 offset:24
+; GISEL-NEXT:    scratch_load_b32 v6, off, s33 offset:28
+; GISEL-NEXT:    scratch_load_b32 v7, off, s33 offset:32
+; GISEL-NEXT:    scratch_load_b32 v8, off, s33 offset:36
+; GISEL-NEXT:    scratch_load_b32 v9, off, s33 offset:40
+; GISEL-NEXT:    scratch_load_b32 v10, off, s33 offset:44
+; GISEL-NEXT:    scratch_load_b32 v11, off, s33 offset:48
+; GISEL-NEXT:    scratch_load_b32 v12, off, s33 offset:52
+; GISEL-NEXT:    scratch_load_b32 v13, off, s33 offset:56
+; GISEL-NEXT:    scratch_load_b32 v14, off, s33 offset:60
+; GISEL-NEXT:    scratch_load_b32 v15, off, s33 offset:64
+; GISEL-NEXT:    scratch_load_b32 v16, off, s33 offset:68
+; GISEL-NEXT:    scratch_load_b32 v17, off, s33 offset:72
+; GISEL-NEXT:    scratch_load_b32 v18, off, s33 offset:76
+; GISEL-NEXT:    scratch_load_b32 v19, off, s33 offset:80
+; GISEL-NEXT:    scratch_load_b32 v20, off, s33 offset:84
+; GISEL-NEXT:    scratch_load_b32 v21, off, s33 offset:88
+; GISEL-NEXT:    scratch_load_b32 v22, off, s33 offset:92
+; GISEL-NEXT:    scratch_load_b32 v23, off, s33 offset:96
+; GISEL-NEXT:    scratch_load_b32 v24, off, s33 offset:100
+; GISEL-NEXT:    scratch_load_b32 v25, off, s33 offset:104
+; GISEL-NEXT:    scratch_load_b32 v26, off, s33 offset:108
+; GISEL-NEXT:    scratch_load_b32 v27, off, s33 offset:112
+; GISEL-NEXT:    scratch_load_b32 v28, off, s33 offset:116
+; GISEL-NEXT:    scratch_load_b32 v29, off, s33 offset:120
+; GISEL-NEXT:    scratch_load_b32 v30, off, s33 offset:124
+; GISEL-NEXT:    scratch_load_b32 v31, off, s33 offset:128
+; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    scratch_load_b32 v32, off, s33 offset:132
+; GISEL-NEXT:    scratch_load_b32 v33, off, s33 offset:136
+; GISEL-NEXT:    scratch_load_b32 v34, off, s33 offset:140
+; GISEL-NEXT:    scratch_load_b32 v35, off, s33 offset:144
+; GISEL-NEXT:    scratch_load_b32 v36, off, s33 offset:148
+; GISEL-NEXT:    scratch_load_b32 v37, off, s33 offset:152
+; GISEL-NEXT:    scratch_load_b32 v38, off, s33 offset:156
+; GISEL-NEXT:    scratch_load_b32 v39, off, s33 offset:160
+; GISEL-NEXT:    scratch_load_b32 v48, off, s33 offset:164
+; GISEL-NEXT:    scratch_load_b32 v49, off, s33 offset:168
+; GISEL-NEXT:    scratch_load_b32 v50, off, s33 offset:172
+; GISEL-NEXT:    scratch_load_b32 v51, off, s33 offset:176
+; GISEL-NEXT:    scratch_load_b32 v52, off, s33 offset:180
+; GISEL-NEXT:    scratch_load_b32 v53, off, s33 offset:184
+; GISEL-NEXT:    scratch_load_b32 v54, off, s33 offset:188
+; GISEL-NEXT:    scratch_load_b32 v55, off, s33 offset:192
+; GISEL-NEXT:    scratch_load_b32 v64, off, s33 offset:196
+; GISEL-NEXT:    scratch_load_b32 v65, off, s33 offset:200
+; GISEL-NEXT:    scratch_load_b32 v66, off, s33 offset:204
+; GISEL-NEXT:    scratch_load_b32 v67, off, s33 offset:208
+; GISEL-NEXT:    scratch_load_b32 v68, off, s33 offset:212
+; GISEL-NEXT:    scratch_load_b32 v69, off, s33 offset:216
+; GISEL-NEXT:    scratch_load_b32 v70, off, s33 offset:220
+; GISEL-NEXT:    scratch_load_b32 v71, off, s33 offset:224
+; GISEL-NEXT:    scratch_load_b32 v80, off, s33 offset:228
+; GISEL-NEXT:    scratch_load_b32 v81, off, s33 offset:232
+; GISEL-NEXT:    scratch_load_b32 v82, off, s33 offset:236
+; GISEL-NEXT:    scratch_load_b32 v83, off, s33 offset:240
+; GISEL-NEXT:    scratch_load_b32 v84, off, s33 offset:244
+; GISEL-NEXT:    scratch_load_b32 v85, off, s33 offset:248
+; GISEL-NEXT:    scratch_load_b32 v86, off, s33 offset:252
+; GISEL-NEXT:    scratch_load_b32 v87, off, s33 offset:256
+; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    scratch_load_b32 v96, off, s33 offset:260
+; GISEL-NEXT:    scratch_load_b32 v97, off, s33 offset:264
+; GISEL-NEXT:    scratch_load_b32 v98, off, s33 offset:268
+; GISEL-NEXT:    scratch_load_b32 v99, off, s33 offset:272
+; GISEL-NEXT:    scratch_load_b32 v100, off, s33 offset:276
+; GISEL-NEXT:    scratch_load_b32 v101, off, s33 offset:280
+; GISEL-NEXT:    scratch_load_b32 v102, off, s33 offset:284
+; GISEL-NEXT:    scratch_load_b32 v103, off, s33 offset:288
+; GISEL-NEXT:    scratch_load_b32 v112, off, s33 offset:292
+; GISEL-NEXT:    scratch_load_b32 v113, off, s33 offset:296
+; GISEL-NEXT:    scratch_load_b32 v114, off, s33 offset:300
+; GISEL-NEXT:    scratch_load_b32 v115, off, s33 offset:304
+; GISEL-NEXT:    scratch_load_b32 v116, off, s33 offset:308
+; GISEL-NEXT:    scratch_load_b32 v117, off, s33 offset:312
+; GISEL-NEXT:    scratch_load_b32 v118, off, s33 offset:316
+; GISEL-NEXT:    scratch_load_b32 v119, off, s33 offset:320
+; GISEL-NEXT:    scratch_load_b32 v128, off, s33 offset:324
+; GISEL-NEXT:    scratch_load_b32 v129, off, s33 offset:328
+; GISEL-NEXT:    scratch_load_b32 v130, off, s33 offset:332
+; GISEL-NEXT:    scratch_load_b32 v131, off, s33 offset:336
+; GISEL-NEXT:    scratch_load_b32 v132, off, s33 offset:340
+; GISEL-NEXT:    scratch_load_b32 v133, off, s33 offset:344
+; GISEL-NEXT:    scratch_load_b32 v134, off, s33 offset:348
+; GISEL-NEXT:    scratch_load_b32 v135, off, s33 offset:352
+; GISEL-NEXT:    scratch_load_b32 v144, off, s33 offset:356
+; GISEL-NEXT:    scratch_load_b32 v145, off, s33 offset:360
+; GISEL-NEXT:    scratch_load_b32 v146, off, s33 offset:364
+; GISEL-NEXT:    scratch_load_b32 v147, off, s33 offset:368
+; GISEL-NEXT:    scratch_load_b32 v148, off, s33 offset:372
+; GISEL-NEXT:    scratch_load_b32 v149, off, s33 offset:376
+; GISEL-NEXT:    scratch_load_b32 v150, off, s33 offset:380
+; GISEL-NEXT:    scratch_load_b32 v151, off, s33 offset:384
+; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    scratch_load_b32 v160, off, s33 offset:388
+; GISEL-NEXT:    scratch_load_b32 v161, off, s33 offset:392
+; GISEL-NEXT:    scratch_load_b32 v162, off, s33 offset:396
+; GISEL-NEXT:    scratch_load_b32 v163, off, s33 offset:400
+; GISEL-NEXT:    scratch_load_b32 v164, off, s33 offset:404
+; GISEL-NEXT:    scratch_load_b32 v165, off, s33 offset:408
+; GISEL-NEXT:    scratch_load_b32 v166, off, s33 offset:412
+; GISEL-NEXT:    scratch_load_b32 v167, off, s33 offset:416
+; GISEL-NEXT:    scratch_load_b32 v176, off, s33 offset:420
+; GISEL-NEXT:    scratch_load_b32 v177, off, s33 offset:424
+; GISEL-NEXT:    scratch_load_b32 v178, off, s33 offset:428
+; GISEL-NEXT:    scratch_load_b32 v179, off, s33 offset:432
+; GISEL-NEXT:    scratch_load_b32 v180, off, s33 offset:436
+; GISEL-NEXT:    scratch_load_b32 v181, off, s33 offset:440
+; GISEL-NEXT:    scratch_load_b32 v182, off, s33 offset:444
+; GISEL-NEXT:    scratch_load_b32 v183, off, s33 offset:448
+; GISEL-NEXT:    scratch_load_b32 v192, off, s33 offset:452
+; GISEL-NEXT:    scratch_load_b32 v193, off, s33 offset:456
+; GISEL-NEXT:    scratch_load_b32 v194, off, s33 offset:460
+; GISEL-NEXT:    scratch_load_b32 v195, off, s33 offset:464
+; GISEL-NEXT:    scratch_load_b32 v196, off, s33 offset:468
+; GISEL-NEXT:    scratch_load_b32 v197, off, s33 offset:472
+; GISEL-NEXT:    scratch_load_b32 v198, off, s33 offset:476
+; GISEL-NEXT:    scratch_load_b32 v199, off, s33 offset:480
+; GISEL-NEXT:    scratch_load_b32 v208, off, s33 offset:484
+; GISEL-NEXT:    scratch_load_b32 v209, off, s33 offset:488
+; GISEL-NEXT:    scratch_load_b32 v210, off, s33 offset:492
+; GISEL-NEXT:    scratch_load_b32 v211, off, s33 offset:496
+; GISEL-NEXT:    scratch_load_b32 v212, off, s33 offset:500
+; GISEL-NEXT:    scratch_load_b32 v213, off, s33 offset:504
+; GISEL-NEXT:    scratch_load_b32 v214, off, s33 offset:508
+; GISEL-NEXT:    scratch_load_b32 v215, off, s33 offset:512
+; GISEL-NEXT:    s_clause 0xf
+; GISEL-NEXT:    scratch_load_b32 v224, off, s33 offset:516
+; GISEL-NEXT:    scratch_load_b32 v225, off, s33 offset:520
+; GISEL-NEXT:    scratch_load_b32 v226, off, s33 offset:524
+; GISEL-NEXT:    scratch_load_b32 v227, off, s33 offset:528
+; GISEL-NEXT:    scratch_load_b32 v228, off, s33 offset:532
+; GISEL-NEXT:    scratch_load_b32 v229, off, s33 offset:536
+; GISEL-NEXT:    scratch_load_b32 v230, off, s33 offset:540
+; GISEL-NEXT:    scratch_load_b32 v231, off, s33 offset:544
+; GISEL-NEXT:    scratch_load_b32 v240, off, s33 offset:548
+; GISEL-NEXT:    scratch_load_b32 v241, off, s33 offset:552
+; GISEL-NEXT:    scratch_load_b32 v242, off, s33 offset:556
+; GISEL-NEXT:    scratch_load_b32 v243, off, s33 offset:560
+; GISEL-NEXT:    scratch_load_b32 v244, off, s33 offset:564
+; GISEL-NEXT:    scratch_load_b32 v245, off, s33 offset:568
+; GISEL-NEXT:    scratch_load_b32 v246, off, s33 offset:572
+; GISEL-NEXT:    scratch_load_b32 v247, off, s33 offset:576
+; GISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GISEL-NEXT:    s_mov_b32 s33, s0
+; GISEL-NEXT:    s_wait_loadcnt 0x0
+; GISEL-NEXT:    s_wait_alu 0xfffe
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: call_gfx_from_whole_wave:
+; DAGISEL64:       ; %bb.0:
+; DAGISEL64-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT:    s_wait_expcnt 0x0
+; DAGISEL64-NEXT:    s_wait_samplecnt 0x0
+; DAGISEL64-NEXT:    s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT:    s_wait_kmcnt 0x0
+; DAGISEL64-NEXT:    s_mov_b32 s0, s33
+; DAGISEL64-NEXT:    s_mov_b32 s33, s32
+; DAGISEL64-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    scratch_store_b32 off, v0, s33 offset:4
+; DAGISEL64-NEXT:    scratch_store_b32 off, v1, s33 offset:8
+; DAGISEL64-NEXT:    scratch_store_b32 off, v2, s33 offset:12
+; DAGISEL64-NEXT:    scratch_store_b32 off, v3, s33 offset:16
+; DAGISEL64-NEXT:    scratch_store_b32 off, v4, s33 offset:20
+; DAGISEL64-NEXT:    scratch_store_b32 off, v5, s33 offset:24
+; DAGISEL64-NEXT:    scratch_store_b32 off, v6, s33 offset:28
+; DAGISEL64-NEXT:    scratch_store_b32 off, v7, s33 offset:32
+; DAGISEL64-NEXT:    scratch_store_b32 off, v8, s33 offset:36
+; DAGISEL64-NEXT:    scratch_store_b32 off, v9, s33 offset:40
+; DAGISEL64-NEXT:    scratch_store_b32 off, v10, s33 offset:44
+; DAGISEL64-NEXT:    scratch_store_b32 off, v11, s33 offset:48
+; DAGISEL64-NEXT:    scratch_store_b32 off, v12, s33 offset:52
+; DAGISEL64-NEXT:    scratch_store_b32 off, v13, s33 offset:56
+; DAGISEL64-NEXT:    scratch_store_b32 off, v14, s33 offset:60
+; DAGISEL64-NEXT:    scratch_store_b32 off, v15, s33 offset:64
+; DAGISEL64-NEXT:    scratch_store_b32 off, v16, s33 offset:68
+; DAGISEL64-NEXT:    scratch_store_b32 off, v17, s33 offset:72
+; DAGISEL64-NEXT:    scratch_store_b32 off, v18, s33 offset:76
+; DAGISEL64-NEXT:    scratch_store_b32 off, v19, s33 offset:80
+; DAGISEL64-NEXT:    scratch_store_b32 off, v20, s33 offset:84
+; DAGISEL64-NEXT:    scratch_store_b32 off, v21, s33 offset:88
+; DAGISEL64-NEXT:    scratch_store_b32 off, v22, s33 offset:92
+; DAGISEL64-NEXT:    scratch_store_b32 off, v23, s33 offset:96
+; DAGISEL64-NEXT:    scratch_store_b32 off, v24, s33 offset:100
+; DAGISEL64-NEXT:    scratch_store_b32 off, v25, s33 offset:104
+; DAGISEL64-NEXT:    scratch_store_b32 off, v26, s33 offset:108
+; DAGISEL64-NEXT:    scratch_store_b32 off, v27, s33 offset:112
+; DAGISEL64-NEXT:    scratch_store_b32 off, v28, s33 offset:116
+; DAGISEL64-NEXT:    scratch_store_b32 off, v29, s33 offset:120
+; DAGISEL64-NEXT:    scratch_store_b32 off, v30, s33 offset:124
+; DAGISEL64-NEXT:    scratch_store_b32 off, v31, s33 offset:128
+; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    scratch_store_b32 off, v32, s33 offset:132
+; DAGISEL64-NEXT:    scratch_store_b32 off, v33, s33 offset:136
+; DAGISEL64-NEXT:    scratch_store_b32 off, v34, s33 offset:140
+; DAGISEL64-NEXT:    scratch_store_b32 off, v35, s33 offset:144
+; DAGISEL64-NEXT:    scratch_store_b32 off, v36, s33 offset:148
+; DAGISEL64-NEXT:    scratch_store_b32 off, v37, s33 offset:152
+; DAGISEL64-NEXT:    scratch_store_b32 off, v38, s33 offset:156
+; DAGISEL64-NEXT:    scratch_store_b32 off, v39, s33 offset:160
+; DAGISEL64-NEXT:    scratch_store_b32 off, v48, s33 offset:164
+; DAGISEL64-NEXT:    scratch_store_b32 off, v49, s33 offset:168
+; DAGISEL64-NEXT:    scratch_store_b32 off, v50, s33 offset:172
+; DAGISEL64-NEXT:    scratch_store_b32 off, v51, s33 offset:176
+; DAGISEL64-NEXT:    scratch_store_b32 off, v52, s33 offset:180
+; DAGISEL64-NEXT:    scratch_store_b32 off, v53, s33 offset:184
+; DAGISEL64-NEXT:    scratch_store_b32 off, v54, s33 offset:188
+; DAGISEL64-NEXT:    scratch_store_b32 off, v55, s33 offset:192
+; DAGISEL64-NEXT:    scratch_store_b32 off, v64, s33 offset:196
+; DAGISEL64-NEXT:    scratch_store_b32 off, v65, s33 offset:200
+; DAGISEL64-NEXT:    scratch_store_b32 off, v66, s33 offset:204
+; DAGISEL64-NEXT:    scratch_store_b32 off, v67, s33 offset:208
+; DAGISEL64-NEXT:    scratch_store_b32 off, v68, s33 offset:212
+; DAGISEL64-NEXT:    scratch_store_b32 off, v69, s33 offset:216
+; DAGISEL64-NEXT:    scratch_store_b32 off, v70, s33 offset:220
+; DAGISEL64-NEXT:    scratch_store_b32 off, v71, s33 offset:224
+; DAGISEL64-NEXT:    scratch_store_b32 off, v80, s33 offset:228
+; DAGISEL64-NEXT:    scratch_store_b32 off, v81, s33 offset:232
+; DAGISEL64-NEXT:    scratch_store_b32 off, v82, s33 offset:236
+; DAGISEL64-NEXT:    scratch_store_b32 off, v83, s33 offset:240
+; DAGISEL64-NEXT:    scratch_store_b32 off, v84, s33 offset:244
+; DAGISEL64-NEXT:    scratch_store_b32 off, v85, s33 offset:248
+; DAGISEL64-NEXT:    scratch_store_b32 off, v86, s33 offset:252
+; DAGISEL64-NEXT:    scratch_store_b32 off, v87, s33 offset:256
+; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    scratch_store_b32 off, v96, s33 offset:260
+; DAGISEL64-NEXT:    scratch_store_b32 off, v97, s33 offset:264
+; DAGISEL64-NEXT:    scratch_store_b32 off, v98, s33 offset:268
+; DAGISEL64-NEXT:    scratch_store_b32 off, v99, s33 offset:272
+; DAGISEL64-NEXT:    scratch_store_b32 off, v100, s33 offset:276
+; DAGISEL64-NEXT:    scratch_store_b32 off, v101, s33 offset:280
+; DAGISEL64-NEXT:    scratch_store_b32 off, v102, s33 offset:284
+; DAGISEL64-NEXT:    scratch_store_b32 off, v103, s33 offset:288
+; DAGISEL64-NEXT:    scratch_store_b32 off, v112, s33 offset:292
+; DAGISEL64-NEXT:    scratch_store_b32 off, v113, s33 offset:296
+; DAGISEL64-NEXT:    scratch_store_b32 off, v114, s33 offset:300
+; DAGISEL64-NEXT:    scratch_store_b32 off, v115, s33 offset:304
+; DAGISEL64-NEXT:    scratch_store_b32 off, v116, s33 offset:308
+; DAGISEL64-NEXT:    scratch_store_b32 off, v117, s33 offset:312
+; DAGISEL64-NEXT:    scratch_store_b32 off, v118, s33 offset:316
+; DAGISEL64-NEXT:    scratch_store_b32 off, v119, s33 offset:320
+; DAGISEL64-NEXT:    scratch_store_b32 off, v128, s33 offset:324
+; DAGISEL64-NEXT:    scratch_store_b32 off, v129, s33 offset:328
+; DAGISEL64-NEXT:    scratch_store_b32 off, v130, s33 offset:332
+; DAGISEL64-NEXT:    scratch_store_b32 off, v131, s33 offset:336
+; DAGISEL64-NEXT:    scratch_store_b32 off, v132, s33 offset:340
+; DAGISEL64-NEXT:    scratch_store_b32 off, v133, s33 offset:344
+; DAGISEL64-NEXT:    scratch_store_b32 off, v134, s33 offset:348
+; DAGISEL64-NEXT:    scratch_store_b32 off, v135, s33 offset:352
+; DAGISEL64-NEXT:    scratch_store_b32 off, v144, s33 offset:356
+; DAGISEL64-NEXT:    scratch_store_b32 off, v145, s33 offset:360
+; DAGISEL64-NEXT:    scratch_store_b32 off, v146, s33 offset:364
+; DAGISEL64-NEXT:    scratch_store_b32 off, v147, s33 offset:368
+; DAGISEL64-NEXT:    scratch_store_b32 off, v148, s33 offset:372
+; DAGISEL64-NEXT:    scratch_store_b32 off, v149, s33 offset:376
+; DAGISEL64-NEXT:    scratch_store_b32 off, v150, s33 offset:380
+; DAGISEL64-NEXT:    scratch_store_b32 off, v151, s33 offset:384
+; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    scratch_store_b32 off, v160, s33 offset:388
+; DAGISEL64-NEXT:    scratch_store_b32 off, v161, s33 offset:392
+; DAGISEL64-NEXT:    scratch_store_b32 off, v162, s33 offset:396
+; DAGISEL64-NEXT:    scratch_store_b32 off, v163, s33 offset:400
+; DAGISEL64-NEXT:    scratch_store_b32 off, v164, s33 offset:404
+; DAGISEL64-NEXT:    scratch_store_b32 off, v165, s33 offset:408
+; DAGISEL64-NEXT:    scratch_store_b32 off, v166, s33 offset:412
+; DAGISEL64-NEXT:    scratch_store_b32 off, v167, s33 offset:416
+; DAGISEL64-NEXT:    scratch_store_b32 off, v176, s33 offset:420
+; DAGISEL64-NEXT:    scratch_store_b32 off, v177, s33 offset:424
+; DAGISEL64-NEXT:    scratch_store_b32 off, v178, s33 offset:428
+; DAGISEL64-NEXT:    scratch_store_b32 off, v179, s33 offset:432
+; DAGISEL64-NEXT:    scratch_store_b32 off, v180, s33 offset:436
+; DAGISEL64-NEXT:    scratch_store_b32 off, v181, s33 offset:440
+; DAGISEL64-NEXT:    scratch_store_b32 off, v182, s33 offset:444
+; DAGISEL64-NEXT:    scratch_store_b32 off, v183, s33 offset:448
+; DAGISEL64-NEXT:    scratch_store_b32 off, v192, s33 offset:452
+; DAGISEL64-NEXT:    scratch_store_b32 off, v193, s33 offset:456
+; DAGISEL64-NEXT:    scratch_store_b32 off, v194, s33 offset:460
+; DAGISEL64-NEXT:    scratch_store_b32 off, v195, s33 offset:464
+; DAGISEL64-NEXT:    scratch_store_b32 off, v196, s33 offset:468
+; DAGISEL64-NEXT:    scratch_store_b32 off, v197, s33 offset:472
+; DAGISEL64-NEXT:    scratch_store_b32 off, v198, s33 offset:476
+; DAGISEL64-NEXT:    scratch_store_b32 off, v199, s33 offset:480
+; DAGISEL64-NEXT:    scratch_store_b32 off, v208, s33 offset:484
+; DAGISEL64-NEXT:    scratch_store_b32 off, v209, s33 offset:488
+; DAGISEL64-NEXT:    scratch_store_b32 off, v210, s33 offset:492
+; DAGISEL64-NEXT:    scratch_store_b32 off, v211, s33 offset:496
+; DAGISEL64-NEXT:    scratch_store_b32 off, v212, s33 offset:500
+; DAGISEL64-NEXT:    scratch_store_b32 off, v213, s33 offset:504
+; DAGISEL64-NEXT:    scratch_store_b32 off, v214, s33 offset:508
+; DAGISEL64-NEXT:    scratch_store_b32 off, v215, s33 offset:512
+; DAGISEL64-NEXT:    s_clause 0xf
+; DAGISEL64-NEXT:    scratch_store_b32 off, v224, s33 offset:516
+; DAGISEL64-NEXT:    scratch_store_b32 off, v225, s33 offset:520
+; DAGISEL64-NEXT:    scratch_store_b32 off, v226, s33 offset:524
+; DAGISEL64-NEXT:    scratch_store_b32 off, v227, s33 offset:528
+; DAGISEL64-NEXT:    scratch_store_b32 off, v228, s33 offset:532
+; DAGISEL64-NEXT:    scratch_store_b32 off, v229, s33 offset:536
+; DAGISEL64-NEXT:    scratch_store_b32 off, v230, s33 offset:540
+; DAGISEL64-NEXT:    scratch_store_b32 off, v231, s33 offset:544
+; DAGISEL64-NEXT:    scratch_store_b32 off, v240, s33 offset:548
+; DAGISEL64-NEXT:    scratch_store_b32 off, v241, s33 offset:552
+; DAGISEL64-NEXT:    scratch_store_b32 off, v242, s33 offset:556
+; DAGISEL64-NEXT:    scratch_store_b32 off, v243, s33 offset:560
+; DAGISEL64-NEXT:    scratch_store_b32 off, v244, s33 offset:564
+; DAGISEL64-NEXT:    scratch_store_b32 off, v245, s33 offset:568
+; DAGISEL64-NEXT:    scratch_store_b32 off, v246, s33 offset:572
+; DAGISEL64-NEXT:    scratch_store_b32 off, v247, s33 offset:576
+; DAGISEL64-NEXT:    s_mov_b64 exec, -1
+; DAGISEL64-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; DAGISEL64-NEXT:    s_wait_alu 0xfffe
+; DAGISEL64-NEXT:    v_writelane_b32 v40, s0, 4
+; DAGISEL64-NEXT:    v_mov_b32_e32 v2, v0
+; DAGISEL64-NEXT:    v_swap_b32 v0, v1
+; DAGISEL64-NEXT:    s_mov_b32 s1, gfx_callee@abs32@hi
+; DAGISEL64-NEXT:    v_writelane_b32 v40, s4, 0
+; DAGISEL64-NEXT:    s_mov_b32 s0, gfx_callee@abs32@lo
+; DAGISEL64-NEXT:    s_addk_co_i32 s32, 0x250
+; DAGISEL64-NEXT:    v_writelane_b32 v40, s5, 1
+; DAGISEL64-NEXT:    v_writelane_b32 v40, s30, 2
+; DAGISEL64-NEXT:    v_writelane_b32 v40, s31, 3
+; DAGISEL64-NEXT:    s_wait_alu 0xfffe
+; DAGISEL64-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL64-NEXT:    v_readlane_b32 s31, v40, 3
+; DAGISEL64-NEXT:    v_readlane_b32 s30, v40, 2
+; DAGISEL64-NEXT:    v_readlane_b32 s5, v40, 1
+; DAGISEL64-NEXT:    v_readlane_b32 s4, v40, 0
+; DAGISEL64-NEXT:    v_readlane_b32 s0, v40, 4
+; DAGISEL64-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; DAGISEL64-NEXT:    s_mov_b32 s32, s33
+; DAGISEL64-NEXT:    s_xor_b64 exec, s[4:5], -1
+; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    scratch_load_b32 v0, off, s33 offset:4
+; DAGISEL64-NEXT:    scratch_load_b32 v1, off, s33 offset:8
+; DAGISEL64-NEXT:    scratch_load_b32 v2, off, s33 offset:12
+; DAGISEL64-NEXT:    scratch_load_b32 v3, off, s33 offset:16
+; DAGISEL64-NEXT:    scratch_load_b32 v4, off, s33 offset:20
+; DAGISEL64-NEXT:    scratch_load_b32 v5, off, s33 offset:24
+; DAGISEL64-NEXT:    scratch_load_b32 v6, off, s33 offset:28
+; DAGISEL64-NEXT:    scratch_load_b32 v7, off, s33 offset:32
+; DAGISEL64-NEXT:    scratch_load_b32 v8, off, s33 offset:36
+; DAGISEL64-NEXT:    scratch_load_b32 v9, off, s33 offset:40
+; DAGISEL64-NEXT:    scratch_load_b32 v10, off, s33 offset:44
+; DAGISEL64-NEXT:    scratch_load_b32 v11, off, s33 offset:48
+; DAGISEL64-NEXT:    scratch_load_b32 v12, off, s33 offset:52
+; DAGISEL64-NEXT:    scratch_load_b32 v13, off, s33 offset:56
+; DAGISEL64-NEXT:    scratch_load_b32 v14, off, s33 offset:60
+; DAGISEL64-NEXT:    scratch_load_b32 v15, off, s33 offset:64
+; DAGISEL64-NEXT:    scratch_load_b32 v16, off, s33 offset:68
+; DAGISEL64-NEXT:    scratch_load_b32 v17, off, s33 offset:72
+; DAGISEL64-NEXT:    scratch_load_b32 v18, off, s33 offset:76
+; DAGISEL64-NEXT:    scratch_load_b32 v19, off, s33 offset:80
+; DAGISEL64-NEXT:    scratch_load_b32 v20, off, s33 offset:84
+; DAGISEL64-NEXT:    scratch_load_b32 v21, off, s33 offset:88
+; DAGISEL64-NEXT:    scratch_load_b32 v22, off, s33 offset:92
+; DAGISEL64-NEXT:    scratch_load_b32 v23, off, s33 offset:96
+; DAGISEL64-NEXT:    scratch_load_b32 v24, off, s33 offset:100
+; DAGISEL64-NEXT:    scratch_load_b32 v25, off, s33 offset:104
+; DAGISEL64-NEXT:    scratch_load_b32 v26, off, s33 offset:108
+; DAGISEL64-NEXT:    scratch_load_b32 v27, off, s33 offset:112
+; DAGISEL64-NEXT:    scratch_load_b32 v28, off, s33 offset:116
+; DAGISEL64-NEXT:    scratch_load_b32 v29, off, s33 offset:120
+; DAGISEL64-NEXT:    scratch_load_b32 v30, off, s33 offset:124
+; DAGISEL64-NEXT:    scratch_load_b32 v31, off, s33 offset:128
+; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    scratch_load_b32 v32, off, s33 offset:132
+; DAGISEL64-NEXT:    scratch_load_b32 v33, off, s33 offset:136
+; DAGISEL64-NEXT:    scratch_load_b32 v34, off, s33 offset:140
+; DAGISEL64-NEXT:    scratch_load_b32 v35, off, s33 offset:144
+; DAGISEL64-NEXT:    scratch_load_b32 v36, off, s33 offset:148
+; DAGISEL64-NEXT:    scratch_load_b32 v37, off, s33 offset:152
+; DAGISEL64-NEXT:    scratch_load_b32 v38, off, s33 offset:156
+; DAGISEL64-NEXT:    scratch_load_b32 v39, off, s33 offset:160
+; DAGISEL64-NEXT:    scratch_load_b32 v48, off, s33 offset:164
+; DAGISEL64-NEXT:    scratch_load_b32 v49, off, s33 offset:168
+; DAGISEL64-NEXT:    scratch_load_b32 v50, off, s33 offset:172
+; DAGISEL64-NEXT:    scratch_load_b32 v51, off, s33 offset:176
+; DAGISEL64-NEXT:    scratch_load_b32 v52, off, s33 offset:180
+; DAGISEL64-NEXT:    scratch_load_b32 v53, off, s33 offset:184
+; DAGISEL64-NEXT:    scratch_load_b32 v54, off, s33 offset:188
+; DAGISEL64-NEXT:    scratch_load_b32 v55, off, s33 offset:192
+; DAGISEL64-NEXT:    scratch_load_b32 v64, off, s33 offset:196
+; DAGISEL64-NEXT:    scratch_load_b32 v65, off, s33 offset:200
+; DAGISEL64-NEXT:    scratch_load_b32 v66, off, s33 offset:204
+; DAGISEL64-NEXT:    scratch_load_b32 v67, off, s33 offset:208
+; DAGISEL64-NEXT:    scratch_load_b32 v68, off, s33 offset:212
+; DAGISEL64-NEXT:    scratch_load_b32 v69, off, s33 offset:216
+; DAGISEL64-NEXT:    scratch_load_b32 v70, off, s33 offset:220
+; DAGISEL64-NEXT:    scratch_load_b32 v71, off, s33 offset:224
+; DAGISEL64-NEXT:    scratch_load_b32 v80, off, s33 offset:228
+; DAGISEL64-NEXT:    scratch_load_b32 v81, off, s33 offset:232
+; DAGISEL64-NEXT:    scratch_load_b32 v82, off, s33 offset:236
+; DAGISEL64-NEXT:    scratch_load_b32 v83, off, s33 offset:240
+; DAGISEL64-NEXT:    scratch_load_b32 v84, off, s33 offset:244
+; DAGISEL64-NEXT:    scratch_load_b32 v85, off, s33 offset:248
+; DAGISEL64-NEXT:    scratch_load_b32 v86, off, s33 offset:252
+; DAGISEL64-NEXT:    scratch_load_b32 v87, off, s33 offset:256
+; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    scratch_load_b32 v96, off, s33 offset:260
+; DAGISEL64-NEXT:    scratch_load_b32 v97, off, s33 offset:264
+; DAGISEL64-NEXT:    scratch_load_b32 v98, off, s33 offset:268
+; DAGISEL64-NEXT:    scratch_load_b32 v99, off, s33 offset:272
+; DAGISEL64-NEXT:    scratch_load_b32 v100, off, s33 offset:276
+; DAGISEL64-NEXT:    scratch_load_b32 v101, off, s33 offset:280
+; DAGISEL64-NEXT:    scratch_load_b32 v102, off, s33 offset:284
+; DAGISEL64-NEXT:    scratch_load_b32 v103, off, s33 offset:288
+; DAGISEL64-NEXT:    scratch_load_b32 v112, off, s33 offset:292
+; DAGISEL64-NEXT:    scratch_load_b32 v113, off, s33 offset:296
+; DAGISEL64-NEXT:    scratch_load_b32 v114, off, s33 offset:300
+; DAGISEL64-NEXT:    scratch_load_b32 v115, off, s33 offset:304
+; DAGISEL64-NEXT:    scratch_load_b32 v116, off, s33 offset:308
+; DAGISEL64-NEXT:    scratch_load_b32 v117, off, s33 offset:312
+; DAGISEL64-NEXT:    scratch_load_b32 v118, off, s33 offset:316
+; DAGISEL64-NEXT:    scratch_load_b32 v119, off, s33 offset:320
+; DAGISEL64-NEXT:    scratch_load_b32 v128, off, s33 offset:324
+; DAGISEL64-NEXT:    scratch_load_b32 v129, off, s33 offset:328
+; DAGISEL64-NEXT:    scratch_load_b32 v130, off, s33 offset:332
+; DAGISEL64-NEXT:    scratch_load_b32 v131, off, s33 offset:336
+; DAGISEL64-NEXT:    scratch_load_b32 v132, off, s33 offset:340
+; DAGISEL64-NEXT:    scratch_load_b32 v133, off, s33 offset:344
+; DAGISEL64-NEXT:    scratch_load_b32 v134, off, s33 offset:348
+; DAGISEL64-NEXT:    scratch_load_b32 v135, off, s33 offset:352
+; DAGISEL64-NEXT:    scratch_load_b32 v144, off, s33 offset:356
+; DAGISEL64-NEXT:    scratch_load_b32 v145, off, s33 offset:360
+; DAGISEL64-NEXT:    scratch_load_b32 v146, off, s33 offset:364
+; DAGISEL64-NEXT:    scratch_load_b32 v147, off, s33 offset:368
+; DAGISEL64-NEXT:    scratch_load_b32 v148, off, s33 offset:372
+; DAGISEL64-NEXT:    scratch_load_b32 v149, off, s33 offset:376
+; DAGISEL64-NEXT:    scratch_load_b32 v150, off, s33 offset:380
+; DAGISEL64-NEXT:    scratch_load_b32 v151, off, s33 offset:384
+; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    scratch_load_b32 v160, off, s33 offset:388
+; DAGISEL64-NEXT:    scratch_load_b32 v161, off, s33 offset:392
+; DAGISEL64-NEXT:    scratch_load_b32 v162, off, s33 offset:396
+; DAGISEL64-NEXT:    scratch_load_b32 v163, off, s33 offset:400
+; DAGISEL64-NEXT:    scratch_load_b32 v164, off, s33 offset:404
+; DAGISEL64-NEXT:    scratch_load_b32 v165, off, s33 offset:408
+; DAGISEL64-NEXT:    scratch_load_b32 v166, off, s33 offset:412
+; DAGISEL64-NEXT:    scratch_load_b32 v167, off, s33 offset:416
+; DAGISEL64-NEXT:    scratch_load_b32 v176, off, s33 offset:420
+; DAGISEL64-NEXT:    scratch_load_b32 v177, off, s33 offset:424
+; DAGISEL64-NEXT:    scratch_load_b32 v178, off, s33 offset:428
+; DAGISEL64-NEXT:    scratch_load_b32 v179, off, s33 offset:432
+; DAGISEL64-NEXT:    scratch_load_b32 v180, off, s33 offset:436
+; DAGISEL64-NEXT:    scratch_load_b32 v181, off, s33 offset:440
+; DAGISEL64-NEXT:    scratch_load_b32 v182, off, s33 offset:444
+; DAGISEL64-NEXT:    scratch_load_b32 v183, off, s33 offset:448
+; DAGISEL64-NEXT:    scratch_load_b32 v192, off, s33 offset:452
+; DAGISEL64-NEXT:    scratch_load_b32 v193, off, s33 offset:456
+; DAGISEL64-NEXT:    scratch_load_b32 v194, off, s33 offset:460
+; DAGISEL64-NEXT:    scratch_load_b32 v195, off, s33 offset:464
+; DAGISEL64-NEXT:    scratch_load_b32 v196, off, s33 offset:468
+; DAGISEL64-NEXT:    scratch_load_b32 v197, off, s33 offset:472
+; DAGISEL64-NEXT:    scratch_load_b32 v198, off, s33 offset:476
+; DAGISEL64-NEXT:    scratch_load_b32 v199, off, s33 offset:480
+; DAGISEL64-NEXT:    scratch_load_b32 v208, off, s33 offset:484
+; DAGISEL64-NEXT:    scratch_load_b32 v209, off, s33 offset:488
+; DAGISEL64-NEXT:    scratch_load_b32 v210, off, s33 offset:492
+; DAGISEL64-NEXT:    scratch_load_b32 v211, off, s33 offset:496
+; DAGISEL64-NEXT:    scratch_load_b32 v212, off, s33 offset:500
+; DAGISEL64-NEXT:    scratch_load_b32 v213, off, s33 offset:504
+; DAGISEL64-NEXT:    scratch_load_b32 v214, off, s33 offset:508
+; DAGISEL64-NEXT:    scratch_load_b32 v215, off, s33 offset:512
+; DAGISEL64-NEXT:    s_clause 0xf
+; DAGISEL64-NEXT:    scratch_load_b32 v224, off, s33 offset:516
+; DAGISEL64-NEXT:    scratch_load_b32 v225, off, s33 offset:520
+; DAGISEL64-NEXT:    scratch_load_b32 v226, off, s33 offset:524
+; DAGISEL64-NEXT:    scratch_load_b32 v227, off, s33 offset:528
+; DAGISEL64-NEXT:    scratch_load_b32 v228, off, s33 offset:532
+; DAGISEL64-NEXT:    scratch_load_b32 v229, off, s33 offset:536
+; DAGISEL64-NEXT:    scratch_load_b32 v230, off, s33 offset:540
+; DAGISEL64-NEXT:    scratch_load_b32 v231, off, s33 offset:544
+; DAGISEL64-NEXT:    scratch_load_b32 v240, off, s33 offset:548
+; DAGISEL64-NEXT:    scratch_load_b32 v241, off, s33 offset:552
+; DAGISEL64-NEXT:    scratch_load_b32 v242, off, s33 offset:556
+; DAGISEL64-NEXT:    scratch_load_b32 v243, off, s33 offset:560
+; DAGISEL64-NEXT:    scratch_load_b32 v244, off, s33 offset:564
+; DAGISEL64-NEXT:    scratch_load_b32 v245, off, s33 offset:568
+; DAGISEL64-NEXT:    scratch_load_b32 v246, off, s33 offset:572
+; DAGISEL64-NEXT:    scratch_load_b32 v247, off, s33 offset:576
+; DAGISEL64-NEXT:    s_mov_b64 exec, s[4:5]
+; DAGISEL64-NEXT:    s_mov_b32 s33, s0
+; DAGISEL64-NEXT:    s_wait_loadcnt 0x0
+; DAGISEL64-NEXT:    s_wait_alu 0xfffe
+; DAGISEL64-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: call_gfx_from_whole_wave:
+; GISEL64:       ; %bb.0:
+; GISEL64-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT:    s_wait_expcnt 0x0
+; GISEL64-NEXT:    s_wait_samplecnt 0x0
+; GISEL64-NEXT:    s_wait_bvhcnt 0x0
+; GISEL64-NEXT:    s_wait_kmcnt 0x0
+; GISEL64-NEXT:    s_mov_b32 s0, s33
+; GISEL64-NEXT:    s_mov_b32 s33, s32
+; GISEL64-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    scratch_store_b32 off, v0, s33 offset:4
+; GISEL64-NEXT:    scratch_store_b32 off, v1, s33 offset:8
+; GISEL64-NEXT:    scratch_store_b32 off, v2, s33 offset:12
+; GISEL64-NEXT:    scratch_store_b32 off, v3, s33 offset:16
+; GISEL64-NEXT:    scratch_store_b32 off, v4, s33 offset:20
+; GISEL64-NEXT:    scratch_store_b32 off, v5, s33 offset:24
+; GISEL64-NEXT:    scratch_store_b32 off, v6, s33 offset:28
+; GISEL64-NEXT:    scratch_store_b32 off, v7, s33 offset:32
+; GISEL64-NEXT:    scratch_store_b32 off, v8, s33 offset:36
+; GISEL64-NEXT:    scratch_store_b32 off, v9, s33 offset:40
+; GISEL64-NEXT:    scratch_store_b32 off, v10, s33 offset:44
+; GISEL64-NEXT:    scratch_store_b32 off, v11, s33 offset:48
+; GISEL64-NEXT:    scratch_store_b32 off, v12, s33 offset:52
+; GISEL64-NEXT:    scratch_store_b32 off, v13, s33 offset:56
+; GISEL64-NEXT:    scratch_store_b32 off, v14, s33 offset:60
+; GISEL64-NEXT:    scratch_store_b32 off, v15, s33 offset:64
+; GISEL64-NEXT:    scratch_store_b32 off, v16, s33 offset:68
+; GISEL64-NEXT:    scratch_store_b32 off, v17, s33 offset:72
+; GISEL64-NEXT:    scratch_store_b32 off, v18, s33 offset:76
+; GISEL64-NEXT:    scratch_store_b32 off, v19, s33 offset:80
+; GISEL64-NEXT:    scratch_store_b32 off, v20, s33 offset:84
+; GISEL64-NEXT:    scratch_store_b32 off, v21, s33 offset:88
+; GISEL64-NEXT:    scratch_store_b32 off, v22, s33 offset:92
+; GISEL64-NEXT:    scratch_store_b32 off, v23, s33 offset:96
+; GISEL64-NEXT:    scratch_store_b32 off, v24, s33 offset:100
+; GISEL64-NEXT:    scratch_store_b32 off, v25, s33 offset:104
+; GISEL64-NEXT:    scratch_store_b32 off, v26, s33 offset:108
+; GISEL64-NEXT:    scratch_store_b32 off, v27, s33 offset:112
+; GISEL64-NEXT:    scratch_store_b32 off, v28, s33 offset:116
+; GISEL64-NEXT:    scratch_store_b32 off, v29, s33 offset:120
+; GISEL64-NEXT:    scratch_store_b32 off, v30, s33 offset:124
+; GISEL64-NEXT:    scratch_store_b32 off, v31, s33 offset:128
+; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    scratch_store_b32 off, v32, s33 offset:132
+; GISEL64-NEXT:    scratch_store_b32 off, v33, s33 offset:136
+; GISEL64-NEXT:    scratch_store_b32 off, v34, s33 offset:140
+; GISEL64-NEXT:    scratch_store_b32 off, v35, s33 offset:144
+; GISEL64-NEXT:    scratch_store_b32 off, v36, s33 offset:148
+; GISEL64-NEXT:    scratch_store_b32 off, v37, s33 offset:152
+; GISEL64-NEXT:    scratch_store_b32 off, v38, s33 offset:156
+; GISEL64-NEXT:    scratch_store_b32 off, v39, s33 offset:160
+; GISEL64-NEXT:    scratch_store_b32 off, v48, s33 offset:164
+; GISEL64-NEXT:    scratch_store_b32 off, v49, s33 offset:168
+; GISEL64-NEXT:    scratch_store_b32 off, v50, s33 offset:172
+; GISEL64-NEXT:    scratch_store_b32 off, v51, s33 offset:176
+; GISEL64-NEXT:    scratch_store_b32 off, v52, s33 offset:180
+; GISEL64-NEXT:    scratch_store_b32 off, v53, s33 offset:184
+; GISEL64-NEXT:    scratch_store_b32 off, v54, s33 offset:188
+; GISEL64-NEXT:    scratch_store_b32 off, v55, s33 offset:192
+; GISEL64-NEXT:    scratch_store_b32 off, v64, s33 offset:196
+; GISEL64-NEXT:    scratch_store_b32 off, v65, s33 offset:200
+; GISEL64-NEXT:    scratch_store_b32 off, v66, s33 offset:204
+; GISEL64-NEXT:    scratch_store_b32 off, v67, s33 offset:208
+; GISEL64-NEXT:    scratch_store_b32 off, v68, s33 offset:212
+; GISEL64-NEXT:    scratch_store_b32 off, v69, s33 offset:216
+; GISEL64-NEXT:    scratch_store_b32 off, v70, s33 offset:220
+; GISEL64-NEXT:    scratch_store_b32 off, v71, s33 offset:224
+; GISEL64-NEXT:    scratch_store_b32 off, v80, s33 offset:228
+; GISEL64-NEXT:    scratch_store_b32 off, v81, s33 offset:232
+; GISEL64-NEXT:    scratch_store_b32 off, v82, s33 offset:236
+; GISEL64-NEXT:    scratch_store_b32 off, v83, s33 offset:240
+; GISEL64-NEXT:    scratch_store_b32 off, v84, s33 offset:244
+; GISEL64-NEXT:    scratch_store_b32 off, v85, s33 offset:248
+; GISEL64-NEXT:    scratch_store_b32 off, v86, s33 offset:252
+; GISEL64-NEXT:    scratch_store_b32 off, v87, s33 offset:256
+; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    scratch_store_b32 off, v96, s33 offset:260
+; GISEL64-NEXT:    scratch_store_b32 off, v97, s33 offset:264
+; GISEL64-NEXT:    scratch_store_b32 off, v98, s33 offset:268
+; GISEL64-NEXT:    scratch_store_b32 off, v99, s33 offset:272
+; GISEL64-NEXT:    scratch_store_b32 off, v100, s33 offset:276
+; GISEL64-NEXT:    scratch_store_b32 off, v101, s33 offset:280
+; GISEL64-NEXT:    scratch_store_b32 off, v102, s33 offset:284
+; GISEL64-NEXT:    scratch_store_b32 off, v103, s33 offset:288
+; GISEL64-NEXT:    scratch_store_b32 off, v112, s33 offset:292
+; GISEL64-NEXT:    scratch_store_b32 off, v113, s33 offset:296
+; GISEL64-NEXT:    scratch_store_b32 off, v114, s33 offset:300
+; GISEL64-NEXT:    scratch_store_b32 off, v115, s33 offset:304
+; GISEL64-NEXT:    scratch_store_b32 off, v116, s33 offset:308
+; GISEL64-NEXT:    scratch_store_b32 off, v117, s33 offset:312
+; GISEL64-NEXT:    scratch_store_b32 off, v118, s33 offset:316
+; GISEL64-NEXT:    scratch_store_b32 off, v119, s33 offset:320
+; GISEL64-NEXT:    scratch_store_b32 off, v128, s33 offset:324
+; GISEL64-NEXT:    scratch_store_b32 off, v129, s33 offset:328
+; GISEL64-NEXT:    scratch_store_b32 off, v130, s33 offset:332
+; GISEL64-NEXT:    scratch_store_b32 off, v131, s33 offset:336
+; GISEL64-NEXT:    scratch_store_b32 off, v132, s33 offset:340
+; GISEL64-NEXT:    scratch_store_b32 off, v133, s33 offset:344
+; GISEL64-NEXT:    scratch_store_b32 off, v134, s33 offset:348
+; GISEL64-NEXT:    scratch_store_b32 off, v135, s33 offset:352
+; GISEL64-NEXT:    scratch_store_b32 off, v144, s33 offset:356
+; GISEL64-NEXT:    scratch_store_b32 off, v145, s33 offset:360
+; GISEL64-NEXT:    scratch_store_b32 off, v146, s33 offset:364
+; GISEL64-NEXT:    scratch_store_b32 off, v147, s33 offset:368
+; GISEL64-NEXT:    scratch_store_b32 off, v148, s33 offset:372
+; GISEL64-NEXT:    scratch_store_b32 off, v149, s33 offset:376
+; GISEL64-NEXT:    scratch_store_b32 off, v150, s33 offset:380
+; GISEL64-NEXT:    scratch_store_b32 off, v151, s33 offset:384
+; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    scratch_store_b32 off, v160, s33 offset:388
+; GISEL64-NEXT:    scratch_store_b32 off, v161, s33 offset:392
+; GISEL64-NEXT:    scratch_store_b32 off, v162, s33 offset:396
+; GISEL64-NEXT:    scratch_store_b32 off, v163, s33 offset:400
+; GISEL64-NEXT:    scratch_store_b32 off, v164, s33 offset:404
+; GISEL64-NEXT:    scratch_store_b32 off, v165, s33 offset:408
+; GISEL64-NEXT:    scratch_store_b32 off, v166, s33 offset:412
+; GISEL64-NEXT:    scratch_store_b32 off, v167, s33 offset:416
+; GISEL64-NEXT:    scratch_store_b32 off, v176, s33 offset:420
+; GISEL64-NEXT:    scratch_store_b32 off, v177, s33 offset:424
+; GISEL64-NEXT:    scratch_store_b32 off, v178, s33 offset:428
+; GISEL64-NEXT:    scratch_store_b32 off, v179, s33 offset:432
+; GISEL64-NEXT:    scratch_store_b32 off, v180, s33 offset:436
+; GISEL64-NEXT:    scratch_store_b32 off, v181, s33 offset:440
+; GISEL64-NEXT:    scratch_store_b32 off, v182, s33 offset:444
+; GISEL64-NEXT:    scratch_store_b32 off, v183, s33 offset:448
+; GISEL64-NEXT:    scratch_store_b32 off, v192, s33 offset:452
+; GISEL64-NEXT:    scratch_store_b32 off, v193, s33 offset:456
+; GISEL64-NEXT:    scratch_store_b32 off, v194, s33 offset:460
+; GISEL64-NEXT:    scratch_store_b32 off, v195, s33 offset:464
+; GISEL64-NEXT:    scratch_store_b32 off, v196, s33 offset:468
+; GISEL64-NEXT:    scratch_store_b32 off, v197, s33 offset:472
+; GISEL64-NEXT:    scratch_store_b32 off, v198, s33 offset:476
+; GISEL64-NEXT:    scratch_store_b32 off, v199, s33 offset:480
+; GISEL64-NEXT:    scratch_store_b32 off, v208, s33 offset:484
+; GISEL64-NEXT:    scratch_store_b32 off, v209, s33 offset:488
+; GISEL64-NEXT:    scratch_store_b32 off, v210, s33 offset:492
+; GISEL64-NEXT:    scratch_store_b32 off, v211, s33 offset:496
+; GISEL64-NEXT:    scratch_store_b32 off, v212, s33 offset:500
+; GISEL64-NEXT:    scratch_store_b32 off, v213, s33 offset:504
+; GISEL64-NEXT:    scratch_store_b32 off, v214, s33 offset:508
+; GISEL64-NEXT:    scratch_store_b32 off, v215, s33 offset:512
+; GISEL64-NEXT:    s_clause 0xf
+; GISEL64-NEXT:    scratch_store_b32 off, v224, s33 offset:516
+; GISEL64-NEXT:    scratch_store_b32 off, v225, s33 offset:520
+; GISEL64-NEXT:    scratch_store_b32 off, v226, s33 offset:524
+; GISEL64-NEXT:    scratch_store_b32 off, v227, s33 offset:528
+; GISEL64-NEXT:    scratch_store_b32 off, v228, s33 offset:532
+; GISEL64-NEXT:    scratch_store_b32 off, v229, s33 offset:536
+; GISEL64-NEXT:    scratch_store_b32 off, v230, s33 offset:540
+; GISEL64-NEXT:    scratch_store_b32 off, v231, s33 offset:544
+; GISEL64-NEXT:    scratch_store_b32 off, v240, s33 offset:548
+; GISEL64-NEXT:    scratch_store_b32 off, v241, s33 offset:552
+; GISEL64-NEXT:    scratch_store_b32 off, v242, s33 offset:556
+; GISEL64-NEXT:    scratch_store_b32 off, v243, s33 offset:560
+; GISEL64-NEXT:    scratch_store_b32 off, v244, s33 offset:564
+; GISEL64-NEXT:    scratch_store_b32 off, v245, s33 offset:568
+; GISEL64-NEXT:    scratch_store_b32 off, v246, s33 offset:572
+; GISEL64-NEXT:    scratch_store_b32 off, v247, s33 offset:576
+; GISEL64-NEXT:    s_mov_b64 exec, -1
+; GISEL64-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GISEL64-NEXT:    s_wait_alu 0xfffe
+; GISEL64-NEXT:    v_writelane_b32 v40, s0, 4
+; GISEL64-NEXT:    v_mov_b32_e32 v2, v0
+; GISEL64-NEXT:    v_swap_b32 v0, v1
+; GISEL64-NEXT:    s_mov_b32 s0, gfx_callee@abs32@lo
+; GISEL64-NEXT:    v_writelane_b32 v40, s4, 0
+; GISEL64-NEXT:    s_mov_b32 s1, gfx_callee@abs32@hi
+; GISEL64-NEXT:    s_addk_co_i32 s32, 0x250
+; GISEL64-NEXT:    v_writelane_b32 v40, s5, 1
+; GISEL64-NEXT:    v_writelane_b32 v40, s30, 2
+; GISEL64-NEXT:    v_writelane_b32 v40, s31, 3
+; GISEL64-NEXT:    s_wait_alu 0xfffe
+; GISEL64-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL64-NEXT:    v_readlane_b32 s31, v40, 3
+; GISEL64-NEXT:    v_readlane_b32 s30, v40, 2
+; GISEL64-NEXT:    v_readlane_b32 s5, v40, 1
+; GISEL64-NEXT:    v_readlane_b32 s4, v40, 0
+; GISEL64-NEXT:    v_readlane_b32 s0, v40, 4
+; GISEL64-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GISEL64-NEXT:    s_mov_b32 s32, s33
+; GISEL64-NEXT:    s_xor_b64 exec, s[4:5], -1
+; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    scratch_load_b32 v0, off, s33 offset:4
+; GISEL64-NEXT:    scratch_load_b32 v1, off, s33 offset:8
+; GISEL64-NEXT:    scratch_load_b32 v2, off, s33 offset:12
+; GISEL64-NEXT:    scratch_load_b32 v3, off, s33 offset:16
+; GISEL64-NEXT:    scratch_load_b32 v4, off, s33 offset:20
+; GISEL64-NEXT:    scratch_load_b32 v5, off, s33 offset:24
+; GISEL64-NEXT:    scratch_load_b32 v6, off, s33 offset:28
+; GISEL64-NEXT:    scratch_load_b32 v7, off, s33 offset:32
+; GISEL64-NEXT:    scratch_load_b32 v8, off, s33 offset:36
+; GISEL64-NEXT:    scratch_load_b32 v9, off, s33 offset:40
+; GISEL64-NEXT:    scratch_load_b32 v10, off, s33 offset:44
+; GISEL64-NEXT:    scratch_load_b32 v11, off, s33 offset:48
+; GISEL64-NEXT:    scratch_load_b32 v12, off, s33 offset:52
+; GISEL64-NEXT:    scratch_load_b32 v13, off, s33 offset:56
+; GISEL64-NEXT:    scratch_load_b32 v14, off, s33 offset:60
+; GISEL64-NEXT:    scratch_load_b32 v15, off, s33 offset:64
+; GISEL64-NEXT:    scratch_load_b32 v16, off, s33 offset:68
+; GISEL64-NEXT:    scratch_load_b32 v17, off, s33 offset:72
+; GISEL64-NEXT:    scratch_load_b32 v18, off, s33 offset:76
+; GISEL64-NEXT:    scratch_load_b32 v19, off, s33 offset:80
+; GISEL64-NEXT:    scratch_load_b32 v20, off, s33 offset:84
+; GISEL64-NEXT:    scratch_load_b32 v21, off, s33 offset:88
+; GISEL64-NEXT:    scratch_load_b32 v22, off, s33 offset:92
+; GISEL64-NEXT:    scratch_load_b32 v23, off, s33 offset:96
+; GISEL64-NEXT:    scratch_load_b32 v24, off, s33 offset:100
+; GISEL64-NEXT:    scratch_load_b32 v25, off, s33 offset:104
+; GISEL64-NEXT:    scratch_load_b32 v26, off, s33 offset:108
+; GISEL64-NEXT:    scratch_load_b32 v27, off, s33 offset:112
+; GISEL64-NEXT:    scratch_load_b32 v28, off, s33 offset:116
+; GISEL64-NEXT:    scratch_load_b32 v29, off, s33 offset:120
+; GISEL64-NEXT:    scratch_load_b32 v30, off, s33 offset:124
+; GISEL64-NEXT:    scratch_load_b32 v31, off, s33 offset:128
+; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    scratch_load_b32 v32, off, s33 offset:132
+; GISEL64-NEXT:    scratch_load_b32 v33, off, s33 offset:136
+; GISEL64-NEXT:    scratch_load_b32 v34, off, s33 offset:140
+; GISEL64-NEXT:    scratch_load_b32 v35, off, s33 offset:144
+; GISEL64-NEXT:    scratch_load_b32 v36, off, s33 offset:148
+; GISEL64-NEXT:    scratch_load_b32 v37, off, s33 offset:152
+; GISEL64-NEXT:    scratch_load_b32 v38, off, s33 offset:156
+; GISEL64-NEXT:    scratch_load_b32 v39, off, s33 offset:160
+; GISEL64-NEXT:    scratch_load_b32 v48, off, s33 offset:164
+; GISEL64-NEXT:    scratch_load_b32 v49, off, s33 offset:168
+; GISEL64-NEXT:    scratch_load_b32 v50, off, s33 offset:172
+; GISEL64-NEXT:    scratch_load_b32 v51, off, s33 offset:176
+; GISEL64-NEXT:    scratch_load_b32 v52, off, s33 offset:180
+; GISEL64-NEXT:    scratch_load_b32 v53, off, s33 offset:184
+; GISEL64-NEXT:    scratch_load_b32 v54, off, s33 offset:188
+; GISEL64-NEXT:    scratch_load_b32 v55, off, s33 offset:192
+; GISEL64-NEXT:    scratch_load_b32 v64, off, s33 offset:196
+; GISEL64-NEXT:    scratch_load_b32 v65, off, s33 offset:200
+; GISEL64-NEXT:    scratch_load_b32 v66, off, s33 offset:204
+; GISEL64-NEXT:    scratch_load_b32 v67, off, s33 offset:208
+; GISEL64-NEXT:    scratch_load_b32 v68, off, s33 offset:212
+; GISEL64-NEXT:    scratch_load_b32 v69, off, s33 offset:216
+; GISEL64-NEXT:    scratch_load_b32 v70, off, s33 offset:220
+; GISEL64-NEXT:    scratch_load_b32 v71, off, s33 offset:224
+; GISEL64-NEXT:    scratch_load_b32 v80, off, s33 offset:228
+; GISEL64-NEXT:    scratch_load_b32 v81, off, s33 offset:232
+; GISEL64-NEXT:    scratch_load_b32 v82, off, s33 offset:236
+; GISEL64-NEXT:    scratch_load_b32 v83, off, s33 offset:240
+; GISEL64-NEXT:    scratch_load_b32 v84, off, s33 offset:244
+; GISEL64-NEXT:    scratch_load_b32 v85, off, s33 offset:248
+; GISEL64-NEXT:    scratch_load_b32 v86, off, s33 offset:252
+; GISEL64-NEXT:    scratch_load_b32 v87, off, s33 offset:256
+; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    scratch_load_b32 v96, off, s33 offset:260
+; GISEL64-NEXT:    scratch_load_b32 v97, off, s33 offset:264
+; GISEL64-NEXT:    scratch_load_b32 v98, off, s33 offset:268
+; GISEL64-NEXT:    scratch_load_b32 v99, off, s33 offset:272
+; GISEL64-NEXT:    scratch_load_b32 v100, off, s33 offset:276
+; GISEL64-NEXT:    scratch_load_b32 v101, off, s33 offset:280
+; GISEL64-NEXT:    scratch_load_b32 v102, off, s33 offset:284
+; GISEL64-NEXT:    scratch_load_b32 v103, off, s33 offset:288
+; GISEL64-NEXT:    scratch_load_b32 v112, off, s33 offset:292
+; GISEL64-NEXT:    scratch_load_b32 v113, off, s33 offset:296
+; GISEL64-NEXT:    scratch_load_b32 v114, off, s33 offset:300
+; GISEL64-NEXT:    scratch_load_b32 v115, off, s33 offset:304
+; GISEL64-NEXT:    scratch_load_b32 v116, off, s33 offset:308
+; GISEL64-NEXT:    scratch_load_b32 v117, off, s33 offset:312
+; GISEL64-NEXT:    scratch_load_b32 v118, off, s33 offset:316
+; GISEL64-NEXT:    scratch_load_b32 v119, off, s33 offset:320
+; GISEL64-NEXT:    scratch_load_b32 v128, off, s33 offset:324
+; GISEL64-NEXT:    scratch_load_b32 v129, off, s33 offset:328
+; GISEL64-NEXT:    scratch_load_b32 v130, off, s33 offset:332
+; GISEL64-NEXT:    scratch_load_b32 v131, off, s33 offset:336
+; GISEL64-NEXT:    scratch_load_b32 v132, off, s33 offset:340
+; GISEL64-NEXT:    scratch_load_b32 v133, off, s33 offset:344
+; GISEL64-NEXT:    scratch_load_b32 v134, off, s33 offset:348
+; GISEL64-NEXT:    scratch_load_b32 v135, off, s33 offset:352
+; GISEL64-NEXT:    scratch_load_b32 v144, off, s33 offset:356
+; GISEL64-NEXT:    scratch_load_b32 v145, off, s33 offset:360
+; GISEL64-NEXT:    scratch_load_b32 v146, off, s33 offset:364
+; GISEL64-NEXT:    scratch_load_b32 v147, off, s33 offset:368
+; GISEL64-NEXT:    scratch_load_b32 v148, off, s33 offset:372
+; GISEL64-NEXT:    scratch_load_b32 v149, off, s33 offset:376
+; GISEL64-NEXT:    scratch_load_b32 v150, off, s33 offset:380
+; GISEL64-NEXT:    scratch_load_b32 v151, off, s33 offset:384
+; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    scratch_load_b32 v160, off, s33 offset:388
+; GISEL64-NEXT:    scratch_load_b32 v161, off, s33 offset:392
+; GISEL64-NEXT:    scratch_load_b32 v162, off, s33 offset:396
+; GISEL64-NEXT:    scratch_load_b32 v163, off, s33 offset:400
+; GISEL64-NEXT:    scratch_load_b32 v164, off, s33 offset:404
+; GISEL64-NEXT:    scratch_load_b32 v165, off, s33 offset:408
+; GISEL64-NEXT:    scratch_load_b32 v166, off, s33 offset:412
+; GISEL64-NEXT:    scratch_load_b32 v167, off, s33 offset:416
+; GISEL64-NEXT:    scratch_load_b32 v176, off, s33 offset:420
+; GISEL64-NEXT:    scratch_load_b32 v177, off, s33 offset:424
+; GISEL64-NEXT:    scratch_load_b32 v178, off, s33 offset:428
+; GISEL64-NEXT:    scratch_load_b32 v179, off, s33 offset:432
+; GISEL64-NEXT:    scratch_load_b32 v180, off, s33 offset:436
+; GISEL64-NEXT:    scratch_load_b32 v181, off, s33 offset:440
+; GISEL64-NEXT:    scratch_load_b32 v182, off, s33 offset:444
+; GISEL64-NEXT:    scratch_load_b32 v183, off, s33 offset:448
+; GISEL64-NEXT:    scratch_load_b32 v192, off, s33 offset:452
+; GISEL64-NEXT:    scratch_load_b32 v193, off, s33 offset:456
+; GISEL64-NEXT:    scratch_load_b32 v194, off, s33 offset:460
+; GISEL64-NEXT:    scratch_load_b32 v195, off, s33 offset:464
+; GISEL64-NEXT:    scratch_load_b32 v196, off, s33 offset:468
+; GISEL64-NEXT:    scratch_load_b32 v197, off, s33 offset:472
+; GISEL64-NEXT:    scratch_load_b32 v198, off, s33 offset:476
+; GISEL64-NEXT:    scratch_load_b32 v199, off, s33 offset:480
+; GISEL64-NEXT:    scratch_load_b32 v208, off, s33 offset:484
+; GISEL64-NEXT:    scratch_load_b32 v209, off, s33 offset:488
+; GISEL64-NEXT:    scratch_load_b32 v210, off, s33 offset:492
+; GISEL64-NEXT:    scratch_load_b32 v211, off, s33 offset:496
+; GISEL64-NEXT:    scratch_load_b32 v212, off, s33 offset:500
+; GISEL64-NEXT:    scratch_load_b32 v213, off, s33 offset:504
+; GISEL64-NEXT:    scratch_load_b32 v214, off, s33 offset:508
+; GISEL64-NEXT:    scratch_load_b32 v215, off, s33 offset:512
+; GISEL64-NEXT:    s_clause 0xf
+; GISEL64-NEXT:    scratch_load_b32 v224, off, s33 offset:516
+; GISEL64-NEXT:    scratch_load_b32 v225, off, s33 offset:520
+; GISEL64-NEXT:    scratch_load_b32 v226, off, s33 offset:524
+; GISEL64-NEXT:    scratch_load_b32 v227, off, s33 offset:528
+; GISEL64-NEXT:    scratch_load_b32 v228, off, s33 offset:532
+; GISEL64-NEXT:    scratch_load_b32 v229, off, s33 offset:536
+; GISEL64-NEXT:    scratch_load_b32 v230, off, s33 offset:540
+; GISEL64-NEXT:    scratch_load_b32 v231, off, s33 offset:544
+; GISEL64-NEXT:    scratch_load_b32 v240, off, s33 offset:548
+; GISEL64-NEXT:    scratch_load_b32 v241, off, s33 offset:552
+; GISEL64-NEXT:    scratch_load_b32 v242, off, s33 offset:556
+; GISEL64-NEXT:    scratch_load_b32 v243, off, s33 offset:560
+; GISEL64-NEXT:    scratch_load_b32 v244, off, s33 offset:564
+; GISEL64-NEXT:    scratch_load_b32 v245, off, s33 offset:568
+; GISEL64-NEXT:    scratch_load_b32 v246, off, s33 offset:572
+; GISEL64-NEXT:    scratch_load_b32 v247, off, s33 offset:576
+; GISEL64-NEXT:    s_mov_b64 exec, s[4:5]
+; GISEL64-NEXT:    s_mov_b32 s33, s0
+; GISEL64-NEXT:    s_wait_loadcnt 0x0
+; GISEL64-NEXT:    s_wait_alu 0xfffe
+; GISEL64-NEXT:    s_setpc_b64 s[30:31]
+  %ret = call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent
+  ret <2 x half> %ret
+}

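For reference, the wave64 checks above (DAGISEL64/GISEL64) for call_gfx_from_whole_wave all reduce to the same skeleton; the following is a condensed sketch with the long per-register spill/reload clauses elided (the wave32 variants use s4/exec_lo in place of s[4:5]/exec):

  s_mov_b32 s0, s33                 ; save the caller's frame pointer
  s_mov_b32 s33, s32                ; set up the frame pointer
  s_xor_saveexec_b64 s[4:5], -1     ; save EXEC and flip to the inactive lanes
  ; ... spill the inactive lanes of every used VGPR (offsets 4..576) ...
  s_mov_b64 exec, -1                ; run the body with all lanes enabled
  scratch_store_b32 off, v40, s33   ; spill the CSR used for the writelanes
  v_writelane_b32 v40, s0, 4        ; stash s0, s4, s5, s30, s31 in lanes of v40
  s_swappc_b64 s[30:31], s[0:1]     ; call gfx_callee
  v_readlane_b32 s31, v40, 3        ; ... restore the stashed SGPRs, reload v40
  s_mov_b32 s32, s33                ; pop the stack
  s_xor_b64 exec, s[4:5], -1        ; flip back to the inactive lanes
  ; ... reload the inactive lanes of every used VGPR ...
  s_mov_b64 exec, s[4:5]            ; restore the original EXEC mask
  s_mov_b32 s33, s0                 ; restore the caller's frame pointer
  s_setpc_b64 s[30:31]              ; return
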
diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
index b514c49394d21..278cf0150c2f7 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
@@ -46,6 +46,7 @@
 ; CHECK-NEXT:   hasInitWholeWave: false
 ; CHECK-NEXT:   dynamicVGPRBlockSize: 0
 ; CHECK-NEXT:   scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT:   isWholeWaveFunction: false
 ; CHECK-NEXT: body:
   define amdgpu_kernel void @long_branch_used_all_sgprs(ptr addrspace(1) %arg, i32 %cnd) #0 {
   entry:
@@ -315,6 +316,7 @@
 ; CHECK-NEXT:   hasInitWholeWave: false
 ; CHECK-NEXT:   dynamicVGPRBlockSize: 0
 ; CHECK-NEXT:   scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT:   isWholeWaveFunction: false
 ; CHECK-NEXT: body:
   define amdgpu_kernel void @long_branch_high_num_sgprs_used(ptr addrspace(1) %arg, i32 %cnd) #0 {
   entry:

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
index fc730f9e88454..890ea44081ce7 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
@@ -46,6 +46,7 @@
 ; AFTER-PEI-NEXT: hasInitWholeWave: false
 ; AFTER-PEI-NEXT: dynamicVGPRBlockSize: 0
 ; AFTER-PEI-NEXT: scratchReservedForDynamicVGPRs: 0
+; AFTER-PEI-NEXT: isWholeWaveFunction: false
 ; AFTER-PEI-NEXT: body:
 define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 {
   %wide.sgpr0 = call <32 x i32>  asm sideeffect "; def $0", "=s" () #0

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
index 5adef1433079d..f84ef8a3844dd 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@@ -46,6 +46,7 @@
 ; CHECK-NEXT: hasInitWholeWave: false
 ; CHECK-NEXT: dynamicVGPRBlockSize: 0
 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
 ; CHECK-NEXT: body:
   define amdgpu_kernel void @uniform_long_forward_branch_debug(ptr addrspace(1) %arg, i32 %arg1) #0 !dbg !5 {
   bb0:

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
index fa40164aa02f0..cc834d017c149 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
@@ -46,6 +46,7 @@
 ; CHECK-NEXT: hasInitWholeWave: false
 ; CHECK-NEXT: dynamicVGPRBlockSize: 0
 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
 ; CHECK-NEXT: body:
 define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) #0 {
 bb0:

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
index 24565e4423d04..06c580ec6f6b4 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
@@ -55,6 +55,7 @@
 # FULL-NEXT:  hasInitWholeWave: false
 # FULL-NEXT: dynamicVGPRBlockSize: 0
 # FULL-NEXT: scratchReservedForDynamicVGPRs: 0
+# FULL-NEXT: isWholeWaveFunction: false
 # FULL-NEXT: body:
 
 # SIMPLE: machineFunctionInfo:
@@ -162,6 +163,7 @@ body:             |
 # FULL-NEXT: hasInitWholeWave: false
 # FULL-NEXT: dynamicVGPRBlockSize: 0
 # FULL-NEXT: scratchReservedForDynamicVGPRs: 0
+# FULL-NEXT: isWholeWaveFunction: false
 # FULL-NEXT: body:
 
 # SIMPLE: machineFunctionInfo:
@@ -240,6 +242,7 @@ body:             |
 # FULL-NEXT: hasInitWholeWave: false
 # FULL-NEXT: dynamicVGPRBlockSize: 0
 # FULL-NEXT: scratchReservedForDynamicVGPRs: 0
+# FULL-NEXT: isWholeWaveFunction: false
 # FULL-NEXT: body:
 
 # SIMPLE: machineFunctionInfo:
@@ -319,6 +322,7 @@ body:             |
 # FULL-NEXT: hasInitWholeWave: false
 # FULL-NEXT: dynamicVGPRBlockSize: 0
 # FULL-NEXT: scratchReservedForDynamicVGPRs: 0
+# FULL-NEXT: isWholeWaveFunction: false
 # FULL-NEXT: body:
 
 # SIMPLE: machineFunctionInfo:

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
index a15271382f37d..427154651a381 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -56,6 +56,7 @@
 ; CHECK-NEXT: hasInitWholeWave: false
 ; CHECK-NEXT: dynamicVGPRBlockSize: 0
 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
 ; CHECK-NEXT: body:
 define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
   %gep = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %arg0
@@ -105,6 +106,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
 ; CHECK-NEXT: hasInitWholeWave: false
 ; CHECK-NEXT: dynamicVGPRBlockSize: 0
 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
 ; CHECK-NEXT: body:
 define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) {
   %gep = getelementptr inbounds [128 x i32], ptr addrspace(2) @gds, i32 0, i32 %arg0
@@ -178,6 +180,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
 ; CHECK-NEXT: hasInitWholeWave: false
 ; CHECK-NEXT: dynamicVGPRBlockSize: 0
 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
 ; CHECK-NEXT: body:
 define void @function() {
   ret void
@@ -233,6 +236,7 @@ define void @function() {
 ; CHECK-NEXT: hasInitWholeWave: false
 ; CHECK-NEXT: dynamicVGPRBlockSize: 0
 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
 ; CHECK-NEXT: body:
 define void @function_nsz() #0 {
   ret void

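Every machine-function-info hunk in this patch appends the same serialized key, printed as false for these kernels and ordinary functions. As a minimal sketch of how the field reads in hand-written MIR (the function name and the true value are illustrative, not taken from the tests above):

  ---
  name: hypothetical_whole_wave_func
  machineFunctionInfo:
    isWholeWaveFunction: true
  body: |
    bb.0:
  ...
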
diff  --git a/llvm/test/Verifier/amdgpu-cc.ll b/llvm/test/Verifier/amdgpu-cc.ll
index aec09771d2e4f..e86825e088753 100644
--- a/llvm/test/Verifier/amdgpu-cc.ll
+++ b/llvm/test/Verifier/amdgpu-cc.ll
@@ -217,3 +217,36 @@ define amdgpu_cs_chain_preserve void @preallocated_cc_amdgpu_cs_chain_preserve(p
 define amdgpu_cs_chain_preserve void @inalloca_cc_amdgpu_cs_chain_preserve(ptr inalloca(i32) %ptr) {
   ret void
 }
+
+; CHECK: Calling convention requires first argument to be i1
+; CHECK-NEXT: ptr @whole_wave_no_args
+define amdgpu_gfx_whole_wave void @whole_wave_no_args() {
+  ret void
+}
+
+; CHECK: Calling convention requires first argument to be i1
+; CHECK-NEXT: ptr @whole_wave_must_have_i1_active
+define amdgpu_gfx_whole_wave void @whole_wave_must_have_i1_active(i32 %x) {
+  ret void
+}
+
+; CHECK: Calling convention requires first argument to not be inreg
+; CHECK-NEXT: ptr @whole_wave_i1_active_inreg
+define amdgpu_gfx_whole_wave void @whole_wave_i1_active_inreg(i1 inreg %active) {
+  ret void
+}
+
+; CHECK: Calling convention does not support varargs
+; CHECK-NEXT: ptr @whole_wave_varargs
+define amdgpu_gfx_whole_wave void @whole_wave_varargs(i1 %active, i32 %x, ...) {
+  ret void
+}
+
+declare amdgpu_gfx_whole_wave void @whole_wave_callee(i1 %active)
+
+; CHECK: calling convention does not permit calls
+; CHECK-NEXT: call amdgpu_gfx_whole_wave void @whole_wave_callee(i1 true)
+define amdgpu_cs void @cant_call_whole_wave_func() {
+  call amdgpu_gfx_whole_wave void @whole_wave_callee(i1 true)
+  ret void
+}

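For contrast with the negative cases above, a minimal definition that satisfies the rules these tests exercise (a plain, non-inreg i1 first parameter and no varargs); the function name and body are illustrative only:

  define amdgpu_gfx_whole_wave i32 @valid_whole_wave(i1 %active, i32 %x) {
    %r = add i32 %x, 1
    ret i32 %r
  }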