[llvm] 20d8398 - [AMDGPU] ISel & PEI for whole wave functions (#145858)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 21 01:39:13 PDT 2025
Author: Diana Picus
Date: 2025-07-21T10:39:09+02:00
New Revision: 20d8398825a799008ae508d8463dbb9b11df81e7
URL: https://github.com/llvm/llvm-project/commit/20d8398825a799008ae508d8463dbb9b11df81e7
DIFF: https://github.com/llvm/llvm-project/commit/20d8398825a799008ae508d8463dbb9b11df81e7.diff
LOG: [AMDGPU] ISel & PEI for whole wave functions (#145858)
Whole wave functions are functions that will run with a full EXEC mask.
They will not be invoked directly, but instead will be launched by way
of a new intrinsic, `llvm.amdgcn.call.whole.wave` (to be added in
a future patch). These functions are meant as an alternative to the
`llvm.amdgcn.init.whole.wave` or `llvm.amdgcn.strict.wwm` intrinsics.
Whole wave functions will set EXEC to -1 in the prologue and restore the
original value of EXEC in the epilogue. They must have a special first
argument, `i1 %active`, that is going to be mapped to EXEC. They use a
dedicated calling convention, `amdgpu_gfx_whole_wave`, which otherwise
behaves like `amdgpu_gfx`. The inactive lanes need to be preserved for all
registers used; the active lanes only for the CSRs.
At the IR level, arguments to a whole wave function (other than
`%active`) contain poison in their inactive lanes. Likewise, the return
value for the inactive lanes is poison.
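To illustrate the basic shape (this mirrors the `basic_test` added in the
tests below), the selects give the inactive lanes a well-defined value
before feeding a wave-wide DPP operation:

  define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) {
    %x = select i1 %active, i32 %a, i32 5   ; %a is poison in inactive lanes
    %y = select i1 %active, i32 %b, i32 3
    %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false)
    ret i32 %ret
  }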
This patch contains the following work:
* 2 new pseudos, SI_WHOLE_WAVE_FUNC_SETUP and SI_WHOLE_WAVE_FUNC_RETURN,
used for managing the EXEC mask. SI_WHOLE_WAVE_FUNC_SETUP will return
an SReg_1 representing `%active`, which needs to be passed into
SI_WHOLE_WAVE_FUNC_RETURN.
* SelectionDAG support for generating these 2 new pseudos and the
special handling of %active. Since the return may be in a different
basic block, it's difficult to add the virtual reg for %active to
SI_WHOLE_WAVE_FUNC_RETURN, so we initially generate an IMPLICIT_DEF
which is later replaced via a custom inserter.
* Expansion of the 2 pseudos during prolog/epilog insertion. PEI also
marks any used VGPRs as WWM registers, which are then spilled and
restored with the usual logic.
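In rough terms, the expanded prologue/epilogue looks like this (a sketch
only: wave32, scratch WWM registers, no WWM CSRs; register choices and the
spill code are illustrative):

  s_xor_saveexec_b32 s0, -1   ; save the original EXEC, enable only the
                              ; inactive lanes for the WWM spills
  ; ...spill the inactive lanes of the used VGPRs...
  s_mov_b32 exec_lo, -1       ; run the body with a full EXEC mask
  ; ...function body...
  s_xor_b32 exec_lo, s0, -1   ; re-enable only the inactive lanes
  ; ...reload the inactive lanes of the used VGPRs...
  s_mov_b32 exec_lo, s0       ; restore the original EXEC
  s_setpc_b64 s[30:31]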
Future patches will include the `llvm.amdgcn.call.whole.wave` intrinsic
and a lot of optimization work (especially in order to reduce spills
around function calls).
---------
Co-authored-by: Matt Arsenault <Matthew.Arsenault at amd.com>
Co-authored-by: Shilei Tian <i at tianshilei.me>
Added:
llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir
llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll
llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll
llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir
llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
Modified:
llvm/docs/AMDGPUUsage.rst
llvm/include/llvm/AsmParser/LLToken.h
llvm/include/llvm/IR/CallingConv.h
llvm/lib/AsmParser/LLLexer.cpp
llvm/lib/AsmParser/LLParser.cpp
llvm/lib/IR/AsmWriter.cpp
llvm/lib/IR/Function.cpp
llvm/lib/IR/Verifier.cpp
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
llvm/lib/Target/AMDGPU/AMDGPUGISel.td
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.h
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
llvm/test/Bitcode/compatibility.ll
llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
llvm/test/Verifier/amdgpu-cc.ll
Removed:
################################################################################
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index c5b9bd9de66e1..19357635ecfc1 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1844,6 +1844,20 @@ The AMDGPU backend supports the following calling conventions:
..TODO::
Describe.
+ ``amdgpu_gfx_whole_wave`` Used for AMD graphics targets. Functions with this calling convention
+ cannot be used as entry points. They must have an i1 as the first argument,
+ which will be mapped to the value of EXEC on entry into the function. Other
+ arguments will contain poison in their inactive lanes. Similarly, the return
+ value for the inactive lanes is poison.
+
+ The function will run with all lanes enabled, i.e. EXEC will be set to -1 in the
+ prologue and restored to its original value in the epilogue. The inactive lanes
+ will be preserved for all the registers used by the function. Active lanes will
+ only be preserved for the callee-saved registers.
+
+ In all other respects, functions with this calling convention behave like
+ ``amdgpu_gfx`` functions.
+
``amdgpu_gs`` Used for Mesa/AMDPAL geometry shaders.
..TODO::
Describe.
diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h
index c7e4bdf3ff811..a2311d2ac285d 100644
--- a/llvm/include/llvm/AsmParser/LLToken.h
+++ b/llvm/include/llvm/AsmParser/LLToken.h
@@ -181,6 +181,7 @@ enum Kind {
kw_amdgpu_cs_chain_preserve,
kw_amdgpu_kernel,
kw_amdgpu_gfx,
+ kw_amdgpu_gfx_whole_wave,
kw_tailcc,
kw_m68k_rtdcc,
kw_graalcc,
diff --git a/llvm/include/llvm/IR/CallingConv.h b/llvm/include/llvm/IR/CallingConv.h
index d68491eb5535c..ef761eb1aed73 100644
--- a/llvm/include/llvm/IR/CallingConv.h
+++ b/llvm/include/llvm/IR/CallingConv.h
@@ -284,6 +284,9 @@ namespace CallingConv {
RISCV_VLSCall_32768 = 122,
RISCV_VLSCall_65536 = 123,
+ // Calling convention for AMDGPU whole wave functions.
+ AMDGPU_Gfx_WholeWave = 124,
+
/// The highest possible ID. Must be some 2^k - 1.
MaxID = 1023
};
@@ -294,8 +297,13 @@ namespace CallingConv {
/// directly or indirectly via a call-like instruction.
constexpr bool isCallableCC(CallingConv::ID CC) {
switch (CC) {
+ // Called with special intrinsics:
+ // llvm.amdgcn.cs.chain
case CallingConv::AMDGPU_CS_Chain:
case CallingConv::AMDGPU_CS_ChainPreserve:
+ // llvm.amdgcn.call.whole.wave
+ case CallingConv::AMDGPU_Gfx_WholeWave:
+ // Hardware entry points:
case CallingConv::AMDGPU_CS:
case CallingConv::AMDGPU_ES:
case CallingConv::AMDGPU_GS:
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index ce813e1d7b1c4..520c6a00a9c07 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -679,6 +679,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(amdgpu_cs_chain_preserve);
KEYWORD(amdgpu_kernel);
KEYWORD(amdgpu_gfx);
+ KEYWORD(amdgpu_gfx_whole_wave);
KEYWORD(tailcc);
KEYWORD(m68k_rtdcc);
KEYWORD(graalcc);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index b7f6950f679ef..00277757c0955 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -2272,6 +2272,9 @@ bool LLParser::parseOptionalCallingConv(unsigned &CC) {
CC = CallingConv::AMDGPU_CS_ChainPreserve;
break;
case lltok::kw_amdgpu_kernel: CC = CallingConv::AMDGPU_KERNEL; break;
+ case lltok::kw_amdgpu_gfx_whole_wave:
+ CC = CallingConv::AMDGPU_Gfx_WholeWave;
+ break;
case lltok::kw_tailcc: CC = CallingConv::Tail; break;
case lltok::kw_m68k_rtdcc: CC = CallingConv::M68k_RTD; break;
case lltok::kw_graalcc: CC = CallingConv::GRAAL; break;
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 145ef10f28f35..3e40915b6a920 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -404,6 +404,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
break;
case CallingConv::AMDGPU_KERNEL: Out << "amdgpu_kernel"; break;
case CallingConv::AMDGPU_Gfx: Out << "amdgpu_gfx"; break;
+ case CallingConv::AMDGPU_Gfx_WholeWave:
+ Out << "amdgpu_gfx_whole_wave";
+ break;
case CallingConv::M68k_RTD: Out << "m68k_rtdcc"; break;
case CallingConv::RISCV_VectorCall:
Out << "riscv_vector_cc";
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 7a03663e129dc..fc067459dcba3 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -1232,6 +1232,7 @@ bool llvm::CallingConv::supportsNonVoidReturnType(CallingConv::ID CC) {
case CallingConv::AArch64_SVE_VectorCall:
case CallingConv::WASM_EmscriptenInvoke:
case CallingConv::AMDGPU_Gfx:
+ case CallingConv::AMDGPU_Gfx_WholeWave:
case CallingConv::M68k_INTR:
case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 9bd573e773610..e7b491e76724e 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -2979,6 +2979,16 @@ void Verifier::visitFunction(const Function &F) {
"perfect forwarding!",
&F);
break;
+ case CallingConv::AMDGPU_Gfx_WholeWave:
+ Check(!F.arg_empty() && F.arg_begin()->getType()->isIntegerTy(1),
+ "Calling convention requires first argument to be i1", &F);
+ Check(!F.arg_begin()->hasInRegAttr(),
+ "Calling convention requires first argument to not be inreg", &F);
+ Check(!F.isVarArg(),
+ "Calling convention does not support varargs or "
+ "perfect forwarding!",
+ &F);
+ break;
}
// Check that the argument values match the function type for this function...
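For reference, declarations like these (hypothetical names) would now be
rejected by the verifier:

  declare amdgpu_gfx_whole_wave void @no_active()       ; first argument must be i1
  declare amdgpu_gfx_whole_wave void @in_reg(i1 inreg)  ; first argument must not be inreg
  declare amdgpu_gfx_whole_wave void @vararg(i1, ...)   ; varargs are not supported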
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 14101e57f5143..3d8d274f06246 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -374,8 +374,10 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
return true;
}
- unsigned ReturnOpc =
- IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::SI_RETURN;
+ const bool IsWholeWave = MFI->isWholeWaveFunction();
+ unsigned ReturnOpc = IsWholeWave ? AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN
+ : IsShader ? AMDGPU::SI_RETURN_TO_EPILOG
+ : AMDGPU::SI_RETURN;
auto Ret = B.buildInstrNoInsert(ReturnOpc);
if (!FLI.CanLowerReturn)
@@ -383,6 +385,9 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
else if (!lowerReturnVal(B, Val, VRegs, Ret))
return false;
+ if (IsWholeWave)
+ addOriginalExecToReturn(B.getMF(), Ret);
+
// TODO: Handle CalleeSavedRegsViaCopy.
B.insertInstr(Ret);
@@ -632,6 +637,17 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (DL.getTypeStoreSize(Arg.getType()) == 0)
continue;
+ if (Info->isWholeWaveFunction() && Idx == 0) {
+ assert(VRegs[Idx].size() == 1 && "Expected only one register");
+
+ // The first argument for whole wave functions is the original EXEC value.
+ B.buildInstr(AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
+ .addDef(VRegs[Idx][0]);
+
+ ++Idx;
+ continue;
+ }
+
const bool InReg = Arg.hasAttribute(Attribute::InReg);
if (Arg.hasAttribute(Attribute::SwiftSelf) ||
@@ -1347,6 +1363,7 @@ bool AMDGPUCallLowering::lowerTailCall(
SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
+ Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave &&
!AMDGPU::isChainCC(Info.CallConv)) {
// With a fixed ABI, allocate fixed registers before user arguments.
if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
@@ -1524,7 +1541,8 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// after the ordinary user argument registers.
SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
- if (Info.CallConv != CallingConv::AMDGPU_Gfx) {
+ if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
+ Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
// With a fixed ABI, allocate fixed registers before user arguments.
if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
return false;
@@ -1592,3 +1610,11 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
return true;
}
+
+void AMDGPUCallLowering::addOriginalExecToReturn(
+ MachineFunction &MF, MachineInstrBuilder &Ret) const {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const MachineInstr *Setup = TII->getWholeWaveFunctionSetup(MF);
+ Ret.addReg(Setup->getOperand(0).getReg());
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
index a6e801f2a547b..e0033d59d10bb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -37,6 +37,9 @@ class AMDGPUCallLowering final : public CallLowering {
bool lowerReturnVal(MachineIRBuilder &B, const Value *Val,
ArrayRef<Register> VRegs, MachineInstrBuilder &Ret) const;
+ void addOriginalExecToReturn(MachineFunction &MF,
+ MachineInstrBuilder &Ret) const;
+
public:
AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 2bfd56f9f3554..891d362503f15 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -315,6 +315,10 @@ def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_PREFETCH, SIsbuffer_prefetch>;
+def : GINodeEquiv<G_AMDGPU_WHOLE_WAVE_FUNC_SETUP, AMDGPUwhole_wave_setup>;
+// G_AMDGPU_WHOLE_WAVE_FUNC_RETURN is simpler than AMDGPUwhole_wave_return,
+// so we don't mark it as equivalent.
+
class GISelSop2Pat <
SDPatternOperator node,
Instruction inst,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 3d040fb705a8d..b037cdd5393ea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1143,6 +1143,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
case CallingConv::Cold:
return CC_AMDGPU_Func;
case CallingConv::AMDGPU_Gfx:
+ case CallingConv::AMDGPU_Gfx_WholeWave:
return CC_SI_Gfx;
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
@@ -1168,6 +1169,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
case CallingConv::AMDGPU_LS:
return RetCC_SI_Shader;
case CallingConv::AMDGPU_Gfx:
+ case CallingConv::AMDGPU_Gfx_WholeWave:
return RetCC_SI_Gfx;
case CallingConv::C:
case CallingConv::Fast:
@@ -5875,6 +5877,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
+ NODE_NAME_CASE(WHOLE_WAVE_SETUP)
+ NODE_NAME_CASE(WHOLE_WAVE_RETURN)
}
return nullptr;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 4e8c6c7ea3b27..39bb0adfc1a17 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -608,6 +608,12 @@ enum NodeType : unsigned {
BUFFER_ATOMIC_FMAX,
BUFFER_ATOMIC_COND_SUB_U32,
LAST_MEMORY_OPCODE = BUFFER_ATOMIC_COND_SUB_U32,
+
+ // Set up a whole wave function.
+ WHOLE_WAVE_SETUP,
+
+ // Return from a whole wave function.
+ WHOLE_WAVE_RETURN,
};
} // End namespace AMDGPUISD
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index ce58e93a15207..e305f08925cc6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -348,6 +348,17 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",
def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
+// Marks the entry into a whole wave function.
+def AMDGPUwhole_wave_setup : SDNode<
+ "AMDGPUISD::WHOLE_WAVE_SETUP", SDTypeProfile<1, 0, [SDTCisInt<0>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+// Marks the return from a whole wave function.
+def AMDGPUwhole_wave_return : SDNode<
+ "AMDGPUISD::WHOLE_WAVE_RETURN", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+>;
+
// SI+ export
def AMDGPUExportOp : SDTypeProfile<0, 8, [
SDTCisInt<0>, // i8 tgt
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index d161c035ac295..8975486caa770 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4160,6 +4160,10 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return true;
case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
return selectWaveAddress(I);
+ case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
+ I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
+ return true;
+ }
case AMDGPU::G_STACKRESTORE:
return selectStackRestore(I);
case AMDGPU::G_PHI:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index bf2f37bddb9ed..b54cccead9781 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5540,6 +5540,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_PREFETCH:
OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
break;
+ case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP:
+ case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN:
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
+ break;
}
return getInstructionMapping(/*ID*/1, /*Cost*/1,
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index bbed828b4fed3..c4a3be44fc72d 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -3206,7 +3206,7 @@ bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
// Check entry priority at each export (as there will only be a few).
// Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
bool Changed = false;
- if (CC != CallingConv::AMDGPU_Gfx)
+ if (CC != CallingConv::AMDGPU_Gfx && CC != CallingConv::AMDGPU_Gfx_WholeWave)
Changed = ensureEntrySetPrio(MF, NormalPriority, TII);
auto NextMI = std::next(It);
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 6a3867937d57f..11552b3a9a438 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -946,8 +946,18 @@ static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
- ScratchExecCopy = findScratchNonCalleeSaveRegister(
- MRI, LiveUnits, *TRI.getWaveMaskRegClass());
+ if (FuncInfo->isWholeWaveFunction()) {
+ // Whole wave functions already have a copy of the original EXEC mask that
+ // we can use.
+ assert(IsProlog && "Epilog should look at return, not setup");
+ ScratchExecCopy =
+ TII->getWholeWaveFunctionSetup(MF)->getOperand(0).getReg();
+ assert(ScratchExecCopy && "Couldn't find copy of EXEC");
+ } else {
+ ScratchExecCopy = findScratchNonCalleeSaveRegister(
+ MRI, LiveUnits, *TRI.getWaveMaskRegClass());
+ }
+
if (!ScratchExecCopy)
report_fatal_error("failed to find free scratch register");
@@ -996,10 +1006,15 @@ void SIFrameLowering::emitCSRSpillStores(
};
StoreWWMRegisters(WWMScratchRegs);
+
+ auto EnableAllLanes = [&]() {
+ unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
+ };
+
if (!WWMCalleeSavedRegs.empty()) {
if (ScratchExecCopy) {
- unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
+ EnableAllLanes();
} else {
ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
/*IsProlog*/ true,
@@ -1008,7 +1023,18 @@ void SIFrameLowering::emitCSRSpillStores(
}
StoreWWMRegisters(WWMCalleeSavedRegs);
- if (ScratchExecCopy) {
+ if (FuncInfo->isWholeWaveFunction()) {
+ // SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose, so we can remove
+ // it now. If we have already saved some WWM CSR registers, then the EXEC is
+ // already -1 and we don't need to do anything else. Otherwise, set EXEC to
+ // -1 here.
+ if (!ScratchExecCopy)
+ buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true,
+ /*EnableInactiveLanes*/ true);
+ else if (WWMCalleeSavedRegs.empty())
+ EnableAllLanes();
+ TII->getWholeWaveFunctionSetup(MF)->eraseFromParent();
+ } else if (ScratchExecCopy) {
// FIXME: Split block and make terminator.
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
@@ -1083,11 +1109,6 @@ void SIFrameLowering::emitCSRSpillRestores(
Register ScratchExecCopy;
SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
- if (!WWMScratchRegs.empty())
- ScratchExecCopy =
- buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
- /*IsProlog*/ false, /*EnableInactiveLanes*/ true);
-
auto RestoreWWMRegisters =
[&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
for (const auto &Reg : WWMRegs) {
@@ -1098,6 +1119,36 @@ void SIFrameLowering::emitCSRSpillRestores(
}
};
+ if (FuncInfo->isWholeWaveFunction()) {
+ // For whole wave functions, the EXEC is already -1 at this point.
+ // Therefore, we can restore the CSR WWM registers right away.
+ RestoreWWMRegisters(WWMCalleeSavedRegs);
+
+ // The original EXEC is the first operand of the return instruction.
+ const MachineInstr &Return = MBB.instr_back();
+ assert(Return.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN &&
+ "Unexpected return inst");
+ Register OrigExec = Return.getOperand(0).getReg();
+
+ if (!WWMScratchRegs.empty()) {
+ unsigned XorOpc = ST.isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;
+ BuildMI(MBB, MBBI, DL, TII->get(XorOpc), TRI.getExec())
+ .addReg(OrigExec)
+ .addImm(-1);
+ RestoreWWMRegisters(WWMScratchRegs);
+ }
+
+ // Restore original EXEC.
+ unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addReg(OrigExec);
+ return;
+ }
+
+ if (!WWMScratchRegs.empty()) {
+ ScratchExecCopy =
+ buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
+ /*IsProlog=*/false, /*EnableInactiveLanes=*/true);
+ }
RestoreWWMRegisters(WWMScratchRegs);
if (!WWMCalleeSavedRegs.empty()) {
if (ScratchExecCopy) {
@@ -1634,6 +1685,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
NeedExecCopyReservedReg = true;
else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
+ MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
(MFI->isChainFunction() &&
TII->isChainCallOpcode(MI.getOpcode()))) {
// We expect all return to be the same size.
@@ -1662,6 +1714,21 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (MFI->isEntryFunction())
return;
+ if (MFI->isWholeWaveFunction()) {
+ // In practice, all the VGPRs are WWM registers, and we will need to save at
+ // least their inactive lanes. Add them to WWMReservedRegs.
+ assert(!NeedExecCopyReservedReg &&
+ "Whole wave functions can use the reg mapped for their i1 argument");
+
+ // FIXME: Be more efficient!
+ for (MCRegister Reg : AMDGPU::VGPR_32RegClass)
+ if (MF.getRegInfo().isPhysRegModified(Reg)) {
+ MFI->reserveWWMRegister(Reg);
+ MF.begin()->addLiveIn(Reg);
+ }
+ MF.begin()->sortUniqueLiveIns();
+ }
+
// Remove any VGPRs used in the return value because these do not need to be saved.
// This prevents CSR restore from clobbering return VGPRs.
if (ReturnMI) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0c76ff2ec5ea7..d4e3fa71ada85 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2260,7 +2260,8 @@ SDValue SITargetLowering::getPreloadedValue(
const ArgDescriptor WorkGroupIDZ =
ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
if (Subtarget->hasArchitectedSGPRs() &&
- (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
+ (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx ||
+ CC == CallingConv::AMDGPU_Gfx_WholeWave)) {
switch (PVID) {
case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
Reg = &WorkGroupIDX;
@@ -2942,12 +2943,15 @@ SDValue SITargetLowering::LowerFormalArguments(
if (!Subtarget->enableFlatScratch())
assert(!UserSGPRInfo.hasFlatScratchInit());
if ((CallConv != CallingConv::AMDGPU_CS &&
- CallConv != CallingConv::AMDGPU_Gfx) ||
+ CallConv != CallingConv::AMDGPU_Gfx &&
+ CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
!Subtarget->hasArchitectedSGPRs())
assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
!Info->hasWorkGroupIDZ());
}
+ bool IsWholeWaveFunc = Info->isWholeWaveFunction();
+
if (CallConv == CallingConv::AMDGPU_PS) {
processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
@@ -2988,7 +2992,8 @@ SDValue SITargetLowering::LowerFormalArguments(
} else if (IsKernel) {
assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
} else {
- Splits.append(Ins.begin(), Ins.end());
+ Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
+ Ins.end());
}
if (IsKernel)
@@ -3019,6 +3024,13 @@ SDValue SITargetLowering::LowerFormalArguments(
SmallVector<SDValue, 16> Chains;
+ if (IsWholeWaveFunc) {
+ SDValue Setup = DAG.getNode(AMDGPUISD::WHOLE_WAVE_SETUP, DL,
+ {MVT::i1, MVT::Other}, Chain);
+ InVals.push_back(Setup.getValue(0));
+ Chains.push_back(Setup.getValue(1));
+ }
+
// FIXME: This is the minimum kernel argument alignment. We should improve
// this to the maximum alignment of the arguments.
//
@@ -3026,7 +3038,8 @@ SDValue SITargetLowering::LowerFormalArguments(
// kern arg offset.
const Align KernelArgBaseAlign = Align(16);
- for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+ for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
+ ++i) {
const ISD::InputArg &Arg = Ins[i];
if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
InVals.push_back(DAG.getPOISON(Arg.VT));
@@ -3374,7 +3387,9 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
unsigned Opc = AMDGPUISD::ENDPGM;
if (!IsWaveEnd)
- Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE;
+ Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
+ : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
+ : AMDGPUISD::RET_GLUE;
return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
@@ -3876,7 +3891,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
- if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
+ if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
+ CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
// With a fixed ABI, allocate fixed registers before user arguments.
passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
}
@@ -5890,6 +5906,18 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent();
return SplitBB;
}
+ case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
+ assert(MFI->isWholeWaveFunction());
+
+ // During ISel, it's difficult to propagate the original EXEC mask to use as
+ // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
+ MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
+ assert(Setup && "Couldn't find SI_WHOLE_WAVE_FUNC_SETUP");
+ Register OriginalExec = Setup->getOperand(0).getReg();
+ MF->getRegInfo().clearKillFlags(OriginalExec);
+ MI.getOperand(0).setReg(OriginalExec);
+ return BB;
+ }
default:
if (TII->isImage(MI) || TII->isMUBUF(MI)) {
if (!MI.mayStore())
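The effect is visible in the DAGISel checks below: after finalize-isel the
return's operand has been rewired to the setup's register, leaving the
IMPLICIT_DEF placeholder dead. Schematically (virtual register numbers
illustrative):

  %0:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
  ...
  %9:sreg_32 = IMPLICIT_DEF                  ; placeholder from the ISel pattern
  SI_WHOLE_WAVE_FUNC_RETURN killed %0, implicit $vgpr0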
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 2af0a575a8885..9faf4974e3fd6 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1812,6 +1812,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// with knowledge of the called routines.
if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
MI.getOpcode() == AMDGPU::SI_RETURN ||
+ MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c8935f0cb6034..e2a2525d909bd 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2472,6 +2472,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
break;
}
+ case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
case AMDGPU::SI_RETURN: {
const MachineFunction *MF = MBB.getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
@@ -5757,6 +5758,19 @@ void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
}
+MachineInstr *
+SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const {
+ assert(MF.getInfo<SIMachineFunctionInfo>()->isWholeWaveFunction() &&
+ "Not a whole wave func");
+ MachineBasicBlock &MBB = *MF.begin();
+ for (MachineInstr &MI : MBB)
+ if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
+ MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
+ return &MI;
+
+ llvm_unreachable("Couldn't find SI_WHOLE_WAVE_FUNC_SETUP instruction");
+}
+
static const TargetRegisterClass *
adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
const MachineRegisterInfo &MRI,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 5e92921f3ea21..800ea9ab50b85 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1215,6 +1215,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
Register Reg, SlotIndexes *Indexes = nullptr) const;
+ MachineInstr *getWholeWaveFunctionSetup(MachineFunction &MF) const;
+
/// Return the correct register class for \p OpNo. For target-specific
/// instructions, this will return the register class that has been defined
/// in tablegen. For generic instructions, like REG_SEQUENCE it will return
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 991d9f83e92e4..2230a431a0f26 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -644,6 +644,32 @@ def SI_INIT_WHOLE_WAVE : SPseudoInstSI <
let isConvergent = 1;
}
+// Sets EXEC to all lanes and returns the previous EXEC.
+def SI_WHOLE_WAVE_FUNC_SETUP : SPseudoInstSI <
+ (outs SReg_1:$dst), (ins), [(set i1:$dst, (AMDGPUwhole_wave_setup))]> {
+ let Defs = [EXEC];
+ let Uses = [EXEC];
+
+ let isConvergent = 1;
+}
+
+// Restores the previous EXEC and otherwise behaves entirely like a SI_RETURN.
+def SI_WHOLE_WAVE_FUNC_RETURN : SPseudoInstSI <
+ (outs), (ins SReg_1:$orig_exec)> {
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let isReturn = 1;
+ let SchedRW = [WriteBranch];
+
+ // We're going to use custom handling to set the $orig_exec to the correct value.
+ let usesCustomInserter = 1;
+}
+
+// Generate a SI_WHOLE_WAVE_FUNC_RETURN pseudo with a placeholder for its
+// argument. It will be filled in by the custom inserter.
+def : GCNPat<
+ (AMDGPUwhole_wave_return), (SI_WHOLE_WAVE_FUNC_RETURN (i1 (IMPLICIT_DEF)))>;
+
// Return for returning shaders to a shader variant epilog.
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
(outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
@@ -4300,6 +4326,20 @@ def G_AMDGPU_S_MUL_I64_I32 : AMDGPUGenericInstruction {
let hasSideEffects = 0;
}
+def G_AMDGPU_WHOLE_WAVE_FUNC_SETUP : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$origExec);
+ let InOperandList = (ins);
+ let isConvergent = 1;
+}
+
+def G_AMDGPU_WHOLE_WAVE_FUNC_RETURN : AMDGPUGenericInstruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins type0:$origExec);
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let isReturn = 1;
+}
+
// This is equivalent to the G_INTRINSIC*, but the operands may have
// been legalized depending on the subtarget requirements.
def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 8c2e9b620ad16..f0be204cd9bdb 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -51,7 +51,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
- GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) {
+ GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0),
+ IsWholeWaveFunction(F.getCallingConv() ==
+ CallingConv::AMDGPU_Gfx_WholeWave) {
const GCNSubtarget &ST = *STI;
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
WavesPerEU = ST.getWavesPerEU(F);
@@ -99,7 +101,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
ImplicitArgPtr = false;
} else if (!isEntryFunction()) {
- if (CC != CallingConv::AMDGPU_Gfx)
+ if (CC != CallingConv::AMDGPU_Gfx &&
+ CC != CallingConv::AMDGPU_Gfx_WholeWave)
ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
FrameOffsetReg = AMDGPU::SGPR33;
@@ -732,6 +735,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
+ IsWholeWaveFunction(MFI.isWholeWaveFunction()),
DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()),
ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) {
for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
@@ -778,6 +782,7 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
ReturnsVoid = YamlMFI.ReturnsVoid;
+ IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;
if (YamlMFI.ScavengeFI) {
auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo());
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 274a60adb8d07..08b0206d244fb 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -298,6 +298,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
StringValue LongBranchReservedReg;
bool HasInitWholeWave = false;
+ bool IsWholeWaveFunction = false;
unsigned DynamicVGPRBlockSize = 0;
unsigned ScratchReservedForDynamicVGPRs = 0;
@@ -356,6 +357,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("dynamicVGPRBlockSize", MFI.DynamicVGPRBlockSize, false);
YamlIO.mapOptional("scratchReservedForDynamicVGPRs",
MFI.ScratchReservedForDynamicVGPRs, 0);
+ YamlIO.mapOptional("isWholeWaveFunction", MFI.IsWholeWaveFunction, false);
}
};
@@ -565,6 +567,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
// the serialization easier.
ReservedRegSet WWMReservedRegs;
+ bool IsWholeWaveFunction = false;
+
using PrologEpilogSGPRSpill =
std::pair<Register, PrologEpilogSGPRSaveRestoreInfo>;
// To track the SGPR spill method used for a CSR SGPR register during
@@ -670,6 +674,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
return WWMReservedRegs.contains(Reg);
}
+ bool isWholeWaveFunction() const { return IsWholeWaveFunction; }
+
ArrayRef<PrologEpilogSGPRSpill> getPrologEpilogSGPRSpills() const {
assert(is_sorted(PrologEpilogSGPRSpills, llvm::less_first()));
return PrologEpilogSGPRSpills;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index fa2b8db6ba55a..84cfa878276fd 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -407,6 +407,7 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
: CSR_AMDGPU_SaveList;
case CallingConv::AMDGPU_Gfx:
+ case CallingConv::AMDGPU_Gfx_WholeWave:
return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
: CSR_AMDGPU_SI_Gfx_SaveList;
case CallingConv::AMDGPU_CS_ChainPreserve:
@@ -433,6 +434,7 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
: CSR_AMDGPU_RegMask;
case CallingConv::AMDGPU_Gfx:
+ case CallingConv::AMDGPU_Gfx_WholeWave:
return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
: CSR_AMDGPU_SI_Gfx_RegMask;
case CallingConv::AMDGPU_CS_Chain:
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index c9d2c286bf237..2d344f41ff790 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1423,7 +1423,8 @@ constexpr bool isShader(CallingConv::ID CC) {
LLVM_READNONE
constexpr bool isGraphics(CallingConv::ID CC) {
- return isShader(CC) || CC == CallingConv::AMDGPU_Gfx;
+ return isShader(CC) || CC == CallingConv::AMDGPU_Gfx ||
+ CC == CallingConv::AMDGPU_Gfx_WholeWave;
}
LLVM_READNONE
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index e464470143e52..fd6253daa327a 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -44,6 +44,7 @@ static const char *getStageName(CallingConv::ID CC) {
case CallingConv::AMDGPU_LS:
return ".ls";
case CallingConv::AMDGPU_Gfx:
+ case CallingConv::AMDGPU_Gfx_WholeWave:
llvm_unreachable("Callable shader has no hardware stage");
default:
return ".cs";
diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll
index 9cf3fdbe550b4..0b5ce08c00a23 100644
--- a/llvm/test/Bitcode/compatibility.ll
+++ b/llvm/test/Bitcode/compatibility.ll
@@ -564,6 +564,10 @@ declare riscv_vls_cc(32768) void @riscv_vls_cc_32768()
; CHECK: declare riscv_vls_cc(32768) void @riscv_vls_cc_32768()
declare riscv_vls_cc(65536) void @riscv_vls_cc_65536()
; CHECK: declare riscv_vls_cc(65536) void @riscv_vls_cc_65536()
+declare cc124 void @f.cc124(i1)
+; CHECK: declare amdgpu_gfx_whole_wave void @f.cc124(i1)
+declare amdgpu_gfx_whole_wave void @f.amdgpu_gfx_whole_wave(i1)
+; CHECK: declare amdgpu_gfx_whole_wave void @f.amdgpu_gfx_whole_wave(i1)
declare cc1023 void @f.cc1023()
; CHECK: declare cc1023 void @f.cc1023()
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir
new file mode 100644
index 0000000000000..beca901945753
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir
@@ -0,0 +1,40 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+---
+name: basic_test
+legalized: true
+machineFunctionInfo:
+ isWholeWaveFunction: true
+body: |
+ bb.1:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: basic_test
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:vcc(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY]], [[COPY2]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
+ ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY1]], [[COPY3]]
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), [[SELECT]](s32), [[SELECT1]](s32), 1, 1, 1, 0
+ ; CHECK-NEXT: $vgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0
+ %1:_(s32) = COPY $vgpr0
+ %2:_(s32) = COPY $vgpr1
+ %0:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP
+ %12:_(s32) = G_CONSTANT i32 5
+ %11:_(s32) = G_SELECT %0(s1), %1, %12
+ %14:_(s32) = G_CONSTANT i32 3
+ %13:_(s32) = G_SELECT %0(s1), %2, %14
+ %15:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), %11(s32), %13(s32), 1, 1, 1, 0
+ $vgpr0 = COPY %15(s32)
+ G_AMDGPU_WHOLE_WAVE_FUNC_RETURN %0(s1), implicit $vgpr0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll
new file mode 100644
index 0000000000000..b68786b579dd2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll
@@ -0,0 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -stop-after=irtranslator -verify-machineinstrs < %s | FileCheck %s
+
+define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) {
+ ; CHECK-LABEL: name: basic_test
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY]], [[C]]
+ ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY1]], [[C1]]
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), [[SELECT]](s32), [[SELECT1]](s32), 1, 1, 1, 0
+ ; CHECK-NEXT: $vgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
+ ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0
+ %x = select i1 %active, i32 %a, i32 5
+ %y = select i1 %active, i32 %b, i32 3
+ %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false)
+ ret i32 %ret
+}
+
+; Make sure we don't crash if %active is not used at all.
+define amdgpu_gfx_whole_wave i32 @unused_active(i1 %active, i32 %a, i32 %b) {
+ ; CHECK-LABEL: name: unused_active
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 14
+ ; CHECK-NEXT: $vgpr0 = COPY [[C]](s32)
+ ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0
+ ret i32 14
+}
+
+define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) {
+ ; CHECK-LABEL: name: multiple_blocks
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
+ ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s1), [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), [[ICMP]](s1)
+ ; CHECK-NEXT: G_BRCOND [[INT]](s1), %bb.2
+ ; CHECK-NEXT: G_BR %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2.if.then:
+ ; CHECK-NEXT: successors: %bb.3(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3.if.end:
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[COPY1]](s32), %bb.1, [[ADD]](s32), %bb.2
+ ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT1]](s32)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY]], [[PHI]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[SELECT]](s32)
+ ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0
+ %c = icmp eq i32 %a, %b
+ br i1 %c, label %if.then, label %if.end
+
+if.then: ; preds = %0
+ %d = add i32 %a, %b
+ br label %if.end
+
+if.end:
+ %f = phi i32 [ %d, %if.then ], [ %b, %0 ]
+ %e = select i1 %active, i32 %a, i32 %f
+ ret i32 %e
+}
+
+define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
+ ; CHECK-LABEL: name: ret_64
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 5
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[MV]], [[C]]
+ ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[MV1]], [[C1]]
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), [[SELECT]](s64), [[SELECT1]](s64), 1, 1, 1, 0
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INTRINSIC_CONVERGENT]](s64)
+ ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0, implicit $vgpr1
+ %x = select i1 %active, i64 %a, i64 5
+ %y = select i1 %active, i64 %b, i64 3
+ %ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false)
+ ret i64 %ret
+}
diff --git a/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll
new file mode 100644
index 0000000000000..3450d63ff7b4a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll
@@ -0,0 +1,191 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -stop-after=finalize-isel < %s | FileCheck --check-prefix=DAGISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -stop-after=finalize-isel < %s | FileCheck --check-prefix=GISEL %s
+
+define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) {
+ ; DAGISEL-LABEL: name: basic_test
+ ; DAGISEL: bb.0 (%ir-block.0):
+ ; DAGISEL-NEXT: liveins: $vgpr0, $vgpr1
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ ; DAGISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5
+ ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_]], 0, [[COPY1]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+ ; DAGISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 3
+ ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_1]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+ ; DAGISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], killed [[V_CNDMASK_B32_e64_1]], 1, 1, 1, 0, implicit $exec
+ ; DAGISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]]
+ ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0
+ ;
+ ; GISEL-LABEL: name: basic_test
+ ; GISEL: bb.1 (%ir-block.0):
+ ; GISEL-NEXT: liveins: $vgpr0, $vgpr1
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ ; GISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5
+ ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY2]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+ ; GISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 3
+ ; GISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; GISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY3]], 0, [[COPY1]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+ ; GISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], [[V_CNDMASK_B32_e64_1]], 1, 1, 1, 0, implicit $exec
+ ; GISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]]
+ ; GISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0
+ %x = select i1 %active, i32 %a, i32 5
+ %y = select i1 %active, i32 %b, i32 3
+ %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false)
+ ret i32 %ret
+}
+
+; Make sure we don't crash if %active is not used at all.
+define amdgpu_gfx_whole_wave i32 @unused_active(i1 %active, i32 %a, i32 %b) {
+ ; DAGISEL-LABEL: name: unused_active
+ ; DAGISEL: bb.0 (%ir-block.0):
+ ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ ; DAGISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 14, implicit $exec
+ ; DAGISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_]]
+ ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0
+ ;
+ ; GISEL-LABEL: name: unused_active
+ ; GISEL: bb.1 (%ir-block.0):
+ ; GISEL-NEXT: liveins: $vgpr0, $vgpr1
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ ; GISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 14
+ ; GISEL-NEXT: $vgpr0 = COPY [[S_MOV_B32_]]
+ ; GISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0
+ ret i32 14
+}
+
+define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) {
+ ; DAGISEL-LABEL: name: multiple_blocks
+ ; DAGISEL: bb.0 (%ir-block.0):
+ ; DAGISEL-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; DAGISEL-NEXT: liveins: $vgpr0, $vgpr1
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ ; DAGISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[SI_WHOLE_WAVE_FUNC_SETUP]]
+ ; DAGISEL-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY1]], [[COPY]], implicit $exec
+ ; DAGISEL-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; DAGISEL-NEXT: S_BRANCH %bb.1
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: bb.1.if.then:
+ ; DAGISEL-NEXT: successors: %bb.2(0x80000000)
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], [[COPY]], 0, implicit $exec
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: bb.2.if.end:
+ ; DAGISEL-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, [[V_ADD_U32_e64_]], %bb.1
+ ; DAGISEL-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; DAGISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]]
+ ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[PHI]], 0, [[COPY1]], [[COPY3]], implicit $exec
+ ; DAGISEL-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
+ ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0
+ ;
+ ; GISEL-LABEL: name: multiple_blocks
+ ; GISEL: bb.1 (%ir-block.0):
+ ; GISEL-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GISEL-NEXT: liveins: $vgpr0, $vgpr1
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ ; GISEL-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GISEL-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GISEL-NEXT: S_BRANCH %bb.2
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: bb.2.if.then:
+ ; GISEL-NEXT: successors: %bb.3(0x80000000)
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: bb.3.if.end:
+ ; GISEL-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.1, [[V_ADD_U32_e64_]], %bb.2
+ ; GISEL-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[PHI]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+ ; GISEL-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
+ ; GISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0
+ %c = icmp eq i32 %a, %b
+ br i1 %c, label %if.then, label %if.end
+
+if.then: ; preds = %0
+ %d = add i32 %a, %b
+ br label %if.end
+
+if.end:
+ %f = phi i32 [ %d, %if.then ], [ %b, %0 ]
+ %e = select i1 %active, i32 %a, i32 %f
+ ret i32 %e
+}
+
+define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
+ ; DAGISEL-LABEL: name: ret_64
+ ; DAGISEL: bb.0 (%ir-block.0):
+ ; DAGISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; DAGISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; DAGISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; DAGISEL-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ ; DAGISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[SI_WHOLE_WAVE_FUNC_SETUP]]
+ ; DAGISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+ ; DAGISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_]], 0, killed [[COPY5]], [[COPY4]], implicit $exec
+ ; DAGISEL-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+ ; DAGISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5
+ ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_1]], 0, killed [[COPY6]], [[COPY4]], implicit $exec
+ ; DAGISEL-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+ ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_]], 0, killed [[COPY7]], [[COPY4]], implicit $exec
+ ; DAGISEL-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+ ; DAGISEL-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 3
+ ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_2]], 0, killed [[COPY8]], [[COPY4]], implicit $exec
+ ; DAGISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_1]], killed [[V_CNDMASK_B32_e64_3]], 1, 1, 1, 0, implicit $exec
+ ; DAGISEL-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], killed [[V_CNDMASK_B32_e64_2]], 1, 1, 1, 0, implicit $exec
+ ; DAGISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]]
+ ; DAGISEL-NEXT: $vgpr1 = COPY [[V_MOV_B32_dpp1]]
+ ; DAGISEL-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GISEL-LABEL: name: ret_64
+ ; GISEL: bb.1 (%ir-block.0):
+ ; GISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ ; GISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
+ ; GISEL-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+ ; GISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_1]], 0, [[COPY1]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+ ; GISEL-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
+ ; GISEL-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GISEL-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_2]], 0, [[COPY2]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+ ; GISEL-NEXT: [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_3]], 0, [[COPY3]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec
+ ; GISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], [[V_CNDMASK_B32_e64_2]], 1, 1, 1, 0, implicit $exec
+ ; GISEL-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_1]], [[V_CNDMASK_B32_e64_3]], 1, 1, 1, 0, implicit $exec
+ ; GISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]]
+ ; GISEL-NEXT: $vgpr1 = COPY [[V_MOV_B32_dpp1]]
+ ; GISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0, implicit $vgpr1
+ %x = select i1 %active, i64 %a, i64 5
+ %y = select i1 %active, i64 %b, i64 3
+ %ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false)
+ ret i64 %ret
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir b/llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir
new file mode 100644
index 0000000000000..93f489170cea0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir
@@ -0,0 +1,448 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=prologepilog -o - %s | FileCheck %s
+
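+# v0 is not a callee-saved register, so only its inactive lanes need to be
+# saved and restored around the body of the function.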
+---
+name: save_inactive_lanes_non_csr_vgpr
+alignment: 1
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+tracksDebugUserValues: true
+frameInfo:
+ maxAlignment: 1
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ maxKernArgAlign: 1
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+ returnsVoid: false
+ occupancy: 16
+ sgprForEXECCopy: '$sgpr105'
+ isWholeWaveFunction: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: save_inactive_lanes_non_csr_vgpr
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1
+ ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 14, implicit $exec
+ ; CHECK-NEXT: $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc
+ ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0
+ ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0
+ renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ $vgpr0 = V_MOV_B32_e32 14, implicit $exec
+ SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0
+
+...
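+# v40 is a callee-saved register, so all of its lanes need to be saved and
+# restored (note the S_OR_SAVEEXEC, which enables all lanes for the spill).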
+---
+name: save_all_lanes_csr_vgpr
+alignment: 1
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+tracksDebugUserValues: true
+frameInfo:
+ maxAlignment: 1
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ maxKernArgAlign: 1
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+ returnsVoid: false
+ occupancy: 16
+ sgprForEXECCopy: '$sgpr105'
+ isWholeWaveFunction: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: save_all_lanes_csr_vgpr
+ ; CHECK: liveins: $vgpr40
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $sgpr0 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: $vgpr40 = V_MOV_B32_e32 14, implicit $exec
+ ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0
+ ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0
+ renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ $vgpr40 = V_MOV_B32_e32 14, implicit $exec
+ SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0
+
+...
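+# The CSR $sgpr20 is spilled to a lane of $vgpr192, which is not callee-saved,
+# so only the inactive lanes of $vgpr192 need to be saved and restored.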
+---
+name: save_csr_sgpr_to_non_csr_vgpr
+alignment: 1
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+tracksDebugUserValues: true
+frameInfo:
+ maxAlignment: 1
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ maxKernArgAlign: 1
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+ returnsVoid: false
+ occupancy: 16
+ sgprForEXECCopy: '$sgpr105'
+ isWholeWaveFunction: true
+body: |
+ bb.0:
+ liveins: $sgpr20, $vgpr191
+ ; CHECK-LABEL: name: save_csr_sgpr_to_non_csr_vgpr
+ ; CHECK: liveins: $sgpr20, $vgpr191, $vgpr192
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr192, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1
+ ; CHECK-NEXT: $vgpr192 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr192
+ ; CHECK-NEXT: $sgpr20 = S_MOV_B32 14, implicit $exec
+ ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr192, 0
+ ; CHECK-NEXT: $exec_lo = S_XOR_B32 $vcc_lo, -1, implicit-def $scc
+ ; CHECK-NEXT: $vgpr192 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo
+ ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo
+ $vgpr192 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr192
+ renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ $sgpr20 = S_MOV_B32 14, implicit $exec
+ $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr192, 0
+ SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo
+
+...
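+# The CSR $sgpr20 is spilled to a lane of the CSR $vgpr191, so all lanes of
+# $vgpr191 need to be saved and restored.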
+---
+name: save_csr_sgpr_to_csr_vgpr
+alignment: 1
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+tracksDebugUserValues: true
+frameInfo:
+ maxAlignment: 1
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ maxKernArgAlign: 1
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+ returnsVoid: false
+ occupancy: 16
+ sgprForEXECCopy: '$sgpr105'
+ isWholeWaveFunction: true
+body: |
+ bb.0:
+ liveins: $sgpr20, $vgpr191
+ ; CHECK-LABEL: name: save_csr_sgpr_to_csr_vgpr
+ ; CHECK: liveins: $sgpr20, $vgpr191
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $vcc_lo = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr191, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191
+ ; CHECK-NEXT: $sgpr20 = S_MOV_B32 14, implicit $exec
+ ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0
+ ; CHECK-NEXT: $vgpr191 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo
+ ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo
+ $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191
+ renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ $sgpr20 = S_MOV_B32 14, implicit $exec
+ $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0
+ SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo
+
+...
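+# Save the inactive lanes of the non-CSR VGPRs (before switching EXEC to -1)
+# and all lanes of the CSR $vgpr40 (after the switch).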
+---
+name: vgpr_and_sgpr_csr
+alignment: 1
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+tracksDebugUserValues: true
+liveins:
+ - { reg: '$vgpr0' }
+ - { reg: '$vgpr1' }
+frameInfo:
+ maxAlignment: 4
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ maxKernArgAlign: 1
+ hasSpilledSGPRs: true
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+ returnsVoid: false
+ occupancy: 16
+ spillPhysVGPRs:
+ - '$vgpr191'
+ wwmReservedRegs:
+ - '$vgpr191'
+ isWholeWaveFunction: true
+body: |
+ bb.0:
+ liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr191
+
+ ; CHECK-LABEL: name: vgpr_and_sgpr_csr
+ ; CHECK: liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr40, $vgpr49
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr49, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+ ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr0
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40
+ ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
+ ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_XOR_B32 $vcc_lo, -1, implicit-def $scc
+ ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+ ; CHECK-NEXT: $vgpr49 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo
+ ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo
+ $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191
+ renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20
+ S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40
+ $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0
+ SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo
+
+...
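+# The original EXEC mask, produced by SI_WHOLE_WAVE_FUNC_SETUP in $vcc_lo, is
+# copied into $sgpr3 in the middle of the function, so the epilogue has to use
+# $sgpr3 when restoring EXEC.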
+---
+name: split_orig_exec
+alignment: 1
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+tracksDebugUserValues: true
+liveins:
+ - { reg: '$vgpr0' }
+ - { reg: '$vgpr1' }
+frameInfo:
+ maxAlignment: 4
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ maxKernArgAlign: 1
+ hasSpilledSGPRs: true
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+ returnsVoid: false
+ occupancy: 16
+ spillPhysVGPRs:
+ - '$vgpr191'
+ wwmReservedRegs:
+ - '$vgpr191'
+ isWholeWaveFunction: true
+body: |
+ bb.0:
+ liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr191
+
+ ; CHECK-LABEL: name: split_orig_exec
+ ; CHECK: liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr40, $vgpr49
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr49, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+ ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr0
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20
+ ; CHECK-NEXT: $sgpr3 = COPY $vcc_lo
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40
+ ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
+ ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_XOR_B32 $sgpr3, -1, implicit-def $scc
+ ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
+ ; CHECK-NEXT: $vgpr49 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr3
+ ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr3
+ $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191
+ renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20
+ $sgpr3 = COPY $vcc_lo
+ S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40
+ $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0
+ SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr3
+
+...
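+# Clobbered VGPR tuples are saved and restored one 32-bit component at a time.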
+---
+name: vgpr_superregs
+alignment: 1
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+tracksDebugUserValues: true
+frameInfo:
+ maxAlignment: 1
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ maxKernArgAlign: 1
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+ returnsVoid: false
+ occupancy: 16
+ sgprForEXECCopy: '$sgpr105'
+ isWholeWaveFunction: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: vgpr_superregs
+ ; CHECK: liveins: $vgpr0, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr40, $vgpr41, $vgpr42
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5)
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5)
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.4, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.5, addrspace 5)
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr41, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.6, addrspace 5)
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr42, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.7, addrspace 5)
+ ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 14, implicit $exec
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr40_vgpr41_vgpr42
+ ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.5, addrspace 5)
+ ; CHECK-NEXT: $vgpr41 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.6, addrspace 5)
+ ; CHECK-NEXT: $vgpr42 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.7, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc
+ ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
+ ; CHECK-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
+ ; CHECK-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5)
+ ; CHECK-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5)
+ ; CHECK-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.4, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0
+ ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0
+ renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ $vgpr0 = V_MOV_B32_e32 14, implicit $exec
+ S_NOP 0, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr40_vgpr41_vgpr42
+ SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0
+
+...
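+# VGPRs that are only read, and never written, don't need to be saved or
+# restored.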
+---
+name: dont_restore_used_vgprs
+alignment: 1
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+tracksDebugUserValues: true
+liveins:
+ - { reg: '$vgpr0' }
+ - { reg: '$vgpr20' }
+ - { reg: '$vgpr40' }
+frameInfo:
+ maxAlignment: 1
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ maxKernArgAlign: 1
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+ returnsVoid: false
+ occupancy: 16
+ sgprForEXECCopy: '$sgpr105'
+ isWholeWaveFunction: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr20, $vgpr40
+
+ ; CHECK-LABEL: name: dont_restore_used_vgprs
+ ; CHECK: liveins: $vgpr0, $vgpr20, $vgpr40
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_NOP 0, implicit $vgpr0, implicit $vgpr20, implicit $vgpr40
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0
+ ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0
+ renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ S_NOP 0, implicit $vgpr0, implicit $vgpr20, implicit $vgpr40
+ SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0
+
+...
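+# The prologue goes into the entry block and the epilogue into the block
+# containing SI_WHOLE_WAVE_FUNC_RETURN, with the original EXEC mask kept live
+# in $vcc_lo across the control flow.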
+---
+name: multiple_blocks
+alignment: 1
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+tracksDebugUserValues: true
+liveins:
+ - { reg: '$vgpr0' }
+ - { reg: '$vgpr1' }
+frameInfo:
+ maxAlignment: 1
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ maxKernArgAlign: 1
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+ returnsVoid: false
+ occupancy: 16
+ sgprForEXECCopy: '$sgpr105'
+ isWholeWaveFunction: true
+body: |
+ ; CHECK-LABEL: name: multiple_blocks
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1
+ ; CHECK-NEXT: $sgpr1 = S_MOV_B32 $exec_lo
+ ; CHECK-NEXT: V_CMPX_EQ_U32_nosdst_e64 $vgpr0, $vgpr1, implicit-def $exec, implicit $exec
+ ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $vgpr1 = V_ADD_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, killed renamable $sgpr1, implicit-def $scc
+ ; CHECK-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr0, $vcc_lo, implicit $exec
+ ; CHECK-NEXT: $exec_lo = S_XOR_B32 $vcc_lo, -1, implicit-def $scc
+ ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
+ ; CHECK-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo
+ ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo, implicit $vgpr0
+ bb.0:
+ successors: %bb.1, %bb.2
+ liveins: $vgpr0, $vgpr1
+
+ renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
+ $sgpr1 = S_MOV_B32 $exec_lo
+ V_CMPX_EQ_U32_nosdst_e64 $vgpr0, $vgpr1, implicit-def $exec, implicit $exec
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1
+
+ renamable $vgpr1 = V_ADD_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec
+
+ bb.2:
+ liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1
+
+ $exec_lo = S_OR_B32 $exec_lo, killed renamable $sgpr1, implicit-def $scc
+ renamable $vgpr0 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr0, $vcc_lo, implicit $exec
+ SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo, implicit $vgpr0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
new file mode 100644
index 0000000000000..53d02925fb1c2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
@@ -0,0 +1,2414 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL64 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL64 %s
+
+; Make sure the i1 %active is passed through EXEC.
+; The EXEC mask should be set to -1 for the duration of the function
+; and restored to its original value in the epilogue.
+; We will also need to restore the inactive lanes of any allocated VGPRs.
+define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: basic_test:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: basic_test:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_store_b32 off, v0, s32
+; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_load_b32 v0, off, s32
+; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: basic_test:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; DAGISEL64-NEXT: s_clause 0x1
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc
+; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; DAGISEL64-NEXT: s_clause 0x1
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL64-NEXT: s_mov_b64 exec, vcc
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: basic_test:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; GISEL64-NEXT: s_clause 0x1
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32
+; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
+; GISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc
+; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; GISEL64-NEXT: s_clause 0x1
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32
+; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL64-NEXT: s_mov_b64 exec, vcc
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ %x = select i1 %active, i32 %a, i32 5
+ %y = select i1 %active, i32 %b, i32 3
+ %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false)
+ ret i32 %ret
+}
+
+; Make sure we don't crash if there's only one use of %active.
+define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: single_use_of_active:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc_lo
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: single_use_of_active:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_store_b32 off, v0, s32
+; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc_lo
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_load_b32 v0, off, s32
+; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: single_use_of_active:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; DAGISEL64-NEXT: s_clause 0x1
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc
+; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; DAGISEL64-NEXT: s_clause 0x1
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL64-NEXT: s_mov_b64 exec, vcc
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: single_use_of_active:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; GISEL64-NEXT: s_clause 0x1
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32
+; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc
+; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; GISEL64-NEXT: s_clause 0x1
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32
+; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL64-NEXT: s_mov_b64 exec, vcc
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ %y = select i1 %active, i32 %b, i32 17
+ %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %a, i32 %y, i32 1, i32 1, i32 1, i1 false)
+ ret i32 %ret
+}
+
+; Make sure we don't crash if %active is not used at all.
+define amdgpu_gfx_whole_wave i32 @unused_active(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: unused_active:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: v_mov_b32_e32 v0, 14
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: unused_active:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: v_mov_b32_e32 v0, 14
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_xor_b32 exec_lo, s0, -1
+; GISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: unused_active:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: v_mov_b32_e32 v0, 14
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1]
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: unused_active:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: v_mov_b32_e32 v0, 14
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: s_xor_b64 exec, s[0:1], -1
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GISEL64-NEXT: s_mov_b64 exec, s[0:1]
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ ret i32 14
+}
+
+; For any used VGPRs (including those used for SGPR spills), we need to restore the inactive lanes.
+; For CSR VGPRs, we need to restore all lanes.
+define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: csr:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x3
+; DAGISEL-NEXT: scratch_store_b32 off, v2, s32
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 offset:4
+; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:8
+; DAGISEL-NEXT: scratch_store_b32 off, v49, s32 offset:16
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill
+; DAGISEL-NEXT: ;;#ASMSTART
+; DAGISEL-NEXT: ; clobber CSR
+; DAGISEL-NEXT: ;;#ASMEND
+; DAGISEL-NEXT: v_writelane_b32 v2, s20, 0
+; DAGISEL-NEXT: ;;#ASMSTART
+; DAGISEL-NEXT: ; clobber non-CSR
+; DAGISEL-NEXT: ;;#ASMEND
+; DAGISEL-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1
+; DAGISEL-NEXT: v_readlane_b32 s20, v2, 0
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x3
+; DAGISEL-NEXT: scratch_load_b32 v2, off, s32
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 offset:4
+; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:8
+; DAGISEL-NEXT: scratch_load_b32 v49, off, s32 offset:16
+; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_wait_alu 0xf1ff
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: csr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; GISEL-NEXT: s_clause 0x3
+; GISEL-NEXT: scratch_store_b32 off, v2, s32
+; GISEL-NEXT: scratch_store_b32 off, v0, s32 offset:4
+; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:8
+; GISEL-NEXT: scratch_store_b32 off, v49, s32 offset:16
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill
+; GISEL-NEXT: ;;#ASMSTART
+; GISEL-NEXT: ; clobber CSR
+; GISEL-NEXT: ;;#ASMEND
+; GISEL-NEXT: v_writelane_b32 v2, s20, 0
+; GISEL-NEXT: ;;#ASMSTART
+; GISEL-NEXT: ; clobber non-CSR
+; GISEL-NEXT: ;;#ASMEND
+; GISEL-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1
+; GISEL-NEXT: v_readlane_b32 s20, v2, 0
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; GISEL-NEXT: s_clause 0x3
+; GISEL-NEXT: scratch_load_b32 v2, off, s32
+; GISEL-NEXT: scratch_load_b32 v0, off, s32 offset:4
+; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:8
+; GISEL-NEXT: scratch_load_b32 v49, off, s32 offset:16
+; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_wait_alu 0xf1ff
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: csr:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; DAGISEL64-NEXT: s_clause 0x3
+; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 offset:4
+; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:8
+; DAGISEL64-NEXT: scratch_store_b32 off, v49, s32 offset:16
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill
+; DAGISEL64-NEXT: ;;#ASMSTART
+; DAGISEL64-NEXT: ; clobber CSR
+; DAGISEL64-NEXT: ;;#ASMEND
+; DAGISEL64-NEXT: v_writelane_b32 v2, s20, 0
+; DAGISEL64-NEXT: ;;#ASMSTART
+; DAGISEL64-NEXT: ; clobber non-CSR
+; DAGISEL64-NEXT: ;;#ASMEND
+; DAGISEL64-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc
+; DAGISEL64-NEXT: v_readlane_b32 s20, v2, 0
+; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; DAGISEL64-NEXT: s_clause 0x3
+; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 offset:4
+; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:8
+; DAGISEL64-NEXT: scratch_load_b32 v49, off, s32 offset:16
+; DAGISEL64-NEXT: s_mov_b64 exec, vcc
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_wait_alu 0xf1ff
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: csr:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; GISEL64-NEXT: s_clause 0x3
+; GISEL64-NEXT: scratch_store_b32 off, v2, s32
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32 offset:4
+; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:8
+; GISEL64-NEXT: scratch_store_b32 off, v49, s32 offset:16
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill
+; GISEL64-NEXT: ;;#ASMSTART
+; GISEL64-NEXT: ; clobber CSR
+; GISEL64-NEXT: ;;#ASMEND
+; GISEL64-NEXT: v_writelane_b32 v2, s20, 0
+; GISEL64-NEXT: ;;#ASMSTART
+; GISEL64-NEXT: ; clobber non-CSR
+; GISEL64-NEXT: ;;#ASMEND
+; GISEL64-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
+; GISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc
+; GISEL64-NEXT: v_readlane_b32 s20, v2, 0
+; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; GISEL64-NEXT: s_clause 0x3
+; GISEL64-NEXT: scratch_load_b32 v2, off, s32
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32 offset:4
+; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:8
+; GISEL64-NEXT: scratch_load_b32 v49, off, s32 offset:16
+; GISEL64-NEXT: s_mov_b64 exec, vcc
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_wait_alu 0xf1ff
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ %x = select i1 %active, i32 %a, i32 5
+ %y = select i1 %active, i32 %b, i32 3
+ call void asm sideeffect "; clobber CSR", "~{v40},~{s48}"()
+ call void asm sideeffect "; clobber non-CSR", "~{v49},~{s20}"()
+ %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false)
+ ret i32 %ret
+}
+
+; Save and restore all lanes of v40.
+define amdgpu_gfx_whole_wave void @csr_vgpr_only(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: csr_vgpr_only:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_or_saveexec_b32 s0, -1
+; DAGISEL-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; DAGISEL-NEXT: ;;#ASMSTART
+; DAGISEL-NEXT: ; clobber CSR VGPR
+; DAGISEL-NEXT: ;;#ASMEND
+; DAGISEL-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: csr_vgpr_only:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GISEL-NEXT: ;;#ASMSTART
+; GISEL-NEXT: ; clobber CSR VGPR
+; GISEL-NEXT: ;;#ASMEND
+; GISEL-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: csr_vgpr_only:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; DAGISEL64-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; DAGISEL64-NEXT: ;;#ASMSTART
+; DAGISEL64-NEXT: ; clobber CSR VGPR
+; DAGISEL64-NEXT: ;;#ASMEND
+; DAGISEL64-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1]
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: csr_vgpr_only:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GISEL64-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GISEL64-NEXT: ;;#ASMSTART
+; GISEL64-NEXT: ; clobber CSR VGPR
+; GISEL64-NEXT: ;;#ASMEND
+; GISEL64-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: s_mov_b64 exec, s[0:1]
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ call void asm sideeffect "; clobber CSR VGPR", "~{v40}"()
+ ret void
+}
+
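+; The clobbered CSR SGPR is spilled to a lane of a scratch VGPR, whose
+; inactive lanes must then be saved and restored.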
+define amdgpu_gfx_whole_wave void @sgpr_spill_only(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: sgpr_spill_only:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: v_writelane_b32 v0, s68, 0
+; DAGISEL-NEXT: ;;#ASMSTART
+; DAGISEL-NEXT: ; clobber CSR SGPR
+; DAGISEL-NEXT: ;;#ASMEND
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT: v_readlane_b32 s68, v0, 0
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: sgpr_spill_only:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; GISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: v_writelane_b32 v0, s68, 0
+; GISEL-NEXT: ;;#ASMSTART
+; GISEL-NEXT: ; clobber CSR SGPR
+; GISEL-NEXT: ;;#ASMEND
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_readlane_b32 s68, v0, 0
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_xor_b32 exec_lo, s0, -1
+; GISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GISEL-NEXT: s_mov_b32 exec_lo, s0
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: sgpr_spill_only:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: v_writelane_b32 v0, s68, 0
+; DAGISEL64-NEXT: ;;#ASMSTART
+; DAGISEL64-NEXT: ; clobber CSR SGPR
+; DAGISEL64-NEXT: ;;#ASMEND
+; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL64-NEXT: v_readlane_b32 s68, v0, 0
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1]
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: sgpr_spill_only:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: v_writelane_b32 v0, s68, 0
+; GISEL64-NEXT: ;;#ASMSTART
+; GISEL64-NEXT: ; clobber CSR SGPR
+; GISEL64-NEXT: ;;#ASMEND
+; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL64-NEXT: v_readlane_b32 s68, v0, 0
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: s_xor_b64 exec, s[0:1], -1
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GISEL64-NEXT: s_mov_b64 exec, s[0:1]
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ call void asm sideeffect "; clobber CSR SGPR", "~{s68}"()
+ ret void
+}
+
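+; The EXEC manipulations in the prologue and epilogue must not interfere with
+; the EXEC manipulations used for the control flow.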
+define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) {
+; DAGISEL-LABEL: multiple_blocks:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; DAGISEL-NEXT: s_mov_b32 s1, exec_lo
+; DAGISEL-NEXT: v_cmpx_eq_u32_e64 v0, v1
+; DAGISEL-NEXT: ; %bb.1: ; %if.then
+; DAGISEL-NEXT: v_add_nc_u32_e32 v1, v0, v1
+; DAGISEL-NEXT: ; %bb.2: ; %if.end
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: multiple_blocks:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_store_b32 off, v0, s32
+; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GISEL-NEXT: v_cmpx_eq_u32_e64 v0, v1
+; GISEL-NEXT: ; %bb.1: ; %if.then
+; GISEL-NEXT: v_add_nc_u32_e32 v1, v0, v1
+; GISEL-NEXT: ; %bb.2: ; %if.end
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_load_b32 v0, off, s32
+; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: multiple_blocks:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; DAGISEL64-NEXT: s_clause 0x1
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; DAGISEL64-NEXT: s_mov_b64 s[2:3], exec
+; DAGISEL64-NEXT: v_cmpx_eq_u32_e64 v0, v1
+; DAGISEL64-NEXT: ; %bb.1: ; %if.then
+; DAGISEL64-NEXT: v_add_nc_u32_e32 v1, v0, v1
+; DAGISEL64-NEXT: ; %bb.2: ; %if.end
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_or_b64 exec, exec, s[2:3]
+; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; DAGISEL64-NEXT: s_clause 0x1
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL64-NEXT: s_mov_b64 exec, vcc
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: multiple_blocks:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; GISEL64-NEXT: s_clause 0x1
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32
+; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL64-NEXT: s_mov_b64 s[2:3], exec
+; GISEL64-NEXT: v_cmpx_eq_u32_e64 v0, v1
+; GISEL64-NEXT: ; %bb.1: ; %if.then
+; GISEL64-NEXT: v_add_nc_u32_e32 v1, v0, v1
+; GISEL64-NEXT: ; %bb.2: ; %if.end
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL64-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; GISEL64-NEXT: s_clause 0x1
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32
+; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL64-NEXT: s_mov_b64 exec, vcc
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ %c = icmp eq i32 %a, %b
+ br i1 %c, label %if.then, label %if.end
+
+if.then: ; preds = %0
+ %d = add i32 %a, %b
+ br label %if.end
+
+if.end:
+ %f = phi i32 [ %d, %if.then ], [ %b, %0 ]
+ %e = select i1 %active, i32 %a, i32 %f
+ ret i32 %e
+}
+
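+; Test a 64-bit return value, which is returned in v0 and v1.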
+define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
+; DAGISEL-LABEL: ret_64:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x3
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; DAGISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: v_dual_cndmask_b32 v1, 0, v1 :: v_dual_cndmask_b32 v0, 5, v0
+; DAGISEL-NEXT: v_dual_cndmask_b32 v2, 3, v2 :: v_dual_cndmask_b32 v3, 0, v3
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; DAGISEL-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; DAGISEL-NEXT: s_clause 0x3
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; DAGISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: ret_64:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1
+; GISEL-NEXT: s_clause 0x3
+; GISEL-NEXT: scratch_store_b32 off, v0, s32
+; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; GISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 0, v1
+; GISEL-NEXT: v_dual_cndmask_b32 v2, 3, v2 :: v_dual_cndmask_b32 v3, 0, v3
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1
+; GISEL-NEXT: s_clause 0x3
+; GISEL-NEXT: scratch_load_b32 v0, off, s32
+; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; GISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: ret_64:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; DAGISEL64-NEXT: s_clause 0x3
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; DAGISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v2, 3, v2, vcc
+; DAGISEL64-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; DAGISEL64-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL64-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; DAGISEL64-NEXT: s_clause 0x3
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; DAGISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; DAGISEL64-NEXT: s_mov_b64 exec, vcc
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: ret_64:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1
+; GISEL64-NEXT: s_clause 0x3
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32
+; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; GISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
+; GISEL64-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GISEL64-NEXT: v_cndmask_b32_e32 v2, 3, v2, vcc
+; GISEL64-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL64-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL64-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GISEL64-NEXT: s_xor_b64 exec, vcc, -1
+; GISEL64-NEXT: s_clause 0x3
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32
+; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; GISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; GISEL64-NEXT: s_mov_b64 exec, vcc
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ %x = select i1 %active, i64 %a, i64 5
+ %y = select i1 %active, i64 %b, i64 3
+ %ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false)
+ ret i64 %ret
+}
+
+define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i32> inreg %v4i32, float inreg %float, ptr addrspace(5) inreg %ptr, ptr addrspace(5) inreg %ptr2) {
+; DAGISEL-LABEL: inreg_args:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; DAGISEL-NEXT: s_clause 0x5
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; DAGISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16
+; DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s9
+; DAGISEL-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; DAGISEL-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; DAGISEL-NEXT: scratch_store_b32 off, v4, s10
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_store_b128 off, v[0:3], s11
+; DAGISEL-NEXT: scratch_store_b32 off, v5, s11
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1
+; DAGISEL-NEXT: s_clause 0x5
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; DAGISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; DAGISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16
+; DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s0
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: inreg_args:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_xor_saveexec_b32 s34, -1
+; GISEL-NEXT: s_clause 0x5
+; GISEL-NEXT: scratch_store_b32 off, v0, s32
+; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; GISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; GISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16
+; GISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: s_mov_b32 s0, s5
+; GISEL-NEXT: s_mov_b32 s1, s6
+; GISEL-NEXT: s_mov_b32 s2, s7
+; GISEL-NEXT: s_mov_b32 s3, s8
+; GISEL-NEXT: v_mov_b32_e32 v4, s4
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GISEL-NEXT: v_mov_b32_e32 v5, s9
+; GISEL-NEXT: scratch_store_b32 off, v4, s10
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_store_b128 off, v[0:3], s11
+; GISEL-NEXT: scratch_store_b32 off, v5, s11
+; GISEL-NEXT: s_xor_b32 exec_lo, s34, -1
+; GISEL-NEXT: s_clause 0x5
+; GISEL-NEXT: scratch_load_b32 v0, off, s32
+; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; GISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; GISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16
+; GISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20
+; GISEL-NEXT: s_mov_b32 exec_lo, s34
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: inreg_args:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; DAGISEL64-NEXT: s_clause 0x5
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; DAGISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; DAGISEL64-NEXT: scratch_store_b32 off, v4, s32 offset:16
+; DAGISEL64-NEXT: scratch_store_b32 off, v5, s32 offset:20
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: v_mov_b32_e32 v4, s4
+; DAGISEL64-NEXT: v_mov_b32_e32 v0, s5
+; DAGISEL64-NEXT: v_mov_b32_e32 v1, s6
+; DAGISEL64-NEXT: v_mov_b32_e32 v2, s7
+; DAGISEL64-NEXT: v_mov_b32_e32 v3, s8
+; DAGISEL64-NEXT: v_mov_b32_e32 v5, s9
+; DAGISEL64-NEXT: scratch_store_b32 off, v4, s10
+; DAGISEL64-NEXT: s_clause 0x1
+; DAGISEL64-NEXT: scratch_store_b128 off, v[0:3], s11
+; DAGISEL64-NEXT: scratch_store_b32 off, v5, s11
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1
+; DAGISEL64-NEXT: s_clause 0x5
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32
+; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; DAGISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; DAGISEL64-NEXT: scratch_load_b32 v4, off, s32 offset:16
+; DAGISEL64-NEXT: scratch_load_b32 v5, off, s32 offset:20
+; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1]
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: inreg_args:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_xor_saveexec_b64 s[34:35], -1
+; GISEL64-NEXT: s_clause 0x5
+; GISEL64-NEXT: scratch_store_b32 off, v0, s32
+; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; GISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; GISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; GISEL64-NEXT: scratch_store_b32 off, v4, s32 offset:16
+; GISEL64-NEXT: scratch_store_b32 off, v5, s32 offset:20
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: s_mov_b32 s0, s5
+; GISEL64-NEXT: s_mov_b32 s1, s6
+; GISEL64-NEXT: s_mov_b32 s2, s7
+; GISEL64-NEXT: s_mov_b32 s3, s8
+; GISEL64-NEXT: v_mov_b32_e32 v4, s4
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: v_mov_b32_e32 v0, s0
+; GISEL64-NEXT: v_mov_b32_e32 v1, s1
+; GISEL64-NEXT: v_mov_b32_e32 v2, s2
+; GISEL64-NEXT: v_mov_b32_e32 v3, s3
+; GISEL64-NEXT: v_mov_b32_e32 v5, s9
+; GISEL64-NEXT: scratch_store_b32 off, v4, s10
+; GISEL64-NEXT: s_clause 0x1
+; GISEL64-NEXT: scratch_store_b128 off, v[0:3], s11
+; GISEL64-NEXT: scratch_store_b32 off, v5, s11
+; GISEL64-NEXT: s_xor_b64 exec, s[34:35], -1
+; GISEL64-NEXT: s_clause 0x5
+; GISEL64-NEXT: scratch_load_b32 v0, off, s32
+; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4
+; GISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8
+; GISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12
+; GISEL64-NEXT: scratch_load_b32 v4, off, s32 offset:16
+; GISEL64-NEXT: scratch_load_b32 v5, off, s32 offset:20
+; GISEL64-NEXT: s_mov_b64 exec, s[34:35]
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ store i32 %i32, ptr addrspace(5) %ptr
+ store <4 x i32> %v4i32, ptr addrspace(5) %ptr2
+ store float %float, ptr addrspace(5) %ptr2
+ ret void
+}
+
+declare amdgpu_gfx <2 x half> @gfx_callee(<2 x half> %x, <2 x half> %y)
+
+define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 x half> %x, <2 x half> %y) {
+; DAGISEL-LABEL: call_gfx_from_whole_wave:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_mov_b32 s0, s33
+; DAGISEL-NEXT: s_mov_b32 s33, s32
+; DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s33 offset:4
+; DAGISEL-NEXT: scratch_store_b32 off, v1, s33 offset:8
+; DAGISEL-NEXT: scratch_store_b32 off, v2, s33 offset:12
+; DAGISEL-NEXT: scratch_store_b32 off, v3, s33 offset:16
+; DAGISEL-NEXT: scratch_store_b32 off, v4, s33 offset:20
+; DAGISEL-NEXT: scratch_store_b32 off, v5, s33 offset:24
+; DAGISEL-NEXT: scratch_store_b32 off, v6, s33 offset:28
+; DAGISEL-NEXT: scratch_store_b32 off, v7, s33 offset:32
+; DAGISEL-NEXT: scratch_store_b32 off, v8, s33 offset:36
+; DAGISEL-NEXT: scratch_store_b32 off, v9, s33 offset:40
+; DAGISEL-NEXT: scratch_store_b32 off, v10, s33 offset:44
+; DAGISEL-NEXT: scratch_store_b32 off, v11, s33 offset:48
+; DAGISEL-NEXT: scratch_store_b32 off, v12, s33 offset:52
+; DAGISEL-NEXT: scratch_store_b32 off, v13, s33 offset:56
+; DAGISEL-NEXT: scratch_store_b32 off, v14, s33 offset:60
+; DAGISEL-NEXT: scratch_store_b32 off, v15, s33 offset:64
+; DAGISEL-NEXT: scratch_store_b32 off, v16, s33 offset:68
+; DAGISEL-NEXT: scratch_store_b32 off, v17, s33 offset:72
+; DAGISEL-NEXT: scratch_store_b32 off, v18, s33 offset:76
+; DAGISEL-NEXT: scratch_store_b32 off, v19, s33 offset:80
+; DAGISEL-NEXT: scratch_store_b32 off, v20, s33 offset:84
+; DAGISEL-NEXT: scratch_store_b32 off, v21, s33 offset:88
+; DAGISEL-NEXT: scratch_store_b32 off, v22, s33 offset:92
+; DAGISEL-NEXT: scratch_store_b32 off, v23, s33 offset:96
+; DAGISEL-NEXT: scratch_store_b32 off, v24, s33 offset:100
+; DAGISEL-NEXT: scratch_store_b32 off, v25, s33 offset:104
+; DAGISEL-NEXT: scratch_store_b32 off, v26, s33 offset:108
+; DAGISEL-NEXT: scratch_store_b32 off, v27, s33 offset:112
+; DAGISEL-NEXT: scratch_store_b32 off, v28, s33 offset:116
+; DAGISEL-NEXT: scratch_store_b32 off, v29, s33 offset:120
+; DAGISEL-NEXT: scratch_store_b32 off, v30, s33 offset:124
+; DAGISEL-NEXT: scratch_store_b32 off, v31, s33 offset:128
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_store_b32 off, v32, s33 offset:132
+; DAGISEL-NEXT: scratch_store_b32 off, v33, s33 offset:136
+; DAGISEL-NEXT: scratch_store_b32 off, v34, s33 offset:140
+; DAGISEL-NEXT: scratch_store_b32 off, v35, s33 offset:144
+; DAGISEL-NEXT: scratch_store_b32 off, v36, s33 offset:148
+; DAGISEL-NEXT: scratch_store_b32 off, v37, s33 offset:152
+; DAGISEL-NEXT: scratch_store_b32 off, v38, s33 offset:156
+; DAGISEL-NEXT: scratch_store_b32 off, v39, s33 offset:160
+; DAGISEL-NEXT: scratch_store_b32 off, v48, s33 offset:164
+; DAGISEL-NEXT: scratch_store_b32 off, v49, s33 offset:168
+; DAGISEL-NEXT: scratch_store_b32 off, v50, s33 offset:172
+; DAGISEL-NEXT: scratch_store_b32 off, v51, s33 offset:176
+; DAGISEL-NEXT: scratch_store_b32 off, v52, s33 offset:180
+; DAGISEL-NEXT: scratch_store_b32 off, v53, s33 offset:184
+; DAGISEL-NEXT: scratch_store_b32 off, v54, s33 offset:188
+; DAGISEL-NEXT: scratch_store_b32 off, v55, s33 offset:192
+; DAGISEL-NEXT: scratch_store_b32 off, v64, s33 offset:196
+; DAGISEL-NEXT: scratch_store_b32 off, v65, s33 offset:200
+; DAGISEL-NEXT: scratch_store_b32 off, v66, s33 offset:204
+; DAGISEL-NEXT: scratch_store_b32 off, v67, s33 offset:208
+; DAGISEL-NEXT: scratch_store_b32 off, v68, s33 offset:212
+; DAGISEL-NEXT: scratch_store_b32 off, v69, s33 offset:216
+; DAGISEL-NEXT: scratch_store_b32 off, v70, s33 offset:220
+; DAGISEL-NEXT: scratch_store_b32 off, v71, s33 offset:224
+; DAGISEL-NEXT: scratch_store_b32 off, v80, s33 offset:228
+; DAGISEL-NEXT: scratch_store_b32 off, v81, s33 offset:232
+; DAGISEL-NEXT: scratch_store_b32 off, v82, s33 offset:236
+; DAGISEL-NEXT: scratch_store_b32 off, v83, s33 offset:240
+; DAGISEL-NEXT: scratch_store_b32 off, v84, s33 offset:244
+; DAGISEL-NEXT: scratch_store_b32 off, v85, s33 offset:248
+; DAGISEL-NEXT: scratch_store_b32 off, v86, s33 offset:252
+; DAGISEL-NEXT: scratch_store_b32 off, v87, s33 offset:256
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_store_b32 off, v96, s33 offset:260
+; DAGISEL-NEXT: scratch_store_b32 off, v97, s33 offset:264
+; DAGISEL-NEXT: scratch_store_b32 off, v98, s33 offset:268
+; DAGISEL-NEXT: scratch_store_b32 off, v99, s33 offset:272
+; DAGISEL-NEXT: scratch_store_b32 off, v100, s33 offset:276
+; DAGISEL-NEXT: scratch_store_b32 off, v101, s33 offset:280
+; DAGISEL-NEXT: scratch_store_b32 off, v102, s33 offset:284
+; DAGISEL-NEXT: scratch_store_b32 off, v103, s33 offset:288
+; DAGISEL-NEXT: scratch_store_b32 off, v112, s33 offset:292
+; DAGISEL-NEXT: scratch_store_b32 off, v113, s33 offset:296
+; DAGISEL-NEXT: scratch_store_b32 off, v114, s33 offset:300
+; DAGISEL-NEXT: scratch_store_b32 off, v115, s33 offset:304
+; DAGISEL-NEXT: scratch_store_b32 off, v116, s33 offset:308
+; DAGISEL-NEXT: scratch_store_b32 off, v117, s33 offset:312
+; DAGISEL-NEXT: scratch_store_b32 off, v118, s33 offset:316
+; DAGISEL-NEXT: scratch_store_b32 off, v119, s33 offset:320
+; DAGISEL-NEXT: scratch_store_b32 off, v128, s33 offset:324
+; DAGISEL-NEXT: scratch_store_b32 off, v129, s33 offset:328
+; DAGISEL-NEXT: scratch_store_b32 off, v130, s33 offset:332
+; DAGISEL-NEXT: scratch_store_b32 off, v131, s33 offset:336
+; DAGISEL-NEXT: scratch_store_b32 off, v132, s33 offset:340
+; DAGISEL-NEXT: scratch_store_b32 off, v133, s33 offset:344
+; DAGISEL-NEXT: scratch_store_b32 off, v134, s33 offset:348
+; DAGISEL-NEXT: scratch_store_b32 off, v135, s33 offset:352
+; DAGISEL-NEXT: scratch_store_b32 off, v144, s33 offset:356
+; DAGISEL-NEXT: scratch_store_b32 off, v145, s33 offset:360
+; DAGISEL-NEXT: scratch_store_b32 off, v146, s33 offset:364
+; DAGISEL-NEXT: scratch_store_b32 off, v147, s33 offset:368
+; DAGISEL-NEXT: scratch_store_b32 off, v148, s33 offset:372
+; DAGISEL-NEXT: scratch_store_b32 off, v149, s33 offset:376
+; DAGISEL-NEXT: scratch_store_b32 off, v150, s33 offset:380
+; DAGISEL-NEXT: scratch_store_b32 off, v151, s33 offset:384
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_store_b32 off, v160, s33 offset:388
+; DAGISEL-NEXT: scratch_store_b32 off, v161, s33 offset:392
+; DAGISEL-NEXT: scratch_store_b32 off, v162, s33 offset:396
+; DAGISEL-NEXT: scratch_store_b32 off, v163, s33 offset:400
+; DAGISEL-NEXT: scratch_store_b32 off, v164, s33 offset:404
+; DAGISEL-NEXT: scratch_store_b32 off, v165, s33 offset:408
+; DAGISEL-NEXT: scratch_store_b32 off, v166, s33 offset:412
+; DAGISEL-NEXT: scratch_store_b32 off, v167, s33 offset:416
+; DAGISEL-NEXT: scratch_store_b32 off, v176, s33 offset:420
+; DAGISEL-NEXT: scratch_store_b32 off, v177, s33 offset:424
+; DAGISEL-NEXT: scratch_store_b32 off, v178, s33 offset:428
+; DAGISEL-NEXT: scratch_store_b32 off, v179, s33 offset:432
+; DAGISEL-NEXT: scratch_store_b32 off, v180, s33 offset:436
+; DAGISEL-NEXT: scratch_store_b32 off, v181, s33 offset:440
+; DAGISEL-NEXT: scratch_store_b32 off, v182, s33 offset:444
+; DAGISEL-NEXT: scratch_store_b32 off, v183, s33 offset:448
+; DAGISEL-NEXT: scratch_store_b32 off, v192, s33 offset:452
+; DAGISEL-NEXT: scratch_store_b32 off, v193, s33 offset:456
+; DAGISEL-NEXT: scratch_store_b32 off, v194, s33 offset:460
+; DAGISEL-NEXT: scratch_store_b32 off, v195, s33 offset:464
+; DAGISEL-NEXT: scratch_store_b32 off, v196, s33 offset:468
+; DAGISEL-NEXT: scratch_store_b32 off, v197, s33 offset:472
+; DAGISEL-NEXT: scratch_store_b32 off, v198, s33 offset:476
+; DAGISEL-NEXT: scratch_store_b32 off, v199, s33 offset:480
+; DAGISEL-NEXT: scratch_store_b32 off, v208, s33 offset:484
+; DAGISEL-NEXT: scratch_store_b32 off, v209, s33 offset:488
+; DAGISEL-NEXT: scratch_store_b32 off, v210, s33 offset:492
+; DAGISEL-NEXT: scratch_store_b32 off, v211, s33 offset:496
+; DAGISEL-NEXT: scratch_store_b32 off, v212, s33 offset:500
+; DAGISEL-NEXT: scratch_store_b32 off, v213, s33 offset:504
+; DAGISEL-NEXT: scratch_store_b32 off, v214, s33 offset:508
+; DAGISEL-NEXT: scratch_store_b32 off, v215, s33 offset:512
+; DAGISEL-NEXT: s_clause 0xf
+; DAGISEL-NEXT: scratch_store_b32 off, v224, s33 offset:516
+; DAGISEL-NEXT: scratch_store_b32 off, v225, s33 offset:520
+; DAGISEL-NEXT: scratch_store_b32 off, v226, s33 offset:524
+; DAGISEL-NEXT: scratch_store_b32 off, v227, s33 offset:528
+; DAGISEL-NEXT: scratch_store_b32 off, v228, s33 offset:532
+; DAGISEL-NEXT: scratch_store_b32 off, v229, s33 offset:536
+; DAGISEL-NEXT: scratch_store_b32 off, v230, s33 offset:540
+; DAGISEL-NEXT: scratch_store_b32 off, v231, s33 offset:544
+; DAGISEL-NEXT: scratch_store_b32 off, v240, s33 offset:548
+; DAGISEL-NEXT: scratch_store_b32 off, v241, s33 offset:552
+; DAGISEL-NEXT: scratch_store_b32 off, v242, s33 offset:556
+; DAGISEL-NEXT: scratch_store_b32 off, v243, s33 offset:560
+; DAGISEL-NEXT: scratch_store_b32 off, v244, s33 offset:564
+; DAGISEL-NEXT: scratch_store_b32 off, v245, s33 offset:568
+; DAGISEL-NEXT: scratch_store_b32 off, v246, s33 offset:572
+; DAGISEL-NEXT: scratch_store_b32 off, v247, s33 offset:576
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: v_writelane_b32 v40, s0, 3
+; DAGISEL-NEXT: v_mov_b32_e32 v2, v0
+; DAGISEL-NEXT: v_swap_b32 v0, v1
+; DAGISEL-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi
+; DAGISEL-NEXT: v_writelane_b32 v40, s4, 0
+; DAGISEL-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo
+; DAGISEL-NEXT: s_addk_co_i32 s32, 0x250
+; DAGISEL-NEXT: v_writelane_b32 v40, s30, 1
+; DAGISEL-NEXT: v_writelane_b32 v40, s31, 2
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT: v_readlane_b32 s31, v40, 2
+; DAGISEL-NEXT: v_readlane_b32 s30, v40, 1
+; DAGISEL-NEXT: v_readlane_b32 s4, v40, 0
+; DAGISEL-NEXT: v_readlane_b32 s0, v40, 3
+; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; DAGISEL-NEXT: s_mov_b32 s32, s33
+; DAGISEL-NEXT: s_xor_b32 exec_lo, s4, -1
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s33 offset:4
+; DAGISEL-NEXT: scratch_load_b32 v1, off, s33 offset:8
+; DAGISEL-NEXT: scratch_load_b32 v2, off, s33 offset:12
+; DAGISEL-NEXT: scratch_load_b32 v3, off, s33 offset:16
+; DAGISEL-NEXT: scratch_load_b32 v4, off, s33 offset:20
+; DAGISEL-NEXT: scratch_load_b32 v5, off, s33 offset:24
+; DAGISEL-NEXT: scratch_load_b32 v6, off, s33 offset:28
+; DAGISEL-NEXT: scratch_load_b32 v7, off, s33 offset:32
+; DAGISEL-NEXT: scratch_load_b32 v8, off, s33 offset:36
+; DAGISEL-NEXT: scratch_load_b32 v9, off, s33 offset:40
+; DAGISEL-NEXT: scratch_load_b32 v10, off, s33 offset:44
+; DAGISEL-NEXT: scratch_load_b32 v11, off, s33 offset:48
+; DAGISEL-NEXT: scratch_load_b32 v12, off, s33 offset:52
+; DAGISEL-NEXT: scratch_load_b32 v13, off, s33 offset:56
+; DAGISEL-NEXT: scratch_load_b32 v14, off, s33 offset:60
+; DAGISEL-NEXT: scratch_load_b32 v15, off, s33 offset:64
+; DAGISEL-NEXT: scratch_load_b32 v16, off, s33 offset:68
+; DAGISEL-NEXT: scratch_load_b32 v17, off, s33 offset:72
+; DAGISEL-NEXT: scratch_load_b32 v18, off, s33 offset:76
+; DAGISEL-NEXT: scratch_load_b32 v19, off, s33 offset:80
+; DAGISEL-NEXT: scratch_load_b32 v20, off, s33 offset:84
+; DAGISEL-NEXT: scratch_load_b32 v21, off, s33 offset:88
+; DAGISEL-NEXT: scratch_load_b32 v22, off, s33 offset:92
+; DAGISEL-NEXT: scratch_load_b32 v23, off, s33 offset:96
+; DAGISEL-NEXT: scratch_load_b32 v24, off, s33 offset:100
+; DAGISEL-NEXT: scratch_load_b32 v25, off, s33 offset:104
+; DAGISEL-NEXT: scratch_load_b32 v26, off, s33 offset:108
+; DAGISEL-NEXT: scratch_load_b32 v27, off, s33 offset:112
+; DAGISEL-NEXT: scratch_load_b32 v28, off, s33 offset:116
+; DAGISEL-NEXT: scratch_load_b32 v29, off, s33 offset:120
+; DAGISEL-NEXT: scratch_load_b32 v30, off, s33 offset:124
+; DAGISEL-NEXT: scratch_load_b32 v31, off, s33 offset:128
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_load_b32 v32, off, s33 offset:132
+; DAGISEL-NEXT: scratch_load_b32 v33, off, s33 offset:136
+; DAGISEL-NEXT: scratch_load_b32 v34, off, s33 offset:140
+; DAGISEL-NEXT: scratch_load_b32 v35, off, s33 offset:144
+; DAGISEL-NEXT: scratch_load_b32 v36, off, s33 offset:148
+; DAGISEL-NEXT: scratch_load_b32 v37, off, s33 offset:152
+; DAGISEL-NEXT: scratch_load_b32 v38, off, s33 offset:156
+; DAGISEL-NEXT: scratch_load_b32 v39, off, s33 offset:160
+; DAGISEL-NEXT: scratch_load_b32 v48, off, s33 offset:164
+; DAGISEL-NEXT: scratch_load_b32 v49, off, s33 offset:168
+; DAGISEL-NEXT: scratch_load_b32 v50, off, s33 offset:172
+; DAGISEL-NEXT: scratch_load_b32 v51, off, s33 offset:176
+; DAGISEL-NEXT: scratch_load_b32 v52, off, s33 offset:180
+; DAGISEL-NEXT: scratch_load_b32 v53, off, s33 offset:184
+; DAGISEL-NEXT: scratch_load_b32 v54, off, s33 offset:188
+; DAGISEL-NEXT: scratch_load_b32 v55, off, s33 offset:192
+; DAGISEL-NEXT: scratch_load_b32 v64, off, s33 offset:196
+; DAGISEL-NEXT: scratch_load_b32 v65, off, s33 offset:200
+; DAGISEL-NEXT: scratch_load_b32 v66, off, s33 offset:204
+; DAGISEL-NEXT: scratch_load_b32 v67, off, s33 offset:208
+; DAGISEL-NEXT: scratch_load_b32 v68, off, s33 offset:212
+; DAGISEL-NEXT: scratch_load_b32 v69, off, s33 offset:216
+; DAGISEL-NEXT: scratch_load_b32 v70, off, s33 offset:220
+; DAGISEL-NEXT: scratch_load_b32 v71, off, s33 offset:224
+; DAGISEL-NEXT: scratch_load_b32 v80, off, s33 offset:228
+; DAGISEL-NEXT: scratch_load_b32 v81, off, s33 offset:232
+; DAGISEL-NEXT: scratch_load_b32 v82, off, s33 offset:236
+; DAGISEL-NEXT: scratch_load_b32 v83, off, s33 offset:240
+; DAGISEL-NEXT: scratch_load_b32 v84, off, s33 offset:244
+; DAGISEL-NEXT: scratch_load_b32 v85, off, s33 offset:248
+; DAGISEL-NEXT: scratch_load_b32 v86, off, s33 offset:252
+; DAGISEL-NEXT: scratch_load_b32 v87, off, s33 offset:256
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_load_b32 v96, off, s33 offset:260
+; DAGISEL-NEXT: scratch_load_b32 v97, off, s33 offset:264
+; DAGISEL-NEXT: scratch_load_b32 v98, off, s33 offset:268
+; DAGISEL-NEXT: scratch_load_b32 v99, off, s33 offset:272
+; DAGISEL-NEXT: scratch_load_b32 v100, off, s33 offset:276
+; DAGISEL-NEXT: scratch_load_b32 v101, off, s33 offset:280
+; DAGISEL-NEXT: scratch_load_b32 v102, off, s33 offset:284
+; DAGISEL-NEXT: scratch_load_b32 v103, off, s33 offset:288
+; DAGISEL-NEXT: scratch_load_b32 v112, off, s33 offset:292
+; DAGISEL-NEXT: scratch_load_b32 v113, off, s33 offset:296
+; DAGISEL-NEXT: scratch_load_b32 v114, off, s33 offset:300
+; DAGISEL-NEXT: scratch_load_b32 v115, off, s33 offset:304
+; DAGISEL-NEXT: scratch_load_b32 v116, off, s33 offset:308
+; DAGISEL-NEXT: scratch_load_b32 v117, off, s33 offset:312
+; DAGISEL-NEXT: scratch_load_b32 v118, off, s33 offset:316
+; DAGISEL-NEXT: scratch_load_b32 v119, off, s33 offset:320
+; DAGISEL-NEXT: scratch_load_b32 v128, off, s33 offset:324
+; DAGISEL-NEXT: scratch_load_b32 v129, off, s33 offset:328
+; DAGISEL-NEXT: scratch_load_b32 v130, off, s33 offset:332
+; DAGISEL-NEXT: scratch_load_b32 v131, off, s33 offset:336
+; DAGISEL-NEXT: scratch_load_b32 v132, off, s33 offset:340
+; DAGISEL-NEXT: scratch_load_b32 v133, off, s33 offset:344
+; DAGISEL-NEXT: scratch_load_b32 v134, off, s33 offset:348
+; DAGISEL-NEXT: scratch_load_b32 v135, off, s33 offset:352
+; DAGISEL-NEXT: scratch_load_b32 v144, off, s33 offset:356
+; DAGISEL-NEXT: scratch_load_b32 v145, off, s33 offset:360
+; DAGISEL-NEXT: scratch_load_b32 v146, off, s33 offset:364
+; DAGISEL-NEXT: scratch_load_b32 v147, off, s33 offset:368
+; DAGISEL-NEXT: scratch_load_b32 v148, off, s33 offset:372
+; DAGISEL-NEXT: scratch_load_b32 v149, off, s33 offset:376
+; DAGISEL-NEXT: scratch_load_b32 v150, off, s33 offset:380
+; DAGISEL-NEXT: scratch_load_b32 v151, off, s33 offset:384
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_load_b32 v160, off, s33 offset:388
+; DAGISEL-NEXT: scratch_load_b32 v161, off, s33 offset:392
+; DAGISEL-NEXT: scratch_load_b32 v162, off, s33 offset:396
+; DAGISEL-NEXT: scratch_load_b32 v163, off, s33 offset:400
+; DAGISEL-NEXT: scratch_load_b32 v164, off, s33 offset:404
+; DAGISEL-NEXT: scratch_load_b32 v165, off, s33 offset:408
+; DAGISEL-NEXT: scratch_load_b32 v166, off, s33 offset:412
+; DAGISEL-NEXT: scratch_load_b32 v167, off, s33 offset:416
+; DAGISEL-NEXT: scratch_load_b32 v176, off, s33 offset:420
+; DAGISEL-NEXT: scratch_load_b32 v177, off, s33 offset:424
+; DAGISEL-NEXT: scratch_load_b32 v178, off, s33 offset:428
+; DAGISEL-NEXT: scratch_load_b32 v179, off, s33 offset:432
+; DAGISEL-NEXT: scratch_load_b32 v180, off, s33 offset:436
+; DAGISEL-NEXT: scratch_load_b32 v181, off, s33 offset:440
+; DAGISEL-NEXT: scratch_load_b32 v182, off, s33 offset:444
+; DAGISEL-NEXT: scratch_load_b32 v183, off, s33 offset:448
+; DAGISEL-NEXT: scratch_load_b32 v192, off, s33 offset:452
+; DAGISEL-NEXT: scratch_load_b32 v193, off, s33 offset:456
+; DAGISEL-NEXT: scratch_load_b32 v194, off, s33 offset:460
+; DAGISEL-NEXT: scratch_load_b32 v195, off, s33 offset:464
+; DAGISEL-NEXT: scratch_load_b32 v196, off, s33 offset:468
+; DAGISEL-NEXT: scratch_load_b32 v197, off, s33 offset:472
+; DAGISEL-NEXT: scratch_load_b32 v198, off, s33 offset:476
+; DAGISEL-NEXT: scratch_load_b32 v199, off, s33 offset:480
+; DAGISEL-NEXT: scratch_load_b32 v208, off, s33 offset:484
+; DAGISEL-NEXT: scratch_load_b32 v209, off, s33 offset:488
+; DAGISEL-NEXT: scratch_load_b32 v210, off, s33 offset:492
+; DAGISEL-NEXT: scratch_load_b32 v211, off, s33 offset:496
+; DAGISEL-NEXT: scratch_load_b32 v212, off, s33 offset:500
+; DAGISEL-NEXT: scratch_load_b32 v213, off, s33 offset:504
+; DAGISEL-NEXT: scratch_load_b32 v214, off, s33 offset:508
+; DAGISEL-NEXT: scratch_load_b32 v215, off, s33 offset:512
+; DAGISEL-NEXT: s_clause 0xf
+; DAGISEL-NEXT: scratch_load_b32 v224, off, s33 offset:516
+; DAGISEL-NEXT: scratch_load_b32 v225, off, s33 offset:520
+; DAGISEL-NEXT: scratch_load_b32 v226, off, s33 offset:524
+; DAGISEL-NEXT: scratch_load_b32 v227, off, s33 offset:528
+; DAGISEL-NEXT: scratch_load_b32 v228, off, s33 offset:532
+; DAGISEL-NEXT: scratch_load_b32 v229, off, s33 offset:536
+; DAGISEL-NEXT: scratch_load_b32 v230, off, s33 offset:540
+; DAGISEL-NEXT: scratch_load_b32 v231, off, s33 offset:544
+; DAGISEL-NEXT: scratch_load_b32 v240, off, s33 offset:548
+; DAGISEL-NEXT: scratch_load_b32 v241, off, s33 offset:552
+; DAGISEL-NEXT: scratch_load_b32 v242, off, s33 offset:556
+; DAGISEL-NEXT: scratch_load_b32 v243, off, s33 offset:560
+; DAGISEL-NEXT: scratch_load_b32 v244, off, s33 offset:564
+; DAGISEL-NEXT: scratch_load_b32 v245, off, s33 offset:568
+; DAGISEL-NEXT: scratch_load_b32 v246, off, s33 offset:572
+; DAGISEL-NEXT: scratch_load_b32 v247, off, s33 offset:576
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s4
+; DAGISEL-NEXT: s_mov_b32 s33, s0
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: call_gfx_from_whole_wave:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_mov_b32 s0, s33
+; GISEL-NEXT: s_mov_b32 s33, s32
+; GISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_store_b32 off, v0, s33 offset:4
+; GISEL-NEXT: scratch_store_b32 off, v1, s33 offset:8
+; GISEL-NEXT: scratch_store_b32 off, v2, s33 offset:12
+; GISEL-NEXT: scratch_store_b32 off, v3, s33 offset:16
+; GISEL-NEXT: scratch_store_b32 off, v4, s33 offset:20
+; GISEL-NEXT: scratch_store_b32 off, v5, s33 offset:24
+; GISEL-NEXT: scratch_store_b32 off, v6, s33 offset:28
+; GISEL-NEXT: scratch_store_b32 off, v7, s33 offset:32
+; GISEL-NEXT: scratch_store_b32 off, v8, s33 offset:36
+; GISEL-NEXT: scratch_store_b32 off, v9, s33 offset:40
+; GISEL-NEXT: scratch_store_b32 off, v10, s33 offset:44
+; GISEL-NEXT: scratch_store_b32 off, v11, s33 offset:48
+; GISEL-NEXT: scratch_store_b32 off, v12, s33 offset:52
+; GISEL-NEXT: scratch_store_b32 off, v13, s33 offset:56
+; GISEL-NEXT: scratch_store_b32 off, v14, s33 offset:60
+; GISEL-NEXT: scratch_store_b32 off, v15, s33 offset:64
+; GISEL-NEXT: scratch_store_b32 off, v16, s33 offset:68
+; GISEL-NEXT: scratch_store_b32 off, v17, s33 offset:72
+; GISEL-NEXT: scratch_store_b32 off, v18, s33 offset:76
+; GISEL-NEXT: scratch_store_b32 off, v19, s33 offset:80
+; GISEL-NEXT: scratch_store_b32 off, v20, s33 offset:84
+; GISEL-NEXT: scratch_store_b32 off, v21, s33 offset:88
+; GISEL-NEXT: scratch_store_b32 off, v22, s33 offset:92
+; GISEL-NEXT: scratch_store_b32 off, v23, s33 offset:96
+; GISEL-NEXT: scratch_store_b32 off, v24, s33 offset:100
+; GISEL-NEXT: scratch_store_b32 off, v25, s33 offset:104
+; GISEL-NEXT: scratch_store_b32 off, v26, s33 offset:108
+; GISEL-NEXT: scratch_store_b32 off, v27, s33 offset:112
+; GISEL-NEXT: scratch_store_b32 off, v28, s33 offset:116
+; GISEL-NEXT: scratch_store_b32 off, v29, s33 offset:120
+; GISEL-NEXT: scratch_store_b32 off, v30, s33 offset:124
+; GISEL-NEXT: scratch_store_b32 off, v31, s33 offset:128
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_store_b32 off, v32, s33 offset:132
+; GISEL-NEXT: scratch_store_b32 off, v33, s33 offset:136
+; GISEL-NEXT: scratch_store_b32 off, v34, s33 offset:140
+; GISEL-NEXT: scratch_store_b32 off, v35, s33 offset:144
+; GISEL-NEXT: scratch_store_b32 off, v36, s33 offset:148
+; GISEL-NEXT: scratch_store_b32 off, v37, s33 offset:152
+; GISEL-NEXT: scratch_store_b32 off, v38, s33 offset:156
+; GISEL-NEXT: scratch_store_b32 off, v39, s33 offset:160
+; GISEL-NEXT: scratch_store_b32 off, v48, s33 offset:164
+; GISEL-NEXT: scratch_store_b32 off, v49, s33 offset:168
+; GISEL-NEXT: scratch_store_b32 off, v50, s33 offset:172
+; GISEL-NEXT: scratch_store_b32 off, v51, s33 offset:176
+; GISEL-NEXT: scratch_store_b32 off, v52, s33 offset:180
+; GISEL-NEXT: scratch_store_b32 off, v53, s33 offset:184
+; GISEL-NEXT: scratch_store_b32 off, v54, s33 offset:188
+; GISEL-NEXT: scratch_store_b32 off, v55, s33 offset:192
+; GISEL-NEXT: scratch_store_b32 off, v64, s33 offset:196
+; GISEL-NEXT: scratch_store_b32 off, v65, s33 offset:200
+; GISEL-NEXT: scratch_store_b32 off, v66, s33 offset:204
+; GISEL-NEXT: scratch_store_b32 off, v67, s33 offset:208
+; GISEL-NEXT: scratch_store_b32 off, v68, s33 offset:212
+; GISEL-NEXT: scratch_store_b32 off, v69, s33 offset:216
+; GISEL-NEXT: scratch_store_b32 off, v70, s33 offset:220
+; GISEL-NEXT: scratch_store_b32 off, v71, s33 offset:224
+; GISEL-NEXT: scratch_store_b32 off, v80, s33 offset:228
+; GISEL-NEXT: scratch_store_b32 off, v81, s33 offset:232
+; GISEL-NEXT: scratch_store_b32 off, v82, s33 offset:236
+; GISEL-NEXT: scratch_store_b32 off, v83, s33 offset:240
+; GISEL-NEXT: scratch_store_b32 off, v84, s33 offset:244
+; GISEL-NEXT: scratch_store_b32 off, v85, s33 offset:248
+; GISEL-NEXT: scratch_store_b32 off, v86, s33 offset:252
+; GISEL-NEXT: scratch_store_b32 off, v87, s33 offset:256
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_store_b32 off, v96, s33 offset:260
+; GISEL-NEXT: scratch_store_b32 off, v97, s33 offset:264
+; GISEL-NEXT: scratch_store_b32 off, v98, s33 offset:268
+; GISEL-NEXT: scratch_store_b32 off, v99, s33 offset:272
+; GISEL-NEXT: scratch_store_b32 off, v100, s33 offset:276
+; GISEL-NEXT: scratch_store_b32 off, v101, s33 offset:280
+; GISEL-NEXT: scratch_store_b32 off, v102, s33 offset:284
+; GISEL-NEXT: scratch_store_b32 off, v103, s33 offset:288
+; GISEL-NEXT: scratch_store_b32 off, v112, s33 offset:292
+; GISEL-NEXT: scratch_store_b32 off, v113, s33 offset:296
+; GISEL-NEXT: scratch_store_b32 off, v114, s33 offset:300
+; GISEL-NEXT: scratch_store_b32 off, v115, s33 offset:304
+; GISEL-NEXT: scratch_store_b32 off, v116, s33 offset:308
+; GISEL-NEXT: scratch_store_b32 off, v117, s33 offset:312
+; GISEL-NEXT: scratch_store_b32 off, v118, s33 offset:316
+; GISEL-NEXT: scratch_store_b32 off, v119, s33 offset:320
+; GISEL-NEXT: scratch_store_b32 off, v128, s33 offset:324
+; GISEL-NEXT: scratch_store_b32 off, v129, s33 offset:328
+; GISEL-NEXT: scratch_store_b32 off, v130, s33 offset:332
+; GISEL-NEXT: scratch_store_b32 off, v131, s33 offset:336
+; GISEL-NEXT: scratch_store_b32 off, v132, s33 offset:340
+; GISEL-NEXT: scratch_store_b32 off, v133, s33 offset:344
+; GISEL-NEXT: scratch_store_b32 off, v134, s33 offset:348
+; GISEL-NEXT: scratch_store_b32 off, v135, s33 offset:352
+; GISEL-NEXT: scratch_store_b32 off, v144, s33 offset:356
+; GISEL-NEXT: scratch_store_b32 off, v145, s33 offset:360
+; GISEL-NEXT: scratch_store_b32 off, v146, s33 offset:364
+; GISEL-NEXT: scratch_store_b32 off, v147, s33 offset:368
+; GISEL-NEXT: scratch_store_b32 off, v148, s33 offset:372
+; GISEL-NEXT: scratch_store_b32 off, v149, s33 offset:376
+; GISEL-NEXT: scratch_store_b32 off, v150, s33 offset:380
+; GISEL-NEXT: scratch_store_b32 off, v151, s33 offset:384
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_store_b32 off, v160, s33 offset:388
+; GISEL-NEXT: scratch_store_b32 off, v161, s33 offset:392
+; GISEL-NEXT: scratch_store_b32 off, v162, s33 offset:396
+; GISEL-NEXT: scratch_store_b32 off, v163, s33 offset:400
+; GISEL-NEXT: scratch_store_b32 off, v164, s33 offset:404
+; GISEL-NEXT: scratch_store_b32 off, v165, s33 offset:408
+; GISEL-NEXT: scratch_store_b32 off, v166, s33 offset:412
+; GISEL-NEXT: scratch_store_b32 off, v167, s33 offset:416
+; GISEL-NEXT: scratch_store_b32 off, v176, s33 offset:420
+; GISEL-NEXT: scratch_store_b32 off, v177, s33 offset:424
+; GISEL-NEXT: scratch_store_b32 off, v178, s33 offset:428
+; GISEL-NEXT: scratch_store_b32 off, v179, s33 offset:432
+; GISEL-NEXT: scratch_store_b32 off, v180, s33 offset:436
+; GISEL-NEXT: scratch_store_b32 off, v181, s33 offset:440
+; GISEL-NEXT: scratch_store_b32 off, v182, s33 offset:444
+; GISEL-NEXT: scratch_store_b32 off, v183, s33 offset:448
+; GISEL-NEXT: scratch_store_b32 off, v192, s33 offset:452
+; GISEL-NEXT: scratch_store_b32 off, v193, s33 offset:456
+; GISEL-NEXT: scratch_store_b32 off, v194, s33 offset:460
+; GISEL-NEXT: scratch_store_b32 off, v195, s33 offset:464
+; GISEL-NEXT: scratch_store_b32 off, v196, s33 offset:468
+; GISEL-NEXT: scratch_store_b32 off, v197, s33 offset:472
+; GISEL-NEXT: scratch_store_b32 off, v198, s33 offset:476
+; GISEL-NEXT: scratch_store_b32 off, v199, s33 offset:480
+; GISEL-NEXT: scratch_store_b32 off, v208, s33 offset:484
+; GISEL-NEXT: scratch_store_b32 off, v209, s33 offset:488
+; GISEL-NEXT: scratch_store_b32 off, v210, s33 offset:492
+; GISEL-NEXT: scratch_store_b32 off, v211, s33 offset:496
+; GISEL-NEXT: scratch_store_b32 off, v212, s33 offset:500
+; GISEL-NEXT: scratch_store_b32 off, v213, s33 offset:504
+; GISEL-NEXT: scratch_store_b32 off, v214, s33 offset:508
+; GISEL-NEXT: scratch_store_b32 off, v215, s33 offset:512
+; GISEL-NEXT: s_clause 0xf
+; GISEL-NEXT: scratch_store_b32 off, v224, s33 offset:516
+; GISEL-NEXT: scratch_store_b32 off, v225, s33 offset:520
+; GISEL-NEXT: scratch_store_b32 off, v226, s33 offset:524
+; GISEL-NEXT: scratch_store_b32 off, v227, s33 offset:528
+; GISEL-NEXT: scratch_store_b32 off, v228, s33 offset:532
+; GISEL-NEXT: scratch_store_b32 off, v229, s33 offset:536
+; GISEL-NEXT: scratch_store_b32 off, v230, s33 offset:540
+; GISEL-NEXT: scratch_store_b32 off, v231, s33 offset:544
+; GISEL-NEXT: scratch_store_b32 off, v240, s33 offset:548
+; GISEL-NEXT: scratch_store_b32 off, v241, s33 offset:552
+; GISEL-NEXT: scratch_store_b32 off, v242, s33 offset:556
+; GISEL-NEXT: scratch_store_b32 off, v243, s33 offset:560
+; GISEL-NEXT: scratch_store_b32 off, v244, s33 offset:564
+; GISEL-NEXT: scratch_store_b32 off, v245, s33 offset:568
+; GISEL-NEXT: scratch_store_b32 off, v246, s33 offset:572
+; GISEL-NEXT: scratch_store_b32 off, v247, s33 offset:576
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: v_writelane_b32 v40, s0, 3
+; GISEL-NEXT: v_mov_b32_e32 v2, v0
+; GISEL-NEXT: v_swap_b32 v0, v1
+; GISEL-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo
+; GISEL-NEXT: v_writelane_b32 v40, s4, 0
+; GISEL-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi
+; GISEL-NEXT: s_addk_co_i32 s32, 0x250
+; GISEL-NEXT: v_writelane_b32 v40, s30, 1
+; GISEL-NEXT: v_writelane_b32 v40, s31, 2
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_readlane_b32 s31, v40, 2
+; GISEL-NEXT: v_readlane_b32 s30, v40, 1
+; GISEL-NEXT: v_readlane_b32 s4, v40, 0
+; GISEL-NEXT: v_readlane_b32 s0, v40, 3
+; GISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GISEL-NEXT: s_mov_b32 s32, s33
+; GISEL-NEXT: s_xor_b32 exec_lo, s4, -1
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_load_b32 v0, off, s33 offset:4
+; GISEL-NEXT: scratch_load_b32 v1, off, s33 offset:8
+; GISEL-NEXT: scratch_load_b32 v2, off, s33 offset:12
+; GISEL-NEXT: scratch_load_b32 v3, off, s33 offset:16
+; GISEL-NEXT: scratch_load_b32 v4, off, s33 offset:20
+; GISEL-NEXT: scratch_load_b32 v5, off, s33 offset:24
+; GISEL-NEXT: scratch_load_b32 v6, off, s33 offset:28
+; GISEL-NEXT: scratch_load_b32 v7, off, s33 offset:32
+; GISEL-NEXT: scratch_load_b32 v8, off, s33 offset:36
+; GISEL-NEXT: scratch_load_b32 v9, off, s33 offset:40
+; GISEL-NEXT: scratch_load_b32 v10, off, s33 offset:44
+; GISEL-NEXT: scratch_load_b32 v11, off, s33 offset:48
+; GISEL-NEXT: scratch_load_b32 v12, off, s33 offset:52
+; GISEL-NEXT: scratch_load_b32 v13, off, s33 offset:56
+; GISEL-NEXT: scratch_load_b32 v14, off, s33 offset:60
+; GISEL-NEXT: scratch_load_b32 v15, off, s33 offset:64
+; GISEL-NEXT: scratch_load_b32 v16, off, s33 offset:68
+; GISEL-NEXT: scratch_load_b32 v17, off, s33 offset:72
+; GISEL-NEXT: scratch_load_b32 v18, off, s33 offset:76
+; GISEL-NEXT: scratch_load_b32 v19, off, s33 offset:80
+; GISEL-NEXT: scratch_load_b32 v20, off, s33 offset:84
+; GISEL-NEXT: scratch_load_b32 v21, off, s33 offset:88
+; GISEL-NEXT: scratch_load_b32 v22, off, s33 offset:92
+; GISEL-NEXT: scratch_load_b32 v23, off, s33 offset:96
+; GISEL-NEXT: scratch_load_b32 v24, off, s33 offset:100
+; GISEL-NEXT: scratch_load_b32 v25, off, s33 offset:104
+; GISEL-NEXT: scratch_load_b32 v26, off, s33 offset:108
+; GISEL-NEXT: scratch_load_b32 v27, off, s33 offset:112
+; GISEL-NEXT: scratch_load_b32 v28, off, s33 offset:116
+; GISEL-NEXT: scratch_load_b32 v29, off, s33 offset:120
+; GISEL-NEXT: scratch_load_b32 v30, off, s33 offset:124
+; GISEL-NEXT: scratch_load_b32 v31, off, s33 offset:128
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_load_b32 v32, off, s33 offset:132
+; GISEL-NEXT: scratch_load_b32 v33, off, s33 offset:136
+; GISEL-NEXT: scratch_load_b32 v34, off, s33 offset:140
+; GISEL-NEXT: scratch_load_b32 v35, off, s33 offset:144
+; GISEL-NEXT: scratch_load_b32 v36, off, s33 offset:148
+; GISEL-NEXT: scratch_load_b32 v37, off, s33 offset:152
+; GISEL-NEXT: scratch_load_b32 v38, off, s33 offset:156
+; GISEL-NEXT: scratch_load_b32 v39, off, s33 offset:160
+; GISEL-NEXT: scratch_load_b32 v48, off, s33 offset:164
+; GISEL-NEXT: scratch_load_b32 v49, off, s33 offset:168
+; GISEL-NEXT: scratch_load_b32 v50, off, s33 offset:172
+; GISEL-NEXT: scratch_load_b32 v51, off, s33 offset:176
+; GISEL-NEXT: scratch_load_b32 v52, off, s33 offset:180
+; GISEL-NEXT: scratch_load_b32 v53, off, s33 offset:184
+; GISEL-NEXT: scratch_load_b32 v54, off, s33 offset:188
+; GISEL-NEXT: scratch_load_b32 v55, off, s33 offset:192
+; GISEL-NEXT: scratch_load_b32 v64, off, s33 offset:196
+; GISEL-NEXT: scratch_load_b32 v65, off, s33 offset:200
+; GISEL-NEXT: scratch_load_b32 v66, off, s33 offset:204
+; GISEL-NEXT: scratch_load_b32 v67, off, s33 offset:208
+; GISEL-NEXT: scratch_load_b32 v68, off, s33 offset:212
+; GISEL-NEXT: scratch_load_b32 v69, off, s33 offset:216
+; GISEL-NEXT: scratch_load_b32 v70, off, s33 offset:220
+; GISEL-NEXT: scratch_load_b32 v71, off, s33 offset:224
+; GISEL-NEXT: scratch_load_b32 v80, off, s33 offset:228
+; GISEL-NEXT: scratch_load_b32 v81, off, s33 offset:232
+; GISEL-NEXT: scratch_load_b32 v82, off, s33 offset:236
+; GISEL-NEXT: scratch_load_b32 v83, off, s33 offset:240
+; GISEL-NEXT: scratch_load_b32 v84, off, s33 offset:244
+; GISEL-NEXT: scratch_load_b32 v85, off, s33 offset:248
+; GISEL-NEXT: scratch_load_b32 v86, off, s33 offset:252
+; GISEL-NEXT: scratch_load_b32 v87, off, s33 offset:256
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_load_b32 v96, off, s33 offset:260
+; GISEL-NEXT: scratch_load_b32 v97, off, s33 offset:264
+; GISEL-NEXT: scratch_load_b32 v98, off, s33 offset:268
+; GISEL-NEXT: scratch_load_b32 v99, off, s33 offset:272
+; GISEL-NEXT: scratch_load_b32 v100, off, s33 offset:276
+; GISEL-NEXT: scratch_load_b32 v101, off, s33 offset:280
+; GISEL-NEXT: scratch_load_b32 v102, off, s33 offset:284
+; GISEL-NEXT: scratch_load_b32 v103, off, s33 offset:288
+; GISEL-NEXT: scratch_load_b32 v112, off, s33 offset:292
+; GISEL-NEXT: scratch_load_b32 v113, off, s33 offset:296
+; GISEL-NEXT: scratch_load_b32 v114, off, s33 offset:300
+; GISEL-NEXT: scratch_load_b32 v115, off, s33 offset:304
+; GISEL-NEXT: scratch_load_b32 v116, off, s33 offset:308
+; GISEL-NEXT: scratch_load_b32 v117, off, s33 offset:312
+; GISEL-NEXT: scratch_load_b32 v118, off, s33 offset:316
+; GISEL-NEXT: scratch_load_b32 v119, off, s33 offset:320
+; GISEL-NEXT: scratch_load_b32 v128, off, s33 offset:324
+; GISEL-NEXT: scratch_load_b32 v129, off, s33 offset:328
+; GISEL-NEXT: scratch_load_b32 v130, off, s33 offset:332
+; GISEL-NEXT: scratch_load_b32 v131, off, s33 offset:336
+; GISEL-NEXT: scratch_load_b32 v132, off, s33 offset:340
+; GISEL-NEXT: scratch_load_b32 v133, off, s33 offset:344
+; GISEL-NEXT: scratch_load_b32 v134, off, s33 offset:348
+; GISEL-NEXT: scratch_load_b32 v135, off, s33 offset:352
+; GISEL-NEXT: scratch_load_b32 v144, off, s33 offset:356
+; GISEL-NEXT: scratch_load_b32 v145, off, s33 offset:360
+; GISEL-NEXT: scratch_load_b32 v146, off, s33 offset:364
+; GISEL-NEXT: scratch_load_b32 v147, off, s33 offset:368
+; GISEL-NEXT: scratch_load_b32 v148, off, s33 offset:372
+; GISEL-NEXT: scratch_load_b32 v149, off, s33 offset:376
+; GISEL-NEXT: scratch_load_b32 v150, off, s33 offset:380
+; GISEL-NEXT: scratch_load_b32 v151, off, s33 offset:384
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_load_b32 v160, off, s33 offset:388
+; GISEL-NEXT: scratch_load_b32 v161, off, s33 offset:392
+; GISEL-NEXT: scratch_load_b32 v162, off, s33 offset:396
+; GISEL-NEXT: scratch_load_b32 v163, off, s33 offset:400
+; GISEL-NEXT: scratch_load_b32 v164, off, s33 offset:404
+; GISEL-NEXT: scratch_load_b32 v165, off, s33 offset:408
+; GISEL-NEXT: scratch_load_b32 v166, off, s33 offset:412
+; GISEL-NEXT: scratch_load_b32 v167, off, s33 offset:416
+; GISEL-NEXT: scratch_load_b32 v176, off, s33 offset:420
+; GISEL-NEXT: scratch_load_b32 v177, off, s33 offset:424
+; GISEL-NEXT: scratch_load_b32 v178, off, s33 offset:428
+; GISEL-NEXT: scratch_load_b32 v179, off, s33 offset:432
+; GISEL-NEXT: scratch_load_b32 v180, off, s33 offset:436
+; GISEL-NEXT: scratch_load_b32 v181, off, s33 offset:440
+; GISEL-NEXT: scratch_load_b32 v182, off, s33 offset:444
+; GISEL-NEXT: scratch_load_b32 v183, off, s33 offset:448
+; GISEL-NEXT: scratch_load_b32 v192, off, s33 offset:452
+; GISEL-NEXT: scratch_load_b32 v193, off, s33 offset:456
+; GISEL-NEXT: scratch_load_b32 v194, off, s33 offset:460
+; GISEL-NEXT: scratch_load_b32 v195, off, s33 offset:464
+; GISEL-NEXT: scratch_load_b32 v196, off, s33 offset:468
+; GISEL-NEXT: scratch_load_b32 v197, off, s33 offset:472
+; GISEL-NEXT: scratch_load_b32 v198, off, s33 offset:476
+; GISEL-NEXT: scratch_load_b32 v199, off, s33 offset:480
+; GISEL-NEXT: scratch_load_b32 v208, off, s33 offset:484
+; GISEL-NEXT: scratch_load_b32 v209, off, s33 offset:488
+; GISEL-NEXT: scratch_load_b32 v210, off, s33 offset:492
+; GISEL-NEXT: scratch_load_b32 v211, off, s33 offset:496
+; GISEL-NEXT: scratch_load_b32 v212, off, s33 offset:500
+; GISEL-NEXT: scratch_load_b32 v213, off, s33 offset:504
+; GISEL-NEXT: scratch_load_b32 v214, off, s33 offset:508
+; GISEL-NEXT: scratch_load_b32 v215, off, s33 offset:512
+; GISEL-NEXT: s_clause 0xf
+; GISEL-NEXT: scratch_load_b32 v224, off, s33 offset:516
+; GISEL-NEXT: scratch_load_b32 v225, off, s33 offset:520
+; GISEL-NEXT: scratch_load_b32 v226, off, s33 offset:524
+; GISEL-NEXT: scratch_load_b32 v227, off, s33 offset:528
+; GISEL-NEXT: scratch_load_b32 v228, off, s33 offset:532
+; GISEL-NEXT: scratch_load_b32 v229, off, s33 offset:536
+; GISEL-NEXT: scratch_load_b32 v230, off, s33 offset:540
+; GISEL-NEXT: scratch_load_b32 v231, off, s33 offset:544
+; GISEL-NEXT: scratch_load_b32 v240, off, s33 offset:548
+; GISEL-NEXT: scratch_load_b32 v241, off, s33 offset:552
+; GISEL-NEXT: scratch_load_b32 v242, off, s33 offset:556
+; GISEL-NEXT: scratch_load_b32 v243, off, s33 offset:560
+; GISEL-NEXT: scratch_load_b32 v244, off, s33 offset:564
+; GISEL-NEXT: scratch_load_b32 v245, off, s33 offset:568
+; GISEL-NEXT: scratch_load_b32 v246, off, s33 offset:572
+; GISEL-NEXT: scratch_load_b32 v247, off, s33 offset:576
+; GISEL-NEXT: s_mov_b32 exec_lo, s4
+; GISEL-NEXT: s_mov_b32 s33, s0
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: call_gfx_from_whole_wave:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_mov_b32 s0, s33
+; DAGISEL64-NEXT: s_mov_b32 s33, s32
+; DAGISEL64-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s33 offset:4
+; DAGISEL64-NEXT: scratch_store_b32 off, v1, s33 offset:8
+; DAGISEL64-NEXT: scratch_store_b32 off, v2, s33 offset:12
+; DAGISEL64-NEXT: scratch_store_b32 off, v3, s33 offset:16
+; DAGISEL64-NEXT: scratch_store_b32 off, v4, s33 offset:20
+; DAGISEL64-NEXT: scratch_store_b32 off, v5, s33 offset:24
+; DAGISEL64-NEXT: scratch_store_b32 off, v6, s33 offset:28
+; DAGISEL64-NEXT: scratch_store_b32 off, v7, s33 offset:32
+; DAGISEL64-NEXT: scratch_store_b32 off, v8, s33 offset:36
+; DAGISEL64-NEXT: scratch_store_b32 off, v9, s33 offset:40
+; DAGISEL64-NEXT: scratch_store_b32 off, v10, s33 offset:44
+; DAGISEL64-NEXT: scratch_store_b32 off, v11, s33 offset:48
+; DAGISEL64-NEXT: scratch_store_b32 off, v12, s33 offset:52
+; DAGISEL64-NEXT: scratch_store_b32 off, v13, s33 offset:56
+; DAGISEL64-NEXT: scratch_store_b32 off, v14, s33 offset:60
+; DAGISEL64-NEXT: scratch_store_b32 off, v15, s33 offset:64
+; DAGISEL64-NEXT: scratch_store_b32 off, v16, s33 offset:68
+; DAGISEL64-NEXT: scratch_store_b32 off, v17, s33 offset:72
+; DAGISEL64-NEXT: scratch_store_b32 off, v18, s33 offset:76
+; DAGISEL64-NEXT: scratch_store_b32 off, v19, s33 offset:80
+; DAGISEL64-NEXT: scratch_store_b32 off, v20, s33 offset:84
+; DAGISEL64-NEXT: scratch_store_b32 off, v21, s33 offset:88
+; DAGISEL64-NEXT: scratch_store_b32 off, v22, s33 offset:92
+; DAGISEL64-NEXT: scratch_store_b32 off, v23, s33 offset:96
+; DAGISEL64-NEXT: scratch_store_b32 off, v24, s33 offset:100
+; DAGISEL64-NEXT: scratch_store_b32 off, v25, s33 offset:104
+; DAGISEL64-NEXT: scratch_store_b32 off, v26, s33 offset:108
+; DAGISEL64-NEXT: scratch_store_b32 off, v27, s33 offset:112
+; DAGISEL64-NEXT: scratch_store_b32 off, v28, s33 offset:116
+; DAGISEL64-NEXT: scratch_store_b32 off, v29, s33 offset:120
+; DAGISEL64-NEXT: scratch_store_b32 off, v30, s33 offset:124
+; DAGISEL64-NEXT: scratch_store_b32 off, v31, s33 offset:128
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_store_b32 off, v32, s33 offset:132
+; DAGISEL64-NEXT: scratch_store_b32 off, v33, s33 offset:136
+; DAGISEL64-NEXT: scratch_store_b32 off, v34, s33 offset:140
+; DAGISEL64-NEXT: scratch_store_b32 off, v35, s33 offset:144
+; DAGISEL64-NEXT: scratch_store_b32 off, v36, s33 offset:148
+; DAGISEL64-NEXT: scratch_store_b32 off, v37, s33 offset:152
+; DAGISEL64-NEXT: scratch_store_b32 off, v38, s33 offset:156
+; DAGISEL64-NEXT: scratch_store_b32 off, v39, s33 offset:160
+; DAGISEL64-NEXT: scratch_store_b32 off, v48, s33 offset:164
+; DAGISEL64-NEXT: scratch_store_b32 off, v49, s33 offset:168
+; DAGISEL64-NEXT: scratch_store_b32 off, v50, s33 offset:172
+; DAGISEL64-NEXT: scratch_store_b32 off, v51, s33 offset:176
+; DAGISEL64-NEXT: scratch_store_b32 off, v52, s33 offset:180
+; DAGISEL64-NEXT: scratch_store_b32 off, v53, s33 offset:184
+; DAGISEL64-NEXT: scratch_store_b32 off, v54, s33 offset:188
+; DAGISEL64-NEXT: scratch_store_b32 off, v55, s33 offset:192
+; DAGISEL64-NEXT: scratch_store_b32 off, v64, s33 offset:196
+; DAGISEL64-NEXT: scratch_store_b32 off, v65, s33 offset:200
+; DAGISEL64-NEXT: scratch_store_b32 off, v66, s33 offset:204
+; DAGISEL64-NEXT: scratch_store_b32 off, v67, s33 offset:208
+; DAGISEL64-NEXT: scratch_store_b32 off, v68, s33 offset:212
+; DAGISEL64-NEXT: scratch_store_b32 off, v69, s33 offset:216
+; DAGISEL64-NEXT: scratch_store_b32 off, v70, s33 offset:220
+; DAGISEL64-NEXT: scratch_store_b32 off, v71, s33 offset:224
+; DAGISEL64-NEXT: scratch_store_b32 off, v80, s33 offset:228
+; DAGISEL64-NEXT: scratch_store_b32 off, v81, s33 offset:232
+; DAGISEL64-NEXT: scratch_store_b32 off, v82, s33 offset:236
+; DAGISEL64-NEXT: scratch_store_b32 off, v83, s33 offset:240
+; DAGISEL64-NEXT: scratch_store_b32 off, v84, s33 offset:244
+; DAGISEL64-NEXT: scratch_store_b32 off, v85, s33 offset:248
+; DAGISEL64-NEXT: scratch_store_b32 off, v86, s33 offset:252
+; DAGISEL64-NEXT: scratch_store_b32 off, v87, s33 offset:256
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_store_b32 off, v96, s33 offset:260
+; DAGISEL64-NEXT: scratch_store_b32 off, v97, s33 offset:264
+; DAGISEL64-NEXT: scratch_store_b32 off, v98, s33 offset:268
+; DAGISEL64-NEXT: scratch_store_b32 off, v99, s33 offset:272
+; DAGISEL64-NEXT: scratch_store_b32 off, v100, s33 offset:276
+; DAGISEL64-NEXT: scratch_store_b32 off, v101, s33 offset:280
+; DAGISEL64-NEXT: scratch_store_b32 off, v102, s33 offset:284
+; DAGISEL64-NEXT: scratch_store_b32 off, v103, s33 offset:288
+; DAGISEL64-NEXT: scratch_store_b32 off, v112, s33 offset:292
+; DAGISEL64-NEXT: scratch_store_b32 off, v113, s33 offset:296
+; DAGISEL64-NEXT: scratch_store_b32 off, v114, s33 offset:300
+; DAGISEL64-NEXT: scratch_store_b32 off, v115, s33 offset:304
+; DAGISEL64-NEXT: scratch_store_b32 off, v116, s33 offset:308
+; DAGISEL64-NEXT: scratch_store_b32 off, v117, s33 offset:312
+; DAGISEL64-NEXT: scratch_store_b32 off, v118, s33 offset:316
+; DAGISEL64-NEXT: scratch_store_b32 off, v119, s33 offset:320
+; DAGISEL64-NEXT: scratch_store_b32 off, v128, s33 offset:324
+; DAGISEL64-NEXT: scratch_store_b32 off, v129, s33 offset:328
+; DAGISEL64-NEXT: scratch_store_b32 off, v130, s33 offset:332
+; DAGISEL64-NEXT: scratch_store_b32 off, v131, s33 offset:336
+; DAGISEL64-NEXT: scratch_store_b32 off, v132, s33 offset:340
+; DAGISEL64-NEXT: scratch_store_b32 off, v133, s33 offset:344
+; DAGISEL64-NEXT: scratch_store_b32 off, v134, s33 offset:348
+; DAGISEL64-NEXT: scratch_store_b32 off, v135, s33 offset:352
+; DAGISEL64-NEXT: scratch_store_b32 off, v144, s33 offset:356
+; DAGISEL64-NEXT: scratch_store_b32 off, v145, s33 offset:360
+; DAGISEL64-NEXT: scratch_store_b32 off, v146, s33 offset:364
+; DAGISEL64-NEXT: scratch_store_b32 off, v147, s33 offset:368
+; DAGISEL64-NEXT: scratch_store_b32 off, v148, s33 offset:372
+; DAGISEL64-NEXT: scratch_store_b32 off, v149, s33 offset:376
+; DAGISEL64-NEXT: scratch_store_b32 off, v150, s33 offset:380
+; DAGISEL64-NEXT: scratch_store_b32 off, v151, s33 offset:384
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_store_b32 off, v160, s33 offset:388
+; DAGISEL64-NEXT: scratch_store_b32 off, v161, s33 offset:392
+; DAGISEL64-NEXT: scratch_store_b32 off, v162, s33 offset:396
+; DAGISEL64-NEXT: scratch_store_b32 off, v163, s33 offset:400
+; DAGISEL64-NEXT: scratch_store_b32 off, v164, s33 offset:404
+; DAGISEL64-NEXT: scratch_store_b32 off, v165, s33 offset:408
+; DAGISEL64-NEXT: scratch_store_b32 off, v166, s33 offset:412
+; DAGISEL64-NEXT: scratch_store_b32 off, v167, s33 offset:416
+; DAGISEL64-NEXT: scratch_store_b32 off, v176, s33 offset:420
+; DAGISEL64-NEXT: scratch_store_b32 off, v177, s33 offset:424
+; DAGISEL64-NEXT: scratch_store_b32 off, v178, s33 offset:428
+; DAGISEL64-NEXT: scratch_store_b32 off, v179, s33 offset:432
+; DAGISEL64-NEXT: scratch_store_b32 off, v180, s33 offset:436
+; DAGISEL64-NEXT: scratch_store_b32 off, v181, s33 offset:440
+; DAGISEL64-NEXT: scratch_store_b32 off, v182, s33 offset:444
+; DAGISEL64-NEXT: scratch_store_b32 off, v183, s33 offset:448
+; DAGISEL64-NEXT: scratch_store_b32 off, v192, s33 offset:452
+; DAGISEL64-NEXT: scratch_store_b32 off, v193, s33 offset:456
+; DAGISEL64-NEXT: scratch_store_b32 off, v194, s33 offset:460
+; DAGISEL64-NEXT: scratch_store_b32 off, v195, s33 offset:464
+; DAGISEL64-NEXT: scratch_store_b32 off, v196, s33 offset:468
+; DAGISEL64-NEXT: scratch_store_b32 off, v197, s33 offset:472
+; DAGISEL64-NEXT: scratch_store_b32 off, v198, s33 offset:476
+; DAGISEL64-NEXT: scratch_store_b32 off, v199, s33 offset:480
+; DAGISEL64-NEXT: scratch_store_b32 off, v208, s33 offset:484
+; DAGISEL64-NEXT: scratch_store_b32 off, v209, s33 offset:488
+; DAGISEL64-NEXT: scratch_store_b32 off, v210, s33 offset:492
+; DAGISEL64-NEXT: scratch_store_b32 off, v211, s33 offset:496
+; DAGISEL64-NEXT: scratch_store_b32 off, v212, s33 offset:500
+; DAGISEL64-NEXT: scratch_store_b32 off, v213, s33 offset:504
+; DAGISEL64-NEXT: scratch_store_b32 off, v214, s33 offset:508
+; DAGISEL64-NEXT: scratch_store_b32 off, v215, s33 offset:512
+; DAGISEL64-NEXT: s_clause 0xf
+; DAGISEL64-NEXT: scratch_store_b32 off, v224, s33 offset:516
+; DAGISEL64-NEXT: scratch_store_b32 off, v225, s33 offset:520
+; DAGISEL64-NEXT: scratch_store_b32 off, v226, s33 offset:524
+; DAGISEL64-NEXT: scratch_store_b32 off, v227, s33 offset:528
+; DAGISEL64-NEXT: scratch_store_b32 off, v228, s33 offset:532
+; DAGISEL64-NEXT: scratch_store_b32 off, v229, s33 offset:536
+; DAGISEL64-NEXT: scratch_store_b32 off, v230, s33 offset:540
+; DAGISEL64-NEXT: scratch_store_b32 off, v231, s33 offset:544
+; DAGISEL64-NEXT: scratch_store_b32 off, v240, s33 offset:548
+; DAGISEL64-NEXT: scratch_store_b32 off, v241, s33 offset:552
+; DAGISEL64-NEXT: scratch_store_b32 off, v242, s33 offset:556
+; DAGISEL64-NEXT: scratch_store_b32 off, v243, s33 offset:560
+; DAGISEL64-NEXT: scratch_store_b32 off, v244, s33 offset:564
+; DAGISEL64-NEXT: scratch_store_b32 off, v245, s33 offset:568
+; DAGISEL64-NEXT: scratch_store_b32 off, v246, s33 offset:572
+; DAGISEL64-NEXT: scratch_store_b32 off, v247, s33 offset:576
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: v_writelane_b32 v40, s0, 4
+; DAGISEL64-NEXT: v_mov_b32_e32 v2, v0
+; DAGISEL64-NEXT: v_swap_b32 v0, v1
+; DAGISEL64-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi
+; DAGISEL64-NEXT: v_writelane_b32 v40, s4, 0
+; DAGISEL64-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo
+; DAGISEL64-NEXT: s_addk_co_i32 s32, 0x250
+; DAGISEL64-NEXT: v_writelane_b32 v40, s5, 1
+; DAGISEL64-NEXT: v_writelane_b32 v40, s30, 2
+; DAGISEL64-NEXT: v_writelane_b32 v40, s31, 3
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL64-NEXT: v_readlane_b32 s31, v40, 3
+; DAGISEL64-NEXT: v_readlane_b32 s30, v40, 2
+; DAGISEL64-NEXT: v_readlane_b32 s5, v40, 1
+; DAGISEL64-NEXT: v_readlane_b32 s4, v40, 0
+; DAGISEL64-NEXT: v_readlane_b32 s0, v40, 4
+; DAGISEL64-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; DAGISEL64-NEXT: s_mov_b32 s32, s33
+; DAGISEL64-NEXT: s_xor_b64 exec, s[4:5], -1
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s33 offset:4
+; DAGISEL64-NEXT: scratch_load_b32 v1, off, s33 offset:8
+; DAGISEL64-NEXT: scratch_load_b32 v2, off, s33 offset:12
+; DAGISEL64-NEXT: scratch_load_b32 v3, off, s33 offset:16
+; DAGISEL64-NEXT: scratch_load_b32 v4, off, s33 offset:20
+; DAGISEL64-NEXT: scratch_load_b32 v5, off, s33 offset:24
+; DAGISEL64-NEXT: scratch_load_b32 v6, off, s33 offset:28
+; DAGISEL64-NEXT: scratch_load_b32 v7, off, s33 offset:32
+; DAGISEL64-NEXT: scratch_load_b32 v8, off, s33 offset:36
+; DAGISEL64-NEXT: scratch_load_b32 v9, off, s33 offset:40
+; DAGISEL64-NEXT: scratch_load_b32 v10, off, s33 offset:44
+; DAGISEL64-NEXT: scratch_load_b32 v11, off, s33 offset:48
+; DAGISEL64-NEXT: scratch_load_b32 v12, off, s33 offset:52
+; DAGISEL64-NEXT: scratch_load_b32 v13, off, s33 offset:56
+; DAGISEL64-NEXT: scratch_load_b32 v14, off, s33 offset:60
+; DAGISEL64-NEXT: scratch_load_b32 v15, off, s33 offset:64
+; DAGISEL64-NEXT: scratch_load_b32 v16, off, s33 offset:68
+; DAGISEL64-NEXT: scratch_load_b32 v17, off, s33 offset:72
+; DAGISEL64-NEXT: scratch_load_b32 v18, off, s33 offset:76
+; DAGISEL64-NEXT: scratch_load_b32 v19, off, s33 offset:80
+; DAGISEL64-NEXT: scratch_load_b32 v20, off, s33 offset:84
+; DAGISEL64-NEXT: scratch_load_b32 v21, off, s33 offset:88
+; DAGISEL64-NEXT: scratch_load_b32 v22, off, s33 offset:92
+; DAGISEL64-NEXT: scratch_load_b32 v23, off, s33 offset:96
+; DAGISEL64-NEXT: scratch_load_b32 v24, off, s33 offset:100
+; DAGISEL64-NEXT: scratch_load_b32 v25, off, s33 offset:104
+; DAGISEL64-NEXT: scratch_load_b32 v26, off, s33 offset:108
+; DAGISEL64-NEXT: scratch_load_b32 v27, off, s33 offset:112
+; DAGISEL64-NEXT: scratch_load_b32 v28, off, s33 offset:116
+; DAGISEL64-NEXT: scratch_load_b32 v29, off, s33 offset:120
+; DAGISEL64-NEXT: scratch_load_b32 v30, off, s33 offset:124
+; DAGISEL64-NEXT: scratch_load_b32 v31, off, s33 offset:128
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_load_b32 v32, off, s33 offset:132
+; DAGISEL64-NEXT: scratch_load_b32 v33, off, s33 offset:136
+; DAGISEL64-NEXT: scratch_load_b32 v34, off, s33 offset:140
+; DAGISEL64-NEXT: scratch_load_b32 v35, off, s33 offset:144
+; DAGISEL64-NEXT: scratch_load_b32 v36, off, s33 offset:148
+; DAGISEL64-NEXT: scratch_load_b32 v37, off, s33 offset:152
+; DAGISEL64-NEXT: scratch_load_b32 v38, off, s33 offset:156
+; DAGISEL64-NEXT: scratch_load_b32 v39, off, s33 offset:160
+; DAGISEL64-NEXT: scratch_load_b32 v48, off, s33 offset:164
+; DAGISEL64-NEXT: scratch_load_b32 v49, off, s33 offset:168
+; DAGISEL64-NEXT: scratch_load_b32 v50, off, s33 offset:172
+; DAGISEL64-NEXT: scratch_load_b32 v51, off, s33 offset:176
+; DAGISEL64-NEXT: scratch_load_b32 v52, off, s33 offset:180
+; DAGISEL64-NEXT: scratch_load_b32 v53, off, s33 offset:184
+; DAGISEL64-NEXT: scratch_load_b32 v54, off, s33 offset:188
+; DAGISEL64-NEXT: scratch_load_b32 v55, off, s33 offset:192
+; DAGISEL64-NEXT: scratch_load_b32 v64, off, s33 offset:196
+; DAGISEL64-NEXT: scratch_load_b32 v65, off, s33 offset:200
+; DAGISEL64-NEXT: scratch_load_b32 v66, off, s33 offset:204
+; DAGISEL64-NEXT: scratch_load_b32 v67, off, s33 offset:208
+; DAGISEL64-NEXT: scratch_load_b32 v68, off, s33 offset:212
+; DAGISEL64-NEXT: scratch_load_b32 v69, off, s33 offset:216
+; DAGISEL64-NEXT: scratch_load_b32 v70, off, s33 offset:220
+; DAGISEL64-NEXT: scratch_load_b32 v71, off, s33 offset:224
+; DAGISEL64-NEXT: scratch_load_b32 v80, off, s33 offset:228
+; DAGISEL64-NEXT: scratch_load_b32 v81, off, s33 offset:232
+; DAGISEL64-NEXT: scratch_load_b32 v82, off, s33 offset:236
+; DAGISEL64-NEXT: scratch_load_b32 v83, off, s33 offset:240
+; DAGISEL64-NEXT: scratch_load_b32 v84, off, s33 offset:244
+; DAGISEL64-NEXT: scratch_load_b32 v85, off, s33 offset:248
+; DAGISEL64-NEXT: scratch_load_b32 v86, off, s33 offset:252
+; DAGISEL64-NEXT: scratch_load_b32 v87, off, s33 offset:256
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_load_b32 v96, off, s33 offset:260
+; DAGISEL64-NEXT: scratch_load_b32 v97, off, s33 offset:264
+; DAGISEL64-NEXT: scratch_load_b32 v98, off, s33 offset:268
+; DAGISEL64-NEXT: scratch_load_b32 v99, off, s33 offset:272
+; DAGISEL64-NEXT: scratch_load_b32 v100, off, s33 offset:276
+; DAGISEL64-NEXT: scratch_load_b32 v101, off, s33 offset:280
+; DAGISEL64-NEXT: scratch_load_b32 v102, off, s33 offset:284
+; DAGISEL64-NEXT: scratch_load_b32 v103, off, s33 offset:288
+; DAGISEL64-NEXT: scratch_load_b32 v112, off, s33 offset:292
+; DAGISEL64-NEXT: scratch_load_b32 v113, off, s33 offset:296
+; DAGISEL64-NEXT: scratch_load_b32 v114, off, s33 offset:300
+; DAGISEL64-NEXT: scratch_load_b32 v115, off, s33 offset:304
+; DAGISEL64-NEXT: scratch_load_b32 v116, off, s33 offset:308
+; DAGISEL64-NEXT: scratch_load_b32 v117, off, s33 offset:312
+; DAGISEL64-NEXT: scratch_load_b32 v118, off, s33 offset:316
+; DAGISEL64-NEXT: scratch_load_b32 v119, off, s33 offset:320
+; DAGISEL64-NEXT: scratch_load_b32 v128, off, s33 offset:324
+; DAGISEL64-NEXT: scratch_load_b32 v129, off, s33 offset:328
+; DAGISEL64-NEXT: scratch_load_b32 v130, off, s33 offset:332
+; DAGISEL64-NEXT: scratch_load_b32 v131, off, s33 offset:336
+; DAGISEL64-NEXT: scratch_load_b32 v132, off, s33 offset:340
+; DAGISEL64-NEXT: scratch_load_b32 v133, off, s33 offset:344
+; DAGISEL64-NEXT: scratch_load_b32 v134, off, s33 offset:348
+; DAGISEL64-NEXT: scratch_load_b32 v135, off, s33 offset:352
+; DAGISEL64-NEXT: scratch_load_b32 v144, off, s33 offset:356
+; DAGISEL64-NEXT: scratch_load_b32 v145, off, s33 offset:360
+; DAGISEL64-NEXT: scratch_load_b32 v146, off, s33 offset:364
+; DAGISEL64-NEXT: scratch_load_b32 v147, off, s33 offset:368
+; DAGISEL64-NEXT: scratch_load_b32 v148, off, s33 offset:372
+; DAGISEL64-NEXT: scratch_load_b32 v149, off, s33 offset:376
+; DAGISEL64-NEXT: scratch_load_b32 v150, off, s33 offset:380
+; DAGISEL64-NEXT: scratch_load_b32 v151, off, s33 offset:384
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_load_b32 v160, off, s33 offset:388
+; DAGISEL64-NEXT: scratch_load_b32 v161, off, s33 offset:392
+; DAGISEL64-NEXT: scratch_load_b32 v162, off, s33 offset:396
+; DAGISEL64-NEXT: scratch_load_b32 v163, off, s33 offset:400
+; DAGISEL64-NEXT: scratch_load_b32 v164, off, s33 offset:404
+; DAGISEL64-NEXT: scratch_load_b32 v165, off, s33 offset:408
+; DAGISEL64-NEXT: scratch_load_b32 v166, off, s33 offset:412
+; DAGISEL64-NEXT: scratch_load_b32 v167, off, s33 offset:416
+; DAGISEL64-NEXT: scratch_load_b32 v176, off, s33 offset:420
+; DAGISEL64-NEXT: scratch_load_b32 v177, off, s33 offset:424
+; DAGISEL64-NEXT: scratch_load_b32 v178, off, s33 offset:428
+; DAGISEL64-NEXT: scratch_load_b32 v179, off, s33 offset:432
+; DAGISEL64-NEXT: scratch_load_b32 v180, off, s33 offset:436
+; DAGISEL64-NEXT: scratch_load_b32 v181, off, s33 offset:440
+; DAGISEL64-NEXT: scratch_load_b32 v182, off, s33 offset:444
+; DAGISEL64-NEXT: scratch_load_b32 v183, off, s33 offset:448
+; DAGISEL64-NEXT: scratch_load_b32 v192, off, s33 offset:452
+; DAGISEL64-NEXT: scratch_load_b32 v193, off, s33 offset:456
+; DAGISEL64-NEXT: scratch_load_b32 v194, off, s33 offset:460
+; DAGISEL64-NEXT: scratch_load_b32 v195, off, s33 offset:464
+; DAGISEL64-NEXT: scratch_load_b32 v196, off, s33 offset:468
+; DAGISEL64-NEXT: scratch_load_b32 v197, off, s33 offset:472
+; DAGISEL64-NEXT: scratch_load_b32 v198, off, s33 offset:476
+; DAGISEL64-NEXT: scratch_load_b32 v199, off, s33 offset:480
+; DAGISEL64-NEXT: scratch_load_b32 v208, off, s33 offset:484
+; DAGISEL64-NEXT: scratch_load_b32 v209, off, s33 offset:488
+; DAGISEL64-NEXT: scratch_load_b32 v210, off, s33 offset:492
+; DAGISEL64-NEXT: scratch_load_b32 v211, off, s33 offset:496
+; DAGISEL64-NEXT: scratch_load_b32 v212, off, s33 offset:500
+; DAGISEL64-NEXT: scratch_load_b32 v213, off, s33 offset:504
+; DAGISEL64-NEXT: scratch_load_b32 v214, off, s33 offset:508
+; DAGISEL64-NEXT: scratch_load_b32 v215, off, s33 offset:512
+; DAGISEL64-NEXT: s_clause 0xf
+; DAGISEL64-NEXT: scratch_load_b32 v224, off, s33 offset:516
+; DAGISEL64-NEXT: scratch_load_b32 v225, off, s33 offset:520
+; DAGISEL64-NEXT: scratch_load_b32 v226, off, s33 offset:524
+; DAGISEL64-NEXT: scratch_load_b32 v227, off, s33 offset:528
+; DAGISEL64-NEXT: scratch_load_b32 v228, off, s33 offset:532
+; DAGISEL64-NEXT: scratch_load_b32 v229, off, s33 offset:536
+; DAGISEL64-NEXT: scratch_load_b32 v230, off, s33 offset:540
+; DAGISEL64-NEXT: scratch_load_b32 v231, off, s33 offset:544
+; DAGISEL64-NEXT: scratch_load_b32 v240, off, s33 offset:548
+; DAGISEL64-NEXT: scratch_load_b32 v241, off, s33 offset:552
+; DAGISEL64-NEXT: scratch_load_b32 v242, off, s33 offset:556
+; DAGISEL64-NEXT: scratch_load_b32 v243, off, s33 offset:560
+; DAGISEL64-NEXT: scratch_load_b32 v244, off, s33 offset:564
+; DAGISEL64-NEXT: scratch_load_b32 v245, off, s33 offset:568
+; DAGISEL64-NEXT: scratch_load_b32 v246, off, s33 offset:572
+; DAGISEL64-NEXT: scratch_load_b32 v247, off, s33 offset:576
+; DAGISEL64-NEXT: s_mov_b64 exec, s[4:5]
+; DAGISEL64-NEXT: s_mov_b32 s33, s0
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: call_gfx_from_whole_wave:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_mov_b32 s0, s33
+; GISEL64-NEXT: s_mov_b32 s33, s32
+; GISEL64-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_store_b32 off, v0, s33 offset:4
+; GISEL64-NEXT: scratch_store_b32 off, v1, s33 offset:8
+; GISEL64-NEXT: scratch_store_b32 off, v2, s33 offset:12
+; GISEL64-NEXT: scratch_store_b32 off, v3, s33 offset:16
+; GISEL64-NEXT: scratch_store_b32 off, v4, s33 offset:20
+; GISEL64-NEXT: scratch_store_b32 off, v5, s33 offset:24
+; GISEL64-NEXT: scratch_store_b32 off, v6, s33 offset:28
+; GISEL64-NEXT: scratch_store_b32 off, v7, s33 offset:32
+; GISEL64-NEXT: scratch_store_b32 off, v8, s33 offset:36
+; GISEL64-NEXT: scratch_store_b32 off, v9, s33 offset:40
+; GISEL64-NEXT: scratch_store_b32 off, v10, s33 offset:44
+; GISEL64-NEXT: scratch_store_b32 off, v11, s33 offset:48
+; GISEL64-NEXT: scratch_store_b32 off, v12, s33 offset:52
+; GISEL64-NEXT: scratch_store_b32 off, v13, s33 offset:56
+; GISEL64-NEXT: scratch_store_b32 off, v14, s33 offset:60
+; GISEL64-NEXT: scratch_store_b32 off, v15, s33 offset:64
+; GISEL64-NEXT: scratch_store_b32 off, v16, s33 offset:68
+; GISEL64-NEXT: scratch_store_b32 off, v17, s33 offset:72
+; GISEL64-NEXT: scratch_store_b32 off, v18, s33 offset:76
+; GISEL64-NEXT: scratch_store_b32 off, v19, s33 offset:80
+; GISEL64-NEXT: scratch_store_b32 off, v20, s33 offset:84
+; GISEL64-NEXT: scratch_store_b32 off, v21, s33 offset:88
+; GISEL64-NEXT: scratch_store_b32 off, v22, s33 offset:92
+; GISEL64-NEXT: scratch_store_b32 off, v23, s33 offset:96
+; GISEL64-NEXT: scratch_store_b32 off, v24, s33 offset:100
+; GISEL64-NEXT: scratch_store_b32 off, v25, s33 offset:104
+; GISEL64-NEXT: scratch_store_b32 off, v26, s33 offset:108
+; GISEL64-NEXT: scratch_store_b32 off, v27, s33 offset:112
+; GISEL64-NEXT: scratch_store_b32 off, v28, s33 offset:116
+; GISEL64-NEXT: scratch_store_b32 off, v29, s33 offset:120
+; GISEL64-NEXT: scratch_store_b32 off, v30, s33 offset:124
+; GISEL64-NEXT: scratch_store_b32 off, v31, s33 offset:128
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_store_b32 off, v32, s33 offset:132
+; GISEL64-NEXT: scratch_store_b32 off, v33, s33 offset:136
+; GISEL64-NEXT: scratch_store_b32 off, v34, s33 offset:140
+; GISEL64-NEXT: scratch_store_b32 off, v35, s33 offset:144
+; GISEL64-NEXT: scratch_store_b32 off, v36, s33 offset:148
+; GISEL64-NEXT: scratch_store_b32 off, v37, s33 offset:152
+; GISEL64-NEXT: scratch_store_b32 off, v38, s33 offset:156
+; GISEL64-NEXT: scratch_store_b32 off, v39, s33 offset:160
+; GISEL64-NEXT: scratch_store_b32 off, v48, s33 offset:164
+; GISEL64-NEXT: scratch_store_b32 off, v49, s33 offset:168
+; GISEL64-NEXT: scratch_store_b32 off, v50, s33 offset:172
+; GISEL64-NEXT: scratch_store_b32 off, v51, s33 offset:176
+; GISEL64-NEXT: scratch_store_b32 off, v52, s33 offset:180
+; GISEL64-NEXT: scratch_store_b32 off, v53, s33 offset:184
+; GISEL64-NEXT: scratch_store_b32 off, v54, s33 offset:188
+; GISEL64-NEXT: scratch_store_b32 off, v55, s33 offset:192
+; GISEL64-NEXT: scratch_store_b32 off, v64, s33 offset:196
+; GISEL64-NEXT: scratch_store_b32 off, v65, s33 offset:200
+; GISEL64-NEXT: scratch_store_b32 off, v66, s33 offset:204
+; GISEL64-NEXT: scratch_store_b32 off, v67, s33 offset:208
+; GISEL64-NEXT: scratch_store_b32 off, v68, s33 offset:212
+; GISEL64-NEXT: scratch_store_b32 off, v69, s33 offset:216
+; GISEL64-NEXT: scratch_store_b32 off, v70, s33 offset:220
+; GISEL64-NEXT: scratch_store_b32 off, v71, s33 offset:224
+; GISEL64-NEXT: scratch_store_b32 off, v80, s33 offset:228
+; GISEL64-NEXT: scratch_store_b32 off, v81, s33 offset:232
+; GISEL64-NEXT: scratch_store_b32 off, v82, s33 offset:236
+; GISEL64-NEXT: scratch_store_b32 off, v83, s33 offset:240
+; GISEL64-NEXT: scratch_store_b32 off, v84, s33 offset:244
+; GISEL64-NEXT: scratch_store_b32 off, v85, s33 offset:248
+; GISEL64-NEXT: scratch_store_b32 off, v86, s33 offset:252
+; GISEL64-NEXT: scratch_store_b32 off, v87, s33 offset:256
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_store_b32 off, v96, s33 offset:260
+; GISEL64-NEXT: scratch_store_b32 off, v97, s33 offset:264
+; GISEL64-NEXT: scratch_store_b32 off, v98, s33 offset:268
+; GISEL64-NEXT: scratch_store_b32 off, v99, s33 offset:272
+; GISEL64-NEXT: scratch_store_b32 off, v100, s33 offset:276
+; GISEL64-NEXT: scratch_store_b32 off, v101, s33 offset:280
+; GISEL64-NEXT: scratch_store_b32 off, v102, s33 offset:284
+; GISEL64-NEXT: scratch_store_b32 off, v103, s33 offset:288
+; GISEL64-NEXT: scratch_store_b32 off, v112, s33 offset:292
+; GISEL64-NEXT: scratch_store_b32 off, v113, s33 offset:296
+; GISEL64-NEXT: scratch_store_b32 off, v114, s33 offset:300
+; GISEL64-NEXT: scratch_store_b32 off, v115, s33 offset:304
+; GISEL64-NEXT: scratch_store_b32 off, v116, s33 offset:308
+; GISEL64-NEXT: scratch_store_b32 off, v117, s33 offset:312
+; GISEL64-NEXT: scratch_store_b32 off, v118, s33 offset:316
+; GISEL64-NEXT: scratch_store_b32 off, v119, s33 offset:320
+; GISEL64-NEXT: scratch_store_b32 off, v128, s33 offset:324
+; GISEL64-NEXT: scratch_store_b32 off, v129, s33 offset:328
+; GISEL64-NEXT: scratch_store_b32 off, v130, s33 offset:332
+; GISEL64-NEXT: scratch_store_b32 off, v131, s33 offset:336
+; GISEL64-NEXT: scratch_store_b32 off, v132, s33 offset:340
+; GISEL64-NEXT: scratch_store_b32 off, v133, s33 offset:344
+; GISEL64-NEXT: scratch_store_b32 off, v134, s33 offset:348
+; GISEL64-NEXT: scratch_store_b32 off, v135, s33 offset:352
+; GISEL64-NEXT: scratch_store_b32 off, v144, s33 offset:356
+; GISEL64-NEXT: scratch_store_b32 off, v145, s33 offset:360
+; GISEL64-NEXT: scratch_store_b32 off, v146, s33 offset:364
+; GISEL64-NEXT: scratch_store_b32 off, v147, s33 offset:368
+; GISEL64-NEXT: scratch_store_b32 off, v148, s33 offset:372
+; GISEL64-NEXT: scratch_store_b32 off, v149, s33 offset:376
+; GISEL64-NEXT: scratch_store_b32 off, v150, s33 offset:380
+; GISEL64-NEXT: scratch_store_b32 off, v151, s33 offset:384
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_store_b32 off, v160, s33 offset:388
+; GISEL64-NEXT: scratch_store_b32 off, v161, s33 offset:392
+; GISEL64-NEXT: scratch_store_b32 off, v162, s33 offset:396
+; GISEL64-NEXT: scratch_store_b32 off, v163, s33 offset:400
+; GISEL64-NEXT: scratch_store_b32 off, v164, s33 offset:404
+; GISEL64-NEXT: scratch_store_b32 off, v165, s33 offset:408
+; GISEL64-NEXT: scratch_store_b32 off, v166, s33 offset:412
+; GISEL64-NEXT: scratch_store_b32 off, v167, s33 offset:416
+; GISEL64-NEXT: scratch_store_b32 off, v176, s33 offset:420
+; GISEL64-NEXT: scratch_store_b32 off, v177, s33 offset:424
+; GISEL64-NEXT: scratch_store_b32 off, v178, s33 offset:428
+; GISEL64-NEXT: scratch_store_b32 off, v179, s33 offset:432
+; GISEL64-NEXT: scratch_store_b32 off, v180, s33 offset:436
+; GISEL64-NEXT: scratch_store_b32 off, v181, s33 offset:440
+; GISEL64-NEXT: scratch_store_b32 off, v182, s33 offset:444
+; GISEL64-NEXT: scratch_store_b32 off, v183, s33 offset:448
+; GISEL64-NEXT: scratch_store_b32 off, v192, s33 offset:452
+; GISEL64-NEXT: scratch_store_b32 off, v193, s33 offset:456
+; GISEL64-NEXT: scratch_store_b32 off, v194, s33 offset:460
+; GISEL64-NEXT: scratch_store_b32 off, v195, s33 offset:464
+; GISEL64-NEXT: scratch_store_b32 off, v196, s33 offset:468
+; GISEL64-NEXT: scratch_store_b32 off, v197, s33 offset:472
+; GISEL64-NEXT: scratch_store_b32 off, v198, s33 offset:476
+; GISEL64-NEXT: scratch_store_b32 off, v199, s33 offset:480
+; GISEL64-NEXT: scratch_store_b32 off, v208, s33 offset:484
+; GISEL64-NEXT: scratch_store_b32 off, v209, s33 offset:488
+; GISEL64-NEXT: scratch_store_b32 off, v210, s33 offset:492
+; GISEL64-NEXT: scratch_store_b32 off, v211, s33 offset:496
+; GISEL64-NEXT: scratch_store_b32 off, v212, s33 offset:500
+; GISEL64-NEXT: scratch_store_b32 off, v213, s33 offset:504
+; GISEL64-NEXT: scratch_store_b32 off, v214, s33 offset:508
+; GISEL64-NEXT: scratch_store_b32 off, v215, s33 offset:512
+; GISEL64-NEXT: s_clause 0xf
+; GISEL64-NEXT: scratch_store_b32 off, v224, s33 offset:516
+; GISEL64-NEXT: scratch_store_b32 off, v225, s33 offset:520
+; GISEL64-NEXT: scratch_store_b32 off, v226, s33 offset:524
+; GISEL64-NEXT: scratch_store_b32 off, v227, s33 offset:528
+; GISEL64-NEXT: scratch_store_b32 off, v228, s33 offset:532
+; GISEL64-NEXT: scratch_store_b32 off, v229, s33 offset:536
+; GISEL64-NEXT: scratch_store_b32 off, v230, s33 offset:540
+; GISEL64-NEXT: scratch_store_b32 off, v231, s33 offset:544
+; GISEL64-NEXT: scratch_store_b32 off, v240, s33 offset:548
+; GISEL64-NEXT: scratch_store_b32 off, v241, s33 offset:552
+; GISEL64-NEXT: scratch_store_b32 off, v242, s33 offset:556
+; GISEL64-NEXT: scratch_store_b32 off, v243, s33 offset:560
+; GISEL64-NEXT: scratch_store_b32 off, v244, s33 offset:564
+; GISEL64-NEXT: scratch_store_b32 off, v245, s33 offset:568
+; GISEL64-NEXT: scratch_store_b32 off, v246, s33 offset:572
+; GISEL64-NEXT: scratch_store_b32 off, v247, s33 offset:576
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: v_writelane_b32 v40, s0, 4
+; GISEL64-NEXT: v_mov_b32_e32 v2, v0
+; GISEL64-NEXT: v_swap_b32 v0, v1
+; GISEL64-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo
+; GISEL64-NEXT: v_writelane_b32 v40, s4, 0
+; GISEL64-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi
+; GISEL64-NEXT: s_addk_co_i32 s32, 0x250
+; GISEL64-NEXT: v_writelane_b32 v40, s5, 1
+; GISEL64-NEXT: v_writelane_b32 v40, s30, 2
+; GISEL64-NEXT: v_writelane_b32 v40, s31, 3
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL64-NEXT: v_readlane_b32 s31, v40, 3
+; GISEL64-NEXT: v_readlane_b32 s30, v40, 2
+; GISEL64-NEXT: v_readlane_b32 s5, v40, 1
+; GISEL64-NEXT: v_readlane_b32 s4, v40, 0
+; GISEL64-NEXT: v_readlane_b32 s0, v40, 4
+; GISEL64-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GISEL64-NEXT: s_mov_b32 s32, s33
+; GISEL64-NEXT: s_xor_b64 exec, s[4:5], -1
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_load_b32 v0, off, s33 offset:4
+; GISEL64-NEXT: scratch_load_b32 v1, off, s33 offset:8
+; GISEL64-NEXT: scratch_load_b32 v2, off, s33 offset:12
+; GISEL64-NEXT: scratch_load_b32 v3, off, s33 offset:16
+; GISEL64-NEXT: scratch_load_b32 v4, off, s33 offset:20
+; GISEL64-NEXT: scratch_load_b32 v5, off, s33 offset:24
+; GISEL64-NEXT: scratch_load_b32 v6, off, s33 offset:28
+; GISEL64-NEXT: scratch_load_b32 v7, off, s33 offset:32
+; GISEL64-NEXT: scratch_load_b32 v8, off, s33 offset:36
+; GISEL64-NEXT: scratch_load_b32 v9, off, s33 offset:40
+; GISEL64-NEXT: scratch_load_b32 v10, off, s33 offset:44
+; GISEL64-NEXT: scratch_load_b32 v11, off, s33 offset:48
+; GISEL64-NEXT: scratch_load_b32 v12, off, s33 offset:52
+; GISEL64-NEXT: scratch_load_b32 v13, off, s33 offset:56
+; GISEL64-NEXT: scratch_load_b32 v14, off, s33 offset:60
+; GISEL64-NEXT: scratch_load_b32 v15, off, s33 offset:64
+; GISEL64-NEXT: scratch_load_b32 v16, off, s33 offset:68
+; GISEL64-NEXT: scratch_load_b32 v17, off, s33 offset:72
+; GISEL64-NEXT: scratch_load_b32 v18, off, s33 offset:76
+; GISEL64-NEXT: scratch_load_b32 v19, off, s33 offset:80
+; GISEL64-NEXT: scratch_load_b32 v20, off, s33 offset:84
+; GISEL64-NEXT: scratch_load_b32 v21, off, s33 offset:88
+; GISEL64-NEXT: scratch_load_b32 v22, off, s33 offset:92
+; GISEL64-NEXT: scratch_load_b32 v23, off, s33 offset:96
+; GISEL64-NEXT: scratch_load_b32 v24, off, s33 offset:100
+; GISEL64-NEXT: scratch_load_b32 v25, off, s33 offset:104
+; GISEL64-NEXT: scratch_load_b32 v26, off, s33 offset:108
+; GISEL64-NEXT: scratch_load_b32 v27, off, s33 offset:112
+; GISEL64-NEXT: scratch_load_b32 v28, off, s33 offset:116
+; GISEL64-NEXT: scratch_load_b32 v29, off, s33 offset:120
+; GISEL64-NEXT: scratch_load_b32 v30, off, s33 offset:124
+; GISEL64-NEXT: scratch_load_b32 v31, off, s33 offset:128
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_load_b32 v32, off, s33 offset:132
+; GISEL64-NEXT: scratch_load_b32 v33, off, s33 offset:136
+; GISEL64-NEXT: scratch_load_b32 v34, off, s33 offset:140
+; GISEL64-NEXT: scratch_load_b32 v35, off, s33 offset:144
+; GISEL64-NEXT: scratch_load_b32 v36, off, s33 offset:148
+; GISEL64-NEXT: scratch_load_b32 v37, off, s33 offset:152
+; GISEL64-NEXT: scratch_load_b32 v38, off, s33 offset:156
+; GISEL64-NEXT: scratch_load_b32 v39, off, s33 offset:160
+; GISEL64-NEXT: scratch_load_b32 v48, off, s33 offset:164
+; GISEL64-NEXT: scratch_load_b32 v49, off, s33 offset:168
+; GISEL64-NEXT: scratch_load_b32 v50, off, s33 offset:172
+; GISEL64-NEXT: scratch_load_b32 v51, off, s33 offset:176
+; GISEL64-NEXT: scratch_load_b32 v52, off, s33 offset:180
+; GISEL64-NEXT: scratch_load_b32 v53, off, s33 offset:184
+; GISEL64-NEXT: scratch_load_b32 v54, off, s33 offset:188
+; GISEL64-NEXT: scratch_load_b32 v55, off, s33 offset:192
+; GISEL64-NEXT: scratch_load_b32 v64, off, s33 offset:196
+; GISEL64-NEXT: scratch_load_b32 v65, off, s33 offset:200
+; GISEL64-NEXT: scratch_load_b32 v66, off, s33 offset:204
+; GISEL64-NEXT: scratch_load_b32 v67, off, s33 offset:208
+; GISEL64-NEXT: scratch_load_b32 v68, off, s33 offset:212
+; GISEL64-NEXT: scratch_load_b32 v69, off, s33 offset:216
+; GISEL64-NEXT: scratch_load_b32 v70, off, s33 offset:220
+; GISEL64-NEXT: scratch_load_b32 v71, off, s33 offset:224
+; GISEL64-NEXT: scratch_load_b32 v80, off, s33 offset:228
+; GISEL64-NEXT: scratch_load_b32 v81, off, s33 offset:232
+; GISEL64-NEXT: scratch_load_b32 v82, off, s33 offset:236
+; GISEL64-NEXT: scratch_load_b32 v83, off, s33 offset:240
+; GISEL64-NEXT: scratch_load_b32 v84, off, s33 offset:244
+; GISEL64-NEXT: scratch_load_b32 v85, off, s33 offset:248
+; GISEL64-NEXT: scratch_load_b32 v86, off, s33 offset:252
+; GISEL64-NEXT: scratch_load_b32 v87, off, s33 offset:256
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_load_b32 v96, off, s33 offset:260
+; GISEL64-NEXT: scratch_load_b32 v97, off, s33 offset:264
+; GISEL64-NEXT: scratch_load_b32 v98, off, s33 offset:268
+; GISEL64-NEXT: scratch_load_b32 v99, off, s33 offset:272
+; GISEL64-NEXT: scratch_load_b32 v100, off, s33 offset:276
+; GISEL64-NEXT: scratch_load_b32 v101, off, s33 offset:280
+; GISEL64-NEXT: scratch_load_b32 v102, off, s33 offset:284
+; GISEL64-NEXT: scratch_load_b32 v103, off, s33 offset:288
+; GISEL64-NEXT: scratch_load_b32 v112, off, s33 offset:292
+; GISEL64-NEXT: scratch_load_b32 v113, off, s33 offset:296
+; GISEL64-NEXT: scratch_load_b32 v114, off, s33 offset:300
+; GISEL64-NEXT: scratch_load_b32 v115, off, s33 offset:304
+; GISEL64-NEXT: scratch_load_b32 v116, off, s33 offset:308
+; GISEL64-NEXT: scratch_load_b32 v117, off, s33 offset:312
+; GISEL64-NEXT: scratch_load_b32 v118, off, s33 offset:316
+; GISEL64-NEXT: scratch_load_b32 v119, off, s33 offset:320
+; GISEL64-NEXT: scratch_load_b32 v128, off, s33 offset:324
+; GISEL64-NEXT: scratch_load_b32 v129, off, s33 offset:328
+; GISEL64-NEXT: scratch_load_b32 v130, off, s33 offset:332
+; GISEL64-NEXT: scratch_load_b32 v131, off, s33 offset:336
+; GISEL64-NEXT: scratch_load_b32 v132, off, s33 offset:340
+; GISEL64-NEXT: scratch_load_b32 v133, off, s33 offset:344
+; GISEL64-NEXT: scratch_load_b32 v134, off, s33 offset:348
+; GISEL64-NEXT: scratch_load_b32 v135, off, s33 offset:352
+; GISEL64-NEXT: scratch_load_b32 v144, off, s33 offset:356
+; GISEL64-NEXT: scratch_load_b32 v145, off, s33 offset:360
+; GISEL64-NEXT: scratch_load_b32 v146, off, s33 offset:364
+; GISEL64-NEXT: scratch_load_b32 v147, off, s33 offset:368
+; GISEL64-NEXT: scratch_load_b32 v148, off, s33 offset:372
+; GISEL64-NEXT: scratch_load_b32 v149, off, s33 offset:376
+; GISEL64-NEXT: scratch_load_b32 v150, off, s33 offset:380
+; GISEL64-NEXT: scratch_load_b32 v151, off, s33 offset:384
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_load_b32 v160, off, s33 offset:388
+; GISEL64-NEXT: scratch_load_b32 v161, off, s33 offset:392
+; GISEL64-NEXT: scratch_load_b32 v162, off, s33 offset:396
+; GISEL64-NEXT: scratch_load_b32 v163, off, s33 offset:400
+; GISEL64-NEXT: scratch_load_b32 v164, off, s33 offset:404
+; GISEL64-NEXT: scratch_load_b32 v165, off, s33 offset:408
+; GISEL64-NEXT: scratch_load_b32 v166, off, s33 offset:412
+; GISEL64-NEXT: scratch_load_b32 v167, off, s33 offset:416
+; GISEL64-NEXT: scratch_load_b32 v176, off, s33 offset:420
+; GISEL64-NEXT: scratch_load_b32 v177, off, s33 offset:424
+; GISEL64-NEXT: scratch_load_b32 v178, off, s33 offset:428
+; GISEL64-NEXT: scratch_load_b32 v179, off, s33 offset:432
+; GISEL64-NEXT: scratch_load_b32 v180, off, s33 offset:436
+; GISEL64-NEXT: scratch_load_b32 v181, off, s33 offset:440
+; GISEL64-NEXT: scratch_load_b32 v182, off, s33 offset:444
+; GISEL64-NEXT: scratch_load_b32 v183, off, s33 offset:448
+; GISEL64-NEXT: scratch_load_b32 v192, off, s33 offset:452
+; GISEL64-NEXT: scratch_load_b32 v193, off, s33 offset:456
+; GISEL64-NEXT: scratch_load_b32 v194, off, s33 offset:460
+; GISEL64-NEXT: scratch_load_b32 v195, off, s33 offset:464
+; GISEL64-NEXT: scratch_load_b32 v196, off, s33 offset:468
+; GISEL64-NEXT: scratch_load_b32 v197, off, s33 offset:472
+; GISEL64-NEXT: scratch_load_b32 v198, off, s33 offset:476
+; GISEL64-NEXT: scratch_load_b32 v199, off, s33 offset:480
+; GISEL64-NEXT: scratch_load_b32 v208, off, s33 offset:484
+; GISEL64-NEXT: scratch_load_b32 v209, off, s33 offset:488
+; GISEL64-NEXT: scratch_load_b32 v210, off, s33 offset:492
+; GISEL64-NEXT: scratch_load_b32 v211, off, s33 offset:496
+; GISEL64-NEXT: scratch_load_b32 v212, off, s33 offset:500
+; GISEL64-NEXT: scratch_load_b32 v213, off, s33 offset:504
+; GISEL64-NEXT: scratch_load_b32 v214, off, s33 offset:508
+; GISEL64-NEXT: scratch_load_b32 v215, off, s33 offset:512
+; GISEL64-NEXT: s_clause 0xf
+; GISEL64-NEXT: scratch_load_b32 v224, off, s33 offset:516
+; GISEL64-NEXT: scratch_load_b32 v225, off, s33 offset:520
+; GISEL64-NEXT: scratch_load_b32 v226, off, s33 offset:524
+; GISEL64-NEXT: scratch_load_b32 v227, off, s33 offset:528
+; GISEL64-NEXT: scratch_load_b32 v228, off, s33 offset:532
+; GISEL64-NEXT: scratch_load_b32 v229, off, s33 offset:536
+; GISEL64-NEXT: scratch_load_b32 v230, off, s33 offset:540
+; GISEL64-NEXT: scratch_load_b32 v231, off, s33 offset:544
+; GISEL64-NEXT: scratch_load_b32 v240, off, s33 offset:548
+; GISEL64-NEXT: scratch_load_b32 v241, off, s33 offset:552
+; GISEL64-NEXT: scratch_load_b32 v242, off, s33 offset:556
+; GISEL64-NEXT: scratch_load_b32 v243, off, s33 offset:560
+; GISEL64-NEXT: scratch_load_b32 v244, off, s33 offset:564
+; GISEL64-NEXT: scratch_load_b32 v245, off, s33 offset:568
+; GISEL64-NEXT: scratch_load_b32 v246, off, s33 offset:572
+; GISEL64-NEXT: scratch_load_b32 v247, off, s33 offset:576
+; GISEL64-NEXT: s_mov_b64 exec, s[4:5]
+; GISEL64-NEXT: s_mov_b32 s33, s0
+; GISEL64-NEXT: s_wait_loadcnt 0x0
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ %ret = call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent
+ ret <2 x half> %ret
+}
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
index b514c49394d21..278cf0150c2f7 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
@@ -46,6 +46,7 @@
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define amdgpu_kernel void @long_branch_used_all_sgprs(ptr addrspace(1) %arg, i32 %cnd) #0 {
entry:
@@ -315,6 +316,7 @@
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define amdgpu_kernel void @long_branch_high_num_sgprs_used(ptr addrspace(1) %arg, i32 %cnd) #0 {
entry:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
index fc730f9e88454..890ea44081ce7 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
@@ -46,6 +46,7 @@
; AFTER-PEI-NEXT: hasInitWholeWave: false
; AFTER-PEI-NEXT: dynamicVGPRBlockSize: 0
; AFTER-PEI-NEXT: scratchReservedForDynamicVGPRs: 0
+; AFTER-PEI-NEXT: isWholeWaveFunction: false
; AFTER-PEI-NEXT: body:
define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 {
%wide.sgpr0 = call <32 x i32> asm sideeffect "; def $0", "=s" () #0
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
index 5adef1433079d..f84ef8a3844dd 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@@ -46,6 +46,7 @@
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define amdgpu_kernel void @uniform_long_forward_branch_debug(ptr addrspace(1) %arg, i32 %arg1) #0 !dbg !5 {
bb0:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
index fa40164aa02f0..cc834d017c149 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
@@ -46,6 +46,7 @@
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) #0 {
bb0:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
index 24565e4423d04..06c580ec6f6b4 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
@@ -55,6 +55,7 @@
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
+# FULL-NEXT: isWholeWaveFunction: false
# FULL-NEXT: body:
# SIMPLE: machineFunctionInfo:
@@ -162,6 +163,7 @@ body: |
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
+# FULL-NEXT: isWholeWaveFunction: false
# FULL-NEXT: body:
# SIMPLE: machineFunctionInfo:
@@ -240,6 +242,7 @@ body: |
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
+# FULL-NEXT: isWholeWaveFunction: false
# FULL-NEXT: body:
# SIMPLE: machineFunctionInfo:
@@ -319,6 +322,7 @@ body: |
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
+# FULL-NEXT: isWholeWaveFunction: false
# FULL-NEXT: body:
# SIMPLE: machineFunctionInfo:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
index a15271382f37d..427154651a381 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -56,6 +56,7 @@
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
%gep = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %arg0
@@ -105,6 +106,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) {
%gep = getelementptr inbounds [128 x i32], ptr addrspace(2) @gds, i32 0, i32 %arg0
@@ -178,6 +180,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define void @function() {
ret void
@@ -233,6 +236,7 @@ define void @function() {
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define void @function_nsz() #0 {
ret void
diff --git a/llvm/test/Verifier/amdgpu-cc.ll b/llvm/test/Verifier/amdgpu-cc.ll
index aec09771d2e4f..e86825e088753 100644
--- a/llvm/test/Verifier/amdgpu-cc.ll
+++ b/llvm/test/Verifier/amdgpu-cc.ll
@@ -217,3 +217,36 @@ define amdgpu_cs_chain_preserve void @preallocated_cc_amdgpu_cs_chain_preserve(p
define amdgpu_cs_chain_preserve void @inalloca_cc_amdgpu_cs_chain_preserve(ptr inalloca(i32) %ptr) {
ret void
}
+
+; CHECK: Calling convention requires first argument to be i1
+; CHECK-NEXT: ptr @whole_wave_no_args
+define amdgpu_gfx_whole_wave void @whole_wave_no_args() {
+ ret void
+}
+
+; CHECK: Calling convention requires first argument to be i1
+; CHECK-NEXT: ptr @whole_wave_must_have_i1_active
+define amdgpu_gfx_whole_wave void @whole_wave_must_have_i1_active(i32 %x) {
+ ret void
+}
+
+; CHECK: Calling convention requires first argument to not be inreg
+; CHECK-NEXT: ptr @whole_wave_i1_active_inreg
+define amdgpu_gfx_whole_wave void @whole_wave_i1_active_inreg(i1 inreg %active) {
+ ret void
+}
+
+; CHECK: Calling convention does not support varargs
+; CHECK-NEXT: ptr @whole_wave_varargs
+define amdgpu_gfx_whole_wave void @whole_wave_varargs(i1 %active, i32 %x, ...) {
+ ret void
+}
+
+declare amdgpu_gfx_whole_wave void @whole_wave_callee(i1 %active)
+
+; CHECK: calling convention does not permit calls
+; CHECK-NEXT: call amdgpu_gfx_whole_wave void @whole_wave_callee(i1 true)
+define amdgpu_cs void @cant_call_whole_wave_func() {
+ call amdgpu_gfx_whole_wave void @whole_wave_callee(i1 true)
+ ret void
+}
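For contrast with the negative cases exercised above, a minimal definition that satisfies these verifier rules would look like the following. This is a hypothetical sketch, not part of the patch; the function name and body are illustrative only. The first argument is a plain, non-inreg i1 and the signature is not variadic, so none of the checks above fire:

; Hypothetical example of a well-formed whole wave function: the i1
; active-mask argument comes first, is not marked inreg, and there are
; no varargs. Extra arguments and a return value are permitted.
define amdgpu_gfx_whole_wave i32 @valid_whole_wave(i1 %active, i32 %x) {
  ret i32 %x
}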