[llvm-branch-commits] [llvm] [AMDGPU] Intrinsic for launching whole wave functions (PR #145859)
Diana Picus via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Jun 27 04:59:19 PDT 2025
https://github.com/rovka updated https://github.com/llvm/llvm-project/pull/145859
>From d01ebb0edc697e903aedca36bb0f8bbc55fc8c05 Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Fri, 24 Jan 2025 10:17:42 +0100
Subject: [PATCH 1/3] [AMDGPU] Intrinsic for launching whole wave functions
Add the llvm.amdgcn.call.whole.wave intrinsic for calling whole wave
functions. This will take as its first argument the callee with the
amdgpu_gfx_whole_wave calling convention, followed by the call
parameters which must match the signature of the callee except for the
first function argument (the i1 original EXEC mask, which doesn't need
to be passed in). Indirect calls are not allowed.
Make direct calls to amdgpu_gfx_whole_wave functions a verifier error.
Unspeakable horrors happen around calls from whole wave functions; the
plan is to improve the handling of caller/callee-saved registers in
a future patch.
Tail calls will also be handled in a future patch.
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 12 +
llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 1 +
.../SelectionDAG/SelectionDAGBuilder.cpp | 37 +
llvm/lib/IR/Verifier.cpp | 30 +
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 19 +-
.../CodeGen/AMDGPU/amdgcn-call-whole-wave.ll | 174 ++
.../irtranslator-whole-wave-functions.ll | 26 +
.../AMDGPU/isel-whole-wave-functions.ll | 76 +
.../CodeGen/AMDGPU/whole-wave-functions.ll | 1424 +++++++++++++++++
.../intrinsic-amdgcn-call-whole-wave.ll | 53 +
10 files changed, 1849 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
create mode 100644 llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-call-whole-wave.ll
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 6f974c97361de..aae210fa60875 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2580,6 +2580,18 @@ def int_amdgcn_cs_chain:
],
[IntrConvergent, IntrNoReturn, ImmArg<ArgIndex<4>>]>;
+// Run a function with all the lanes enabled. Only direct calls are allowed. The
+// first argument is the callee, which must have the `amdgpu_gfx_whole_wave`
+// calling convention and must not be variadic. The remaining arguments to the
+// callee are taken from the arguments passed to the intrinsic. Lanes that are
+// inactive at the point of the call will receive poison. The return value is
+// the return value of the callee for the active lanes and poison for the
+// inactive ones.
+def int_amdgcn_call_whole_wave:
+ Intrinsic<[llvm_any_ty], // The return type of the callee.
+ [llvm_anyptr_ty, // The callee.
+ llvm_vararg_ty], // The arguments to the callee.
+ [IntrConvergent, IntrNoReturn, IntrNoCallback, IntrNoFree]>;
//===----------------------------------------------------------------------===//
// CI+ Intrinsics
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 5d7e07003f10b..159998ebdfaef 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2548,6 +2548,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
getOrCreateVReg(*ConstantInt::getTrue(CI.getType())));
return true;
case Intrinsic::amdgcn_cs_chain:
+ case Intrinsic::amdgcn_call_whole_wave:
return translateCallBase(CI, MIRBuilder);
case Intrinsic::fptrunc_round: {
uint32_t Flags = MachineInstr::copyFlagsFromInstruction(CI);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 04d6fd5f48cc3..2310d511b1df8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7975,6 +7975,43 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
HasTailCall = true;
return;
}
+ case Intrinsic::amdgcn_call_whole_wave: {
+ TargetLowering::ArgListTy Args;
+
+ // The first argument is the callee. Skip it when assembling the call args.
+ TargetLowering::ArgListEntry Arg;
+ for (unsigned Idx = 1; Idx < I.arg_size(); ++Idx) {
+ Arg.Node = getValue(I.getArgOperand(Idx));
+ Arg.Ty = I.getArgOperand(Idx)->getType();
+ Arg.setAttributes(&I, Idx);
+ Args.push_back(Arg);
+ }
+
+ SDValue ConvControlToken;
+ if (auto Bundle = I.getOperandBundle(LLVMContext::OB_convergencectrl)) {
+ auto *Token = Bundle->Inputs[0].get();
+ ConvControlToken = getValue(Token);
+ }
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(getCurSDLoc())
+ .setChain(getRoot())
+ .setCallee(CallingConv::AMDGPU_Gfx_WholeWave, I.getType(),
+ getValue(I.getArgOperand(0)), std::move(Args))
+ .setTailCall(false)
+ .setIsPreallocated(
+ I.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0)
+ .setConvergent(I.isConvergent())
+ .setConvergenceControlToken(ConvControlToken);
+ CLI.CB = &I;
+
+ std::pair<SDValue, SDValue> Result =
+ lowerInvokable(CLI, /*EHPadBB*/ nullptr);
+
+ if (Result.first.getNode())
+ setValue(&I, Result.first);
+ return;
+ }
case Intrinsic::ptrmask: {
SDValue Ptr = getValue(I.getOperand(0));
SDValue Mask = getValue(I.getOperand(1));
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 32ce1880f2fdd..f4ba2ae7e9249 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6535,6 +6535,36 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
"Value for inactive lanes must be a VGPR function argument", &Call);
break;
}
+ case Intrinsic::amdgcn_call_whole_wave: {
+ auto F = dyn_cast<Function>(Call.getArgOperand(0));
+ Check(F, "Indirect whole wave calls are not allowed", &Call);
+
+ CallingConv::ID CC = F->getCallingConv();
+ Check(CC == CallingConv::AMDGPU_Gfx_WholeWave,
+ "Callee must have the amdgpu_gfx_whole_wave calling convention",
+ &Call);
+
+ Check(!F->isVarArg(), "Variadic whole wave calls are not allowed", &Call);
+
+ Check(Call.arg_size() == F->arg_size(),
+ "Call argument count must match callee argument count", &Call);
+
+ // The first argument of the call is the callee, and the first argument of
+ // the callee is the active mask. The rest of the arguments must match.
+ Check(F->arg_begin()->getType()->isIntegerTy(1),
+ "Callee must have i1 as its first argument", &Call);
+ for (auto [CallArg, FuncArg] :
+ drop_begin(zip_equal(Call.args(), F->args()))) {
+ Check(CallArg->getType() == FuncArg.getType(),
+ "Argument types must match", &Call);
+
+ // Check that inreg attributes match between call site and function
+ Check(Call.paramHasAttr(FuncArg.getArgNo(), Attribute::InReg) ==
+ FuncArg.hasInRegAttr(),
+ "Argument inreg attributes must match", &Call);
+ }
+ break;
+ }
case Intrinsic::amdgcn_s_prefetch_data: {
Check(
AMDGPU::isFlatGlobalAddrSpace(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index b4ea3c81b3b6e..a704a76502b6d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -1465,9 +1465,22 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const {
if (Function *F = Info.CB->getCalledFunction())
if (F->isIntrinsic()) {
- assert(F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain &&
- "Unexpected intrinsic");
- return lowerChainCall(MIRBuilder, Info);
+ switch (F->getIntrinsicID()) {
+ case Intrinsic::amdgcn_cs_chain:
+ return lowerChainCall(MIRBuilder, Info);
+ case Intrinsic::amdgcn_call_whole_wave:
+ Info.CallConv = CallingConv::AMDGPU_Gfx_WholeWave;
+
+ // Get the callee from the original instruction, so it doesn't look like
+ // this is an indirect call.
+ Info.Callee = MachineOperand::CreateGA(
+ static_cast<GlobalValue *>(Info.CB->getOperand(0)), /*Offset=*/0);
+ Info.OrigArgs.erase(Info.OrigArgs.begin());
+ Info.IsVarArg = false;
+ break;
+ default:
+ llvm_unreachable("Unexpected intrinsic call");
+ }
}
if (Info.IsVarArg) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
new file mode 100644
index 0000000000000..eac0767c88d80
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
@@ -0,0 +1,174 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=DAGISEL
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GISEL
+
+declare amdgpu_gfx_whole_wave i32 @good_callee(i1 %active, i32 %x, i32 %y, i32 inreg %c)
+
+define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr) {
+; DAGISEL-LABEL: basic_test:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_mov_b32 s0, s33
+; DAGISEL-NEXT: s_mov_b32 s33, s32
+; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
+; DAGISEL-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
+; DAGISEL-NEXT: v_writelane_b32 v42, s0, 2
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 offset:4
+; DAGISEL-NEXT: scratch_store_b32 off, v41, s33
+; DAGISEL-NEXT: v_dual_mov_b32 v41, v2 :: v_dual_mov_b32 v40, v1
+; DAGISEL-NEXT: v_add_nc_u32_e32 v1, 13, v0
+; DAGISEL-NEXT: v_writelane_b32 v42, s30, 0
+; DAGISEL-NEXT: s_mov_b32 s1, good_callee@abs32@hi
+; DAGISEL-NEXT: s_mov_b32 s0, good_callee@abs32@lo
+; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16
+; DAGISEL-NEXT: v_writelane_b32 v42, s31, 1
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL-NEXT: global_store_b32 v[40:41], v0, off
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_load_b32 v41, off, s33
+; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
+; DAGISEL-NEXT: v_readlane_b32 s31, v42, 1
+; DAGISEL-NEXT: v_readlane_b32 s30, v42, 0
+; DAGISEL-NEXT: s_mov_b32 s32, s33
+; DAGISEL-NEXT: v_readlane_b32 s0, v42, 2
+; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
+; DAGISEL-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
+; DAGISEL-NEXT: s_mov_b32 s33, s0
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: basic_test:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_mov_b32 s0, s33
+; GISEL-NEXT: s_mov_b32 s33, s32
+; GISEL-NEXT: s_or_saveexec_b32 s1, -1
+; GISEL-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_mov_b32 exec_lo, s1
+; GISEL-NEXT: v_writelane_b32 v42, s0, 2
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_store_b32 off, v40, s33 offset:4
+; GISEL-NEXT: scratch_store_b32 off, v41, s33
+; GISEL-NEXT: v_dual_mov_b32 v40, v1 :: v_dual_mov_b32 v41, v2
+; GISEL-NEXT: v_add_nc_u32_e32 v1, 13, v0
+; GISEL-NEXT: v_writelane_b32 v42, s30, 0
+; GISEL-NEXT: s_mov_b32 s0, good_callee@abs32@lo
+; GISEL-NEXT: s_mov_b32 s1, good_callee@abs32@hi
+; GISEL-NEXT: s_add_co_i32 s32, s32, 16
+; GISEL-NEXT: v_writelane_b32 v42, s31, 1
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL-NEXT: global_store_b32 v[40:41], v0, off
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_load_b32 v41, off, s33
+; GISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
+; GISEL-NEXT: v_readlane_b32 s31, v42, 1
+; GISEL-NEXT: v_readlane_b32 s30, v42, 0
+; GISEL-NEXT: s_mov_b32 s32, s33
+; GISEL-NEXT: v_readlane_b32 s0, v42, 2
+; GISEL-NEXT: s_or_saveexec_b32 s1, -1
+; GISEL-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_mov_b32 exec_lo, s1
+; GISEL-NEXT: s_mov_b32 s33, s0
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %y = add i32 %x, 13
+ %ret = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @good_callee, i32 %x, i32 %y, i32 inreg %c)
+ store i32 %ret, ptr addrspace(1) %ptr
+ ret void
+}
+
+declare amdgpu_gfx_whole_wave void @void_callee(i1 %active, i32 %x)
+
+define amdgpu_gfx void @ret_void(i32 %x) {
+; DAGISEL-LABEL: ret_void:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_mov_b32 s0, s33
+; DAGISEL-NEXT: s_mov_b32 s33, s32
+; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
+; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
+; DAGISEL-NEXT: v_writelane_b32 v40, s0, 2
+; DAGISEL-NEXT: s_mov_b32 s1, void_callee@abs32@hi
+; DAGISEL-NEXT: s_mov_b32 s0, void_callee@abs32@lo
+; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16
+; DAGISEL-NEXT: v_writelane_b32 v40, s30, 0
+; DAGISEL-NEXT: v_writelane_b32 v40, s31, 1
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT: v_readlane_b32 s31, v40, 1
+; DAGISEL-NEXT: v_readlane_b32 s30, v40, 0
+; DAGISEL-NEXT: s_mov_b32 s32, s33
+; DAGISEL-NEXT: v_readlane_b32 s0, v40, 2
+; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
+; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
+; DAGISEL-NEXT: s_mov_b32 s33, s0
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: ret_void:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_mov_b32 s0, s33
+; GISEL-NEXT: s_mov_b32 s33, s32
+; GISEL-NEXT: s_or_saveexec_b32 s1, -1
+; GISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_mov_b32 exec_lo, s1
+; GISEL-NEXT: v_writelane_b32 v40, s0, 2
+; GISEL-NEXT: s_mov_b32 s0, void_callee@abs32@lo
+; GISEL-NEXT: s_mov_b32 s1, void_callee@abs32@hi
+; GISEL-NEXT: s_add_co_i32 s32, s32, 16
+; GISEL-NEXT: v_writelane_b32 v40, s30, 0
+; GISEL-NEXT: v_writelane_b32 v40, s31, 1
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_readlane_b32 s31, v40, 1
+; GISEL-NEXT: v_readlane_b32 s30, v40, 0
+; GISEL-NEXT: s_mov_b32 s32, s33
+; GISEL-NEXT: v_readlane_b32 s0, v40, 2
+; GISEL-NEXT: s_or_saveexec_b32 s1, -1
+; GISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_mov_b32 exec_lo, s1
+; GISEL-NEXT: s_mov_b32 s33, s0
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ call void(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @void_callee, i32 %x)
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll
index b68786b579dd2..962628257bc0f 100644
--- a/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll
@@ -101,3 +101,29 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
%ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false)
ret i64 %ret
}
+
+declare amdgpu_gfx_whole_wave i32 @callee(i1 %active, i32 %x)
+
+; Make sure we don't pass the first argument (i1).
+define amdgpu_cs void @call(i32 %x, ptr %p) {
+ ; CHECK-LABEL: name: call
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+ ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+ ; CHECK-NEXT: [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
+ ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32)
+ ; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV1]](p0), @callee, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+ ; CHECK-NEXT: G_STORE [[COPY3]](s32), [[MV]](p0) :: (store (s32) into %ir.p)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %ret = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @callee, i32 %x) convergent
+ store i32 %ret, ptr %p
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll
index 0bd87f493f1ac..4030fbcca63fe 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll
@@ -188,3 +188,79 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
ret i64 %ret
}
+declare amdgpu_gfx_whole_wave i32 @callee(i1 %active, <8 x i32> %x)
+
+; Make sure we don't pass the first argument (i1).
+define amdgpu_cs void @call(<8 x i32> %x, ptr %p) {
+ ; DAGISEL-LABEL: name: call
+ ; DAGISEL: bb.0 (%ir-block.0):
+ ; DAGISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+ ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr7
+ ; DAGISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+ ; DAGISEL-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; DAGISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; DAGISEL-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; DAGISEL-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; DAGISEL-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; DAGISEL-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DAGISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; DAGISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+ ; DAGISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+ ; DAGISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
+ ; DAGISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; DAGISEL-NEXT: $vgpr0 = COPY [[COPY9]]
+ ; DAGISEL-NEXT: $vgpr1 = COPY [[COPY8]]
+ ; DAGISEL-NEXT: $vgpr2 = COPY [[COPY7]]
+ ; DAGISEL-NEXT: $vgpr3 = COPY [[COPY6]]
+ ; DAGISEL-NEXT: $vgpr4 = COPY [[COPY5]]
+ ; DAGISEL-NEXT: $vgpr5 = COPY [[COPY4]]
+ ; DAGISEL-NEXT: $vgpr6 = COPY [[COPY3]]
+ ; DAGISEL-NEXT: $vgpr7 = COPY [[COPY2]]
+ ; DAGISEL-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], @callee, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit-def $vgpr0
+ ; DAGISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; DAGISEL-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; DAGISEL-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+ ; DAGISEL-NEXT: FLAT_STORE_DWORD killed [[COPY11]], [[COPY10]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.p)
+ ; DAGISEL-NEXT: S_ENDPGM 0
+ ;
+ ; GISEL-LABEL: name: call
+ ; GISEL: bb.1 (%ir-block.0):
+ ; GISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GISEL-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GISEL-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+ ; GISEL-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr7
+ ; GISEL-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; GISEL-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+ ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; GISEL-NEXT: $vgpr0 = COPY [[COPY]]
+ ; GISEL-NEXT: $vgpr1 = COPY [[COPY1]]
+ ; GISEL-NEXT: $vgpr2 = COPY [[COPY2]]
+ ; GISEL-NEXT: $vgpr3 = COPY [[COPY3]]
+ ; GISEL-NEXT: $vgpr4 = COPY [[COPY4]]
+ ; GISEL-NEXT: $vgpr5 = COPY [[COPY5]]
+ ; GISEL-NEXT: $vgpr6 = COPY [[COPY6]]
+ ; GISEL-NEXT: $vgpr7 = COPY [[COPY7]]
+ ; GISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+ ; GISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+ ; GISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GISEL-NEXT: $sgpr30_sgpr31 = SI_CALL [[REG_SEQUENCE1]], @callee, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit-def $vgpr0
+ ; GISEL-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; GISEL-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY10]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.p)
+ ; GISEL-NEXT: S_ENDPGM 0
+ %ret = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @callee, <8 x i32> %x) convergent
+ store i32 %ret, ptr %p
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
index 53d02925fb1c2..3c8478c5a885b 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
@@ -2412,3 +2412,1427 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
%ret = call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent
ret <2 x half> %ret
}
+
+declare amdgpu_gfx_whole_wave float @callee(i1 %active, <8 x float> %x)
+
+define amdgpu_cs void @call_from_entry(<8 x float> %x, ptr %p) {
+; DAGISEL-LABEL: call_from_entry:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_mov_b32 s1, callee@abs32@hi
+; DAGISEL-NEXT: s_mov_b32 s0, callee@abs32@lo
+; DAGISEL-NEXT: s_mov_b32 s32, 0
+; DAGISEL-NEXT: v_dual_mov_b32 v41, v9 :: v_dual_mov_b32 v40, v8
+; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL-NEXT: flat_store_b32 v[40:41], v0
+; DAGISEL-NEXT: s_endpgm
+;
+; GISEL-LABEL: call_from_entry:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo
+; GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: v_dual_mov_b32 v40, v8 :: v_dual_mov_b32 v41, v9
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL-NEXT: flat_store_b32 v[40:41], v0
+; GISEL-NEXT: s_endpgm
+;
+; DAGISEL64-LABEL: call_from_entry:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_mov_b32 s1, callee@abs32@hi
+; DAGISEL64-NEXT: s_mov_b32 s0, callee@abs32@lo
+; DAGISEL64-NEXT: s_mov_b32 s32, 0
+; DAGISEL64-NEXT: v_mov_b32_e32 v41, v9
+; DAGISEL64-NEXT: v_mov_b32_e32 v40, v8
+; DAGISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL64-NEXT: flat_store_b32 v[40:41], v0
+; DAGISEL64-NEXT: s_endpgm
+;
+; GISEL64-LABEL: call_from_entry:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_mov_b32 s0, callee@abs32@lo
+; GISEL64-NEXT: s_mov_b32 s1, callee@abs32@hi
+; GISEL64-NEXT: s_mov_b32 s32, 0
+; GISEL64-NEXT: v_mov_b32_e32 v40, v8
+; GISEL64-NEXT: v_mov_b32_e32 v41, v9
+; GISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL64-NEXT: flat_store_b32 v[40:41], v0
+; GISEL64-NEXT: s_endpgm
+ %ret = call float(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @callee, <8 x float> %x) convergent
+ store float %ret, ptr %p
+ ret void
+}
+
+define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> %x, ptr %p) {
+; DAGISEL-LABEL: call_from_whole_wave:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_mov_b32 s0, s33
+; DAGISEL-NEXT: s_mov_b32 s33, s32
+; DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s33 offset:4
+; DAGISEL-NEXT: scratch_store_b32 off, v1, s33 offset:8
+; DAGISEL-NEXT: scratch_store_b32 off, v2, s33 offset:12
+; DAGISEL-NEXT: scratch_store_b32 off, v3, s33 offset:16
+; DAGISEL-NEXT: scratch_store_b32 off, v4, s33 offset:20
+; DAGISEL-NEXT: scratch_store_b32 off, v5, s33 offset:24
+; DAGISEL-NEXT: scratch_store_b32 off, v6, s33 offset:28
+; DAGISEL-NEXT: scratch_store_b32 off, v7, s33 offset:32
+; DAGISEL-NEXT: scratch_store_b32 off, v8, s33 offset:36
+; DAGISEL-NEXT: scratch_store_b32 off, v9, s33 offset:40
+; DAGISEL-NEXT: scratch_store_b32 off, v10, s33 offset:44
+; DAGISEL-NEXT: scratch_store_b32 off, v11, s33 offset:48
+; DAGISEL-NEXT: scratch_store_b32 off, v12, s33 offset:52
+; DAGISEL-NEXT: scratch_store_b32 off, v13, s33 offset:56
+; DAGISEL-NEXT: scratch_store_b32 off, v14, s33 offset:60
+; DAGISEL-NEXT: scratch_store_b32 off, v15, s33 offset:64
+; DAGISEL-NEXT: scratch_store_b32 off, v16, s33 offset:68
+; DAGISEL-NEXT: scratch_store_b32 off, v17, s33 offset:72
+; DAGISEL-NEXT: scratch_store_b32 off, v18, s33 offset:76
+; DAGISEL-NEXT: scratch_store_b32 off, v19, s33 offset:80
+; DAGISEL-NEXT: scratch_store_b32 off, v20, s33 offset:84
+; DAGISEL-NEXT: scratch_store_b32 off, v21, s33 offset:88
+; DAGISEL-NEXT: scratch_store_b32 off, v22, s33 offset:92
+; DAGISEL-NEXT: scratch_store_b32 off, v23, s33 offset:96
+; DAGISEL-NEXT: scratch_store_b32 off, v24, s33 offset:100
+; DAGISEL-NEXT: scratch_store_b32 off, v25, s33 offset:104
+; DAGISEL-NEXT: scratch_store_b32 off, v26, s33 offset:108
+; DAGISEL-NEXT: scratch_store_b32 off, v27, s33 offset:112
+; DAGISEL-NEXT: scratch_store_b32 off, v28, s33 offset:116
+; DAGISEL-NEXT: scratch_store_b32 off, v29, s33 offset:120
+; DAGISEL-NEXT: scratch_store_b32 off, v30, s33 offset:124
+; DAGISEL-NEXT: scratch_store_b32 off, v31, s33 offset:128
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_store_b32 off, v32, s33 offset:132
+; DAGISEL-NEXT: scratch_store_b32 off, v33, s33 offset:136
+; DAGISEL-NEXT: scratch_store_b32 off, v34, s33 offset:140
+; DAGISEL-NEXT: scratch_store_b32 off, v35, s33 offset:144
+; DAGISEL-NEXT: scratch_store_b32 off, v36, s33 offset:148
+; DAGISEL-NEXT: scratch_store_b32 off, v37, s33 offset:152
+; DAGISEL-NEXT: scratch_store_b32 off, v38, s33 offset:156
+; DAGISEL-NEXT: scratch_store_b32 off, v39, s33 offset:160
+; DAGISEL-NEXT: scratch_store_b32 off, v48, s33 offset:172
+; DAGISEL-NEXT: scratch_store_b32 off, v49, s33 offset:176
+; DAGISEL-NEXT: scratch_store_b32 off, v50, s33 offset:180
+; DAGISEL-NEXT: scratch_store_b32 off, v51, s33 offset:184
+; DAGISEL-NEXT: scratch_store_b32 off, v52, s33 offset:188
+; DAGISEL-NEXT: scratch_store_b32 off, v53, s33 offset:192
+; DAGISEL-NEXT: scratch_store_b32 off, v54, s33 offset:196
+; DAGISEL-NEXT: scratch_store_b32 off, v55, s33 offset:200
+; DAGISEL-NEXT: scratch_store_b32 off, v64, s33 offset:204
+; DAGISEL-NEXT: scratch_store_b32 off, v65, s33 offset:208
+; DAGISEL-NEXT: scratch_store_b32 off, v66, s33 offset:212
+; DAGISEL-NEXT: scratch_store_b32 off, v67, s33 offset:216
+; DAGISEL-NEXT: scratch_store_b32 off, v68, s33 offset:220
+; DAGISEL-NEXT: scratch_store_b32 off, v69, s33 offset:224
+; DAGISEL-NEXT: scratch_store_b32 off, v70, s33 offset:228
+; DAGISEL-NEXT: scratch_store_b32 off, v71, s33 offset:232
+; DAGISEL-NEXT: scratch_store_b32 off, v80, s33 offset:236
+; DAGISEL-NEXT: scratch_store_b32 off, v81, s33 offset:240
+; DAGISEL-NEXT: scratch_store_b32 off, v82, s33 offset:244
+; DAGISEL-NEXT: scratch_store_b32 off, v83, s33 offset:248
+; DAGISEL-NEXT: scratch_store_b32 off, v84, s33 offset:252
+; DAGISEL-NEXT: scratch_store_b32 off, v85, s33 offset:256
+; DAGISEL-NEXT: scratch_store_b32 off, v86, s33 offset:260
+; DAGISEL-NEXT: scratch_store_b32 off, v87, s33 offset:264
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_store_b32 off, v96, s33 offset:268
+; DAGISEL-NEXT: scratch_store_b32 off, v97, s33 offset:272
+; DAGISEL-NEXT: scratch_store_b32 off, v98, s33 offset:276
+; DAGISEL-NEXT: scratch_store_b32 off, v99, s33 offset:280
+; DAGISEL-NEXT: scratch_store_b32 off, v100, s33 offset:284
+; DAGISEL-NEXT: scratch_store_b32 off, v101, s33 offset:288
+; DAGISEL-NEXT: scratch_store_b32 off, v102, s33 offset:292
+; DAGISEL-NEXT: scratch_store_b32 off, v103, s33 offset:296
+; DAGISEL-NEXT: scratch_store_b32 off, v112, s33 offset:300
+; DAGISEL-NEXT: scratch_store_b32 off, v113, s33 offset:304
+; DAGISEL-NEXT: scratch_store_b32 off, v114, s33 offset:308
+; DAGISEL-NEXT: scratch_store_b32 off, v115, s33 offset:312
+; DAGISEL-NEXT: scratch_store_b32 off, v116, s33 offset:316
+; DAGISEL-NEXT: scratch_store_b32 off, v117, s33 offset:320
+; DAGISEL-NEXT: scratch_store_b32 off, v118, s33 offset:324
+; DAGISEL-NEXT: scratch_store_b32 off, v119, s33 offset:328
+; DAGISEL-NEXT: scratch_store_b32 off, v128, s33 offset:332
+; DAGISEL-NEXT: scratch_store_b32 off, v129, s33 offset:336
+; DAGISEL-NEXT: scratch_store_b32 off, v130, s33 offset:340
+; DAGISEL-NEXT: scratch_store_b32 off, v131, s33 offset:344
+; DAGISEL-NEXT: scratch_store_b32 off, v132, s33 offset:348
+; DAGISEL-NEXT: scratch_store_b32 off, v133, s33 offset:352
+; DAGISEL-NEXT: scratch_store_b32 off, v134, s33 offset:356
+; DAGISEL-NEXT: scratch_store_b32 off, v135, s33 offset:360
+; DAGISEL-NEXT: scratch_store_b32 off, v144, s33 offset:364
+; DAGISEL-NEXT: scratch_store_b32 off, v145, s33 offset:368
+; DAGISEL-NEXT: scratch_store_b32 off, v146, s33 offset:372
+; DAGISEL-NEXT: scratch_store_b32 off, v147, s33 offset:376
+; DAGISEL-NEXT: scratch_store_b32 off, v148, s33 offset:380
+; DAGISEL-NEXT: scratch_store_b32 off, v149, s33 offset:384
+; DAGISEL-NEXT: scratch_store_b32 off, v150, s33 offset:388
+; DAGISEL-NEXT: scratch_store_b32 off, v151, s33 offset:392
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_store_b32 off, v160, s33 offset:396
+; DAGISEL-NEXT: scratch_store_b32 off, v161, s33 offset:400
+; DAGISEL-NEXT: scratch_store_b32 off, v162, s33 offset:404
+; DAGISEL-NEXT: scratch_store_b32 off, v163, s33 offset:408
+; DAGISEL-NEXT: scratch_store_b32 off, v164, s33 offset:412
+; DAGISEL-NEXT: scratch_store_b32 off, v165, s33 offset:416
+; DAGISEL-NEXT: scratch_store_b32 off, v166, s33 offset:420
+; DAGISEL-NEXT: scratch_store_b32 off, v167, s33 offset:424
+; DAGISEL-NEXT: scratch_store_b32 off, v176, s33 offset:428
+; DAGISEL-NEXT: scratch_store_b32 off, v177, s33 offset:432
+; DAGISEL-NEXT: scratch_store_b32 off, v178, s33 offset:436
+; DAGISEL-NEXT: scratch_store_b32 off, v179, s33 offset:440
+; DAGISEL-NEXT: scratch_store_b32 off, v180, s33 offset:444
+; DAGISEL-NEXT: scratch_store_b32 off, v181, s33 offset:448
+; DAGISEL-NEXT: scratch_store_b32 off, v182, s33 offset:452
+; DAGISEL-NEXT: scratch_store_b32 off, v183, s33 offset:456
+; DAGISEL-NEXT: scratch_store_b32 off, v192, s33 offset:460
+; DAGISEL-NEXT: scratch_store_b32 off, v193, s33 offset:464
+; DAGISEL-NEXT: scratch_store_b32 off, v194, s33 offset:468
+; DAGISEL-NEXT: scratch_store_b32 off, v195, s33 offset:472
+; DAGISEL-NEXT: scratch_store_b32 off, v196, s33 offset:476
+; DAGISEL-NEXT: scratch_store_b32 off, v197, s33 offset:480
+; DAGISEL-NEXT: scratch_store_b32 off, v198, s33 offset:484
+; DAGISEL-NEXT: scratch_store_b32 off, v199, s33 offset:488
+; DAGISEL-NEXT: scratch_store_b32 off, v208, s33 offset:492
+; DAGISEL-NEXT: scratch_store_b32 off, v209, s33 offset:496
+; DAGISEL-NEXT: scratch_store_b32 off, v210, s33 offset:500
+; DAGISEL-NEXT: scratch_store_b32 off, v211, s33 offset:504
+; DAGISEL-NEXT: scratch_store_b32 off, v212, s33 offset:508
+; DAGISEL-NEXT: scratch_store_b32 off, v213, s33 offset:512
+; DAGISEL-NEXT: scratch_store_b32 off, v214, s33 offset:516
+; DAGISEL-NEXT: scratch_store_b32 off, v215, s33 offset:520
+; DAGISEL-NEXT: s_clause 0xf
+; DAGISEL-NEXT: scratch_store_b32 off, v224, s33 offset:524
+; DAGISEL-NEXT: scratch_store_b32 off, v225, s33 offset:528
+; DAGISEL-NEXT: scratch_store_b32 off, v226, s33 offset:532
+; DAGISEL-NEXT: scratch_store_b32 off, v227, s33 offset:536
+; DAGISEL-NEXT: scratch_store_b32 off, v228, s33 offset:540
+; DAGISEL-NEXT: scratch_store_b32 off, v229, s33 offset:544
+; DAGISEL-NEXT: scratch_store_b32 off, v230, s33 offset:548
+; DAGISEL-NEXT: scratch_store_b32 off, v231, s33 offset:552
+; DAGISEL-NEXT: scratch_store_b32 off, v240, s33 offset:556
+; DAGISEL-NEXT: scratch_store_b32 off, v241, s33 offset:560
+; DAGISEL-NEXT: scratch_store_b32 off, v242, s33 offset:564
+; DAGISEL-NEXT: scratch_store_b32 off, v243, s33 offset:568
+; DAGISEL-NEXT: scratch_store_b32 off, v244, s33 offset:572
+; DAGISEL-NEXT: scratch_store_b32 off, v245, s33 offset:576
+; DAGISEL-NEXT: scratch_store_b32 off, v246, s33 offset:580
+; DAGISEL-NEXT: scratch_store_b32 off, v247, s33 offset:584
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: s_clause 0x2
+; DAGISEL-NEXT: scratch_store_b32 off, v42, s33
+; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 offset:164
+; DAGISEL-NEXT: scratch_store_b32 off, v41, s33 offset:168
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: v_writelane_b32 v42, s0, 3
+; DAGISEL-NEXT: s_mov_b32 s1, callee@abs32@hi
+; DAGISEL-NEXT: s_mov_b32 s0, callee@abs32@lo
+; DAGISEL-NEXT: s_addk_co_i32 s32, 0x250
+; DAGISEL-NEXT: v_dual_mov_b32 v41, v9 :: v_dual_mov_b32 v40, v8
+; DAGISEL-NEXT: v_writelane_b32 v42, s4, 0
+; DAGISEL-NEXT: v_writelane_b32 v42, s30, 1
+; DAGISEL-NEXT: v_writelane_b32 v42, s31, 2
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL-NEXT: flat_store_b32 v[40:41], v0
+; DAGISEL-NEXT: v_readlane_b32 s31, v42, 2
+; DAGISEL-NEXT: v_readlane_b32 s30, v42, 1
+; DAGISEL-NEXT: v_readlane_b32 s4, v42, 0
+; DAGISEL-NEXT: v_readlane_b32 s0, v42, 3
+; DAGISEL-NEXT: s_clause 0x2
+; DAGISEL-NEXT: scratch_load_b32 v42, off, s33
+; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 offset:164
+; DAGISEL-NEXT: scratch_load_b32 v41, off, s33 offset:168
+; DAGISEL-NEXT: s_mov_b32 s32, s33
+; DAGISEL-NEXT: s_xor_b32 exec_lo, s4, -1
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s33 offset:4
+; DAGISEL-NEXT: scratch_load_b32 v1, off, s33 offset:8
+; DAGISEL-NEXT: scratch_load_b32 v2, off, s33 offset:12
+; DAGISEL-NEXT: scratch_load_b32 v3, off, s33 offset:16
+; DAGISEL-NEXT: scratch_load_b32 v4, off, s33 offset:20
+; DAGISEL-NEXT: scratch_load_b32 v5, off, s33 offset:24
+; DAGISEL-NEXT: scratch_load_b32 v6, off, s33 offset:28
+; DAGISEL-NEXT: scratch_load_b32 v7, off, s33 offset:32
+; DAGISEL-NEXT: scratch_load_b32 v8, off, s33 offset:36
+; DAGISEL-NEXT: scratch_load_b32 v9, off, s33 offset:40
+; DAGISEL-NEXT: scratch_load_b32 v10, off, s33 offset:44
+; DAGISEL-NEXT: scratch_load_b32 v11, off, s33 offset:48
+; DAGISEL-NEXT: scratch_load_b32 v12, off, s33 offset:52
+; DAGISEL-NEXT: scratch_load_b32 v13, off, s33 offset:56
+; DAGISEL-NEXT: scratch_load_b32 v14, off, s33 offset:60
+; DAGISEL-NEXT: scratch_load_b32 v15, off, s33 offset:64
+; DAGISEL-NEXT: scratch_load_b32 v16, off, s33 offset:68
+; DAGISEL-NEXT: scratch_load_b32 v17, off, s33 offset:72
+; DAGISEL-NEXT: scratch_load_b32 v18, off, s33 offset:76
+; DAGISEL-NEXT: scratch_load_b32 v19, off, s33 offset:80
+; DAGISEL-NEXT: scratch_load_b32 v20, off, s33 offset:84
+; DAGISEL-NEXT: scratch_load_b32 v21, off, s33 offset:88
+; DAGISEL-NEXT: scratch_load_b32 v22, off, s33 offset:92
+; DAGISEL-NEXT: scratch_load_b32 v23, off, s33 offset:96
+; DAGISEL-NEXT: scratch_load_b32 v24, off, s33 offset:100
+; DAGISEL-NEXT: scratch_load_b32 v25, off, s33 offset:104
+; DAGISEL-NEXT: scratch_load_b32 v26, off, s33 offset:108
+; DAGISEL-NEXT: scratch_load_b32 v27, off, s33 offset:112
+; DAGISEL-NEXT: scratch_load_b32 v28, off, s33 offset:116
+; DAGISEL-NEXT: scratch_load_b32 v29, off, s33 offset:120
+; DAGISEL-NEXT: scratch_load_b32 v30, off, s33 offset:124
+; DAGISEL-NEXT: scratch_load_b32 v31, off, s33 offset:128
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_load_b32 v32, off, s33 offset:132
+; DAGISEL-NEXT: scratch_load_b32 v33, off, s33 offset:136
+; DAGISEL-NEXT: scratch_load_b32 v34, off, s33 offset:140
+; DAGISEL-NEXT: scratch_load_b32 v35, off, s33 offset:144
+; DAGISEL-NEXT: scratch_load_b32 v36, off, s33 offset:148
+; DAGISEL-NEXT: scratch_load_b32 v37, off, s33 offset:152
+; DAGISEL-NEXT: scratch_load_b32 v38, off, s33 offset:156
+; DAGISEL-NEXT: scratch_load_b32 v39, off, s33 offset:160
+; DAGISEL-NEXT: scratch_load_b32 v48, off, s33 offset:172
+; DAGISEL-NEXT: scratch_load_b32 v49, off, s33 offset:176
+; DAGISEL-NEXT: scratch_load_b32 v50, off, s33 offset:180
+; DAGISEL-NEXT: scratch_load_b32 v51, off, s33 offset:184
+; DAGISEL-NEXT: scratch_load_b32 v52, off, s33 offset:188
+; DAGISEL-NEXT: scratch_load_b32 v53, off, s33 offset:192
+; DAGISEL-NEXT: scratch_load_b32 v54, off, s33 offset:196
+; DAGISEL-NEXT: scratch_load_b32 v55, off, s33 offset:200
+; DAGISEL-NEXT: scratch_load_b32 v64, off, s33 offset:204
+; DAGISEL-NEXT: scratch_load_b32 v65, off, s33 offset:208
+; DAGISEL-NEXT: scratch_load_b32 v66, off, s33 offset:212
+; DAGISEL-NEXT: scratch_load_b32 v67, off, s33 offset:216
+; DAGISEL-NEXT: scratch_load_b32 v68, off, s33 offset:220
+; DAGISEL-NEXT: scratch_load_b32 v69, off, s33 offset:224
+; DAGISEL-NEXT: scratch_load_b32 v70, off, s33 offset:228
+; DAGISEL-NEXT: scratch_load_b32 v71, off, s33 offset:232
+; DAGISEL-NEXT: scratch_load_b32 v80, off, s33 offset:236
+; DAGISEL-NEXT: scratch_load_b32 v81, off, s33 offset:240
+; DAGISEL-NEXT: scratch_load_b32 v82, off, s33 offset:244
+; DAGISEL-NEXT: scratch_load_b32 v83, off, s33 offset:248
+; DAGISEL-NEXT: scratch_load_b32 v84, off, s33 offset:252
+; DAGISEL-NEXT: scratch_load_b32 v85, off, s33 offset:256
+; DAGISEL-NEXT: scratch_load_b32 v86, off, s33 offset:260
+; DAGISEL-NEXT: scratch_load_b32 v87, off, s33 offset:264
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_load_b32 v96, off, s33 offset:268
+; DAGISEL-NEXT: scratch_load_b32 v97, off, s33 offset:272
+; DAGISEL-NEXT: scratch_load_b32 v98, off, s33 offset:276
+; DAGISEL-NEXT: scratch_load_b32 v99, off, s33 offset:280
+; DAGISEL-NEXT: scratch_load_b32 v100, off, s33 offset:284
+; DAGISEL-NEXT: scratch_load_b32 v101, off, s33 offset:288
+; DAGISEL-NEXT: scratch_load_b32 v102, off, s33 offset:292
+; DAGISEL-NEXT: scratch_load_b32 v103, off, s33 offset:296
+; DAGISEL-NEXT: scratch_load_b32 v112, off, s33 offset:300
+; DAGISEL-NEXT: scratch_load_b32 v113, off, s33 offset:304
+; DAGISEL-NEXT: scratch_load_b32 v114, off, s33 offset:308
+; DAGISEL-NEXT: scratch_load_b32 v115, off, s33 offset:312
+; DAGISEL-NEXT: scratch_load_b32 v116, off, s33 offset:316
+; DAGISEL-NEXT: scratch_load_b32 v117, off, s33 offset:320
+; DAGISEL-NEXT: scratch_load_b32 v118, off, s33 offset:324
+; DAGISEL-NEXT: scratch_load_b32 v119, off, s33 offset:328
+; DAGISEL-NEXT: scratch_load_b32 v128, off, s33 offset:332
+; DAGISEL-NEXT: scratch_load_b32 v129, off, s33 offset:336
+; DAGISEL-NEXT: scratch_load_b32 v130, off, s33 offset:340
+; DAGISEL-NEXT: scratch_load_b32 v131, off, s33 offset:344
+; DAGISEL-NEXT: scratch_load_b32 v132, off, s33 offset:348
+; DAGISEL-NEXT: scratch_load_b32 v133, off, s33 offset:352
+; DAGISEL-NEXT: scratch_load_b32 v134, off, s33 offset:356
+; DAGISEL-NEXT: scratch_load_b32 v135, off, s33 offset:360
+; DAGISEL-NEXT: scratch_load_b32 v144, off, s33 offset:364
+; DAGISEL-NEXT: scratch_load_b32 v145, off, s33 offset:368
+; DAGISEL-NEXT: scratch_load_b32 v146, off, s33 offset:372
+; DAGISEL-NEXT: scratch_load_b32 v147, off, s33 offset:376
+; DAGISEL-NEXT: scratch_load_b32 v148, off, s33 offset:380
+; DAGISEL-NEXT: scratch_load_b32 v149, off, s33 offset:384
+; DAGISEL-NEXT: scratch_load_b32 v150, off, s33 offset:388
+; DAGISEL-NEXT: scratch_load_b32 v151, off, s33 offset:392
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_load_b32 v160, off, s33 offset:396
+; DAGISEL-NEXT: scratch_load_b32 v161, off, s33 offset:400
+; DAGISEL-NEXT: scratch_load_b32 v162, off, s33 offset:404
+; DAGISEL-NEXT: scratch_load_b32 v163, off, s33 offset:408
+; DAGISEL-NEXT: scratch_load_b32 v164, off, s33 offset:412
+; DAGISEL-NEXT: scratch_load_b32 v165, off, s33 offset:416
+; DAGISEL-NEXT: scratch_load_b32 v166, off, s33 offset:420
+; DAGISEL-NEXT: scratch_load_b32 v167, off, s33 offset:424
+; DAGISEL-NEXT: scratch_load_b32 v176, off, s33 offset:428
+; DAGISEL-NEXT: scratch_load_b32 v177, off, s33 offset:432
+; DAGISEL-NEXT: scratch_load_b32 v178, off, s33 offset:436
+; DAGISEL-NEXT: scratch_load_b32 v179, off, s33 offset:440
+; DAGISEL-NEXT: scratch_load_b32 v180, off, s33 offset:444
+; DAGISEL-NEXT: scratch_load_b32 v181, off, s33 offset:448
+; DAGISEL-NEXT: scratch_load_b32 v182, off, s33 offset:452
+; DAGISEL-NEXT: scratch_load_b32 v183, off, s33 offset:456
+; DAGISEL-NEXT: scratch_load_b32 v192, off, s33 offset:460
+; DAGISEL-NEXT: scratch_load_b32 v193, off, s33 offset:464
+; DAGISEL-NEXT: scratch_load_b32 v194, off, s33 offset:468
+; DAGISEL-NEXT: scratch_load_b32 v195, off, s33 offset:472
+; DAGISEL-NEXT: scratch_load_b32 v196, off, s33 offset:476
+; DAGISEL-NEXT: scratch_load_b32 v197, off, s33 offset:480
+; DAGISEL-NEXT: scratch_load_b32 v198, off, s33 offset:484
+; DAGISEL-NEXT: scratch_load_b32 v199, off, s33 offset:488
+; DAGISEL-NEXT: scratch_load_b32 v208, off, s33 offset:492
+; DAGISEL-NEXT: scratch_load_b32 v209, off, s33 offset:496
+; DAGISEL-NEXT: scratch_load_b32 v210, off, s33 offset:500
+; DAGISEL-NEXT: scratch_load_b32 v211, off, s33 offset:504
+; DAGISEL-NEXT: scratch_load_b32 v212, off, s33 offset:508
+; DAGISEL-NEXT: scratch_load_b32 v213, off, s33 offset:512
+; DAGISEL-NEXT: scratch_load_b32 v214, off, s33 offset:516
+; DAGISEL-NEXT: scratch_load_b32 v215, off, s33 offset:520
+; DAGISEL-NEXT: s_clause 0xf
+; DAGISEL-NEXT: scratch_load_b32 v224, off, s33 offset:524
+; DAGISEL-NEXT: scratch_load_b32 v225, off, s33 offset:528
+; DAGISEL-NEXT: scratch_load_b32 v226, off, s33 offset:532
+; DAGISEL-NEXT: scratch_load_b32 v227, off, s33 offset:536
+; DAGISEL-NEXT: scratch_load_b32 v228, off, s33 offset:540
+; DAGISEL-NEXT: scratch_load_b32 v229, off, s33 offset:544
+; DAGISEL-NEXT: scratch_load_b32 v230, off, s33 offset:548
+; DAGISEL-NEXT: scratch_load_b32 v231, off, s33 offset:552
+; DAGISEL-NEXT: scratch_load_b32 v240, off, s33 offset:556
+; DAGISEL-NEXT: scratch_load_b32 v241, off, s33 offset:560
+; DAGISEL-NEXT: scratch_load_b32 v242, off, s33 offset:564
+; DAGISEL-NEXT: scratch_load_b32 v243, off, s33 offset:568
+; DAGISEL-NEXT: scratch_load_b32 v244, off, s33 offset:572
+; DAGISEL-NEXT: scratch_load_b32 v245, off, s33 offset:576
+; DAGISEL-NEXT: scratch_load_b32 v246, off, s33 offset:580
+; DAGISEL-NEXT: scratch_load_b32 v247, off, s33 offset:584
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s4
+; DAGISEL-NEXT: s_mov_b32 s33, s0
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: call_from_whole_wave:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_mov_b32 s0, s33
+; GISEL-NEXT: s_mov_b32 s33, s32
+; GISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_store_b32 off, v0, s33 offset:4
+; GISEL-NEXT: scratch_store_b32 off, v1, s33 offset:8
+; GISEL-NEXT: scratch_store_b32 off, v2, s33 offset:12
+; GISEL-NEXT: scratch_store_b32 off, v3, s33 offset:16
+; GISEL-NEXT: scratch_store_b32 off, v4, s33 offset:20
+; GISEL-NEXT: scratch_store_b32 off, v5, s33 offset:24
+; GISEL-NEXT: scratch_store_b32 off, v6, s33 offset:28
+; GISEL-NEXT: scratch_store_b32 off, v7, s33 offset:32
+; GISEL-NEXT: scratch_store_b32 off, v8, s33 offset:36
+; GISEL-NEXT: scratch_store_b32 off, v9, s33 offset:40
+; GISEL-NEXT: scratch_store_b32 off, v10, s33 offset:44
+; GISEL-NEXT: scratch_store_b32 off, v11, s33 offset:48
+; GISEL-NEXT: scratch_store_b32 off, v12, s33 offset:52
+; GISEL-NEXT: scratch_store_b32 off, v13, s33 offset:56
+; GISEL-NEXT: scratch_store_b32 off, v14, s33 offset:60
+; GISEL-NEXT: scratch_store_b32 off, v15, s33 offset:64
+; GISEL-NEXT: scratch_store_b32 off, v16, s33 offset:68
+; GISEL-NEXT: scratch_store_b32 off, v17, s33 offset:72
+; GISEL-NEXT: scratch_store_b32 off, v18, s33 offset:76
+; GISEL-NEXT: scratch_store_b32 off, v19, s33 offset:80
+; GISEL-NEXT: scratch_store_b32 off, v20, s33 offset:84
+; GISEL-NEXT: scratch_store_b32 off, v21, s33 offset:88
+; GISEL-NEXT: scratch_store_b32 off, v22, s33 offset:92
+; GISEL-NEXT: scratch_store_b32 off, v23, s33 offset:96
+; GISEL-NEXT: scratch_store_b32 off, v24, s33 offset:100
+; GISEL-NEXT: scratch_store_b32 off, v25, s33 offset:104
+; GISEL-NEXT: scratch_store_b32 off, v26, s33 offset:108
+; GISEL-NEXT: scratch_store_b32 off, v27, s33 offset:112
+; GISEL-NEXT: scratch_store_b32 off, v28, s33 offset:116
+; GISEL-NEXT: scratch_store_b32 off, v29, s33 offset:120
+; GISEL-NEXT: scratch_store_b32 off, v30, s33 offset:124
+; GISEL-NEXT: scratch_store_b32 off, v31, s33 offset:128
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_store_b32 off, v32, s33 offset:132
+; GISEL-NEXT: scratch_store_b32 off, v33, s33 offset:136
+; GISEL-NEXT: scratch_store_b32 off, v34, s33 offset:140
+; GISEL-NEXT: scratch_store_b32 off, v35, s33 offset:144
+; GISEL-NEXT: scratch_store_b32 off, v36, s33 offset:148
+; GISEL-NEXT: scratch_store_b32 off, v37, s33 offset:152
+; GISEL-NEXT: scratch_store_b32 off, v38, s33 offset:156
+; GISEL-NEXT: scratch_store_b32 off, v39, s33 offset:160
+; GISEL-NEXT: scratch_store_b32 off, v48, s33 offset:172
+; GISEL-NEXT: scratch_store_b32 off, v49, s33 offset:176
+; GISEL-NEXT: scratch_store_b32 off, v50, s33 offset:180
+; GISEL-NEXT: scratch_store_b32 off, v51, s33 offset:184
+; GISEL-NEXT: scratch_store_b32 off, v52, s33 offset:188
+; GISEL-NEXT: scratch_store_b32 off, v53, s33 offset:192
+; GISEL-NEXT: scratch_store_b32 off, v54, s33 offset:196
+; GISEL-NEXT: scratch_store_b32 off, v55, s33 offset:200
+; GISEL-NEXT: scratch_store_b32 off, v64, s33 offset:204
+; GISEL-NEXT: scratch_store_b32 off, v65, s33 offset:208
+; GISEL-NEXT: scratch_store_b32 off, v66, s33 offset:212
+; GISEL-NEXT: scratch_store_b32 off, v67, s33 offset:216
+; GISEL-NEXT: scratch_store_b32 off, v68, s33 offset:220
+; GISEL-NEXT: scratch_store_b32 off, v69, s33 offset:224
+; GISEL-NEXT: scratch_store_b32 off, v70, s33 offset:228
+; GISEL-NEXT: scratch_store_b32 off, v71, s33 offset:232
+; GISEL-NEXT: scratch_store_b32 off, v80, s33 offset:236
+; GISEL-NEXT: scratch_store_b32 off, v81, s33 offset:240
+; GISEL-NEXT: scratch_store_b32 off, v82, s33 offset:244
+; GISEL-NEXT: scratch_store_b32 off, v83, s33 offset:248
+; GISEL-NEXT: scratch_store_b32 off, v84, s33 offset:252
+; GISEL-NEXT: scratch_store_b32 off, v85, s33 offset:256
+; GISEL-NEXT: scratch_store_b32 off, v86, s33 offset:260
+; GISEL-NEXT: scratch_store_b32 off, v87, s33 offset:264
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_store_b32 off, v96, s33 offset:268
+; GISEL-NEXT: scratch_store_b32 off, v97, s33 offset:272
+; GISEL-NEXT: scratch_store_b32 off, v98, s33 offset:276
+; GISEL-NEXT: scratch_store_b32 off, v99, s33 offset:280
+; GISEL-NEXT: scratch_store_b32 off, v100, s33 offset:284
+; GISEL-NEXT: scratch_store_b32 off, v101, s33 offset:288
+; GISEL-NEXT: scratch_store_b32 off, v102, s33 offset:292
+; GISEL-NEXT: scratch_store_b32 off, v103, s33 offset:296
+; GISEL-NEXT: scratch_store_b32 off, v112, s33 offset:300
+; GISEL-NEXT: scratch_store_b32 off, v113, s33 offset:304
+; GISEL-NEXT: scratch_store_b32 off, v114, s33 offset:308
+; GISEL-NEXT: scratch_store_b32 off, v115, s33 offset:312
+; GISEL-NEXT: scratch_store_b32 off, v116, s33 offset:316
+; GISEL-NEXT: scratch_store_b32 off, v117, s33 offset:320
+; GISEL-NEXT: scratch_store_b32 off, v118, s33 offset:324
+; GISEL-NEXT: scratch_store_b32 off, v119, s33 offset:328
+; GISEL-NEXT: scratch_store_b32 off, v128, s33 offset:332
+; GISEL-NEXT: scratch_store_b32 off, v129, s33 offset:336
+; GISEL-NEXT: scratch_store_b32 off, v130, s33 offset:340
+; GISEL-NEXT: scratch_store_b32 off, v131, s33 offset:344
+; GISEL-NEXT: scratch_store_b32 off, v132, s33 offset:348
+; GISEL-NEXT: scratch_store_b32 off, v133, s33 offset:352
+; GISEL-NEXT: scratch_store_b32 off, v134, s33 offset:356
+; GISEL-NEXT: scratch_store_b32 off, v135, s33 offset:360
+; GISEL-NEXT: scratch_store_b32 off, v144, s33 offset:364
+; GISEL-NEXT: scratch_store_b32 off, v145, s33 offset:368
+; GISEL-NEXT: scratch_store_b32 off, v146, s33 offset:372
+; GISEL-NEXT: scratch_store_b32 off, v147, s33 offset:376
+; GISEL-NEXT: scratch_store_b32 off, v148, s33 offset:380
+; GISEL-NEXT: scratch_store_b32 off, v149, s33 offset:384
+; GISEL-NEXT: scratch_store_b32 off, v150, s33 offset:388
+; GISEL-NEXT: scratch_store_b32 off, v151, s33 offset:392
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_store_b32 off, v160, s33 offset:396
+; GISEL-NEXT: scratch_store_b32 off, v161, s33 offset:400
+; GISEL-NEXT: scratch_store_b32 off, v162, s33 offset:404
+; GISEL-NEXT: scratch_store_b32 off, v163, s33 offset:408
+; GISEL-NEXT: scratch_store_b32 off, v164, s33 offset:412
+; GISEL-NEXT: scratch_store_b32 off, v165, s33 offset:416
+; GISEL-NEXT: scratch_store_b32 off, v166, s33 offset:420
+; GISEL-NEXT: scratch_store_b32 off, v167, s33 offset:424
+; GISEL-NEXT: scratch_store_b32 off, v176, s33 offset:428
+; GISEL-NEXT: scratch_store_b32 off, v177, s33 offset:432
+; GISEL-NEXT: scratch_store_b32 off, v178, s33 offset:436
+; GISEL-NEXT: scratch_store_b32 off, v179, s33 offset:440
+; GISEL-NEXT: scratch_store_b32 off, v180, s33 offset:444
+; GISEL-NEXT: scratch_store_b32 off, v181, s33 offset:448
+; GISEL-NEXT: scratch_store_b32 off, v182, s33 offset:452
+; GISEL-NEXT: scratch_store_b32 off, v183, s33 offset:456
+; GISEL-NEXT: scratch_store_b32 off, v192, s33 offset:460
+; GISEL-NEXT: scratch_store_b32 off, v193, s33 offset:464
+; GISEL-NEXT: scratch_store_b32 off, v194, s33 offset:468
+; GISEL-NEXT: scratch_store_b32 off, v195, s33 offset:472
+; GISEL-NEXT: scratch_store_b32 off, v196, s33 offset:476
+; GISEL-NEXT: scratch_store_b32 off, v197, s33 offset:480
+; GISEL-NEXT: scratch_store_b32 off, v198, s33 offset:484
+; GISEL-NEXT: scratch_store_b32 off, v199, s33 offset:488
+; GISEL-NEXT: scratch_store_b32 off, v208, s33 offset:492
+; GISEL-NEXT: scratch_store_b32 off, v209, s33 offset:496
+; GISEL-NEXT: scratch_store_b32 off, v210, s33 offset:500
+; GISEL-NEXT: scratch_store_b32 off, v211, s33 offset:504
+; GISEL-NEXT: scratch_store_b32 off, v212, s33 offset:508
+; GISEL-NEXT: scratch_store_b32 off, v213, s33 offset:512
+; GISEL-NEXT: scratch_store_b32 off, v214, s33 offset:516
+; GISEL-NEXT: scratch_store_b32 off, v215, s33 offset:520
+; GISEL-NEXT: s_clause 0xf
+; GISEL-NEXT: scratch_store_b32 off, v224, s33 offset:524
+; GISEL-NEXT: scratch_store_b32 off, v225, s33 offset:528
+; GISEL-NEXT: scratch_store_b32 off, v226, s33 offset:532
+; GISEL-NEXT: scratch_store_b32 off, v227, s33 offset:536
+; GISEL-NEXT: scratch_store_b32 off, v228, s33 offset:540
+; GISEL-NEXT: scratch_store_b32 off, v229, s33 offset:544
+; GISEL-NEXT: scratch_store_b32 off, v230, s33 offset:548
+; GISEL-NEXT: scratch_store_b32 off, v231, s33 offset:552
+; GISEL-NEXT: scratch_store_b32 off, v240, s33 offset:556
+; GISEL-NEXT: scratch_store_b32 off, v241, s33 offset:560
+; GISEL-NEXT: scratch_store_b32 off, v242, s33 offset:564
+; GISEL-NEXT: scratch_store_b32 off, v243, s33 offset:568
+; GISEL-NEXT: scratch_store_b32 off, v244, s33 offset:572
+; GISEL-NEXT: scratch_store_b32 off, v245, s33 offset:576
+; GISEL-NEXT: scratch_store_b32 off, v246, s33 offset:580
+; GISEL-NEXT: scratch_store_b32 off, v247, s33 offset:584
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: s_clause 0x2
+; GISEL-NEXT: scratch_store_b32 off, v42, s33
+; GISEL-NEXT: scratch_store_b32 off, v40, s33 offset:164
+; GISEL-NEXT: scratch_store_b32 off, v41, s33 offset:168
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: v_writelane_b32 v42, s0, 3
+; GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo
+; GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi
+; GISEL-NEXT: s_addk_co_i32 s32, 0x250
+; GISEL-NEXT: v_dual_mov_b32 v40, v8 :: v_dual_mov_b32 v41, v9
+; GISEL-NEXT: v_writelane_b32 v42, s4, 0
+; GISEL-NEXT: v_writelane_b32 v42, s30, 1
+; GISEL-NEXT: v_writelane_b32 v42, s31, 2
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL-NEXT: flat_store_b32 v[40:41], v0
+; GISEL-NEXT: v_readlane_b32 s31, v42, 2
+; GISEL-NEXT: v_readlane_b32 s30, v42, 1
+; GISEL-NEXT: v_readlane_b32 s4, v42, 0
+; GISEL-NEXT: v_readlane_b32 s0, v42, 3
+; GISEL-NEXT: s_clause 0x2
+; GISEL-NEXT: scratch_load_b32 v42, off, s33
+; GISEL-NEXT: scratch_load_b32 v40, off, s33 offset:164
+; GISEL-NEXT: scratch_load_b32 v41, off, s33 offset:168
+; GISEL-NEXT: s_mov_b32 s32, s33
+; GISEL-NEXT: s_xor_b32 exec_lo, s4, -1
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_load_b32 v0, off, s33 offset:4
+; GISEL-NEXT: scratch_load_b32 v1, off, s33 offset:8
+; GISEL-NEXT: scratch_load_b32 v2, off, s33 offset:12
+; GISEL-NEXT: scratch_load_b32 v3, off, s33 offset:16
+; GISEL-NEXT: scratch_load_b32 v4, off, s33 offset:20
+; GISEL-NEXT: scratch_load_b32 v5, off, s33 offset:24
+; GISEL-NEXT: scratch_load_b32 v6, off, s33 offset:28
+; GISEL-NEXT: scratch_load_b32 v7, off, s33 offset:32
+; GISEL-NEXT: scratch_load_b32 v8, off, s33 offset:36
+; GISEL-NEXT: scratch_load_b32 v9, off, s33 offset:40
+; GISEL-NEXT: scratch_load_b32 v10, off, s33 offset:44
+; GISEL-NEXT: scratch_load_b32 v11, off, s33 offset:48
+; GISEL-NEXT: scratch_load_b32 v12, off, s33 offset:52
+; GISEL-NEXT: scratch_load_b32 v13, off, s33 offset:56
+; GISEL-NEXT: scratch_load_b32 v14, off, s33 offset:60
+; GISEL-NEXT: scratch_load_b32 v15, off, s33 offset:64
+; GISEL-NEXT: scratch_load_b32 v16, off, s33 offset:68
+; GISEL-NEXT: scratch_load_b32 v17, off, s33 offset:72
+; GISEL-NEXT: scratch_load_b32 v18, off, s33 offset:76
+; GISEL-NEXT: scratch_load_b32 v19, off, s33 offset:80
+; GISEL-NEXT: scratch_load_b32 v20, off, s33 offset:84
+; GISEL-NEXT: scratch_load_b32 v21, off, s33 offset:88
+; GISEL-NEXT: scratch_load_b32 v22, off, s33 offset:92
+; GISEL-NEXT: scratch_load_b32 v23, off, s33 offset:96
+; GISEL-NEXT: scratch_load_b32 v24, off, s33 offset:100
+; GISEL-NEXT: scratch_load_b32 v25, off, s33 offset:104
+; GISEL-NEXT: scratch_load_b32 v26, off, s33 offset:108
+; GISEL-NEXT: scratch_load_b32 v27, off, s33 offset:112
+; GISEL-NEXT: scratch_load_b32 v28, off, s33 offset:116
+; GISEL-NEXT: scratch_load_b32 v29, off, s33 offset:120
+; GISEL-NEXT: scratch_load_b32 v30, off, s33 offset:124
+; GISEL-NEXT: scratch_load_b32 v31, off, s33 offset:128
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_load_b32 v32, off, s33 offset:132
+; GISEL-NEXT: scratch_load_b32 v33, off, s33 offset:136
+; GISEL-NEXT: scratch_load_b32 v34, off, s33 offset:140
+; GISEL-NEXT: scratch_load_b32 v35, off, s33 offset:144
+; GISEL-NEXT: scratch_load_b32 v36, off, s33 offset:148
+; GISEL-NEXT: scratch_load_b32 v37, off, s33 offset:152
+; GISEL-NEXT: scratch_load_b32 v38, off, s33 offset:156
+; GISEL-NEXT: scratch_load_b32 v39, off, s33 offset:160
+; GISEL-NEXT: scratch_load_b32 v48, off, s33 offset:172
+; GISEL-NEXT: scratch_load_b32 v49, off, s33 offset:176
+; GISEL-NEXT: scratch_load_b32 v50, off, s33 offset:180
+; GISEL-NEXT: scratch_load_b32 v51, off, s33 offset:184
+; GISEL-NEXT: scratch_load_b32 v52, off, s33 offset:188
+; GISEL-NEXT: scratch_load_b32 v53, off, s33 offset:192
+; GISEL-NEXT: scratch_load_b32 v54, off, s33 offset:196
+; GISEL-NEXT: scratch_load_b32 v55, off, s33 offset:200
+; GISEL-NEXT: scratch_load_b32 v64, off, s33 offset:204
+; GISEL-NEXT: scratch_load_b32 v65, off, s33 offset:208
+; GISEL-NEXT: scratch_load_b32 v66, off, s33 offset:212
+; GISEL-NEXT: scratch_load_b32 v67, off, s33 offset:216
+; GISEL-NEXT: scratch_load_b32 v68, off, s33 offset:220
+; GISEL-NEXT: scratch_load_b32 v69, off, s33 offset:224
+; GISEL-NEXT: scratch_load_b32 v70, off, s33 offset:228
+; GISEL-NEXT: scratch_load_b32 v71, off, s33 offset:232
+; GISEL-NEXT: scratch_load_b32 v80, off, s33 offset:236
+; GISEL-NEXT: scratch_load_b32 v81, off, s33 offset:240
+; GISEL-NEXT: scratch_load_b32 v82, off, s33 offset:244
+; GISEL-NEXT: scratch_load_b32 v83, off, s33 offset:248
+; GISEL-NEXT: scratch_load_b32 v84, off, s33 offset:252
+; GISEL-NEXT: scratch_load_b32 v85, off, s33 offset:256
+; GISEL-NEXT: scratch_load_b32 v86, off, s33 offset:260
+; GISEL-NEXT: scratch_load_b32 v87, off, s33 offset:264
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_load_b32 v96, off, s33 offset:268
+; GISEL-NEXT: scratch_load_b32 v97, off, s33 offset:272
+; GISEL-NEXT: scratch_load_b32 v98, off, s33 offset:276
+; GISEL-NEXT: scratch_load_b32 v99, off, s33 offset:280
+; GISEL-NEXT: scratch_load_b32 v100, off, s33 offset:284
+; GISEL-NEXT: scratch_load_b32 v101, off, s33 offset:288
+; GISEL-NEXT: scratch_load_b32 v102, off, s33 offset:292
+; GISEL-NEXT: scratch_load_b32 v103, off, s33 offset:296
+; GISEL-NEXT: scratch_load_b32 v112, off, s33 offset:300
+; GISEL-NEXT: scratch_load_b32 v113, off, s33 offset:304
+; GISEL-NEXT: scratch_load_b32 v114, off, s33 offset:308
+; GISEL-NEXT: scratch_load_b32 v115, off, s33 offset:312
+; GISEL-NEXT: scratch_load_b32 v116, off, s33 offset:316
+; GISEL-NEXT: scratch_load_b32 v117, off, s33 offset:320
+; GISEL-NEXT: scratch_load_b32 v118, off, s33 offset:324
+; GISEL-NEXT: scratch_load_b32 v119, off, s33 offset:328
+; GISEL-NEXT: scratch_load_b32 v128, off, s33 offset:332
+; GISEL-NEXT: scratch_load_b32 v129, off, s33 offset:336
+; GISEL-NEXT: scratch_load_b32 v130, off, s33 offset:340
+; GISEL-NEXT: scratch_load_b32 v131, off, s33 offset:344
+; GISEL-NEXT: scratch_load_b32 v132, off, s33 offset:348
+; GISEL-NEXT: scratch_load_b32 v133, off, s33 offset:352
+; GISEL-NEXT: scratch_load_b32 v134, off, s33 offset:356
+; GISEL-NEXT: scratch_load_b32 v135, off, s33 offset:360
+; GISEL-NEXT: scratch_load_b32 v144, off, s33 offset:364
+; GISEL-NEXT: scratch_load_b32 v145, off, s33 offset:368
+; GISEL-NEXT: scratch_load_b32 v146, off, s33 offset:372
+; GISEL-NEXT: scratch_load_b32 v147, off, s33 offset:376
+; GISEL-NEXT: scratch_load_b32 v148, off, s33 offset:380
+; GISEL-NEXT: scratch_load_b32 v149, off, s33 offset:384
+; GISEL-NEXT: scratch_load_b32 v150, off, s33 offset:388
+; GISEL-NEXT: scratch_load_b32 v151, off, s33 offset:392
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_load_b32 v160, off, s33 offset:396
+; GISEL-NEXT: scratch_load_b32 v161, off, s33 offset:400
+; GISEL-NEXT: scratch_load_b32 v162, off, s33 offset:404
+; GISEL-NEXT: scratch_load_b32 v163, off, s33 offset:408
+; GISEL-NEXT: scratch_load_b32 v164, off, s33 offset:412
+; GISEL-NEXT: scratch_load_b32 v165, off, s33 offset:416
+; GISEL-NEXT: scratch_load_b32 v166, off, s33 offset:420
+; GISEL-NEXT: scratch_load_b32 v167, off, s33 offset:424
+; GISEL-NEXT: scratch_load_b32 v176, off, s33 offset:428
+; GISEL-NEXT: scratch_load_b32 v177, off, s33 offset:432
+; GISEL-NEXT: scratch_load_b32 v178, off, s33 offset:436
+; GISEL-NEXT: scratch_load_b32 v179, off, s33 offset:440
+; GISEL-NEXT: scratch_load_b32 v180, off, s33 offset:444
+; GISEL-NEXT: scratch_load_b32 v181, off, s33 offset:448
+; GISEL-NEXT: scratch_load_b32 v182, off, s33 offset:452
+; GISEL-NEXT: scratch_load_b32 v183, off, s33 offset:456
+; GISEL-NEXT: scratch_load_b32 v192, off, s33 offset:460
+; GISEL-NEXT: scratch_load_b32 v193, off, s33 offset:464
+; GISEL-NEXT: scratch_load_b32 v194, off, s33 offset:468
+; GISEL-NEXT: scratch_load_b32 v195, off, s33 offset:472
+; GISEL-NEXT: scratch_load_b32 v196, off, s33 offset:476
+; GISEL-NEXT: scratch_load_b32 v197, off, s33 offset:480
+; GISEL-NEXT: scratch_load_b32 v198, off, s33 offset:484
+; GISEL-NEXT: scratch_load_b32 v199, off, s33 offset:488
+; GISEL-NEXT: scratch_load_b32 v208, off, s33 offset:492
+; GISEL-NEXT: scratch_load_b32 v209, off, s33 offset:496
+; GISEL-NEXT: scratch_load_b32 v210, off, s33 offset:500
+; GISEL-NEXT: scratch_load_b32 v211, off, s33 offset:504
+; GISEL-NEXT: scratch_load_b32 v212, off, s33 offset:508
+; GISEL-NEXT: scratch_load_b32 v213, off, s33 offset:512
+; GISEL-NEXT: scratch_load_b32 v214, off, s33 offset:516
+; GISEL-NEXT: scratch_load_b32 v215, off, s33 offset:520
+; GISEL-NEXT: s_clause 0xf
+; GISEL-NEXT: scratch_load_b32 v224, off, s33 offset:524
+; GISEL-NEXT: scratch_load_b32 v225, off, s33 offset:528
+; GISEL-NEXT: scratch_load_b32 v226, off, s33 offset:532
+; GISEL-NEXT: scratch_load_b32 v227, off, s33 offset:536
+; GISEL-NEXT: scratch_load_b32 v228, off, s33 offset:540
+; GISEL-NEXT: scratch_load_b32 v229, off, s33 offset:544
+; GISEL-NEXT: scratch_load_b32 v230, off, s33 offset:548
+; GISEL-NEXT: scratch_load_b32 v231, off, s33 offset:552
+; GISEL-NEXT: scratch_load_b32 v240, off, s33 offset:556
+; GISEL-NEXT: scratch_load_b32 v241, off, s33 offset:560
+; GISEL-NEXT: scratch_load_b32 v242, off, s33 offset:564
+; GISEL-NEXT: scratch_load_b32 v243, off, s33 offset:568
+; GISEL-NEXT: scratch_load_b32 v244, off, s33 offset:572
+; GISEL-NEXT: scratch_load_b32 v245, off, s33 offset:576
+; GISEL-NEXT: scratch_load_b32 v246, off, s33 offset:580
+; GISEL-NEXT: scratch_load_b32 v247, off, s33 offset:584
+; GISEL-NEXT: s_mov_b32 exec_lo, s4
+; GISEL-NEXT: s_mov_b32 s33, s0
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: call_from_whole_wave:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_mov_b32 s0, s33
+; DAGISEL64-NEXT: s_mov_b32 s33, s32
+; DAGISEL64-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s33 offset:4
+; DAGISEL64-NEXT: scratch_store_b32 off, v1, s33 offset:8
+; DAGISEL64-NEXT: scratch_store_b32 off, v2, s33 offset:12
+; DAGISEL64-NEXT: scratch_store_b32 off, v3, s33 offset:16
+; DAGISEL64-NEXT: scratch_store_b32 off, v4, s33 offset:20
+; DAGISEL64-NEXT: scratch_store_b32 off, v5, s33 offset:24
+; DAGISEL64-NEXT: scratch_store_b32 off, v6, s33 offset:28
+; DAGISEL64-NEXT: scratch_store_b32 off, v7, s33 offset:32
+; DAGISEL64-NEXT: scratch_store_b32 off, v8, s33 offset:36
+; DAGISEL64-NEXT: scratch_store_b32 off, v9, s33 offset:40
+; DAGISEL64-NEXT: scratch_store_b32 off, v10, s33 offset:44
+; DAGISEL64-NEXT: scratch_store_b32 off, v11, s33 offset:48
+; DAGISEL64-NEXT: scratch_store_b32 off, v12, s33 offset:52
+; DAGISEL64-NEXT: scratch_store_b32 off, v13, s33 offset:56
+; DAGISEL64-NEXT: scratch_store_b32 off, v14, s33 offset:60
+; DAGISEL64-NEXT: scratch_store_b32 off, v15, s33 offset:64
+; DAGISEL64-NEXT: scratch_store_b32 off, v16, s33 offset:68
+; DAGISEL64-NEXT: scratch_store_b32 off, v17, s33 offset:72
+; DAGISEL64-NEXT: scratch_store_b32 off, v18, s33 offset:76
+; DAGISEL64-NEXT: scratch_store_b32 off, v19, s33 offset:80
+; DAGISEL64-NEXT: scratch_store_b32 off, v20, s33 offset:84
+; DAGISEL64-NEXT: scratch_store_b32 off, v21, s33 offset:88
+; DAGISEL64-NEXT: scratch_store_b32 off, v22, s33 offset:92
+; DAGISEL64-NEXT: scratch_store_b32 off, v23, s33 offset:96
+; DAGISEL64-NEXT: scratch_store_b32 off, v24, s33 offset:100
+; DAGISEL64-NEXT: scratch_store_b32 off, v25, s33 offset:104
+; DAGISEL64-NEXT: scratch_store_b32 off, v26, s33 offset:108
+; DAGISEL64-NEXT: scratch_store_b32 off, v27, s33 offset:112
+; DAGISEL64-NEXT: scratch_store_b32 off, v28, s33 offset:116
+; DAGISEL64-NEXT: scratch_store_b32 off, v29, s33 offset:120
+; DAGISEL64-NEXT: scratch_store_b32 off, v30, s33 offset:124
+; DAGISEL64-NEXT: scratch_store_b32 off, v31, s33 offset:128
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_store_b32 off, v32, s33 offset:132
+; DAGISEL64-NEXT: scratch_store_b32 off, v33, s33 offset:136
+; DAGISEL64-NEXT: scratch_store_b32 off, v34, s33 offset:140
+; DAGISEL64-NEXT: scratch_store_b32 off, v35, s33 offset:144
+; DAGISEL64-NEXT: scratch_store_b32 off, v36, s33 offset:148
+; DAGISEL64-NEXT: scratch_store_b32 off, v37, s33 offset:152
+; DAGISEL64-NEXT: scratch_store_b32 off, v38, s33 offset:156
+; DAGISEL64-NEXT: scratch_store_b32 off, v39, s33 offset:160
+; DAGISEL64-NEXT: scratch_store_b32 off, v48, s33 offset:172
+; DAGISEL64-NEXT: scratch_store_b32 off, v49, s33 offset:176
+; DAGISEL64-NEXT: scratch_store_b32 off, v50, s33 offset:180
+; DAGISEL64-NEXT: scratch_store_b32 off, v51, s33 offset:184
+; DAGISEL64-NEXT: scratch_store_b32 off, v52, s33 offset:188
+; DAGISEL64-NEXT: scratch_store_b32 off, v53, s33 offset:192
+; DAGISEL64-NEXT: scratch_store_b32 off, v54, s33 offset:196
+; DAGISEL64-NEXT: scratch_store_b32 off, v55, s33 offset:200
+; DAGISEL64-NEXT: scratch_store_b32 off, v64, s33 offset:204
+; DAGISEL64-NEXT: scratch_store_b32 off, v65, s33 offset:208
+; DAGISEL64-NEXT: scratch_store_b32 off, v66, s33 offset:212
+; DAGISEL64-NEXT: scratch_store_b32 off, v67, s33 offset:216
+; DAGISEL64-NEXT: scratch_store_b32 off, v68, s33 offset:220
+; DAGISEL64-NEXT: scratch_store_b32 off, v69, s33 offset:224
+; DAGISEL64-NEXT: scratch_store_b32 off, v70, s33 offset:228
+; DAGISEL64-NEXT: scratch_store_b32 off, v71, s33 offset:232
+; DAGISEL64-NEXT: scratch_store_b32 off, v80, s33 offset:236
+; DAGISEL64-NEXT: scratch_store_b32 off, v81, s33 offset:240
+; DAGISEL64-NEXT: scratch_store_b32 off, v82, s33 offset:244
+; DAGISEL64-NEXT: scratch_store_b32 off, v83, s33 offset:248
+; DAGISEL64-NEXT: scratch_store_b32 off, v84, s33 offset:252
+; DAGISEL64-NEXT: scratch_store_b32 off, v85, s33 offset:256
+; DAGISEL64-NEXT: scratch_store_b32 off, v86, s33 offset:260
+; DAGISEL64-NEXT: scratch_store_b32 off, v87, s33 offset:264
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_store_b32 off, v96, s33 offset:268
+; DAGISEL64-NEXT: scratch_store_b32 off, v97, s33 offset:272
+; DAGISEL64-NEXT: scratch_store_b32 off, v98, s33 offset:276
+; DAGISEL64-NEXT: scratch_store_b32 off, v99, s33 offset:280
+; DAGISEL64-NEXT: scratch_store_b32 off, v100, s33 offset:284
+; DAGISEL64-NEXT: scratch_store_b32 off, v101, s33 offset:288
+; DAGISEL64-NEXT: scratch_store_b32 off, v102, s33 offset:292
+; DAGISEL64-NEXT: scratch_store_b32 off, v103, s33 offset:296
+; DAGISEL64-NEXT: scratch_store_b32 off, v112, s33 offset:300
+; DAGISEL64-NEXT: scratch_store_b32 off, v113, s33 offset:304
+; DAGISEL64-NEXT: scratch_store_b32 off, v114, s33 offset:308
+; DAGISEL64-NEXT: scratch_store_b32 off, v115, s33 offset:312
+; DAGISEL64-NEXT: scratch_store_b32 off, v116, s33 offset:316
+; DAGISEL64-NEXT: scratch_store_b32 off, v117, s33 offset:320
+; DAGISEL64-NEXT: scratch_store_b32 off, v118, s33 offset:324
+; DAGISEL64-NEXT: scratch_store_b32 off, v119, s33 offset:328
+; DAGISEL64-NEXT: scratch_store_b32 off, v128, s33 offset:332
+; DAGISEL64-NEXT: scratch_store_b32 off, v129, s33 offset:336
+; DAGISEL64-NEXT: scratch_store_b32 off, v130, s33 offset:340
+; DAGISEL64-NEXT: scratch_store_b32 off, v131, s33 offset:344
+; DAGISEL64-NEXT: scratch_store_b32 off, v132, s33 offset:348
+; DAGISEL64-NEXT: scratch_store_b32 off, v133, s33 offset:352
+; DAGISEL64-NEXT: scratch_store_b32 off, v134, s33 offset:356
+; DAGISEL64-NEXT: scratch_store_b32 off, v135, s33 offset:360
+; DAGISEL64-NEXT: scratch_store_b32 off, v144, s33 offset:364
+; DAGISEL64-NEXT: scratch_store_b32 off, v145, s33 offset:368
+; DAGISEL64-NEXT: scratch_store_b32 off, v146, s33 offset:372
+; DAGISEL64-NEXT: scratch_store_b32 off, v147, s33 offset:376
+; DAGISEL64-NEXT: scratch_store_b32 off, v148, s33 offset:380
+; DAGISEL64-NEXT: scratch_store_b32 off, v149, s33 offset:384
+; DAGISEL64-NEXT: scratch_store_b32 off, v150, s33 offset:388
+; DAGISEL64-NEXT: scratch_store_b32 off, v151, s33 offset:392
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_store_b32 off, v160, s33 offset:396
+; DAGISEL64-NEXT: scratch_store_b32 off, v161, s33 offset:400
+; DAGISEL64-NEXT: scratch_store_b32 off, v162, s33 offset:404
+; DAGISEL64-NEXT: scratch_store_b32 off, v163, s33 offset:408
+; DAGISEL64-NEXT: scratch_store_b32 off, v164, s33 offset:412
+; DAGISEL64-NEXT: scratch_store_b32 off, v165, s33 offset:416
+; DAGISEL64-NEXT: scratch_store_b32 off, v166, s33 offset:420
+; DAGISEL64-NEXT: scratch_store_b32 off, v167, s33 offset:424
+; DAGISEL64-NEXT: scratch_store_b32 off, v176, s33 offset:428
+; DAGISEL64-NEXT: scratch_store_b32 off, v177, s33 offset:432
+; DAGISEL64-NEXT: scratch_store_b32 off, v178, s33 offset:436
+; DAGISEL64-NEXT: scratch_store_b32 off, v179, s33 offset:440
+; DAGISEL64-NEXT: scratch_store_b32 off, v180, s33 offset:444
+; DAGISEL64-NEXT: scratch_store_b32 off, v181, s33 offset:448
+; DAGISEL64-NEXT: scratch_store_b32 off, v182, s33 offset:452
+; DAGISEL64-NEXT: scratch_store_b32 off, v183, s33 offset:456
+; DAGISEL64-NEXT: scratch_store_b32 off, v192, s33 offset:460
+; DAGISEL64-NEXT: scratch_store_b32 off, v193, s33 offset:464
+; DAGISEL64-NEXT: scratch_store_b32 off, v194, s33 offset:468
+; DAGISEL64-NEXT: scratch_store_b32 off, v195, s33 offset:472
+; DAGISEL64-NEXT: scratch_store_b32 off, v196, s33 offset:476
+; DAGISEL64-NEXT: scratch_store_b32 off, v197, s33 offset:480
+; DAGISEL64-NEXT: scratch_store_b32 off, v198, s33 offset:484
+; DAGISEL64-NEXT: scratch_store_b32 off, v199, s33 offset:488
+; DAGISEL64-NEXT: scratch_store_b32 off, v208, s33 offset:492
+; DAGISEL64-NEXT: scratch_store_b32 off, v209, s33 offset:496
+; DAGISEL64-NEXT: scratch_store_b32 off, v210, s33 offset:500
+; DAGISEL64-NEXT: scratch_store_b32 off, v211, s33 offset:504
+; DAGISEL64-NEXT: scratch_store_b32 off, v212, s33 offset:508
+; DAGISEL64-NEXT: scratch_store_b32 off, v213, s33 offset:512
+; DAGISEL64-NEXT: scratch_store_b32 off, v214, s33 offset:516
+; DAGISEL64-NEXT: scratch_store_b32 off, v215, s33 offset:520
+; DAGISEL64-NEXT: s_clause 0xf
+; DAGISEL64-NEXT: scratch_store_b32 off, v224, s33 offset:524
+; DAGISEL64-NEXT: scratch_store_b32 off, v225, s33 offset:528
+; DAGISEL64-NEXT: scratch_store_b32 off, v226, s33 offset:532
+; DAGISEL64-NEXT: scratch_store_b32 off, v227, s33 offset:536
+; DAGISEL64-NEXT: scratch_store_b32 off, v228, s33 offset:540
+; DAGISEL64-NEXT: scratch_store_b32 off, v229, s33 offset:544
+; DAGISEL64-NEXT: scratch_store_b32 off, v230, s33 offset:548
+; DAGISEL64-NEXT: scratch_store_b32 off, v231, s33 offset:552
+; DAGISEL64-NEXT: scratch_store_b32 off, v240, s33 offset:556
+; DAGISEL64-NEXT: scratch_store_b32 off, v241, s33 offset:560
+; DAGISEL64-NEXT: scratch_store_b32 off, v242, s33 offset:564
+; DAGISEL64-NEXT: scratch_store_b32 off, v243, s33 offset:568
+; DAGISEL64-NEXT: scratch_store_b32 off, v244, s33 offset:572
+; DAGISEL64-NEXT: scratch_store_b32 off, v245, s33 offset:576
+; DAGISEL64-NEXT: scratch_store_b32 off, v246, s33 offset:580
+; DAGISEL64-NEXT: scratch_store_b32 off, v247, s33 offset:584
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: s_clause 0x2
+; DAGISEL64-NEXT: scratch_store_b32 off, v42, s33
+; DAGISEL64-NEXT: scratch_store_b32 off, v40, s33 offset:164
+; DAGISEL64-NEXT: scratch_store_b32 off, v41, s33 offset:168
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: v_writelane_b32 v42, s0, 4
+; DAGISEL64-NEXT: s_mov_b32 s1, callee@abs32@hi
+; DAGISEL64-NEXT: s_mov_b32 s0, callee@abs32@lo
+; DAGISEL64-NEXT: s_addk_co_i32 s32, 0x250
+; DAGISEL64-NEXT: v_mov_b32_e32 v41, v9
+; DAGISEL64-NEXT: v_writelane_b32 v42, s4, 0
+; DAGISEL64-NEXT: v_mov_b32_e32 v40, v8
+; DAGISEL64-NEXT: v_writelane_b32 v42, s5, 1
+; DAGISEL64-NEXT: v_writelane_b32 v42, s30, 2
+; DAGISEL64-NEXT: v_writelane_b32 v42, s31, 3
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL64-NEXT: flat_store_b32 v[40:41], v0
+; DAGISEL64-NEXT: v_readlane_b32 s31, v42, 3
+; DAGISEL64-NEXT: v_readlane_b32 s30, v42, 2
+; DAGISEL64-NEXT: v_readlane_b32 s5, v42, 1
+; DAGISEL64-NEXT: v_readlane_b32 s4, v42, 0
+; DAGISEL64-NEXT: v_readlane_b32 s0, v42, 4
+; DAGISEL64-NEXT: s_clause 0x2
+; DAGISEL64-NEXT: scratch_load_b32 v42, off, s33
+; DAGISEL64-NEXT: scratch_load_b32 v40, off, s33 offset:164
+; DAGISEL64-NEXT: scratch_load_b32 v41, off, s33 offset:168
+; DAGISEL64-NEXT: s_mov_b32 s32, s33
+; DAGISEL64-NEXT: s_xor_b64 exec, s[4:5], -1
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s33 offset:4
+; DAGISEL64-NEXT: scratch_load_b32 v1, off, s33 offset:8
+; DAGISEL64-NEXT: scratch_load_b32 v2, off, s33 offset:12
+; DAGISEL64-NEXT: scratch_load_b32 v3, off, s33 offset:16
+; DAGISEL64-NEXT: scratch_load_b32 v4, off, s33 offset:20
+; DAGISEL64-NEXT: scratch_load_b32 v5, off, s33 offset:24
+; DAGISEL64-NEXT: scratch_load_b32 v6, off, s33 offset:28
+; DAGISEL64-NEXT: scratch_load_b32 v7, off, s33 offset:32
+; DAGISEL64-NEXT: scratch_load_b32 v8, off, s33 offset:36
+; DAGISEL64-NEXT: scratch_load_b32 v9, off, s33 offset:40
+; DAGISEL64-NEXT: scratch_load_b32 v10, off, s33 offset:44
+; DAGISEL64-NEXT: scratch_load_b32 v11, off, s33 offset:48
+; DAGISEL64-NEXT: scratch_load_b32 v12, off, s33 offset:52
+; DAGISEL64-NEXT: scratch_load_b32 v13, off, s33 offset:56
+; DAGISEL64-NEXT: scratch_load_b32 v14, off, s33 offset:60
+; DAGISEL64-NEXT: scratch_load_b32 v15, off, s33 offset:64
+; DAGISEL64-NEXT: scratch_load_b32 v16, off, s33 offset:68
+; DAGISEL64-NEXT: scratch_load_b32 v17, off, s33 offset:72
+; DAGISEL64-NEXT: scratch_load_b32 v18, off, s33 offset:76
+; DAGISEL64-NEXT: scratch_load_b32 v19, off, s33 offset:80
+; DAGISEL64-NEXT: scratch_load_b32 v20, off, s33 offset:84
+; DAGISEL64-NEXT: scratch_load_b32 v21, off, s33 offset:88
+; DAGISEL64-NEXT: scratch_load_b32 v22, off, s33 offset:92
+; DAGISEL64-NEXT: scratch_load_b32 v23, off, s33 offset:96
+; DAGISEL64-NEXT: scratch_load_b32 v24, off, s33 offset:100
+; DAGISEL64-NEXT: scratch_load_b32 v25, off, s33 offset:104
+; DAGISEL64-NEXT: scratch_load_b32 v26, off, s33 offset:108
+; DAGISEL64-NEXT: scratch_load_b32 v27, off, s33 offset:112
+; DAGISEL64-NEXT: scratch_load_b32 v28, off, s33 offset:116
+; DAGISEL64-NEXT: scratch_load_b32 v29, off, s33 offset:120
+; DAGISEL64-NEXT: scratch_load_b32 v30, off, s33 offset:124
+; DAGISEL64-NEXT: scratch_load_b32 v31, off, s33 offset:128
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_load_b32 v32, off, s33 offset:132
+; DAGISEL64-NEXT: scratch_load_b32 v33, off, s33 offset:136
+; DAGISEL64-NEXT: scratch_load_b32 v34, off, s33 offset:140
+; DAGISEL64-NEXT: scratch_load_b32 v35, off, s33 offset:144
+; DAGISEL64-NEXT: scratch_load_b32 v36, off, s33 offset:148
+; DAGISEL64-NEXT: scratch_load_b32 v37, off, s33 offset:152
+; DAGISEL64-NEXT: scratch_load_b32 v38, off, s33 offset:156
+; DAGISEL64-NEXT: scratch_load_b32 v39, off, s33 offset:160
+; DAGISEL64-NEXT: scratch_load_b32 v48, off, s33 offset:172
+; DAGISEL64-NEXT: scratch_load_b32 v49, off, s33 offset:176
+; DAGISEL64-NEXT: scratch_load_b32 v50, off, s33 offset:180
+; DAGISEL64-NEXT: scratch_load_b32 v51, off, s33 offset:184
+; DAGISEL64-NEXT: scratch_load_b32 v52, off, s33 offset:188
+; DAGISEL64-NEXT: scratch_load_b32 v53, off, s33 offset:192
+; DAGISEL64-NEXT: scratch_load_b32 v54, off, s33 offset:196
+; DAGISEL64-NEXT: scratch_load_b32 v55, off, s33 offset:200
+; DAGISEL64-NEXT: scratch_load_b32 v64, off, s33 offset:204
+; DAGISEL64-NEXT: scratch_load_b32 v65, off, s33 offset:208
+; DAGISEL64-NEXT: scratch_load_b32 v66, off, s33 offset:212
+; DAGISEL64-NEXT: scratch_load_b32 v67, off, s33 offset:216
+; DAGISEL64-NEXT: scratch_load_b32 v68, off, s33 offset:220
+; DAGISEL64-NEXT: scratch_load_b32 v69, off, s33 offset:224
+; DAGISEL64-NEXT: scratch_load_b32 v70, off, s33 offset:228
+; DAGISEL64-NEXT: scratch_load_b32 v71, off, s33 offset:232
+; DAGISEL64-NEXT: scratch_load_b32 v80, off, s33 offset:236
+; DAGISEL64-NEXT: scratch_load_b32 v81, off, s33 offset:240
+; DAGISEL64-NEXT: scratch_load_b32 v82, off, s33 offset:244
+; DAGISEL64-NEXT: scratch_load_b32 v83, off, s33 offset:248
+; DAGISEL64-NEXT: scratch_load_b32 v84, off, s33 offset:252
+; DAGISEL64-NEXT: scratch_load_b32 v85, off, s33 offset:256
+; DAGISEL64-NEXT: scratch_load_b32 v86, off, s33 offset:260
+; DAGISEL64-NEXT: scratch_load_b32 v87, off, s33 offset:264
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_load_b32 v96, off, s33 offset:268
+; DAGISEL64-NEXT: scratch_load_b32 v97, off, s33 offset:272
+; DAGISEL64-NEXT: scratch_load_b32 v98, off, s33 offset:276
+; DAGISEL64-NEXT: scratch_load_b32 v99, off, s33 offset:280
+; DAGISEL64-NEXT: scratch_load_b32 v100, off, s33 offset:284
+; DAGISEL64-NEXT: scratch_load_b32 v101, off, s33 offset:288
+; DAGISEL64-NEXT: scratch_load_b32 v102, off, s33 offset:292
+; DAGISEL64-NEXT: scratch_load_b32 v103, off, s33 offset:296
+; DAGISEL64-NEXT: scratch_load_b32 v112, off, s33 offset:300
+; DAGISEL64-NEXT: scratch_load_b32 v113, off, s33 offset:304
+; DAGISEL64-NEXT: scratch_load_b32 v114, off, s33 offset:308
+; DAGISEL64-NEXT: scratch_load_b32 v115, off, s33 offset:312
+; DAGISEL64-NEXT: scratch_load_b32 v116, off, s33 offset:316
+; DAGISEL64-NEXT: scratch_load_b32 v117, off, s33 offset:320
+; DAGISEL64-NEXT: scratch_load_b32 v118, off, s33 offset:324
+; DAGISEL64-NEXT: scratch_load_b32 v119, off, s33 offset:328
+; DAGISEL64-NEXT: scratch_load_b32 v128, off, s33 offset:332
+; DAGISEL64-NEXT: scratch_load_b32 v129, off, s33 offset:336
+; DAGISEL64-NEXT: scratch_load_b32 v130, off, s33 offset:340
+; DAGISEL64-NEXT: scratch_load_b32 v131, off, s33 offset:344
+; DAGISEL64-NEXT: scratch_load_b32 v132, off, s33 offset:348
+; DAGISEL64-NEXT: scratch_load_b32 v133, off, s33 offset:352
+; DAGISEL64-NEXT: scratch_load_b32 v134, off, s33 offset:356
+; DAGISEL64-NEXT: scratch_load_b32 v135, off, s33 offset:360
+; DAGISEL64-NEXT: scratch_load_b32 v144, off, s33 offset:364
+; DAGISEL64-NEXT: scratch_load_b32 v145, off, s33 offset:368
+; DAGISEL64-NEXT: scratch_load_b32 v146, off, s33 offset:372
+; DAGISEL64-NEXT: scratch_load_b32 v147, off, s33 offset:376
+; DAGISEL64-NEXT: scratch_load_b32 v148, off, s33 offset:380
+; DAGISEL64-NEXT: scratch_load_b32 v149, off, s33 offset:384
+; DAGISEL64-NEXT: scratch_load_b32 v150, off, s33 offset:388
+; DAGISEL64-NEXT: scratch_load_b32 v151, off, s33 offset:392
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_load_b32 v160, off, s33 offset:396
+; DAGISEL64-NEXT: scratch_load_b32 v161, off, s33 offset:400
+; DAGISEL64-NEXT: scratch_load_b32 v162, off, s33 offset:404
+; DAGISEL64-NEXT: scratch_load_b32 v163, off, s33 offset:408
+; DAGISEL64-NEXT: scratch_load_b32 v164, off, s33 offset:412
+; DAGISEL64-NEXT: scratch_load_b32 v165, off, s33 offset:416
+; DAGISEL64-NEXT: scratch_load_b32 v166, off, s33 offset:420
+; DAGISEL64-NEXT: scratch_load_b32 v167, off, s33 offset:424
+; DAGISEL64-NEXT: scratch_load_b32 v176, off, s33 offset:428
+; DAGISEL64-NEXT: scratch_load_b32 v177, off, s33 offset:432
+; DAGISEL64-NEXT: scratch_load_b32 v178, off, s33 offset:436
+; DAGISEL64-NEXT: scratch_load_b32 v179, off, s33 offset:440
+; DAGISEL64-NEXT: scratch_load_b32 v180, off, s33 offset:444
+; DAGISEL64-NEXT: scratch_load_b32 v181, off, s33 offset:448
+; DAGISEL64-NEXT: scratch_load_b32 v182, off, s33 offset:452
+; DAGISEL64-NEXT: scratch_load_b32 v183, off, s33 offset:456
+; DAGISEL64-NEXT: scratch_load_b32 v192, off, s33 offset:460
+; DAGISEL64-NEXT: scratch_load_b32 v193, off, s33 offset:464
+; DAGISEL64-NEXT: scratch_load_b32 v194, off, s33 offset:468
+; DAGISEL64-NEXT: scratch_load_b32 v195, off, s33 offset:472
+; DAGISEL64-NEXT: scratch_load_b32 v196, off, s33 offset:476
+; DAGISEL64-NEXT: scratch_load_b32 v197, off, s33 offset:480
+; DAGISEL64-NEXT: scratch_load_b32 v198, off, s33 offset:484
+; DAGISEL64-NEXT: scratch_load_b32 v199, off, s33 offset:488
+; DAGISEL64-NEXT: scratch_load_b32 v208, off, s33 offset:492
+; DAGISEL64-NEXT: scratch_load_b32 v209, off, s33 offset:496
+; DAGISEL64-NEXT: scratch_load_b32 v210, off, s33 offset:500
+; DAGISEL64-NEXT: scratch_load_b32 v211, off, s33 offset:504
+; DAGISEL64-NEXT: scratch_load_b32 v212, off, s33 offset:508
+; DAGISEL64-NEXT: scratch_load_b32 v213, off, s33 offset:512
+; DAGISEL64-NEXT: scratch_load_b32 v214, off, s33 offset:516
+; DAGISEL64-NEXT: scratch_load_b32 v215, off, s33 offset:520
+; DAGISEL64-NEXT: s_clause 0xf
+; DAGISEL64-NEXT: scratch_load_b32 v224, off, s33 offset:524
+; DAGISEL64-NEXT: scratch_load_b32 v225, off, s33 offset:528
+; DAGISEL64-NEXT: scratch_load_b32 v226, off, s33 offset:532
+; DAGISEL64-NEXT: scratch_load_b32 v227, off, s33 offset:536
+; DAGISEL64-NEXT: scratch_load_b32 v228, off, s33 offset:540
+; DAGISEL64-NEXT: scratch_load_b32 v229, off, s33 offset:544
+; DAGISEL64-NEXT: scratch_load_b32 v230, off, s33 offset:548
+; DAGISEL64-NEXT: scratch_load_b32 v231, off, s33 offset:552
+; DAGISEL64-NEXT: scratch_load_b32 v240, off, s33 offset:556
+; DAGISEL64-NEXT: scratch_load_b32 v241, off, s33 offset:560
+; DAGISEL64-NEXT: scratch_load_b32 v242, off, s33 offset:564
+; DAGISEL64-NEXT: scratch_load_b32 v243, off, s33 offset:568
+; DAGISEL64-NEXT: scratch_load_b32 v244, off, s33 offset:572
+; DAGISEL64-NEXT: scratch_load_b32 v245, off, s33 offset:576
+; DAGISEL64-NEXT: scratch_load_b32 v246, off, s33 offset:580
+; DAGISEL64-NEXT: scratch_load_b32 v247, off, s33 offset:584
+; DAGISEL64-NEXT: s_mov_b64 exec, s[4:5]
+; DAGISEL64-NEXT: s_mov_b32 s33, s0
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: call_from_whole_wave:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_mov_b32 s0, s33
+; GISEL64-NEXT: s_mov_b32 s33, s32
+; GISEL64-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_store_b32 off, v0, s33 offset:4
+; GISEL64-NEXT: scratch_store_b32 off, v1, s33 offset:8
+; GISEL64-NEXT: scratch_store_b32 off, v2, s33 offset:12
+; GISEL64-NEXT: scratch_store_b32 off, v3, s33 offset:16
+; GISEL64-NEXT: scratch_store_b32 off, v4, s33 offset:20
+; GISEL64-NEXT: scratch_store_b32 off, v5, s33 offset:24
+; GISEL64-NEXT: scratch_store_b32 off, v6, s33 offset:28
+; GISEL64-NEXT: scratch_store_b32 off, v7, s33 offset:32
+; GISEL64-NEXT: scratch_store_b32 off, v8, s33 offset:36
+; GISEL64-NEXT: scratch_store_b32 off, v9, s33 offset:40
+; GISEL64-NEXT: scratch_store_b32 off, v10, s33 offset:44
+; GISEL64-NEXT: scratch_store_b32 off, v11, s33 offset:48
+; GISEL64-NEXT: scratch_store_b32 off, v12, s33 offset:52
+; GISEL64-NEXT: scratch_store_b32 off, v13, s33 offset:56
+; GISEL64-NEXT: scratch_store_b32 off, v14, s33 offset:60
+; GISEL64-NEXT: scratch_store_b32 off, v15, s33 offset:64
+; GISEL64-NEXT: scratch_store_b32 off, v16, s33 offset:68
+; GISEL64-NEXT: scratch_store_b32 off, v17, s33 offset:72
+; GISEL64-NEXT: scratch_store_b32 off, v18, s33 offset:76
+; GISEL64-NEXT: scratch_store_b32 off, v19, s33 offset:80
+; GISEL64-NEXT: scratch_store_b32 off, v20, s33 offset:84
+; GISEL64-NEXT: scratch_store_b32 off, v21, s33 offset:88
+; GISEL64-NEXT: scratch_store_b32 off, v22, s33 offset:92
+; GISEL64-NEXT: scratch_store_b32 off, v23, s33 offset:96
+; GISEL64-NEXT: scratch_store_b32 off, v24, s33 offset:100
+; GISEL64-NEXT: scratch_store_b32 off, v25, s33 offset:104
+; GISEL64-NEXT: scratch_store_b32 off, v26, s33 offset:108
+; GISEL64-NEXT: scratch_store_b32 off, v27, s33 offset:112
+; GISEL64-NEXT: scratch_store_b32 off, v28, s33 offset:116
+; GISEL64-NEXT: scratch_store_b32 off, v29, s33 offset:120
+; GISEL64-NEXT: scratch_store_b32 off, v30, s33 offset:124
+; GISEL64-NEXT: scratch_store_b32 off, v31, s33 offset:128
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_store_b32 off, v32, s33 offset:132
+; GISEL64-NEXT: scratch_store_b32 off, v33, s33 offset:136
+; GISEL64-NEXT: scratch_store_b32 off, v34, s33 offset:140
+; GISEL64-NEXT: scratch_store_b32 off, v35, s33 offset:144
+; GISEL64-NEXT: scratch_store_b32 off, v36, s33 offset:148
+; GISEL64-NEXT: scratch_store_b32 off, v37, s33 offset:152
+; GISEL64-NEXT: scratch_store_b32 off, v38, s33 offset:156
+; GISEL64-NEXT: scratch_store_b32 off, v39, s33 offset:160
+; GISEL64-NEXT: scratch_store_b32 off, v48, s33 offset:172
+; GISEL64-NEXT: scratch_store_b32 off, v49, s33 offset:176
+; GISEL64-NEXT: scratch_store_b32 off, v50, s33 offset:180
+; GISEL64-NEXT: scratch_store_b32 off, v51, s33 offset:184
+; GISEL64-NEXT: scratch_store_b32 off, v52, s33 offset:188
+; GISEL64-NEXT: scratch_store_b32 off, v53, s33 offset:192
+; GISEL64-NEXT: scratch_store_b32 off, v54, s33 offset:196
+; GISEL64-NEXT: scratch_store_b32 off, v55, s33 offset:200
+; GISEL64-NEXT: scratch_store_b32 off, v64, s33 offset:204
+; GISEL64-NEXT: scratch_store_b32 off, v65, s33 offset:208
+; GISEL64-NEXT: scratch_store_b32 off, v66, s33 offset:212
+; GISEL64-NEXT: scratch_store_b32 off, v67, s33 offset:216
+; GISEL64-NEXT: scratch_store_b32 off, v68, s33 offset:220
+; GISEL64-NEXT: scratch_store_b32 off, v69, s33 offset:224
+; GISEL64-NEXT: scratch_store_b32 off, v70, s33 offset:228
+; GISEL64-NEXT: scratch_store_b32 off, v71, s33 offset:232
+; GISEL64-NEXT: scratch_store_b32 off, v80, s33 offset:236
+; GISEL64-NEXT: scratch_store_b32 off, v81, s33 offset:240
+; GISEL64-NEXT: scratch_store_b32 off, v82, s33 offset:244
+; GISEL64-NEXT: scratch_store_b32 off, v83, s33 offset:248
+; GISEL64-NEXT: scratch_store_b32 off, v84, s33 offset:252
+; GISEL64-NEXT: scratch_store_b32 off, v85, s33 offset:256
+; GISEL64-NEXT: scratch_store_b32 off, v86, s33 offset:260
+; GISEL64-NEXT: scratch_store_b32 off, v87, s33 offset:264
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_store_b32 off, v96, s33 offset:268
+; GISEL64-NEXT: scratch_store_b32 off, v97, s33 offset:272
+; GISEL64-NEXT: scratch_store_b32 off, v98, s33 offset:276
+; GISEL64-NEXT: scratch_store_b32 off, v99, s33 offset:280
+; GISEL64-NEXT: scratch_store_b32 off, v100, s33 offset:284
+; GISEL64-NEXT: scratch_store_b32 off, v101, s33 offset:288
+; GISEL64-NEXT: scratch_store_b32 off, v102, s33 offset:292
+; GISEL64-NEXT: scratch_store_b32 off, v103, s33 offset:296
+; GISEL64-NEXT: scratch_store_b32 off, v112, s33 offset:300
+; GISEL64-NEXT: scratch_store_b32 off, v113, s33 offset:304
+; GISEL64-NEXT: scratch_store_b32 off, v114, s33 offset:308
+; GISEL64-NEXT: scratch_store_b32 off, v115, s33 offset:312
+; GISEL64-NEXT: scratch_store_b32 off, v116, s33 offset:316
+; GISEL64-NEXT: scratch_store_b32 off, v117, s33 offset:320
+; GISEL64-NEXT: scratch_store_b32 off, v118, s33 offset:324
+; GISEL64-NEXT: scratch_store_b32 off, v119, s33 offset:328
+; GISEL64-NEXT: scratch_store_b32 off, v128, s33 offset:332
+; GISEL64-NEXT: scratch_store_b32 off, v129, s33 offset:336
+; GISEL64-NEXT: scratch_store_b32 off, v130, s33 offset:340
+; GISEL64-NEXT: scratch_store_b32 off, v131, s33 offset:344
+; GISEL64-NEXT: scratch_store_b32 off, v132, s33 offset:348
+; GISEL64-NEXT: scratch_store_b32 off, v133, s33 offset:352
+; GISEL64-NEXT: scratch_store_b32 off, v134, s33 offset:356
+; GISEL64-NEXT: scratch_store_b32 off, v135, s33 offset:360
+; GISEL64-NEXT: scratch_store_b32 off, v144, s33 offset:364
+; GISEL64-NEXT: scratch_store_b32 off, v145, s33 offset:368
+; GISEL64-NEXT: scratch_store_b32 off, v146, s33 offset:372
+; GISEL64-NEXT: scratch_store_b32 off, v147, s33 offset:376
+; GISEL64-NEXT: scratch_store_b32 off, v148, s33 offset:380
+; GISEL64-NEXT: scratch_store_b32 off, v149, s33 offset:384
+; GISEL64-NEXT: scratch_store_b32 off, v150, s33 offset:388
+; GISEL64-NEXT: scratch_store_b32 off, v151, s33 offset:392
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_store_b32 off, v160, s33 offset:396
+; GISEL64-NEXT: scratch_store_b32 off, v161, s33 offset:400
+; GISEL64-NEXT: scratch_store_b32 off, v162, s33 offset:404
+; GISEL64-NEXT: scratch_store_b32 off, v163, s33 offset:408
+; GISEL64-NEXT: scratch_store_b32 off, v164, s33 offset:412
+; GISEL64-NEXT: scratch_store_b32 off, v165, s33 offset:416
+; GISEL64-NEXT: scratch_store_b32 off, v166, s33 offset:420
+; GISEL64-NEXT: scratch_store_b32 off, v167, s33 offset:424
+; GISEL64-NEXT: scratch_store_b32 off, v176, s33 offset:428
+; GISEL64-NEXT: scratch_store_b32 off, v177, s33 offset:432
+; GISEL64-NEXT: scratch_store_b32 off, v178, s33 offset:436
+; GISEL64-NEXT: scratch_store_b32 off, v179, s33 offset:440
+; GISEL64-NEXT: scratch_store_b32 off, v180, s33 offset:444
+; GISEL64-NEXT: scratch_store_b32 off, v181, s33 offset:448
+; GISEL64-NEXT: scratch_store_b32 off, v182, s33 offset:452
+; GISEL64-NEXT: scratch_store_b32 off, v183, s33 offset:456
+; GISEL64-NEXT: scratch_store_b32 off, v192, s33 offset:460
+; GISEL64-NEXT: scratch_store_b32 off, v193, s33 offset:464
+; GISEL64-NEXT: scratch_store_b32 off, v194, s33 offset:468
+; GISEL64-NEXT: scratch_store_b32 off, v195, s33 offset:472
+; GISEL64-NEXT: scratch_store_b32 off, v196, s33 offset:476
+; GISEL64-NEXT: scratch_store_b32 off, v197, s33 offset:480
+; GISEL64-NEXT: scratch_store_b32 off, v198, s33 offset:484
+; GISEL64-NEXT: scratch_store_b32 off, v199, s33 offset:488
+; GISEL64-NEXT: scratch_store_b32 off, v208, s33 offset:492
+; GISEL64-NEXT: scratch_store_b32 off, v209, s33 offset:496
+; GISEL64-NEXT: scratch_store_b32 off, v210, s33 offset:500
+; GISEL64-NEXT: scratch_store_b32 off, v211, s33 offset:504
+; GISEL64-NEXT: scratch_store_b32 off, v212, s33 offset:508
+; GISEL64-NEXT: scratch_store_b32 off, v213, s33 offset:512
+; GISEL64-NEXT: scratch_store_b32 off, v214, s33 offset:516
+; GISEL64-NEXT: scratch_store_b32 off, v215, s33 offset:520
+; GISEL64-NEXT: s_clause 0xf
+; GISEL64-NEXT: scratch_store_b32 off, v224, s33 offset:524
+; GISEL64-NEXT: scratch_store_b32 off, v225, s33 offset:528
+; GISEL64-NEXT: scratch_store_b32 off, v226, s33 offset:532
+; GISEL64-NEXT: scratch_store_b32 off, v227, s33 offset:536
+; GISEL64-NEXT: scratch_store_b32 off, v228, s33 offset:540
+; GISEL64-NEXT: scratch_store_b32 off, v229, s33 offset:544
+; GISEL64-NEXT: scratch_store_b32 off, v230, s33 offset:548
+; GISEL64-NEXT: scratch_store_b32 off, v231, s33 offset:552
+; GISEL64-NEXT: scratch_store_b32 off, v240, s33 offset:556
+; GISEL64-NEXT: scratch_store_b32 off, v241, s33 offset:560
+; GISEL64-NEXT: scratch_store_b32 off, v242, s33 offset:564
+; GISEL64-NEXT: scratch_store_b32 off, v243, s33 offset:568
+; GISEL64-NEXT: scratch_store_b32 off, v244, s33 offset:572
+; GISEL64-NEXT: scratch_store_b32 off, v245, s33 offset:576
+; GISEL64-NEXT: scratch_store_b32 off, v246, s33 offset:580
+; GISEL64-NEXT: scratch_store_b32 off, v247, s33 offset:584
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: s_clause 0x2
+; GISEL64-NEXT: scratch_store_b32 off, v42, s33
+; GISEL64-NEXT: scratch_store_b32 off, v40, s33 offset:164
+; GISEL64-NEXT: scratch_store_b32 off, v41, s33 offset:168
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: v_writelane_b32 v42, s0, 4
+; GISEL64-NEXT: s_mov_b32 s0, callee@abs32@lo
+; GISEL64-NEXT: s_mov_b32 s1, callee@abs32@hi
+; GISEL64-NEXT: s_addk_co_i32 s32, 0x250
+; GISEL64-NEXT: v_mov_b32_e32 v40, v8
+; GISEL64-NEXT: v_writelane_b32 v42, s4, 0
+; GISEL64-NEXT: v_mov_b32_e32 v41, v9
+; GISEL64-NEXT: v_writelane_b32 v42, s5, 1
+; GISEL64-NEXT: v_writelane_b32 v42, s30, 2
+; GISEL64-NEXT: v_writelane_b32 v42, s31, 3
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL64-NEXT: flat_store_b32 v[40:41], v0
+; GISEL64-NEXT: v_readlane_b32 s31, v42, 3
+; GISEL64-NEXT: v_readlane_b32 s30, v42, 2
+; GISEL64-NEXT: v_readlane_b32 s5, v42, 1
+; GISEL64-NEXT: v_readlane_b32 s4, v42, 0
+; GISEL64-NEXT: v_readlane_b32 s0, v42, 4
+; GISEL64-NEXT: s_clause 0x2
+; GISEL64-NEXT: scratch_load_b32 v42, off, s33
+; GISEL64-NEXT: scratch_load_b32 v40, off, s33 offset:164
+; GISEL64-NEXT: scratch_load_b32 v41, off, s33 offset:168
+; GISEL64-NEXT: s_mov_b32 s32, s33
+; GISEL64-NEXT: s_xor_b64 exec, s[4:5], -1
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_load_b32 v0, off, s33 offset:4
+; GISEL64-NEXT: scratch_load_b32 v1, off, s33 offset:8
+; GISEL64-NEXT: scratch_load_b32 v2, off, s33 offset:12
+; GISEL64-NEXT: scratch_load_b32 v3, off, s33 offset:16
+; GISEL64-NEXT: scratch_load_b32 v4, off, s33 offset:20
+; GISEL64-NEXT: scratch_load_b32 v5, off, s33 offset:24
+; GISEL64-NEXT: scratch_load_b32 v6, off, s33 offset:28
+; GISEL64-NEXT: scratch_load_b32 v7, off, s33 offset:32
+; GISEL64-NEXT: scratch_load_b32 v8, off, s33 offset:36
+; GISEL64-NEXT: scratch_load_b32 v9, off, s33 offset:40
+; GISEL64-NEXT: scratch_load_b32 v10, off, s33 offset:44
+; GISEL64-NEXT: scratch_load_b32 v11, off, s33 offset:48
+; GISEL64-NEXT: scratch_load_b32 v12, off, s33 offset:52
+; GISEL64-NEXT: scratch_load_b32 v13, off, s33 offset:56
+; GISEL64-NEXT: scratch_load_b32 v14, off, s33 offset:60
+; GISEL64-NEXT: scratch_load_b32 v15, off, s33 offset:64
+; GISEL64-NEXT: scratch_load_b32 v16, off, s33 offset:68
+; GISEL64-NEXT: scratch_load_b32 v17, off, s33 offset:72
+; GISEL64-NEXT: scratch_load_b32 v18, off, s33 offset:76
+; GISEL64-NEXT: scratch_load_b32 v19, off, s33 offset:80
+; GISEL64-NEXT: scratch_load_b32 v20, off, s33 offset:84
+; GISEL64-NEXT: scratch_load_b32 v21, off, s33 offset:88
+; GISEL64-NEXT: scratch_load_b32 v22, off, s33 offset:92
+; GISEL64-NEXT: scratch_load_b32 v23, off, s33 offset:96
+; GISEL64-NEXT: scratch_load_b32 v24, off, s33 offset:100
+; GISEL64-NEXT: scratch_load_b32 v25, off, s33 offset:104
+; GISEL64-NEXT: scratch_load_b32 v26, off, s33 offset:108
+; GISEL64-NEXT: scratch_load_b32 v27, off, s33 offset:112
+; GISEL64-NEXT: scratch_load_b32 v28, off, s33 offset:116
+; GISEL64-NEXT: scratch_load_b32 v29, off, s33 offset:120
+; GISEL64-NEXT: scratch_load_b32 v30, off, s33 offset:124
+; GISEL64-NEXT: scratch_load_b32 v31, off, s33 offset:128
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_load_b32 v32, off, s33 offset:132
+; GISEL64-NEXT: scratch_load_b32 v33, off, s33 offset:136
+; GISEL64-NEXT: scratch_load_b32 v34, off, s33 offset:140
+; GISEL64-NEXT: scratch_load_b32 v35, off, s33 offset:144
+; GISEL64-NEXT: scratch_load_b32 v36, off, s33 offset:148
+; GISEL64-NEXT: scratch_load_b32 v37, off, s33 offset:152
+; GISEL64-NEXT: scratch_load_b32 v38, off, s33 offset:156
+; GISEL64-NEXT: scratch_load_b32 v39, off, s33 offset:160
+; GISEL64-NEXT: scratch_load_b32 v48, off, s33 offset:172
+; GISEL64-NEXT: scratch_load_b32 v49, off, s33 offset:176
+; GISEL64-NEXT: scratch_load_b32 v50, off, s33 offset:180
+; GISEL64-NEXT: scratch_load_b32 v51, off, s33 offset:184
+; GISEL64-NEXT: scratch_load_b32 v52, off, s33 offset:188
+; GISEL64-NEXT: scratch_load_b32 v53, off, s33 offset:192
+; GISEL64-NEXT: scratch_load_b32 v54, off, s33 offset:196
+; GISEL64-NEXT: scratch_load_b32 v55, off, s33 offset:200
+; GISEL64-NEXT: scratch_load_b32 v64, off, s33 offset:204
+; GISEL64-NEXT: scratch_load_b32 v65, off, s33 offset:208
+; GISEL64-NEXT: scratch_load_b32 v66, off, s33 offset:212
+; GISEL64-NEXT: scratch_load_b32 v67, off, s33 offset:216
+; GISEL64-NEXT: scratch_load_b32 v68, off, s33 offset:220
+; GISEL64-NEXT: scratch_load_b32 v69, off, s33 offset:224
+; GISEL64-NEXT: scratch_load_b32 v70, off, s33 offset:228
+; GISEL64-NEXT: scratch_load_b32 v71, off, s33 offset:232
+; GISEL64-NEXT: scratch_load_b32 v80, off, s33 offset:236
+; GISEL64-NEXT: scratch_load_b32 v81, off, s33 offset:240
+; GISEL64-NEXT: scratch_load_b32 v82, off, s33 offset:244
+; GISEL64-NEXT: scratch_load_b32 v83, off, s33 offset:248
+; GISEL64-NEXT: scratch_load_b32 v84, off, s33 offset:252
+; GISEL64-NEXT: scratch_load_b32 v85, off, s33 offset:256
+; GISEL64-NEXT: scratch_load_b32 v86, off, s33 offset:260
+; GISEL64-NEXT: scratch_load_b32 v87, off, s33 offset:264
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_load_b32 v96, off, s33 offset:268
+; GISEL64-NEXT: scratch_load_b32 v97, off, s33 offset:272
+; GISEL64-NEXT: scratch_load_b32 v98, off, s33 offset:276
+; GISEL64-NEXT: scratch_load_b32 v99, off, s33 offset:280
+; GISEL64-NEXT: scratch_load_b32 v100, off, s33 offset:284
+; GISEL64-NEXT: scratch_load_b32 v101, off, s33 offset:288
+; GISEL64-NEXT: scratch_load_b32 v102, off, s33 offset:292
+; GISEL64-NEXT: scratch_load_b32 v103, off, s33 offset:296
+; GISEL64-NEXT: scratch_load_b32 v112, off, s33 offset:300
+; GISEL64-NEXT: scratch_load_b32 v113, off, s33 offset:304
+; GISEL64-NEXT: scratch_load_b32 v114, off, s33 offset:308
+; GISEL64-NEXT: scratch_load_b32 v115, off, s33 offset:312
+; GISEL64-NEXT: scratch_load_b32 v116, off, s33 offset:316
+; GISEL64-NEXT: scratch_load_b32 v117, off, s33 offset:320
+; GISEL64-NEXT: scratch_load_b32 v118, off, s33 offset:324
+; GISEL64-NEXT: scratch_load_b32 v119, off, s33 offset:328
+; GISEL64-NEXT: scratch_load_b32 v128, off, s33 offset:332
+; GISEL64-NEXT: scratch_load_b32 v129, off, s33 offset:336
+; GISEL64-NEXT: scratch_load_b32 v130, off, s33 offset:340
+; GISEL64-NEXT: scratch_load_b32 v131, off, s33 offset:344
+; GISEL64-NEXT: scratch_load_b32 v132, off, s33 offset:348
+; GISEL64-NEXT: scratch_load_b32 v133, off, s33 offset:352
+; GISEL64-NEXT: scratch_load_b32 v134, off, s33 offset:356
+; GISEL64-NEXT: scratch_load_b32 v135, off, s33 offset:360
+; GISEL64-NEXT: scratch_load_b32 v144, off, s33 offset:364
+; GISEL64-NEXT: scratch_load_b32 v145, off, s33 offset:368
+; GISEL64-NEXT: scratch_load_b32 v146, off, s33 offset:372
+; GISEL64-NEXT: scratch_load_b32 v147, off, s33 offset:376
+; GISEL64-NEXT: scratch_load_b32 v148, off, s33 offset:380
+; GISEL64-NEXT: scratch_load_b32 v149, off, s33 offset:384
+; GISEL64-NEXT: scratch_load_b32 v150, off, s33 offset:388
+; GISEL64-NEXT: scratch_load_b32 v151, off, s33 offset:392
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_load_b32 v160, off, s33 offset:396
+; GISEL64-NEXT: scratch_load_b32 v161, off, s33 offset:400
+; GISEL64-NEXT: scratch_load_b32 v162, off, s33 offset:404
+; GISEL64-NEXT: scratch_load_b32 v163, off, s33 offset:408
+; GISEL64-NEXT: scratch_load_b32 v164, off, s33 offset:412
+; GISEL64-NEXT: scratch_load_b32 v165, off, s33 offset:416
+; GISEL64-NEXT: scratch_load_b32 v166, off, s33 offset:420
+; GISEL64-NEXT: scratch_load_b32 v167, off, s33 offset:424
+; GISEL64-NEXT: scratch_load_b32 v176, off, s33 offset:428
+; GISEL64-NEXT: scratch_load_b32 v177, off, s33 offset:432
+; GISEL64-NEXT: scratch_load_b32 v178, off, s33 offset:436
+; GISEL64-NEXT: scratch_load_b32 v179, off, s33 offset:440
+; GISEL64-NEXT: scratch_load_b32 v180, off, s33 offset:444
+; GISEL64-NEXT: scratch_load_b32 v181, off, s33 offset:448
+; GISEL64-NEXT: scratch_load_b32 v182, off, s33 offset:452
+; GISEL64-NEXT: scratch_load_b32 v183, off, s33 offset:456
+; GISEL64-NEXT: scratch_load_b32 v192, off, s33 offset:460
+; GISEL64-NEXT: scratch_load_b32 v193, off, s33 offset:464
+; GISEL64-NEXT: scratch_load_b32 v194, off, s33 offset:468
+; GISEL64-NEXT: scratch_load_b32 v195, off, s33 offset:472
+; GISEL64-NEXT: scratch_load_b32 v196, off, s33 offset:476
+; GISEL64-NEXT: scratch_load_b32 v197, off, s33 offset:480
+; GISEL64-NEXT: scratch_load_b32 v198, off, s33 offset:484
+; GISEL64-NEXT: scratch_load_b32 v199, off, s33 offset:488
+; GISEL64-NEXT: scratch_load_b32 v208, off, s33 offset:492
+; GISEL64-NEXT: scratch_load_b32 v209, off, s33 offset:496
+; GISEL64-NEXT: scratch_load_b32 v210, off, s33 offset:500
+; GISEL64-NEXT: scratch_load_b32 v211, off, s33 offset:504
+; GISEL64-NEXT: scratch_load_b32 v212, off, s33 offset:508
+; GISEL64-NEXT: scratch_load_b32 v213, off, s33 offset:512
+; GISEL64-NEXT: scratch_load_b32 v214, off, s33 offset:516
+; GISEL64-NEXT: scratch_load_b32 v215, off, s33 offset:520
+; GISEL64-NEXT: s_clause 0xf
+; GISEL64-NEXT: scratch_load_b32 v224, off, s33 offset:524
+; GISEL64-NEXT: scratch_load_b32 v225, off, s33 offset:528
+; GISEL64-NEXT: scratch_load_b32 v226, off, s33 offset:532
+; GISEL64-NEXT: scratch_load_b32 v227, off, s33 offset:536
+; GISEL64-NEXT: scratch_load_b32 v228, off, s33 offset:540
+; GISEL64-NEXT: scratch_load_b32 v229, off, s33 offset:544
+; GISEL64-NEXT: scratch_load_b32 v230, off, s33 offset:548
+; GISEL64-NEXT: scratch_load_b32 v231, off, s33 offset:552
+; GISEL64-NEXT: scratch_load_b32 v240, off, s33 offset:556
+; GISEL64-NEXT: scratch_load_b32 v241, off, s33 offset:560
+; GISEL64-NEXT: scratch_load_b32 v242, off, s33 offset:564
+; GISEL64-NEXT: scratch_load_b32 v243, off, s33 offset:568
+; GISEL64-NEXT: scratch_load_b32 v244, off, s33 offset:572
+; GISEL64-NEXT: scratch_load_b32 v245, off, s33 offset:576
+; GISEL64-NEXT: scratch_load_b32 v246, off, s33 offset:580
+; GISEL64-NEXT: scratch_load_b32 v247, off, s33 offset:584
+; GISEL64-NEXT: s_mov_b64 exec, s[4:5]
+; GISEL64-NEXT: s_mov_b32 s33, s0
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @callee, <8 x float> %x) convergent
+ store float %ret, ptr %p
+ ret void
+}
diff --git a/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-call-whole-wave.ll b/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-call-whole-wave.ll
new file mode 100644
index 0000000000000..12cc0d513a40b
--- /dev/null
+++ b/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-call-whole-wave.ll
@@ -0,0 +1,53 @@
+; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
+
+define amdgpu_cs void @indirect(ptr %fn, i32 %x) {
+ ; CHECK: Indirect whole wave calls are not allowed
+ %whatever = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr %fn, i32 %x)
+ ret void
+}
+
+declare amdgpu_gfx_whole_wave void @variadic_callee(i1 %active, i32 %x, ...)
+
+define amdgpu_cs void @variadic(ptr %fn, i32 %x) {
+ ; CHECK: Variadic whole wave calls are not allowed
+ %whatever = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @variadic_callee, i32 %x)
+ ret void
+}
+
+declare amdgpu_gfx void @bad_cc_callee(i1 %active, i32 %x)
+
+define amdgpu_cs void @bad_cc(i32 %x) {
+ ; CHECK: Callee must have the amdgpu_gfx_whole_wave calling convention
+ %whatever = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @bad_cc_callee, i32 %x)
+ ret void
+}
+
+declare amdgpu_gfx_whole_wave i32 @no_i1_callee(i32 %active, i32 %y, i32 %z)
+
+define amdgpu_cs void @no_i1(i32 %x) {
+ ; CHECK: Callee must have i1 as its first argument
+ %whatever = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @no_i1_callee, i32 %x, i32 0)
+ ret void
+}
+
+declare amdgpu_gfx_whole_wave i32 @good_callee(i1 %active, i32 %x, i32 inreg %y)
+
+define amdgpu_cs void @bad_args(i32 %x) {
+ ; CHECK: Call argument count must match callee argument count
+ %whatever.0 = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @good_callee, i32 %x)
+
+ ; CHECK: Argument types must match
+ %whatever.1 = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @good_callee, i32 %x, i64 inreg 0)
+
+ ; CHECK: Argument inreg attributes must match
+ %whatever.2 = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @good_callee, i32 %x, i32 0)
+
+ ret void
+}
+
+define amdgpu_cs void @no_direct_calls_to_whole_wave(i32 %x) {
+ ; CHECK: calling convention does not permit calls
+ %whatever = call amdgpu_gfx_whole_wave i32(i1, i32, i32) @good_callee(i1 poison, i32 %x, i32 inreg %x)
+
+ ret void
+}
>From cf3f8f2f5945a3d750c779db2f851add4295a8c1 Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Fri, 27 Jun 2025 12:56:56 +0200
Subject: [PATCH 2/3] Remove Verifier check that I moved to previous PR
---
.../Verifier/AMDGPU/intrinsic-amdgcn-call-whole-wave.ll | 7 -------
1 file changed, 7 deletions(-)
diff --git a/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-call-whole-wave.ll b/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-call-whole-wave.ll
index 12cc0d513a40b..dd656f9277402 100644
--- a/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-call-whole-wave.ll
+++ b/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-call-whole-wave.ll
@@ -44,10 +44,3 @@ define amdgpu_cs void @bad_args(i32 %x) {
ret void
}
-
-define amdgpu_cs void @no_direct_calls_to_whole_wave(i32 %x) {
- ; CHECK: calling convention does not permit calls
- %whatever = call amdgpu_gfx_whole_wave i32(i1, i32, i32) @good_callee(i1 poison, i32 %x, i32 inreg %x)
-
- ret void
-}
>From a67a2d4a6a05eb887d09ce728c7fa1f81ad4a85c Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Fri, 27 Jun 2025 13:14:00 +0200
Subject: [PATCH 3/3] Remove embarrassing cast
---
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index a704a76502b6d..9488cccf8fe5c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -1474,7 +1474,7 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Get the callee from the original instruction, so it doesn't look like
// this is an indirect call.
Info.Callee = MachineOperand::CreateGA(
- static_cast<GlobalValue *>(Info.CB->getOperand(0)), /*Offset=*/0);
+ cast<GlobalValue>(Info.CB->getOperand(0)), /*Offset=*/0);
Info.OrigArgs.erase(Info.OrigArgs.begin());
Info.IsVarArg = false;
break;
More information about the llvm-branch-commits
mailing list