[llvm] Revert "[AMDGPU] Intrinsic for launching whole wave functions" (PR #152286)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 6 03:24:00 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Diana Picus (rovka)
<details>
<summary>Changes</summary>
Reverts llvm/llvm-project#<!-- -->145859 because it broke a HIP test:
```
[34/59] Building CXX object External/HIP/CMakeFiles/TheNextWeek-hip-6.3.0.dir/workload/ray-tracing/TheNextWeek/main.cc.o
FAILED: External/HIP/CMakeFiles/TheNextWeek-hip-6.3.0.dir/workload/ray-tracing/TheNextWeek/main.cc.o
/home/botworker/bbot/clang-hip-vega20/botworker/clang-hip-vega20/llvm/bin/clang++ -DNDEBUG -O3 -DNDEBUG -w -Werror=date-time --rocm-path=/opt/botworker/llvm/External/hip/rocm-6.3.0 --offload-arch=gfx908 --offload-arch=gfx90a --offload-arch=gfx1030 --offload-arch=gfx1100 -xhip -mfma -MD -MT External/HIP/CMakeFiles/TheNextWeek-hip-6.3.0.dir/workload/ray-tracing/TheNextWeek/main.cc.o -MF External/HIP/CMakeFiles/TheNextWeek-hip-6.3.0.dir/workload/ray-tracing/TheNextWeek/main.cc.o.d -o External/HIP/CMakeFiles/TheNextWeek-hip-6.3.0.dir/workload/ray-tracing/TheNextWeek/main.cc.o -c /home/botworker/bbot/clang-hip-vega20/llvm-test-suite/External/HIP/workload/ray-tracing/TheNextWeek/main.cc
fatal error: error in backend: Cannot select: intrinsic %llvm.amdgcn.readfirstlane
```
---
Patch is 105.71 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/152286.diff
10 Files Affected:
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (-12)
- (modified) llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp (-1)
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (-37)
- (modified) llvm/lib/IR/Verifier.cpp (-30)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp (+3-16)
- (removed) llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll (-174)
- (modified) llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll (-26)
- (modified) llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll (-76)
- (modified) llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll (-1424)
- (removed) llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-call-whole-wave.ll (-46)
``````````diff
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 191ed5f523a74..90cfd8cedd51b 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2671,18 +2671,6 @@ def int_amdgcn_cs_chain:
],
[IntrConvergent, IntrNoReturn, ImmArg<ArgIndex<4>>]>;
-// Run a function with all the lanes enabled. Only direct calls are allowed. The
-// first argument is the callee, which must have the `amdgpu_gfx_whole_wave`
-// calling convention and must not be variadic. The remaining arguments to the
-// callee are taken from the arguments passed to the intrinsic. Lanes that are
-// inactive at the point of the call will receive poison. The return value is
-// the return value of the callee for the active lanes (there is no return
-// value in the inactive ones).
-def int_amdgcn_call_whole_wave:
- Intrinsic<[llvm_any_ty], // The return type of the callee.
- [llvm_anyptr_ty, // The callee.
- llvm_vararg_ty], // The arguments to the callee.
- [IntrConvergent]>;
//===----------------------------------------------------------------------===//
// CI+ Intrinsics
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 787543df1f0f0..bbfae570e1e1a 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2556,7 +2556,6 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
getOrCreateVReg(*ConstantInt::getTrue(CI.getType())));
return true;
case Intrinsic::amdgcn_cs_chain:
- case Intrinsic::amdgcn_call_whole_wave:
return translateCallBase(CI, MIRBuilder);
case Intrinsic::fptrunc_round: {
uint32_t Flags = MachineInstr::copyFlagsFromInstruction(CI);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index d5b904055e547..d0815e9f51822 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7984,43 +7984,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
HasTailCall = true;
return;
}
- case Intrinsic::amdgcn_call_whole_wave: {
- TargetLowering::ArgListTy Args;
-
- // The first argument is the callee. Skip it when assembling the call args.
- TargetLowering::ArgListEntry Arg;
- for (unsigned Idx = 1; Idx < I.arg_size(); ++Idx) {
- Arg.Node = getValue(I.getArgOperand(Idx));
- Arg.Ty = I.getArgOperand(Idx)->getType();
- Arg.setAttributes(&I, Idx);
- Args.push_back(Arg);
- }
-
- SDValue ConvControlToken;
- if (auto Bundle = I.getOperandBundle(LLVMContext::OB_convergencectrl)) {
- auto *Token = Bundle->Inputs[0].get();
- ConvControlToken = getValue(Token);
- }
-
- TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(getCurSDLoc())
- .setChain(getRoot())
- .setCallee(CallingConv::AMDGPU_Gfx_WholeWave, I.getType(),
- getValue(I.getArgOperand(0)), std::move(Args))
- .setTailCall(false)
- .setIsPreallocated(
- I.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0)
- .setConvergent(I.isConvergent())
- .setConvergenceControlToken(ConvControlToken);
- CLI.CB = &I;
-
- std::pair<SDValue, SDValue> Result =
- lowerInvokable(CLI, /*EHPadBB=*/nullptr);
-
- if (Result.first.getNode())
- setValue(&I, Result.first);
- return;
- }
case Intrinsic::ptrmask: {
SDValue Ptr = getValue(I.getOperand(0));
SDValue Mask = getValue(I.getOperand(1));
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index f3f0ae5233977..ca3f148f881a4 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6612,36 +6612,6 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
"Value for inactive lanes must be a VGPR function argument", &Call);
break;
}
- case Intrinsic::amdgcn_call_whole_wave: {
- auto F = dyn_cast<Function>(Call.getArgOperand(0));
- Check(F, "Indirect whole wave calls are not allowed", &Call);
-
- CallingConv::ID CC = F->getCallingConv();
- Check(CC == CallingConv::AMDGPU_Gfx_WholeWave,
- "Callee must have the amdgpu_gfx_whole_wave calling convention",
- &Call);
-
- Check(!F->isVarArg(), "Variadic whole wave calls are not allowed", &Call);
-
- Check(Call.arg_size() == F->arg_size(),
- "Call argument count must match callee argument count", &Call);
-
- // The first argument of the call is the callee, and the first argument of
- // the callee is the active mask. The rest of the arguments must match.
- Check(F->arg_begin()->getType()->isIntegerTy(1),
- "Callee must have i1 as its first argument", &Call);
- for (auto [CallArg, FuncArg] :
- drop_begin(zip_equal(Call.args(), F->args()))) {
- Check(CallArg->getType() == FuncArg.getType(),
- "Argument types must match", &Call);
-
- // Check that inreg attributes match between call site and function
- Check(Call.paramHasAttr(FuncArg.getArgNo(), Attribute::InReg) ==
- FuncArg.hasInRegAttr(),
- "Argument inreg attributes must match", &Call);
- }
- break;
- }
case Intrinsic::amdgcn_s_prefetch_data: {
Check(
AMDGPU::isFlatGlobalAddrSpace(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 3ff6e22fbb943..3d8d274f06246 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -1464,22 +1464,9 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const {
if (Function *F = Info.CB->getCalledFunction())
if (F->isIntrinsic()) {
- switch (F->getIntrinsicID()) {
- case Intrinsic::amdgcn_cs_chain:
- return lowerChainCall(MIRBuilder, Info);
- case Intrinsic::amdgcn_call_whole_wave:
- Info.CallConv = CallingConv::AMDGPU_Gfx_WholeWave;
-
- // Get the callee from the original instruction, so it doesn't look like
- // this is an indirect call.
- Info.Callee = MachineOperand::CreateGA(
- cast<GlobalValue>(Info.CB->getOperand(0)), /*Offset=*/0);
- Info.OrigArgs.erase(Info.OrigArgs.begin());
- Info.IsVarArg = false;
- break;
- default:
- llvm_unreachable("Unexpected intrinsic call");
- }
+ assert(F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain &&
+ "Unexpected intrinsic");
+ return lowerChainCall(MIRBuilder, Info);
}
if (Info.IsVarArg) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
deleted file mode 100644
index eac0767c88d80..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
+++ /dev/null
@@ -1,174 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=DAGISEL
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GISEL
-
-declare amdgpu_gfx_whole_wave i32 @good_callee(i1 %active, i32 %x, i32 %y, i32 inreg %c)
-
-define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr) {
-; DAGISEL-LABEL: basic_test:
-; DAGISEL: ; %bb.0:
-; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; DAGISEL-NEXT: s_wait_expcnt 0x0
-; DAGISEL-NEXT: s_wait_samplecnt 0x0
-; DAGISEL-NEXT: s_wait_bvhcnt 0x0
-; DAGISEL-NEXT: s_wait_kmcnt 0x0
-; DAGISEL-NEXT: s_mov_b32 s0, s33
-; DAGISEL-NEXT: s_mov_b32 s33, s32
-; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
-; DAGISEL-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
-; DAGISEL-NEXT: s_wait_alu 0xfffe
-; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
-; DAGISEL-NEXT: v_writelane_b32 v42, s0, 2
-; DAGISEL-NEXT: s_clause 0x1
-; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 offset:4
-; DAGISEL-NEXT: scratch_store_b32 off, v41, s33
-; DAGISEL-NEXT: v_dual_mov_b32 v41, v2 :: v_dual_mov_b32 v40, v1
-; DAGISEL-NEXT: v_add_nc_u32_e32 v1, 13, v0
-; DAGISEL-NEXT: v_writelane_b32 v42, s30, 0
; DAGISEL-NEXT: s_mov_b32 s1, good_callee@abs32@hi
; DAGISEL-NEXT: s_mov_b32 s0, good_callee@abs32@lo
-; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16
-; DAGISEL-NEXT: v_writelane_b32 v42, s31, 1
-; DAGISEL-NEXT: s_wait_alu 0xfffe
-; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; DAGISEL-NEXT: global_store_b32 v[40:41], v0, off
-; DAGISEL-NEXT: s_clause 0x1
-; DAGISEL-NEXT: scratch_load_b32 v41, off, s33
-; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
-; DAGISEL-NEXT: v_readlane_b32 s31, v42, 1
-; DAGISEL-NEXT: v_readlane_b32 s30, v42, 0
-; DAGISEL-NEXT: s_mov_b32 s32, s33
-; DAGISEL-NEXT: v_readlane_b32 s0, v42, 2
-; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
-; DAGISEL-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
-; DAGISEL-NEXT: s_wait_alu 0xfffe
-; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
-; DAGISEL-NEXT: s_mov_b32 s33, s0
-; DAGISEL-NEXT: s_wait_loadcnt 0x0
-; DAGISEL-NEXT: s_wait_alu 0xfffe
-; DAGISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: basic_test:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GISEL-NEXT: s_wait_expcnt 0x0
-; GISEL-NEXT: s_wait_samplecnt 0x0
-; GISEL-NEXT: s_wait_bvhcnt 0x0
-; GISEL-NEXT: s_wait_kmcnt 0x0
-; GISEL-NEXT: s_mov_b32 s0, s33
-; GISEL-NEXT: s_mov_b32 s33, s32
-; GISEL-NEXT: s_or_saveexec_b32 s1, -1
-; GISEL-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
-; GISEL-NEXT: s_wait_alu 0xfffe
-; GISEL-NEXT: s_mov_b32 exec_lo, s1
-; GISEL-NEXT: v_writelane_b32 v42, s0, 2
-; GISEL-NEXT: s_clause 0x1
-; GISEL-NEXT: scratch_store_b32 off, v40, s33 offset:4
-; GISEL-NEXT: scratch_store_b32 off, v41, s33
-; GISEL-NEXT: v_dual_mov_b32 v40, v1 :: v_dual_mov_b32 v41, v2
-; GISEL-NEXT: v_add_nc_u32_e32 v1, 13, v0
-; GISEL-NEXT: v_writelane_b32 v42, s30, 0
; GISEL-NEXT: s_mov_b32 s0, good_callee@abs32@lo
; GISEL-NEXT: s_mov_b32 s1, good_callee@abs32@hi
-; GISEL-NEXT: s_add_co_i32 s32, s32, 16
-; GISEL-NEXT: v_writelane_b32 v42, s31, 1
-; GISEL-NEXT: s_wait_alu 0xfffe
-; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GISEL-NEXT: global_store_b32 v[40:41], v0, off
-; GISEL-NEXT: s_clause 0x1
-; GISEL-NEXT: scratch_load_b32 v41, off, s33
-; GISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
-; GISEL-NEXT: v_readlane_b32 s31, v42, 1
-; GISEL-NEXT: v_readlane_b32 s30, v42, 0
-; GISEL-NEXT: s_mov_b32 s32, s33
-; GISEL-NEXT: v_readlane_b32 s0, v42, 2
-; GISEL-NEXT: s_or_saveexec_b32 s1, -1
-; GISEL-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
-; GISEL-NEXT: s_wait_alu 0xfffe
-; GISEL-NEXT: s_mov_b32 exec_lo, s1
-; GISEL-NEXT: s_mov_b32 s33, s0
-; GISEL-NEXT: s_wait_loadcnt 0x0
-; GISEL-NEXT: s_wait_alu 0xfffe
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %y = add i32 %x, 13
- %ret = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @good_callee, i32 %x, i32 %y, i32 inreg %c)
- store i32 %ret, ptr addrspace(1) %ptr
- ret void
-}
-
-declare amdgpu_gfx_whole_wave void @void_callee(i1 %active, i32 %x)
-
-define amdgpu_gfx void @ret_void(i32 %x) {
-; DAGISEL-LABEL: ret_void:
-; DAGISEL: ; %bb.0:
-; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; DAGISEL-NEXT: s_wait_expcnt 0x0
-; DAGISEL-NEXT: s_wait_samplecnt 0x0
-; DAGISEL-NEXT: s_wait_bvhcnt 0x0
-; DAGISEL-NEXT: s_wait_kmcnt 0x0
-; DAGISEL-NEXT: s_mov_b32 s0, s33
-; DAGISEL-NEXT: s_mov_b32 s33, s32
-; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
-; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; DAGISEL-NEXT: s_wait_alu 0xfffe
-; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
-; DAGISEL-NEXT: v_writelane_b32 v40, s0, 2
; DAGISEL-NEXT: s_mov_b32 s1, void_callee@abs32@hi
; DAGISEL-NEXT: s_mov_b32 s0, void_callee@abs32@lo
-; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16
-; DAGISEL-NEXT: v_writelane_b32 v40, s30, 0
-; DAGISEL-NEXT: v_writelane_b32 v40, s31, 1
-; DAGISEL-NEXT: s_wait_alu 0xfffe
-; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; DAGISEL-NEXT: v_readlane_b32 s31, v40, 1
-; DAGISEL-NEXT: v_readlane_b32 s30, v40, 0
-; DAGISEL-NEXT: s_mov_b32 s32, s33
-; DAGISEL-NEXT: v_readlane_b32 s0, v40, 2
-; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
-; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; DAGISEL-NEXT: s_wait_alu 0xfffe
-; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
-; DAGISEL-NEXT: s_mov_b32 s33, s0
-; DAGISEL-NEXT: s_wait_loadcnt 0x0
-; DAGISEL-NEXT: s_wait_alu 0xfffe
-; DAGISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: ret_void:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GISEL-NEXT: s_wait_expcnt 0x0
-; GISEL-NEXT: s_wait_samplecnt 0x0
-; GISEL-NEXT: s_wait_bvhcnt 0x0
-; GISEL-NEXT: s_wait_kmcnt 0x0
-; GISEL-NEXT: s_mov_b32 s0, s33
-; GISEL-NEXT: s_mov_b32 s33, s32
-; GISEL-NEXT: s_or_saveexec_b32 s1, -1
-; GISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GISEL-NEXT: s_wait_alu 0xfffe
-; GISEL-NEXT: s_mov_b32 exec_lo, s1
-; GISEL-NEXT: v_writelane_b32 v40, s0, 2
; GISEL-NEXT: s_mov_b32 s0, void_callee@abs32@lo
; GISEL-NEXT: s_mov_b32 s1, void_callee@abs32@hi
-; GISEL-NEXT: s_add_co_i32 s32, s32, 16
-; GISEL-NEXT: v_writelane_b32 v40, s30, 0
-; GISEL-NEXT: v_writelane_b32 v40, s31, 1
-; GISEL-NEXT: s_wait_alu 0xfffe
-; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-NEXT: v_readlane_b32 s31, v40, 1
-; GISEL-NEXT: v_readlane_b32 s30, v40, 0
-; GISEL-NEXT: s_mov_b32 s32, s33
-; GISEL-NEXT: v_readlane_b32 s0, v40, 2
-; GISEL-NEXT: s_or_saveexec_b32 s1, -1
-; GISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GISEL-NEXT: s_wait_alu 0xfffe
-; GISEL-NEXT: s_mov_b32 exec_lo, s1
-; GISEL-NEXT: s_mov_b32 s33, s0
-; GISEL-NEXT: s_wait_loadcnt 0x0
-; GISEL-NEXT: s_wait_alu 0xfffe
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- call void(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @void_callee, i32 %x)
- ret void
-}
-
diff --git a/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll
index 17c8010bcbe05..8fc5afb155573 100644
--- a/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll
@@ -101,29 +101,3 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
%ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false)
ret i64 %ret
}
-
-declare amdgpu_gfx_whole_wave i32 @callee(i1 %active, i32 %x)
-
-; Make sure we don't pass the first argument (i1).
-define amdgpu_cs void @call(i32 %x, ptr %p) {
- ; CHECK-LABEL: name: call
- ; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
- ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
- ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
- ; CHECK-NEXT: [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
- ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32)
- ; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV1]](p0), @callee, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
- ; CHECK-NEXT: G_STORE [[COPY3]](s32), [[MV]](p0) :: (store (s32) into %ir.p)
- ; CHECK-NEXT: S_ENDPGM 0
- %ret = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @callee, i32 %x) convergent
- store i32 %ret, ptr %p
- ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll
index 69809b115e037..3450d63ff7b4a 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll
@@ -189,79 +189,3 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
ret i64 %ret
}
-declare amdgpu_gfx_whole_wave i32 @callee(i1 %active, <8 x i32> %x)
-
-; Make sure we don't pass the first argument (i1).
-define amdgpu_cs void @call(<8 x i32> %x, ptr %p) {
- ; DAGISEL-LABEL: name: call
- ; DAGISEL: bb.0 (%ir-block.0):
- ; DAGISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
- ; DAGISEL-NEXT: {{ $}}
- ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr9
- ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr7
- ; DAGISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr6
- ; DAGISEL-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; DAGISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; DAGISEL-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; DAGISEL-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; DAGISEL-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; DAGISEL-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; DAGISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
- ; DAGISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
- ; DAGISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
- ; DAGISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
- ; DAGISEL-NEXT: $vgpr0 = COPY [[COPY9]]
- ; DAGISEL-NEXT: $vgpr1 = COPY [[COPY8]]
- ; DAGISEL-NEXT: $vgpr2 = COPY [[COPY7]]
- ; DAGISEL-NEXT: $vgpr3 = COPY [[COPY6]]
- ; DAGISEL-NEXT: $vgpr4 = COPY [[COPY5]]
- ; DAGISEL-NEXT: $vgpr5 = COPY [[COPY4]]
- ; DAGISEL-NEXT: $vgpr6 = COPY [[COPY3]]
- ; DAGISEL-NEXT: $vgpr7 = COPY [[COPY2]]
- ; DAGISEL-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], @callee, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit-def $vgpr0
- ; DAGISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
- ; DAGISEL-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; DAGISEL-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; DAGISEL-NEXT: FLAT_STORE_DWORD killed [[COPY11]], [[COPY10]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.p)
- ; DAGISEL-NEXT: S_ENDPGM 0
- ;
- ; GISEL-LABEL: name: call
- ; GISEL: bb.1 (%ir-block.0):
- ; GISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
- ; GISEL-NEXT: {{ $}}
- ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GISEL-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; GISEL-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
- ; GISEL-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr7
- ; GISEL-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; GISEL-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr9
- ; GISEL-NEXT: [[REG_SEQUENCE:%[...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/152286
More information about the llvm-commits
mailing list