[llvm] Revert "Reland "[amdgpu] Add llvm.amdgcn.init.whole.wave intrinsic" (#108054)"" (PR #108341)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 12 00:54:43 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Diana Picus (rovka)
<details>
<summary>Changes</summary>
Reverts llvm/llvm-project#<!-- -->108173
The test si-init-whole-wave.mir crashes on some buildbots, even though it passed both locally (with sanitizers enabled) and in the pre-merge tests. The cause is still being investigated.
---
Patch is 84.09 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/108341.diff
22 Files Affected:
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (-10)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (-5)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (-10)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h (-1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h (-5)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (-1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td (-1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (-3)
- (modified) llvm/lib/Target/AMDGPU/SIFrameLowering.cpp (+4-8)
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (-10)
- (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h (-3)
- (modified) llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp (+1-29)
- (removed) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll (-1127)
- (removed) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll (-140)
- (modified) llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir (-29)
- (removed) llvm/test/CodeGen/AMDGPU/si-init-whole-wave.mir (-133)
- (modified) llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll (-2)
- (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll (-1)
- (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll (-1)
- (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll (-1)
- (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir (-4)
- (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll (-4)
``````````diff
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 4cd32a0502c66d..e20c26eb837875 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -208,16 +208,6 @@ def int_amdgcn_init_exec_from_input : Intrinsic<[],
[IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback,
IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
-// Sets the function into whole-wave-mode and returns whether the lane was
-// active when entering the function. A branch depending on this return will
-// revert the EXEC mask to what it was when entering the function, thus
-// resulting in a no-op. This pattern is used to optimize branches when function
-// tails need to be run in whole-wave-mode. It may also have other consequences
-// (mostly related to WWM CSR handling) that differentiate it from using
-// a plain `amdgcn.init.exec -1`.
-def int_amdgcn_init_whole_wave : Intrinsic<[llvm_i1_ty], [], [
- IntrHasSideEffects, IntrNoMem, IntrConvergent]>;
-
def int_amdgcn_wavefrontsize :
ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 380dc7d3312f32..0daaf6b6576030 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2738,11 +2738,6 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
SelectDSBvhStackIntrinsic(N);
return;
- case Intrinsic::amdgcn_init_whole_wave:
- CurDAG->getMachineFunction()
- .getInfo<SIMachineFunctionInfo>()
- ->setInitWholeWave();
- break;
}
SelectCode(N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 53085d423cefb8..4dfd3f087c1ae4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1772,14 +1772,6 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
-bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
- MachineFunction *MF = MI.getParent()->getParent();
- SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
-
- MFInfo->setInitWholeWave();
- return selectImpl(MI, *CoverageInfo);
-}
-
bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
if (TM.getOptLevel() > CodeGenOptLevel::None) {
unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
@@ -2107,8 +2099,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
return selectDSAppendConsume(I, true);
case Intrinsic::amdgcn_ds_consume:
return selectDSAppendConsume(I, false);
- case Intrinsic::amdgcn_init_whole_wave:
- return selectInitWholeWave(I);
case Intrinsic::amdgcn_s_barrier:
return selectSBarrier(I);
case Intrinsic::amdgcn_raw_buffer_load_lds:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index df39ecbd61bce6..068db5c1c14496 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -120,7 +120,6 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
- bool selectInitWholeWave(MachineInstr &MI) const;
bool selectSBarrier(MachineInstr &MI) const;
bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index b1022e48b8d34f..7efb7f825348e3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -67,8 +67,6 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
// Kernel may need limited waves per EU for better performance.
bool WaveLimiter = false;
- bool HasInitWholeWave = false;
-
public:
AMDGPUMachineFunction(const Function &F, const AMDGPUSubtarget &ST);
@@ -111,9 +109,6 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
return WaveLimiter;
}
- bool hasInitWholeWave() const { return HasInitWholeWave; }
- void setInitWholeWave() { HasInitWholeWave = true; }
-
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV) {
return allocateLDSGlobal(DL, GV, DynLDSAlign);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index f2c9619cb8276a..46d98cad963bc3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4997,7 +4997,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
break;
}
- case Intrinsic::amdgcn_init_whole_wave:
case Intrinsic::amdgcn_live_mask: {
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
break;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 2cd5fb2b94285c..95c4859674ecc4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -329,7 +329,6 @@ def : SourceOfDivergence<int_amdgcn_mov_dpp>;
def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
def : SourceOfDivergence<int_amdgcn_update_dpp>;
def : SourceOfDivergence<int_amdgcn_writelane>;
-def : SourceOfDivergence<int_amdgcn_init_whole_wave>;
foreach intr = AMDGPUMFMAIntrinsics908 in
def : SourceOfDivergence<intr>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f860b139945122..55d0de59bc49a9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1740,9 +1740,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
? DenormalMode::IEEE
: DenormalMode::PreserveSign;
- if (YamlMFI.HasInitWholeWave)
- MFI->setInitWholeWave();
-
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index dfdc7ad32b00c7..8c951105101d96 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1343,14 +1343,10 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
// Allocate spill slots for WWM reserved VGPRs.
// For chain functions, we only need to do this if we have calls to
- // llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
- // chain functions do not return) and the function did not contain a call to
- // llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
- // when entering the function).
- bool IsChainWithoutRestores =
- FuncInfo->isChainFunction() &&
- (!MF.getFrameInfo().hasTailCall() || FuncInfo->hasInitWholeWave());
- if (!FuncInfo->isEntryFunction() && !IsChainWithoutRestores) {
+ // llvm.amdgcn.cs.chain.
+ bool IsChainWithoutCalls =
+ FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
+ if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
for (Register Reg : FuncInfo->getWWMReservedRegs()) {
const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9afb29d95abd7d..284be72886ccef 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -570,16 +570,6 @@ def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
let Defs = [EXEC];
}
-// Sets EXEC to all lanes and returns the previous EXEC.
-def SI_INIT_WHOLE_WAVE : SPseudoInstSI <
- (outs SReg_1:$dst), (ins),
- [(set i1:$dst, (int_amdgcn_init_whole_wave))]> {
- let Defs = [EXEC];
- let Uses = [EXEC];
-
- let isConvergent = 1;
-}
-
// Return for returning shaders to a shader variant epilog.
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
(outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index aff0b34947d688..4cc60f50978996 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -295,8 +295,6 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
StringValue SGPRForEXECCopy;
StringValue LongBranchReservedReg;
- bool HasInitWholeWave = false;
-
SIMachineFunctionInfo() = default;
SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
const TargetRegisterInfo &TRI,
@@ -344,7 +342,6 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
StringValue()); // Don't print out when it's empty.
YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
StringValue());
- YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
}
};
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index ef6c92dfa9b9f2..8cedc34ca40de7 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -586,8 +586,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
KillInstrs.push_back(&MI);
BBI.NeedsLowering = true;
} else if (Opcode == AMDGPU::SI_INIT_EXEC ||
- Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
- Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
+ Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
InitExecInstrs.push_back(&MI);
} else if (WQMOutputs) {
// The function is in machine SSA form, which means that physical
@@ -1572,33 +1571,6 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
MachineBasicBlock *MBB = MI.getParent();
bool IsWave32 = ST->isWave32();
- if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
- assert(MBB == &MBB->getParent()->front() &&
- "init whole wave not in entry block");
- Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());
- MachineInstr *SaveExec =
- BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
- TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
- : AMDGPU::S_OR_SAVEEXEC_B64),
- EntryExec)
- .addImm(-1);
-
- // Replace all uses of MI's destination reg with EntryExec.
- MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);
-
- if (LIS) {
- LIS->RemoveMachineInstrFromMaps(MI);
- }
-
- MI.eraseFromParent();
-
- if (LIS) {
- LIS->InsertMachineInstrInMaps(*SaveExec);
- LIS->createAndComputeVirtRegInterval(EntryExec);
- }
- return;
- }
-
if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
// This should be before all vector instructions.
MachineInstr *InitMI =
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
deleted file mode 100644
index 353f4d90cad1f2..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
+++ /dev/null
@@ -1,1127 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL12 %s
-; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL12 %s
-; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL10 %s
-; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL10 %s
-
-define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i32 inreg %exec, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 %x, i32 %y) {
-; GISEL12-LABEL: basic:
-; GISEL12: ; %bb.0: ; %entry
-; GISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GISEL12-NEXT: s_wait_expcnt 0x0
-; GISEL12-NEXT: s_wait_samplecnt 0x0
-; GISEL12-NEXT: s_wait_bvhcnt 0x0
-; GISEL12-NEXT: s_wait_kmcnt 0x0
-; GISEL12-NEXT: s_or_saveexec_b32 s8, -1
-; GISEL12-NEXT: s_mov_b32 s6, s3
-; GISEL12-NEXT: s_mov_b32 s7, s4
-; GISEL12-NEXT: s_wait_alu 0xfffe
-; GISEL12-NEXT: s_and_saveexec_b32 s3, s8
-; GISEL12-NEXT: ; %bb.1: ; %shader
-; GISEL12-NEXT: v_add_nc_u32_e32 v12, 42, v12
-; GISEL12-NEXT: v_add_nc_u32_e32 v8, 5, v8
-; GISEL12-NEXT: ; %bb.2: ; %tail
-; GISEL12-NEXT: s_wait_alu 0xfffe
-; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GISEL12-NEXT: v_add_nc_u32_e32 v11, 32, v12
-; GISEL12-NEXT: s_mov_b32 exec_lo, s5
-; GISEL12-NEXT: s_wait_alu 0xfffe
-; GISEL12-NEXT: s_setpc_b64 s[6:7]
-;
-; DAGISEL12-LABEL: basic:
-; DAGISEL12: ; %bb.0: ; %entry
-; DAGISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
-; DAGISEL12-NEXT: s_wait_expcnt 0x0
-; DAGISEL12-NEXT: s_wait_samplecnt 0x0
-; DAGISEL12-NEXT: s_wait_bvhcnt 0x0
-; DAGISEL12-NEXT: s_wait_kmcnt 0x0
-; DAGISEL12-NEXT: s_or_saveexec_b32 s8, -1
-; DAGISEL12-NEXT: s_mov_b32 s7, s4
-; DAGISEL12-NEXT: s_mov_b32 s6, s3
-; DAGISEL12-NEXT: s_wait_alu 0xfffe
-; DAGISEL12-NEXT: s_and_saveexec_b32 s3, s8
-; DAGISEL12-NEXT: ; %bb.1: ; %shader
-; DAGISEL12-NEXT: v_add_nc_u32_e32 v12, 42, v12
-; DAGISEL12-NEXT: v_add_nc_u32_e32 v8, 5, v8
-; DAGISEL12-NEXT: ; %bb.2: ; %tail
-; DAGISEL12-NEXT: s_wait_alu 0xfffe
-; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; DAGISEL12-NEXT: v_add_nc_u32_e32 v11, 32, v12
-; DAGISEL12-NEXT: s_mov_b32 exec_lo, s5
-; DAGISEL12-NEXT: s_wait_alu 0xfffe
-; DAGISEL12-NEXT: s_setpc_b64 s[6:7]
-;
-; GISEL10-LABEL: basic:
-; GISEL10: ; %bb.0: ; %entry
-; GISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL10-NEXT: s_or_saveexec_b32 s8, -1
-; GISEL10-NEXT: s_mov_b32 s6, s3
-; GISEL10-NEXT: s_mov_b32 s7, s4
-; GISEL10-NEXT: s_and_saveexec_b32 s3, s8
-; GISEL10-NEXT: ; %bb.1: ; %shader
-; GISEL10-NEXT: v_add_nc_u32_e32 v12, 42, v12
-; GISEL10-NEXT: v_add_nc_u32_e32 v8, 5, v8
-; GISEL10-NEXT: ; %bb.2: ; %tail
-; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GISEL10-NEXT: v_add_nc_u32_e32 v11, 32, v12
-; GISEL10-NEXT: s_mov_b32 exec_lo, s5
-; GISEL10-NEXT: s_setpc_b64 s[6:7]
-;
-; DAGISEL10-LABEL: basic:
-; DAGISEL10: ; %bb.0: ; %entry
-; DAGISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL10-NEXT: s_or_saveexec_b32 s8, -1
-; DAGISEL10-NEXT: s_mov_b32 s7, s4
-; DAGISEL10-NEXT: s_mov_b32 s6, s3
-; DAGISEL10-NEXT: s_and_saveexec_b32 s3, s8
-; DAGISEL10-NEXT: ; %bb.1: ; %shader
-; DAGISEL10-NEXT: v_add_nc_u32_e32 v12, 42, v12
-; DAGISEL10-NEXT: v_add_nc_u32_e32 v8, 5, v8
-; DAGISEL10-NEXT: ; %bb.2: ; %tail
-; DAGISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; DAGISEL10-NEXT: v_add_nc_u32_e32 v11, 32, v12
-; DAGISEL10-NEXT: s_mov_b32 exec_lo, s5
-; DAGISEL10-NEXT: s_setpc_b64 s[6:7]
-entry:
- %entry_exec = call i1 @llvm.amdgcn.init.whole.wave()
- br i1 %entry_exec, label %shader, label %tail
-
-shader:
- %newx = add i32 %x, 42
- %oldval = extractvalue { i32, ptr addrspace(5), i32, i32 } %vgpr, 0
- %newval = add i32 %oldval, 5
- %newvgpr = insertvalue { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 %newval, 0
-
- br label %tail
-
-tail:
- %full.x = phi i32 [%x, %entry], [%newx, %shader]
- %full.vgpr = phi { i32, ptr addrspace(5), i32, i32 } [%vgpr, %entry], [%newvgpr, %shader]
- %modified.x = add i32 %full.x, 32
- %vgpr.args = insertvalue { i32, ptr addrspace(5), i32, i32 } %full.vgpr, i32 %modified.x, 3
- call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i32 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr.args, i32 0)
- unreachable
-}
-
-define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %callee, i32 inreg %exec, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 %x, i32 %y) {
-; GISEL12-LABEL: wwm_in_shader:
-; GISEL12: ; %bb.0: ; %entry
-; GISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GISEL12-NEXT: s_wait_expcnt 0x0
-; GISEL12-NEXT: s_wait_samplecnt 0x0
-; GISEL12-NEXT: s_wait_bvhcnt 0x0
-; GISEL12-NEXT: s_wait_kmcnt 0x0
-; GISEL12-NEXT: s_or_saveexec_b32 s8, -1
-; GISEL12-NEXT: v_dual_mov_b32 v10, v12 :: v_dual_mov_b32 v11, v13
-; GISEL12-NEXT: s_mov_b32 s6, s3
-; GISEL12-NEXT: s_mov_b32 s7, s4
-; GISEL12-NEXT: s_wait_alu 0xfffe
-; GISEL12-NEXT: s_and_saveexec_b32 s3, s8
-; GISEL12-NEXT: ; %bb.1: ; %shader
-; GISEL12-NEXT: s_or_saveexec_b32 s4, -1
-; GISEL12-NEXT: s_wait_alu 0xfffe
-; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v10, s4
-; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v0
-; GISEL12-NEXT: v_mov_b32_e32 v0, s8
-; GISEL12-NEXT: s_mov_b32 exec_lo, s4
-; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL12-NEXT: v_dual_mov_b32 v11, v0 :: v_dual_add_nc_u32 v10, 42, v10
-; GISEL12-NEXT: ; %bb.2: ; %tail
-; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GISEL12-NEXT: s_mov_b32 exec_lo, s5
-; GISEL12-NEXT: s_wait_alu 0xfffe
-; GISEL12-NEXT: s_setpc_b64 s[6:7]
-;
-; DAGISEL12-LABEL: wwm_in_shader:
-; DAGISEL12: ; %bb.0: ; %entry
-; DAGISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
-; DAGISEL12-NEXT: s_wait_expcnt 0x0
-; DAGISEL12-NEXT: s_wait_samplecnt 0x0
-; DAGISEL12-NEXT: s_wait_bvhcnt 0x0
-; DAGISEL12-NEXT: s_wait_kmcnt 0x0
-; DAGISEL12-NEXT: s_or_saveexec_b32 s8, -1
-; DAGISEL12-NEXT: v_dual_mov_b32 v11, v13 :: v_dual_mov_b32 v10, v12
-; DAGISEL12-NEXT: s_mov_b32 s7, s4
-; DAGISEL12-NEXT: s_mov_b32 s6, s3
-; DAGISEL12-NEXT: s_wait_alu 0xfffe
-; DAGISEL12-NEXT: s_and_saveexec_b32 s3, s8
-; DAGISEL12-NEXT: ; %bb.1: ; %shader
-; DAGISEL12-NEXT: s_or_saveexec_b32 s4, -1
-; DAGISEL12-NEXT: s_wait_alu 0xfffe
-; DAGISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v10, s4
-; DAGISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; DAGISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v0
-; DAGISEL12-NEXT: s_mov_b32 exec_lo, s4
-; DAGISEL12-NEXT: v_dual_mov_b32 v11, s8 :: v_dual_add_nc_u32 v10, 42, v10
-; DAGISEL12-NEXT: ; %bb.2: ; %tail
-; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; DAGISEL12-NEXT: s_mov_b32 exec_lo, s5
-; DAGISEL12-NEXT: s_wait_alu 0xfffe
-; DAGISEL12-NEXT: s_setpc_b64 s[6:7]
-;
-; GISEL10-LABEL: wwm_in_shader:
-; GISEL10: ; %bb.0: ; %entry
-; GISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL10-NEXT: s_or_saveexec_b32 s8, -1
-; GISEL10-NEXT: v_mov_b32_e32 v10, v12
-; GISEL10-NEXT: v_mov_b32_e32 v11, v13
-; GISEL10-NEXT: s_mov_b32 s6, s3
-; GISEL10-NEXT: s_mov_b32 s7, s4
-; GISEL10-NEXT: s_and_saveexec_b32 s3, s8
-; GISEL10-NEXT: ; %bb.1: ; %shader
-; GISEL10-NEXT: s_or_saveexec_b32 s4, -1
-; GISEL10-NEXT: v_cndmask_b32_e64 v0, 0x47, v10, s4
-; GISEL10-NEXT: v_cmp_ne_u32_e64 s8, 0, v0
-; GISEL10-NEXT: v_mov_b32_e32 v0, s8
-; GISEL10-NEXT: s_mov_b32 exec_lo, s4
-; GISEL10-NEXT: v_add_nc_u32_e32 v10, 42, v10
-; GISEL10-NEXT: v_mov_b32_e32 v11, v0
-; GISEL10-NEXT: ; %bb.2: ; %tail
-; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GISEL10-NEXT: s_mov_b32 exec_lo, s5
-; GISEL10-NEXT: s_setpc_b64 s[6:7]
-;
-; DAGISEL10-LABEL: wwm_in_shader:
-; DAGISEL10: ; %bb.0: ; %entry
-; DAGISEL10-N...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/108341
More information about the llvm-commits mailing list