[llvm] [AMDGPU] V_SET_INACTIVE optimizations (PR #98864)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 15 00:14:51 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-llvm-globalisel
Author: Carl Ritson (perlfu)
Changes:
Optimize V_SET_INACTIVE by allowing it to run in WWM.
Hence WWM sections are not broken up to set inactive lanes.
A WWM V_SET_INACTIVE can typically be lowered to V_CNDMASK.
Some cases still require the previous lowering via exec
manipulation and V_MOVs.
GFX9 sees a slight instruction count increase in edge cases due to
its smaller constant bus.
Additionally, avoid introducing exec manipulation and V_MOVs when
a source of V_SET_INACTIVE is also its destination.
This is a common pattern, as WWM register pre-allocation often
assigns the same register.
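
For reference, a rough sketch of the change in the common WWM case; the register numbers are placeholders, and `s[2:3]` stands for the live-lane mask that SIWholeQuadMode now attaches as an implicit operand:

```
; Previous expansion (wave64), inside or outside WWM:
v_mov_b32_e32 v0, v1        ; active lanes take the active source
s_not_b64 exec, exec
v_mov_b32_e32 v0, v2        ; inactive lanes take the inactive source
s_not_b64 exec, exec

; New expansion when already running in WWM (exec = -1),
; with the original live mask in s[2:3]:
v_cndmask_b32_e64 v0, v2, v1, s[2:3]  ; live lanes get v1, others v2
```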
---
Patch is 245.52 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/98864.diff
17 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+140-29)
- (modified) llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp (+57-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll (+56-72)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll (+25-41)
- (modified) llvm/test/CodeGen/AMDGPU/cse-convergent.ll (+2-12)
- (modified) llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll (+6-12)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll (+95-205)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll (+90-156)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll (+90-156)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll (+95-205)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll (+110-77)
- (modified) llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll (+2-10)
- (modified) llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll (+1-4)
- (modified) llvm/test/CodeGen/AMDGPU/wqm.ll (+12-36)
- (modified) llvm/test/CodeGen/AMDGPU/wqm.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll (+110-102)
- (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved.ll (+220-242)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index cc1b9ac0c9ecd..d551a7887e706 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2273,37 +2273,148 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
- case AMDGPU::V_SET_INACTIVE_B32: {
- unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
- // optimizations (mainly Register Coalescer) aware of WWM register liveness.
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
- .add(MI.getOperand(1));
- auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
- FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
- .add(MI.getOperand(2));
- BuildMI(MBB, MI, DL, get(NotOpc), Exec)
- .addReg(Exec);
- MI.eraseFromParent();
- break;
- }
+ case AMDGPU::V_SET_INACTIVE_B32:
case AMDGPU::V_SET_INACTIVE_B64: {
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
- MI.getOperand(0).getReg())
- .add(MI.getOperand(1));
- expandPostRAPseudo(*Copy);
- auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
- FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
- Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
- MI.getOperand(0).getReg())
- .add(MI.getOperand(2));
- expandPostRAPseudo(*Copy);
- BuildMI(MBB, MI, DL, get(NotOpc), Exec)
- .addReg(Exec);
+ unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
+ ? AMDGPU::V_MOV_B64_PSEUDO
+ : AMDGPU::V_MOV_B32_e32;
+ Register ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+
+ Register DstReg = MI.getOperand(0).getReg();
+ MachineOperand &ActiveSrc = MI.getOperand(1);
+ MachineOperand &InactiveSrc = MI.getOperand(2);
+
+ bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
+
+ // Find implicit exec src if this is running in WWM.
+ Register ExecSrcReg = 0;
+ for (auto &Op : MI.implicit_operands()) {
+ if (Op.isDef() || !Op.isReg())
+ continue;
+ Register OpReg = Op.getReg();
+ if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
+ OpReg == AMDGPU::SCC)
+ continue;
+ ExecSrcReg = OpReg;
+ break;
+ }
+
+ // Ideally in WWM this operation is lowered to V_CNDMASK; however,
+ // constant bus constraints and the presence of literal constants
+ // present an issue.
+ // Fall back to V_MOV-based lowering in all but the common cases.
+ bool InWWM = !!ExecSrcReg;
+ bool UseVCndMask = false;
+ if (InWWM) {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
+ const MCInstrDesc &Desc = get(Opcode);
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+ int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
+ int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
+ int ConstantBusUses = 1; // Starts at one for ExecSrcReg
+ int LiteralConstants = 0;
+ ConstantBusUses +=
+ usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0;
+ ConstantBusUses +=
+ usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0;
+ LiteralConstants +=
+ ActiveSrc.isImm() &&
+ !isInlineConstant(ActiveSrc, Desc.operands()[Src1Idx])
+ ? 1
+ : 0;
+ LiteralConstants +=
+ InactiveSrc.isImm() &&
+ !isInlineConstant(InactiveSrc, Desc.operands()[Src0Idx])
+ ? 1
+ : 0;
+ UseVCndMask = ConstantBusUses <= ConstantBusLimit &&
+ LiteralConstants <= LiteralLimit &&
+ (!VMov64 || (ActiveSrc.isReg() && InactiveSrc.isReg()));
+ }
+
+ if (UseVCndMask && VMov64) {
+ // WWM B64; decompose to two B32 operations.
+ // Test above ensures that both sources are registers.
+ // Note: this is done to avoid falling back to V_MOV multiple times
+ // and introducing exec manipulation for each VGPR separately.
+ assert(ActiveSrc.isReg() && InactiveSrc.isReg());
+ Register ActiveLo = RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub0);
+ Register ActiveHi = RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub1);
+ Register InactiveLo = RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub0);
+ Register InactiveHi = RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub1);
+ MachineInstr *Tmp;
+ Tmp = BuildMI(MBB, MI, DL, get(AMDGPU::V_SET_INACTIVE_B32),
+ RI.getSubReg(DstReg, AMDGPU::sub0))
+ .addReg(InactiveLo)
+ .addReg(ActiveLo)
+ .addReg(ExecSrcReg, RegState::Implicit)
+ .addReg(DstReg, RegState::ImplicitDefine);
+ expandPostRAPseudo(*Tmp);
+ Tmp = BuildMI(MBB, MI, DL, get(AMDGPU::V_SET_INACTIVE_B32),
+ RI.getSubReg(DstReg, AMDGPU::sub1))
+ .addReg(InactiveHi, InactiveSrc.isKill() ? RegState::Kill : 0)
+ .addReg(ActiveHi, ActiveSrc.isKill() ? RegState::Kill : 0)
+ .addReg(ExecSrcReg, RegState::Implicit)
+ .addReg(DstReg, RegState::ImplicitDefine);
+ expandPostRAPseudo(*Tmp);
+ } else if (UseVCndMask) {
+ // WWM B32; use V_CNDMASK.
+ MachineInstr *VCndMask =
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0)
+ .add(InactiveSrc)
+ .addImm(0)
+ .add(ActiveSrc)
+ .addReg(ExecSrcReg);
+ // Copy implicit defs in case this is part of V_SET_INACTIVE_B64.
+ for (auto &Op : MI.implicit_operands()) {
+ if (!Op.isDef())
+ continue;
+ VCndMask->addOperand(Op);
+ }
+ } else {
+ // Fallback V_MOV case.
+ // Avoid unnecessary work if a src is the destination.
+ // This can happen if WWM register allocation was efficient.
+ bool SkipActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
+ bool SkipInactive = InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
+ if (!SkipActive) {
+ if (InWWM) {
+ // Cancel WWM
+ BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
+ }
+ // Copy active lanes
+ MachineInstr *VMov =
+ BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
+ .add(ActiveSrc);
+ if (VMov64)
+ expandPostRAPseudo(*VMov);
+ }
+ if (!SkipInactive) {
+ // Set exec mask to inactive lanes
+ MachineInstr *ExecMI = BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
+ .addReg(InWWM ? ExecSrcReg : ExecReg);
+ ExecMI->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
+ // Copy inactive lanes
+ MachineInstr *VMov =
+ BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
+ if (VMov64)
+ expandPostRAPseudo(*VMov);
+ if (!InWWM) {
+ // Restore original exec mask
+ BuildMI(MBB, MI, DL, get(NotOpc), ExecReg).addReg(ExecReg);
+ }
+ }
+ if (InWWM) {
+ // Restore WWM
+ BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
+ }
+ }
MI.eraseFromParent();
break;
}
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 19e761ef45b25..e01c045e7ef3d 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -178,6 +178,7 @@ class SIWholeQuadMode : public MachineFunctionPass {
SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
SmallVector<MachineInstr *, 4> KillInstrs;
SmallVector<MachineInstr *, 4> InitExecInstrs;
+ SmallVector<MachineInstr *, 4> SetInactiveInstrs;
void printInfo();
@@ -225,6 +226,8 @@ class SIWholeQuadMode : public MachineFunctionPass {
void lowerInitExec(MachineInstr &MI);
MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry);
+ void harmonizeTransitions();
+
public:
static char ID;
@@ -477,7 +480,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
std::vector<WorkItem> &Worklist) {
char GlobalFlags = 0;
bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
- SmallVector<MachineInstr *, 4> SetInactiveInstrs;
SmallVector<MachineInstr *, 4> SoftWQMInstrs;
bool HasImplicitDerivatives =
MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
@@ -554,6 +556,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
GlobalFlags |= StateStrictWQM;
} else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
Opcode == AMDGPU::V_SET_INACTIVE_B64) {
+ // Disable strict states here while marking; relax them later.
III.Disabled = StateStrict;
MachineOperand &Inactive = MI.getOperand(2);
if (Inactive.isReg()) {
@@ -564,6 +567,8 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
}
}
SetInactiveInstrs.push_back(&MI);
+ GlobalFlags |= StateStrictWWM;
+ BBI.NeedsLowering = true;
} else if (TII->isDisableWQM(MI)) {
BBI.Needs |= StateExact;
if (!(BBI.InNeeds & StateExact)) {
@@ -1037,6 +1042,7 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
SmallVector<MachineInstr *, 4> SplitPoints;
+ Register ActiveLanesReg = 0;
char State = BI.InitialState;
for (MachineInstr &MI : llvm::make_early_inc_range(
@@ -1053,6 +1059,20 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
SplitPoint = lowerKillF32(MBB, MI);
break;
+ case AMDGPU::ENTER_STRICT_WWM:
+ ActiveLanesReg = MI.getOperand(0).getReg();
+ break;
+ case AMDGPU::EXIT_STRICT_WWM:
+ ActiveLanesReg = 0;
+ break;
+ case AMDGPU::V_SET_INACTIVE_B32:
+ case AMDGPU::V_SET_INACTIVE_B64:
+ if (ActiveLanesReg) {
+ MI.addOperand(*MBB.getParent(),
+ MachineOperand::CreateReg(ActiveLanesReg, false, true));
+ } else
+ assert(State == StateExact || State == StateWQM);
+ break;
default:
break;
}
@@ -1617,6 +1637,40 @@ SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry) {
return InsertPt;
}
+void SIWholeQuadMode::harmonizeTransitions() {
+ // Relax requirements on SET_INACTIVE to allow it in WWM regions.
+ for (MachineInstr *MI : SetInactiveInstrs) {
+ if (MI->getOpcode() == AMDGPU::COPY)
+ continue;
+
+ Instructions[MI].Disabled &= ~StateStrictWWM;
+
+ auto MBB = MI->getParent();
+ auto It = MI->getIterator();
+ if (It == MBB->end())
+ continue;
+
+ bool AddWWM = false;
+ auto NextMI = std::next(It);
+ if (NextMI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
+ NextMI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
+ // Groups of SET_INACTIVE are more efficient in WWM.
+ AddWWM = true;
+ } else {
+ // Back propagate WWM needs of next instruction.
+ auto III = Instructions.find(&*NextMI);
+ AddWWM =
+ (III != Instructions.end() && III->second.Needs & StateStrictWWM);
+ }
+
+ if (!AddWWM)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "merge into WWM: " << *MI);
+ Instructions[MI].Needs |= StateStrictWWM;
+ }
+}
+
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
<< " ------------- \n");
@@ -1629,6 +1683,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LowerToMovInstrs.clear();
KillInstrs.clear();
InitExecInstrs.clear();
+ SetInactiveInstrs.clear();
StateTransition.clear();
ST = &MF.getSubtarget<GCNSubtarget>();
@@ -1701,6 +1756,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LIS->InsertMachineInstrInMaps(*MI);
lowerKillInstrs(true);
} else {
+ harmonizeTransitions();
for (auto BII : Blocks)
processBlock(*BII.first, BII.first == &Entry);
// Lowering blocks causes block splitting so perform as a second pass.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 0c60be9d94591..8fb4f2cd79a70 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -96,15 +96,14 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
; GCN-LABEL: set_inactive_f32:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x40400000
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v1
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
@@ -117,17 +116,15 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
; GCN-LABEL: set_inactive_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s4, 0xcccccccd
-; GCN-NEXT: s_mov_b32 s5, 0x4010cccc
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s6, 0xcccccccd
+; GCN-NEXT: s_mov_b32 s7, 0x4010cccc
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v2
-; GCN-NEXT: v_mov_b32_e32 v1, v3
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -140,15 +137,14 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
; GCN-LABEL: set_inactive_v2i16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: v_mov_b32_e32 v1, 0x10001
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x10001
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v1
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
@@ -160,15 +156,14 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %
define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; GCN-LABEL: set_inactive_v2f16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v1
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
@@ -181,17 +176,15 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %
; GCN-LABEL: set_inactive_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s4, 1
-; GCN-NEXT: s_mov_b32 s5, s4
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s6, 1
+; GCN-NEXT: s_mov_b32 s7, s6
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v2
-; GCN-NEXT: v_mov_b32_e32 v1, v3
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -205,17 +198,15 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
; GCN-LABEL: set_inactive_v2f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s4, 1.0
-; GCN-NEXT: s_mov_b32 s5, s4
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s6, 1.0
+; GCN-NEXT: s_mov_b32 s7, s6
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v2
-; GCN-NEXT: v_mov_b32_e32 v1, v3
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -228,15 +219,14 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
; GCN-LABEL: set_inactive_v2bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: v_mov_b32_e32 v1, 0x3f803f80
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v1
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
@@ -249,17 +239,15 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %
; GCN-LABEL: set_inactive_v4i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s4, 0x10001
-; GCN-NEXT: s_mov...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/98864
More information about the llvm-commits mailing list