[llvm] [AMDGPU] Detect kills in register sets when trying to form V_CMPX instructions. (PR #68293)
Thomas Symalla via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 5 04:37:03 PDT 2023
https://github.com/tsymalla updated https://github.com/llvm/llvm-project/pull/68293
>From 609673d2acad5d96793e73855dba6ac83cbe9c9c Mon Sep 17 00:00:00 2001
From: Thomas Symalla <thomas.symalla at amd.com>
Date: Thu, 5 Oct 2023 11:20:56 +0200
Subject: [PATCH] [AMDGPU] Detect kills in register sets when trying to form
V_CMPX instructions.
During the SIOptimizeExecMasking pass, we try to form V_CMPX instructions by
detecting S_AND_SAVEEXEC and V_CMP instructions. Generally, we require the input
operand of the V_CMP, which is the input operand to the to-be-formed V_CMPX, to
be live. This is enforced by clearing the kill flags on the operand after the
V_CMPX has been generated.
However, if we have a kill of a register set that contains said register (for
example, a kill of $vgpr0_vgpr1 when the operand is $vgpr0), this will not be
detected by clearKillFlags.
With this change, possible additional kill-flag candidates are collected
during the final call to findInstrBackwards, and their kill flags are then
cleared to keep all registers in the set live.
---
.../Target/AMDGPU/SIOptimizeExecMasking.cpp | 34 ++++++++++++++--
.../vcmp-saveexec-to-vcmpx-set-kill.mir | 39 +++++++++++++++++++
2 files changed, 70 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx-set-kill.mir
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 04c9a6457944c5f..239bcfe5e33f38e 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -10,6 +10,7 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"
@@ -32,6 +33,7 @@ class SIOptimizeExecMasking : public MachineFunctionPass {
DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
SmallVector<std::pair<MachineInstr *, MachineInstr *>, 1> OrXors;
+ SmallVector<MachineOperand *, 1> KillFlagCandidates;
Register isCopyFromExec(const MachineInstr &MI) const;
Register isCopyToExec(const MachineInstr &MI) const;
@@ -49,6 +51,8 @@ class SIOptimizeExecMasking : public MachineFunctionPass {
MachineInstr *findInstrBackwards(MachineInstr &Origin,
std::function<bool(MachineInstr *)> Pred,
ArrayRef<MCRegister> NonModifiableRegs,
+ MachineInstr *Terminator = nullptr,
+ SmallVectorImpl<MachineOperand *> *KillFlagCandidates = nullptr,
unsigned MaxInstructions = 20) const;
bool optimizeExecSequence();
void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI);
@@ -325,11 +329,13 @@ static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
// Backwards-iterate from Origin (for n=MaxInstructions iterations) until either
// the beginning of the BB is reached or Pred evaluates to true - which can be
// an arbitrary condition based on the current MachineInstr, for instance an
-// target instruction. Breaks prematurely by returning nullptr if one of the
+// target instruction. Breaks prematurely by returning nullptr if one of the
// registers given in NonModifiableRegs is modified by the current instruction.
MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
- ArrayRef<MCRegister> NonModifiableRegs, unsigned MaxInstructions) const {
+ ArrayRef<MCRegister> NonModifiableRegs, MachineInstr *Terminator,
+ SmallVectorImpl<MachineOperand *> *KillFlagCandidates,
+ unsigned MaxInstructions) const {
MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
E = Origin.getParent()->rend();
unsigned CurrentIteration = 0;
@@ -344,6 +350,21 @@ MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
for (MCRegister Reg : NonModifiableRegs) {
if (A->modifiesRegister(Reg, TRI))
return nullptr;
+
+ // Check for kills that appear after the terminator instruction, that
+ // would not be detected by clearKillFlags, since they will cause the
+ // register to be dead at a later place, causing the verifier to fail.
+ // We use the candidates to clear the kill flags later.
+ if (Terminator && KillFlagCandidates && A != Terminator && A->killsRegister(Reg, TRI)) {
+ for (MachineOperand &MO : A->operands()) {
+ if (MO.isReg() && MO.isKill()) {
+ Register Candidate = MO.getReg();
+ if (Candidate != Reg && TRI->regsOverlap(Candidate, Reg)) {
+ KillFlagCandidates->push_back(&MO);
+ }
+ }
+ }
+ }
}
++CurrentIteration;
@@ -599,6 +620,11 @@ bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
if (Src1->isReg())
MRI->clearKillFlags(Src1->getReg());
+ for (MachineOperand *MO : KillFlagCandidates) {
+ if (MO)
+ MO->setIsKill(false);
+ }
+
SaveExecInstr.eraseFromParent();
VCmp.eraseFromParent();
@@ -690,7 +716,8 @@ void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
NonDefRegs.push_back(Src1->getReg());
if (!findInstrBackwards(
- MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs))
+ MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs,
+ VCmp, &KillFlagCandidates))
return;
if (VCmp)
@@ -777,6 +804,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
OrXors.clear();
SaveExecVCmpMapping.clear();
+ KillFlagCandidates.clear();
static unsigned SearchWindow = 10;
for (MachineBasicBlock &MBB : MF) {
unsigned SearchCount = 0;
diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx-set-kill.mir b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx-set-kill.mir
new file mode 100644
index 000000000000000..aef7adcf72b53ab
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx-set-kill.mir
@@ -0,0 +1,39 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX1100 %s
+
+---
+
+name: vcmp_saveexec_to_vcmpx_set_kill
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr43, $sgpr44, $sgpr45, $sgpr55, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $vgpr40, $vgpr41, $vgpr76, $vgpr77, $vgpr78, $vgpr95, $vgpr109, $vgpr110, $vgpr111, $sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000000C, $sgpr52_sgpr53_sgpr54_sgpr55:0x0000000000000003, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $vgpr92_vgpr93_vgpr94_vgpr95:0x000000000000003F, $vgpr104_vgpr105_vgpr106_vgpr107:0x000000000000003F, $vgpr46_vgpr47:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000C, $vgpr72_vgpr73:0x000000000000000F, $vgpr74_vgpr75:0x000000000000000F, $vgpr88_vgpr89:0x000000000000000C, $vgpr90_vgpr91:0x0000000000000003, $vgpr124_vgpr125:0x000000000000000F, $vgpr126_vgpr127:0x000000000000000F
+
+ ; GFX1100-LABEL: name: vcmp_saveexec_to_vcmpx_set_kill
+ ; GFX1100: liveins: $sgpr43, $sgpr44, $sgpr45, $sgpr55, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $vgpr40, $vgpr41, $vgpr76, $vgpr77, $vgpr78, $vgpr95, $vgpr109, $vgpr110, $vgpr111, $sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000000C, $sgpr52_sgpr53_sgpr54_sgpr55:0x0000000000000003, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $vgpr92_vgpr93_vgpr94_vgpr95:0x000000000000003F, $vgpr104_vgpr105_vgpr106_vgpr107:0x000000000000003F, $vgpr46_vgpr47:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000C, $vgpr72_vgpr73:0x000000000000000F, $vgpr74_vgpr75:0x000000000000000F, $vgpr88_vgpr89:0x000000000000000C, $vgpr90_vgpr91:0x0000000000000003, $vgpr124_vgpr125:0x000000000000000F, $vgpr126_vgpr127:0x000000000000000F
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: renamable $vgpr0 = V_AND_B32_e32 128, $vgpr90, implicit $exec
+ ; GFX1100-NEXT: renamable $vgpr1 = V_AND_B32_e32 128, $vgpr89, implicit $exec
+ ; GFX1100-NEXT: renamable $sgpr4 = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec
+ ; GFX1100-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; GFX1100-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+ ; GFX1100-NEXT: renamable $sgpr2 = COPY renamable $sgpr0
+ ; GFX1100-NEXT: renamable $sgpr3 = COPY renamable $sgpr0
+ ; GFX1100-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact $vgpr0_vgpr1, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 344, 1, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8)
+ ; GFX1100-NEXT: renamable $sgpr68 = COPY renamable $sgpr66
+ ; GFX1100-NEXT: $sgpr5 = S_MOV_B32 $exec_lo
+ ; GFX1100-NEXT: V_CMPX_EQ_U32_nosdst_e64 $vgpr0, 0, implicit-def $exec, implicit $exec
+ renamable $vgpr0 = V_AND_B32_e32 128, $vgpr90, implicit $exec
+ renamable $vgpr1 = V_AND_B32_e32 128, $vgpr89, implicit $exec
+ renamable $vcc_lo = V_CMP_EQ_U32_e64 $vgpr0, 0, implicit $exec
+ renamable $sgpr4 = V_CMP_NE_U32_e64 0, killed $vgpr0, implicit $exec
+ renamable $sgpr0 = S_MOV_B32 0
+ renamable $sgpr1 = COPY renamable $sgpr0
+ renamable $sgpr2 = COPY renamable $sgpr0
+ renamable $sgpr3 = COPY renamable $sgpr0
+ BUFFER_STORE_DWORDX2_OFFSET_exact killed $vgpr0_vgpr1, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 344, 1, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8)
+ renamable $sgpr68 = COPY renamable $sgpr66
+ renamable $sgpr5 = COPY $exec_lo, implicit-def $exec_lo
+ renamable $sgpr6 = S_AND_B32 renamable $sgpr5, killed renamable $vcc_lo, implicit-def dead $scc
+ $exec_lo = S_MOV_B32_term killed renamable $sgpr6
+...
More information about the llvm-commits
mailing list