[llvm] [AMDGPU] Detect renamable kills when trying to form V_CMPX instructions. (PR #68293)

Thomas Symalla via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 5 02:27:13 PDT 2023


https://github.com/tsymalla created https://github.com/llvm/llvm-project/pull/68293

During the SIOptimizeExecMasking pass, we try to form V_CMPX instructions by detecting S_AND_SAVEEXEC and V_MOV instructions. Generally, we require the input operand of the V_MOV, which is the input operand to the to-be-formed V_CMPX, to stay live. This is enforced by clearing the kill flags on the operand after the V_CMPX has been generated.

However, if there is a kill of a renamable register that contains said register (for example, a wider register tuple), clearKillFlags will not detect it. Since clearing the kill flags on that containing register would possibly cause the other VGPRs it covers to stay alive as well, we skip forming a V_CMPX instruction in that case.

From 0af84c87b7f26b2abc8c6f28f1f8867073b53f33 Mon Sep 17 00:00:00 2001
From: Thomas Symalla <thomas.symalla at amd.com>
Date: Thu, 5 Oct 2023 11:20:56 +0200
Subject: [PATCH] [AMDGPU] Detect renamable kills when trying to form V_CMPX
 instructions.

During the SIOptimizeExecMasking pass, we try to form V_CMPX instructions by
detecting S_AND_SAVEEXEC and V_MOV instructions. Generally, we require the input
operand of the V_MOV, which is the input operand to the to-be-formed V_CMPX, to
be alive. This is forced by clearing the kill flags on the operand after V_CMPX
has been generated.

However, if we have a kill of a renamable register that contains said register,
this will not be detected by clearKillFlags. Since clearing the kill flags will
possibly cause the other VGPRs to stay alive as well, we skip forming a V_CMPX
instruction in that case.
---
 .../Target/AMDGPU/SIOptimizeExecMasking.cpp   | 18 +++++++++++--
 .../vcmp-saveexec-to-vcmpx-renamable-kill.mir | 25 +++++++++++++++++++
 2 files changed, 41 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx-renamable-kill.mir

diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 04c9a6457944c5f..e3082f7d8aa1742 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -49,6 +49,7 @@ class SIOptimizeExecMasking : public MachineFunctionPass {
   MachineInstr *findInstrBackwards(MachineInstr &Origin,
                                    std::function<bool(MachineInstr *)> Pred,
                                    ArrayRef<MCRegister> NonModifiableRegs,
+                                   MachineInstr *Terminator = nullptr,
                                    unsigned MaxInstructions = 20) const;
   bool optimizeExecSequence();
   void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI);
@@ -329,7 +330,8 @@ static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
 // registers given in NonModifiableRegs is modified by the current instruction.
 MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
     MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
-    ArrayRef<MCRegister> NonModifiableRegs, unsigned MaxInstructions) const {
+    ArrayRef<MCRegister> NonModifiableRegs, MachineInstr *Terminator,
+    unsigned MaxInstructions) const {
   MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
                                       E = Origin.getParent()->rend();
   unsigned CurrentIteration = 0;
@@ -344,6 +346,17 @@ MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
     for (MCRegister Reg : NonModifiableRegs) {
       if (A->modifiesRegister(Reg, TRI))
         return nullptr;
+
+      // Check for kills that appear after the terminator instruction, that
+      // would not be detected by clearKillFlags, since they will cause the
+      // register to be dead at a later place, causing the verifier to fail.
+      if (Terminator && A != Terminator && A->killsRegister(Reg, TRI)) {
+        for (MachineOperand &MO : A->operands()) {
+          if (MO.isReg() && MO.isKill() && MO.isRenamable() &&
+              TRI->regsOverlap(MO.getReg(), Reg))
+            return nullptr;
+        }
+      }
     }
 
     ++CurrentIteration;
@@ -690,7 +703,8 @@ void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
     NonDefRegs.push_back(Src1->getReg());
 
   if (!findInstrBackwards(
-          MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs))
+          MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs,
+          VCmp))
     return;
 
   if (VCmp)
diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx-renamable-kill.mir b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx-renamable-kill.mir
new file mode 100644
index 000000000000000..d4d0cb12f217544
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx-renamable-kill.mir
@@ -0,0 +1,25 @@
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX1100 %s
+
+---
+
+# GFX1100-LABEL: name: vcmp_saveexec_to_vcmpx_kill_between
+# GFX1100-NOT: V_CMPX
+name: vcmp_saveexec_to_vcmpx_kill_between
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr43, $sgpr44, $sgpr45, $sgpr55, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $vgpr40, $vgpr41, $vgpr76, $vgpr77, $vgpr78, $vgpr95, $vgpr109, $vgpr110, $vgpr111, $sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000000C, $sgpr52_sgpr53_sgpr54_sgpr55:0x0000000000000003, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $vgpr92_vgpr93_vgpr94_vgpr95:0x000000000000003F, $vgpr104_vgpr105_vgpr106_vgpr107:0x000000000000003F, $vgpr46_vgpr47:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000C, $vgpr72_vgpr73:0x000000000000000F, $vgpr74_vgpr75:0x000000000000000F, $vgpr88_vgpr89:0x000000000000000C, $vgpr90_vgpr91:0x0000000000000003, $vgpr124_vgpr125:0x000000000000000F, $vgpr126_vgpr127:0x000000000000000F
+
+  liveins: $vgpr0
+  renamable $vgpr1 = COPY $vgpr0, implicit $exec
+  renamable $sgpr0 = S_MOV_B32 0
+  renamable $sgpr1 = COPY renamable $sgpr0
+  renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  renamable $vcc_lo = V_CMP_EQ_U32_e64 0, $vgpr1, implicit $exec
+  renamable $sgpr2 = COPY renamable $sgpr0
+  renamable $sgpr3 = COPY renamable $sgpr0
+  BUFFER_STORE_DWORDX2_OFFSET_exact killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 344, 1, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8)
+  $sgpr0 = S_MOV_B32 $exec_lo
+  $sgpr0 = S_AND_SAVEEXEC_B32 $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec
+  $exec_lo = S_MOV_B32_term killed renamable $sgpr0
+...



More information about the llvm-commits mailing list