[llvm] [AMDGPU] Detect kills in register sets when trying to form V_CMPX instructions. (PR #68293)

Thomas Symalla via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 5 04:37:03 PDT 2023


https://github.com/tsymalla updated https://github.com/llvm/llvm-project/pull/68293

>From 609673d2acad5d96793e73855dba6ac83cbe9c9c Mon Sep 17 00:00:00 2001
From: Thomas Symalla <thomas.symalla at amd.com>
Date: Thu, 5 Oct 2023 11:20:56 +0200
Subject: [PATCH] [AMDGPU] Detect kills in register sets when trying to form
 V_CMPX instructions.

During the SIOptimizeExecMasking pass, we try to form V_CMPX instructions by
detecting S_AND_SAVEEXEC and V_MOV instructions. Generally, we require the input
operand of the V_MOV, which is the input operand to the to-be-formed V_CMPX, to
be alive. This is forced by clearing the kill flags on the operand after V_CMPX
has been generated.

However, if we have a kill of a register set that contains said register, this
will not be detected by clearKillFlags.
With this change, possible additional kill-flag candidates will be
detected during the final call to findInstrBackwards and then, the kill
flag will be removed to keep all registers in the set alive.
---
 .../Target/AMDGPU/SIOptimizeExecMasking.cpp   | 34 ++++++++++++++--
 .../vcmp-saveexec-to-vcmpx-set-kill.mir       | 39 +++++++++++++++++++
 2 files changed, 70 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx-set-kill.mir

diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 04c9a6457944c5f..239bcfe5e33f38e 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -10,6 +10,7 @@
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIRegisterInfo.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineOperand.h"
@@ -32,6 +33,7 @@ class SIOptimizeExecMasking : public MachineFunctionPass {
 
   DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
   SmallVector<std::pair<MachineInstr *, MachineInstr *>, 1> OrXors;
+  SmallVector<MachineOperand *, 1> KillFlagCandidates;
 
   Register isCopyFromExec(const MachineInstr &MI) const;
   Register isCopyToExec(const MachineInstr &MI) const;
@@ -49,6 +51,8 @@ class SIOptimizeExecMasking : public MachineFunctionPass {
   MachineInstr *findInstrBackwards(MachineInstr &Origin,
                                    std::function<bool(MachineInstr *)> Pred,
                                    ArrayRef<MCRegister> NonModifiableRegs,
+                                   MachineInstr *Terminator = nullptr,
+                                   SmallVectorImpl<MachineOperand *> *KillFlagCandidates = nullptr,
                                    unsigned MaxInstructions = 20) const;
   bool optimizeExecSequence();
   void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI);
@@ -325,11 +329,13 @@ static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
 // Backwards-iterate from Origin (for n=MaxInstructions iterations) until either
 // the beginning of the BB is reached or Pred evaluates to true - which can be
 // an arbitrary condition based on the current MachineInstr, for instance an
-// target instruction. Breaks prematurely by returning nullptr if  one of the
+// target instruction. Breaks prematurely by returning nullptr if one of the
 // registers given in NonModifiableRegs is modified by the current instruction.
 MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
     MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
-    ArrayRef<MCRegister> NonModifiableRegs, unsigned MaxInstructions) const {
+    ArrayRef<MCRegister> NonModifiableRegs,  MachineInstr *Terminator,
+    SmallVectorImpl<MachineOperand *> *KillFlagCandidates,
+    unsigned MaxInstructions) const {
   MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
                                       E = Origin.getParent()->rend();
   unsigned CurrentIteration = 0;
@@ -344,6 +350,21 @@ MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
     for (MCRegister Reg : NonModifiableRegs) {
       if (A->modifiesRegister(Reg, TRI))
         return nullptr;
+
+      // Check for kills that appear after the terminator instruction, that
+      // would not be detected by clearKillFlags, since they will cause the
+      // register to be dead at a later place, causing the verifier to fail.
+      // We use the candidates to clear the kill flags later.
+      if (Terminator && KillFlagCandidates && A != Terminator && A->killsRegister(Reg, TRI)) {
+        for (MachineOperand &MO : A->operands()) {
+          if (MO.isReg() && MO.isKill()) {
+            Register Candidate = MO.getReg();
+            if (Candidate != Reg && TRI->regsOverlap(Candidate, Reg)) {
+              KillFlagCandidates->push_back(&MO);
+            }
+          }
+        }
+      }
     }
 
     ++CurrentIteration;
@@ -599,6 +620,11 @@ bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
   if (Src1->isReg())
     MRI->clearKillFlags(Src1->getReg());
 
+  for (MachineOperand *MO : KillFlagCandidates) {
+    if (MO)
+      MO->setIsKill(false);
+  }
+
   SaveExecInstr.eraseFromParent();
   VCmp.eraseFromParent();
 
@@ -690,7 +716,8 @@ void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
     NonDefRegs.push_back(Src1->getReg());
 
   if (!findInstrBackwards(
-          MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs))
+          MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs,
+          VCmp, &KillFlagCandidates))
     return;
 
   if (VCmp)
@@ -777,6 +804,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
 
   OrXors.clear();
   SaveExecVCmpMapping.clear();
+  KillFlagCandidates.clear();
   static unsigned SearchWindow = 10;
   for (MachineBasicBlock &MBB : MF) {
     unsigned SearchCount = 0;
diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx-set-kill.mir b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx-set-kill.mir
new file mode 100644
index 000000000000000..aef7adcf72b53ab
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx-set-kill.mir
@@ -0,0 +1,39 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=si-optimize-exec-masking -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX1100 %s
+
+---
+
+name: vcmp_saveexec_to_vcmpx_set_kill
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr43, $sgpr44, $sgpr45, $sgpr55, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $vgpr40, $vgpr41, $vgpr76, $vgpr77, $vgpr78, $vgpr95, $vgpr109, $vgpr110, $vgpr111, $sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000000C, $sgpr52_sgpr53_sgpr54_sgpr55:0x0000000000000003, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $vgpr92_vgpr93_vgpr94_vgpr95:0x000000000000003F, $vgpr104_vgpr105_vgpr106_vgpr107:0x000000000000003F, $vgpr46_vgpr47:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000C, $vgpr72_vgpr73:0x000000000000000F, $vgpr74_vgpr75:0x000000000000000F, $vgpr88_vgpr89:0x000000000000000C, $vgpr90_vgpr91:0x0000000000000003, $vgpr124_vgpr125:0x000000000000000F, $vgpr126_vgpr127:0x000000000000000F
+
+    ; GFX1100-LABEL: name: vcmp_saveexec_to_vcmpx_set_kill
+    ; GFX1100: liveins: $sgpr43, $sgpr44, $sgpr45, $sgpr55, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $vgpr40, $vgpr41, $vgpr76, $vgpr77, $vgpr78, $vgpr95, $vgpr109, $vgpr110, $vgpr111, $sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000000C, $sgpr52_sgpr53_sgpr54_sgpr55:0x0000000000000003, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $vgpr92_vgpr93_vgpr94_vgpr95:0x000000000000003F, $vgpr104_vgpr105_vgpr106_vgpr107:0x000000000000003F, $vgpr46_vgpr47:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000C, $vgpr72_vgpr73:0x000000000000000F, $vgpr74_vgpr75:0x000000000000000F, $vgpr88_vgpr89:0x000000000000000C, $vgpr90_vgpr91:0x0000000000000003, $vgpr124_vgpr125:0x000000000000000F, $vgpr126_vgpr127:0x000000000000000F
+    ; GFX1100-NEXT: {{  $}}
+    ; GFX1100-NEXT: renamable $vgpr0 = V_AND_B32_e32 128, $vgpr90, implicit $exec
+    ; GFX1100-NEXT: renamable $vgpr1 = V_AND_B32_e32 128, $vgpr89, implicit $exec
+    ; GFX1100-NEXT: renamable $sgpr4 = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec
+    ; GFX1100-NEXT: renamable $sgpr0 = S_MOV_B32 0
+    ; GFX1100-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+    ; GFX1100-NEXT: renamable $sgpr2 = COPY renamable $sgpr0
+    ; GFX1100-NEXT: renamable $sgpr3 = COPY renamable $sgpr0
+    ; GFX1100-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact $vgpr0_vgpr1, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 344, 1, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8)
+    ; GFX1100-NEXT: renamable $sgpr68 = COPY renamable $sgpr66
+    ; GFX1100-NEXT: $sgpr5 = S_MOV_B32 $exec_lo
+    ; GFX1100-NEXT: V_CMPX_EQ_U32_nosdst_e64 $vgpr0, 0, implicit-def $exec, implicit $exec
+    renamable $vgpr0 = V_AND_B32_e32 128, $vgpr90, implicit $exec
+    renamable $vgpr1 = V_AND_B32_e32 128, $vgpr89, implicit $exec
+    renamable $vcc_lo = V_CMP_EQ_U32_e64 $vgpr0, 0, implicit $exec
+    renamable $sgpr4 = V_CMP_NE_U32_e64 0, killed $vgpr0, implicit $exec
+    renamable $sgpr0 = S_MOV_B32 0
+    renamable $sgpr1 = COPY renamable $sgpr0
+    renamable $sgpr2 = COPY renamable $sgpr0
+    renamable $sgpr3 = COPY renamable $sgpr0
+    BUFFER_STORE_DWORDX2_OFFSET_exact killed $vgpr0_vgpr1, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 344, 1, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8)
+    renamable $sgpr68 = COPY renamable $sgpr66
+    renamable $sgpr5 = COPY $exec_lo, implicit-def $exec_lo
+    renamable $sgpr6 = S_AND_B32 renamable $sgpr5, killed renamable $vcc_lo, implicit-def dead $scc
+    $exec_lo = S_MOV_B32_term killed renamable $sgpr6
+...



More information about the llvm-commits mailing list